#!/usr/bin/env bash
# NOTE(review): this file arrived as a whitespace-collapsed `git format-patch`
# email (mail header, diff markers and the OLD script fused onto a few physical
# lines, truncated mid-patch). What follows is the reconstructed NEW version of
# download.sh from that patch. Everything after process_collection's `for` line
# was cut off in the patch; usage(), main() and the pagination loop body are
# re-created from the script's own usage comments and config variables —
# confirm them against the original before relying on exact page URLs.
#
# Hardened downloader: safer defaults, clearer structure, robust option
# parsing, better error handling.
#
# Usage examples:
#   ./script.sh -c "https://example.com/Some-Casting-X.html"   # collection page
#   ./script.sh -p "/post/6545eda2cb76e.html"                  # single post
#   ./script.sh -f posts.txt                                   # one post path per line
#   ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run

set -Eeuo pipefail
IFS=$'\n\t'

SCRIPT_NAME=${0##*/}
VERSION="1.2.0"

# ------------------------------- logging & utils -------------------------------
log() { printf '[%s] %s\n' "$(date '+%F %T%z')" "$*"; }
die() { log "ERROR: $*" >&2; exit 1; }
need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; }

need curl
need flock
need awk
need sed
need grep
need cut
need dirname

# Safer temp dir for partial files, etc.
TMPDIR=${TMPDIR:-/tmp}
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }

# ----------------------------- lock/unlock logic ------------------------------
# LOCK_FD is assigned a real descriptor by `exec {LOCK_FD}>…` inside lock().
# It starts at -1 so unlock() can tell "never locked" apart from "locked";
# the patch seeded it with 200, which made cleanup run `flock -u 200` on a
# descriptor that was never opened whenever the script died before lock().
LOCK_FD=-1
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"

lock() {
  log "Creating lock… ($LOCK_FILE)"
  # `exec {LOCK_FD}>file` allocates a fresh fd and stores its number in LOCK_FD.
  exec {LOCK_FD}>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  if ! flock -n "$LOCK_FD"; then
    die "Lock failed — another process is running."
  fi
}

unlock() {
  # Nothing to release if lock() never ran.
  (( LOCK_FD >= 0 )) || return 0
  log "Releasing lock…"
  flock -u "$LOCK_FD" || true
}

# Always cleanup on exit/interrupt. INT/TERM re-raise a plain exit so the
# script actually terminates (the patch's `trap cleanup EXIT INT TERM` ran the
# handler and then *continued* executing with the lock already released).
cleanup() {
  unlock || true
  cleanup_tmp || true
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# ------------------------------ output handling -------------------------------
OUTDIR=${OUTDIR:-$PWD}
mkdir -p "$OUTDIR"

# ---------------------------- idx path resolution -----------------------------
IDX=""   # absolute path of the per-site index file; set via set_idx_from_url

host_from_url() {
  # Extract host from an absolute URL (https://host/...); for a relative path
  # fall back to BASE_HOST, which must have been set by the caller.
  # Usage: host_from_url "https://foo.bar/baz" -> foo.bar
  local u=$1
  if [[ $u == http*://* ]]; then
    u=${u#*//}                    # strip scheme
    printf '%s\n' "${u%%/*}"      # keep host only
  else
    [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
    printf '%s\n' "$BASE_HOST"
  fi
}

set_idx_from_url() {
  # idx base: /storage/disk1/X/idx/blog/<host[/tail]>/idx
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//}   # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi
  local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
  idx_root=${idx_root%/}
  IDX="$idx_root/idx"
  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir"
  # Create the index only if missing. The patch used `: >"$IDX"`, which
  # TRUNCATES the persistent index on every run and would re-download
  # everything — the old script's create-only `touch` behavior is restored.
  [[ -e $IDX ]] || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
}

link_exists_in_file() {
  # Return 0 iff the exact line is already present in the index.
  [[ -n $IDX ]] || die "IDX är inte satt"
  local link_to_check=$1
  grep -Fqx -- "$link_to_check" "$IDX"
}

append_to_idx() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  printf '%s\n' "$1" >> "$IDX"
}

# ------------------------------- network layer --------------------------------
# NOTE(review): --fail-with-body needs curl >= 7.76 and --retry-connrefused
# needs >= 7.52 — confirm the target host's curl version. The patch also
# carried `--max-time 0`; that is dropped here (no overall time cap is the
# default, and 0 is not a documented value for --max-time).
CURL_OPTS=(
  --fail-with-body
  --show-error
  --location
  --connect-timeout 10
  --retry 3
  --retry-delay 1
  --retry-connrefused
  --compressed
  -sS
  -A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)

fetch() {
  local url=$1
  curl "${CURL_OPTS[@]}" "$url"
}

post_text() {
  local url=$1
  local body=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$body" "$url"
}

# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if
# available. The `s/>/>\n/g` newline insertion relies on GNU sed.
get_items() {
  # Print the href (query string stripped) of every <a class='js-pop'> on the page.
  local url=$1
  local content
  content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
  printf '%s\n' "$content" \
    | sed 's/>/>\n/g' \
    | grep "class='js-pop'" \
    | awk -F"'" '{print $2}' \
    | cut -d'?' -f1
}

# Resolver replies with a single line: ["<name>","<url>"]
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'

sanitize_filename() {
  # Truncate to 80 chars, then replace anything that is not
  # alnum/underscore/dot/dash with an underscore.
  local in=$1
  local truncated=${in:0:80}
  printf '%s' "$truncated" | sed 's/[^A-Za-z0-9_.-]/_/g'
}

# ------------------------------ core operations --------------------------------
NOOP=0
DEBUG=0
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}
PAGES=${PAGES:-10}
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}

resolve_item_link() {
  # Ask the local resolver service to turn a site-relative post path into a
  # ["name","direct-url"] pair. The resolver target is hard-wired to sxyprn.com.
  local relative_item=$1
  post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item"
}

download_and_save_link() {
  # Download $1 into $OUTDIR/<sanitized $2>.mp4 via a .part temp name so a
  # failed transfer never leaves a half-written final file.
  local url=$1
  local base_name=$2
  local sanitized
  sanitized=$(sanitize_filename "$base_name")
  local outpath="$OUTDIR/${sanitized}.mp4"
  log "Downloading $url -> $outpath"
  (( NOOP )) && { log "(dry-run) Skipping download"; return 0; }
  curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; }
  mv -f "$outpath.part" "$outpath"
}

do_post() {
  # Process one post line: skip if indexed, otherwise resolve, download, and
  # only then record it in the index.
  local line=$1
  [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
  if link_exists_in_file "$line"; then
    log "Link already indexed: $line"
    return 0
  fi
  log "Resolving link: $line"
  local result
  if ! result=$(resolve_item_link "$line"); then
    log "Resolver failed for: $line"
    return 1
  fi
  if [[ $result =~ $JSON_PAIR_RE ]]; then
    local name=${BASH_REMATCH[1]}
    local url=${BASH_REMATCH[2]}
    if download_and_save_link "$url" "$name"; then
      log "Download success — updating index"
      append_to_idx "$line"
    else
      log "Download failed for: $url"
      return 1
    fi
  else
    log "Resolver returned unexpected payload: $result"
    return 1
  fi
}

do_list_of_posts() {
  # Run do_post for every non-empty line of $1.
  local links=$1
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line"
  done <<< "$links"
}

# Process a collection page containing multiple posts (pagination supported).
process_collection() {
  local base_url=$1
  BASE_HOST=$(host_from_url "$base_url")
  set_idx_from_url "$base_url"
  log "Collection base: $base_url (host=$BASE_HOST)"
  local i current_url links
  for (( i = 0; i < PAGES; i++ )); do
    # NOTE(review): the patch was truncated at this loop; the ?page=N*30
    # offset scheme below is reconstructed — confirm against the site.
    if (( i == 0 )); then
      current_url=$base_url
    else
      current_url="${base_url}?page=$(( i * 30 ))"
    fi
    log "Current page: $current_url"
    if links=$(get_items "$current_url"); then
      do_list_of_posts "$links"
    else
      log "Failed to fetch page: $current_url"
    fi
    if (( i + 1 < PAGES )); then
      sleep "$SLEEP_BETWEEN_PAGES"
    fi
  done
}

# --------------------------------- CLI / main ----------------------------------
# NOTE(review): usage() and main() were lost to the patch truncation and are
# re-created from the usage examples at the top of the file and the
# NOOP/DEBUG/mode variables the script defines.
usage() {
  cat >&2 <<EOF
Usage: $SCRIPT_NAME [-n] [-d] (-c <collection-url> | -p <post-path> | -f <file>)
  -c URL   process a collection page (with pagination)
  -p PATH  process a single post, e.g. /post/6545eda2cb76e.html
  -f FILE  process every post path listed in FILE (one per line)
  -n       dry-run: resolve links but skip downloads
  -d       debug trace (set -x)
  -v       print version and exit
  -h       show this help
EOF
  exit 2
}

main() {
  local mode="" arg="" opt
  while getopts ':c:p:f:ndvh' opt; do
    case $opt in
      c) mode=collection; arg=$OPTARG ;;
      p) mode=post;       arg=$OPTARG ;;
      f) mode=file;       arg=$OPTARG ;;
      n) NOOP=1 ;;
      d) DEBUG=1 ;;
      v) printf '%s %s\n' "$SCRIPT_NAME" "$VERSION"; exit 0 ;;
      h) usage ;;
      :)  die "Option -$OPTARG requires an argument" ;;
      \?) die "Unknown option: -$OPTARG" ;;
    esac
  done
  shift $(( OPTIND - 1 ))
  [[ -n $mode ]] || usage
  (( DEBUG )) && set -x

  lock
  case $mode in
    collection)
      process_collection "$arg"
      ;;
    post)
      # Relative post paths have no host; the resolver targets sxyprn.com.
      BASE_HOST=${BASE_HOST:-sxyprn.com}
      set_idx_from_url "/"
      do_post "$arg"
      ;;
    file)
      [[ -r $arg ]] || die "Cannot read file: $arg"
      BASE_HOST=${BASE_HOST:-sxyprn.com}
      set_idx_from_url "/"
      do_list_of_posts "$(cat "$arg")"
      ;;
  esac
}

main "$@"