# Commit: "Update download.sh" — download.sh rewritten (+324/-239 lines).
#!/bin/bash
|
||||
idx="/storage/disk1/X/idx/blog/$(echo $base_url | cut -d'/' -f3-)/idx"
|
||||
LOCK_FILE="/tmp/my_script.lock"
|
||||
#!/usr/bin/env bash
|
||||
# Hardened rewrite of the provided script with safer defaults, clearer structure,
|
||||
# robust option parsing, and better error handling.
|
||||
#
|
||||
# Usage examples:
|
||||
# ./script.sh -c "https://example.com/Some-Casting-X.html"
|
||||
# ./script.sh -p "/post/6545eda2cb76e.html"
|
||||
# ./script.sh -f posts.txt
|
||||
# ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run
|
||||
|
||||
## Setup idx
|
||||
# Extract the directory from the file path
|
||||
directory_path="$(dirname "$idx")"
|
||||
set -Eeuo pipefail
|
||||
IFS=$'\n\t'
|
||||
|
||||
# Test if the directory exists, and if not, create it
|
||||
if [ ! -d "$directory_path" ]; then
|
||||
mkdir -p "$directory_path"
|
||||
fi
|
||||
SCRIPT_NAME=${0##*/}
|
||||
VERSION="1.2.0"
|
||||
|
||||
# Test if the file exists, and if not, create it
|
||||
if [ ! -e "$idx" ]; then
|
||||
touch "$idx"
|
||||
fi
|
||||
# ------------------------------- logging & utils -------------------------------
|
||||
log() { printf '[%s] %s\n' "$(date '+%F %T%z')" "$*"; }
|
||||
die() { log "ERROR: $*" >&2; exit 1; }
|
||||
need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; }
|
||||
|
||||
need curl
|
||||
need flock
|
||||
need awk
|
||||
need sed
|
||||
need grep
|
||||
need cut
|
||||
need dirname
|
||||
|
||||
# Safer temp dir for partial files, etc.
|
||||
TMPDIR=${TMPDIR:-/tmp}
|
||||
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
|
||||
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }
|
||||
|
||||
# ----------------------------- lock/unlock logic ------------------------------
|
||||
LOCK_FD=200
|
||||
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"
|
||||
lock() {
|
||||
echo "Creating lock.."
|
||||
exec 200>"$LOCK_FILE"
|
||||
flock -n 200 || ( echo "Lock failed - process exist!!" && exit 1 )
|
||||
log "Creating lock… ($LOCK_FILE)"
|
||||
# shellcheck disable=SC2094
|
||||
exec {LOCK_FD}>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
|
||||
if ! flock -n "$LOCK_FD"; then
|
||||
die "Lock failed — another process is running."
|
||||
fi
|
||||
}
|
||||
|
||||
unlock() {
|
||||
echo "Removing lock.."
|
||||
# Release the lock
|
||||
flock -u 200
|
||||
}
|
||||
_exit(){
|
||||
unlock
|
||||
exit $1
|
||||
log "Releasing lock…"
|
||||
flock -u "$LOCK_FD" || true
|
||||
}
|
||||
|
||||
get_link_for_item(){
|
||||
local item=$1
|
||||
local content
|
||||
# Always cleanup on exit/interrupt
|
||||
cleanup() {
|
||||
unlock || true
|
||||
cleanup_tmp || true
|
||||
}
|
||||
trap cleanup EXIT INT TERM
|
||||
|
||||
#echo "$item"
|
||||
# ------------------------------ output handling -------------------------------
|
||||
OUTDIR=${OUTDIR:-$PWD}
|
||||
mkdir -p "$OUTDIR"
|
||||
|
||||
# Use curl to download the URL and store its content in the 'content' variable
|
||||
# ---------------------------- idx path resolution -----------------------------
|
||||
IDX="" # will be set via set_idx_from_url
|
||||
|
||||
content=$(curl -s -X POST localhost:3000 -H "Content-Type: text/plain" --data "https://sxyprn.com$item") || {
|
||||
echo "Failed to download the URL: $item"
|
||||
return 1
|
||||
host_from_url() {
|
||||
# Extract host from absolute URL (https://host/...), or from path (/post/..) use fallback BASE_HOST
|
||||
# Usage: host_from_url "https://foo.bar/baz" -> foo.bar
|
||||
local u=$1
|
||||
if [[ $u == http*://* ]]; then
|
||||
# strip scheme
|
||||
u=${u#*//}
|
||||
printf '%s\n' "${u%%/*}"
|
||||
else
|
||||
# If relative path, we need BASE_HOST to be set by caller
|
||||
[[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
|
||||
printf '%s\n' "$BASE_HOST"
|
||||
fi
|
||||
}
|
||||
|
||||
#curl -X POST localhost:3000 -H "Content-Type: text/plain" --data 'https://sxyprn.com/post/653e2c6329e1c.html'
|
||||
#["MomPov E233 Malinda - 49 Year Old Horny Divorced Blonde MILF Beauty https://streamvid.net/ozfe24wrw95h #milf #casting #pov #anal - [01:08:17] (29.10.2023) on SexyPorn","https://sxyprn.com/cdn8/c9/22t1338zl607azp5q71zd1s4p6a/DjtVYfJJupZm-lC44cUtgw/1698771257/k615f1vfaardx6lcs07bsab3g6c/x86v5436eb27ck6836209zek16c.vid"]
|
||||
|
||||
echo "$content"
|
||||
}
|
||||
|
||||
log(){
|
||||
local message=$1
|
||||
echo $(date '+%F %H:%M:%S') "$message"
|
||||
}
|
||||
|
||||
get_items(){
|
||||
local url=$1
|
||||
local content
|
||||
local links
|
||||
|
||||
# Use curl to download the URL and store its content in the 'content' variable
|
||||
content=$(curl -s "$url") || {
|
||||
log "Failed to download the URL: $url"
|
||||
return 1
|
||||
}
|
||||
|
||||
# Print the stored content
|
||||
echo "$content" | sed 's/>/>\n/g' | grep "class='js-pop'" | awk -F"'" '{print $2}' | cut -d'?' -f1
|
||||
set_idx_from_url() {
|
||||
# idx base: /storage/disk1/X/idx/blog/<host+path>/idx
|
||||
local base_url=$1
|
||||
local host_and_tail
|
||||
if [[ $base_url == http*://* ]]; then
|
||||
host_and_tail=${base_url#*//} # host/...
|
||||
else
|
||||
[[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
|
||||
host_and_tail="$BASE_HOST/${base_url#/}"
|
||||
fi
|
||||
local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
|
||||
idx_root=${idx_root%/}
|
||||
IDX="$idx_root/idx"
|
||||
local dir
|
||||
dir=$(dirname "$IDX")
|
||||
mkdir -p "$dir"
|
||||
: >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
|
||||
}
|
||||
|
||||
link_exists_in_file() {
|
||||
local link_to_check="$1"
|
||||
if grep -Fxq "$link_to_check" "$idx"; then
|
||||
return 0 # Link exists in the file
|
||||
else
|
||||
return 1 # Link does not exist in the file
|
||||
fi
|
||||
[[ -n $IDX ]] || die "IDX är inte satt"
|
||||
local link_to_check=$1
|
||||
grep -Fqx -- "$link_to_check" "$IDX"
|
||||
}
|
||||
|
||||
append_to_idx() {
|
||||
[[ -n $IDX ]] || die "IDX är inte satt"
|
||||
printf '%s\n' "$1" >> "$IDX"
|
||||
}
|
||||
|
||||
# ------------------------------- network layer --------------------------------
|
||||
CURL_OPTS=(
|
||||
--fail-with-body
|
||||
--show-error
|
||||
--location
|
||||
--connect-timeout 10
|
||||
--max-time 0
|
||||
--retry 3
|
||||
--retry-delay 1
|
||||
--retry-connrefused
|
||||
--compressed
|
||||
-sS
|
||||
-A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
|
||||
)
|
||||
|
||||
fetch() {
|
||||
local url=$1
|
||||
curl "${CURL_OPTS[@]}" "$url"
|
||||
}
|
||||
|
||||
post_text() {
|
||||
local url=$1
|
||||
local body=$2
|
||||
curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$body" "$url"
|
||||
}
|
||||
|
||||
# ------------------------------- parsing layer --------------------------------
|
||||
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available.
|
||||
get_items() {
|
||||
local url=$1
|
||||
local content
|
||||
content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
|
||||
printf '%s\n' "$content" \
|
||||
| sed 's/>/>\n/g' \
|
||||
| grep "class='js-pop'" \
|
||||
| awk -F"'" '{print $2}' \
|
||||
| cut -d'?' -f1
|
||||
}
|
||||
|
||||
# Expecting a single line: ["<filename>","<download_url>"]
|
||||
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'
|
||||
|
||||
sanitize_filename() {
|
||||
local in=$1
|
||||
local truncated=${in:0:80}
|
||||
# replace anything not alnum, underscore, dot or dash with underscore
|
||||
printf '%s' "$truncated" | sed 's/[^A-Za-z0-9_.-]/_/g'
|
||||
}
|
||||
|
||||
# ------------------------------ core operations --------------------------------
|
||||
NOOP=0
|
||||
DEBUG=0
|
||||
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}
|
||||
PAGES=${PAGES:-10}
|
||||
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}
|
||||
|
||||
resolve_item_link() {
|
||||
local relative_item=$1
|
||||
post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item"
|
||||
}
|
||||
|
||||
download_and_save_link() {
|
||||
local url="$1"
|
||||
local filename="$2"
|
||||
local truncated="${filename:0:50}"
|
||||
local sanitized="${truncated//[^a-zA-Z0-9_.-]/_}"
|
||||
local url=$1
|
||||
local base_name=$2
|
||||
local sanitized
|
||||
sanitized=$(sanitize_filename "$base_name")
|
||||
local outpath="$OUTDIR/${sanitized}.mp4"
|
||||
log "Downloading $url -> $outpath"
|
||||
(( NOOP )) && { log "(dry-run) Skipping download"; return 0; }
|
||||
curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; }
|
||||
mv -f "$outpath.part" "$outpath"
|
||||
}
|
||||
|
||||
log "Downloading $url with name $sanitized"
|
||||
# Use curl to download the URL and save the content to the specified file
|
||||
if ! curl -L -o "$sanitized.mp4" "$url"; then
|
||||
log "Curl failed to download the URL: $url"
|
||||
do_post() {
|
||||
local line=$1
|
||||
[[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
|
||||
if link_exists_in_file "$line"; then
|
||||
log "Link already indexed: $line"
|
||||
return 0
|
||||
fi
|
||||
log "Resolving link: $line"
|
||||
local result
|
||||
if ! result=$(resolve_item_link "$line"); then
|
||||
log "Resolver failed for: $line"
|
||||
return 1
|
||||
fi
|
||||
if [[ $result =~ $JSON_PAIR_RE ]]; then
|
||||
local name=${BASH_REMATCH[1]}
|
||||
local url=${BASH_REMATCH[2]}
|
||||
if download_and_save_link "$url" "$name"; then
|
||||
log "Download success — updating index"
|
||||
append_to_idx "$line"
|
||||
else
|
||||
log "Download failed for: $url"
|
||||
return 1
|
||||
fi
|
||||
else
|
||||
log "Resolver returned unexpected payload: $result"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
pattern='^\["([^"]*)","([^"]*)"\]$'
|
||||
# Fetch post ie https://sxyprn.com/post/6545eda2cb76e.html
|
||||
do_post(){
|
||||
local line="$1"
|
||||
|
||||
if link_exists_in_file "$line"; then
|
||||
log "Link already exists: $line"
|
||||
else
|
||||
log "Resolving link $line"
|
||||
result="$(get_link_for_item "$line")"
|
||||
|
||||
# Test if the string matches the pattern
|
||||
if [[ $result =~ $pattern ]]; then
|
||||
val1="${BASH_REMATCH[1]}"
|
||||
val2="${BASH_REMATCH[2]}"
|
||||
if download_and_save_link "$val2" "$val1"; then
|
||||
log "Download success - updating index"
|
||||
echo "$line" >> "$idx"
|
||||
else
|
||||
log "Download failed"
|
||||
fi
|
||||
else
|
||||
log "no match $result"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# Iterate over rows of posts
|
||||
do_list_of_posts() {
|
||||
local links="$1"
|
||||
|
||||
# Iterate over rows of posts
|
||||
local links=$1
|
||||
while IFS= read -r line; do
|
||||
[[ -n $line ]] || continue
|
||||
do_post "$line"
|
||||
done <<< "$links"
|
||||
}
|
||||
|
||||
# Fetch a page of posts ie https://sxyprn.com/Woodman-Casting-X.html
|
||||
do_collection_page(){
|
||||
local current_url="$1"
|
||||
|
||||
log "Current page: $current_url"
|
||||
|
||||
# Download and parse out items
|
||||
links="$(get_items "$current_url")"
|
||||
|
||||
do_list_of_posts "$links"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Options -c, -p and -f cannot be used together."
|
||||
_exit 1
|
||||
}
|
||||
|
||||
collection=""
|
||||
post=""
|
||||
file=""
|
||||
debug=0
|
||||
while getopts ":n:d:c:p:f:" opt; do
|
||||
case $opt in
|
||||
d)
|
||||
set -x
|
||||
debug=1
|
||||
;;
|
||||
n)
|
||||
# No op
|
||||
no_op="1"
|
||||
;;
|
||||
c)
|
||||
# Collection
|
||||
if [ -n "$post" ] || [ -n "$file" ] || [ "$OPTARG" == "-p" ] || [ "$OPTARG" == "-f" ]; then
|
||||
usage
|
||||
fi
|
||||
collection="$OPTARG"
|
||||
;;
|
||||
p)
|
||||
# post
|
||||
if [ -n "$collection" ] || [ -n "$file" ] || [ "$OPTARG" == "-c" ] || [ "$OPTARG" == "-f" ]; then
|
||||
usage
|
||||
fi
|
||||
post="$OPTARG"
|
||||
;;
|
||||
f)
|
||||
# file of posts
|
||||
if [ -n "$collection" ] || [ -n "$post" ] || [ "$OPTARG" == "-c" ] || [ "$OPTARG" == "-p" ]; then
|
||||
usage
|
||||
fi
|
||||
file="$OPTARG"
|
||||
;;
|
||||
\?)
|
||||
echo "Invalid option: -$OPTARG"
|
||||
_exit 1
|
||||
;;
|
||||
:)
|
||||
echo "Option -$OPTARG requires an argument."
|
||||
_exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "debug: $debug"
|
||||
lock
|
||||
|
||||
# Collection
|
||||
if [ -n "$collection" ]; then
|
||||
echo "Scrapping collection.."
|
||||
base_url="$collection"
|
||||
## For pages 1 to x
|
||||
for ((i=0; i<10; i++)); do
|
||||
if [ $i -eq 0 ]; then
|
||||
# Process a collection page containing multiple posts (pagination supported)
|
||||
process_collection() {
|
||||
local base_url=$1
|
||||
BASE_HOST=$(host_from_url "$base_url")
|
||||
set_idx_from_url "$base_url"
|
||||
log "Collection base: $base_url (host=$BASE_HOST)"
|
||||
local i current_url
|
||||
for (( i=0; i<PAGES; i++ )); do
|
||||
if (( i == 0 )); then
|
||||
current_url="$base_url"
|
||||
else
|
||||
current_url="${base_url}?page=$(( i * 30 ))"
|
||||
fi
|
||||
|
||||
#Do collection page
|
||||
do_collection_page "$current_url"
|
||||
sleep 10
|
||||
log "Current page: $current_url"
|
||||
local links
|
||||
links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
|
||||
do_list_of_posts "$links"
|
||||
(( i+1 < PAGES )) && { log "Sleeping $SLEEP_BETWEEN_PAGES s"; sleep "$SLEEP_BETWEEN_PAGES"; }
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
# Single post
|
||||
if [ -n "$post" ]; then
|
||||
echo "Scrapping post.."
|
||||
process_single_post() {
|
||||
local post=$1
|
||||
# If post is relative like "/post/…", we need a BASE_HOST; attempt to guess from IDX if missing
|
||||
BASE_HOST=$(host_from_url "$post")
|
||||
set_idx_from_url "$post"
|
||||
do_post "$post"
|
||||
fi
|
||||
}
|
||||
|
||||
# File of posts
|
||||
if [ -n "$file" ]; then
|
||||
echo "Scrapping file"
|
||||
# Check if the file exists
|
||||
if [ -e "$file" ]; then
|
||||
# Open the file for reading
|
||||
process_file_of_posts() {
|
||||
local file=$1
|
||||
[[ -s $file ]] || die "File not found or empty: $file"
|
||||
# Determine BASE_HOST from first non-empty line if possible
|
||||
local first
|
||||
first=$(grep -vE '^\s*$' "$file" | head -n1)
|
||||
[[ -n $first ]] || die "No usable lines in: $file"
|
||||
BASE_HOST=$(host_from_url "$first")
|
||||
set_idx_from_url "$first"
|
||||
while IFS= read -r line; do
|
||||
# Process each line here, for example, echo it
|
||||
[[ -z $line ]] && continue
|
||||
do_post "$line"
|
||||
done < "$file"
|
||||
else
|
||||
echo "File not found: $file"
|
||||
fi
|
||||
}
|
||||
|
||||
# --------------------------------- CLI parsing ---------------------------------
|
||||
usage() {
|
||||
cat <<EOF
|
||||
$SCRIPT_NAME v$VERSION
|
||||
|
||||
Usage:
|
||||
$SCRIPT_NAME [-n] [-d] -c URL
|
||||
$SCRIPT_NAME [-n] [-d] -p POST
|
||||
$SCRIPT_NAME [-n] [-d] -f FILE
|
||||
|
||||
Options:
|
||||
-n Dry-run (no downloads, still logs and updates idx suppressed)
|
||||
-d Debug (set -x)
|
||||
-c URL Collection URL (first page URL)
|
||||
-p POST Single post path or URL (e.g. /post/abcd.html)
|
||||
-f FILE File with one post per line
|
||||
-h Help
|
||||
|
||||
Env vars:
|
||||
OUTDIR Output directory (default: current dir)
|
||||
PAGES How many collection pages to traverse (default: $PAGES)
|
||||
SLEEP_BETWEEN_PAGES Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
|
||||
LOCAL_RESOLVER_URL Resolver endpoint (default: $LOCAL_RESOLVER_URL)
|
||||
EOF
|
||||
}
|
||||
|
||||
collection=""; post=""; file=""
|
||||
while getopts ":ndc:p:f:h" opt; do
|
||||
case "$opt" in
|
||||
n) NOOP=1 ;;
|
||||
d) DEBUG=1 ;;
|
||||
c) collection=$OPTARG ;;
|
||||
p) post=$OPTARG ;;
|
||||
f) file=$OPTARG ;;
|
||||
h) usage; exit 0 ;;
|
||||
:) die "Option -$OPTARG requires an argument." ;;
|
||||
\?) die "Invalid option: -$OPTARG" ;;
|
||||
esac
|
||||
done
|
||||
|
||||
(( DEBUG )) && set -x
|
||||
|
||||
# Enforce mutual exclusivity between -c, -p, -f
|
||||
set -- "$collection" "$post" "$file"
|
||||
count=0
|
||||
for x in "$@"; do [[ -n $x ]] && ((count++)); done
|
||||
(( count == 1 )) || { usage; die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."; }
|
||||
|
||||
lock
|
||||
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"
|
||||
|
||||
if [[ -n $collection ]]; then
|
||||
process_collection "$collection"
|
||||
elif [[ -n $post ]]; then
|
||||
process_single_post "$post"
|
||||
elif [[ -n $file ]]; then
|
||||
process_file_of_posts "$file"
|
||||
fi
|
||||
|
||||
unlock
|
||||
# cleanup happens via trap
|
||||
exit 0
|
||||
|
||||
# (end of file — commit-viewer footer removed)