Update download.sh

This commit is contained in:
2025-09-29 21:18:53 +02:00
parent 569c5b6611
commit 80bc2c5f6f

View File

@ -1,239 +1,324 @@
#!/usr/bin/env bash
# Hardened rewrite of the provided script with safer defaults, clearer structure,
# robust option parsing, and better error handling.
#
# Usage examples:
#   ./script.sh -c "https://example.com/Some-Casting-X.html"
#   ./script.sh -p "/post/6545eda2cb76e.html"
#   ./script.sh -f posts.txt
#   ./script.sh -n -c "https://example.com/Some-Casting-X.html"   # dry-run

# Strict mode: -E inherits ERR traps, -e aborts on errors, -u on unset vars,
# pipefail propagates failures through pipelines.
set -Eeuo pipefail
IFS=$'\n\t'

SCRIPT_NAME=${0##*/}
VERSION="1.2.0"
# ------------------------------- logging & utils -------------------------------
# log: timestamped message to stdout.
log()  { printf '[%s] %s\n' "$(date '+%F %T%z')" "$*"; }
# die: log an error to stderr and abort.
die()  { log "ERROR: $*" >&2; exit 1; }
# need: assert an external tool is available in PATH.
need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; }

# Hard requirements for the pipeline below.
need curl
need flock
need awk
need sed
need grep
need cut
need dirname

# Safer temp dir for partial files, etc.; removed via cleanup_tmp on exit.
TMPDIR=${TMPDIR:-/tmp}
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }
# ----------------------------- lock/unlock logic ------------------------------
# LOCK_FD is only a default; the {LOCK_FD}> redirection below lets bash pick a
# free descriptor and store its number back into LOCK_FD.
LOCK_FD=200
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"

# Acquire a non-blocking exclusive lock; die if another instance holds it.
lock() {
  log "Creating lock… ($LOCK_FILE)"
  # shellcheck disable=SC2094
  exec {LOCK_FD}>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  if ! flock -n "$LOCK_FD"; then
    die "Lock failed — another process is running."
  fi
}

# Release the lock (best effort; never fails the script).
unlock() {
  log "Releasing lock…"
  flock -u "$LOCK_FD" || true
}

# Always cleanup on exit/interrupt: release the lock and remove the workdir.
cleanup() {
  unlock || true
  cleanup_tmp || true
}
trap cleanup EXIT INT TERM
# ------------------------------ output handling -------------------------------
# Downloads land in OUTDIR (overridable via environment; defaults to cwd).
OUTDIR=${OUTDIR:-$PWD}
mkdir -p "$OUTDIR"
# ---------------------------- idx path resolution -----------------------------
IDX=""   # path to the per-site index file; set via set_idx_from_url

# Extract the host from an absolute URL (https://host/...); for a relative
# path (/post/..) fall back to BASE_HOST, which the caller must provide.
# Usage: host_from_url "https://foo.bar/baz" -> foo.bar
host_from_url() {
  local u=$1
  if [[ $u == http*://* ]]; then
    # strip scheme, keep everything up to the first slash
    u=${u#*//}
    printf '%s\n' "${u%%/*}"
  else
    # Relative path: we need BASE_HOST to be set by the caller.
    [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
    printf '%s\n' "$BASE_HOST"
  fi
}
# Derive and prepare the index file for a URL.
# idx base: /storage/disk1/X/idx/blog/<host+path>/idx
# Sets the global IDX and ensures its directory and file exist.
set_idx_from_url() {
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//}   # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi

  local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
  idx_root=${idx_root%/}
  IDX="$idx_root/idx"

  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir"
  # BUGFIX: only create the index when it is missing. The previous
  # unconditional ': >"$IDX"' truncated the file on every run, wiping the
  # download history and causing everything to be re-downloaded.
  if [[ ! -e $IDX ]]; then
    : >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
  fi
}
# True iff the exact line already exists in the index file.
link_exists_in_file() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local link_to_check=$1
  # -F literal, -x whole line, -q quiet, -- protects leading dashes
  grep -Fqx -- "$link_to_check" "$IDX"
}

# Record a processed link in the index (one per line).
append_to_idx() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  printf '%s\n' "$1" >> "$IDX"
}
# ------------------------------- network layer --------------------------------
# Shared curl options: fail on HTTP errors (keeping the body), follow
# redirects, bounded connect timeout, retries, identify ourselves.
CURL_OPTS=(
  --fail-with-body
  --show-error
  --location
  --connect-timeout 10
  --max-time 0
  --retry 3
  --retry-delay 1
  --retry-connrefused
  --compressed
  -sS
  -A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)

# GET a URL to stdout.
fetch() {
  local url=$1
  curl "${CURL_OPTS[@]}" "$url"
}

# POST a plain-text body to a URL; response on stdout.
post_text() {
  local url=$1
  local body=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$body" "$url"
}
# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq'
# if available. Extracts hrefs of anchors carrying class='js-pop', with any
# query string stripped.
get_items() {
  local url=$1
  local content
  content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
  printf '%s\n' "$content" \
    | sed 's/>/>\n/g' \
    | grep "class='js-pop'" \
    | awk -F"'" '{print $2}' \
    | cut -d'?' -f1
}
# Expecting a single line: ["<filename>","<download_url>"]
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'

# Truncate to 80 chars and replace anything not [A-Za-z0-9_.-] with '_'.
sanitize_filename() {
  local in=$1
  local truncated=${in:0:80}
  printf '%s' "$truncated" | sed 's/[^A-Za-z0-9_.-]/_/g'
}
# ------------------------------ core operations --------------------------------
NOOP=0    # dry-run flag (-n): log but skip downloads
DEBUG=0   # trace flag (-d): enables set -x
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}
PAGES=${PAGES:-10}
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}

# Ask the local resolver service to turn a relative post path into a
# '["<name>","<url>"]' pair (see JSON_PAIR_RE).
resolve_item_link() {
  local relative_item=$1
  post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item"
}
# Download $1 (URL) to "$OUTDIR/<sanitized $2>.mp4".
# Writes to a .part file first and renames on success so partial downloads
# never masquerade as finished files. Honors NOOP (dry-run).
download_and_save_link() {
  local url=$1
  local base_name=$2
  local sanitized
  sanitized=$(sanitize_filename "$base_name")
  local outpath="$OUTDIR/${sanitized}.mp4"
  log "Downloading $url -> $outpath"
  (( NOOP )) && { log "(dry-run) Skipping download"; return 0; }
  curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; }
  mv -f "$outpath.part" "$outpath"
}
# Process a single post line: skip if already indexed, otherwise resolve via
# the local service, download, and record in the index on success.
# Returns non-zero on resolver/download/parse failure.
do_post() {
  local line=$1
  [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
  if link_exists_in_file "$line"; then
    log "Link already indexed: $line"
    return 0
  fi

  log "Resolving link: $line"
  local result
  if ! result=$(resolve_item_link "$line"); then
    log "Resolver failed for: $line"
    return 1
  fi

  # Resolver must answer with ["<filename>","<download_url>"]
  if [[ $result =~ $JSON_PAIR_RE ]]; then
    local name=${BASH_REMATCH[1]}
    local url=${BASH_REMATCH[2]}
    if download_and_save_link "$url" "$name"; then
      log "Download success — updating index"
      append_to_idx "$line"
    else
      log "Download failed for: $url"
      return 1
    fi
  else
    log "Resolver returned unexpected payload: $result"
    return 1
  fi
}
# Iterate over newline-separated post links, skipping empty lines, and
# process each one with do_post.
do_list_of_posts() {
  local links=$1
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line"
  done <<< "$links"
}
# Fetch a page of posts ie https://sxyprn.com/Woodman-Casting-X.html # Process a collection page containing multiple posts (pagination supported)
do_collection_page(){ process_collection() {
local current_url="$1" local base_url=$1
BASE_HOST=$(host_from_url "$base_url")
set_idx_from_url "$base_url"
log "Collection base: $base_url (host=$BASE_HOST)"
local i current_url
for (( i=0; i<PAGES; i++ )); do
if (( i == 0 )); then
current_url="$base_url"
else
current_url="${base_url}?page=$(( i * 30 ))"
fi
log "Current page: $current_url" log "Current page: $current_url"
local links
# Download and parse out items links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
links="$(get_items "$current_url")"
do_list_of_posts "$links" do_list_of_posts "$links"
(( i+1 < PAGES )) && { log "Sleeping $SLEEP_BETWEEN_PAGES s"; sleep "$SLEEP_BETWEEN_PAGES"; }
done
} }
# Handle one post path/URL: derive host + index location, then process it.
process_single_post() {
  local post=$1
  # If post is relative like "/post/…", host_from_url needs BASE_HOST;
  # for an absolute URL it is derived from the URL itself.
  BASE_HOST=$(host_from_url "$post")
  set_idx_from_url "$post"
  do_post "$post"
}
# Process a file containing one post per line. Host and index location are
# derived from the first non-empty line; blank lines are skipped.
process_file_of_posts() {
  local file=$1
  [[ -s $file ]] || die "File not found or empty: $file"
  local first
  first=$(grep -vE '^\s*$' "$file" | head -n1)
  [[ -n $first ]] || die "No usable lines in: $file"
  BASE_HOST=$(host_from_url "$first")
  set_idx_from_url "$first"
  while IFS= read -r line; do
    [[ -z $line ]] && continue
    do_post "$line"
  done < "$file"
}
# --------------------------------- CLI parsing ---------------------------------
# Print help text (expands current defaults into the Env vars section).
usage() {
  cat <<EOF
$SCRIPT_NAME v$VERSION

Usage:
  $SCRIPT_NAME [-n] [-d] -c URL
  $SCRIPT_NAME [-n] [-d] -p POST
  $SCRIPT_NAME [-n] [-d] -f FILE

Options:
  -n        Dry-run (no downloads, still logs and updates idx suppressed)
  -d        Debug (set -x)
  -c URL    Collection URL (first page URL)
  -p POST   Single post path or URL (e.g. /post/abcd.html)
  -f FILE   File with one post per line
  -h        Help

Env vars:
  OUTDIR                 Output directory (default: current dir)
  PAGES                  How many collection pages to traverse (default: $PAGES)
  SLEEP_BETWEEN_PAGES    Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
  LOCAL_RESOLVER_URL     Resolver endpoint (default: $LOCAL_RESOLVER_URL)
EOF
}
collection=""; post=""; file=""
while getopts ":ndc:p:f:h" opt; do
  case "$opt" in
    n) NOOP=1 ;;
    d) DEBUG=1 ;;
    c) collection=$OPTARG ;;
    p) post=$OPTARG ;;
    f) file=$OPTARG ;;
    h) usage; exit 0 ;;
    :)  die "Option -$OPTARG requires an argument." ;;
    \?) die "Invalid option: -$OPTARG" ;;
  esac
done

(( DEBUG )) && set -x

# Enforce mutual exclusivity between -c, -p, -f (exactly one is required).
set -- "$collection" "$post" "$file"
count=0
# BUGFIX: '((count++))' returns status 1 on the first increment (post-increment
# of 0 evaluates to 0), which aborts the script under 'set -e'. Use a plain
# arithmetic assignment instead.
for x in "$@"; do [[ -n $x ]] && count=$((count+1)); done
(( count == 1 )) || { usage; die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."; }
# ----------------------------------- main --------------------------------------
lock
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"

# Exactly one of these is non-empty (enforced above).
if [[ -n $collection ]]; then
  process_collection "$collection"
elif [[ -n $post ]]; then
  process_single_post "$post"
elif [[ -n $file ]]; then
  process_file_of_posts "$file"
fi

# lock release and temp-dir removal happen via the EXIT trap (cleanup)
exit 0