#!/usr/bin/env bash
|
|
# Hardened rewrite of the provided script with safer defaults, clearer structure,
|
|
# robust option parsing, and better error handling.
|
|
#
|
|
# Usage examples:
|
|
# ./script.sh -c "https://example.com/Some-Casting-X.html"
|
|
# ./script.sh -p "/post/6545eda2cb76e.html"
|
|
# ./script.sh -f posts.txt
|
|
# ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run
|
|
|
|
# Fail fast: -E makes the ERR trap inherit into functions, -e exits on
# unhandled errors, -u flags unset variables, pipefail surfaces a failure
# anywhere in a pipeline.
set -Eeuo pipefail
IFS=$'\n\t'

# Script constants; readonly guards against accidental reassignment later on.
SCRIPT_NAME=${0##*/}
VERSION="1.2.1"
readonly SCRIPT_NAME VERSION
# ------------------------------- logging & utils -------------------------------

# log: timestamped message to stderr.
log() {
  printf '[%s] %s\n' "$(date '+%F %T%z')" "$*" >&2
}

# die: log an error and abort the whole script with status 1.
die() {
  log "ERROR: $*"
  exit 1
}

# need: ensure an external command is in PATH, or abort.
need() {
  command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"
}
|
|
# Verify every external dependency up front so a missing tool fails early
# with a clear message instead of mid-run.
for dep in curl flock awk sed grep cut dirname; do
  need "$dep"
done
unset dep

log "Starting $SCRIPT_NAME v$VERSION (PID $$)"
|
|
# Safer temp dir for partial files, etc.
TMPDIR=${TMPDIR:-/tmp}
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")

# Remove the scratch directory. Guarded so an unset/empty WORKDIR can never
# turn this into 'rm -rf' of the wrong path, and '--' stops option parsing.
cleanup_tmp() {
  [[ -d ${WORKDIR:-} ]] && rm -rf -- "$WORKDIR" 2>/dev/null || true
}
|
# ----------------------------- lock/unlock logic ------------------------------
# Use a fixed numeric FD (200) for wider Bash compatibility (e.g., macOS bash 3.2).
LOCK_FD=200
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"
LOCK_HELD=0

# Acquire a non-blocking exclusive lock on LOCK_FILE, or die.
lock() {
  log "Creating lock… ($LOCK_FILE)"
  # BUG FIX: 'exec ${LOCK_FD}>"$LOCK_FILE"' does NOT redirect — bash only
  # honors a *literal* fd number before '>', so exec tried to run a command
  # named "200". Use the literal fd (kept in sync with LOCK_FD above).
  exec 200>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  if flock -n "$LOCK_FD"; then
    LOCK_HELD=1
  else
    die "Lock failed — another process is running."
  fi
}
|
|
# Release the lock if we hold it and close the descriptor.
unlock() {
  if (( LOCK_HELD )); then
    log "Releasing lock…"
    flock -u "$LOCK_FD" || true
    # BUG FIX: 'exec ${LOCK_FD}>&-' never closed the fd — bash requires a
    # literal fd number in redirections, so exec tried to run "200" and the
    # failure was silently swallowed by '|| true'. Use the literal fd.
    exec 200>&- || true
    LOCK_HELD=0
  fi
}
|
|
# Always cleanup on exit/interrupt
# Each step tolerates failure so the other still runs; the EXIT trap fires on
# normal exit, die(), and INT/TERM alike.
cleanup() {
  unlock || true
  cleanup_tmp || true
}
trap cleanup EXIT INT TERM
|
|
# ------------------------------ output handling -------------------------------
# Download destination: honor a pre-set OUTDIR, else fall back to the
# current working directory, and make sure it exists.
if [[ -z ${OUTDIR:-} ]]; then
  OUTDIR=$PWD
fi
mkdir -p "$OUTDIR"
|
|
# ---------------------------- idx path resolution -----------------------------
IDX="" # will be set via set_idx_from_url

# Print the host part of an absolute URL (https://host/... -> host).
# For a relative path, fall back to the caller-provided BASE_HOST, or die.
host_from_url() {
  local candidate=$1
  case $candidate in
    http*://*)
      # Drop the scheme, then everything after the first slash.
      candidate=${candidate#*//}
      printf '%s\n' "${candidate%%/*}"
      ;;
    *)
      [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
      printf '%s\n' "$BASE_HOST"
      ;;
  esac
}
|
|
# Derive the persistent index-file path for a URL and ensure it exists.
# idx base: /storage/disk1/X/idx/blog/<host+path>/idx
# Sets the global IDX; requires BASE_HOST when given a relative URL.
set_idx_from_url() {
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//} # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi
  local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
  idx_root=${idx_root%/}   # drop a trailing slash so we don't get //idx
  IDX="$idx_root/idx"
  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir"
  # BUG FIX: ': >"$IDX"' truncated the existing index on EVERY run, wiping
  # the record of already-downloaded links (link_exists_in_file always missed
  # and everything was re-downloaded). Only create the file when missing.
  if [[ ! -e $IDX ]]; then
    : >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
  fi
}
|
|
# Return 0 iff the given link is already recorded (as an exact whole line)
# in the idx file pointed to by the global IDX.
link_exists_in_file() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local needle=$1
  # -F literal, -x whole-line, -q quiet, '--' guards leading-dash links.
  grep -Fqx -- "$needle" "$IDX"
}
|
|
# Record a processed link by appending it as one line to the idx file.
append_to_idx() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local entry=$1
  printf '%s\n' "$entry" >> "$IDX"
}
|
# ------------------------------- network layer --------------------------------
# Shared curl options used by every request in this script.
CURL_OPTS=(
  --fail-with-body      # non-zero exit on HTTP >= 400 but keep the body (curl 7.76+)
  --show-error          # print errors even with -s
  --location            # follow redirects
  --connect-timeout 10  # seconds to establish the connection
  --max-time 0          # presumably "no overall transfer deadline" — TODO confirm for the local curl version
  --retry 3
  --retry-delay 1
  --retry-connrefused   # treat connection-refused as retryable
  --compressed
  -sS                   # silent progress, but still show errors
  # User agent embeds script identity for server-side log attribution.
  -A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)
|
|
# HTTP GET with the shared curl options; response body goes to stdout.
fetch() {
  local target=$1
  curl "${CURL_OPTS[@]}" "$target"
}
|
|
# POST a plain-text body to a URL with the shared curl options.
post_text() {
  local target=$1
  local payload=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$payload" "$target"
}
|
|
# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available.
# Fetch a page and print one item path per line: split the HTML after each
# '>', keep lines mentioning class='js-pop', take the second single-quoted
# field (the href), and strip any ?query suffix.
# NOTE(review): '\n' in a sed replacement inserts a newline with GNU sed;
# BSD sed emits a literal 'n' — confirm the target platform.
# With pipefail active, a page with no matches makes grep (and thus the
# function) return non-zero; callers treat that as "failed to extract".
get_items() {
  local url=$1
  local content
  content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
  printf '%s\n' "$content" \
    | sed 's/>/>\n/g' \
    | grep "class='js-pop'" \
    | awk -F"'" '{print $2}' \
    | cut -d'?' -f1
}
|
|
# Expecting a single line: ["<filename>","<download_url>"]
# Bash ERE with two capture groups consumed in do_post:
#   ${BASH_REMATCH[1]} = filename, ${BASH_REMATCH[2]} = download URL.
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'
|
# Make a string safe to use as a filename: truncate to 80 characters, then
# map every character outside [A-Za-z0-9_.-] to '_'. Result on stdout,
# without a trailing newline.
sanitize_filename() {
  local in=$1
  local truncated=${in:0:80}
  # BUG FIX: the previous 'sed' pipeline was line-oriented, so embedded
  # newlines survived "sanitization" and produced multi-line filenames.
  # Pure parameter expansion replaces newlines too and avoids a fork.
  printf '%s' "${truncated//[^A-Za-z0-9_.-]/_}"
}
|
|
# ------------------------------ core operations --------------------------------
NOOP=0   # 1 = dry-run (-n): log what would happen but skip downloads
DEBUG=0  # 1 = trace execution with 'set -x' (-d)
# Tunables, overridable via the environment:
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}  # seconds between collection pages
PAGES=${PAGES:-10}                              # max collection pages to traverse
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}  # resolver endpoint
|
# Ask the local resolver service to turn a relative post path into a
# '["<filename>","<download_url>"]' payload (matched by JSON_PAIR_RE).
resolve_item_link() {
  local rel_path=$1
  post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$rel_path"
}
|
# Download URL into OUTDIR as "<sanitized base_name>.mp4", writing through a
# ".part" temp file so interrupted transfers never leave a final-named file.
download_and_save_link() {
  local src=$1
  local raw_name=$2
  local safe_name
  safe_name=$(sanitize_filename "$raw_name")
  local final="$OUTDIR/${safe_name}.mp4"
  log "Downloading $src -> $final"
  if (( NOOP )); then
    log "(dry-run) Skipping download"
    return 0
  fi
  if ! curl "${CURL_OPTS[@]}" -o "$final.part" "$src"; then
    rm -f "$final.part"
    return 1
  fi
  mv -f "$final.part" "$final"
}
|
# Process one post link end-to-end: skip if already indexed, resolve it to a
# direct download URL via the local resolver, download, and only then record
# it in the idx file. Returns non-zero on resolver or download failure.
do_post() {
  local line=$1
  [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
  if link_exists_in_file "$line"; then
    log "Link already indexed: $line"
    return 0
  fi
  log "Resolving link: $line"
  local result
  if ! result=$(resolve_item_link "$line"); then
    log "Resolver failed for: $line"
    return 1
  fi
  # Resolver payload must be exactly ["<filename>","<download_url>"].
  if [[ $result =~ $JSON_PAIR_RE ]]; then
    local name=${BASH_REMATCH[1]}
    local url=${BASH_REMATCH[2]}
    if download_and_save_link "$url" "$name"; then
      # Index only after a confirmed download so failures get retried next run.
      log "Download success — updating index"
      append_to_idx "$line"
    else
      log "Download failed for: $url"
      return 1
    fi
  else
    log "Resolver returned unexpected payload: $result"
    return 1
  fi
}
|
|
# Process each non-empty line of $1 (a newline-separated list) as a post.
# BUG FIX: under 'set -e' a single failing do_post aborted the entire script,
# although callers (pagination loop) are written to keep going; log the
# failure and continue with the remaining posts instead.
do_list_of_posts() {
  local links=$1
  local line
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line" || log "Post failed (continuing): $line"
  done <<< "$links"
}
|
|
# Process a collection page containing multiple posts (pagination supported).
# Walks up to PAGES pages (page i maps to ?page=i*30), extracting and
# processing each page's items, sleeping between pages.
process_collection() {
  local base_url=$1
  BASE_HOST=$(host_from_url "$base_url")
  set_idx_from_url "$base_url"
  log "Collection base: $base_url (host=$BASE_HOST)"
  local i current_url links
  for (( i=0; i<PAGES; i++ )); do
    if (( i == 0 )); then
      current_url="$base_url"
    else
      current_url="${base_url}?page=$(( i * 30 ))"
    fi
    log "Current page: $current_url"
    links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
    do_list_of_posts "$links"
    # BUG FIX: '(( i+1 < PAGES )) && { … }' left status 1 on the final
    # iteration, so the for loop — and the function — returned 1 and
    # 'set -e' killed the whole script with exit 1. Use a plain 'if'.
    if (( i + 1 < PAGES )); then
      log "Sleeping $SLEEP_BETWEEN_PAGES s"
      sleep "$SLEEP_BETWEEN_PAGES"
    fi
  done
}
|
|
# Handle exactly one post, given as an absolute URL or a relative /post/… path.
process_single_post() {
  local target=$1
  # A relative path requires BASE_HOST; host_from_url enforces that and dies
  # with a clear message otherwise.
  BASE_HOST=$(host_from_url "$target")
  set_idx_from_url "$target"
  do_post "$target"
}
|
|
# Feed every non-empty line of FILE to do_post. The first usable line seeds
# BASE_HOST and the idx path for the whole batch.
process_file_of_posts() {
  local file=$1
  [[ -s $file ]] || die "File not found or empty: $file"
  # Determine BASE_HOST from the first non-blank line.
  # FIX: '[[:space:]]' instead of '\s' — '\s' is a GNU grep extension.
  local first
  first=$(grep -vE '^[[:space:]]*$' "$file" | head -n1)
  [[ -n $first ]] || die "No usable lines in: $file"
  BASE_HOST=$(host_from_url "$first")
  set_idx_from_url "$first"
  local line
  # BUG FIX: '|| [[ -n $line ]]' keeps a final line that lacks a trailing
  # newline (plain 'read' returns non-zero and the old loop dropped it).
  while IFS= read -r line || [[ -n $line ]]; do
    [[ -z $line ]] && continue
    # BUG FIX: under 'set -e' a failing do_post aborted the whole run;
    # log and continue with the remaining lines instead.
    do_post "$line" || log "Post failed (continuing): $line"
  done < "$file"
}
|
|
# --------------------------------- CLI parsing ---------------------------------
# Print help text to stdout. Interpolates the current defaults (PAGES,
# SLEEP_BETWEEN_PAGES, LOCAL_RESOLVER_URL) at call time, so it must be
# called after those variables are set.
usage() {
  cat <<EOF
$SCRIPT_NAME v$VERSION

Usage:
  $SCRIPT_NAME [-n] [-d] -c URL
  $SCRIPT_NAME [-n] [-d] -p POST
  $SCRIPT_NAME [-n] [-d] -f FILE

Options:
  -n        Dry-run (no downloads, still logs and updates idx suppressed)
  -d        Debug (set -x)
  -c URL    Collection URL (first page URL)
  -p POST   Single post path or URL (e.g. /post/abcd.html)
  -f FILE   File with one post per line
  -h        Help

Env vars:
  OUTDIR               Output directory (default: current dir)
  PAGES                How many collection pages to traverse (default: $PAGES)
  SLEEP_BETWEEN_PAGES  Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
  LOCAL_RESOLVER_URL   Resolver endpoint (default: $LOCAL_RESOLVER_URL)
EOF
}
|
|
collection=""; post=""; file=""
# Parse CLI flags (see usage for semantics). The leading ':' in the optstring
# enables silent error mode so the ':' and '\?' cases below emit our own
# messages via die instead of getopts' defaults.
while getopts ":ndc:p:f:h" opt; do
  case "$opt" in
    n) NOOP=1 ;;
    d) DEBUG=1 ;;
    c) collection=$OPTARG ;;
    p) post=$OPTARG ;;
    f) file=$OPTARG ;;
    h) usage; exit 0 ;;
    :) die "Option -$OPTARG requires an argument." ;;   # flag missing its value
    \?) die "Invalid option: -$OPTARG" ;;               # unknown flag
  esac
done
|
|
(( DEBUG )) && { log "Debug mode enabled"; set -x; }

# Enforce mutual exclusivity between -c, -p, -f: exactly one must be given.
# BUG FIX: the previous '[[ … ]] && ((count++))' killed the script under
# 'set -e' — the post-increment evaluates to 0 on the first hit, so (( ))
# returns status 1 as the *final* command of the && list, which set -e
# treats as a failure. A plain assignment always has status 0.
count=0
[[ -n $collection ]] && count=$((count + 1))
[[ -n $post ]] && count=$((count + 1))
[[ -n $file ]] && count=$((count + 1))

log "Arg summary: collection='${collection:-}' post='${post:-}' file='${file:-}' (count=$count)"
if (( count != 1 )); then
  usage
  die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."
fi
|
|
# Acquire the single-instance lock before doing any real work.
lock
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"

# Exactly one of these is non-empty (validated above), so exactly one
# branch runs.
if [[ -n $collection ]]; then
  process_collection "$collection"
elif [[ -n $post ]]; then
  process_single_post "$post"
elif [[ -n $file ]]; then
  process_file_of_posts "$file"
fi

# cleanup happens via trap
exit 0
|