Files
FileDownloader/download.sh
Urban Modig 27d4318160 log
2025-10-01 11:06:25 +02:00

338 lines
9.2 KiB
Bash

#!/usr/bin/env bash
# Hardened rewrite of the provided script with safer defaults, clearer structure,
# robust option parsing, and better error handling.
#
# Usage examples:
# ./script.sh -c "https://example.com/Some-Casting-X.html"
# ./script.sh -p "/post/6545eda2cb76e.html"
# ./script.sh -f posts.txt
# ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run
# Strict mode: -E propagates ERR traps into functions, -e aborts on errors,
# -u flags unset variables, -o pipefail fails a pipeline on any stage.
set -Eeuo pipefail
# Restrict word-splitting to newline/tab (safer handling of space-y values).
IFS=$'\n\t'
SCRIPT_NAME=${0##*/}  # basename of the invoked script
VERSION="1.2.1"
# ------------------------------- logging & utils -------------------------------
# log MESSAGE... — timestamped diagnostic on stderr.
# The original had the '\n' of the printf format broken into a literal newline
# inside the quotes (paste-mangled); restore the conventional escape.
log() { printf '[%s] %s\n' "$(date '+%F %T%z')" "$*" >&2; }
# die MESSAGE... — log an error and abort the script with status 1.
die() { log "ERROR: $*"; exit 1; }
# need CMD — abort unless CMD is resolvable in PATH.
need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; }
# Fail fast if any required external tool is missing from PATH.
need curl
need flock
need awk
need sed
need grep
need cut
need dirname
log "Starting $SCRIPT_NAME v$VERSION (PID $$)"
# Safer temp dir for partial files, etc.
TMPDIR=${TMPDIR:-/tmp}
# Private, unpredictable scratch directory; removed by cleanup_tmp on exit.
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
# Best-effort removal of the scratch dir (must never fail the EXIT trap).
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }
# ----------------------------- lock/unlock logic ------------------------------
# Use a fixed numeric FD (200) for wider Bash compatibility (e.g., macOS bash 3.2).
# NOTE: the FD of an `exec N>file` redirection must be a literal number — bash
# parses redirections before expanding variables, so `exec ${LOCK_FD}>file`
# redirects *stdout* and then tries to execute a program named "200".
LOCK_FD=200   # still used by flock(1), which takes the FD as a normal argument
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"
LOCK_HELD=0   # 1 while this process owns the lock
# lock — acquire a non-blocking exclusive lock on LOCK_FILE via FD 200;
# dies if the lock file cannot be opened or another instance holds the lock.
lock() {
  log "Creating lock… ($LOCK_FILE)"
  exec 200>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  if flock -n "$LOCK_FD"; then
    LOCK_HELD=1
  else
    die "Lock failed — another process is running."
  fi
}
# unlock — release the flock and close FD 200. The FD in `exec 200>&-` must be
# a literal number: the original `exec ${LOCK_FD}>&-` parsed as "close stdout
# and exec the program '200'", because redirections are parsed before expansion.
unlock() {
  if (( LOCK_HELD )); then
    log "Releasing lock…"
    flock -u "$LOCK_FD" || true
    # Close the FD so no descriptor lingers past the lock's lifetime.
    exec 200>&- || true
    LOCK_HELD=0
  fi
}
# cleanup — single exit hook: drop the instance lock first, then remove the
# scratch directory. Both steps are best-effort so the trap can never fail.
cleanup() {
  unlock || true
  cleanup_tmp || true
}
# Fire on Ctrl-C / SIGTERM as well as on normal exit.
trap cleanup EXIT INT TERM
# ------------------------------ output handling -------------------------------
# Destination for downloaded files; overridable via env, defaults to CWD.
OUTDIR=${OUTDIR:-$PWD}
mkdir -p "$OUTDIR"
# ---------------------------- idx path resolution -----------------------------
IDX="" # will be set via set_idx_from_url
# host_from_url URL — print the host component of an absolute http(s) URL.
# For a relative path (e.g. /post/…) the caller must have set BASE_HOST,
# which is printed as-is; without it the script aborts.
host_from_url() {
  local candidate=$1
  if [[ $candidate != http*://* ]]; then
    # Relative link: fall back to the host established by the caller.
    [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
    printf '%s\n' "$BASE_HOST"
    return 0
  fi
  local without_scheme=${candidate#*//}   # drop "http://" / "https://"
  printf '%s\n' "${without_scheme%%/*}"   # keep everything before the first '/'
}
# set_idx_from_url URL — derive the persistent index file path for URL and
# ensure the file exists. The index lives at
#   /storage/disk1/X/idx/blog/<host+path>/idx
# and records every post already downloaded (see append_to_idx), so it must be
# preserved between runs. The original `: >"$IDX"` truncated it on every call,
# erasing that history and defeating the duplicate check in do_post.
set_idx_from_url() {
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//} # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi
  local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
  idx_root=${idx_root%/}   # strip a trailing slash, if any
  IDX="$idx_root/idx"
  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir"
  # Create only if missing — never truncate an existing index.
  [[ -e $IDX ]] || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
}
# link_exists_in_file LINK — succeed iff LINK is already recorded in $IDX.
# Match is literal (-F), whole-line (-x) and silent (-q); `--` protects
# links that start with a dash.
link_exists_in_file() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  grep -Fqx -- "$1" "$IDX"
}
# append_to_idx LINK — record LINK as downloaded by appending it to $IDX.
append_to_idx() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local link=$1
  printf '%s\n' "$link" >> "$IDX"
}
# ------------------------------- network layer --------------------------------
# Shared curl flags used by every request in this script.
CURL_OPTS=(
--fail-with-body    # non-2xx -> non-zero exit (requires curl >= 7.76)
--show-error        # still report errors despite -s
--location          # follow redirects
--connect-timeout 10
--max-time 0        # NOTE(review): presumably "no overall time limit" — confirm for the deployed curl version
--retry 3
--retry-delay 1
--retry-connrefused
--compressed
-sS                 # silent progress, errors shown (see --show-error)
-A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)
# fetch URL — GET the URL with the shared curl options; body on stdout.
fetch() {
  curl "${CURL_OPTS[@]}" "$1"
}
# post_text URL BODY — POST BODY as text/plain; response body on stdout.
post_text() {
  local endpoint=$1 payload=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$payload" "$endpoint"
}
# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available.
# get_items URL — fetch a collection page and print one post path per line.
# Pipeline: split the page so each tag ends a line, keep lines containing
# class='js-pop', take the text between the first pair of single quotes
# (expected to be the href), and strip any ?query suffix.
# NOTE(review): '\n' in the sed replacement is a GNU sed extension — confirm
# whether BSD sed support is required.
get_items() {
local url=$1
local content
content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
printf '%s\n' "$content" \
| sed 's/>/>\n/g' \
| grep "class='js-pop'" \
| awk -F"'" '{print $2}' \
| cut -d'?' -f1
}
# Expecting a single line: ["<filename>","<download_url>"]
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'
# sanitize_filename NAME — cap NAME at 80 characters and map everything
# outside [A-Za-z0-9_.-] to '_' so the result is safe as a local filename.
sanitize_filename() {
  local raw=$1
  local capped=${raw:0:80}
  printf '%s' "$capped" | sed 's/[^A-Za-z0-9_.-]/_/g'
}
# ------------------------------ core operations --------------------------------
NOOP=0    # 1 = dry-run: resolve links but skip downloads (-n)
DEBUG=0   # 1 = trace execution with set -x (-d)
# Tunables, overridable via environment:
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}  # polite pause between pages
PAGES=${PAGES:-10}                              # collection pages to traverse
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}
# resolve_item_link PATH — ask the local resolver service to turn a relative
# post path into a ["<filename>","<download_url>"] pair (printed on stdout).
resolve_item_link() {
local relative_item=$1
post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item"
}
# download_and_save_link URL NAME — download URL into $OUTDIR/<sanitized NAME>.mp4.
# Downloads go to a ".part" file first, so an interrupted transfer never leaves
# a final-looking file behind. Honors NOOP (dry-run). Non-zero on failure.
download_and_save_link() {
  local url=$1
  local target="$OUTDIR/$(sanitize_filename "$2").mp4"
  log "Downloading $url -> $target"
  if (( NOOP )); then
    log "(dry-run) Skipping download"
    return 0
  fi
  if ! curl "${CURL_OPTS[@]}" -o "$target.part" "$url"; then
    rm -f "$target.part"
    return 1
  fi
  mv -f "$target.part" "$target"
}
# do_post LINE — process one post link end-to-end:
#   1) skip if LINE is already recorded in the idx file,
#   2) resolve LINE to ["<name>","<url>"] via the local resolver,
#   3) download the file and, on success, record LINE in the idx.
# Returns non-zero on resolver failure, malformed payload, or download failure.
do_post() {
local line=$1
[[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
if link_exists_in_file "$line"; then
log "Link already indexed: $line"
return 0
fi
log "Resolving link: $line"
local result
if ! result=$(resolve_item_link "$line"); then
log "Resolver failed for: $line"
return 1
fi
# Payload must match JSON_PAIR_RE: ["<filename>","<download_url>"]
if [[ $result =~ $JSON_PAIR_RE ]]; then
local name=${BASH_REMATCH[1]}
local url=${BASH_REMATCH[2]}
if download_and_save_link "$url" "$name"; then
log "Download success — updating index"
# Index only after a confirmed download so failures are retried next run.
append_to_idx "$line"
else
log "Download failed for: $url"
return 1
fi
else
log "Resolver returned unexpected payload: $result"
return 1
fi
}
# do_list_of_posts LINKS — run do_post for each non-empty line in LINKS.
# A failing post is logged and skipped: previously `do_post "$line"` was the
# bare last command of the loop body, so under `set -e` a single bad link
# aborted the entire batch. Always returns 0 after attempting every link.
do_list_of_posts() {
  local links=$1
  local line
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line" || log "Skipping failed post: $line"
  done <<< "$links"
  return 0
}
# process_collection BASE_URL — walk up to PAGES pages of a collection and
# download every post found. Page i (i>0) is fetched as BASE_URL?page=<i*30>.
# NOTE: the original ended the loop body with `(( i+1 < PAGES )) && {…}`,
# which is false on the last page, making the loop — and the function — exit
# with status 1 and killing the script under `set -e` even on full success.
# An `if` statement avoids that.
process_collection() {
  local base_url=$1
  BASE_HOST=$(host_from_url "$base_url")
  set_idx_from_url "$base_url"
  log "Collection base: $base_url (host=$BASE_HOST)"
  local i current_url links
  for (( i=0; i<PAGES; i++ )); do
    if (( i == 0 )); then
      current_url="$base_url"
    else
      current_url="${base_url}?page=$(( i * 30 ))"
    fi
    log "Current page: $current_url"
    if ! links=$(get_items "$current_url"); then
      log "Failed to extract items on: $current_url"
      continue
    fi
    do_list_of_posts "$links" || log "Some posts failed on: $current_url"
    # Be polite between pages; skip the sleep after the final page.
    if (( i+1 < PAGES )); then
      log "Sleeping $SLEEP_BETWEEN_PAGES s"
      sleep "$SLEEP_BETWEEN_PAGES"
    fi
  done
  return 0
}
# process_single_post POST — establish host + idx for one post, then fetch it.
# A relative "/post/…" requires a derivable BASE_HOST; host_from_url aborts
# the script if it cannot determine one.
process_single_post() {
  local single=$1
  BASE_HOST=$(host_from_url "$single")
  set_idx_from_url "$single"
  do_post "$single"
}
# process_file_of_posts FILE — download every post listed (one per line) in FILE.
# BASE_HOST and the idx path are derived from the first non-blank line.
# Fixes vs. original: a failing do_post is logged and skipped instead of
# aborting the batch under `set -e`; a final line without a trailing newline
# is no longer silently dropped; the blank-line grep uses POSIX [[:space:]]
# instead of the GNU-only \s.
process_file_of_posts() {
  local file=$1
  [[ -s $file ]] || die "File not found or empty: $file"
  # Determine BASE_HOST from the first non-empty line.
  local first
  first=$(grep -vE '^[[:space:]]*$' "$file" | head -n1)
  [[ -n $first ]] || die "No usable lines in: $file"
  BASE_HOST=$(host_from_url "$first")
  set_idx_from_url "$first"
  local line
  # `|| [[ -n $line ]]` keeps a last line that lacks a trailing newline.
  while IFS= read -r line || [[ -n $line ]]; do
    [[ -z $line ]] && continue
    do_post "$line" || log "Skipping failed post: $line"
  done < "$file"
}
# --------------------------------- CLI parsing ---------------------------------
# usage — print the help text on stdout. The heredoc delimiter is unquoted on
# purpose so $SCRIPT_NAME/$VERSION and the current env-var defaults expand.
usage() {
cat <<EOF
$SCRIPT_NAME v$VERSION
Usage:
$SCRIPT_NAME [-n] [-d] -c URL
$SCRIPT_NAME [-n] [-d] -p POST
$SCRIPT_NAME [-n] [-d] -f FILE
Options:
-n Dry-run (no downloads, still logs and updates idx suppressed)
-d Debug (set -x)
-c URL Collection URL (first page URL)
-p POST Single post path or URL (e.g. /post/abcd.html)
-f FILE File with one post per line
-h Help
Env vars:
OUTDIR Output directory (default: current dir)
PAGES How many collection pages to traverse (default: $PAGES)
SLEEP_BETWEEN_PAGES Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
LOCAL_RESOLVER_URL Resolver endpoint (default: $LOCAL_RESOLVER_URL)
EOF
}
# Mode holders — exactly one of these must end up non-empty (checked below).
collection=""; post=""; file=""
# Leading ':' -> silent error reporting; ':' after a flag -> it takes an argument.
while getopts ":ndc:p:f:h" opt; do
case "$opt" in
n) NOOP=1 ;;
d) DEBUG=1 ;;
c) collection=$OPTARG ;;
p) post=$OPTARG ;;
f) file=$OPTARG ;;
h) usage; exit 0 ;;
:) die "Option -$OPTARG requires an argument." ;;
\?) die "Invalid option: -$OPTARG" ;;
esac
done
# Enable tracing only after option parsing so the getopts loop stays quiet.
if (( DEBUG )); then
  log "Debug mode enabled"
  set -x
fi
# Enforce mutual exclusivity between -c, -p, -f: exactly one must be set.
# NOTE: do not use `((count++))` here — post-increment evaluates to the old
# value 0, so the command exits with status 1 and, following the final `&&`,
# kills the script under `set -e` on every valid single-option invocation.
count=0
for x in "$collection" "$post" "$file"; do
  if [[ -n $x ]]; then
    count=$((count+1))
  fi
done
(( count == 1 )) || { usage; die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."; }
lock
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"
if [[ -n $collection ]]; then
  process_collection "$collection"
elif [[ -n $post ]]; then
  process_single_post "$post"
elif [[ -n $file ]]; then
  process_file_of_posts "$file"
fi
# cleanup happens via trap
exit 0