#!/usr/bin/env bash # Hardened rewrite of the provided script with safer defaults, clearer structure, # robust option parsing, and better error handling. # # Usage examples: # ./script.sh -c "https://example.com/Some-Casting-X.html" # ./script.sh -p "/post/6545eda2cb76e.html" # ./script.sh -f posts.txt # ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run set -Eeuo pipefail IFS=$'\n\t' SCRIPT_NAME=${0##*/} VERSION="1.2.2" # ------------------------------- logging & utils ------------------------------- log() { printf '[%s] %s ' "$(date '+%F %T%z')" "$*" >&2; } die() { log "ERROR: $*"; exit 1; } need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; } need curl need flock need awk need sed need grep need cut need dirname log "Starting $SCRIPT_NAME v$VERSION (PID $$)" # Safer temp dir for partial files, etc. TMPDIR=${TMPDIR:-/tmp} WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX") cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; } # ----------------------------- lock/unlock logic ------------------------------ # Use a fixed numeric FD (200) for wider Bash compatibility (e.g., macOS bash 3.2). LOCK_FILE="/tmp/${SCRIPT_NAME}.lock" LOCK_HELD=0 lock() { log "Creating lock… ($LOCK_FILE)" # Open lock file on FD 200 and try to acquire non-blocking lock exec 200>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE" if flock -n 200; then LOCK_HELD=1 else die "Lock failed — another process is running." fi } unlock() { if (( LOCK_HELD )); then log "Releasing lock…" flock -u 200 || true exec 200>&- || true LOCK_HELD=0 fi } # Always cleanup on exit/interrupt cleanup() { unlock || true cleanup_tmp || true } trap cleanup EXIT INT TERM # ------------------------------ output handling ------------------------------- OUTDIR=${OUTDIR:-$PWD} mkdir -p "$OUTDIR" # ---------------------------- idx path resolution ----------------------------- IDX="" # will be set via set_idx_from_url host_from_url() { # Extract host from absolute URL (https://host/...), or from path (/post/..) use fallback BASE_HOST # Usage: host_from_url "https://foo.bar/baz" -> foo.bar local u=$1 if [[ $u == http*://* ]]; then # strip scheme u=${u#*//} printf '%s\n' "${u%%/*}" else # If relative path, we need BASE_HOST to be set by caller [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST" printf '%s\n' "$BASE_HOST" fi } set_idx_from_url() { # idx base: /storage/disk1/X/idx/blog//idx local base_url=$1 local host_and_tail if [[ $base_url == http*://* ]]; then host_and_tail=${base_url#*//} # host/... else [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL" host_and_tail="$BASE_HOST/${base_url#/}" fi local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}" idx_root=${idx_root%/} IDX="$idx_root/idx" local dir dir=$(dirname "$IDX") mkdir -p "$dir" : >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX" } link_exists_in_file() { [[ -n $IDX ]] || die "IDX är inte satt" local link_to_check=$1 grep -Fqx -- "$link_to_check" "$IDX" } append_to_idx() { [[ -n $IDX ]] || die "IDX är inte satt" printf '%s\n' "$1" >> "$IDX" } # ------------------------------- network layer -------------------------------- CURL_OPTS=( --fail-with-body --show-error --location --connect-timeout 10 --max-time 0 --retry 3 --retry-delay 1 --retry-connrefused --compressed -sS -A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION" ) fetch() { local url=$1 curl "${CURL_OPTS[@]}" "$url" } post_text() { local url=$1 local body=$2 curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$body" "$url" } # ------------------------------- parsing layer -------------------------------- # NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available. get_items() { local url=$1 local content content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; } printf '%s\n' "$content" \ | sed 's/>/>\n/g' \ | grep "class='js-pop'" \ | awk -F"'" '{print $2}' \ | cut -d'?' -f1 } # Expecting a single line: ["",""] JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$' sanitize_filename() { local in=$1 local truncated=${in:0:80} # replace anything not alnum, underscore, dot or dash with underscore printf '%s' "$truncated" | sed 's/[^A-Za-z0-9_.-]/_/g' } # ------------------------------ core operations -------------------------------- NOOP=0 DEBUG=0 SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10} PAGES=${PAGES:-10} LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000} resolve_item_link() { local relative_item=$1 post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item" } download_and_save_link() { local url=$1 local base_name=$2 local sanitized sanitized=$(sanitize_filename "$base_name") local outpath="$OUTDIR/${sanitized}.mp4" log "Downloading $url -> $outpath" (( NOOP )) && { log "(dry-run) Skipping download"; return 0; } curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; } mv -f "$outpath.part" "$outpath" } do_post() { local line=$1 [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först" if link_exists_in_file "$line"; then log "Link already indexed: $line" return 0 fi log "Resolving link: $line" local result if ! result=$(resolve_item_link "$line"); then log "Resolver failed for: $line" return 1 fi if [[ $result =~ $JSON_PAIR_RE ]]; then local name=${BASH_REMATCH[1]} local url=${BASH_REMATCH[2]} if download_and_save_link "$url" "$name"; then log "Download success — updating index" append_to_idx "$line" else log "Download failed for: $url" return 1 fi else log "Resolver returned unexpected payload: $result" return 1 fi } do_list_of_posts() { local links=$1 while IFS= read -r line; do [[ -n $line ]] || continue do_post "$line" done <<< "$links" } # Process a collection page containing multiple posts (pagination supported) process_collection() { local base_url=$1 BASE_HOST=$(host_from_url "$base_url") set_idx_from_url "$base_url" log "Collection base: $base_url (host=$BASE_HOST)" local i current_url for (( i=0; i