#!/usr/bin/env bash
|
|
# Hardened rewrite of the provided script with safer defaults, clearer structure,
|
|
# robust option parsing, and better error handling.
|
|
#
|
|
# Usage examples:
|
|
# ./script.sh -c "https://example.com/Some-Casting-X.html"
|
|
# ./script.sh -p "/post/6545eda2cb76e.html"
|
|
# ./script.sh -f posts.txt
|
|
# ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run
|
|
|
|
# Fail fast: -E makes the ERR trap inherit into functions, -e exits on
# unhandled errors, -u flags unset variables, pipefail surfaces a failure
# anywhere in a pipeline.
set -Eeuo pipefail
IFS=$'\n\t'

# Script constants; readonly guards against accidental reassignment later on.
SCRIPT_NAME=${0##*/}
VERSION="1.2.1"
readonly SCRIPT_NAME VERSION
# ------------------------------- logging & utils -------------------------------

# log: timestamped message to stderr.
log() {
  printf '[%s] %s\n' "$(date '+%F %T%z')" "$*" >&2
}

# die: log an error and abort the whole script with status 1.
die() {
  log "ERROR: $*"
  exit 1
}

# need: ensure an external command is in PATH, or abort.
need() {
  command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"
}
|
|
# Verify every external dependency up front so a missing tool fails early
# with a clear message instead of mid-run.
for dep in curl flock awk sed grep cut dirname; do
  need "$dep"
done
unset dep

log "Starting $SCRIPT_NAME v$VERSION (PID $$)"
|
|
# Safer temp dir for partial files, etc.
TMPDIR=${TMPDIR:-/tmp}
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")

# Remove the scratch directory. Guarded so an unset/empty WORKDIR can never
# turn this into 'rm -rf' of the wrong path, and '--' stops option parsing.
cleanup_tmp() {
  [[ -d ${WORKDIR:-} ]] && rm -rf -- "$WORKDIR" 2>/dev/null || true
}
|
# ----------------------------- lock/unlock logic ------------------------------
# Use a fixed numeric FD (200) for wider Bash compatibility (e.g., macOS bash 3.2).
LOCK_FD=200
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"
LOCK_HELD=0

# Acquire a non-blocking exclusive lock on LOCK_FILE, or die.
lock() {
  log "Creating lock… ($LOCK_FILE)"
  # BUG FIX: 'exec ${LOCK_FD}>"$LOCK_FILE"' does NOT redirect — bash only
  # honors a *literal* fd number before '>', so exec tried to run a command
  # named "200". Use the literal fd (kept in sync with LOCK_FD above).
  exec 200>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  if flock -n "$LOCK_FD"; then
    LOCK_HELD=1
  else
    die "Lock failed — another process is running."
  fi
}
|
|
# Release the lock if we hold it and close the descriptor.
unlock() {
  if (( LOCK_HELD )); then
    log "Releasing lock…"
    flock -u "$LOCK_FD" || true
    # BUG FIX: 'exec ${LOCK_FD}>&-' never closed the fd — bash requires a
    # literal fd number in redirections, so exec tried to run "200" and the
    # failure was silently swallowed by '|| true'. Use the literal fd.
    exec 200>&- || true
    LOCK_HELD=0
  fi
}
|
|
# Always cleanup on exit/interrupt
# Each step tolerates failure so the other still runs; the EXIT trap fires on
# normal exit, die(), and INT/TERM alike.
cleanup() {
  unlock || true
  cleanup_tmp || true
}
trap cleanup EXIT INT TERM
|
|
# ------------------------------ output handling -------------------------------
# Download destination: honor a pre-set OUTDIR, else fall back to the
# current working directory, and make sure it exists.
if [[ -z ${OUTDIR:-} ]]; then
  OUTDIR=$PWD
fi
mkdir -p "$OUTDIR"
|
|
# ---------------------------- idx path resolution -----------------------------
IDX="" # will be set via set_idx_from_url

# Print the host part of an absolute URL (https://host/... -> host).
# For a relative path, fall back to the caller-provided BASE_HOST, or die.
host_from_url() {
  local candidate=$1
  case $candidate in
    http*://*)
      # Drop the scheme, then everything after the first slash.
      candidate=${candidate#*//}
      printf '%s\n' "${candidate%%/*}"
      ;;
    *)
      [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
      printf '%s\n' "$BASE_HOST"
      ;;
  esac
}
|
|
# Derive the persistent index-file path for a URL and ensure it exists.
# idx base: /storage/disk1/X/idx/blog/<host+path>/idx
# Sets the global IDX; requires BASE_HOST when given a relative URL.
set_idx_from_url() {
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//} # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi
  local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
  idx_root=${idx_root%/}   # drop a trailing slash so we don't get //idx
  IDX="$idx_root/idx"
  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir"
  # BUG FIX: ': >"$IDX"' truncated the existing index on EVERY run, wiping
  # the record of already-downloaded links (link_exists_in_file always missed
  # and everything was re-downloaded). Only create the file when missing.
  if [[ ! -e $IDX ]]; then
    : >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
  fi
}
|
|
# Return 0 iff the given link is already recorded (as an exact whole line)
# in the idx file pointed to by the global IDX.
link_exists_in_file() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local needle=$1
  # -F literal, -x whole-line, -q quiet, '--' guards leading-dash links.
  grep -Fqx -- "$needle" "$IDX"
}
|
|
# Record a processed link by appending it as one line to the idx file.
append_to_idx() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local entry=$1
  printf '%s\n' "$entry" >> "$IDX"
}
|
# ------------------------------- network layer --------------------------------
# Shared curl options used by every request in this script.
CURL_OPTS=(
  --fail-with-body      # non-zero exit on HTTP >= 400 but keep the body (curl 7.76+)
  --show-error          # print errors even with -s
  --location            # follow redirects
  --connect-timeout 10  # seconds to establish the connection
  --max-time 0          # presumably "no overall transfer deadline" — TODO confirm for the local curl version
  --retry 3
  --retry-delay 1
  --retry-connrefused   # treat connection-refused as retryable
  --compressed
  -sS                   # silent progress, but still show errors
  # User agent embeds script identity for server-side log attribution.
  -A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)
|
|
# HTTP GET with the shared curl options; response body goes to stdout.
fetch() {
  local target=$1
  curl "${CURL_OPTS[@]}" "$target"
}
|
|
# POST a plain-text body to a URL with the shared curl options.
post_text() {
  local target=$1
  local payload=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$payload" "$target"
}
|
|
# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available.
# Fetch a page and print one item path per line: split the HTML after each
# '>', keep lines mentioning class='js-pop', take the second single-quoted
# field (the href), and strip any ?query suffix.
# NOTE(review): '\n' in a sed replacement inserts a newline with GNU sed;
# BSD sed emits a literal 'n' — confirm the target platform.
# With pipefail active, a page with no matches makes grep (and thus the
# function) return non-zero; callers treat that as "failed to extract".
get_items() {
  local url=$1
  local content
  content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
  printf '%s\n' "$content" \
    | sed 's/>/>\n/g' \
    | grep "class='js-pop'" \
    | awk -F"'" '{print $2}' \
    | cut -d'?' -f1
}
|
|
# Expecting a single line: ["<filename>","<download_url>"]
# Bash ERE with two capture groups consumed in do_post:
#   ${BASH_REMATCH[1]} = filename, ${BASH_REMATCH[2]} = download URL.
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'
|
# Make a string safe to use as a filename: truncate to 80 characters, then
# map every character outside [A-Za-z0-9_.-] to '_'. Result on stdout,
# without a trailing newline.
sanitize_filename() {
  local in=$1
  local truncated=${in:0:80}
  # BUG FIX: the previous 'sed' pipeline was line-oriented, so embedded
  # newlines survived "sanitization" and produced multi-line filenames.
  # Pure parameter expansion replaces newlines too and avoids a fork.
  printf '%s' "${truncated//[^A-Za-z0-9_.-]/_}"
}
|
|
# ------------------------------ core operations --------------------------------
NOOP=0   # 1 = dry-run (-n): log what would happen but skip downloads
DEBUG=0  # 1 = trace execution with 'set -x' (-d)
# Tunables, overridable via the environment:
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}  # seconds between collection pages
PAGES=${PAGES:-10}                              # max collection pages to traverse
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}  # resolver endpoint
|
# Ask the local resolver service to turn a relative post path into a
# '["<filename>","<download_url>"]' payload (matched by JSON_PAIR_RE).
resolve_item_link() {
  local rel_path=$1
  post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$rel_path"
}
|
# Download URL into OUTDIR as "<sanitized base_name>.mp4", writing through a
# ".part" temp file so interrupted transfers never leave a final-named file.
download_and_save_link() {
  local src=$1
  local raw_name=$2
  local safe_name
  safe_name=$(sanitize_filename "$raw_name")
  local final="$OUTDIR/${safe_name}.mp4"
  log "Downloading $src -> $final"
  if (( NOOP )); then
    log "(dry-run) Skipping download"
    return 0
  fi
  if ! curl "${CURL_OPTS[@]}" -o "$final.part" "$src"; then
    rm -f "$final.part"
    return 1
  fi
  mv -f "$final.part" "$final"
}
|
# Process one post link end-to-end: skip if already indexed, resolve it to a
# direct download URL via the local resolver, download, and only then record
# it in the idx file. Returns non-zero on resolver or download failure.
do_post() {
  local line=$1
  [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
  if link_exists_in_file "$line"; then
    log "Link already indexed: $line"
    return 0
  fi
  log "Resolving link: $line"
  local result
  if ! result=$(resolve_item_link "$line"); then
    log "Resolver failed for: $line"
    return 1
  fi
  # Resolver payload must be exactly ["<filename>","<download_url>"].
  if [[ $result =~ $JSON_PAIR_RE ]]; then
    local name=${BASH_REMATCH[1]}
    local url=${BASH_REMATCH[2]}
    if download_and_save_link "$url" "$name"; then
      # Index only after a confirmed download so failures get retried next run.
      log "Download success — updating index"
      append_to_idx "$line"
    else
      log "Download failed for: $url"
      return 1
    fi
  else
    log "Resolver returned unexpected payload: $result"
    return 1
  fi
}
|
|
# Process each non-empty line of $1 (a newline-separated list) as a post.
# BUG FIX: under 'set -e' a single failing do_post aborted the entire script,
# although callers (pagination loop) are written to keep going; log the
# failure and continue with the remaining posts instead.
do_list_of_posts() {
  local links=$1
  local line
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line" || log "Post failed (continuing): $line"
  done <<< "$links"
}
|
|
# Process a collection page containing multiple posts (pagination supported).
# Walks up to PAGES pages (page i maps to ?page=i*30), extracting and
# processing each page's items, sleeping between pages.
process_collection() {
  local base_url=$1
  BASE_HOST=$(host_from_url "$base_url")
  set_idx_from_url "$base_url"
  log "Collection base: $base_url (host=$BASE_HOST)"
  local i current_url links
  for (( i=0; i<PAGES; i++ )); do
    if (( i == 0 )); then
      current_url="$base_url"
    else
      current_url="${base_url}?page=$(( i * 30 ))"
    fi
    log "Current page: $current_url"
    links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
    do_list_of_posts "$links"
    # BUG FIX: '(( i+1 < PAGES )) && { … }' left status 1 on the final
    # iteration, so the for loop — and the function — returned 1 and
    # 'set -e' killed the whole script with exit 1. Use a plain 'if'.
    if (( i + 1 < PAGES )); then
      log "Sleeping $SLEEP_BETWEEN_PAGES s"
      sleep "$SLEEP_BETWEEN_PAGES"
    fi
  done
}
|
|
# Handle exactly one post, given as an absolute URL or a relative /post/… path.
process_single_post() {
  local target=$1
  # A relative path requires BASE_HOST; host_from_url enforces that and dies
  # with a clear message otherwise.
  BASE_HOST=$(host_from_url "$target")
  set_idx_from_url "$target"
  do_post "$target"
}
|
|
# Feed every non-empty line of FILE to do_post. The first usable line seeds
# BASE_HOST and the idx path for the whole batch.
process_file_of_posts() {
  local file=$1
  [[ -s $file ]] || die "File not found or empty: $file"
  # Determine BASE_HOST from the first non-blank line.
  # FIX: '[[:space:]]' instead of '\s' — '\s' is a GNU grep extension.
  local first
  first=$(grep -vE '^[[:space:]]*$' "$file" | head -n1)
  [[ -n $first ]] || die "No usable lines in: $file"
  BASE_HOST=$(host_from_url "$first")
  set_idx_from_url "$first"
  local line
  # BUG FIX: '|| [[ -n $line ]]' keeps a final line that lacks a trailing
  # newline (plain 'read' returns non-zero and the old loop dropped it).
  while IFS= read -r line || [[ -n $line ]]; do
    [[ -z $line ]] && continue
    # BUG FIX: under 'set -e' a failing do_post aborted the whole run;
    # log and continue with the remaining lines instead.
    do_post "$line" || log "Post failed (continuing): $line"
  done < "$file"
}
|
|
# --------------------------------- CLI parsing ---------------------------------
# Print help text to stdout. Interpolates the current defaults (PAGES,
# SLEEP_BETWEEN_PAGES, LOCAL_RESOLVER_URL) at call time, so it must be
# called after those variables are set.
usage() {
  cat <<EOF
$SCRIPT_NAME v$VERSION

Usage:
  $SCRIPT_NAME [-n] [-d] -c URL
  $SCRIPT_NAME [-n] [-d] -p POST
  $SCRIPT_NAME [-n] [-d] -f FILE

Options:
  -n        Dry-run (no downloads, still logs and updates idx suppressed)
  -d        Debug (set -x)
  -c URL    Collection URL (first page URL)
  -p POST   Single post path or URL (e.g. /post/abcd.html)
  -f FILE   File with one post per line
  -h        Help

Env vars:
  OUTDIR               Output directory (default: current dir)
  PAGES                How many collection pages to traverse (default: $PAGES)
  SLEEP_BETWEEN_PAGES  Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
  LOCAL_RESOLVER_URL   Resolver endpoint (default: $LOCAL_RESOLVER_URL)
EOF
}
|
|
collection=""; post=""; file=""
# Parse CLI flags (see usage for semantics). The leading ':' in the optstring
# enables silent error mode so the ':' and '\?' cases below emit our own
# messages via die instead of getopts' defaults.
while getopts ":ndc:p:f:h" opt; do
  case "$opt" in
    n) NOOP=1 ;;
    d) DEBUG=1 ;;
    c) collection=$OPTARG ;;
    p) post=$OPTARG ;;
    f) file=$OPTARG ;;
    h) usage; exit 0 ;;
    :) die "Option -$OPTARG requires an argument." ;;   # flag missing its value
    \?) die "Invalid option: -$OPTARG" ;;               # unknown flag
  esac
done
|
|
(( DEBUG )) && { log "Debug mode enabled"; set -x; }

# Enforce mutual exclusivity between -c, -p, -f: exactly one must be given.
# BUG FIX: the previous '[[ … ]] && ((count++))' killed the script under
# 'set -e' — the post-increment evaluates to 0 on the first hit, so (( ))
# returns status 1 as the *final* command of the && list, which set -e
# treats as a failure. A plain assignment always has status 0.
count=0
[[ -n $collection ]] && count=$((count + 1))
[[ -n $post ]] && count=$((count + 1))
[[ -n $file ]] && count=$((count + 1))

log "Arg summary: collection='${collection:-}' post='${post:-}' file='${file:-}' (count=$count)"
if (( count != 1 )); then
  usage
  die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."
fi
|
|
# Acquire the single-instance lock before doing any real work.
lock
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"

# Exactly one of these is non-empty (validated above), so exactly one
# branch runs.
if [[ -n $collection ]]; then
  process_collection "$collection"
elif [[ -n $post ]]; then
  process_single_post "$post"
elif [[ -n $file ]]; then
  process_file_of_posts "$file"
fi

# cleanup happens via trap
exit 0
|