Update download.sh

This commit is contained in:
2025-09-29 21:18:53 +02:00
parent 569c5b6611
commit 80bc2c5f6f

View File

@ -1,239 +1,324 @@
#!/usr/bin/env bash
# Hardened rewrite of the provided script with safer defaults, clearer structure,
# robust option parsing, and better error handling.
#
# Usage examples:
#   ./script.sh -c "https://example.com/Some-Casting-X.html"
#   ./script.sh -p "/post/6545eda2cb76e.html"
#   ./script.sh -f posts.txt
#   ./script.sh -n -c "https://example.com/Some-Casting-X.html"   # dry-run

# Strict mode: -E inherits ERR traps, -e aborts on errors, -u on unset vars,
# pipefail propagates failures through pipelines.
set -Eeuo pipefail
IFS=$'\n\t'

SCRIPT_NAME=${0##*/}
VERSION="1.2.0"
# ------------------------------- logging & utils -------------------------------
# log: timestamped message to stdout.
log()  { printf '[%s] %s\n' "$(date '+%F %T%z')" "$*"; }
# die: log an error to stderr and abort.
die()  { log "ERROR: $*" >&2; exit 1; }
# need: assert an external tool is available in PATH.
need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; }

# Hard requirements for the pipeline below.
need curl
need flock
need awk
need sed
need grep
need cut
need dirname

# Safer temp dir for partial files, etc.; removed via cleanup_tmp on exit.
TMPDIR=${TMPDIR:-/tmp}
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }
# ----------------------------- lock/unlock logic ------------------------------
# LOCK_FD is only a default; the {LOCK_FD}> redirection below lets bash pick a
# free descriptor and store its number back into LOCK_FD.
LOCK_FD=200
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"

# Acquire a non-blocking exclusive lock; die if another instance holds it.
lock() {
  log "Creating lock… ($LOCK_FILE)"
  # shellcheck disable=SC2094
  exec {LOCK_FD}>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  if ! flock -n "$LOCK_FD"; then
    die "Lock failed — another process is running."
  fi
}

# Release the lock (best effort; never fails the script).
unlock() {
  log "Releasing lock…"
  flock -u "$LOCK_FD" || true
}

# Always cleanup on exit/interrupt: release the lock and remove the workdir.
cleanup() {
  unlock || true
  cleanup_tmp || true
}
trap cleanup EXIT INT TERM
# ------------------------------ output handling -------------------------------
# Downloads land in OUTDIR (overridable via environment; defaults to cwd).
OUTDIR=${OUTDIR:-$PWD}
mkdir -p "$OUTDIR"
# ---------------------------- idx path resolution -----------------------------
IDX=""   # path to the per-site index file; set via set_idx_from_url

# Extract the host from an absolute URL (https://host/...); for a relative
# path (/post/..) fall back to BASE_HOST, which the caller must provide.
# Usage: host_from_url "https://foo.bar/baz" -> foo.bar
host_from_url() {
  local u=$1
  if [[ $u == http*://* ]]; then
    # strip scheme, keep everything up to the first slash
    u=${u#*//}
    printf '%s\n' "${u%%/*}"
  else
    # Relative path: we need BASE_HOST to be set by the caller.
    [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
    printf '%s\n' "$BASE_HOST"
  fi
}
# Derive and prepare the index file for a URL.
# idx base: /storage/disk1/X/idx/blog/<host+path>/idx
# Sets the global IDX and ensures its directory and file exist.
set_idx_from_url() {
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//}   # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi

  local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
  idx_root=${idx_root%/}
  IDX="$idx_root/idx"

  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir"
  # BUGFIX: only create the index when it is missing. The previous
  # unconditional ': >"$IDX"' truncated the file on every run, wiping the
  # download history and causing everything to be re-downloaded.
  if [[ ! -e $IDX ]]; then
    : >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
  fi
}
# True iff the exact line already exists in the index file.
link_exists_in_file() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local link_to_check=$1
  # -F literal, -x whole line, -q quiet, -- protects leading dashes
  grep -Fqx -- "$link_to_check" "$IDX"
}

# Record a processed link in the index (one per line).
append_to_idx() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  printf '%s\n' "$1" >> "$IDX"
}
# ------------------------------- network layer --------------------------------
# Shared curl options: fail on HTTP errors (keeping the body), follow
# redirects, bounded connect timeout, retries, identify ourselves.
CURL_OPTS=(
  --fail-with-body
  --show-error
  --location
  --connect-timeout 10
  --max-time 0
  --retry 3
  --retry-delay 1
  --retry-connrefused
  --compressed
  -sS
  -A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)

# GET a URL to stdout.
fetch() {
  local url=$1
  curl "${CURL_OPTS[@]}" "$url"
}

# POST a plain-text body to a URL; response on stdout.
post_text() {
  local url=$1
  local body=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$body" "$url"
}
# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq'
# if available. Extracts hrefs of anchors carrying class='js-pop', with any
# query string stripped.
get_items() {
  local url=$1
  local content
  content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
  printf '%s\n' "$content" \
    | sed 's/>/>\n/g' \
    | grep "class='js-pop'" \
    | awk -F"'" '{print $2}' \
    | cut -d'?' -f1
}
# Expecting a single line: ["<filename>","<download_url>"]
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'

# Truncate to 80 chars and replace anything not [A-Za-z0-9_.-] with '_'.
sanitize_filename() {
  local in=$1
  local truncated=${in:0:80}
  printf '%s' "$truncated" | sed 's/[^A-Za-z0-9_.-]/_/g'
}
# ------------------------------ core operations --------------------------------
NOOP=0    # dry-run flag (-n): log but skip downloads
DEBUG=0   # trace flag (-d): enables set -x
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}
PAGES=${PAGES:-10}
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}

# Ask the local resolver service to turn a relative post path into a
# '["<name>","<url>"]' pair (see JSON_PAIR_RE).
resolve_item_link() {
  local relative_item=$1
  post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item"
}
# Download $1 (URL) to "$OUTDIR/<sanitized $2>.mp4".
# Writes to a .part file first and renames on success so partial downloads
# never masquerade as finished files. Honors NOOP (dry-run).
download_and_save_link() {
  local url=$1
  local base_name=$2
  local sanitized
  sanitized=$(sanitize_filename "$base_name")
  local outpath="$OUTDIR/${sanitized}.mp4"
  log "Downloading $url -> $outpath"
  (( NOOP )) && { log "(dry-run) Skipping download"; return 0; }
  curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; }
  mv -f "$outpath.part" "$outpath"
}
# Process a single post line: skip if already indexed, otherwise resolve via
# the local service, download, and record in the index on success.
# Returns non-zero on resolver/download/parse failure.
do_post() {
  local line=$1
  [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
  if link_exists_in_file "$line"; then
    log "Link already indexed: $line"
    return 0
  fi

  log "Resolving link: $line"
  local result
  if ! result=$(resolve_item_link "$line"); then
    log "Resolver failed for: $line"
    return 1
  fi

  # Resolver must answer with ["<filename>","<download_url>"]
  if [[ $result =~ $JSON_PAIR_RE ]]; then
    local name=${BASH_REMATCH[1]}
    local url=${BASH_REMATCH[2]}
    if download_and_save_link "$url" "$name"; then
      log "Download success — updating index"
      append_to_idx "$line"
    else
      log "Download failed for: $url"
      return 1
    fi
  else
    log "Resolver returned unexpected payload: $result"
    return 1
  fi
}
# Iterate over newline-separated post links, skipping empty lines, and
# process each one with do_post.
do_list_of_posts() {
  local links=$1
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line"
  done <<< "$links"
}
# Fetch a page of posts ie https://sxyprn.com/Woodman-Casting-X.html # Process a collection page containing multiple posts (pagination supported)
do_collection_page(){ process_collection() {
local current_url="$1" local base_url=$1
BASE_HOST=$(host_from_url "$base_url")
set_idx_from_url "$base_url"
log "Collection base: $base_url (host=$BASE_HOST)"
local i current_url
for (( i=0; i<PAGES; i++ )); do
if (( i == 0 )); then
current_url="$base_url"
else
current_url="${base_url}?page=$(( i * 30 ))"
fi
log "Current page: $current_url" log "Current page: $current_url"
local links
# Download and parse out items links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
links="$(get_items "$current_url")"
do_list_of_posts "$links" do_list_of_posts "$links"
(( i+1 < PAGES )) && { log "Sleeping $SLEEP_BETWEEN_PAGES s"; sleep "$SLEEP_BETWEEN_PAGES"; }
done
} }
# Handle one post path/URL: derive host + index location, then process it.
process_single_post() {
  local post=$1
  # If post is relative like "/post/…", host_from_url needs BASE_HOST;
  # for an absolute URL it is derived from the URL itself.
  BASE_HOST=$(host_from_url "$post")
  set_idx_from_url "$post"
  do_post "$post"
}
# Process a file containing one post per line. Host and index location are
# derived from the first non-empty line; blank lines are skipped.
process_file_of_posts() {
  local file=$1
  [[ -s $file ]] || die "File not found or empty: $file"
  local first
  first=$(grep -vE '^\s*$' "$file" | head -n1)
  [[ -n $first ]] || die "No usable lines in: $file"
  BASE_HOST=$(host_from_url "$first")
  set_idx_from_url "$first"
  while IFS= read -r line; do
    [[ -z $line ]] && continue
    do_post "$line"
  done < "$file"
}
# --------------------------------- CLI parsing ---------------------------------
# Print help text (expands current defaults into the Env vars section).
usage() {
  cat <<EOF
$SCRIPT_NAME v$VERSION

Usage:
  $SCRIPT_NAME [-n] [-d] -c URL
  $SCRIPT_NAME [-n] [-d] -p POST
  $SCRIPT_NAME [-n] [-d] -f FILE

Options:
  -n        Dry-run (no downloads, still logs and updates idx suppressed)
  -d        Debug (set -x)
  -c URL    Collection URL (first page URL)
  -p POST   Single post path or URL (e.g. /post/abcd.html)
  -f FILE   File with one post per line
  -h        Help

Env vars:
  OUTDIR                 Output directory (default: current dir)
  PAGES                  How many collection pages to traverse (default: $PAGES)
  SLEEP_BETWEEN_PAGES    Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
  LOCAL_RESOLVER_URL     Resolver endpoint (default: $LOCAL_RESOLVER_URL)
EOF
}
collection=""; post=""; file=""
while getopts ":ndc:p:f:h" opt; do
  case "$opt" in
    n) NOOP=1 ;;
    d) DEBUG=1 ;;
    c) collection=$OPTARG ;;
    p) post=$OPTARG ;;
    f) file=$OPTARG ;;
    h) usage; exit 0 ;;
    :)  die "Option -$OPTARG requires an argument." ;;
    \?) die "Invalid option: -$OPTARG" ;;
  esac
done

(( DEBUG )) && set -x

# Enforce mutual exclusivity between -c, -p, -f (exactly one is required).
set -- "$collection" "$post" "$file"
count=0
# BUGFIX: '((count++))' returns status 1 on the first increment (post-increment
# of 0 evaluates to 0), which aborts the script under 'set -e'. Use a plain
# arithmetic assignment instead.
for x in "$@"; do [[ -n $x ]] && count=$((count+1)); done
(( count == 1 )) || { usage; die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."; }
# ----------------------------------- main --------------------------------------
lock
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"

# Exactly one of these is non-empty (enforced above).
if [[ -n $collection ]]; then
  process_collection "$collection"
elif [[ -n $post ]]; then
  process_single_post "$post"
elif [[ -n $file ]]; then
  process_file_of_posts "$file"
fi

# lock release and temp-dir removal happen via the EXIT trap (cleanup)
exit 0