Update download.sh

This commit is contained in:
2025-09-29 21:18:53 +02:00
parent 569c5b6611
commit 80bc2c5f6f

View File

@@ -1,239 +1,324 @@
#!/usr/bin/env bash
# Hardened rewrite of the provided script with safer defaults, clearer structure,
# robust option parsing, and better error handling.
#
# Usage examples:
# ./script.sh -c "https://example.com/Some-Casting-X.html"
# ./script.sh -p "/post/6545eda2cb76e.html"
# ./script.sh -f posts.txt
# ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run
#
# BUG FIX: the merged diff left the old preamble interleaved here — a second
# shebang, an 'idx=' assignment that dereferences $base_url before anything has
# defined it (fatal under 'set -u'), a stale LOCK_FILE, and mkdir/touch setup
# that set_idx_from_url() now performs. That residue is removed.

set -Eeuo pipefail
IFS=$'\n\t'

SCRIPT_NAME=${0##*/}
VERSION="1.2.0"
# ------------------------------- logging & utils -------------------------------
# Timestamped logger: "[YYYY-MM-DD HH:MM:SS+ZZZZ] message" on stdout.
log() {
  local stamp
  stamp=$(date '+%F %T%z')
  printf '[%s] %s\n' "$stamp" "$*"
}

# Log an error to stderr and abort the whole script.
die() {
  log "ERROR: $*" >&2
  exit 1
}

# Abort unless the given external command is available in PATH.
need() {
  command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"
}
# NOTE(review): merge residue — this is the start of the pre-rewrite lock();
# its remaining body and closing '}' were interleaved away by the diff, so this
# fragment does not parse. It is superseded by the lock() defined further down.
# Also note the original bug here: '( echo … && exit 1 )' only exits the
# subshell, not the script, so a failed lock would not actually stop execution.
lock(){
echo "Creating lock.."
exec 200>"$LOCK_FILE"
flock -n 200 || ( echo "Lock failed - process exist!!" && exit 1 )
# Fail fast if any required external tool is missing (see need() above).
need curl
need flock
need awk
need sed
need grep
need cut
need dirname
# Safer temp dir for partial files, etc.
# mktemp gives an unpredictable, private scratch directory under $TMPDIR.
TMPDIR=${TMPDIR:-/tmp}
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
# Best-effort removal of the scratch dir; never propagates an error.
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }
# ----------------------------- lock/unlock logic ------------------------------
LOCK_FD=200
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"

# Acquire a non-blocking exclusive flock; aborts if another instance runs.
# 'exec {LOCK_FD}>' opens the file and stores the fresh fd number in LOCK_FD.
lock() {
  log "Creating lock… ($LOCK_FILE)"
  exec {LOCK_FD}>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
  flock -n "$LOCK_FD" || die "Lock failed — another process is running."
}

# Release the flock; failures are swallowed so cleanup can always proceed.
unlock() {
  log "Releasing lock…"
  flock -u "$LOCK_FD" || true
}
# NOTE(review): merge residue — the pre-rewrite unlock(), whose closing '}' was
# lost in the diff (the file does not parse from here). The active
# implementation is the unlock() defined above; this one also hard-codes fd 200.
unlock(){
echo "Removing lock.."
# Release the lock
flock -u 200
# Always cleanup on exit/interrupt
cleanup() {
  # Best effort only: release the lock, then drop the scratch dir.
  # Neither step may fail the trap handler.
  unlock || true
  cleanup_tmp || true
}
# NOTE(review): merge residue — the pre-rewrite _exit() helper, unterminated
# (no closing '}'); replaced by the EXIT/INT/TERM trap registered just below.
# '$1' is also unquoted here.
_exit(){
unlock
exit $1
# Run cleanup() on normal exit and on SIGINT/SIGTERM.
trap cleanup EXIT INT TERM
# ------------------------------ output handling -------------------------------
# Downloads land in OUTDIR (overridable via environment; defaults to cwd).
OUTDIR=${OUTDIR:-$PWD}
mkdir -p "$OUTDIR"
# ---------------------------- idx path resolution -----------------------------
IDX="" # will be set via set_idx_from_url
# Print the host component of a URL. Absolute URLs ("scheme://host/...") yield
# their own host; relative paths fall back to the caller-provided BASE_HOST.
host_from_url() {
  local url=$1
  if [[ $url == http*://* ]]; then
    local stripped=${url#*//}   # drop "scheme://"
    printf '%s\n' "${stripped%%/*}"
  else
    [[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
    printf '%s\n' "$BASE_HOST"
  fi
}
# Resolve one post item to a '["<title>","<video url>"]' pair by POSTing the
# absolute post URL to a resolver service on localhost:3000.
# NOTE(review): superseded by resolve_item_link() later in the file. Also, the
# failure message is echoed to *stdout*, so a caller capturing $(...) would
# receive it as data rather than see it on stderr.
get_link_for_item(){
local item=$1
local content
#echo "$item"
# Use curl to download the URL and store its content in the 'content' variable
content=$(curl -s -X POST localhost:3000 -H "Content-Type: text/plain" --data "https://sxyprn.com$item") || {
echo "Failed to download the URL: $item"
return 1
}
#curl -X POST localhost:3000 -H "Content-Type: text/plain" --data 'https://sxyprn.com/post/653e2c6329e1c.html'
#["MomPov E233 Malinda - 49 Year Old Horny Divorced Blonde MILF Beauty https://streamvid.net/ozfe24wrw95h #milf #casting #pov #anal - [01:08:17] (29.10.2023) on SexyPorn","https://sxyprn.com/cdn8/c9/22t1338zl607azp5q71zd1s4p6a/DjtVYfJJupZm-lC44cUtgw/1698771257/k615f1vfaardx6lcs07bsab3g6c/x86v5436eb27ck6836209zek16c.vid"]
echo "$content"
}
# Log a message prefixed with a local timestamp (YYYY-MM-DD HH:MM:SS).
# BUG FIX: the original used 'echo $(date …) "$message"' — the unquoted command
# substitution is subject to word-splitting/globbing (especially with this
# script's modified IFS). Quote it and use printf.
log(){
  local message=$1
  printf '%s %s\n' "$(date '+%F %H:%M:%S')" "$message"
}
# NOTE(review): merge residue — the pre-rewrite get_items(); its closing '}'
# was consumed by the diff (the next line starts set_idx_from_url), so the file
# does not parse here. The active implementation is the later get_items().
get_items(){
local url=$1
local content
local links
# Use curl to download the URL and store its content in the 'content' variable
content=$(curl -s "$url") || {
log "Failed to download the URL: $url"
return 1
}
# Print the stored content
echo "$content" | sed 's/>/>\n/g' | grep "class='js-pop'" | awk -F"'" '{print $2}' | cut -d'?' -f1
# Derive the on-disk index file for a collection/post URL and ensure it exists.
# Layout: ${IDX_BASE}/<host>/<path>/idx — IDX_BASE is overridable via the
# environment (useful for testing) and defaults to the historical location.
# Sets the global IDX. Requires BASE_HOST when given a relative URL.
set_idx_from_url() {
  local base_url=$1
  local host_and_tail
  if [[ $base_url == http*://* ]]; then
    host_and_tail=${base_url#*//} # host/...
  else
    [[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
    host_and_tail="$BASE_HOST/${base_url#/}"
  fi
  local idx_root="${IDX_BASE:-/storage/disk1/X/idx/blog}/${host_and_tail}"
  idx_root=${idx_root%/}
  IDX="$idx_root/idx"
  local dir
  dir=$(dirname "$IDX")
  mkdir -p "$dir" || die "Kunde inte skapa idx: $IDX"
  # BUG FIX: the previous ': >"$IDX"' TRUNCATED the index on every invocation,
  # wiping the record of already-downloaded posts and defeating
  # link_exists_in_file(). Only create the file when it is missing.
  if [[ ! -e $IDX ]]; then
    touch "$IDX" || die "Kunde inte skapa idx: $IDX"
  fi
}
# Return 0 iff the given link is already recorded (exact-line match) in $IDX.
# BUG FIX: the merged diff left the pre-rewrite body in place, which grepped
# the undefined lower-case '$idx' and unconditionally returned before the new
# logic could run — so the check always answered against a nonexistent file.
link_exists_in_file() {
  [[ -n $IDX ]] || die "IDX är inte satt"
  local link_to_check=$1
  # -F literal match, -x whole line, -q quiet; '--' protects a leading '-'.
  grep -Fqx -- "$link_to_check" "$IDX"
}
# Persist one processed link into the index file (one entry per line).
append_to_idx() {
  local entry=$1
  [[ -n $IDX ]] || die "IDX är inte satt"
  printf '%s\n' "$entry" >> "$IDX"
}
# ------------------------------- network layer --------------------------------
# Shared curl defaults used by fetch(), post_text() and the downloader.
CURL_OPTS=(
--fail-with-body
--show-error
--location
--connect-timeout 10
# NOTE(review): --max-time 0 presumably means "no overall time cap" (large
# video downloads) — confirm against the curl man page.
--max-time 0
--retry 3
--retry-delay 1
--retry-connrefused
--compressed
-sS
# Custom User-Agent; SCRIPT_NAME/VERSION are defined at the top of the file.
-A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
)
# HTTP GET with the shared curl defaults; the response body goes to stdout.
fetch() {
  curl "${CURL_OPTS[@]}" "$1"
}
# HTTP POST of a plain-text payload with the shared curl defaults.
post_text() {
  local endpoint=$1 payload=$2
  curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$payload" "$endpoint"
}
# ------------------------------- parsing layer --------------------------------
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available.
# Extract item links (the href preceding class='js-pop') from a collection
# page; prints one path per line with the query string stripped. Returns 1
# only when the page itself cannot be fetched — an empty item list is valid.
get_items() {
  local url=$1
  local content
  content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
  # BUG FIX: grep exits 1 when there are zero matches, which under the script's
  # 'set -o pipefail' made an empty-but-valid page look like a failure.
  # Neutralise that status — "no items" is not an error.
  printf '%s\n' "$content" \
    | sed 's/>/>\n/g' \
    | grep "class='js-pop'" \
    | awk -F"'" '{print $2}' \
    | cut -d'?' -f1 \
    || true
}
# Resolver responses are expected as one line: ["<filename>","<download_url>"]
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'

# Keep at most 80 characters, then map anything outside [A-Za-z0-9_.-] to '_'.
sanitize_filename() {
  local trimmed=${1:0:80}
  printf '%s' "$trimmed" | sed 's/[^A-Za-z0-9_.-]/_/g'
}
# ------------------------------ core operations --------------------------------
# Runtime flags and tunables; the env-var overrides are documented in usage().
NOOP=0 # -n: dry-run — resolve links but skip actual downloads
DEBUG=0 # -d: enable 'set -x' tracing
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10} # seconds between collection pages
PAGES=${PAGES:-10} # how many collection pages to traverse
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000} # resolver endpoint
# Ask the local resolver service to translate a relative post path into a
# '["<name>","<url>"]' pair (shape described by JSON_PAIR_RE).
resolve_item_link() {
  local rel=$1
  post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com${rel}"
}
# Download URL ($1) to "$OUTDIR/<sanitized $2>.mp4", staging through a .part
# file so an interrupted transfer never leaves a half-written .mp4 behind.
# Honors NOOP (dry-run). Returns non-zero when curl fails.
# BUG FIX: the merged diff duplicated the locals — the old 50-char truncation
# ('local truncated'/'local sanitized') was computed and then immediately
# overwritten by the new 80-char sanitize_filename() path. Dead code removed.
download_and_save_link() {
  local url=$1
  local base_name=$2
  local sanitized
  sanitized=$(sanitize_filename "$base_name")
  local outpath="$OUTDIR/${sanitized}.mp4"
  log "Downloading $url -> $outpath"
  (( NOOP )) && { log "(dry-run) Skipping download"; return 0; }
  curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; }
  mv -f "$outpath.part" "$outpath"
}
# NOTE(review): merge residue — stray lines from the pre-rewrite download
# function; the 'if ! curl …; then' below has no matching 'fi', so the file
# does not parse here. The active implementation is download_and_save_link()
# above. This fragment should be deleted.
log "Downloading $url with name $sanitized"
# Use curl to download the URL and save the content to the specified file
if ! curl -L -o "$sanitized.mp4" "$url"; then
log "Curl failed to download the URL: $url"
# Process a single post path: skip it if already indexed, otherwise resolve it
# through the local resolver, download the video, and record it in the index.
do_post() {
  local line=$1
  [[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"

  if link_exists_in_file "$line"; then
    log "Link already indexed: $line"
    return 0
  fi

  log "Resolving link: $line"
  local result
  result=$(resolve_item_link "$line") || {
    log "Resolver failed for: $line"
    return 1
  }

  # Expect a single '["<name>","<url>"]' line from the resolver.
  if [[ ! $result =~ $JSON_PAIR_RE ]]; then
    log "Resolver returned unexpected payload: $result"
    return 1
  fi

  local name=${BASH_REMATCH[1]}
  local url=${BASH_REMATCH[2]}
  if download_and_save_link "$url" "$name"; then
    log "Download success — updating index"
    append_to_idx "$line"
  else
    log "Download failed for: $url"
    return 1
  fi
}
# NOTE(review): merge residue — the pre-rewrite do_post() and its 'pattern'
# regex. Because bash keeps the LAST definition of a function, this version
# overrides the newer do_post() defined earlier in this file at runtime; it
# depends on the old globals '$idx'/'$pattern' and on get_link_for_item(),
# all of which the rewrite replaced. This whole block should be deleted.
pattern='^\["([^"]*)","([^"]*)"\]$'
# Fetch post ie https://sxyprn.com/post/6545eda2cb76e.html
do_post(){
local line="$1"
if link_exists_in_file "$line"; then
log "Link already exists: $line"
else
log "Resolving link $line"
result="$(get_link_for_item "$line")"
# Test if the string matches the pattern
if [[ $result =~ $pattern ]]; then
val1="${BASH_REMATCH[1]}"
val2="${BASH_REMATCH[2]}"
if download_and_save_link "$val2" "$val1"; then
log "Download success - updating index"
echo "$line" >> "$idx"
else
log "Download failed"
fi
else
log "no match $result"
fi
fi
}
# Iterate over rows of posts: run do_post for every non-empty line of $1.
# BUG FIX: the merged diff nested the new definition inside the unterminated
# old one ('do_list_of_posts(){ … do_list_of_posts() { … }'), leaving
# unbalanced braces and a syntax error. Collapsed to the new version only.
do_list_of_posts() {
  local links=$1
  while IFS= read -r line; do
    [[ -n $line ]] || continue
    do_post "$line"
  done <<< "$links"
}
# Process a collection page containing multiple posts (pagination supported).
# Walks PAGES pages (?page=0,30,60,…), extracts item links from each, and
# hands them to do_list_of_posts. Sleeps SLEEP_BETWEEN_PAGES between pages.
# BUG FIXES:
#  * Removed the orphaned old 'do_collection_page(){' header the merge left
#    here (its body/'}' were gone — syntax error).
#  * Each page was fetched TWICE: a leftover old 'links="$(get_items …)"' line
#    sat right before the new 'links=$(get_items …)'. Now fetched once.
#  * The trailing '(( i+1 < PAGES )) && { …; }' returned status 1 on the last
#    page, making the function exit non-zero and aborting under 'set -e'.
#    Rewritten as a plain 'if'.
process_collection() {
  local base_url=$1
  BASE_HOST=$(host_from_url "$base_url")
  set_idx_from_url "$base_url"
  log "Collection base: $base_url (host=$BASE_HOST)"
  local i current_url links
  for (( i=0; i<PAGES; i++ )); do
    if (( i == 0 )); then
      current_url="$base_url"
    else
      current_url="${base_url}?page=$(( i * 30 ))"
    fi
    log "Current page: $current_url"
    links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
    do_list_of_posts "$links"
    if (( i + 1 < PAGES )); then
      log "Sleeping $SLEEP_BETWEEN_PAGES s"
      sleep "$SLEEP_BETWEEN_PAGES"
    fi
  done
}
# Handle exactly one post (absolute URL or a relative /post/… path):
# derive the host, point IDX at the right index file, then process it.
process_single_post() {
  local item=$1
  # If the post is relative like "/post/…", host_from_url needs BASE_HOST;
  # NOTE(review): the original comment suggested guessing it from IDX — confirm.
  BASE_HOST=$(host_from_url "$item")
  set_idx_from_url "$item"
  do_post "$item"
}
# Process every non-empty line of FILE as a single post path/URL.
# BASE_HOST and IDX are derived from the first usable line.
# BUG FIX: 'while IFS= read -r line' silently drops a final line that lacks a
# trailing newline; the '|| [[ -n $line ]]' guard keeps it.
process_file_of_posts() {
  local file=$1
  [[ -s $file ]] || die "File not found or empty: $file"
  # Determine BASE_HOST from the first non-empty line if possible
  local first
  first=$(grep -vE '^\s*$' "$file" | head -n1)
  [[ -n $first ]] || die "No usable lines in: $file"
  BASE_HOST=$(host_from_url "$first")
  set_idx_from_url "$first"
  local line
  while IFS= read -r line || [[ -n $line ]]; do
    [[ -z $line ]] && continue
    do_post "$line"
  done < "$file"
}
# --------------------------------- CLI parsing ---------------------------------
# Print the help text to stdout.
# BUG FIX: the merged diff left the old usage() body ('echo "Options -c, -p
# and -f cannot be used together."; _exit 1') in front of the heredoc, so the
# help was never printed and the broken _exit helper was invoked. Removed.
usage() {
  cat <<EOF
$SCRIPT_NAME v$VERSION
Usage:
$SCRIPT_NAME [-n] [-d] -c URL
$SCRIPT_NAME [-n] [-d] -p POST
$SCRIPT_NAME [-n] [-d] -f FILE
Options:
-n Dry-run (no downloads, still logs and updates idx suppressed)
-d Debug (set -x)
-c URL Collection URL (first page URL)
-p POST Single post path or URL (e.g. /post/abcd.html)
-f FILE File with one post per line
-h Help
Env vars:
OUTDIR Output directory (default: current dir)
PAGES How many collection pages to traverse (default: $PAGES)
SLEEP_BETWEEN_PAGES Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
LOCAL_RESOLVER_URL Resolver endpoint (default: $LOCAL_RESOLVER_URL)
EOF
}
# Parse command-line options into collection/post/file and the NOOP/DEBUG flags.
# BUG FIXES in this section:
#  * The merge left the old getopts loop (":n:d:c:p:f:" — which wrongly made
#    -n/-d take arguments) with its 'esac'/'done' missing, breaking the syntax.
#    Removed entirely, along with the stray 'debug' variable and its echo.
#  * '(( DEBUG )) && set -x' returns status 1 when DEBUG=0, and
#    '[[ -n $x ]] && ((count++))' returns 1 when count was 0 (post-increment
#    evaluates to 0) — both abort the script under 'set -e'. Rewritten with
#    plain 'if' and 'count=$((count + 1))'.
collection=""; post=""; file=""
while getopts ":ndc:p:f:h" opt; do
  case "$opt" in
    n) NOOP=1 ;;
    d) DEBUG=1 ;;
    c) collection=$OPTARG ;;
    p) post=$OPTARG ;;
    f) file=$OPTARG ;;
    h) usage; exit 0 ;;
    :) die "Option -$OPTARG requires an argument." ;;
    \?) die "Invalid option: -$OPTARG" ;;
  esac
done

if (( DEBUG )); then
  set -x
fi

# Enforce mutual exclusivity between -c, -p and -f (exactly one required).
count=0
for x in "$collection" "$post" "$file"; do
  if [[ -n $x ]]; then
    count=$((count + 1))
  fi
done
if (( count != 1 )); then
  usage
  die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."
fi
# Acquire the single-instance lock, then dispatch on the selected mode.
# BUG FIX: the merge left the pre-rewrite collection loop here with an
# unterminated 'if [ -n "$collection" ]; then' (no 'fi'), which broke the
# syntax and duplicated the pagination that process_collection() already
# performs internally. Only the new dispatch remains.
lock
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"

if [[ -n $collection ]]; then
  process_collection "$collection"
elif [[ -n $post ]]; then
  process_single_post "$post"
elif [[ -n $file ]]; then
  process_file_of_posts "$file"
fi
# BUG FIX: the merge kept the pre-rewrite -p / -f handling here ('if [ -n
# "$post" ] … do_post', plus a second file-reading loop), which would process
# the same post or file a SECOND time after the dispatch above, and called
# unlock explicitly even though the EXIT trap already releases the lock and
# removes the scratch dir. The legacy block is removed.
# cleanup happens via trap
exit 0