# Commit: "Update download.sh" — download.sh rewritten (+324/-239 lines).
#!/bin/bash
|
||||
idx="/storage/disk1/X/idx/blog/$(echo $base_url | cut -d'/' -f3-)/idx"
|
||||
LOCK_FILE="/tmp/my_script.lock"
|
||||
#!/usr/bin/env bash
|
||||
# Hardened rewrite of the provided script with safer defaults, clearer structure,
|
||||
# robust option parsing, and better error handling.
|
||||
#
|
||||
# Usage examples:
|
||||
# ./script.sh -c "https://example.com/Some-Casting-X.html"
|
||||
# ./script.sh -p "/post/6545eda2cb76e.html"
|
||||
# ./script.sh -f posts.txt
|
||||
# ./script.sh -n -c "https://example.com/Some-Casting-X.html" # dry-run
|
||||
|
||||
## Setup idx
|
||||
# Extract the directory from the file path
|
||||
directory_path="$(dirname "$idx")"
|
||||
set -Eeuo pipefail
|
||||
IFS=$'\n\t'
|
||||
|
||||
# Test if the directory exists, and if not, create it
|
||||
if [ ! -d "$directory_path" ]; then
|
||||
mkdir -p "$directory_path"
|
||||
fi
|
||||
SCRIPT_NAME=${0##*/}
|
||||
VERSION="1.2.0"
|
||||
|
||||
# Test if the file exists, and if not, create it
|
||||
if [ ! -e "$idx" ]; then
|
||||
touch "$idx"
|
||||
fi
|
||||
# ------------------------------- logging & utils -------------------------------
|
||||
log() { printf '[%s] %s\n' "$(date '+%F %T%z')" "$*"; }
|
||||
die() { log "ERROR: $*" >&2; exit 1; }
|
||||
need() { command -v "$1" >/dev/null 2>&1 || die "Kräver '$1' i PATH"; }
|
||||
|
||||
need curl
|
||||
need flock
|
||||
need awk
|
||||
need sed
|
||||
need grep
|
||||
need cut
|
||||
need dirname
|
||||
|
||||
# Safer temp dir for partial files, etc.
|
||||
TMPDIR=${TMPDIR:-/tmp}
|
||||
WORKDIR=$(mktemp -d "$TMPDIR/${SCRIPT_NAME%.sh}.XXXXXX")
|
||||
cleanup_tmp() { rm -rf "$WORKDIR" 2>/dev/null || true; }
|
||||
|
||||
# ----------------------------- lock/unlock logic ------------------------------
|
||||
LOCK_FD=200
|
||||
LOCK_FILE="/tmp/${SCRIPT_NAME}.lock"
|
||||
lock() {
|
||||
echo "Creating lock.."
|
||||
exec 200>"$LOCK_FILE"
|
||||
flock -n 200 || ( echo "Lock failed - process exist!!" && exit 1 )
|
||||
log "Creating lock… ($LOCK_FILE)"
|
||||
# shellcheck disable=SC2094
|
||||
exec {LOCK_FD}>"$LOCK_FILE" || die "Could not open lock file $LOCK_FILE"
|
||||
if ! flock -n "$LOCK_FD"; then
|
||||
die "Lock failed — another process is running."
|
||||
fi
|
||||
}
|
||||
|
||||
unlock() {
|
||||
echo "Removing lock.."
|
||||
# Release the lock
|
||||
flock -u 200
|
||||
}
|
||||
_exit(){
|
||||
unlock
|
||||
exit $1
|
||||
log "Releasing lock…"
|
||||
flock -u "$LOCK_FD" || true
|
||||
}
|
||||
|
||||
get_link_for_item(){
|
||||
local item=$1
|
||||
local content
|
||||
# Always cleanup on exit/interrupt
|
||||
cleanup() {
|
||||
unlock || true
|
||||
cleanup_tmp || true
|
||||
}
|
||||
trap cleanup EXIT INT TERM
|
||||
|
||||
#echo "$item"
|
||||
# ------------------------------ output handling -------------------------------
|
||||
OUTDIR=${OUTDIR:-$PWD}
|
||||
mkdir -p "$OUTDIR"
|
||||
|
||||
# Use curl to download the URL and store its content in the 'content' variable
|
||||
# ---------------------------- idx path resolution -----------------------------
|
||||
IDX="" # will be set via set_idx_from_url
|
||||
|
||||
content=$(curl -s -X POST localhost:3000 -H "Content-Type: text/plain" --data "https://sxyprn.com$item") || {
|
||||
echo "Failed to download the URL: $item"
|
||||
return 1
|
||||
host_from_url() {
|
||||
# Extract host from absolute URL (https://host/...), or from path (/post/..) use fallback BASE_HOST
|
||||
# Usage: host_from_url "https://foo.bar/baz" -> foo.bar
|
||||
local u=$1
|
||||
if [[ $u == http*://* ]]; then
|
||||
# strip scheme
|
||||
u=${u#*//}
|
||||
printf '%s\n' "${u%%/*}"
|
||||
else
|
||||
# If relative path, we need BASE_HOST to be set by caller
|
||||
[[ -n ${BASE_HOST:-} ]] || die "Kan inte härleda host från relativ länk utan BASE_HOST"
|
||||
printf '%s\n' "$BASE_HOST"
|
||||
fi
|
||||
}
|
||||
|
||||
#curl -X POST localhost:3000 -H "Content-Type: text/plain" --data 'https://sxyprn.com/post/653e2c6329e1c.html'
|
||||
#["MomPov E233 Malinda - 49 Year Old Horny Divorced Blonde MILF Beauty https://streamvid.net/ozfe24wrw95h #milf #casting #pov #anal - [01:08:17] (29.10.2023) on SexyPorn","https://sxyprn.com/cdn8/c9/22t1338zl607azp5q71zd1s4p6a/DjtVYfJJupZm-lC44cUtgw/1698771257/k615f1vfaardx6lcs07bsab3g6c/x86v5436eb27ck6836209zek16c.vid"]
|
||||
|
||||
echo "$content"
|
||||
}
|
||||
|
||||
log(){
|
||||
local message=$1
|
||||
echo $(date '+%F %H:%M:%S') "$message"
|
||||
}
|
||||
|
||||
get_items(){
|
||||
local url=$1
|
||||
local content
|
||||
local links
|
||||
|
||||
# Use curl to download the URL and store its content in the 'content' variable
|
||||
content=$(curl -s "$url") || {
|
||||
log "Failed to download the URL: $url"
|
||||
return 1
|
||||
}
|
||||
|
||||
# Print the stored content
|
||||
echo "$content" | sed 's/>/>\n/g' | grep "class='js-pop'" | awk -F"'" '{print $2}' | cut -d'?' -f1
|
||||
set_idx_from_url() {
|
||||
# idx base: /storage/disk1/X/idx/blog/<host+path>/idx
|
||||
local base_url=$1
|
||||
local host_and_tail
|
||||
if [[ $base_url == http*://* ]]; then
|
||||
host_and_tail=${base_url#*//} # host/...
|
||||
else
|
||||
[[ -n ${BASE_HOST:-} ]] || die "BASE_HOST krävs för att sätta idx från relativ URL"
|
||||
host_and_tail="$BASE_HOST/${base_url#/}"
|
||||
fi
|
||||
local idx_root="/storage/disk1/X/idx/blog/${host_and_tail}"
|
||||
idx_root=${idx_root%/}
|
||||
IDX="$idx_root/idx"
|
||||
local dir
|
||||
dir=$(dirname "$IDX")
|
||||
mkdir -p "$dir"
|
||||
: >"$IDX" 2>/dev/null || touch "$IDX" || die "Kunde inte skapa idx: $IDX"
|
||||
}
|
||||
|
||||
link_exists_in_file() {
|
||||
local link_to_check="$1"
|
||||
if grep -Fxq "$link_to_check" "$idx"; then
|
||||
return 0 # Link exists in the file
|
||||
else
|
||||
return 1 # Link does not exist in the file
|
||||
fi
|
||||
[[ -n $IDX ]] || die "IDX är inte satt"
|
||||
local link_to_check=$1
|
||||
grep -Fqx -- "$link_to_check" "$IDX"
|
||||
}
|
||||
|
||||
append_to_idx() {
|
||||
[[ -n $IDX ]] || die "IDX är inte satt"
|
||||
printf '%s\n' "$1" >> "$IDX"
|
||||
}
|
||||
|
||||
# ------------------------------- network layer --------------------------------
|
||||
CURL_OPTS=(
|
||||
--fail-with-body
|
||||
--show-error
|
||||
--location
|
||||
--connect-timeout 10
|
||||
--max-time 0
|
||||
--retry 3
|
||||
--retry-delay 1
|
||||
--retry-connrefused
|
||||
--compressed
|
||||
-sS
|
||||
-A "Mozilla/5.0 (X11; Linux x86_64) Bash/$BASH_VERSION $SCRIPT_NAME/$VERSION"
|
||||
)
|
||||
|
||||
fetch() {
|
||||
local url=$1
|
||||
curl "${CURL_OPTS[@]}" "$url"
|
||||
}
|
||||
|
||||
post_text() {
|
||||
local url=$1
|
||||
local body=$2
|
||||
curl "${CURL_OPTS[@]}" -H 'Content-Type: text/plain' -X POST --data "$body" "$url"
|
||||
}
|
||||
|
||||
# ------------------------------- parsing layer --------------------------------
|
||||
# NOTE: HTML parsing via grep/sed/awk is brittle; consider 'pup' or 'htmlq' if available.
|
||||
get_items() {
|
||||
local url=$1
|
||||
local content
|
||||
content=$(fetch "$url") || { log "Failed to download the URL: $url"; return 1; }
|
||||
printf '%s\n' "$content" \
|
||||
| sed 's/>/>\n/g' \
|
||||
| grep "class='js-pop'" \
|
||||
| awk -F"'" '{print $2}' \
|
||||
| cut -d'?' -f1
|
||||
}
|
||||
|
||||
# Expecting a single line: ["<filename>","<download_url>"]
|
||||
JSON_PAIR_RE='^\["([^"]*)","([^"]*)"\]$'
|
||||
|
||||
sanitize_filename() {
|
||||
local in=$1
|
||||
local truncated=${in:0:80}
|
||||
# replace anything not alnum, underscore, dot or dash with underscore
|
||||
printf '%s' "$truncated" | sed 's/[^A-Za-z0-9_.-]/_/g'
|
||||
}
|
||||
|
||||
# ------------------------------ core operations --------------------------------
|
||||
NOOP=0
|
||||
DEBUG=0
|
||||
SLEEP_BETWEEN_PAGES=${SLEEP_BETWEEN_PAGES:-10}
|
||||
PAGES=${PAGES:-10}
|
||||
LOCAL_RESOLVER_URL=${LOCAL_RESOLVER_URL:-http://localhost:3000}
|
||||
|
||||
resolve_item_link() {
|
||||
local relative_item=$1
|
||||
post_text "$LOCAL_RESOLVER_URL" "https://sxyprn.com$relative_item"
|
||||
}
|
||||
|
||||
download_and_save_link() {
|
||||
local url="$1"
|
||||
local filename="$2"
|
||||
local truncated="${filename:0:50}"
|
||||
local sanitized="${truncated//[^a-zA-Z0-9_.-]/_}"
|
||||
local url=$1
|
||||
local base_name=$2
|
||||
local sanitized
|
||||
sanitized=$(sanitize_filename "$base_name")
|
||||
local outpath="$OUTDIR/${sanitized}.mp4"
|
||||
log "Downloading $url -> $outpath"
|
||||
(( NOOP )) && { log "(dry-run) Skipping download"; return 0; }
|
||||
curl "${CURL_OPTS[@]}" -o "$outpath.part" "$url" || { rm -f "$outpath.part"; return 1; }
|
||||
mv -f "$outpath.part" "$outpath"
|
||||
}
|
||||
|
||||
log "Downloading $url with name $sanitized"
|
||||
# Use curl to download the URL and save the content to the specified file
|
||||
if ! curl -L -o "$sanitized.mp4" "$url"; then
|
||||
log "Curl failed to download the URL: $url"
|
||||
do_post() {
|
||||
local line=$1
|
||||
[[ -n $IDX ]] || die "IDX är inte satt — kalla set_idx_from_url först"
|
||||
if link_exists_in_file "$line"; then
|
||||
log "Link already indexed: $line"
|
||||
return 0
|
||||
fi
|
||||
log "Resolving link: $line"
|
||||
local result
|
||||
if ! result=$(resolve_item_link "$line"); then
|
||||
log "Resolver failed for: $line"
|
||||
return 1
|
||||
fi
|
||||
if [[ $result =~ $JSON_PAIR_RE ]]; then
|
||||
local name=${BASH_REMATCH[1]}
|
||||
local url=${BASH_REMATCH[2]}
|
||||
if download_and_save_link "$url" "$name"; then
|
||||
log "Download success — updating index"
|
||||
append_to_idx "$line"
|
||||
else
|
||||
log "Download failed for: $url"
|
||||
return 1
|
||||
fi
|
||||
else
|
||||
log "Resolver returned unexpected payload: $result"
|
||||
return 1
|
||||
fi
|
||||
}
|
||||
|
||||
pattern='^\["([^"]*)","([^"]*)"\]$'
|
||||
# Fetch post ie https://sxyprn.com/post/6545eda2cb76e.html
|
||||
do_post(){
|
||||
local line="$1"
|
||||
|
||||
if link_exists_in_file "$line"; then
|
||||
log "Link already exists: $line"
|
||||
else
|
||||
log "Resolving link $line"
|
||||
result="$(get_link_for_item "$line")"
|
||||
|
||||
# Test if the string matches the pattern
|
||||
if [[ $result =~ $pattern ]]; then
|
||||
val1="${BASH_REMATCH[1]}"
|
||||
val2="${BASH_REMATCH[2]}"
|
||||
if download_and_save_link "$val2" "$val1"; then
|
||||
log "Download success - updating index"
|
||||
echo "$line" >> "$idx"
|
||||
else
|
||||
log "Download failed"
|
||||
fi
|
||||
else
|
||||
log "no match $result"
|
||||
fi
|
||||
fi
|
||||
}
|
||||
|
||||
# Iterate over rows of posts
|
||||
do_list_of_posts() {
|
||||
local links="$1"
|
||||
|
||||
# Iterate over rows of posts
|
||||
local links=$1
|
||||
while IFS= read -r line; do
|
||||
[[ -n $line ]] || continue
|
||||
do_post "$line"
|
||||
done <<< "$links"
|
||||
}
|
||||
|
||||
# Fetch a page of posts ie https://sxyprn.com/Woodman-Casting-X.html
|
||||
do_collection_page(){
|
||||
local current_url="$1"
|
||||
|
||||
log "Current page: $current_url"
|
||||
|
||||
# Download and parse out items
|
||||
links="$(get_items "$current_url")"
|
||||
|
||||
do_list_of_posts "$links"
|
||||
}
|
||||
|
||||
usage() {
|
||||
echo "Options -c, -p and -f cannot be used together."
|
||||
_exit 1
|
||||
}
|
||||
|
||||
collection=""
|
||||
post=""
|
||||
file=""
|
||||
debug=0
|
||||
while getopts ":n:d:c:p:f:" opt; do
|
||||
case $opt in
|
||||
d)
|
||||
set -x
|
||||
debug=1
|
||||
;;
|
||||
n)
|
||||
# No op
|
||||
no_op="1"
|
||||
;;
|
||||
c)
|
||||
# Collection
|
||||
if [ -n "$post" ] || [ -n "$file" ] || [ "$OPTARG" == "-p" ] || [ "$OPTARG" == "-f" ]; then
|
||||
usage
|
||||
fi
|
||||
collection="$OPTARG"
|
||||
;;
|
||||
p)
|
||||
# post
|
||||
if [ -n "$collection" ] || [ -n "$file" ] || [ "$OPTARG" == "-c" ] || [ "$OPTARG" == "-f" ]; then
|
||||
usage
|
||||
fi
|
||||
post="$OPTARG"
|
||||
;;
|
||||
f)
|
||||
# file of posts
|
||||
if [ -n "$collection" ] || [ -n "$post" ] || [ "$OPTARG" == "-c" ] || [ "$OPTARG" == "-p" ]; then
|
||||
usage
|
||||
fi
|
||||
file="$OPTARG"
|
||||
;;
|
||||
\?)
|
||||
echo "Invalid option: -$OPTARG"
|
||||
_exit 1
|
||||
;;
|
||||
:)
|
||||
echo "Option -$OPTARG requires an argument."
|
||||
_exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
echo "debug: $debug"
|
||||
lock
|
||||
|
||||
# Collection
|
||||
if [ -n "$collection" ]; then
|
||||
echo "Scrapping collection.."
|
||||
base_url="$collection"
|
||||
## For pages 1 to x
|
||||
for ((i=0; i<10; i++)); do
|
||||
if [ $i -eq 0 ]; then
|
||||
# Process a collection page containing multiple posts (pagination supported)
|
||||
process_collection() {
|
||||
local base_url=$1
|
||||
BASE_HOST=$(host_from_url "$base_url")
|
||||
set_idx_from_url "$base_url"
|
||||
log "Collection base: $base_url (host=$BASE_HOST)"
|
||||
local i current_url
|
||||
for (( i=0; i<PAGES; i++ )); do
|
||||
if (( i == 0 )); then
|
||||
current_url="$base_url"
|
||||
else
|
||||
current_url="${base_url}?page=$(( i * 30 ))"
|
||||
fi
|
||||
|
||||
#Do collection page
|
||||
do_collection_page "$current_url"
|
||||
sleep 10
|
||||
log "Current page: $current_url"
|
||||
local links
|
||||
links=$(get_items "$current_url") || { log "Failed to extract items on: $current_url"; continue; }
|
||||
do_list_of_posts "$links"
|
||||
(( i+1 < PAGES )) && { log "Sleeping $SLEEP_BETWEEN_PAGES s"; sleep "$SLEEP_BETWEEN_PAGES"; }
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
# Single post
|
||||
if [ -n "$post" ]; then
|
||||
echo "Scrapping post.."
|
||||
process_single_post() {
|
||||
local post=$1
|
||||
# If post is relative like "/post/…", we need a BASE_HOST; attempt to guess from IDX if missing
|
||||
BASE_HOST=$(host_from_url "$post")
|
||||
set_idx_from_url "$post"
|
||||
do_post "$post"
|
||||
fi
|
||||
}
|
||||
|
||||
# File of posts
|
||||
if [ -n "$file" ]; then
|
||||
echo "Scrapping file"
|
||||
# Check if the file exists
|
||||
if [ -e "$file" ]; then
|
||||
# Open the file for reading
|
||||
process_file_of_posts() {
|
||||
local file=$1
|
||||
[[ -s $file ]] || die "File not found or empty: $file"
|
||||
# Determine BASE_HOST from first non-empty line if possible
|
||||
local first
|
||||
first=$(grep -vE '^\s*$' "$file" | head -n1)
|
||||
[[ -n $first ]] || die "No usable lines in: $file"
|
||||
BASE_HOST=$(host_from_url "$first")
|
||||
set_idx_from_url "$first"
|
||||
while IFS= read -r line; do
|
||||
# Process each line here, for example, echo it
|
||||
[[ -z $line ]] && continue
|
||||
do_post "$line"
|
||||
done < "$file"
|
||||
else
|
||||
echo "File not found: $file"
|
||||
fi
|
||||
}
|
||||
|
||||
# --------------------------------- CLI parsing ---------------------------------
|
||||
usage() {
|
||||
cat <<EOF
|
||||
$SCRIPT_NAME v$VERSION
|
||||
|
||||
Usage:
|
||||
$SCRIPT_NAME [-n] [-d] -c URL
|
||||
$SCRIPT_NAME [-n] [-d] -p POST
|
||||
$SCRIPT_NAME [-n] [-d] -f FILE
|
||||
|
||||
Options:
|
||||
-n Dry-run (no downloads, still logs and updates idx suppressed)
|
||||
-d Debug (set -x)
|
||||
-c URL Collection URL (first page URL)
|
||||
-p POST Single post path or URL (e.g. /post/abcd.html)
|
||||
-f FILE File with one post per line
|
||||
-h Help
|
||||
|
||||
Env vars:
|
||||
OUTDIR Output directory (default: current dir)
|
||||
PAGES How many collection pages to traverse (default: $PAGES)
|
||||
SLEEP_BETWEEN_PAGES Seconds between pages (default: $SLEEP_BETWEEN_PAGES)
|
||||
LOCAL_RESOLVER_URL Resolver endpoint (default: $LOCAL_RESOLVER_URL)
|
||||
EOF
|
||||
}
|
||||
|
||||
collection=""; post=""; file=""
|
||||
while getopts ":ndc:p:f:h" opt; do
|
||||
case "$opt" in
|
||||
n) NOOP=1 ;;
|
||||
d) DEBUG=1 ;;
|
||||
c) collection=$OPTARG ;;
|
||||
p) post=$OPTARG ;;
|
||||
f) file=$OPTARG ;;
|
||||
h) usage; exit 0 ;;
|
||||
:) die "Option -$OPTARG requires an argument." ;;
|
||||
\?) die "Invalid option: -$OPTARG" ;;
|
||||
esac
|
||||
done
|
||||
|
||||
(( DEBUG )) && set -x
|
||||
|
||||
# Enforce mutual exclusivity between -c, -p, -f
|
||||
set -- "$collection" "$post" "$file"
|
||||
count=0
|
||||
for x in "$@"; do [[ -n $x ]] && ((count++)); done
|
||||
(( count == 1 )) || { usage; die "Options -c, -p och -f kan inte användas samtidigt (exakt en krävs)."; }
|
||||
|
||||
lock
|
||||
log "NOOP=$NOOP DEBUG=$DEBUG OUTDIR=$OUTDIR"
|
||||
|
||||
if [[ -n $collection ]]; then
|
||||
process_collection "$collection"
|
||||
elif [[ -n $post ]]; then
|
||||
process_single_post "$post"
|
||||
elif [[ -n $file ]]; then
|
||||
process_file_of_posts "$file"
|
||||
fi
|
||||
|
||||
unlock
|
||||
# cleanup happens via trap
|
||||
exit 0
|
||||
|
||||
# (end of file — commit-viewer footer removed)