#!/usr/bin/env bash
set -euo pipefail

# wcx_sync.sh — v2.5 (robust parser via Python + jq 1.7)
# Keeps wcx_index.json synced from https://www.woodmancastingx.com/casting-xxx/?page=1..N
# Requires: curl, jq >= 1.7, python3 (NO external Python packages needed)
#
# NOTE(review): this file was recovered from a whitespace-mangled patch. The
# usage() heredoc body was lost in transit and has been reconstructed from the
# flags the argument parser below actually accepts — confirm wording upstream.

BASE_URL="https://www.woodmancastingx.com"
LIST_PATH="/casting-xxx/"
OUT_FILE="wcx_index.json"
END_PAGE=47
SLEEP_MS=300
DRY_RUN=0
LOG_FILE=""
DEBUG=0
USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) wcx_sync.sh/2.5"
CURL_RETRIES=3
CURL_TIMEOUT=20

# Print CLI help and return. (FIX: the original read `cat <>"$LOG_FILE"`,
# i.e. the heredoc body was missing and the function had fused with log();
# `<>` would even open/truncate-read $LOG_FILE.)
usage() {
  cat <<EOF
Usage: ${0##*/} [options]

Options:
  --out FILE            output JSON index (default: $OUT_FILE)
  --pages N, --end-page N
                        last listing page to crawl (default: $END_PAGE)
  --sleep-ms N          delay between page fetches in ms (default: $SLEEP_MS)
  --dry-run             crawl and merge, but do not write the output file
  --log FILE            also append log lines to FILE
  --timeout N           curl connect timeout in seconds (default: $CURL_TIMEOUT)
  --retries N           curl retry count (default: $CURL_RETRIES)
  --debug               print extra diagnostics
  -h, --help            show this help and exit
EOF
}

# Timestamped logging to stdout, mirrored to $LOG_FILE only when --log was
# given. (FIX: appending to "$LOG_FILE" unconditionally fails under `set -e`
# when LOG_FILE is the default empty string.)
log(){
  local ts; ts="$(date '+%F %T')"
  if [[ -n "$LOG_FILE" ]]; then echo "[$ts] $*" >>"$LOG_FILE"; fi
  echo "[$ts] $*"
}
warn(){ log "WARNING: $*"; }
die(){ echo "Error: $*" >&2; exit 1; }
# ERR trap: report the failing line/command, then propagate the exit code.
on_err(){ local ec=$?; warn "ERROR at line ${BASH_LINENO[0]} (cmd: ${BASH_COMMAND}) — exiting ($ec)"; exit "$ec"; }
trap on_err ERR

need(){ command -v "$1" >/dev/null 2>&1 || die "Missing required tool: $1"; }
need curl; need jq; need python3

# Strip leading/trailing whitespace (space, tab, CR, LF) from $1.
trim() { local s="$1"; s="${s#"${s%%[!$' \t\r\n']*}"}"; s="${s%"${s##*[!$' \t\r\n']}"}"; printf '%s' "$s"; }

# Resolve a possibly-relative href against BASE_URL.
abs_url(){ local href="$1"; if [[ "$href" =~ ^https?:// ]]; then printf '%s' "$href"; else if [[ "$href" =~ ^/ ]]; then printf '%s%s' "$BASE_URL" "$href"; else printf '%s/%s' "$BASE_URL" "$href"; fi; fi; }

# Derive a display title from a URL slug, e.g. tanika_8927.html -> "Tanika".
# NOTE(review): unused by the main path — the Python extractor has its own
# slug logic; kept for external callers / interactive use.
slug_to_title(){
  local href="$1"; local base="${href##*/}"; base="${base%.html}"; local name="${base%_*}"; name="${name//-/ }"
  python3 - "$name" <<'PY'
import sys
s=sys.argv[1].strip()
print(" ".join(w.capitalize() for w in s.split()) if s else "")
PY
}

# ---------- args ----------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --out) OUT_FILE="${2:?}"; shift 2 ;;
    --pages|--end-page) END_PAGE="${2:?}"; shift 2 ;;
    --sleep-ms) SLEEP_MS="${2:?}"; shift 2 ;;
    --dry-run) DRY_RUN=1; shift ;;
    --log) LOG_FILE="${2:?}"; shift 2 ;;
    --timeout) CURL_TIMEOUT="${2:?}"; shift 2 ;;
    --retries) CURL_RETRIES="${2:?}"; shift 2 ;;
    --debug) DEBUG=1; shift ;;
    -h|--help) usage; exit 0 ;;
    *) die "Unknown flag: $1";;
  esac
done

WORKDIR="$(mktemp -d -t wcx_sync.XXXXXX)"
trap 'rm -rf "$WORKDIR"' EXIT
TODAY="$(date +%F)"

log "Workdir: $WORKDIR"

# read existing index or start from an empty array
if [[ -f "$OUT_FILE" ]]; then
  jq -e . "$OUT_FILE" >/dev/null 2>&1 || die "Invalid JSON in $OUT_FILE"
  cp "$OUT_FILE" "$WORKDIR/old.json"
else
  echo "[]" > "$WORKDIR/old.json"
fi

: > "$WORKDIR/new_items.jsonl"
total_pages=$END_PAGE
[[ "$total_pages" =~ ^[0-9]+$ ]] || die "--pages must be an integer"

log "Start. pages=1..$total_pages, out=$OUT_FILE, dry_run=$DRY_RUN"

pages_fetched=0; items_found=0; items_skipped_no_title=0; debug_shown=0

# ---------- Python extractor (HTML → JSONL) ----------
# Parses one listing page with only the stdlib html.parser and prints one
# compact JSON object per <a class="item scene"> anchor.
py_extract() {
python3 - "$1" "$BASE_URL" "$DEBUG" <<'PY'
import json, sys, html   # json hoisted out of the per-anchor loop (was re-imported each item)
from html.parser import HTMLParser

html_file, base_url, debug = sys.argv[1], sys.argv[2], int(sys.argv[3])

class Node:
    """Minimal DOM node: tag name, attrs, children and direct text chunks."""
    def __init__(self, name, attrs, parent=None):
        self.name=name
        self.attrs=dict(attrs)
        self.children=[]
        self.parent=parent
        self.text=[]
    def add_child(self, node): self.children.append(node)
    def add_text(self, t):
        if t: self.text.append(t)
    def get_text(self):
        # Own text first, then descendants' text.
        return "".join(self.text + [c.get_text() for c in self.children if isinstance(c, Node)])

class Parser(HTMLParser):
    """Builds a Node tree; tolerant of unclosed tags (pops to last match)."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root=Node('root',{})
        self.stack=[self.root]
    def handle_starttag(self, tag, attrs):
        node=Node(tag, attrs, self.stack[-1])
        self.stack[-1].add_child(node)
        self.stack.append(node)
    def handle_endtag(self, tag):
        # pop to the last matching open tag; ignore stray close tags
        for i in range(len(self.stack)-1, -1, -1):
            if self.stack[i].name == tag:
                self.stack=self.stack[:i]
                break
    def handle_data(self, data):
        self.stack[-1].add_text(data)

def classes(node):
    """Return the node's class attribute as a set of class names."""
    c=node.attrs.get('class','')
    return set(x.strip() for x in c.split()) if isinstance(c,str) else set()

def find_all(node, tag=None, must_classes=None):
    """DFS: all descendants (incl. node) matching tag and class set."""
    out=[]
    st=[node]
    while st:
        n=st.pop()
        if (tag is None or n.name==tag) and (must_classes is None or must_classes.issubset(classes(n))):
            out.append(n)
        st.extend(n.children)
    return out

def get_text_by_selector(a_node, tag, cls):
    """BFS under a_node: text of the first <tag class~=cls>, else ""."""
    # (FIX: removed a dead `for n in a_node.children: pass` no-op loop.)
    st=[a_node]
    while st:
        n=st.pop(0)
        if n.name==tag and cls in classes(n):
            return n.get_text().strip()
        st.extend(n.children)
    return ""

def get_attr_under(a_node, tag, cls, attr):
    """BFS under a_node: first non-empty attr of <tag class~=cls>, else ""."""
    st=[a_node]
    while st:
        n=st.pop(0)
        if n.name==tag and cls in classes(n):
            v=n.attrs.get(attr,'')
            if v: return v.strip()
        st.extend(n.children)
    return ""

def abs_url(base, href):
    """Resolve href against base (mirrors the shell helper of the same name)."""
    if href.startswith('http://') or href.startswith('https://'):
        return href
    if href.startswith('/'):
        return base + href
    return base.rstrip('/') + '/' + href

with open(html_file,'r',encoding='utf-8', errors='ignore') as f:
    data=f.read()

p=Parser()
p.feed(data)

# scope: div.items.container_3
items_containers = [n for n in find_all(p.root, 'div') if {'items','container_3'}.issubset(classes(n))]
if not items_containers:
    # fallback: search entire tree
    items_containers=[p.root]

count=0
for container in items_containers:
    anchors = [n for n in find_all(container, 'a') if {'item','scene'}.issubset(classes(n))]
    for a in anchors:
        href=a.attrs.get('href','').strip()
        if not href:
            continue

        # Title fallbacks: span.title → a@title → anchor text → img.thumb@alt → slug
        title = get_text_by_selector(a, 'span', 'title').strip()
        if not title:
            title = a.attrs.get('title','') or ''
            title = html.unescape(title).strip()
        if not title:
            title = a.get_text().strip()
        if not title:
            title = get_attr_under(a, 'img', 'thumb', 'alt')
        if not title:
            # slug: /casting-x/tanika_8927.html -> Tanika
            base=href.split('/')[-1].split('.')[0] if '/' in href else href
            base=base.rsplit('_',1)[0]
            title = " ".join(w.capitalize() for w in base.replace('-', ' ').split())

        title=html.unescape(title).strip()
        if not title:
            # untitled even after all fallbacks — skip the anchor
            continue

        duration = get_text_by_selector(a, 'span', 'duration').strip()
        thumb = get_attr_under(a, 'img', 'thumb', 'src').strip()
        details = abs_url(base_url, href)

        obj = {
            "id": title,
            "titel": title,
            "thumb": thumb if thumb else None,
            "details": details,
            "duration": duration,
            "published": ""  # set on merge if new
        }
        # compact JSON line (skip null fields)
        obj = {k:v for k,v in obj.items() if v is not None}
        print(json.dumps(obj, ensure_ascii=False))
        count += 1

if debug:
    print(f"__DEBUG_COUNT__={count}", file=sys.stderr)
PY
}

# ---------- crawl pages ----------
for (( p=1; p<=total_pages; p++ )); do
  page_url="${BASE_URL}${LIST_PATH}?page=${p}"
  html_file="$WORKDIR/page_${p}.html"

  log "Fetching page $p: $page_url"
  if ! curl -fsSL --retry "$CURL_RETRIES" --connect-timeout "$CURL_TIMEOUT" -A "$USER_AGENT" "$page_url" -o "$html_file"; then
    warn "Could not fetch $page_url — skipping"; continue
  fi
  if [[ ! -s "$html_file" ]]; then warn "Empty file for page $p — skipping"; continue; fi
  pages_fetched=$((pages_fetched+1))

  # Extract items → append JSON lines
  py_extract "$html_file" >> "$WORKDIR/new_items.jsonl"
  if [[ $DEBUG -eq 1 ]]; then
    dbg_count=$(grep -c '^{' "$WORKDIR/new_items.jsonl" || true)
    log "DEBUG: accumulated JSONL lines after page $p: $dbg_count"
  fi

  # polite delay between pages (usleep takes microseconds when available)
  if [[ $p -lt $total_pages && $SLEEP_MS -gt 0 ]]; then
    if command -v usleep >/dev/null 2>&1; then usleep "$((SLEEP_MS * 1000))" || true
    else sleep "$(awk "BEGIN{printf \"%.3f\", ${SLEEP_MS}/1000}")" || true
    fi
  fi
done

lines_jsonl=$(wc -l < "$WORKDIR/new_items.jsonl" || echo 0)
log "Collected lines in new_items.jsonl: $lines_jsonl"

# unique by title (last occurrence wins)
if [[ -s "$WORKDIR/new_items.jsonl" ]]; then
  jq -s 'reduce .[] as $o ({}; .[$o.titel] = $o) | to_entries | map(.value)' \
    "$WORKDIR/new_items.jsonl" > "$WORKDIR/new_items.json"
else
  echo "[]" > "$WORKDIR/new_items.json"
fi

# merge & sort (published stamped on new items; updated when duration changes)
# `.` = new_items.json (first file), `input` = old.json (second file).
jq --arg today "$TODAY" '
  def toMap: reduce .[] as $x ({}; .[$x.titel] = $x);

  . as $newArr | input as $oldArr
  | ($oldArr | toMap) as $old
  | reduce $newArr[] as $n ($old;
      . as $m
      | ($m[$n.titel] // null) as $o
      | if $o == null then
          .[$n.titel] = ($n + {published: $today})
        else
          .[$n.titel] =
            ( $o
              | (if ($n.details // "") != "" then . + {details: $n.details} else . end)
              | (if ((.thumb // "") == "" and ($n.thumb // "") != "") then . + {thumb: $n.thumb} else . end)
              | (if (($n.duration // "") != "" and ($n.duration // "") != ($o.duration // ""))
                   then . + {duration: $n.duration, updated: $today}
                   else .
                 end)
            )
        end
    )
  | (. | to_entries | map(.value))
  | sort_by(.updated // .published) | reverse
' "$WORKDIR/new_items.json" "$WORKDIR/old.json" > "$WORKDIR/merged.json"

new_count=$(jq 'length' "$WORKDIR/new_items.json"); old_count=$(jq 'length' "$WORKDIR/old.json"); merged_count=$(jq 'length' "$WORKDIR/merged.json")
log "Pages fetched: $pages_fetched / $total_pages"
log "new_items.json: $new_count | old.json: $old_count | merged.json: $merged_count"

# atomic write: validate/pretty-print into the workdir, then mv into place
if [[ $DRY_RUN -eq 1 ]]; then
  log "Dry-run: NOT writing $OUT_FILE"
else
  tmp_out="$WORKDIR/out.tmp.json"
  jq . "$WORKDIR/merged.json" > "$tmp_out"
  [[ -s "$tmp_out" ]] || die "Merged output is empty — aborting write"
  mv "$tmp_out" "$OUT_FILE"
  log "Updated $OUT_FILE"
fi