#!/usr/bin/env bash
# wcx_sync.sh — v2.5 (robust parser via Python + jq 1.7)
# Keeps wcx_index.json synced from https://www.woodmancastingx.com/casting-xxx/?page=1..N
# Requires: curl, jq >= 1.7, python3 (NO external Python packages needed)
set -Eeuo pipefail   # -E so the ERR trap below also fires inside functions

# ---------- defaults (overridable via CLI flags) ----------
BASE_URL="https://www.woodmancastingx.com"
LIST_PATH="/casting-xxx/"
OUT_FILE="wcx_index.json"
END_PAGE=47
SLEEP_MS=300
DRY_RUN=0
LOG_FILE=""
DEBUG=0
USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) wcx_sync.sh/2.5"
CURL_RETRIES=3
CURL_TIMEOUT=20

# Print CLI help.
# NOTE(review): the original here-doc was corrupted in this copy of the file
# (it had fused with log()); reconstructed from the flag parser below — confirm
# the wording against the upstream script if available.
usage() {
  cat <<EOF
Usage: ${0##*/} [options]

Syncs ${OUT_FILE} from ${BASE_URL}${LIST_PATH}?page=1..N

Options:
  --out FILE            output JSON file (default: wcx_index.json)
  --pages, --end-page N last listing page to crawl (default: 47)
  --sleep-ms MS         polite delay between page fetches (default: 300)
  --dry-run             crawl and merge, but do not write the output file
  --log FILE            also append log lines to FILE
  --timeout SEC         curl connect timeout in seconds (default: 20)
  --retries N           curl retry count (default: 3)
  --debug               verbose extractor diagnostics
  -h, --help            show this help and exit
EOF
}

# Log a timestamped message to stdout and, when --log was given, append it
# to $LOG_FILE as well.
log() {
  local ts
  ts="$(date '+%F %T')"
  [[ -n "$LOG_FILE" ]] && echo "[$ts] $*" >>"$LOG_FILE"
  echo "[$ts] $*"
}
warn() { log "WARNING: $*"; }
die()  { echo "Error: $*" >&2; exit 1; }

# ERR trap: report the failing line and command, then exit with its status.
on_err() {
  local ec=$?
  warn "ERROR at line ${BASH_LINENO[0]} (cmd: ${BASH_COMMAND}) — exiting ($ec)"
  exit "$ec"
}
trap on_err ERR

# need TOOL — abort unless TOOL is on PATH.
need() { command -v "$1" >/dev/null 2>&1 || die "Missing required tool: $1"; }
need curl; need jq; need python3

# trim STR — strip leading/trailing whitespace (spaces, tabs, CR, LF).
trim() {
  local s="$1"
  s="${s#"${s%%[!$' \t\r\n']*}"}"
  s="${s%"${s##*[!$' \t\r\n']}"}"
  printf '%s' "$s"
}

# abs_url HREF — resolve a possibly-relative HREF against $BASE_URL.
abs_url() {
  local href="$1"
  if [[ "$href" =~ ^https?:// ]]; then
    printf '%s' "$href"
  elif [[ "$href" =~ ^/ ]]; then
    printf '%s%s' "$BASE_URL" "$href"
  else
    printf '%s/%s' "$BASE_URL" "$href"
  fi
}

# slug_to_title HREF — derive a Title Cased name from a URL slug, e.g.
# /casting-x/tanika_8927.html -> "Tanika". Not called by the main flow (the
# Python extractor has its own slug fallback); kept for scripts that source
# this file.
slug_to_title() {
  local href="$1"
  local base="${href##*/}"; base="${base%.html}"
  local name="${base%_*}"; name="${name//-/ }"
  python3 - "$name" <<'PY'
import sys
s = sys.argv[1].strip()
print(" ".join(w.capitalize() for w in s.split()) if s else "")
PY
}

# ---------- args ----------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --out)              OUT_FILE="${2:?}";     shift 2 ;;
    --pages|--end-page) END_PAGE="${2:?}";     shift 2 ;;
    --sleep-ms)         SLEEP_MS="${2:?}";     shift 2 ;;
    --dry-run)          DRY_RUN=1;             shift ;;
    --log)              LOG_FILE="${2:?}";     shift 2 ;;
    --timeout)          CURL_TIMEOUT="${2:?}"; shift 2 ;;
    --retries)          CURL_RETRIES="${2:?}"; shift 2 ;;
    --debug)            DEBUG=1;               shift ;;
    -h|--help)          usage; exit 0 ;;
    *)                  die "Unknown flag: $1" ;;
  esac
done

WORKDIR="$(mktemp -d -t wcx_sync.XXXXXX)"
trap 'rm -rf "$WORKDIR"' EXIT
TODAY="$(date +%F)"
log "Workdir: $WORKDIR"

# ---------- read existing index or init ----------
if [[ -f "$OUT_FILE" ]]; then
  jq -e . "$OUT_FILE" >/dev/null 2>&1 || die "Invalid JSON in $OUT_FILE"
  cp "$OUT_FILE" "$WORKDIR/old.json"
else
  echo "[]" > "$WORKDIR/old.json"
fi
: > "$WORKDIR/new_items.jsonl"

total_pages=$END_PAGE
[[ "$total_pages" =~ ^[0-9]+$ ]] || die "--pages must be an integer"
log "Start. pages=1..$total_pages, out=$OUT_FILE, dry_run=$DRY_RUN"

pages_fetched=0; items_found=0; items_skipped_no_title=0; debug_shown=0

# ---------- Python extractor (HTML → JSONL) ----------
# py_extract HTML_FILE — parse one listing page with a minimal DOM built on
# html.parser (stdlib only) and print one compact JSON object per scene
# anchor (a.item.scene) to stdout.
py_extract() {
  python3 - "$1" "$BASE_URL" "$DEBUG" <<'PY'
import sys, html, json
from html.parser import HTMLParser

html_file, base_url, debug = sys.argv[1], sys.argv[2], int(sys.argv[3])

class Node:
    """A lightweight DOM node: tag name, attrs dict, children, text runs."""
    def __init__(self, name, attrs, parent=None):
        self.name = name
        self.attrs = dict(attrs)
        self.children = []
        self.parent = parent
        self.text = []
    def add_child(self, node):
        self.children.append(node)
    def add_text(self, t):
        if t:
            self.text.append(t)
    def get_text(self):
        # Own text first, then all descendant text (depth-first).
        return "".join(self.text + [c.get_text() for c in self.children
                                    if isinstance(c, Node)])

class Parser(HTMLParser):
    """Builds a Node tree; tolerant of unbalanced end tags."""
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root = Node('root', {})
        self.stack = [self.root]
    def handle_starttag(self, tag, attrs):
        node = Node(tag, attrs, self.stack[-1])
        self.stack[-1].add_child(node)
        self.stack.append(node)
    def handle_endtag(self, tag):
        # Pop back to the most recent matching open tag; ignore strays.
        for i in range(len(self.stack) - 1, -1, -1):
            if self.stack[i].name == tag:
                self.stack = self.stack[:i]
                break
    def handle_data(self, data):
        self.stack[-1].add_text(data)

def classes(node):
    c = node.attrs.get('class', '')
    return set(x.strip() for x in c.split()) if isinstance(c, str) else set()

def has_class(node, cls):
    return cls in classes(node)

def find_all(node, tag=None, must_classes=None):
    """DFS: all descendants matching tag and (optionally) a class set."""
    out = []
    st = [node]
    while st:
        n = st.pop()
        if (tag is None or n.name == tag) and \
           (must_classes is None or must_classes.issubset(classes(n))):
            out.append(n)
        st.extend(n.children)
    return out

def find_first(node, tag=None, must_classes=None, attr=None):
    """BFS: first descendant matching tag/classes (and having attr if given)."""
    st = [node]
    while st:
        n = st.pop(0)
        if (tag is None or n.name == tag) and \
           (must_classes is None or must_classes.issubset(classes(n))):
            if attr:
                if attr in n.attrs:
                    return n
            else:
                return n
        st.extend(n.children)
    return None

def get_text_by_selector(a_node, tag, cls):
    """BFS under a_node for the first <tag class="...cls..."> and return its text."""
    st = [a_node]
    while st:
        n = st.pop(0)
        if n.name == tag and cls in classes(n):
            return n.get_text().strip()
        st.extend(n.children)
    return ""

def get_attr_under(a_node, tag, cls, attr):
    """BFS under a_node for the first matching element with a non-empty attr."""
    st = [a_node]
    while st:
        n = st.pop(0)
        if n.name == tag and cls in classes(n):
            v = n.attrs.get(attr, '')
            if v:
                return v.strip()
        st.extend(n.children)
    return ""

def abs_url(base, href):
    if href.startswith('http://') or href.startswith('https://'):
        return href
    if href.startswith('/'):
        return base + href
    return base.rstrip('/') + '/' + href

with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
    data = f.read()
p = Parser()
p.feed(data)

# Scope the search to div.items.container_3; fall back to the whole tree.
items_containers = [n for n in find_all(p.root, 'div')
                    if {'items', 'container_3'}.issubset(classes(n))]
if not items_containers:
    items_containers = [p.root]

count = 0
for container in items_containers:
    anchors = [n for n in find_all(container, 'a')
               if {'item', 'scene'}.issubset(classes(n))]
    for a in anchors:
        href = a.attrs.get('href', '').strip()
        if not href:
            continue
        # Title fallbacks: span.title → a@title → anchor text → img.thumb@alt → slug
        title = get_text_by_selector(a, 'span', 'title').strip()
        if not title:
            title = a.attrs.get('title', '') or ''
            title = html.unescape(title).strip()
        if not title:
            title = a.get_text().strip()
        if not title:
            title = get_attr_under(a, 'img', 'thumb', 'alt')
        if not title:
            # slug: /casting-x/tanika_8927.html -> Tanika
            base = href.split('/')[-1].split('.')[0] if '/' in href else href
            base = base.rsplit('_', 1)[0]
            title = " ".join(w.capitalize()
                             for w in base.replace('-', ' ').split())
        title = html.unescape(title).strip()
        if not title:
            if debug:
                sys.stdout.write("")  # nothing
            continue
        duration = get_text_by_selector(a, 'span', 'duration').strip()
        thumb = get_attr_under(a, 'img', 'thumb', 'src').strip()
        details = abs_url(base_url, href)
        obj = {
            "id": title,
            "titel": title,
            "thumb": thumb if thumb else None,
            "details": details,
            "duration": duration,
            "published": ""  # set on merge if new
        }
        # Compact JSON line (skip null fields).
        obj = {k: v for k, v in obj.items() if v is not None}
        print(json.dumps(obj, ensure_ascii=False))
        count += 1

if debug:
    print(f"__DEBUG_COUNT__={count}", file=sys.stderr)
PY
}

# ---------- crawl pages ----------
for (( p=1; p<=total_pages; p++ )); do
  page_url="${BASE_URL}${LIST_PATH}?page=${p}"
  html_file="$WORKDIR/page_${p}.html"
  log "Fetching page $p: $page_url"
  if ! curl -fsSL --retry "$CURL_RETRIES" --connect-timeout "$CURL_TIMEOUT" \
       -A "$USER_AGENT" "$page_url" -o "$html_file"; then
    warn "Could not fetch $page_url — skipping"
    continue
  fi
  if [[ ! -s "$html_file" ]]; then
    warn "Empty file for page $p — skipping"
    continue
  fi
  pages_fetched=$((pages_fetched+1))

  # Extract items → append JSON lines.
  py_extract "$html_file" >> "$WORKDIR/new_items.jsonl"

  if [[ $DEBUG -eq 1 ]]; then
    dbg_count=$(grep -c '^{' "$WORKDIR/new_items.jsonl" || true)
    log "DEBUG: accumulated JSONL lines after page $p: $dbg_count"
  fi

  # Polite delay between pages.
  if [[ $p -lt $total_pages && $SLEEP_MS -gt 0 ]]; then
    if command -v usleep >/dev/null 2>&1; then
      usleep "$((SLEEP_MS * 1000))" || true
    else
      sleep "$(awk "BEGIN{printf \"%.3f\", ${SLEEP_MS}/1000}")" || true
    fi
  fi
done

lines_jsonl=$(wc -l < "$WORKDIR/new_items.jsonl" || echo 0)
log "Collected lines in new_items.jsonl: $lines_jsonl"

# ---------- unique by title (last occurrence wins) ----------
if [[ -s "$WORKDIR/new_items.jsonl" ]]; then
  jq -s 'reduce .[] as $o ({}; .[$o.titel] = $o) | to_entries | map(.value)' \
    "$WORKDIR/new_items.jsonl" > "$WORKDIR/new_items.json"
else
  echo "[]" > "$WORKDIR/new_items.json"
fi

# ---------- merge & sort (published on new; updated when duration changes) ----------
jq --arg today "$TODAY" '
  def toMap: reduce .[] as $x ({}; .[$x.titel] = $x);
  . as $newArr
  | input as $oldArr
  | ($oldArr | toMap) as $old
  | reduce $newArr[] as $n ($old;
      . as $m
      | ($m[$n.titel] // null) as $o
      | if $o == null
        then .[$n.titel] = ($n + {published: $today})
        else .[$n.titel] = (
          $o
          | (if ($n.details // "") != "" then . + {details: $n.details} else . end)
          | (if ((.thumb // "") == "" and ($n.thumb // "") != "")
             then . + {thumb: $n.thumb} else . end)
          | (if (($n.duration // "") != "" and ($n.duration // "") != ($o.duration // ""))
             then . + {duration: $n.duration, updated: $today} else . end)
        )
        end
    )
  | (. | to_entries | map(.value))
  | sort_by(.updated // .published)
  | reverse
' "$WORKDIR/new_items.json" "$WORKDIR/old.json" > "$WORKDIR/merged.json"

new_count=$(jq 'length' "$WORKDIR/new_items.json")
old_count=$(jq 'length' "$WORKDIR/old.json")
merged_count=$(jq 'length' "$WORKDIR/merged.json")
log "Pages fetched: $pages_fetched / $total_pages"
log "new_items.json: $new_count | old.json: $old_count | merged.json: $merged_count"

# ---------- atomic write ----------
if [[ $DRY_RUN -eq 1 ]]; then
  log "Dry-run: NOT writing $OUT_FILE"
else
  tmp_out="$WORKDIR/out.tmp.json"
  jq . "$WORKDIR/merged.json" > "$tmp_out"
  [[ -s "$tmp_out" ]] || die "Merged output is empty — aborting write"
  mv "$tmp_out" "$OUT_FILE"
  log "Updated $OUT_FILE"
fi