updates
This commit is contained in:
336
wcx_sync.sh
Executable file
336
wcx_sync.sh
Executable file
@@ -0,0 +1,336 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# wcx_sync.sh — v2.5 (robust parser via Python + jq 1.7)
# Keeps wcx_index.json synced from https://www.woodmancastingx.com/casting-xxx/?page=1..N
# Requires: curl, jq >= 1.7, python3 (NO external Python packages needed)

# ---- defaults; each one is overridable by a CLI flag (see usage) ----
BASE_URL="https://www.woodmancastingx.com"   # site root, no trailing slash
LIST_PATH="/casting-xxx/"                    # listing path appended to BASE_URL
OUT_FILE="wcx_index.json"                    # merged index file (--out)
END_PAGE=47                                  # last listing page to fetch (--pages)
SLEEP_MS=300                                 # polite delay between pages (--sleep-ms)
DRY_RUN=0                                    # 1 = skip the final write (--dry-run)
LOG_FILE=""                                  # optional log file, appended to (--log)
DEBUG=0                                      # 1 = verbose extraction logs (--debug)
USER_AGENT="Mozilla/5.0 (X11; Linux x86_64) wcx_sync.sh/2.5"
CURL_RETRIES=3                               # curl --retry count (--retries)
CURL_TIMEOUT=20                              # curl timeout in seconds (--timeout)
|
||||
|
||||
# Print CLI help to stdout. Unquoted heredoc so current defaults are expanded.
usage() {
  cat <<EOF
Usage: $0 [options]

Options:
  --out FILE Output JSON file (default: ${OUT_FILE})
  --pages N Last page to fetch (start=1, default: ${END_PAGE})
  --sleep-ms N Delay between pages in ms (default: ${SLEEP_MS})
  --dry-run Do everything except writing the output file
  --log FILE Append logs to FILE (and also print to stdout)
  --timeout SEC curl timeout (default: ${CURL_TIMEOUT})
  --retries N curl retries (default: ${CURL_RETRIES})
  --debug Verbose per-item extraction logs
  -h|--help Show this help
EOF
}
|
||||
|
||||
# Timestamped logger: always prints to stdout, and additionally appends the
# same line to $LOG_FILE when one is configured.
log() {
  local stamp line
  stamp="$(date +'%Y-%m-%d %H:%M:%S')"
  line="[$stamp] $*"
  if [[ -n "${LOG_FILE}" ]]; then
    printf '%s\n' "$line" >>"$LOG_FILE"
  fi
  printf '%s\n' "$line"
}
|
||||
# Convenience wrapper: log with a WARNING: prefix.
warn() {
  log "WARNING: $*"
}
|
||||
# Print an error message to stderr and abort the whole script with status 1.
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}
|
||||
# ERR trap handler: report the failing line and command, then propagate the
# original exit code.
# NOTE(review): without `set -E` the ERR trap is NOT inherited by shell
# functions or subshells, so failures inside functions won't hit this
# handler — confirm whether that is intended.
on_err(){ local ec=$?; warn "ERROR at line ${BASH_LINENO[0]} (cmd: ${BASH_COMMAND}) — exiting ($ec)"; exit "$ec"; }
trap on_err ERR
|
||||
|
||||
# Abort via die() unless the given command exists on PATH.
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "Missing required tool: $1"
  fi
}
|
||||
# Fail fast if any required external tool is missing.
need curl; need jq; need python3
|
||||
|
||||
# Strip leading and trailing whitespace (space, tab, CR, LF) from $1,
# printing the result without a trailing newline.
trim() {
  local value="$1"
  # Remove the leading run of whitespace chars.
  value="${value#"${value%%[!$' \t\r\n']*}"}"
  # Remove the trailing run of whitespace chars.
  value="${value%"${value##*[!$' \t\r\n']}"}"
  printf '%s' "$value"
}
|
||||
# Resolve an href against $BASE_URL: absolute URLs pass through unchanged,
# site-root paths get BASE_URL prefixed, anything else is joined with '/'.
abs_url() {
  local href="$1"
  case "$href" in
    http://*|https://*) printf '%s' "$href" ;;
    /*)                 printf '%s%s' "$BASE_URL" "$href" ;;
    *)                  printf '%s/%s' "$BASE_URL" "$href" ;;
  esac
}
|
||||
# Derive a human-readable title from an item URL slug.
#   /casting-x/tanika-foo_8927.html -> "Tanika Foo"
# Steps: basename -> strip ".html" -> strip trailing "_<id>" -> dashes to
# spaces -> Title Case each word (lowercase rest, like Python capitalize()).
# FIX: was spawning a python3 subprocess per call just to Title-Case a few
# words; pure-bash ${w,,}/${w^} (bash 4+) produces identical ASCII output.
slug_to_title(){
  local href="$1"
  local base="${href##*/}"     # basename of the URL
  base="${base%.html}"         # drop extension
  local name="${base%_*}"      # drop trailing _<numeric id>
  name="${name//-/ }"          # dashes -> spaces
  local out="" word
  # Unquoted expansion on purpose: split on whitespace like str.split()
  # (assumes IFS is still the default — the script never changes it).
  for word in $name; do
    word="${word,,}"           # lowercase everything ...
    out+="${out:+ }${word^}"   # ... then capitalize the first character
  done
  printf '%s\n' "$out"
}
|
||||
|
||||
# ---------- args ----------
|
||||
# ---------- args ----------
# Flag loop: value-taking options consume two words (shift 2), boolean
# toggles one. ${2:?} aborts with a shell error when a value is missing.
while (( $# > 0 )); do
  case "$1" in
    --out)              OUT_FILE="${2:?}";     shift 2 ;;
    --pages|--end-page) END_PAGE="${2:?}";     shift 2 ;;
    --sleep-ms)         SLEEP_MS="${2:?}";     shift 2 ;;
    --log)              LOG_FILE="${2:?}";     shift 2 ;;
    --timeout)          CURL_TIMEOUT="${2:?}"; shift 2 ;;
    --retries)          CURL_RETRIES="${2:?}"; shift 2 ;;
    --dry-run)          DRY_RUN=1;             shift ;;
    --debug)            DEBUG=1;               shift ;;
    -h|--help)          usage; exit 0 ;;
    *)                  die "Unknown flag: $1" ;;
  esac
done
|
||||
|
||||
# Scratch directory, always removed on exit (EXIT trap coexists with the
# ERR trap installed above).
WORKDIR="$(mktemp -d -t wcx_sync.XXXXXX)"
trap 'rm -rf "$WORKDIR"' EXIT
TODAY="$(date +%F)"   # ISO date stamped onto new/updated items during merge

log "Workdir: $WORKDIR"
|
||||
|
||||
# read existing or init
# Seed old.json from the current index (after validating it parses), or
# start from an empty array on first run.
if [[ -f "$OUT_FILE" ]]; then
  jq -e . "$OUT_FILE" >/dev/null 2>&1 || die "Invalid JSON in $OUT_FILE"
  cp "$OUT_FILE" "$WORKDIR/old.json"
else
  echo "[]" > "$WORKDIR/old.json"
fi

: > "$WORKDIR/new_items.jsonl"   # accumulator: one JSON object per scraped item
total_pages=$END_PAGE
[[ "$total_pages" =~ ^[0-9]+$ ]] || die "--pages must be an integer"

log "Start. pages=1..$total_pages, out=$OUT_FILE, dry_run=$DRY_RUN"

# NOTE(review): items_found, items_skipped_no_title and debug_shown are
# initialized here but never updated anywhere in this file — confirm they
# are dead counters before relying on them.
pages_fetched=0; items_found=0; items_skipped_no_title=0; debug_shown=0
|
||||
|
||||
# ---------- Python extractor (HTML → JSONL) ----------
|
||||
# ---------- Python extractor (HTML → JSONL) ----------
# py_extract FILE — parse one listing page and print one compact JSON object
# per scene anchor (<a class="item scene">) to stdout. Stdlib-only Python;
# argv = (html file, base url, debug flag). Heredoc delimiter is quoted, so
# the shell performs no expansion inside the Python code.
py_extract() {
  python3 - "$1" "$BASE_URL" "$DEBUG" <<'PY'
import sys, html
from html.parser import HTMLParser

html_file, base_url, debug = sys.argv[1], sys.argv[2], int(sys.argv[3])

# Minimal DOM node: tag name, attribute dict, children, accumulated text.
class Node:
    def __init__(self, name, attrs, parent=None):
        self.name=name
        self.attrs=dict(attrs)
        self.children=[]
        self.parent=parent
        self.text=[]
    def add_child(self, node): self.children.append(node)
    def add_text(self, t):
        if t: self.text.append(t)
    def get_text(self):
        # Own text fragments first, then all descendant text, depth-first.
        return "".join(self.text + [c.get_text() for c in self.children if isinstance(c, Node)])

# Builds the Node tree from the raw HTML stream.
# NOTE(review): void elements (img, br, ...) never see an end tag, so they
# remain on the stack and later siblings nest under them; the searches below
# walk all descendants, so matches are still found despite the skewed tree.
class Parser(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.root=Node('root',{})
        self.stack=[self.root]
    def handle_starttag(self, tag, attrs):
        node=Node(tag, attrs, self.stack[-1])
        self.stack[-1].add_child(node)
        self.stack.append(node)
    def handle_endtag(self, tag):
        # pop to the last matching tag
        for i in range(len(self.stack)-1, -1, -1):
            if self.stack[i].name == tag:
                self.stack=self.stack[:i]
                break
    def handle_data(self, data):
        self.stack[-1].add_text(data)

# NOTE(review): has_class is never called in this script (classes() is used
# everywhere instead) — candidate for removal.
def has_class(node, cls):
    c=node.attrs.get('class','')
    return any(x.strip()==cls for x in c.split()) if isinstance(c,str) else False

def classes(node):
    # Set of the node's CSS classes (empty set when no class attribute).
    c=node.attrs.get('class','')
    return set(x.strip() for x in c.split()) if isinstance(c,str) else set()

def find_all(node, tag=None, must_classes=None):
    # DFS: node itself plus every descendant matching tag + class subset.
    out=[]
    st=[node]
    while st:
        n=st.pop()
        if (tag is None or n.name==tag) and (must_classes is None or must_classes.issubset(classes(n))):
            out.append(n)
        st.extend(n.children)
    return out

def find_first(node, tag=None, must_classes=None, attr=None):
    # BFS: first match; when attr is given the match must carry that attribute.
    st=[node]
    while st:
        n=st.pop(0)
        if (tag is None or n.name==tag) and (must_classes is None or must_classes.issubset(classes(n))):
            if attr:
                if attr in n.attrs:
                    return n
            else:
                return n
        st.extend(n.children)
    return None

def get_text_by_selector(a_node, tag, cls):
    # Text of the first <tag class="...cls..."> under a_node, "" if none.
    for n in a_node.children:
        pass  # NOTE(review): no-op loop — leftover code, safe to delete
    # BFS under a_node
    st=[a_node]
    while st:
        n=st.pop(0)
        if n.name==tag and cls in classes(n):
            return n.get_text().strip()
        st.extend(n.children)
    return ""

def get_attr_under(a_node, tag, cls, attr):
    # First non-empty attr value of a matching <tag class="...cls..."> under a_node.
    st=[a_node]
    while st:
        n=st.pop(0)
        if n.name==tag and cls in classes(n):
            v=n.attrs.get(attr,'')
            if v: return v.strip()
        st.extend(n.children)
    return ""

def abs_url(base, href):
    # Python mirror of the shell abs_url(): absolute URLs pass through,
    # /paths get the base prefixed, others are joined with '/'.
    if href.startswith('http://') or href.startswith('https://'):
        return href
    if href.startswith('/'):
        return base + href
    return base.rstrip('/') + '/' + href

with open(html_file,'r',encoding='utf-8', errors='ignore') as f:
    data=f.read()

p=Parser()
p.feed(data)

# scope: div.items.container_3
items_containers = [n for n in find_all(p.root, 'div') if {'items','container_3'}.issubset(classes(n))]
if not items_containers:
    # fallback: search entire tree
    items_containers=[p.root]

count=0
for container in items_containers:
    anchors = [n for n in find_all(container, 'a') if {'item','scene'}.issubset(classes(n))]
    for a in anchors:
        href=a.attrs.get('href','').strip()
        if not href:
            continue

        # Title fallbacks: span.title → a@title → anchor text → img.thumb@alt → slug
        title = get_text_by_selector(a, 'span', 'title').strip()
        if not title:
            title = a.attrs.get('title','') or ''
            title = html.unescape(title).strip()
        if not title:
            title = a.get_text().strip()
        if not title:
            title = get_attr_under(a, 'img', 'thumb', 'alt')
        if not title:
            # slug: /casting-x/tanika_8927.html -> Tanika
            base=href.split('/')[-1].split('.')[0] if '/' in href else href
            base=base.rsplit('_',1)[0]
            title = " ".join(w.capitalize() for w in base.replace('-', ' ').split())

        title=html.unescape(title).strip()
        if not title:
            if debug:
                sys.stdout.write("") # nothing
            continue

        duration = get_text_by_selector(a, 'span', 'duration').strip()
        thumb = get_attr_under(a, 'img', 'thumb', 'src').strip()
        details = abs_url(base_url, href)

        # "titel" (sic) is the key the downstream jq merge is built around —
        # do not rename it.
        obj = {
            "id": title,
            "titel": title,
            "thumb": thumb if thumb else None,
            "details": details,
            "duration": duration,
            "published": "" # set on merge if new
        }
        # compact JSON line (skip null fields)
        import json
        obj = {k:v for k,v in obj.items() if v is not None}
        print(json.dumps(obj, ensure_ascii=False))
        count += 1

if debug:
    print(f"__DEBUG_COUNT__={count}", file=sys.stderr)
PY
}
|
||||
|
||||
# ---------- crawl pages ----------
|
||||
# ---------- crawl pages ----------
# Fetch each listing page, extract its items as JSONL, and sleep politely
# between pages. A failed or empty page is logged and skipped, never fatal.
for (( p=1; p<=total_pages; p++ )); do
  page_url="${BASE_URL}${LIST_PATH}?page=${p}"
  html_file="$WORKDIR/page_${p}.html"

  log "Fetching page $p: $page_url"
  # FIX: --connect-timeout only bounds connection setup, yet --timeout is
  # documented as "curl timeout"; add --max-time so a stalled transfer is
  # also bounded by CURL_TIMEOUT instead of hanging indefinitely.
  if ! curl -fsSL --retry "$CURL_RETRIES" \
       --connect-timeout "$CURL_TIMEOUT" --max-time "$CURL_TIMEOUT" \
       -A "$USER_AGENT" "$page_url" -o "$html_file"; then
    warn "Could not fetch $page_url — skipping"; continue
  fi
  if [[ ! -s "$html_file" ]]; then warn "Empty file for page $p — skipping"; continue; fi
  pages_fetched=$((pages_fetched+1))

  # Extract items → append JSON lines
  py_extract "$html_file" >> "$WORKDIR/new_items.jsonl"
  if [[ $DEBUG -eq 1 ]]; then
    # Counts ALL accumulated JSONL lines so far, not just this page's.
    dbg_count=$(grep -c '^{' "$WORKDIR/new_items.jsonl" || true)
    log "DEBUG: accumulated JSONL lines after page $p: $dbg_count"
  fi

  # polite delay (skipped after the final page)
  if [[ $p -lt $total_pages && $SLEEP_MS -gt 0 ]]; then
    if command -v usleep >/dev/null 2>&1; then usleep "$((SLEEP_MS * 1000))" || true
    else sleep "$(awk "BEGIN{printf \"%.3f\", ${SLEEP_MS}/1000}")" || true
    fi
  fi
done
|
||||
|
||||
lines_jsonl=$(wc -l < "$WORKDIR/new_items.jsonl" || echo 0)
log "Collected lines in new_items.jsonl: $lines_jsonl"

# unique by title (last occurrence wins)
# jq -s slurps the whole JSONL stream into one array, folds it into an
# object keyed by .titel (later entries overwrite earlier duplicates),
# then flattens the object back into an array of items.
if [[ -s "$WORKDIR/new_items.jsonl" ]]; then
  jq -s 'reduce .[] as $o ({}; .[$o.titel] = $o) | to_entries | map(.value)' \
    "$WORKDIR/new_items.jsonl" > "$WORKDIR/new_items.json"
else
  echo "[]" > "$WORKDIR/new_items.json"
fi
|
||||
|
||||
# merge & sort (published on new; updated when duration changes)
|
||||
jq --arg today "$TODAY" '
|
||||
def toMap: reduce .[] as $x ({}; .[$x.titel] = $x);
|
||||
|
||||
. as $newArr | input as $oldArr
|
||||
| ($oldArr | toMap) as $old
|
||||
| reduce $newArr[] as $n ($old;
|
||||
. as $m
|
||||
| ($m[$n.titel] // null) as $o
|
||||
| if $o == null then
|
||||
.[$n.titel] = ($n + {published: $today})
|
||||
else
|
||||
.[$n.titel] =
|
||||
( $o
|
||||
| (if ($n.details // "") != "" then . + {details: $n.details} else . end)
|
||||
| (if ((.thumb // "") == "" and ($n.thumb // "") != "") then . + {thumb: $n.thumb} else . end)
|
||||
| (if (($n.duration // "") != "" and ($n.duration // "") != ($o.duration // ""))
|
||||
then . + {duration: $n.duration, updated: $today}
|
||||
else .
|
||||
end)
|
||||
)
|
||||
end
|
||||
)
|
||||
| (. | to_entries | map(.value))
|
||||
| sort_by(.updated // .published) | reverse
|
||||
' "$WORKDIR/new_items.json" "$WORKDIR/old.json" > "$WORKDIR/merged.json"
|
||||
|
||||
# ---------- summary & atomic write ----------
new_count=$(jq 'length' "$WORKDIR/new_items.json")
old_count=$(jq 'length' "$WORKDIR/old.json")
merged_count=$(jq 'length' "$WORKDIR/merged.json")
log "Pages fetched: $pages_fetched / $total_pages"
log "new_items.json: $new_count | old.json: $old_count | merged.json: $merged_count"

# atomic write
if [[ $DRY_RUN -eq 1 ]]; then
  log "Dry-run: NOT writing $OUT_FILE"
else
  # FIX: create the temp file next to OUT_FILE instead of inside $WORKDIR
  # (usually under /tmp). A cross-filesystem mv is copy+delete — NOT atomic;
  # only a same-directory rename is, and it also preserves readers from ever
  # seeing a half-written index.
  tmp_out="$(mktemp "${OUT_FILE}.XXXXXX")" || die "mktemp failed"
  jq . "$WORKDIR/merged.json" > "$tmp_out" || { rm -f -- "$tmp_out"; die "jq failed formatting merged.json"; }
  [[ -s "$tmp_out" ]] || { rm -f -- "$tmp_out"; die "Merged output is empty — aborting write"; }
  mv -- "$tmp_out" "$OUT_FILE"
  log "Updated $OUT_FILE"
fi
|
||||
Reference in New Issue
Block a user