From ae3de2a2a93b0d31e82ecffc2280d8701e92d1d2 Mon Sep 17 00:00:00 2001 From: Urban Date: Tue, 21 Oct 2025 10:49:56 +0200 Subject: [PATCH] videocmp - new version --- videocmp_select.sh | 314 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 245 insertions(+), 69 deletions(-) diff --git a/videocmp_select.sh b/videocmp_select.sh index 2d93ca9..f00f9f0 100755 --- a/videocmp_select.sh +++ b/videocmp_select.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# videocmp_select.sh +# videocmp_select.sh — first-underscore grouping + daily log file + de-dup clusters + timing # # Modes: # 1) Pair mode: compare A and B @@ -9,41 +9,36 @@ # ./videocmp_select.sh --scan-dir [DIR] [--recursive] [options] # (DIR defaults to "." if omitted) # -# Examples of grouping by first token: -# Becky_California.mp4, Becky_California.1.mp4 → group key "Becky" -# Bianca_Golden.mp4, Bianca_Golden_Cut.mp4 → group key "Bianca" -# Emylia.mp4, Emylia_Argan.mp4, Emylia_Wish.mp4 → group key "Emylia" +# Logging: +# By default, logs are written to: ~/log/videocmp_select.YYMMDD.log +# Example: ~/log/videocmp_select.251014.log +# Override with: --log-file /path/to/file.log # -# Pipeline: -# 1) Validate files (ffprobe fields + ffmpeg deep decode) -# 2) Confirm same movie via snapshot SSIM @ --snapshot-time (default 12s) -# 3) Optional: run external compare impl (simple/advanced) for logging -# 4) Pick preferred: prefer --prefer-height (default 720), then longer duration, then larger file -# 5) Act on loser: --action print|move|delete (with --dry-run) +# Compare helpers (paths): +# Defaults point to /home/urban/dev/wcx_script +# --impl-simple PATH (default: /home/urban/dev/wcx_script/compare_simple.sh) +# --impl-advanced PATH (default: /home/urban/dev/wcx_script/compare_advanced.sh) # -# Common options: -# --snapshot-time SEC (default: 12) -# --snapshot-scale WxH (default: 320:-1) -# --snapshot-ssim THRESH (default: 0.97) -# --impl simple|advanced (default: simple) # logs only -# --impl-simple PATH (default: 
./compare_simple.sh) -# --impl-advanced PATH (default: ./compare_advanced.sh) -# --impl-optional (default) warn if impl missing -# --impl-required error if chosen impl missing -# --prefer-height N (default: 720) -# --duration-eps SEC (default: 0.0) -# --action print|move|delete (default: print) -# --trash-dir PATH (default: $HOME/.video_trash) -# --dry-run -# --verbose -# -# Directory-scan options: -# --scan-dir [DIR] Enable directory mode (DIR optional; default ".") -# --recursive, -r Recurse into subfolders -# (Note: delimiter-based grouping is deprecated; this script now groups by first underscore token.) +# De-dup semantics (within each key/group): +# • Keep **one** representative for each set of files that are the **same movie**. +# • Two-stage "same movie": +# 1) Fast path: identical signature (WxH, duration_ms, size) ⇒ same; no SSIM needed. +# 2) Else: SSIM snapshot at --snapshot-time; SSIM ≥ --snapshot-ssim ⇒ same. +# • After clustering, pick a single **best** (prefer height → duration → size) and drop the rest. # # Exit codes: # 0 success | 1 differ/broken | 2 usage | 3 missing dependency + +# --- shell guard: require bash 4+ (POSIX test so sh/dash can re-exec) --- +if [ -z "${BASH_VERSION:-}" ]; then + echo "[ERR] This script requires bash. Re-running with bash..." >&2 + exec bash "$0" "$@" +fi +if (( ${BASH_VERSINFO[0]:-0} < 4 )); then + echo "[ERR] bash 4+ required (associative arrays). 
You have: ${BASH_VERSION}" >&2 + exit 2 +fi + set -u # -------- defaults -------- @@ -52,8 +47,8 @@ SNAP_SCALE="320:-1" SNAP_SSIM="0.97" IMPL="simple" -IMPL_SIMPLE="./compare_simple.sh" -IMPL_ADV="./compare_advanced.sh" +IMPL_SIMPLE="/home/urban/dev/wcx_script/compare_simple.sh" +IMPL_ADV="/home/urban/dev/wcx_script/compare_advanced.sh" IMPL_REQUIRED=0 PREF_HEIGHT=720 @@ -63,15 +58,37 @@ ACTION="print" TRASH_DIR="${HOME}/.video_trash" DRY=0 VERBOSE=0 +DEBUG=0 SCAN_DIR="" RECURSIVE=0 +# Fast-path signature config: +# duration is converted to integer milliseconds to avoid FP noise +DURATION_MS_PREC=0 + +# --- logging defaults --- +DATESTAMP=$(date +%y%m%d) +LOG_DIR="$HOME/log" +LOG_FILE="$LOG_DIR/videocmp_select.${DATESTAMP}.log" + # -------- helpers -------- need() { command -v "$1" >/dev/null 2>&1 || { echo "Missing dependency: $1" >&2; exit 3; }; } -v() { [[ $VERBOSE -eq 1 ]] && echo "[LOG]" "$@" >&2; } +v() { [[ $VERBOSE -eq 1 ]] && echo "[LOG]" "$@" >&2; } +dbg() { [[ $DEBUG -eq 1 ]] && printf '[DBG %(%F %T)T] %s\n' -1 "$*" >&2; } die() { echo "[ERR]" "$@" >&2; exit 1; } +# current time in milliseconds; date output is used only if all digits (BSD/macOS print %N literally), else whole seconds +now_ms() { + local ms s n + ms=$(date +%s%3N 2>/dev/null) && [[ $ms =~ ^[0-9]+$ ]] && { echo "$ms"; return; } + s=$(date +%s) + n=$(date +%N 2>/dev/null) && [[ $n =~ ^[0-9]+$ ]] || n=0 + echo $(( s*1000 + 10#$n/1000000 )) +} +# delta in ms +duration_ms() { echo $(( $2 - $1 )); } + need ffmpeg; need ffprobe; need awk; need grep; need stat; need sed; need tr; need find # -------- array-based option parser -------- @@ -81,6 +98,7 @@ i=0 while (( i < ${#ARGS[@]} )); do arg="${ARGS[i]}" case "$arg" in + --log-file) LOG_FILE="${ARGS[i+1]:-}"; i=$((i+2));; --snapshot-time) SNAP_T="${ARGS[i+1]:-}"; i=$((i+2));; --snapshot-scale) SNAP_SCALE="${ARGS[i+1]:-}"; i=$((i+2));; --snapshot-ssim) SNAP_SSIM="${ARGS[i+1]:-}"; i=$((i+2));; @@ -95,8 +113,8 @@ while (( i < ${#ARGS[@]} )); do --trash-dir) TRASH_DIR="${ARGS[i+1]:-}"; i=$((i+2));; --dry-run) DRY=1; i=$((i+1));; --verbose) VERBOSE=1; i=$((i+1));; + --debug) 
DEBUG=1; VERBOSE=1; i=$((i+1));; --scan-dir) - # optional arg: use next token unless it looks like another option next="${ARGS[i+1]:-}" if [[ -n "$next" && "$next" != --* ]]; then SCAN_DIR="$next"; i=$((i+2)) @@ -105,24 +123,36 @@ while (( i < ${#ARGS[@]} )); do fi ;; --recursive|-r) RECURSIVE=1; i=$((i+1));; - --delimiter|--delim) # backward-compat: ignore but warn once - echo "[WARN] --delimiter is deprecated; grouping now uses first underscore token." >&2 - i=$((i+ (arg=="--delimiter"?2:1) )) ;; --help|-h) - grep -E '^# ' "$0" | sed 's/^# //' - exit 0 ;; + grep -E '^# ' "$0" | sed 's/^# //' ; exit 0 ;; *) - # leave positional for pair mode REM_ARR+=("$arg"); i=$((i+1));; esac done -v "[DBG] Options: SCAN_DIR='${SCAN_DIR:-}' RECURSIVE=$RECURSIVE PREF_HEIGHT=$PREF_HEIGHT SNAP_T=$SNAP_T" +# initialize logging sink (after options so --log-file can override) +mkdir -p "$LOG_DIR" "$(dirname "$LOG_FILE")" 2>/dev/null || true +# send both stdout and stderr through tee to the log file +exec > >(tee -a "$LOG_FILE") 2>&1 + +echo "[INFO] Running with: ${BASH:-/bin/bash} ${BASH_VERSION}" +echo "[INFO] Logging to: $LOG_FILE" +v "[DBG] Options: SCAN_DIR='${SCAN_DIR:-}' RECURSIVE=$RECURSIVE PREF_HEIGHT=$PREF_HEIGHT SNAP_T=$SNAP_T IMPL_SIMPLE=$IMPL_SIMPLE IMPL_ADV=$IMPL_ADV" # -------- core functions -------- -probe_meta() { # file -> "w h dur codec size" + +# portable file size (GNU/BSD/BusyBox) +get_size() { + local f="$1" s + if s=$(stat -c %s -- "$f" 2>/dev/null); then echo "$s"; return; fi + if s=$(stat -f %z -- "$f" 2>/dev/null); then echo "$s"; return; fi + if s=$(wc -c <"$f" 2>/dev/null); then echo "$s"; return; fi + echo 0 +} + +probe_meta() { # file -> "w h dur_sec vcodec size bytes" local f="$1" size w h dur vcodec - size=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f") + size=$(get_size "$f") w=$(ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=nw=1:nk=1 "$f" || echo 0) h=$(ffprobe -v error -select_streams v:0 -show_entries stream=height -of 
default=nw=1:nk=1 "$f" || echo 0) dur=$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$f" || echo 0) @@ -130,6 +160,17 @@ probe_meta() { # file -> "w h dur codec size" echo "$w $h $dur $vcodec $size" } +# Convert seconds(float) → ms(int); value is passed via awk -v, so non-numeric input (e.g. "N/A") yields 0 +sec_to_ms() { awk -v sec="$1" 'BEGIN{ printf("%d", sec * 1000 + 0.5) }'; } + +# Build a fast-path signature: width x height : duration_ms : size +signature() { # file -> sig string + local f="$1" W H DUR VC SIZE DMS + read -r W H DUR VC SIZE <<<"$(probe_meta "$f")" + DMS=$(sec_to_ms "$DUR") + echo "${W}x${H}:${DMS}ms:${SIZE}" +} + check_ok() { # file -> 0 ok / 1 bad local f="$1" w h dur w=$(ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=nw=1:nk=1 "$f" 2>/dev/null || echo "") @@ -175,6 +216,7 @@ run_impl() { # impl, A, B score_file() { # file -> "tier720 dur size" local f="$1" + local W H DUR VC SIZE read -r W H DUR VC SIZE <<<"$(probe_meta "$f")" local tier=1; [[ "$H" -eq "$PREF_HEIGHT" ]] && tier=0 echo "$tier $DUR $SIZE" @@ -182,6 +224,7 @@ score_file() { # file -> "tier720 dur size" pick_winner() { # A B -> "KEEP|DROP|why" local a="$1" b="$2" + local aTier aDur aSize bTier bDur bSize read -r aTier aDur aSize <<<"$(score_file "$a")" read -r bTier bDur bSize <<<"$(score_file "$b")" v "Quality scores: A[tier=$aTier dur=$aDur size=$aSize] B[tier=$bTier dur=$bDur size=$bSize]" @@ -201,6 +244,13 @@ pick_winner() { # A B -> "KEEP|DROP|why" act_on_loser() { # loser keep local loser="$1" keeper="$2" + # Safety: ensure loser belongs to current group (by first token) to prevent cross-group actions + local lkey + lkey="$(first_underscore_key "$(basename -- "$loser")")" + if [[ -n "${CURRENT_GROUP_KEY:-}" && "$lkey" != "$CURRENT_GROUP_KEY" ]]; then + echo "[ACTION] skip: '$loser' not in current group '$CURRENT_GROUP_KEY' (lkey='$lkey')"; return 0; fi + if [[ ! 
-e "$loser" ]]; then + echo "[ACTION] skip: not found → $loser"; return 0; fi case "$ACTION" in print) echo "[ACTION] Keep: $keeper" @@ -211,7 +261,7 @@ act_on_loser() { # loser keep if [[ $DRY -eq 1 ]]; then echo "[ACTION] dry-run: mv \"$loser\" \"$TRASH_DIR/\"" else - mv -- "$loser" "$TRASH_DIR/" && echo "[ACTION] moved to trash: $loser -> $TRASH_DIR/" + mv -- "$loser" "$TRASH_DIR/" && echo "[ACTION] moved to trash: $loser -> $TRASH_DIR/" || echo "[ACTION] move failed: $loser" fi echo "[ACTION] kept: $keeper" ;; @@ -219,7 +269,7 @@ act_on_loser() { # loser keep if [[ $DRY -eq 1 ]]; then echo "[ACTION] dry-run: rm \"$loser\"" else - rm -- "$loser" && echo "[ACTION] deleted: $loser" + rm -- "$loser" && echo "[ACTION] deleted: $loser" || echo "[ACTION] delete failed: $loser" fi echo "[ACTION] kept: $keeper" ;; @@ -246,8 +296,7 @@ scan_and_collect() { fi } -# New grouping rule: key = first token of basename (without extension) before the first underscore. -# If there is no underscore, the whole basename (without extension) is the key. 
+# key = first token of basename (without extension) before the first underscore; lowercased first_underscore_key() { local base extless key base="$(basename -- "$1")" @@ -257,32 +306,154 @@ first_underscore_key() { else key="$extless" fi - echo "$key" + echo "${key,,}" +} + +# union helper: merge cluster B into A (robust to unset B) +union_clusters() { # cidA cidB -> merges B into A + local A="$1" B="$2" line + [[ "$A" == "$B" ]] && return 0 + # If B doesn't exist (already merged/unset), skip gracefully + if [[ -z "${CLUSTER_CONTENT[$B]+x}" ]]; then + dbg "union: skip, cluster B=$B not found" + return 0 + fi + # Ensure target A content exists + if [[ -z "${CLUSTER_CONTENT[$A]+x}" ]]; then + CLUSTER_CONTENT["$A"]="" + fi + while IFS= read -r line; do + [[ -z "$line" ]] && continue + FILE_TO_CID["$line"]="$A" + CLUSTER_CONTENT["$A"]+=$'\n'"$line" + done < <(printf "%s\n" "${CLUSTER_CONTENT[$B]}" | sed '/^$/d') + unset 'CLUSTER_CONTENT[$B]' +} + +cluster_group() { # files... -> create clusters so that "same movie" files share a cid + local arr=("$@") + local CID=0 + + # Use dynamic scoping: refer to process_group's local assoc arrays + # shellcheck disable=SC2154 + : "${FILE_TO_CID[@]:-}" "${CLUSTER_CONTENT[@]:-}" "${SIG_CACHE[@]:-}" + + # init each file in its own cluster + local f id + for f in "${arr[@]}"; do + id=$(( ++CID )) + FILE_TO_CID["$f"]="$id" + CLUSTER_CONTENT["$id"]+=$'\n'"$f" + SIG_CACHE["$f"]="$(signature "$f")" + dbg "init cid=$id file=$f sig=${SIG_CACHE[$f]}" + done + + local n=${#arr[@]} + local i j fi fj ci cj t0 t1 + for ((i=0;i $(signature "$f")"; done + + dbg "group='${key}' files=${#okfiles[@]}" + + # Build clusters of "same movie" + cluster_group "${okfiles[@]}" + + # For each cluster, keep just one best and drop the rest + local cid content arr best keepers=() + for cid in "${!CLUSTER_CONTENT[@]}"; do + content="${CLUSTER_CONTENT[$cid]:-}" + mapfile -t arr < <(printf "%s" "$content" | sed '/^$/d') + dbg "cluster id=$cid size=${#arr[@]}" + if 
(( ${#arr[@]} == 0 )); then continue; fi + if (( ${#arr[@]} == 1 )); then keepers+=("${arr[0]}"); continue; fi + best="$(choose_best_in_cluster "${arr[@]}")" + keepers+=("$best") + # drop all others in the cluster + local x + for x in "${arr[@]}"; do + [[ "$x" == "$best" ]] && continue + echo + echo "[CLUSTER] $(basename -- "$x") is duplicate of cluster best $(basename -- "$best")" + act_on_loser "$x" "$best" + done done - echo; echo "=== Group result: kept $(basename -- "$best") ===" + echo + echo "=== Group result: kept ${#keepers[@]} unique file(s) ===" + for f in "${keepers[@]}"; do echo " • $(basename -- "$f")"; done + group_t1=$(now_ms) + echo "[TIME] group '${key}' took $(duration_ms "$group_t0" "$group_t1") ms" } dir_mode() { @@ -305,7 +476,8 @@ dir_mode() { process_group "${grpfiles[@]}" done - echo; echo ">> Directory scan complete." + echo + echo ">> Directory scan complete." } # ----- pair comparison driver ----- @@ -316,15 +488,19 @@ compare_pair() { # A B if (( okA==0 || okB==0 )); then echo "[FAIL] One or both files broken. A_ok=$okA B_ok=$okB" >&2; exit 1; fi echo "[OK] Both files decoded cleanly." - echo; echo "== Step 2: Snapshot compare =="; - if ! same_movie_or_skip "$A" "$B"; then - echo "[FAIL] Files are not the same movie (SSIM < ${SNAP_SSIM})." >&2; exit 1 + echo; echo "== Step 2: Same-movie test (fast path then SSIM) ==" + if [[ "$(signature "$A")" == "$(signature "$B")" ]]; then + echo "[OK] Identical signature ⇒ treat as same movie" + else + if ! same_movie_or_skip "$A" "$B"; then + echo "[FAIL] Files are not the same movie (SSIM < ${SNAP_SSIM})." >&2; exit 1 + fi fi echo "[OK] Same movie." 
echo; echo "== Step 3: External compare ($IMPL) =="; run_impl "$IMPL" "$A" "$B" - echo; echo "== Step 4: Quality selection (prefer ${PREF_HEIGHT}p) =="; + echo; echo "== Step 4: Quality selection (prefer ${PREF_HEIGHT}p) ==" read -r keep drop why <<<"$(pick_winner "$A" "$B" | tr '|' ' ')" echo "[DECISION] Keep: $keep"; echo "[DECISION] Drop: $drop"; echo "[REASON] $why"