#!/usr/bin/env bash
# videocmp_select.sh — first-underscore grouping + daily log file + de-dup clusters + timing
#
# Modes:
#   1) Pair mode: compare A and B
#        ./videocmp_select.sh A.mp4 B.mp4 [options]
#
#   2) Directory scan mode: group files by the **first token before the first underscore** in the basename
#        ./videocmp_select.sh --scan-dir [DIR] [--recursive] [options]
#        (DIR defaults to "." if omitted)
#
# Logging:
#   By default, logs are written to: ~/log/videocmp_select.YYMMDD.log
#   Example: ~/log/videocmp_select.251014.log
#   Override with: --log-file /path/to/file.log
#
# Compare helpers (paths):
#   Defaults point to /home/urban/dev/wcx_script
#     --impl-simple PATH   (default: /home/urban/dev/wcx_script/compare_simple.sh)
#     --impl-advanced PATH (default: /home/urban/dev/wcx_script/compare_advanced.sh)
#
# Options (defaults in parentheses):
#   --snapshot-time SEC          snapshot position used for the SSIM test (12)
#   --snapshot-scale W:H         ffmpeg scale applied to both snapshots (320:-1)
#   --snapshot-ssim X            SSIM threshold for "same movie" (0.97)
#   --impl simple|advanced       external compare helper for pair mode (simple)
#   --impl-required              fail when the helper script is missing
#   --impl-optional              skip the helper when it is missing (default)
#   --prefer-height N            height that wins the first quality tie-break (720)
#   --duration-eps SEC           duration difference treated as a tie (0.0)
#   --action print|move|delete   what to do with the loser (print)
#   --trash-dir DIR              destination for --action move (~/.video_trash)
#   --ignore-prefix PAT          prefix stripped before grouping; '*' matches digits (e.g. idx*)
#   --dry-run  --verbose  --debug (implies --verbose)  --recursive|-r  --help|-h
#
# De-dup semantics (within each key/group):
#   • Keep **one** representative for each set of files that are the **same movie**.
#   • Two-stage "same movie":
#       1) Fast path: identical signature (WxH, duration_ms, size) ⇒ same; no SSIM needed.
#       2) Else: SSIM snapshot at --snapshot-time; SSIM ≥ --snapshot-ssim ⇒ same.
#   • After clustering, pick a single **best** (prefer height → duration → size) and drop the rest.
#
# Exit codes:
#   0 success | 1 differ/broken | 2 usage | 3 missing dependency

# --- shell guard: require bash 4+ ---
# POSIX `[` on purpose: this test has to work when the file is started by a
# plain sh, where `[[` does not exist and the re-exec would never be reached.
if [ -z "${BASH_VERSION:-}" ]; then
  echo "[ERR] This script requires bash. Re-running with bash..." >&2
  exec bash "$0" "$@"
fi
if (( ${BASH_VERSINFO[0]:-0} < 4 )); then
  echo "[ERR] bash 4+ required (associative arrays). You have: ${BASH_VERSION}" >&2
  exit 2
fi

set -u

# -------- defaults --------
SNAP_T="12"
SNAP_SCALE="320:-1"
SNAP_SSIM="0.97"
IMPL="simple"
IMPL_SIMPLE="/home/urban/dev/wcx_script/compare_simple.sh"
IMPL_ADV="/home/urban/dev/wcx_script/compare_advanced.sh"
IMPL_REQUIRED=0
PREF_HEIGHT=720
DURATION_EPS=0.0
ACTION="print"
TRASH_DIR="${HOME}/.video_trash"
DRY=0
VERBOSE=0
DEBUG=0
SCAN_DIR=""
RECURSIVE=0

# Fast-path signature config:
# duration is converted to integer milliseconds to avoid FP noise
DURATION_MS_PREC=0

# --- logging defaults ---
DATESTAMP=$(date +%y%m%d)
LOG_DIR="$HOME/log"
LOG_FILE="$LOG_DIR/videocmp_select.${DATESTAMP}.log"

# Prefix pattern stripped from basenames before grouping (--ignore-prefix)
IGNORE_PATTERN=""

# -------- helpers --------

# need CMD: exit 3 unless CMD is available on PATH.
need() {
  command -v "$1" >/dev/null 2>&1 || { echo "Missing dependency: $1" >&2; exit 3; }
}

# v MSG...: verbose log line on stderr (only with --verbose).
v() {
  if [[ $VERBOSE -eq 1 ]]; then echo "[LOG]" "$@" >&2; fi
}

# dbg MSG...: timestamped debug line on stderr (only with --debug).
dbg() {
  if [[ $DEBUG -eq 1 ]]; then printf '[DBG %(%F %T)T] %s\n' -1 "$*" >&2; fi
}

# die MSG...: print an error on stderr and exit 1.
die() { echo "[ERR]" "$@" >&2; exit 1; }

# usage: print the header comment block of this file (everything between the
# shebang and the first non-comment line) with the leading "# " removed.
usage() {
  awk 'NR == 1 { next } /^#/ { sub(/^# ?/, ""); print; next } { exit }' "$0"
}

# now_ms: current time in milliseconds on stdout.
# `date +%s%3N` is a GNU extension; other implementations exit 0 but print the
# format letters literally, so the output is only trusted when it is all digits.
now_ms() {
  local ms s n
  ms=$(date +%s%3N 2>/dev/null)
  if [[ "$ms" =~ ^[0-9]+$ ]]; then
    echo "$ms"
    return
  fi
  s=$(date +%s)
  n=$(date +%N 2>/dev/null)
  [[ "$n" =~ ^[0-9]+$ ]] || n=0
  # 10# forces base 10: %N is zero-padded and a leading 0 would mean octal
  echo $(( s * 1000 + 10#$n / 1000000 ))
}

# duration_ms START END: difference END-START (both in ms) on stdout.
duration_ms() { echo $(( $2 - $1 )); }

need ffmpeg; need ffprobe; need awk; need grep; need stat; need sed; need tr; need find

# -------- array-based option parser --------
# Recognised options are consumed; everything else is collected in REM_ARR
# (the positional A/B files of pair mode).
ARGS=("$@")
REM_ARR=()
i=0
while (( i < ${#ARGS[@]} )); do
  arg="${ARGS[i]}"
  case "$arg" in
    --log-file | --snapshot-time | --snapshot-scale | --snapshot-ssim | --impl | \
      --impl-simple | --impl-advanced | --prefer-height | --duration-eps | \
      --action | --trash-dir | --ignore-prefix)
      # value-taking options: refuse to run with a silently empty value
      if (( i + 1 >= ${#ARGS[@]} )); then
        echo "[ERR] Option $arg requires a value" >&2
        exit 2
      fi
      val="${ARGS[i+1]}"
      case "$arg" in
        --log-file)       LOG_FILE="$val" ;;
        --snapshot-time)  SNAP_T="$val" ;;
        --snapshot-scale) SNAP_SCALE="$val" ;;
        --snapshot-ssim)  SNAP_SSIM="$val" ;;
        --impl)           IMPL="$val" ;;
        --impl-simple)    IMPL_SIMPLE="$val" ;;
        --impl-advanced)  IMPL_ADV="$val" ;;
        --prefer-height)  PREF_HEIGHT="$val" ;;
        --duration-eps)   DURATION_EPS="$val" ;;
        --action)         ACTION="$val" ;;
        --trash-dir)      TRASH_DIR="$val" ;;
        --ignore-prefix)  IGNORE_PATTERN="$val" ;;
      esac
      i=$((i + 2))
      ;;
    --impl-optional) IMPL_REQUIRED=0; i=$((i + 1)) ;;
    --impl-required) IMPL_REQUIRED=1; i=$((i + 1)) ;;
    --dry-run)       DRY=1; i=$((i + 1)) ;;
    --verbose)       VERBOSE=1; i=$((i + 1)) ;;
    --debug)         DEBUG=1; VERBOSE=1; i=$((i + 1)) ;;
    --scan-dir)
      # DIR is optional: the next word is taken only if it is not another option
      next="${ARGS[i+1]:-}"
      if [[ -n "$next" && "$next" != --* ]]; then
        SCAN_DIR="$next"; i=$((i + 2))
      else
        SCAN_DIR="."; i=$((i + 1))
      fi
      ;;
    --recursive | -r) RECURSIVE=1; i=$((i + 1)) ;;
    --help | -h)      usage; exit 0 ;;
    *)                REM_ARR+=("$arg"); i=$((i + 1)) ;;
  esac
done

if [[ -z "$LOG_FILE" ]]; then
  echo "[ERR] --log-file needs a non-empty path" >&2
  exit 2
fi

# initialize logging sink (after options so --log-file can override)
# Only the directory of the effective log file is created.
mkdir -p -- "$(dirname -- "$LOG_FILE")" 2>/dev/null || true
# send both stdout and stderr through tee to the log file
exec > >(tee -a "$LOG_FILE") 2>&1

echo "[INFO] Running with: ${BASH:-/bin/bash} ${BASH_VERSION}"
echo "[INFO] Logging to: $LOG_FILE"
v "[DBG] Options: SCAN_DIR='${SCAN_DIR:-}' RECURSIVE=$RECURSIVE PREF_HEIGHT=$PREF_HEIGHT SNAP_T=$SNAP_T IMPL_SIMPLE=$IMPL_SIMPLE IMPL_ADV=$IMPL_ADV"

# -------- core functions --------

# get_size FILE: file size in bytes on stdout (0 if it cannot be determined).
# Tries GNU stat, then BSD stat, then wc -c.
get_size() {
  local f="$1" s
  if s=$(stat -c %s -- "$f" 2>/dev/null); then echo "$s"; return; fi
  if s=$(stat -f %z -- "$f" 2>/dev/null); then echo "$s"; return; fi
  # 2>/dev/null comes first so a failing "<" redirection is silenced too;
  # some wc implementations pad the count with blanks, which are stripped.
  if s=$(wc -c 2>/dev/null <"$f"); then echo "${s//[[:space:]]/}"; return; fi
  echo 0
}

# probe_meta FILE: prints "w h dur_sec vcodec size_bytes" as five
# space-separated fields. A field ffprobe could not provide is replaced by a
# placeholder (0, or ? for the codec) so the caller's `read` never shifts.
probe_meta() {
  local f="$1" size w h dur vcodec
  size=$(get_size "$f")
  w=$(ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=nw=1:nk=1 "$f")
  h=$(ffprobe -v error -select_streams v:0 -show_entries stream=height -of default=nw=1:nk=1 "$f")
  dur=$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$f")
  vcodec=$(ffprobe -v error -select_streams v:0 -show_entries stream=codec_name -of default=nw=1:nk=1 "$f")
  # keep only the first line of each answer
  w=${w%%$'\n'*}
  h=${h%%$'\n'*}
  dur=${dur%%$'\n'*}
  vcodec=${vcodec%%$'\n'*}
  echo "${w:-0} ${h:-0} ${dur:-0} ${vcodec:-?} ${size:-0}"
}

# Convert
# seconds(float) → ms(int), rounded to the nearest millisecond.
# The value is passed with -v instead of being spliced into the awk program,
# so "N/A" or an empty duration yields 0 rather than an awk error.
sec_to_ms() { awk -v s="${1:-0}" 'BEGIN { printf("%d", s * 1000 + 0.5) }'; }

# Build a fast-path signature: width x height : duration_ms : size
signature() { # file -> sig string
  local f="$1" W H DUR VC SIZE DMS
  read -r W H DUR VC SIZE <<<"$(probe_meta "$f")"
  DMS=$(sec_to_ms "$DUR")
  echo "${W}x${H}:${DMS}ms:${SIZE}"
}

# check_ok FILE: 0 when the file has a video stream with positive dimensions,
# a positive duration (when ffprobe reports one) and decodes without error;
# 1 otherwise, with a BROKEN(...) line on stderr.
check_ok() { # file -> 0 ok / 1 bad
  local f="$1" w h dur
  w=$(ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=nw=1:nk=1 "$f" 2>/dev/null || echo "")
  h=$(ffprobe -v error -select_streams v:0 -show_entries stream=height -of default=nw=1:nk=1 "$f" 2>/dev/null || echo "")
  dur=$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$f" 2>/dev/null || echo "")
  w=${w%%$'\n'*}
  h=${h%%$'\n'*}
  dur=${dur%%$'\n'*}
  # a positive integer is required; this also rejects "", "N/A" and 0
  if ! [[ "$w" =~ ^[1-9][0-9]*$ && "$h" =~ ^[1-9][0-9]*$ ]]; then
    echo "BROKEN(ffprobe: no valid video stream): $f" >&2
    return 1
  fi
  if [[ -n "$dur" && "$dur" != "N/A" ]]; then
    awk -v d="$dur" 'BEGIN{exit !(d+0>0)}' || {
      echo "BROKEN(ffprobe: nonpositive duration): $f" >&2
      return 1
    }
  fi
  # Full decode; -xerror makes ffmpeg stop with a failure status on the first error.
  # -nostdin keeps ffmpeg from reading the script's standard input.
  if ! ffmpeg -nostdin -nostats -v error -xerror -i "$f" -f null - >/dev/null 2>&1; then
    echo "BROKEN(ffmpeg decode): $f" >&2
    return 1
  fi
  return 0
}

# snapshot_compare_ssim F1 F2 TIME SCALE: grabs one frame of each file at TIME,
# scaled with SCALE, and prints their SSIM "All" score (empty output when a
# frame or the score could not be produced). Always returns 0.
snapshot_compare_ssim() { # f1 f2 time scale -> "ssim" (empty if fail)
  local f1="$1" f2="$2" t="$3" sc="$4"
  local tmpd s1 s2 log ssim=""
  tmpd="$(mktemp -d)" || { echo ""; return 0; }
  s1="$tmpd/1.png"
  s2="$tmpd/2.png"
  log="$tmpd/cmp.log"
  ffmpeg -nostdin -hide_banner -v error -y -ss "$t" -i "$f1" -frames:v 1 -vf "scale=$sc,format=yuv420p" "$s1" || true
  ffmpeg -nostdin -hide_banner -v error -y -ss "$t" -i "$f2" -frames:v 1 -vf "scale=$sc,format=yuv420p" "$s2" || true
  if [[ -s "$s1" && -s "$s2" ]]; then
    # the ssim filter reports its result on stderr at log level info
    ffmpeg -nostdin -hide_banner -v info -i "$s1" -i "$s2" -lavfi "ssim" -f null - >/dev/null 2>"$log" || true
    ssim="$(grep -Eo 'All:[0-9]+(\.[0-9]+)?' "$log" | head -n1 | cut -d: -f2)"
    # fall back to the luma score when no "All:" value is present
    [[ -z "$ssim" ]] && ssim="$(grep -Eo 'SSIM Y:[0-9]+(\.[0-9]+)?' "$log" | head -n1 | cut -d: -f2)"
  fi
  rm -rf -- "$tmpd"
  echo "$ssim"
}

# run_impl WHICH A B: runs the external compare helper ("advanced" selects
# IMPL_ADV, anything else IMPL_SIMPLE) and shows the first 12 lines of its
# output on stderr. A missing helper is fatal only with --impl-required.
run_impl() { # impl, A, B
  local which="$1" f1="$2" f2="$3" path="$IMPL_SIMPLE"
  if [[ "$which" == "advanced" ]]; then path="$IMPL_ADV"; fi
  if [[ ! -x "$path" ]]; then
    if [[ $IMPL_REQUIRED -eq 1 ]]; then
      die "Requested --impl=$which but script not found/executable at $path"
    fi
    echo "[impl:$which] not found ($path) — skipping" >&2
    return 0
  fi
  echo "[impl:$which] $path \"$f1\" \"$f2\"" >&2
  "$path" "$f1" "$f2" 2>&1 | sed -n '1,12p' >&2
}

# score_file FILE: prints "tier dur size"; tier is 0 when the height equals
# PREF_HEIGHT exactly, 1 otherwise.
score_file() { # file -> "tier720 dur size"
  local f="$1"
  local W H DUR VC SIZE
  read -r W H DUR VC SIZE <<<"$(probe_meta "$f")"
  local tier=1
  if [[ "$H" -eq "$PREF_HEIGHT" ]]; then tier=0; fi
  echo "$tier $DUR $SIZE"
}

# pick_winner A B: prints "KEEP|DROP|why". Order of criteria: preferred height,
# then longer duration (differences up to DURATION_EPS are a tie), then larger
# file; a complete tie keeps A.
pick_winner() { # A B -> "KEEP|DROP|why"
  local a="$1" b="$2"
  local aTier aDur aSize bTier bDur bSize who
  read -r aTier aDur aSize <<<"$(score_file "$a")"
  read -r bTier bDur bSize <<<"$(score_file "$b")"
  v "Quality scores: A[tier=$aTier dur=$aDur size=$aSize] B[tier=$bTier dur=$bDur size=$bSize]"
  if (( aTier < bTier )); then echo "$a|$b|prefer ${PREF_HEIGHT}p (A)"; return 0; fi
  if (( bTier < aTier )); then echo "$b|$a|prefer ${PREF_HEIGHT}p (B)"; return 0; fi
  # durations are floats, so the comparison is delegated to awk
  who=$(awk -v A="$aDur" -v B="$bDur" -v eps="$DURATION_EPS" \
    'BEGIN { if ((A-B) > eps) print "A"; else if ((B-A) > eps) print "B"; else print "TIE" }')
  case "$who" in
    A) echo "$a|$b|longer duration (A)"; return 0 ;;
    B) echo "$b|$a|longer duration (B)"; return 0 ;;
  esac
  if (( ${aSize:-0} > ${bSize:-0} )); then
    echo "$a|$b|larger file size (A)"
  elif (( ${bSize:-0} > ${aSize:-0} )); then
    echo "$b|$a|larger file size (B)"
  else
    echo "$a|$b|tie-break (keep A)"
  fi
}

# act_on_loser LOSER KEEPER: applies --action (print, move, delete) to LOSER.
# With --dry-run nothing on disk is touched, not even the trash directory.
act_on_loser() { # loser keep
  local loser="$1" keeper="$2"
  # Safety: ensure loser belongs to current group (by first token) to prevent cross-group actions
  local lkey
  lkey="$(first_underscore_key "$(basename -- "$loser")")"
  if [[ -n "${CURRENT_GROUP_KEY:-}" && "$lkey" != "$CURRENT_GROUP_KEY" ]]; then
    echo "[ACTION] skip: '$loser' not in current group '$CURRENT_GROUP_KEY' (lkey='$lkey')"
    return 0
  fi
  if [[ ! -e "$loser" ]]; then
    echo "[ACTION] skip: not found → $loser"
    return 0
  fi
  case "$ACTION" in
    print)
      echo "[ACTION] Keep: $keeper"
      echo "[ACTION] Drop: $loser"
      ;;
    move)
      if [[ $DRY -eq 1 ]]; then
        echo "[ACTION] dry-run: mv \"$loser\" \"$TRASH_DIR/\""
      else
        local dest n=1
        mkdir -p -- "$TRASH_DIR"
        dest="$TRASH_DIR/$(basename -- "$loser")"
        # never overwrite something already in the trash: add a numeric suffix
        if [[ -e "$dest" ]]; then
          while [[ -e "$dest.$n" ]]; do n=$((n + 1)); done
          dest="$dest.$n"
          echo "[ACTION] note: name already present in trash, storing as $dest"
        fi
        if mv -- "$loser" "$dest"; then
          echo "[ACTION] moved to trash: $loser -> $TRASH_DIR/"
        else
          echo "[ACTION] move failed: $loser"
        fi
      fi
      echo "[ACTION] kept: $keeper"
      ;;
    delete)
      if [[ $DRY -eq 1 ]]; then
        echo "[ACTION] dry-run: rm \"$loser\""
      elif rm -- "$loser"; then
        echo "[ACTION] deleted: $loser"
      else
        echo "[ACTION] delete failed: $loser"
      fi
      echo "[ACTION] kept: $keeper"
      ;;
    *)
      echo "[WARN] Unknown --action='$ACTION' → printing only."
      echo "[ACTION] Keep: $keeper ; Drop: $loser"
      ;;
  esac
}

# same_movie_or_skip A B: 0 when the snapshot SSIM reaches SNAP_SSIM, 1 when it
# is lower or could not be computed. Progress lines go to stderr.
same_movie_or_skip() { # A B -> 0 if same (SSIM>=thr), else 1
  local a="$1" b="$2" ssim
  echo "== Snapshot compare @${SNAP_T}s: ==" >&2
  ssim="$(snapshot_compare_ssim "$a" "$b" "$SNAP_T" "$SNAP_SCALE")"
  if [[ -z "$ssim" ]]; then
    echo "[WARN] Could not compute SSIM for: $a vs $b" >&2
    return 1
  fi
  printf "[INFO] SSIM(All) %s vs %s → %s\n" "$(basename -- "$a")" "$(basename -- "$b")" "$ssim" >&2
  awk -v s="$ssim" -v thr="$SNAP_SSIM" 'BEGIN{exit !(s+0 >= thr+0)}'
}

# ----- directory scan helpers/drivers -----

# scan_and_collect: fills the global array FILES with the mp4/m4v/mov/mkv
# files under SCAN_DIR (top level only unless RECURSIVE=1), skipping "._*"
# names. Paths are read line by line, so names containing a newline are not
# supported.
scan_and_collect() {
  local -a depth=()
  if [[ $RECURSIVE -ne 1 ]]; then depth=(-maxdepth 1); fi
  # ${depth[@]+...}: bash before 4.4 treats an empty array as unset under set -u
  mapfile -t FILES < <(
    find "$SCAN_DIR" ${depth[@]+"${depth[@]}"} -type f \
      \( -iname '*.mp4' -o -iname '*.m4v' -o -iname '*.mov' -o -iname '*.mkv' \) \
      ! -name '._*'
  )
}

# key = first token of basename (without extension) before the first underscore; lowercased
# When IGNORE_PATTERN is set, a leading "<pattern>_" is removed first; each '*'
# in the pattern stands for one or more digits.
first_underscore_key() {
  local base extless key pattern
  base="$(basename -- "$1")"
  extless="${base%.*}"
  if [[ -n "${IGNORE_PATTERN:-}" ]]; then
    # Convert pattern to regex: idx* becomes idx[0-9]+
    pattern="${IGNORE_PATTERN//\*/[0-9]+}"
    if [[ "$extless" =~ ^${pattern}_ ]]; then
      # Remove the matched prefix and underscore
      extless="${extless#"${BASH_REMATCH[0]}"}"
    fi
  fi
  if [[ "$extless" == *_* ]]; then
    key="${extless%%_*}"
  else
    key="$extless"
  fi
  echo "${key,,}"
}

# union helper: merge cluster B into A (robust to unset B)
# Works on the caller's FILE_TO_CID / CLUSTER_CONTENT associative arrays.
union_clusters() { # cidA cidB -> merges B into A
  local A="$1" B="$2" line
  [[ "$A" == "$B" ]] && return 0
  # If B doesn't exist (already merged/unset), skip gracefully
  if [[ -z "${CLUSTER_CONTENT[$B]+x}" ]]; then
    dbg "union: skip, cluster B=$B not found"
    return 0
  fi
  # Ensure target A content exists
  if [[ -z "${CLUSTER_CONTENT[$A]+x}" ]]; then
    CLUSTER_CONTENT["$A"]=""
  fi
  while IFS= read -r line; do
    [[ -z "$line" ]] && continue
    FILE_TO_CID["$line"]="$A"
    CLUSTER_CONTENT["$A"]+=$'\n'"$line"
  done < <(printf "%s\n" "${CLUSTER_CONTENT[$B]}" | sed '/^$/d')
  unset 'CLUSTER_CONTENT[$B]'
}

# cluster_group FILES...: create clusters so that "same movie" files share a cid.
# Results are left in FILE_TO_CID, CLUSTER_CONTENT and SIG_CACHE.
cluster_group() { # files... -> create clusters so that "same movie" files share a cid
  local arr=("$@")
  local CID=0
  # Use dynamic scoping: refer to process_group's local assoc arrays
  # shellcheck disable=SC2154
  : "${FILE_TO_CID[@]:-}" "${CLUSTER_CONTENT[@]:-}" "${SIG_CACHE[@]:-}"

  # init each file in its own cluster
  local f id
  for f in "${arr[@]}"; do
    id=$(( ++CID ))
    FILE_TO_CID["$f"]="$id"
    CLUSTER_CONTENT["$id"]+=$'\n'"$f"
    SIG_CACHE["$f"]="$(signature "$f")"
    dbg "init cid=$id file=$f sig=${SIG_CACHE[$f]}"
  done

  # NOTE(review): the source text of this function was lost from the loop
  # header below up to the middle of process_group. The pairwise loop is
  # rebuilt from the surviving locals (i j fi fj ci cj t0 t1) and from the
  # documented two-stage rule (identical signature, else snapshot SSIM).
  # Confirm against the original before relying on it.
  local n=${#arr[@]}
  local i j fa fb ci cj t0 t1
  for ((i = 0; i < n; i++)); do
    fa="${arr[i]}"
    for ((j = i + 1; j < n; j++)); do
      fb="${arr[j]}"
      ci="${FILE_TO_CID[$fa]}"
      cj="${FILE_TO_CID[$fb]}"
      [[ "$ci" == "$cj" ]] && continue # already known to be the same movie
      if [[ "${SIG_CACHE[$fa]}" == "${SIG_CACHE[$fb]}" ]]; then
        dbg "fast-path match: $fa == $fb"
        union_clusters "$ci" "$cj"
        continue
      fi
      t0=$(now_ms)
      if same_movie_or_skip "$fa" "$fb"; then
        union_clusters "$ci" "$cj"
      fi
      t1=$(now_ms)
      dbg "ssim compare took $(duration_ms "$t0" "$t1") ms: $fa vs $fb"
    done
  done
}

# choose_best_in_cluster FILES...: prints the file that survives pick_winner
# against every other member.
# NOTE(review): reconstructed; only the call site survived in the source.
choose_best_in_cluster() { # files... -> best file
  local best="$1" cand keep _drop _why
  shift
  for cand in "$@"; do
    IFS='|' read -r keep _drop _why <<<"$(pick_winner "$best" "$cand")"
    if [[ -n "$keep" ]]; then best="$keep"; fi
  done
  echo "$best"
}

# process_group FILES...: validates the files of one group, clusters the valid
# ones into "same movie" sets, keeps the best of each set and hands every other
# member to act_on_loser.
process_group() { # files...
  # NOTE(review): everything down to the "dbg group=" line is reconstructed.
  # The variable names (key, okfiles, group_t0, CURRENT_GROUP_KEY, the three
  # associative arrays) are taken from code that survived; the message texts
  # printed here are guesses.
  local files=("$@")
  if (( ${#files[@]} == 0 )); then return 0; fi
  local key f group_t0 group_t1
  local okfiles=()
  local -A FILE_TO_CID=() CLUSTER_CONTENT=() SIG_CACHE=()
  key="$(first_underscore_key "${files[0]}")"
  CURRENT_GROUP_KEY="$key" # read by act_on_loser as a cross-group safety check
  group_t0=$(now_ms)

  echo
  echo "=== Group '${key}': ${#files[@]} file(s) ==="
  for f in "${files[@]}"; do
    if check_ok "$f"; then okfiles+=("$f"); fi
  done
  if (( ${#okfiles[@]} == 0 )); then
    echo "[WARN] No valid files in group '${key}'"
    return 0
  fi
  for f in "${okfiles[@]}"; do echo "  $(basename -- "$f") -> $(signature "$f")"; done
  dbg "group='${key}' files=${#okfiles[@]}"

  # Build clusters of "same movie"
  cluster_group "${okfiles[@]}"

  # For each cluster, keep just one best and drop the rest
  local cid content arr best x keepers=()
  for cid in "${!CLUSTER_CONTENT[@]}"; do
    content="${CLUSTER_CONTENT[$cid]:-}"
    mapfile -t arr < <(printf "%s" "$content" | sed '/^$/d')
    dbg "cluster id=$cid size=${#arr[@]}"
    if (( ${#arr[@]} == 0 )); then continue; fi
    if (( ${#arr[@]} == 1 )); then keepers+=("${arr[0]}"); continue; fi
    best="$(choose_best_in_cluster "${arr[@]}")"
    keepers+=("$best")
    # drop all others in the cluster
    for x in "${arr[@]}"; do
      [[ "$x" == "$best" ]] && continue
      echo
      echo "[CLUSTER] $(basename -- "$x") is duplicate of cluster best $(basename -- "$best")"
      act_on_loser "$x" "$best"
    done
  done

  echo
  echo "=== Group result: kept ${#keepers[@]} unique file(s) ==="
  for f in "${keepers[@]}"; do echo "  • $(basename -- "$f")"; done
  group_t1=$(now_ms)
  echo "[TIME] group '${key}' took $(duration_ms "$group_t0" "$group_t1") ms"
}

# dir_mode: scans SCAN_DIR, groups the files by first_underscore_key and runs
# process_group once per group.
dir_mode() {
  [[ -n "${SCAN_DIR:-}" ]] || SCAN_DIR="."
  [[ -d "$SCAN_DIR" ]] || die "Not a directory: $SCAN_DIR"
  echo ">> Directory scan mode on: $SCAN_DIR (recursive=$RECURSIVE, grouping=first-underscore)"
  scan_and_collect
  if [[ ${#FILES[@]} -eq 0 ]]; then
    echo "No video files found."
    exit 0
  fi

  local -A groups=()
  local -a grpfiles=()
  local f base key k
  for f in "${FILES[@]}"; do
    base="$(basename -- "$f")"
    key="$(first_underscore_key "$base")"
    # "=" prefix: a basename starting with "_" has an empty key, and bash
    # rejects an empty associative-array subscript. process_group derives the
    # real key itself, so the prefix never leaves this function.
    groups["=$key"]+=$'\n'"$f"
  done

  for k in "${!groups[@]}"; do
    mapfile -t grpfiles < <(printf '%s\n' "${groups[$k]}" | sed '/^$/d' | sort -u)
    process_group "${grpfiles[@]}"
  done
  echo
  echo ">> Directory scan complete."
}

# ----- pair comparison driver -----

# compare_pair A B: validate both files, require them to be the same movie,
# run the external helper, pick the better file and apply --action to the
# other. Exits 1 when a file is broken or the two are different movies;
# returns 0 on success.
compare_pair() { # A B
  local A="$1" B="$2"
  local okA=0 okB=0 keep drop why

  echo "== Step 1: Validating files =="
  check_ok "$A" && okA=1
  check_ok "$B" && okB=1
  if (( okA == 0 || okB == 0 )); then
    echo "[FAIL] One or both files broken. A_ok=$okA B_ok=$okB" >&2
    exit 1
  fi
  echo "[OK] Both files decoded cleanly."

  echo
  echo "== Step 2: Same-movie test (fast path then SSIM) =="
  if [[ "$(signature "$A")" == "$(signature "$B")" ]]; then
    echo "[OK] Identical signature ⇒ treat as same movie"
  elif ! same_movie_or_skip "$A" "$B"; then
    echo "[FAIL] Files are not the same movie (SSIM < ${SNAP_SSIM})." >&2
    exit 1
  fi
  echo "[OK] Same movie."

  echo
  echo "== Step 3: External compare ($IMPL) =="
  run_impl "$IMPL" "$A" "$B"

  echo
  echo "== Step 4: Quality selection (prefer ${PREF_HEIGHT}p) =="
  # split on the "|" separators only, so paths containing blanks stay intact
  IFS='|' read -r keep drop why <<<"$(pick_winner "$A" "$B")"
  echo "[DECISION] Keep: $keep"
  echo "[DECISION] Drop: $drop"
  echo "[REASON] $why"

  echo
  echo "== Step 5: Action =="
  act_on_loser "$drop" "$keep"

  echo
  echo "== Summary =="
  echo "Kept: $keep"
  echo "Dropped: $drop"
  if [[ "$ACTION" == "move" ]]; then echo "(Moved loser to: $TRASH_DIR)"; fi
  if [[ "$ACTION" == "delete" ]]; then echo "(Loser was deleted)"; fi
  if [[ $DRY -eq 1 ]]; then echo "(Dry-run only; no changes made)"; fi
  # explicit status: a false test on the last line must not become the
  # script's exit code
  return 0
}

# -------- dispatch --------
if [[ -n "${SCAN_DIR:-}" ]]; then
  dir_mode
  exit 0
fi

# Pair mode
if [[ ${#REM_ARR[@]} -lt 2 ]]; then
  echo "Usage (pair): $0 A.mp4 B.mp4 [options]" >&2
  echo "   or (scan): $0 --scan-dir [DIR] [--recursive] [options]" >&2
  exit 2
fi
A="${REM_ARR[0]}"
B="${REM_ARR[1]}"
[[ -f "$A" ]] || die "File not found: $A"
[[ -f "$B" ]] || die "File not found: $B"
compare_pair "$A" "$B"