# wcx_script/compare_advanced.sh

#!/usr/bin/env bash
# compare_advanced.sh file1.mp4 file2.mp4
set -euo pipefail

# Exactly two inputs are needed. A missing argument is treated the same as an
# empty one, and both lead to the usage message.
f1="${1:-}"
f2="${2:-}"
[[ -n "$f1" && -n "$f2" ]] || {
  echo "Usage: $0 <file1> <file2>" >&2
  exit 2
}

# Both inputs must be existing regular files before any ffmpeg work starts.
for f in "$f1" "$f2"; do
  if [[ ! -f "$f" ]]; then
    echo "Not found: $f" >&2
    exit 2
  fi
done

#######################################
# Report whether the local ffmpeg build provides the "signature" video filter.
# Arguments: none
# Returns:   0 if `ffmpeg -filters` lists a filter named exactly "signature",
#            1 otherwise (including when ffmpeg cannot be run)
#######################################
have_signature_filter() {
  local filters
  # Capture the listing first: piping straight into `grep -q` under
  # `set -o pipefail` can report failure when grep exits early and the
  # producer is killed by SIGPIPE.
  filters=$(ffmpeg -hide_banner -filters 2>/dev/null) || return 1
  # Each filter row is "<flags> <name> <inputs>-><outputs> <description>", so
  # the filter name is the second whitespace-separated field.
  awk '$2 == "signature" { found = 1 } END { exit !found }' <<<"$filters"
}

#######################################
# Print a one-line summary of a media file: byte size, container duration and
# the first video stream's codec, resolution and frame rate.
# Arguments: $1 - path to the media file
# Outputs:   one line on stdout
# Returns:   0 on success; non-zero if a probing command fails
#######################################
summary() {
  local f="$1"
  # vcodec and rfr are declared local too, so `read` below does not create
  # script-level globals.
  local size dur v vcodec w h rfr fps
  local numeric_re='^[0-9]+([.][0-9]+)?$'

  # GNU stat first, BSD/macOS stat as fallback.
  size=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
  dur=$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$f" || echo "")
  # The duration may be empty or a non-numeric placeholder such as "N/A".
  # printf's %f would reject that and return non-zero, which aborts the
  # script under `set -e`. Fall back to 0 instead.
  [[ "$dur" =~ $numeric_re ]] || dur=0

  v=$(ffprobe -v error -select_streams v:0 -show_entries stream=codec_name,width,height,r_frame_rate -of csv=p=0 "$f" | head -n1)
  IFS=',' read -r vcodec w h rfr <<<"$v"

  # r_frame_rate is normally a rational such as 30000/1001; guard against a
  # zero denominator and accept a plain number as well.
  if [[ "$rfr" == */* ]]; then
    fps=$(awk -v r="$rfr" 'BEGIN{split(r,a,"/"); if(a[2]==0) print 0; else printf "%.3f", a[1]/a[2]}')
  else
    fps=$(awk -v r="$rfr" 'BEGIN{printf "%.3f", r+0}')
  fi

  printf "  Size: %s bytes | Duration: %.3fs | Video: %s, %sx%s @ %.3f fps\n" "$size" "$dur" "$vcodec" "$w" "$h" "$fps"
}

if have_signature_filter; then
  # Preferred path: ffmpeg's signature filter (robust against re-encode/resize).
  tmpdir=$(mktemp -d)
  trap 'rm -rf -- "$tmpdir"' EXIT

  log="$tmpdir/compare.log"

  echo "Computing video fingerprints (signature filter)..."
  # The filter only searches for matches between streams that are inputs of the
  # same filter instance, so both files are fed to a single two-input graph with
  # detection enabled (nb_inputs=2, detectmode=full). Signature XML files are
  # not media and cannot be passed back to ffmpeg with -i.
  # The result is logged at info level on stderr, which is captured in $log.
  # -nostats keeps progress lines out of the log; -an skips audio processing.
  compare_rc=0
  ffmpeg -hide_banner -nostats -v info -i "$f1" -i "$f2" \
    -filter_complex "[0:v][1:v]signature=nb_inputs=2:detectmode=full" \
    -an -f null - </dev/null 2>"$log" || compare_rc=$?

  echo "Comparing fingerprints..."
  # Messages the parsing below relies on (NOTE(review): wording taken from
  # FFmpeg's signature filter; confirm against the installed build):
  #   "... <N> frames matching"  - a matching sequence was found
  #   "whole video matching"     - the match covers the whole video
  #   "no matching of video ..." - no matching sequence was found
  matches=""
  whole_match=0
  no_match=0
  if (( compare_rc == 0 )); then
    matches=$(grep -Eo '[0-9]+ frames matching' "$log" | tail -n1 | awk '{print $1}' || true)
    if grep -q 'whole video matching' "$log"; then whole_match=1; fi
    if grep -q 'no matching of video' "$log"; then no_match=1; fi
  fi

  echo
  echo "Quick summaries:"
  echo "File 1: $f1"; summary "$f1"
  echo "File 2: $f2"; summary "$f2"
  echo

  if [[ -n "$matches" ]]; then
    echo "Matching frames (signature): ${matches}"
    if (( whole_match )); then
      verdict="Very likely same content"
    elif (( matches >= 50 )); then
      verdict="Very likely same content"
    elif (( matches >= 10 )); then
      verdict="Possibly same"
    else
      verdict="Likely different"
    fi
  elif (( no_match )); then
    echo "Matching frames (signature): 0"
    verdict="Likely different"
  else
    # $tmpdir is removed by the EXIT trap, so copy the log somewhere that
    # survives the script before pointing the user at it.
    kept_log=$(mktemp "${TMPDIR:-/tmp}/compare_advanced.XXXXXX")
    cp -- "$log" "$kept_log"
    echo "Could not parse a similarity score from FFmpeg output; check log at: $kept_log"
    verdict="Inconclusive (check logs)"
  fi

  echo "Verdict: $verdict"
  exit 0
fi

# ---- Fallback heuristic (no signature filter) ----
# Strategy: take one frame per 10 seconds, shrink it to a small grayscale image,
# hash every sampled frame and measure how many hashes the two files share.

printf '%s\n' "Signature filter not available; using frame-sampling heuristic..."

# Scratch directory for frames and hash lists; removed again when the script exits.
tmpdir="$(mktemp -d)"
trap 'rm -rf "$tmpdir"' EXIT

#######################################
# Write one SHA-256 digest per sampled frame of a video into a hash list.
# Frames are sampled at one per 10 seconds, scaled to 160x90 and converted to
# grayscale, stored as numbered PNG files and hashed individually. The digests
# are exact, so two frames only match when their PNG bytes are identical.
# Globals:   tmpdir (read) - scratch directory for the frames and hash list
# Arguments: $1 - path to the video file
#            $2 - base name used for the frame files and the hash list
# Outputs:   $tmpdir/<base>.hashes with one hex digest per line, in frame
#            order; the file is empty when no frame was produced
#######################################
extract_hashes() {
  local f="$1" base="$2"
  local prefix="$tmpdir/${base}"
  local hashes="${prefix}.hashes"
  local frame

  # Drop frames left over from an earlier run with the same base name.
  rm -f -- "${prefix}"_*.png

  # Single decode pass straight to numbered PNGs. Hashing an image2pipe stream
  # as a whole yields one digest for all frames, so that pass is not attempted.
  # stdin is detached so ffmpeg cannot consume the caller's input.
  ffmpeg -v error -i "$f" -vf "fps=1/10,scale=160:90,format=gray" "${prefix}_%05d.png" </dev/null

  for frame in "${prefix}"_*.png; do
    # An unmatched glob stays literal; skip it so zero frames gives an empty list.
    [[ -e "$frame" ]] || continue
    sha256sum <"$frame" | awk '{print $1}'
  done >"$hashes"

  rm -f -- "${prefix}"_*.png
}

# Build the per-frame hash list of each input.
extract_hashes "$f1" A
extract_hashes "$f2" B

hashes_a="$tmpdir/A.hashes"
hashes_b="$tmpdir/B.hashes"

# Line counts are the number of hashed frames per file.
n_a=$(wc -l < "$hashes_a")
n_b=$(wc -l < "$hashes_b")

# Hashes present in both (sorted) lists.
shared=$(comm -12 <(sort "$hashes_a") <(sort "$hashes_b") | wc -l)

# Jaccard index = |common| / |union|; defined as 0 when the union is empty.
jaccard=$(
  awk -v c="$shared" -v a="$n_a" -v b="$n_b" \
    'BEGIN{u=a+b-c; if(u==0) print 0; else printf "%.3f", c/u}'
)

printf '%s\n' ""
printf '%s\n' "Quick summaries:"
printf '%s\n' "File 1: $f1"
summary "$f1"
printf '%s\n' "File 2: $f2"
summary "$f2"
printf '%s\n' ""
printf '%s\n' \
  "Frame-sampling similarity:" \
  "  Frames hashed: file1=$n_a, file2=$n_b" \
  "  Common hashes: $shared" \
  "  Jaccard index: $jaccard"

# Map the index onto a coarse verdict (awk handles the fractional comparison).
verdict=$(awk -v j="$jaccard" 'BEGIN{
  if(j+0 >= 0.80) print "Very likely same content";
  else if(j+0 >= 0.50) print "Possibly same content (edits/recodes)";
  else print "Likely different";
}')
printf '%s\n' "Verdict: $verdict"