Files
wcx_script/compare_advanced.sh
2025-10-09 10:55:52 +02:00

128 lines
5.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# compare_advanced.sh file1.mp4 file2.mp4
#
# Strict mode: abort on command failure, on unset variables, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Positional arguments default to empty so the check below (rather than
# `set -u`) produces the usage message.
f1="${1:-}"
f2="${2:-}"

# Both paths are required.
if [[ -z "$f1" ]] || [[ -z "$f2" ]]; then
  echo "Usage: $0 <file1> <file2>" >&2
  exit 2
fi

# Each path must refer to an existing regular file.
for f in "$f1" "$f2"; do
  if [[ ! -f "$f" ]]; then
    echo "Not found: $f" >&2
    exit 2
  fi
done
#######################################
# Report whether the installed ffmpeg provides the "signature" video filter.
# Arguments: none
# Returns:   0 if the filter is listed by `ffmpeg -filters`, non-zero
#            otherwise (including when ffmpeg itself cannot be run).
#######################################
have_signature_filter() {
  local filters
  # Capture the listing first instead of piping straight into `grep -q`:
  # grep exits on the first match, and under `set -o pipefail` an ffmpeg
  # killed by SIGPIPE would make the whole pipeline look like a failure.
  filters=$(ffmpeg -hide_banner -filters 2>/dev/null) || return 1
  # Listing rows look like " ... signature   N->V   description": a flags
  # column (letters/dots), then the filter name as its own word.
  grep -qE '^[[:space:]]*[A-Z.]+[[:space:]]+signature[[:space:]]' <<<"$filters"
}
#######################################
# Pretty print a one-line quick summary of a media file: byte size,
# container duration, and codec / resolution / frame rate of the first
# video stream as reported by ffprobe.
# Arguments: $1 - path to the media file
# Outputs:   one summary line on stdout
# Returns:   0 on success
#######################################
summary() {
  local f="$1"
  # Every scratch variable is local so nothing leaks into the caller's scope.
  local size dur stream vcodec w h rfr fps
  # GNU stat first, BSD/macOS stat as the fallback.
  size=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
  dur=$(ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 "$f" || echo "")
  # ffprobe can report a non-numeric duration (e.g. "N/A"); feeding that to
  # printf's %f makes printf fail, which would abort the script under set -e.
  [[ "$dur" =~ ^[0-9]+([.][0-9]+)?$ ]] || dur=0
  # Guard the stream probe the same way as the duration probe so an
  # unreadable stream yields blank fields instead of killing the script.
  stream=$(ffprobe -v error -select_streams v:0 \
    -show_entries stream=codec_name,width,height,r_frame_rate \
    -of csv=p=0 "$f" | head -n1) || stream=""
  # Trailing "_" absorbs any extra CSV fields after the frame rate.
  IFS=',' read -r vcodec w h rfr _ <<<"$stream"
  if [[ "$rfr" == */* ]]; then
    # Rational frame rate such as 30000/1001; avoid dividing by zero.
    fps=$(awk -v r="$rfr" 'BEGIN{split(r,a,"/"); if(a[2]==0) print 0; else printf "%.3f", a[1]/a[2]}')
  else
    fps=$(awk -v r="$rfr" 'BEGIN{printf "%.3f", r+0}')
  fi
  printf " Size: %s bytes | Duration: %.3fs | Video: %s, %sx%s @ %.3f fps\n" "$size" "$dur" "$vcodec" "$w" "$h" "$fps"
}
#######################################
# Classify the log written by ffmpeg's signature filter in compare mode.
# Arguments: $1 - path to the captured ffmpeg stderr log
# Outputs:   one of "whole <frames>", "partial <frames>", "none", "unknown"
# NOTE(review): the matched phrases ("N frames matching", "whole video
# matching", "no matching of video") are what the signature filter is
# expected to log at info level - confirm against the FFmpeg build in use.
#######################################
parse_signature_log() {
  local log_file="$1" frames
  # Frame count from the last "..., N frames matching" line, if any.
  frames=$(sed -nE 's/.*, ([0-9]+) frames matching.*/\1/p' "$log_file" | tail -n1)
  if grep -q 'whole video matching' "$log_file"; then
    echo "whole ${frames:-0}"
  elif [[ -n "$frames" ]]; then
    echo "partial $frames"
  elif grep -q 'no matching of video' "$log_file"; then
    echo "none"
  else
    echo "unknown"
  fi
}

if have_signature_filter; then
  # Use ffmpeg's signature filter (robust against re-encode/resize)
  tmpdir=$(mktemp -d)
  trap 'rm -rf "$tmpdir"' EXIT
  log="$tmpdir/compare.log"
  echo "Computing and comparing video fingerprints (signature filter)..."
  # The filter compares streams itself when it is given both videos as
  # inputs (nb_inputs=2) with a detect mode enabled; signature files cannot
  # be fed back in as media inputs. The result is logged at info level on
  # stderr, so that is what gets captured. A failing run must not abort the
  # script here: the log is still parsed and shown if inconclusive.
  if ! ffmpeg -hide_banner -nostats -v info -i "$f1" -i "$f2" \
    -filter_complex "[0:v:0][1:v:0]signature=nb_inputs=2:detectmode=full[sig]" \
    -map "[sig]" -f null - 2>"$log" </dev/null; then
    echo "Warning: ffmpeg signature comparison exited with an error" >&2
  fi
  result=$(parse_signature_log "$log")
  echo
  echo "Quick summaries:"
  echo "File 1: $f1"; summary "$f1"
  echo "File 2: $f2"; summary "$f2"
  echo
  case "$result" in
    whole\ *)
      echo "Whole-video match (signature); matching frames: ${result#whole }"
      verdict="Very likely same content"
      ;;
    partial\ *)
      matches="${result#partial }"
      echo "Matching frames (signature): ${matches}"
      verdict=$(awk -v m="$matches" 'BEGIN{ if(m+0 >= 50) print "Very likely same content"; else if(m+0 >= 10) print "Possibly same"; else print "Likely different"; }')
      ;;
    none)
      echo "No matching sequence found (signature)"
      verdict="Likely different"
      ;;
    *)
      # The log lives in a temp dir that is removed on exit, so show it now
      # instead of pointing the user at a path that will no longer exist.
      echo "Could not parse a match result from FFmpeg output; FFmpeg log follows:" >&2
      cat "$log" >&2
      verdict="Inconclusive (check logs)"
      ;;
  esac
  echo "Verdict: $verdict"
  exit 0
fi
# ---- Fallback heuristic (no signature filter) ----
# Strategy: take one frame per 10 seconds, shrink it to a small grayscale
# image, hash every sampled frame, then measure how many hashes overlap.
printf '%s\n' "Signature filter not available; using frame-sampling heuristic..."
tmpdir="$(mktemp -d)"
# Scratch directory is removed whenever the script exits.
trap 'rm -rf "$tmpdir"' EXIT
#######################################
# Sample one frame every 10 seconds from a video, normalise each frame to a
# 160x90 grayscale PNG, and write one SHA-256 hash per frame.
# Globals:   tmpdir (read) - scratch directory for frames and hash lists
# Arguments: $1 - path to the video file
#            $2 - base name; hashes are written to "$tmpdir/<base>.hashes"
# Outputs:   "$tmpdir/<base>.hashes", one hex digest per line (empty file
#            when no frame could be extracted)
# NOTE(review): hashes are exact digests of the PNG bytes, so only
# byte-identical sampled frames count as equal.
#######################################
extract_hashes() {
  local f="$1" base="$2"
  local out="$tmpdir/${base}.hashes"
  local frame
  local -a hasher
  # sha256sum is GNU coreutils; fall back to shasum where it is missing.
  if command -v sha256sum >/dev/null 2>&1; then
    hasher=(sha256sum)
  else
    hasher=(shasum -a 256)
  fi
  rm -f "$tmpdir/${base}"_*.png
  # Write numbered frame files directly. Piping all frames through a single
  # hash invocation can only ever produce one digest for the whole stream,
  # so that attempt just decoded the video a second time for nothing.
  ffmpeg -v error -i "$f" -vf "fps=1/10,scale=160:90,format=gray" \
    "$tmpdir/${base}_%05d.png" </dev/null
  for frame in "$tmpdir/${base}"_*.png; do
    # An unmatched glob stays literal; skip it instead of hashing a
    # non-existent file.
    [[ -e "$frame" ]] || continue
    # Hash via stdin so the digest line never depends on the file name.
    "${hasher[@]}" < "$frame"
  done | awk '{print $1}' > "$out"
  rm -f "$tmpdir/${base}"_*.png
}
#######################################
# Print the Jaccard index (size of intersection / size of union) of the
# distinct lines of two files, formatted with three decimals; prints 0 when
# both files are empty.
# Arguments: $1, $2 - files containing one hash per line
#######################################
jaccard_index() {
  local a_file="$1" b_file="$2"
  local n_a n_b n_common
  # Deduplicate first: repeated frames (e.g. several identical black
  # frames) must not inflate the set sizes, otherwise two videos with
  # identical frame sets can score below 1.
  n_a=$(LC_ALL=C sort -u "$a_file" | wc -l)
  n_b=$(LC_ALL=C sort -u "$b_file" | wc -l)
  n_common=$(LC_ALL=C comm -12 <(LC_ALL=C sort -u "$a_file") <(LC_ALL=C sort -u "$b_file") | wc -l)
  awk -v c="$n_common" -v a="$n_a" -v b="$n_b" \
    'BEGIN{u=a+b-c; if(u==0) print 0; else printf "%.3f", c/u}'
}

extract_hashes "$f1" A
extract_hashes "$f2" B
A="$tmpdir/A.hashes"; B="$tmpdir/B.hashes"
# $(( ... )) strips the leading blanks some wc implementations emit.
countA=$(( $(wc -l < "$A") ))
countB=$(( $(wc -l < "$B") ))
common=$(( $(LC_ALL=C comm -12 <(LC_ALL=C sort -u "$A") <(LC_ALL=C sort -u "$B") | wc -l) ))
jaccard=$(jaccard_index "$A" "$B")
echo
echo "Quick summaries:"
echo "File 1: $f1"; summary "$f1"
echo "File 2: $f2"; summary "$f2"
echo
echo "Frame-sampling similarity:"
echo " Frames hashed: file1=$countA, file2=$countB"
echo " Common hashes: $common"
echo " Jaccard index: $jaccard"
if (( countA == 0 || countB == 0 )); then
  # Without sampled frames there is no evidence either way.
  verdict="Inconclusive (no frames sampled)"
else
  verdict=$(awk -v j="$jaccard" 'BEGIN{
    if(j+0 >= 0.80) print "Very likely same content";
    else if(j+0 >= 0.50) print "Possibly same content (edits/recodes)";
    else print "Likely different";
  }')
fi
echo "Verdict: $verdict"