rename and filter

This commit is contained in:
2025-11-13 14:14:43 +01:00
parent 77d6eca004
commit 3234a19be0

View File

@ -2,49 +2,49 @@
# =============================================================================
# rename_and_filter.sh
#
# Version: 1.3.1
# Last updated: 2025-10-04
# Version: 1.9.0
# Last updated: 2025-11-13 (camelCase- & siffersplit borttagen)
#
# Summary
# -------
# Given a directory, a comma-separated file of reserved words, and an optional
# file extension filter:
# 1) Removes any filename tokens (separated by "_") that match reserved words
# (case-insensitive, exact token matches).
# - EXTRA: If a reserved word appears as the last token (e.g., "..._good"),
# it is removed even if it's only delimited on the left side.
# 2) Replaces one or more spaces with a single underscore "_".
# 3) Collapses multiple "_" into a single "_", and trims leading/trailing "_".
# Döper om filer i en katalog baserat på en kommaseparerad lista med spärrade
# ord (reserved words). Matchning sker mot _tokens_ i basnamnet (skiljetecken
# "_"), med **exakt token-matchning** (case-insensitivt). Inga substring-
# regler och ingen automatisk camelCase/siffersplit.
#
# Notes:
# - No files are deleted; only renaming occurs.
# - Processes files only in the given directory (non-recursive).
# - Collision handling: if target name exists, suffix "_N" before the extension.
# - Reserved words match only against the basename tokens (extension excluded).
# Regler (AUKTORITATIV ORDNING)
# -----------------------------
# OBS: Numreringen återspeglar **exakt tillämpningsordning** i koden. Om en
# ny regel införs mellan t.ex. 3 och 4 ska **alla efterföljande regler
# renumreras**. Referera alltid till regler som "Regel N".
#
# Extension filter:
# - Default: mp4
# - "*" : all files
# - "jpg" : only .jpg (with or without leading dot accepted, e.g. "jpg" or ".jpg")
# Regler:
# 1) [strip_reserved] Ta bort tokens som exakt matchar ett spärrat ord
# (case-insensitivt).
# 2) [strip_reserved] Ta bort tokens som består av enbart siffror, oavsett
# längd (t.ex. 2, 22, 123).
# 3) [strip_reserved] Normalisera basnamnet: ersätt "-" → "_", whitespace → "_",
# kollapsa multipla "_" och dela på "_".
# 4) [normalize_name] Finjustera: kollapsa ev. kvarvarande multipla "_", trimma
# ledande/efterföljande "_" i basnamnet. Extension bibehålls.
# 5) [candidate_name] Krockhantering: om målnamn finns, suffix "_N" före
# extension.
#
# Logging:
# RENAME: "old_name.ext" => "new_name.ext"
# NOCHANGE (dry-run): "file.ext" (nothing to do)
# Noter:
# - Ingen fil raderas; endast rename.
# - Icke-rekursiv som standard; använd --recursive för att gå ned i underkataloger.
# - Extensionfilter: en, flera (kommaseparerade) eller "*" för alla filer.
#
# Usage
# -----
# ./rename_and_filter.sh [--dry-run] <directory> <reserved_words.csv> [extension]
# Exempel
# -------
# # Torrkörning (default mp4)
# ./rename_and_filter.sh --dry-run ./videos reserved_words.csv
#
# Examples
# --------
# # Dry-run on mp4 files (default):
# ./rename_and_filter.sh --dry-run ./videos ./reserved_words.csv
# # Alla filer rekursivt
# ./rename_and_filter.sh -n -r ./media reserved_words.csv "*"
#
# # All files:
# ./rename_and_filter.sh ./media ./reserved_words.csv "*"
#
# # Only jpg:
# ./rename_and_filter.sh ./pictures ./reserved_words.csv jpg
# # jpg & png
# ./rename_and_filter.sh -n ./pics reserved_words.csv jpg,png
#
# Exit codes
# ----------
@ -53,87 +53,87 @@
# 2 Requires bash >= 4
# =============================================================================
set -euo pipefail
# Viktigt: vi kör INTE set -e, för att undvika aborter mitt i loopen.
# Vi behåller -u och pipefail för rimlig säkerhet.
set -uo pipefail
SCRIPT_VERSION="1.9.0"
# Requires bash 4 for associative arrays
# Bash 4 krävs för assoc. arrayer & case-conversion
if [ "${BASH_VERSINFO:-0}" -lt 4 ]; then
echo "This script requires bash >= 4." >&2
exit 2
fi
DRY_RUN=0
RECURSIVE=0
MIN_LEN=0
VERBOSE=0
print_help() {
sed -n '2,160p' "$0" | sed 's/^# \{0,1\}//'
sed -n '2,200p' "$0" | sed 's/^# \{0,1\}//'
}
# Print the authoritative, numbered list of renaming rules to stdout, one rule
# per line. Invoked by the --print-rules command-line option.
print_rules() {
  printf '%s\n' \
    'Regler (auktoritativ ordning)' \
    '1) [strip_reserved] Ta bort tokens som exakt matchar ett spärrat ord (case-insensitivt).' \
    '2) [strip_reserved] Ta bort tokens som består av enbart siffror, oavsett längd (t.ex. 2, 22, 123).' \
    '3) [strip_reserved] Normalisera basnamnet: ersätt "-" → "_", whitespace → "_", kollapsa multipla "_" och dela på "_".' \
    '4) [normalize_name] Finjustera: kollapsa ev. kvarvarande multipla "_", trimma ledande/efterföljande "_" i basnamnet. Extension bibehålls.' \
    '5) [candidate_name] Krockhantering: om målnamn finns, suffix "_N" före extension.'
}
# Write one log line to stdout in the form "[YYYY-MM-DD HH:MM:SS] message".
# All arguments are joined into a single message string via "$*".
log() {
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*"
}
rename_file() {
local src="$1"
local dst="$2"
# Guard: identical path, nothing to do
if [[ "$src" == "$dst" ]]; then
if (( DRY_RUN )); then
# CHANGE: add `--` so basename doesn't treat dash-prefixed names as options
log "NOCHANGE (dry-run): \"$(basename -- "$src")\" (nothing to do)"
fi
return 0
fi
local base ext candidate n
base="${dst##*/}"
ext=""
if [[ "$base" == *.* ]]; then
ext=".${base##*.}"
base="${base%.*}"
fi
candidate="$dst"
n=1
# Avoid collisions (unless it's the same file, already handled above)
while [[ -e "$candidate" && "$src" != "$candidate" ]]; do
candidate="$(dirname "$dst")/${base}_$n$ext"
((n++))
# --- Argument parsing ---
# Parse the command line.
#   Globals written: DRY_RUN, RECURSIVE, VERBOSE, MIN_LEN, ARGS
#   Arguments:       the script's own "$@"
#   Exits:           0 after -h/--help or --print-rules,
#                    1 on an unknown option or a bad/missing --min-len value
# Everything that is not an option is collected, in order, in ARGS. A literal
# "--" ends option parsing; all arguments after it are taken as positionals,
# which allows directory or file names that start with "-".
parse_args() {
  ARGS=()
  while (( $# )); do
    case "$1" in
      -h|--help)
        print_help
        exit 0
        ;;
      --print-rules)
        print_rules
        exit 0
        ;;
      -n|--dry-run)
        DRY_RUN=1
        shift
        ;;
      -r|--recursive)
        RECURSIVE=1
        shift
        ;;
      -v|--verbose)
        VERBOSE=1
        shift
        ;;
      --min-len)
        # Without this check a trailing "--min-len" makes "shift 2" fail
        # without shifting anything, and the loop would never terminate.
        if (( $# < 2 )) || [[ ! "$2" =~ ^[0-9]+$ ]]; then
          echo "Error: --min-len requires a non-negative integer." >&2
          exit 1
        fi
        # Force base 10 so values such as "08" are not read as octal.
        MIN_LEN=$((10#$2))
        shift 2
        ;;
      --)
        shift
        # Keep everything after "--" as positional arguments.
        ARGS+=("$@")
        break
        ;;
      -*)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
      *)
        ARGS+=("$1")
        shift
        ;;
    esac
  done
}
parse_args "$@"
if (( DRY_RUN )); then
# CHANGE: add `--` for safe logging
log "RENAME: \"$(basename -- "$src")\" => \"$(basename -- "$candidate")\""
else
mv -v -- "$src" "$candidate" >/dev/null
# CHANGE: add `--` for safe logging
log "RENAME: \"$(basename -- "$src")\" => \"$(basename -- "$candidate")\""
fi
}
# --- Argument parsing ---
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
print_help
exit 0
fi
if [[ "${1:-}" == "-n" || "${1:-}" == "--dry-run" ]]; then
DRY_RUN=1
shift
fi
if [[ $# -lt 2 || $# -gt 3 ]]; then
echo "Error: Provide <directory> <reserved_words.csv> [extension]. Use --help for info." >&2
# Positional args
if (( ${#ARGS[@]} < 2 || ${#ARGS[@]} > 3 )); then
echo "Error: Provide <directory> <reserved_words.csv> [extension|list|*]. Use --help for info." >&2
exit 1
fi
DIR="$1"
WORD_FILE="$2"
EXT="${3:-mp4}"
DIR="${ARGS[0]}"
WORD_FILE="${ARGS[1]}"
EXT_ARG="${ARGS[2]:-mp4}"
# Normalize EXT: strip leading dot if present
if [[ "$EXT" == .* ]]; then
EXT="${EXT#.}"
# Normalize extensions list into a csv string (lowercase, strip leading dots)
# Normalize an extension filter argument into a canonical CSV string.
#   Arguments: $1 - "*" (all files) or a comma-separated list of extensions,
#                   e.g. "jpg", ".JPG,png" or "jpg, png"
#   Outputs:   "*" unchanged; otherwise the entries lowercased, with
#              surrounding whitespace and one leading dot removed, joined by
#              ",". Entries that end up empty are dropped.
#   Returns:   0
normalize_ext_list() {
  local arg="$1"
  local -a raw=() out=()
  local item

  if [[ "$arg" == "*" ]]; then
    printf '%s\n' "*"
    return 0
  fi

  IFS=',' read -r -a raw <<< "$arg"
  # The ${arr[@]+...} form keeps "set -u" quiet for empty arrays on bash < 4.4.
  for item in ${raw[@]+"${raw[@]}"}; do
    item="${item#"${item%%[![:space:]]*}"}"   # trim leading whitespace
    item="${item%"${item##*[![:space:]]}"}"   # trim trailing whitespace
    item="${item#.}"
    [[ -z "$item" ]] && continue
    out+=("${item,,}")
  done

  if (( ${#out[@]} == 0 )); then
    printf '\n'
    return 0
  fi

  # Function-local IFS: "${out[*]}" joins on its first character.
  local IFS=','
  printf '%s\n' "${out[*]}"
}
EXT_LIST_STR="$(normalize_ext_list "$EXT_ARG")"
# Validate inputs
if [[ ! -d "$DIR" ]]; then
echo "Error: Directory does not exist: $DIR" >&2
exit 1
@ -143,67 +143,102 @@ if [[ ! -f "$WORD_FILE" ]]; then
exit 1
fi
# --- Load reserved words into an associative set ---
# --- Load reserved words (to associative set) ---
declare -A RESERVED=()
# Allow comma and newlines as separators
# shellcheck disable=SC2002
mapfile -t _tokens < <(cat "$WORD_FILE" | tr ',\r\n' '\n')
mapfile -t _tokens < <(tr ',\r\n' '\n' < "$WORD_FILE")
for raw in "${_tokens[@]}"; do
# trim whitespace
w="${raw#"${raw%%[![:space:]]*}"}" # ltrim
w="${w%"${w##*[![:space:]]}"}" # rtrim
w="${raw#"${raw%%[![:space:]]*}"}"
w="${w%"${w##*[![:space:]]}"}"
[[ -z "$w" ]] && continue
RESERVED["${w,,}"]=1
lw="${w,,}" # case-insensitive
lw="${lw//-/_}" # normalisera även ordlista: '-' → '_'
RESERVED["$lw"]=1
done
log "Processing directory: $DIR (dry-run=$DRY_RUN, extension=${EXT})"
log "Reserved words loaded: ${#RESERVED[@]}"
# --- Helpers ---
# Test whether a file's extension is in an extension list.
#   Arguments: $1 - file path
#              $2 - "*" (match everything) or a comma-separated list of
#                   extensions without leading dots
#   Returns:   0 if the list is "*" or the file's extension (the text after
#              the last "." in its basename, compared case-insensitively) is
#              in the list; 1 otherwise, including names without any ".".
matches_extension() {
  local file="$1" ext_list="$2"
  local base ext wanted
  local -a wanted_list=()

  [[ "$ext_list" == "*" ]] && return 0

  # Parameter expansion instead of basename(1): no fork per file, and names
  # starting with "-" cannot be mistaken for command options.
  base="${file##*/}"
  [[ "$base" == *.* ]] || return 1
  ext="${base##*.}"
  ext="${ext,,}"

  IFS=',' read -r -a wanted_list <<< "$ext_list"
  for wanted in ${wanted_list[@]+"${wanted_list[@]}"}; do
    [[ "$ext" == "${wanted,,}" ]] && return 0
  done
  return 1
}
# --- Normalize (rules 2 & 3) ---
normalize_name() {
local name="$1"
# Replace one-or-more spaces with single underscore
name="$(printf '%s' "$name" | sed -E 's/[[:space:]]+/_/g')"
# Collapse multiple underscores
local name="$1" base ext
# strip_reserved har redan gjort whitespace → "_" och en första kollaps
name="$(printf '%s' "$name" | sed -E 's/_+/_/g')"
# Trim leading/trailing underscores from the basename (keep extension)
local base ext
base="$name"
ext=""
base="$name"; ext=""
if [[ "$base" == *.* ]]; then
ext=".${base##*.}"
base="${base%.*}"
fi
base="${base##_}"
base="${base%%_}"
base="${base#_}"
base="${base%_}"
printf '%s%s' "$base" "$ext"
}
# --- Remove reserved tokens from basename (plus end-token rule) ---
# Pick a collision-free target path for a rename.
#   Arguments: $1 - desired destination path
#              $2 - source path of the file being renamed
#   Outputs:   $1 unchanged if nothing exists there or if it equals $2;
#              otherwise the first "<dir>/<base>_N<ext>" (N = 1, 2, ...) that
#              does not exist or equals $2. Written to stdout without a
#              trailing newline.
# The extension is everything from the last "." of the destination's basename.
candidate_name() {
  local dst="$1" src="$2"
  local dir base ext candidate n

  # Resolved once: the directory does not change between attempts. "--" keeps
  # a relative path that starts with "-" from being parsed as an option.
  dir="$(dirname -- "$dst")"

  base="${dst##*/}"
  ext=""
  if [[ "$base" == *.* ]]; then
    ext=".${base##*.}"
    base="${base%.*}"
  fi

  candidate="$dst"
  n=1
  while [[ -e "$candidate" && "$src" != "$candidate" ]]; do
    candidate="${dir}/${base}_${n}${ext}"
    n=$((n + 1))
  done
  printf '%s' "$candidate"
}
strip_reserved() {
local name="$1"
local base ext token lw
local name="$1" base ext token lw
local -a parts new_parts=()
# CHANGE: pass `--` to basename for dash-prefixed names
base="$(basename -- "$name")"
ext=""
base="$(basename "$name")"; ext=""
if [[ "$base" == *.* ]]; then
ext=".${base##*.}"
base="${base%.*}"
fi
# Pre-normalize for tokenization
# Pre-normalisering & tokenisering:
# - ersätt '-' → '_'
# - ersätt all whitespace → '_'
# - kollapsa multipla '_'
# - dela på '_'
local norm_base
norm_base="$(printf '%s' "$base" | sed -E 's/[[:space:]]+/_/g' | sed -E 's/_+/_/g')"
norm_base="$(printf '%s' "$base" \
| sed -E 's/-/_/g' \
| sed -E 's/[[:space:]]+/_/g' \
| sed -E 's/_+/_/g')"
IFS='_' read -r -a parts <<< "$norm_base"
for token in "${parts[@]}"; do
[[ -z "$token" ]] && continue
lw="${token,,}"
if [[ -n "${RESERVED[$lw]:-}" ]]; then
# (Regel 2) slopa rena siffertokens
if [[ "$token" =~ ^[[:digit:]]+$ ]]; then
(( VERBOSE )) && log "DEBUG: drop numeric token '$token'"
continue
fi
# (Regel 1) exakt token = reserverat ord (case-insensitivt)
if [[ -n "${RESERVED[$lw]:-}" ]]; then
(( VERBOSE )) && log "DEBUG: drop reserved token '$token'"
continue
fi
new_parts+=("$token")
done
@ -214,63 +249,94 @@ strip_reserved() {
new_base="$(IFS=_; echo "${new_parts[*]}")"
fi
# Extra end-token rule (defensive): if last token is reserved, drop it
if [[ -n "$new_base" ]]; then
local last last_lc
last="${new_base##*_}"
last_lc="${last,,}"
if [[ -n "${RESERVED[$last_lc]:-}" ]]; then
if [[ "$new_base" == *_* ]]; then
new_base="${new_base%_*}"
else
new_base=""
fi
fi
fi
# Fallback if everything disappeared
if [[ -z "$new_base" ]]; then
new_base="untitled"
fi
[[ -z "$new_base" ]] && new_base="untitled"
printf '%s%s' "$new_base" "$ext"
}
# --- File pattern based on EXT ---
# --- Walk files ---
shopt -s nullglob
pattern="*"
if [[ "$EXT" != "*" ]]; then
pattern="*.$EXT"
VISITED=0
RENAMED=0
UNCHANGED=0
COLLISIONED=0
log "rename_and_filter.sh version $SCRIPT_VERSION"
log "Processing directory: $DIR (dry-run=$DRY_RUN, recursive=$RECURSIVE, min-len=$MIN_LEN, extensions=$EXT_LIST_STR, verbose=$VERBOSE)"
log "Reserved words loaded: ${#RESERVED[@]}"
process_one() {
local path="$1"
[[ -f "$path" ]] || return 0
if ! matches_extension "$path" "$EXT_LIST_STR"; then
if (( DRY_RUN && VERBOSE )); then
log "SKIP (ext): $path"
fi
return 0
fi
# --- Main loop ---
for path in "$DIR"/$pattern; do
[[ -f "$path" ]] || continue
((VISITED++))
if (( DRY_RUN || VERBOSE )); then
log "Checking file: $path"
fi
orig_basename="$(basename -- "$path")"
# NEW RULE (minimal): remove ALL leading dashes before processing
clean_basename="$orig_basename"
while [[ "$clean_basename" == -* ]]; do
clean_basename="${clean_basename#-}"
done
# 1) remove reserved tokens from basename (on the cleaned name)
stripped="$(strip_reserved "$clean_basename")"
# 2 & 3) normalize (spaces -> "_", collapse "_", trim)
local orig_basename stripped new_name bn target cand
orig_basename="$(basename "$path")"
stripped="$(strip_reserved "$orig_basename")"
new_name="$(normalize_name "$stripped")"
# If nothing changed, do not attempt to rename at all
if [[ "$new_name" == "$orig_basename" ]]; then
if (( DRY_RUN )); then
log "NOCHANGE (dry-run): \"${orig_basename}\" (nothing to do)"
if (( VERBOSE )); then
log "DEBUG: orig='${orig_basename}' stripped='${stripped}' normalized='${new_name}'"
fi
continue
bn="$new_name"
[[ "$bn" == *.* ]] && bn="${bn%.*}"
if (( MIN_LEN > 0 )) && (( ${#bn} < MIN_LEN )); then
if (( DRY_RUN || VERBOSE )); then
log "NOCHANGE: \"${orig_basename}\" (too short after normalize)"
fi
((UNCHANGED++))
return 0
fi
if [[ "$new_name" == "$orig_basename" ]]; then
if (( DRY_RUN || VERBOSE )); then
log "NOCHANGE: \"${orig_basename}\" (nothing to do)"
fi
((UNCHANGED++))
return 0
fi
target="$(dirname "$path")/$new_name"
rename_file "$path" "$target"
done
cand="$(candidate_name "$target" "$path")"
if [[ "$cand" != "$target" ]]; then
((COLLISIONED++))
fi
if (( DRY_RUN )); then
log "RENAME: \"$(basename "$path")\" => \"$(basename "$cand")\""
else
mv -- "$path" "$cand"
log "RENAME: \"$(basename "$path")\" => \"$(basename "$cand")\""
fi
((RENAMED++))
}
# --- Main loop ---
# Feed every candidate path to process_one, which does its own regular-file
# and extension filtering.
if (( RECURSIVE )); then
  log "[DEBUG] Enter recursive loop (find '$DIR')"
  # Collect the complete file list BEFORE renaming anything. Renaming while
  # find is still walking the tree could let it report a file a second time
  # under its new name. NUL-delimited so any file name is handled safely.
  _found_paths=()
  while IFS= read -r -d '' path; do
    _found_paths+=("$path")
  done < <(find "$DIR" -type f -print0)

  # The ${arr[@]+...} form keeps "set -u" quiet for empty arrays on bash < 4.4.
  for path in ${_found_paths[@]+"${_found_paths[@]}"}; do
    log "[DEBUG] raw candidate (recursive): $path"
    process_one "$path"
  done
else
  log "[DEBUG] Enter non-recursive loop over '$DIR'/*"
  # The glob is fully expanded before the first iteration, so renames made by
  # process_one cannot change the set of paths visited here.
  for path in "$DIR"/*; do
    log "[DEBUG] raw candidate: $path"
    process_one "$path"
  done
fi
log "Summary: visited=$VISITED, renamed=$RENAMED, unchanged=$UNCHANGED, collisions_adjusted=$COLLISIONED"
log "Done."