folder deduper
If, like me, you have thousands of duplicate photos, you need this. It only deduplicates byte-identical files (matched by MD5 checksum), not visually similar ones. As a safety measure it refuses to run inside any directory whose path contains "github", so it cannot shuffle files around in a code checkout you are working on.
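To see what "identical" means here, note that two files with the same MD5 checksum are byte-for-byte copies, which you can confirm by hand (hypothetical filenames):

md5sum photo.jpg "photo copy.jpg"   # the same hash on both lines means a true duplicate
cmp photo.jpg "photo copy.jpg"      # byte-level comparison; no output means identical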
#!/bin/bash
# dupemd5.sh — report duplicate files by MD5; dry-run move + CSV log
# Usage:
#   bash dupemd5.sh [dir]           # dry-run: only print the mv commands
#   bash dupemd5.sh --apply [dir]   # actually move duplicates into ./duplicates

# Safety check: refuse to run under any path containing "github"
# (case-insensitive), so the script cannot disturb a code checkout
if [[ "${PWD,,}" == *github* ]]; then
  echo "Error: refusing to run inside a GitHub directory ($PWD)" >&2
  exit 1
fi

APPLY=0
if [[ "${1:-}" == "--apply" ]]; then
  APPLY=1
  shift
fi
dir="${1:-.}"

declare -A seen   # md5 -> first file seen with that hash
count=0
duplicates_dir="./duplicates"
csv="$duplicates_dir/duplicates_found.csv"
mkdir -p "$duplicates_dir"
touch "$csv"
# Write the header once; later runs append below any existing rows
if [[ ! -s "$csv" ]]; then
  echo '"item";"original";"duplicate";"target"' > "$csv"
fi
print_summary() {
  echo "Total duplicate pairs found: $count"
}

# Append one CSV row; fields are semicolon-separated and double-quoted,
# with embedded double quotes doubled
log_csv() { # idx, orig, dup, target
  local idx="$1" orig="$2" dup="$3" target="$4"
  esc() { printf '%s' "$1" | sed 's/"/""/g'; }
  printf '"%s";"%s";"%s";"%s"\n' \
    "$(esc "$idx")" "$(esc "$orig")" "$(esc "$dup")" "$(esc "$target")" >> "$csv"
}
# Find a non-colliding name in the target directory by appending _2, _3, ...
unique_name() {
  local target="$1"
  local base="$(basename "$target")"
  local name="${base%.*}"
  local ext="${base##*.}"
  local dir="$(dirname "$target")"
  local n=1
  local candidate="$target"
  while [[ -e "$candidate" ]]; do
    n=$((n+1))
    if [[ "$ext" != "$name" ]]; then   # file has an extension
      candidate="$dir/${name}_$n.$ext"
    else                               # no dot in the name: name == ext
      candidate="$dir/${name}_$n"
    fi
  done
  echo "$candidate"
}
# Dry-run by default; real move with --apply. Prints the final target path
# on stdout so the caller can capture and log it.
move_dup() {
  local dup="$1" target="$2"
  target=$(unique_name "$target")
  if (( APPLY )); then
    mv -n -- "$dup" "$target"
  else
    # Preview goes to stderr: the caller captures stdout with $(...). In a
    # dry run nothing is moved, so collision names may differ from a real run.
    echo "mv -n -- \"$dup\" \"$target\"" >&2
  fi
  echo "$target"
}
# Walk the tree in a stable order, hashing every file. The first file seen
# with a given MD5 is the "original"; later ones are duplicates. The
# duplicates directory itself is excluded so the CSV (and already-moved
# files) are never re-hashed.
while IFS= read -r -d '' file; do
  md5=$(md5sum "$file" | awk '{print $1}')
  if [[ -n "${seen[$md5]:-}" ]]; then
    orig="${seen[$md5]}"
    dup="$file"
    echo "original: $orig"
    echo "apparent duplicate: $dup"
    echo
    # Note: the logged target stays relative to the dedup directory
    target="$duplicates_dir/$(basename "$dup")"
    orig_abs="$(realpath -m "$orig")"
    dup_abs="$(realpath -m "$dup")"
    target_abs="$(move_dup "$dup_abs" "$target")"
    count=$((count+1))
    log_csv "$count" "$orig_abs" "$dup_abs" "$target_abs"
  else
    seen[$md5]="$file"
  fi
done < <(find "$dir" -type f ! -path "$duplicates_dir/*" -print0 | LC_ALL=C sort -z)

print_summary
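Each duplicate found adds one row to duplicates/duplicates_found.csv. An illustrative row (hypothetical paths) looks like this; note that the target column stays relative to the directory the deduper ran in, which is why the undo script below must be run from that same directory:

"1";"/home/pat/photos/img_001.jpg";"/home/pat/photos/backup/img_001.jpg";"./duplicates/img_001.jpg"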
The following is the undo script. Run it from the directory that was deduped, not from inside the duplicates directory, because by default it reads duplicates/duplicates_found.csv and the logged target paths are relative to the deduped directory.
#!/bin/bash
# undo_dupes.sh — restore moved duplicates back to their original locations
# Usage:
#   bash undo_dupes.sh [csv]           # dry-run: only print the mv commands
#   bash undo_dupes.sh --apply [csv]   # actually move files back
# csv defaults to ./duplicates/duplicates_found.csv
APPLY=0
if [[ "${1:-}" == "--apply" ]]; then
  APPLY=1
  shift
fi
csv="${1:-./duplicates/duplicates_found.csv}"
if [[ ! -f "$csv" ]]; then
  echo "CSV not found: $csv" >&2
  exit 1
fi
# Parse the semicolon-separated, double-quoted CSV. FPAT is a gawk
# extension: each field is either an unquoted run without semicolons or a
# quoted string with embedded quotes doubled.
awk -v FPAT='([^;]*)|(\"([^\"]|\"\")*\")' '
NR==1 { next }   # skip header
function unq(s){ if (s ~ /^".*"$/){ s=substr(s,2,length(s)-2); gsub(/""/,"\"",s) } return s }
{
  dup = unq($3)   # original path of the duplicate (destination to restore to)
  tgt = unq($4)   # where the file was moved (source to restore from)
  print dup "\t" tgt
}
' "$csv" | while IFS=$'\t' read -r dup tgt; do
  if [[ -f "$tgt" ]]; then
    if (( APPLY )); then
      mkdir -p -- "$(dirname "$dup")"
      mv -n -- "$tgt" "$dup"
      echo "restored: $dup"
    else
      echo "mv -n -- \"$tgt\" \"$dup\""
    fi
  else
    echo "missing (skipped): $tgt" >&2
  fi
done
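Before running the undo with --apply, it is worth checking that the dry run accounts for every logged row; one quick way, assuming the default CSV path:

bash undo_dupes.sh | wc -l                          # restores the dry run would perform
tail -n +2 duplicates/duplicates_found.csv | wc -l  # data rows in the log

If the counts differ, some moved files are missing from ./duplicates; those are reported on stderr as "missing (skipped)".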