folder deduper

If, like me, you have thousands of duplicate photos, you need this. It only deduplicates byte-identical files (matched by MD5 hash), not visually similar ones. By default it is a dry run that only prints the moves it would make; pass --apply to actually move duplicates into a ./duplicates folder and log them to a CSV. It also refuses to run inside any directory whose path contains "github", so it cannot shuffle files around in a code repository you are working on.
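A quick usage sketch, assuming the script below is saved as dupemd5.sh and your photos live in ~/Pictures (substitute your own path). Note that the duplicates folder and the CSV log are created in the directory you run the script from, not inside the directory being scanned.

# dry run (default): prints the mv commands and writes the CSV, but moves nothing
bash dupemd5.sh ~/Pictures

# real run: moves each duplicate into ./duplicates and logs it
bash dupemd5.sh --apply ~/Pictures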


#!/bin/bash
# dupemd5.sh — report duplicate files by MD5; dry-run move + CSV log

# Safety check: do not allow running inside GitHub directories
if [[ "$(pwd)" =~ [Gg]ithub ]]; then
    echo "Error: refusing to run inside a GitHub directory ($(pwd))" >&2
    exit 1
fi

APPLY=0
if [[ "${1:-}" == "--apply" ]]; then
    APPLY=1
    shift
fi
dir="${1:-.}"

declare -A seen   # maps an MD5 hash to the first file seen with that hash (needs bash 4+)
count=0           # number of duplicate pairs found

duplicates_dir="./duplicates"
csv="$duplicates_dir/duplicates_found.csv"

mkdir -p "$duplicates_dir"
touch "$csv"
if [[ ! -s "$csv" ]]; then
    echo '"item";"original";"duplicate";"target"' > "$csv"
fi

print_summary() {
    echo "Total duplicate pairs found: $count"
}

log_csv() { # idx, orig, dup, target
    local idx="$1" orig="$2" dup="$3" target="$4"
    esc() { printf '%s' "$1" | sed 's/"/""/g'; }
    printf '"%s";"%s";"%s";"%s"\n' \
        "$(esc "$idx")" "$(esc "$orig")" "$(esc "$dup")" "$(esc "$target")" >> "$csv"
}

unique_name() { # if the target name is taken, append _2, _3, ... before the extension
    local target="$1"
    local base="$(basename "$target")"
    local name="${base%.*}"
    local ext="${base##*.}"
    local dir="$(dirname "$target")"
    local n=1
    local candidate="$target"
    while [[ -e "$candidate" ]]; do
        n=$((n+1))
        if [[ "$ext" != "$name" ]]; then
            candidate="$dir/${name}_$n.$ext"
        else
            candidate="$dir/${name}_$n"
        fi
    done
    echo "$candidate"
}

move_dup() { # dry-run by default; real move with --apply
    local dup="$1" target="$2"
    target=$(unique_name "$target")
    if (( APPLY )); then
        mv -n -- "$dup" "$target"
    else
        # show the would-be command on stderr so the command substitution
        # in the main loop captures only the target path printed below
        echo "mv -n -- \"$dup\" \"$target\"" >&2
    fi
    echo "$target"
}

while IFS= read -r -d '' file; do
    md5=$(md5sum "$file" | awk '{print $1}')
    if [[ -n "${seen[$md5]}" ]]; then
        echo "original: ${seen[$md5]}"
        echo "apparent duplicate: $file"
        echo
        echo

        orig="${seen[$md5]}"
        dup="$file"
        target="$duplicates_dir/$(basename "$dup")" 

        orig_abs="$(realpath -m "$orig")"
        dup_abs="$(realpath -m "$dup")"

        target_abs="$(move_dup "$dup_abs" "$target")"
        count=$((count+1))
        log_csv "$count" "$orig_abs" "$dup_abs" "$target_abs"
    else
        seen[$md5]="$file"
    fi
done < <(find "$dir" -path "$duplicates_dir" -prune -o -type f -print0 | LC_ALL=C sort -z)  # skip the duplicates folder itself on re-runs

print_summary
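Each duplicate that is found gets one row in duplicates/duplicates_found.csv. The fields are semicolon-separated and double-quoted (quotes inside a path are doubled); the paths below are made up, just to show the shape of a row:

"item";"original";"duplicate";"target"
"1";"/home/me/Pictures/2021/IMG_0001.jpg";"/home/me/Pictures/backup/IMG_0001.jpg";"./duplicates/IMG_0001.jpg"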

The following is the undo script. Run it from the directory that was deduped, not from inside the duplicates directory, because it reads duplicates/duplicates_found.csv relative to the current directory to undo the moves. Like the dedupe script it is a dry run by default; pass --apply to actually restore the files. See the example after the script.

#!/bin/bash
# undo_dupes.sh — restore moved duplicates back to their original locations
# Usage:
#   bash undo_dupes.sh              # dry-run
#   bash undo_dupes.sh --apply      # actually move files

APPLY=0
if [[ "${1:-}" == "--apply" ]]; then
    APPLY=1
    shift
fi

csv="${1:-./duplicates/duplicates_found.csv}"

if [[ ! -f "$csv" ]]; then
    echo "CSV not found: $csv" >&2
    exit 1
fi

# parse the semicolon-separated, double-quoted CSV (FPAT needs GNU awk)
awk -v FPAT='([^;]*)|(\"([^\"]|\"\")*\")' '
NR==1 { next }  # skip the header row
function unq(s){ if (s ~ /^".*"$/){ s=substr(s,2,length(s)-2); gsub(/""/,"\"",s) } return s }
{
  dup = unq($3)   # field 3: the duplicate's original path (destination to restore to)
  tgt = unq($4)   # field 4: where the dedupe script moved it (source to restore from)
  print dup "\t" tgt
}
' "$csv" | while IFS=$'\t' read -r dup tgt; do
    if [[ -f "$tgt" ]]; then
        if (( APPLY )); then
            mkdir -p -- "$(dirname "$dup")"
            mv -n -- "$tgt" "$dup"
            echo "restored: $dup"
        else
            echo "mv -n -- \"$tgt\" \"$dup\""
        fi
    else
        echo "missing (skipped): $tgt" >&2
    fi
done
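For example, assuming the photos were deduped from ~/Pictures (a made-up path), an undo run looks like this:

cd ~/Pictures                 # the directory that was deduped
bash undo_dupes.sh            # dry run: prints the mv commands it would run
bash undo_dupes.sh --apply    # actually restores each file to its original path

You can also point it at a different log, e.g. bash undo_dupes.sh --apply /some/other/duplicates_found.csv.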
