#!/bin/sh
# Simple file-based dedupe producer.
# The idea is to run this on one or more directories; the result is a list
# of md5sum/filename tuples for files that are potentially duplicates (you
# still need to finish by using cmp to make sure).
# Copyright 2010 Sterling Commerce, Inc.
# Copyright 2022 Christopher Jay Cox
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Filters here: size, inode, md5sum
#
# md5sum, like a cmp, is the most expensive item, so ideally it should be
# the last filter run. The inode check, while fast, is interesting as an
# early filter only in certain cases. You may decide to skip it altogether,
# but realize that the program might then give you "duplicate" files that
# are really the SAME file (hard links).
#
# Output is a list of md5sum/filename tuples ordered by md5sum, so the
# md5sum becomes the key for the next step of processing in identifying
# which files are duplicates. In that next step, your processing loop
# would do cmp's across the potentially duplicate files sharing an md5sum
# (this avoids the rare case of an md5sum collision, made even rarer by
# the size filter in particular).
#
# You could also consider a file type filter, if you are interested in
# duplicates of some particular file type, e.g.:
# xargs -0 file | grep 'image data' | cut -f1 -d: | tr '\012' '\000' |
#
# Note: If comparing to other deduplicators like fdupes, realize that
# MIN1024BLOCKS looks at blocks occupied, which can be more than the
# bytes of the file itself. So even if you use the -G bytes option of
# fdupes to try to get the same result, it's not guaranteed to match.
#
# Changelog
#
# 2022-03-19 - cjcox - added uniq --group filter (blank lines between
#                      duplicates), added MIN1024BLOCKS env to override
# 2010-07-18 - wwalker - changed sed | cut to more complex sed to prevent
#                        possible filename truncation
# 2010-07-18 - wwalker - changed 6s to 7s
# 2010-07-19 - cjcox - even more generic needed IMHO, need to assure fixed
#                      format field lengths... sigh... I used awk, might as
#                      well go for 10 digits
# 2010-07-19 - cjcox - fix type and args on find
# 2010-07-25 - cjcox - limit to large files >10000 bytes
#                      eventually will parameterize

# You could use something else besides md5sum, e.g. replace the final
# md5sum stage onward with:
#xargs -0 -P ${THREADS} b2sum -l 128 | sort | uniq -w32 -D | uniq -w32 --group

# File sizes are checked in terms of 1024-byte blocks, so the default is a 10K minimum
MIN1024BLOCKS=${MIN1024BLOCKS:=10}
THREADS=16

find "$@" -type f -print0 |                          # all regular files, NUL-terminated
    xargs -0 ls -sd |                                # prefix each name with its allocated 1K blocks
    sort -k1bn |                                     # sort numerically by block count
    awk '{num=$1;if (num > '"$MIN1024BLOCKS"') printf("%10d %s\n",num,$0);}' |   # keep files over MIN1024BLOCKS blocks; fixed-width size field
    uniq -w 10 -D |                                  # keep only files whose size matches another file's
    sed 's/^[ ]*[0-9][0-9]*[ ][ ]*[0-9][0-9]* //' |  # strip both size fields, leaving the filename
    tr '\012' '\000' |                               # newline -> NUL for xargs -0
    xargs -0 ls -id |                                # prefix each name with its inode number
    sort |                                           # group hard links (same inode) together
    awk '{num=$1;printf("%10d %s\n",num,$0);}' |     # fixed-width inode field
    uniq -w 10 |                                     # keep one path per inode (drop hard-link aliases)
    sed 's/^[ ]*[0-9][0-9]*[ ][ ]*[0-9][0-9]* //' |  # strip both inode fields, leaving the filename
    tr '\012' '\000' |                               # newline -> NUL again
    xargs -0 -P ${THREADS} md5sum |                  # checksum the remaining candidates in parallel
    sort |                                           # order by md5sum
    uniq -w32 -D |                                   # keep only md5sums that occur more than once
    uniq -w32 --group                                # blank line between groups of matching md5sums
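
# One possible shape for the "next step" described above, left commented out
# so it does not run as part of the pipeline: walk the md5sum-grouped output
# and confirm candidates with cmp. This is only a sketch; it assumes the
# output of this script was first saved to a file (the name "candidates.txt"
# is a placeholder), that lines use standard md5sum formatting (32 hex chars,
# two spaces, filename), and that filenames contain no newlines. It compares
# each file against the first file of its md5sum group.
#
# prev_sum=""
# prev_file=""
# while IFS= read -r line; do
#     if [ -z "$line" ]; then                        # blank line from uniq --group = new group
#         prev_sum=""
#         continue
#     fi
#     sum=$(printf '%s\n' "$line" | cut -c1-32)      # md5sum is the first 32 chars
#     file=$(printf '%s\n' "$line" | cut -c35-)      # filename starts at column 35
#     if [ "$sum" = "$prev_sum" ]; then
#         if cmp -s "$prev_file" "$file"; then
#             printf 'DUPLICATE: %s == %s\n' "$prev_file" "$file"
#         fi
#     else
#         prev_file=$file                            # first file of a new md5sum group
#     fi
#     prev_sum=$sum
# done < candidates.txt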