#!/bin/sh
# Simple file-based dedupe producer.
# The idea is to run this on one or more directories; the result is a list
# of md5sum/filename tuples for files that are potentially duplicates (you
# still need to finish by using cmp to make sure).
# Copyright 2010 Sterling Commerce, Inc.
# Copyright 2022 Christopher Jay Cox
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# Filters here: size, inode, md5sum
#
# md5sum, like a cmp, is the most expensive item, so ideally it should be
# the last filter run. The inode check, while fast, is interesting as an
# early filter only in certain cases. You may decide to skip it altogether,
# but realize that the program might then give you "duplicate" files that
# are really the SAME file (hard links).
#
# Output is a list of md5sum/filename tuples ordered by md5sum, so the
# md5sum becomes the key for the next step of processing in identifying
# which files are duplicates. In that next step, your processing loop
# would do cmp's across the potentially duplicate files sharing an md5sum
# (this avoids the rare case of an md5sum collision, made even rarer by
# the size filter in particular).
#
# You could also consider a file type filter, if you are interested in
# duplicates of some particular file type, e.g.:
# xargs -0 file | grep 'image data' | cut -f1 -d: | tr '\012' '\000' |
#
# Note: If comparing to other deduplicators like fdupes, realize that
# MIN1024BLOCKS looks at blocks occupied, which can be more than the
# bytes of the file itself. So even if you use the -G bytes option of
# fdupes to try to get the same result, it's not guaranteed to match.
#
# Changelog
#
# 2022-03-19 - cjcox - added uniq --group filter (blank lines between
#                      duplicates), added MIN1024BLOCKS env to override
# 2010-07-18 - wwalker - changed sed | cut to more complex sed to prevent
#                        possible filename truncation
# 2010-07-18 - wwalker - changed 6s to 7s
# 2010-07-19 - cjcox - even more generic needed IMHO, need to assure fixed
#                      format field lengths... sigh... I used awk, might as
#                      well go for 10 digits
# 2010-07-19 - cjcox - fix type and args on find
# 2010-07-25 - cjcox - limit to large files >10000 bytes
#                      eventually will parameterize

# You could use something else besides md5sum, e.g. replace the final
# md5sum stage onward with:
#xargs -0 -P ${THREADS} b2sum -l 128 | sort | uniq -w32 -D | uniq -w32 --group

# File sizes are checked in terms of 1024-byte blocks, so the default is a 10K minimum
MIN1024BLOCKS=${MIN1024BLOCKS:=10}
THREADS=16

find "$@" -type f -print0 |                          # all regular files, NUL-terminated
    xargs -0 ls -sd |                                # prefix each name with its allocated 1K blocks
    sort -k1bn |                                     # sort numerically by block count
    awk '{num=$1;if (num > '"$MIN1024BLOCKS"') printf("%10d %s\n",num,$0);}' |   # keep files over MIN1024BLOCKS blocks; fixed-width size field
    uniq -w 10 -D |                                  # keep only files whose size matches another file's
    sed 's/^[ ]*[0-9][0-9]*[ ][ ]*[0-9][0-9]* //' |  # strip both size fields, leaving the filename
    tr '\012' '\000' |                               # newline -> NUL for xargs -0
    xargs -0 ls -id |                                # prefix each name with its inode number
    sort |                                           # group hard links (same inode) together
    awk '{num=$1;printf("%10d %s\n",num,$0);}' |     # fixed-width inode field
    uniq -w 10 |                                     # keep one path per inode (drop hard-link aliases)
    sed 's/^[ ]*[0-9][0-9]*[ ][ ]*[0-9][0-9]* //' |  # strip both inode fields, leaving the filename
    tr '\012' '\000' |                               # newline -> NUL again
    xargs -0 -P ${THREADS} md5sum |                  # checksum the remaining candidates in parallel
    sort |                                           # order by md5sum
    uniq -w32 -D |                                   # keep only md5sums that occur more than once
    uniq -w32 --group                                # blank line between groups of matching md5sums
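
# One possible shape for the "next step" described above, left commented out
# so it does not run as part of the pipeline: walk the md5sum-grouped output
# and confirm candidates with cmp. This is only a sketch; it assumes the
# output of this script was first saved to a file (the name "candidates.txt"
# is a placeholder), that lines use standard md5sum formatting (32 hex chars,
# two spaces, filename), and that filenames contain no newlines. It compares
# each file against the first file of its md5sum group.
#
# prev_sum=""
# prev_file=""
# while IFS= read -r line; do
#     if [ -z "$line" ]; then                        # blank line from uniq --group = new group
#         prev_sum=""
#         continue
#     fi
#     sum=$(printf '%s\n' "$line" | cut -c1-32)      # md5sum is the first 32 chars
#     file=$(printf '%s\n' "$line" | cut -c35-)      # filename starts at column 35
#     if [ "$sum" = "$prev_sum" ]; then
#         if cmp -s "$prev_file" "$file"; then
#             printf 'DUPLICATE: %s == %s\n' "$prev_file" "$file"
#         fi
#     else
#         prev_file=$file                            # first file of a new md5sum group
#     fi
#     prev_sum=$sum
# done < candidates.txt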