#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2015, 2016, 2018 Karl Fogel
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# If you did not receive a copy of the GNU General Public License
# along with this program, see <http://www.gnu.org/licenses/>.
"""Efficiently find duplicate files in one or more directory trees.
Duplicate files are files that have exactly the same content, even
though their names, timestamps, and other metadata might differ.
Usage: 'find-dups [OPTIONS] [DIR_OR_FILE_1 [DIR_OR_FILE_2 ...]]'
If no DIR_OR_FILE arguments given, use the current directory (".").
Hard links to the same underlying file count as duplicates, but since
the output always shows the inode number of each duplicate, you can
distinguish between true content duplication and the same file merely
having multiple names.
Symbolic links are ignored. Therefore, issue a warning if one is
named as an explicit target (if the link is furthermore broken,
mention that in the warning).

--ignore-missing
  Ignore (don't warn about) named arguments that don't exist.
  Exception: when a symbolic link is named as an explicit target and
  that link is broken, the warning about ignoring the symbolic link --
  which is issued unconditionally, because symbolic links are always
  ignored and naming one as an explicit target is therefore a bit
  odd -- will also mention that the link is broken.

--ignore-empty
  Ignore empty files, i.e., don't treat them as dups.

--ignore-dir NAME
  Don't descend into any directories named NAME.
  (use multiple times, once for each NAME)

--ignore-contained DIRPATH
  Ignore duplicates if they're all contained, directly or indirectly
  via path ancestry, under the directory DIRPATH.
  (use multiple times, once for each DIRPATH)

Output:
-------

The output shows each group of duplicate files, arranged in ascending
chronological order of modification time within each group, in the
following format:

MD5DIGEST (NUMBER identical files, each SIZE bytes):
(optional other metadata)
(more optional other metadata)
(okay, really, as many optional other metadata lines as needed)
 DATE-TIME (inode NUMBER): PATH1
 DATE-TIME (inode NUMBER): PATH2
 DATE-TIME (inode NUMBER): PATH3
 ...

There could be an arbitrary number of '(optional other metadata)'
entries, though right now the only kind is "(common parent: foo/)",
indicating that all the duplicates in this group have some shared path
prefix, that is, some parent directory in common.  Optional metadata
lines always begin in the leftmost column with an open parenthesis and
end on the same line with a close parenthesis.

Here's some example output:

$ find-dups some_tree another_tree
bea8252ff4e80f41719ea13cdf007273 (2 identical files, each 14 bytes):
(common parent: some_tree/)
 2015/09/10 19:47:22 (inode 33034320): some_tree/foo
 2015/09/10 19:48:05 (inode 33034324): some_tree/bar/baz

d41d8cd98f00b204e9800998ecf8427e (2 identical files, each 0 bytes):
 2015/09/03 01:33:57 (inode 33034330): another_tree/qux
 2015/09/03 01:34:04 (inode 33034331): some_tree/fnord/farblundzhet

68859e26a113e13c9d29ee1925edcbda (3 identical files, each 8514 bytes):
 2015/09/10 22:18:35 (inode 33034318): some_tree/quuux
 2015/09/10 22:19:14 (inode 33034323): another_tree/hardlink1
 2015/09/10 22:19:14 (inode 33034323): another_tree/hardlink2
$

See also:
---------

There are many scripts out there for finding duplicate files -- too
many to list here -- but most don't bother with speed optimizations
such as the partial-checksum step. One that does is the 'findup'
script that comes with the 'fslint' package (on Debian GNU/Linux see
/usr/share/fslint/fslint/findup after installing fslint). However, I
wanted a more easily hackable script, and specifically to have more
control over the output, so I wrote this instead of just using findup.
"""
import getopt
import sys
import os
import hashlib
import time

class PathInfo:
    """Information about one path."""
    def __init__(self, inode, mtime, size, quicksum):
        self.inode = inode
        self.mtime = mtime
        # Although file size is used as a first filter in determining
        # duplicativeness, that takes place outside this class, and the
        # self.size member here is not used for that.  However, if this
        # file turns out to be one of the identical files in a bucket of
        # duplicates, one of the bucket members will be randomly chosen to
        # supply the size for display, and this could be the one.
        self.size_for_display = size
        self.quicksum = quicksum

    def __str__(self):
        ret = ""
        ret += " mtime: %s\n" % time.ctime(self.mtime)
        ret += " inode: %d\n" % self.inode
        ret += " size: %d\n" % self.size_for_display
        ret += " qsum: %s\n" % self.quicksum
        return ret
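
# For example (hypothetical values, borrowed from the sample output in the
# module docstring), printing a PathInfo for 'some_tree/foo' would show
# something like:
#
#    mtime: Thu Sep 10 19:47:22 2015
#    inode: 33034320
#    size: 14
#    qsum: bea8252ff4e80f41719ea13cdf007273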

def md5_file(path, quicksum=False):
    """Return the MD5 digest of file PATH as a hexadecimal string.

    If QUICKSUM is True, then return the md5sum for no more than the first
    4096 bytes (for large files, this is faster than doing the entire
    file, and a probabilistic fingerprint is useful for some purposes).
    QUICKSUM is False by default.

    If there is an IOError opening PATH, return the all-zeros hex digest."""
    md5er = hashlib.md5()
    try:
        with open(path) as f:
            if quicksum:
                md5er.update(f.read(4096))
            else:
                for chunk in iter(lambda: f.read(1048576), ''):
                    md5er.update(chunk)
        return md5er.hexdigest()
    except IOError, err:
        # The reason to trap this exception is that sometimes a path that
        # existed during an earlier crawl no longer exists when we go back
        # to get its short or long md5sum.  Rather than stop at that point,
        # the most useful behavior is to just give a special, recognizable
        # hex digest for such files.
        #
        #   Traceback (most recent call last):
        #     File "/home/kfogel/bin/find-dups", line 378, in <module>
        #       main()
        #     File "/home/kfogel/bin/find-dups", line 347, in main
        #       duplicates = gather_dups(roots, ignored_directories, ignore_empty)
        #     File "/home/kfogel/bin/find-dups", line 212, in gather_dups
        #       md5_file(this_file_path, quicksum=True))
        #     File "/home/kfogel/bin/find-dups", line 138, in md5_file
        #       with open(path) as f:
        #   IOError: [Errno 6] No such device or address: \
        #     './.forever/sock/worker.1435765019424GgN.sock'
        return "0" * 32

def gather_dups(roots,
                ignore_missing=False,
                ignored_directories=[],
                ignore_empty=False):
    """Walk the trees in ROOTS; return info about duplicates.

    The return value is a dictionary mapping hex digests (strings)
    to sub-dictionaries, where each sub-dict maps paths (strings)
    to PathInfo objects.

    If IGNORE_MISSING is True, don't warn about elements of ROOTS
    that don't exist.

    If IGNORED_DIRECTORIES is non-empty, it is a list of names of
    directories not to descend into.

    If IGNORE_EMPTY is True, then ignore empty files, that is, do not
    count them as duplicates."""
    # Maps sizes-in-bytes (numbers) to sub-dictionaries, where each
    # sub-dict maps paths (strings) to PathInfo objects or to None.
    #
    # When None, that means we have only seen one instance of a file of
    # this size so far, so there has as yet been no reason to calculate
    # its checksum, discover its modtime, etc.  Once we see the second
    # file with that size, there is now the possibility of duplication, so
    # from that moment on, we will have to fill out PathInfo instances for
    # both the original file of that size and all subsequent files we
    # encounter of the same size.
    paths_by_size = { }
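    # For example (hypothetical intermediate state, borrowing paths from the
    # sample run in the module docstring): after one 8514-byte file and two
    # 14-byte files have been seen, paths_by_size could look like
    #
    #   { 8514: { 'some_tree/quuux': None },
    #     14:   { 'some_tree/foo':     <PathInfo>,
    #             'some_tree/bar/baz': <PathInfo> } }
    #
    # where a path's value stays None until a second file of the same size
    # shows up.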

    def gather_file(this_dir, f):
        """Gather file F into paths_by_size.
        If THIS_DIR is not None, look for F in THIS_DIR."""
        if this_dir is not None:
            this_file_path = os.path.join(this_dir, f)
        else:
            this_file_path = f
        if os.path.islink(this_file_path):
            return  # symlinks are not interesting to us; skip them
        this_file_stat = os.stat(this_file_path)
        if ignore_empty and this_file_stat.st_size == 0:
            return
        size_bucket = paths_by_size.get(this_file_stat.st_size, { })
        if size_bucket:
            # We've already found at least one other path with this size.
            # A truly optimal implementation would check for inode
            # identity too, because if two paths are really hard links
            # to the same underlying file then obviously they will have
            # the same content.  Making this optimization would probably
            # involve another level of hashing somewhere around here.
            # This circumstance is rare enough that I'm not sure it's
            # worth the extra complexity, but counter-evidence welcome.
            # Technically, a loop should be unnecessary here, since
            # there should be at most one existing path key with a None
            # value.  A loop with a conditional inside it is just a
            # cheap way to take care of that first case.  However, it
            # might be good to put a sanity-check assertion anyway.
            for existing_path in size_bucket.keys():
                existing_pathinfo = size_bucket[existing_path]
                if existing_pathinfo is None:
                    existing_path_stat = os.stat(existing_path)
                    size_bucket[existing_path] = PathInfo(
                        existing_path_stat.st_ino,
                        existing_path_stat.st_mtime,
                        existing_path_stat.st_size,
                        md5_file(existing_path, quicksum=True))
            # Now that we've backfilled the info for the file of this
            # size already seen, fill in the info for the new file too.
            size_bucket[this_file_path] = PathInfo(
                this_file_stat.st_ino,
                this_file_stat.st_mtime,
                this_file_stat.st_size,
                md5_file(this_file_path, quicksum=True))
        else:
            # This is the first path found with this size.  Therefore we
            # record this path, but there is as yet no need to create a
            # PathInfo object for it.
            size_bucket[this_file_path] = None
        paths_by_size[this_file_stat.st_size] = size_bucket

    for root in roots:
        if os.path.islink(root):
            if os.path.exists(root):
                sys.stderr.write("WARNING: ignoring symbolic link: %s\n" % root)
            else:
                sys.stderr.write("WARNING: ignoring (broken) symbolic link: %s\n" % root)
        else:
            if (not os.path.exists(root)) and (not ignore_missing):
                sys.stderr.write("WARNING: path not found: %s\n" % root)
            elif os.path.isfile(root):
                gather_file(None, root)
            else:
                for this_dir, dirnames, filenames in os.walk(root):
                    dirnames[:] = [d for d in dirnames if d not in ignored_directories]
                    for f in filenames:
                        gather_file(this_dir, f)

    # Maps quicksum hex digests (strings) to sub-dictionaries, where
    # each sub-dict maps paths (strings) to PathInfo objects.
    paths_by_quicksum = { }

    # Now go through paths_by_size, looking for duplicates in the
    # quicksum.  That doesn't mean the files are actually duplicates, of
    # course, but it narrows things down so that we can perform the
    # highest-cost test, a full checksum, only in the cases that truly
    # need it.
    for size in paths_by_size.keys():
        for path in paths_by_size[size].keys():
            this_path_info = paths_by_size[size][path]
            if this_path_info is not None:
                # This is the sub-dict value stored in paths_by_quicksum.
                quicksum_bucket = paths_by_quicksum.get(this_path_info.quicksum, {})
                quicksum_bucket[path] = this_path_info
                # Unconditionally update.  Since we're updating whatever
                # quicksum group object we found, overwriting the "old" one
                # with the "new" one is a no-op after the first time, but
                # there's no point checking -- just do it unconditionally.
                paths_by_quicksum[this_path_info.quicksum] = quicksum_bucket

    # Eliminate every quicksum group that has only one member.
    # The ones that remain represent quicksum duplicates.
    for quicksum in paths_by_quicksum.keys():
        quicksum_bucket = paths_by_quicksum[quicksum]
        if len(quicksum_bucket.keys()) == 1:
            del paths_by_quicksum[quicksum]

    # The remaining files are duplicates according to quicksum.
    # Now do the long test to make sure.
    paths_by_fullsum = { }
    for quicksum in paths_by_quicksum.keys():
        quicksum_bucket = paths_by_quicksum[quicksum]
        for path in quicksum_bucket.keys():
            this_path_info = quicksum_bucket[path]
            fullsum = md5_file(path)
            fullsum_bucket = paths_by_fullsum.get(fullsum, { })
            fullsum_bucket[path] = this_path_info
            paths_by_fullsum[fullsum] = fullsum_bucket

    # Eliminate every fullsum group that has only one member.  Since the
    # buckets are organized by full checksum this time, any buckets that
    # remain afterwards represent true duplicates.
    for fullsum in paths_by_fullsum.keys():
        fullsum_bucket = paths_by_fullsum[fullsum]
        if len(fullsum_bucket.keys()) == 1:
            del paths_by_fullsum[fullsum]

    return paths_by_fullsum
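
# For instance, in the sample run shown in the module docstring, the
# dictionary returned by gather_dups() maps the digest
# 'bea8252ff4e80f41719ea13cdf007273' to a sub-dict whose keys are
# 'some_tree/foo' and 'some_tree/bar/baz', each mapped to its PathInfo.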
"""Return the path-wise common prefix of all the paths in LST.
LST is a list of strings, each a relative or absolute filesystem path.
Parts of the comparison are done only textually, so if some paths are
absolute and some are relative, this may miss some filesystem-wise
common prefixes.
If the result would be "./", then just fold it to the empty string,
since it's just a call artifact. It might be cleaner to handle that
in the caller, but let's just drop the pretense that this is a general
purpose function, shall we? We all know what this script does."""
prefix = os.path.commonprefix(lst)
if prefix == './':
if prefix == '' or prefix[-1] == os.sep:
return prefix
else:
if os.path.isdir(prefix):
return prefix
else:
idx = prefix.rfind(os.sep)
if idx == -1 or prefix[:(idx + 1)] == './':
return ''
else:
return prefix[:(idx + 1)]
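
# For example, with paths from the sample output in the module docstring:
#
#   common_path_prefix(['some_tree/foo', 'some_tree/bar/baz'])
#       => 'some_tree/'   (shown as "(common parent: some_tree/)")
#   common_path_prefix(['another_tree/qux', 'some_tree/fnord/farblundzhet'])
#       => ''             (so no "common parent" line is printed)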

def paths_are_contained(paths, containers):
    """If all PATHS are under one of CONTAINERS, return that container.

    PATHS is a list of paths (presumably duplicate files), and CONTAINERS
    is a list of directory paths.  If every path in PATHS is contained in
    the subtree under some one directory in CONTAINERS, return that
    directory, else return False."""
    prefix = os.path.abspath(common_path_prefix(paths))
    for container in containers:
        abs_container = os.path.abspath(container)
        if common_path_prefix((prefix, abs_container,)) == abs_container:
            return container
    return False
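
# For example (hypothetical): if every duplicate found lives somewhere under
# "some_tree/", then running with "--ignore-contained some_tree" makes
# paths_are_contained(paths, ['some_tree']) return 'some_tree', and main()
# below skips that whole group of duplicates.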

def main():
    ignore_missing = False
    ignore_empty = False
    ignored_directories = []
    ignored_if_containing = []
    try:
        (opts, args) = getopt.getopt(sys.argv[1:], "h?",
                                     [ "ignore-missing",
                                       "ignore-empty",
                                       "ignore-dir=",
                                       "ignore-contained=",
                                       "help",
                                       "usage"
                                     ])
    except getopt.GetoptError, err:
        sys.stderr.write(str(err))
        sys.stderr.write("\n")
        sys.exit(1)

    for opt, optarg in opts:
        if opt in ("-h", "-?", "--help", "--usage"):
            print __doc__
            sys.exit(0)
        elif opt in ("--ignore-missing",):
            ignore_missing = True
        elif opt in ("--ignore-empty",):
            ignore_empty = True
        elif opt in ("--ignore-dir",):
            ignored_directories.append(optarg)
        elif opt in ("--ignore-contained",):
            ignored_if_containing.append(optarg)

    if len(args) < 1:
        roots = (".",)
    else:
        roots = args

    duplicates = gather_dups(roots, ignore_missing, ignored_directories, ignore_empty)

    # Display each bucket, sorted by modtime within a given bucket.
    for checksum in duplicates.keys():
        paths = duplicates[checksum].keys()
        size = None
        mtime_sorted_paths = [ ]
        # If ignored_if_containing is empty this is a no-op of course.
        if paths_are_contained(paths, ignored_if_containing):
            continue
        for path in paths:
            if size is None:
                size = duplicates[checksum][path].size_for_display
            mtime_sorted_paths.append((path, duplicates[checksum][path]))
        mtime_sorted_paths = sorted(mtime_sorted_paths,
                                    key=lambda tup: tup[1].mtime)
        print "%s (%d identical files, each %d bytes):" \
              % (checksum, len(mtime_sorted_paths), size)
        common_prefix = common_path_prefix([x[0] for x in mtime_sorted_paths])
        if common_prefix:
            print "(common parent: %s)" % common_prefix
        for tup in mtime_sorted_paths:
            pretty_time = time.strftime("%Y/%m/%d %H:%M:%S",
                                        time.localtime(tup[1].mtime))
            print " %s (inode %s): %s" % (pretty_time, tup[1].inode, tup[0])
        print ""


if __name__ == '__main__':
    main()