-
Notifications
You must be signed in to change notification settings - Fork 5
/
dups
executable file
·278 lines (225 loc) · 8.36 KB
/
dups
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/perl -s
use warnings;
use strict;
use 5.10.0;
use Data::Dumper;
use Digest::SHA qw(sha256_hex);
# Print the help text on STDERR and terminate the program with exit status 1.
# The heredoc is non-interpolating; the text contains no variables.
sub usage {
    print STDERR <<'END_OF_USAGE';

dups -- find duplicate files
Usage:
dups [-H] [dirlist]
find ... | dups
Find duplicate files.
NOTES AND CAVEATS:
1. Uses 'vidir' for the user interface. If you're not familiar with 'vidir',
install 'moreutils' (available on every distro) and play with it.
https://github.com/sitaramc/notes/blob/master/dups.mkd) has more details.
2. This is mainly about reclaiming disk space, so hard links are ignored;
i.e., only one file in a set of files that have the same device+inode are
processed.
However there is one option (-H) that looks *only* for hard links, so you
*can* examine and delete those also if you wish.
3. Doesn't look at symlinks. Symlinks sent as arguments are resolved to
normal directories before processing.
4. Empty files are ignored; if you must process them, use the second form of
the usage above (e.g., 'find . -empty | dups').
END_OF_USAGE
    exit 1;
}
# Switch variables filled in by perl's "-s" option (see the shebang line):
# -h asks for the help text, -H selects the hard-link-only mode.
our ($h, $H);
usage() if $h;
# ----------------------------------------------------------------------
# An earlier duplicate finder compared the *beginnings* of same-sized files
# as a cheap first pass. A slice from the middle seems more likely to expose
# false matches, so that is what the first hashing round uses here.
#
# Files larger than $THRESHOLD bytes are hashed in round one using only
# $SNIP_SIZE bytes taken from the middle of the file.
my $THRESHOLD = 4 * 1024 * 1024;
my $SNIP_SIZE = 1024 * 1024;
# ----------------------------------------------------------------------
# Find hard links. This mode is separate, and stands alone, because most of
# the time I don't need to delete them, but sometimes I might wish to only
# weed out hard links.
if ($H) {
    # perl's -s switch parsing has already removed "-H" from @ARGV, so every
    # remaining argument is a directory to scan; nothing is shifted off here.
    cleanup_ARGV();

    # Ask `find` for regular files that have more than one hard link. The
    # command is run in list form (no shell), so directory names containing
    # spaces or shell metacharacters are passed through untouched.
    # Each output line is "<size> <dev>/<inode><TAB><path>".
    open(my $find, "-|", "find", @ARGV, "-type", "f", "-links", "+1",
        "-printf", "%12s %10D/%12i\t%p\\n")
        or die "cannot run find: $!";

    my @links;
    while (my $line = <$find>) {
        chomp $line;
        my ($size_dev_inode, $name) = split /\t/, $line, 2;
        next unless defined $name;    # not a line in the expected format
        my ($size) = $size_dev_inode =~ /(\d+)/;
        push @links, [ $size // 0, $size_dev_inode, $name ];
    }
    close($find);    # reap find; its own diagnostics already went to STDERR

    # Largest files first. Sorting on the dev/inode key as well keeps all the
    # names of one inode adjacent even when another inode has the same size,
    # which the grouping loop below relies on.
    @links = sort {
               $b->[0] <=> $a->[0]
            or $a->[1] cmp $b->[1]
            or $a->[2] cmp $b->[2]
    } @links;

    # Our output, as we said earlier, goes to `vidir`
    open(STDOUT, "|-", "vidir -") or die "cannot run vidir: $!";
    print_vidir_preamble();

    my $current = '';
    my %names_seen;
    for my $link (@links) {
        my (undef, $id, $name) = @{$link};
        if ($id ne $current) {
            # blank line between groups, then a comment header for the new one
            say "" if $current ne '';
            $current = $id;
            say "# $current";
        }
        say $name unless $names_seen{$name}++;
    }
    close(STDOUT);
    exit;
}
# ----------------------------------------------------------------------
# Build @list: one "dev inode size path" string per candidate file.
my @list;
if ( -t 0 ) {
    cleanup_ARGV();
    # STDIN is a terminal: get input from `find`; non-empty regular files
    # only, with device, inode, size, and path as fields. List-form open
    # means no shell is involved.
    open(my $find, "-|", "find", @ARGV, "-type", "f", "!", "-empty", "-printf", "%D %i %s %p\\n")
        or die "cannot run find: $!";
    @list = <$find>;
    close($find);    # reap find; its own diagnostics already went to STDERR
} else {
    # Alternatively, if STDIN is a pipe, take the input as a list of files to
    # be de-duplicated and produce the same 4 fields for them. "perldoc -f
    # stat" is where we get the 0, 1, and 7 for device, inode, and size.
    # Anything that is not a regular file is discarded.
    while (my $path = <>) {
        chomp $path;
        next unless -f $path;
        # "stat _" reuses the result of the -f test above, so the file is
        # examined once and cannot vanish between the test and the stat
        my ($dev, $inode, $size) = (stat _)[0, 1, 7];
        push @list, "$dev $inode $size $path";
    }
}
# set up the hashes we will need
my %by_inode;           # keyed by "dev/inode"; marks files already taken
my %by_size_or_hash;    # size => [paths]; re-keyed by hash+size in later rounds
for my $entry (@list) {
    chomp $entry;
    # Fields are "dev inode size path", separated by exactly one space. An
    # explicit pattern (rather than split ' ') keeps a path that begins with
    # whitespace intact and skips lines that are not in this format.
    my ($dev, $inode, $size, $name) = $entry =~ /\A(\d+) (\d+) (\d+) (.*)\z/s
        or next;
    # skip empty files (see caveats section above)
    next unless $size;
    # skip if dev+inode combo was already seen; again, see caveats section
    # above. If you need to de-dup hard links, use "-H"
    next if $by_inode{"$dev/$inode"}++;
    # add the file to the list for its size. For 4 files, 3 of them 41 bytes
    # long and one 82 bytes long, the hash ends up as:
    #   {
    #     '41' => [ './a/file1', './a/file2', './b/file3' ],
    #     '82' => [ './b/file4' ]
    #   };
    push @{ $by_size_or_hash{$size} }, $name;
}
# eliminate singletons; a size that only one file has cannot be a duplicate
dups_only();

# Two rounds of SHA256 hashing, each followed by singleton elimination.
#
# Round 1: keys go from "size" to "hash<space>size". For files larger than
# the threshold the hash covers only a chunk from the middle of the file.
#
# Round 2: hash() is handed the round-1 key and returns the final
# "hash<space>size" key; for most files nothing changes, but files larger
# than the threshold now get their full-file hash.
for my $round (1, 2) {
    my %regrouped;
    for my $key (keys %by_size_or_hash) {
        for my $name (@{ $by_size_or_hash{$key} }) {
            my $digest = hash($round, $key, $name);
            my $new_key = $round == 1 ? "$digest $key" : $digest;
            push @{ $regrouped{$new_key} }, $name;
        }
    }
    %by_size_or_hash = %regrouped;
    dups_only();
}

# hand the surviving groups to `vidir`; fail loudly if it cannot be started
# instead of printing into a closed handle
open(STDOUT, "|-", "vidir -") or die "cannot run vidir: $!";
print_vidir_preamble();
dumpdups();
close(STDOUT);
# ----------------------------------------------------------------------
# Normalise the directory arguments in @ARGV in place:
#   - strip trailing slashes (they make a symlink not look like a symlink),
#     but leave an argument that consists only of slashes, such as "/", alone
#     so it does not turn into an empty string;
#   - replace each symlink by the absolute path it resolves to. A bare
#     readlink() would return a target relative to the link's own directory,
#     which is wrong whenever the current directory is somewhere else.
# Takes no arguments and returns nothing.
sub cleanup_ARGV {
    require Cwd;
    for my $path (@ARGV) {
        $path =~ s{(?<=[^/])/+\z}{};
        next unless -l $path;
        my $resolved = eval { Cwd::abs_path($path) };
        # keep the argument as given if the link cannot be resolved
        $path = $resolved if defined $resolved && length $resolved;
    }
    return;
}
# Drop every group in %by_size_or_hash that holds exactly one file; a file
# that is alone under its key (size, or hash+size) has no duplicate.
sub dups_only {
    my @lonely = grep { @{ $by_size_or_hash{$_} } == 1 } keys %by_size_or_hash;
    delete @by_size_or_hash{@lonely};
}
# Compute the grouping hash for one file.
#
#   $round  1 or 2
#   $size   round 1: the file size in bytes
#           round 2: the round-1 key, i.e. "hash<space>size"
#   $file   path of the file
#
# Round 1 returns a bare hex digest: of the whole file when it is no larger
# than $THRESHOLD, otherwise of $SNIP_SIZE bytes read from the middle.
# Round 2 returns the complete "hash<space>size" key, recomputing the hash
# over the whole file only for files larger than $THRESHOLD.
# A file that cannot be opened or read yields "UNREADABLE" in place of the
# digest, the same marker hashfile() uses, instead of aborting the whole run.
sub hash {
    my ($round, $size, $file) = @_;

    if ($round == 1) {
        # small enough: hash the full file right away
        return hashfile($file) if $size <= $THRESHOLD;

        open(my $fh, "<", $file) or return "UNREADABLE";
        binmode($fh);    # raw bytes, no I/O layer translation
        my $snippet = '';
        # seek to the middle of the file then read $SNIP_SIZE bytes
        my $ok = seek($fh, int($size / 2), 0)
            && defined read($fh, $snippet, $SNIP_SIZE);
        close($fh);
        return $ok ? sha256_hex($snippet) : "UNREADABLE";
    }

    # now round == 2, remember what we get as "size" is actually "hash<space>size"
    my ($hash, $bytes) = split ' ', $size;
    # recompute the hash if the file is bigger than the threshold
    $hash = hashfile($file) if $bytes > $THRESHOLD;
    return "$hash $bytes";
}
# Return the SHA-256 hex digest of the complete contents of $file, or the
# string "UNREADABLE" when the file cannot be read.
#
# The -r test alone is not enough: addfile() throws if the open fails (for
# example when the file disappears after the test), so the digest is computed
# inside an eval and any failure is reported with the same marker rather than
# killing the program. Mode "b" makes Digest::SHA read the file in binary mode.
sub hashfile {
    my $file = shift;
    return "UNREADABLE" unless -r $file;
    my $digest = eval { Digest::SHA->new(256)->addfile($file, "b")->hexdigest };
    return defined $digest ? $digest : "UNREADABLE";
}
# Print every remaining group of duplicates on STDOUT, largest files first,
# so the biggest space hogs are at the top when `vidir` opens.
sub dumpdups {
    # The keys are "hash<space>size"; look up the size part of each key once
    # and sort on it, descending.
    my @groups = keys %by_size_or_hash;
    my %bytes_for;
    $bytes_for{$_} = _trim($_) for @groups;

    # Each group gets a comment header showing the sha256 hash and the size,
    # then one line per file, then a blank separator line. The header can
    # safely be ignored -- vidir does not do anything with it.
    for my $group (sort { $bytes_for{$b} <=> $bytes_for{$a} } @groups) {
        say "# sha256 $group bytes";
        say for @{ $by_size_or_hash{$group} };
        say "";
    }
}
# Return what follows the last space of the argument (the size part of a
# "hash<space>size" key). A string without a space comes back unchanged.
sub _trim {
    my ($key) = @_;
    # greedy match: everything up to and including the final space goes
    $key =~ s/.* //;
    return $key;
}
# Write the instruction header shown at the top of the `vidir` buffer to
# STDOUT: nine comment lines followed by one blank line.
sub print_vidir_preamble {
    my @instructions = (
        q{# Here's how this works:},
        q{# - delete lines for files you want to delete},
        q{# - keep lines for files you want to keep},
        q{# - then save the file},
        q{# If you want to ABORT without any changes:},
        q{# - if you're using vim, exit with ':cq'},
        q{# - if not, find out how to "exit with error" in your editor},
        q{# (or go to another terminal and kill vidir!)},
        q{# If you need more info, run 'man vidir'},
        q{},
    );
    say for @instructions;
}