Mercurial > hg > config
annotate python/find_duplicate_files.py @ 929:7c4be71a560b default tip
remove old aliases
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Mon, 20 Oct 2025 15:22:19 -0700 |
| parents | aa9a3850ed56 |
| children |
| rev | line source |
|---|---|
|
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 find duplicate files in a directory |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 # imports |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import argparse |
| 826 | 10 import csv |
| 11 import difflib | |
| 12 import json | |
|
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
13 import os |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
14 import sys |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
16 |
|
799
dbd2562cb03e
remove old way of doing things; note TODO on replacing
Jeff Hammel <k0scist@gmail.com>
parents:
711
diff
changeset
|
class DuplicateFilesParser(argparse.ArgumentParser):
    """Command line parser for the duplicate file finder.

    Accepts one positional ``directory`` and the ``--identical-sizes``
    flag.  The namespace from the most recent successful ``parse_args``
    call is kept on ``self.options`` (``None`` until then).
    """

    def __init__(self, **kwargs):
        """Create the parser; `description` defaults to the module docstring."""
        if 'description' not in kwargs:
            kwargs['description'] = __doc__
        argparse.ArgumentParser.__init__(self, **kwargs)

        self.add_argument('directory')
        self.add_argument(
            '--identical-sizes',
            dest='identical_sizes',
            action='store_true',
            default=False,
            help="print out all matches with identical sizes and exit",
        )

        # filled in by parse_args()
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse the arguments, validate them, and remember the result."""
        parsed = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(parsed)
        self.options = parsed
        return parsed

    def validate(self, options):
        """Report an error via `self.error` if `options.directory` is not a directory."""
        target = options.directory
        if os.path.isdir(target):
            return
        self.error("Not a directory: {}".format(target))
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
39 |
| 801 | 40 |
|
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def _read_bytes(path):
    """Return the raw contents of `path`, closing the file afterwards."""
    # Binary mode: text mode would fail on undecodable data and its newline
    # translation could make different files (e.g. "\r" vs "\n") look equal.
    with open(path, 'rb') as f:
        return f.read()


def main(args=sys.argv[1:]):
    """CLI: find byte-for-byte duplicate files below a directory.

    Files are first grouped by size, since only files of equal size can be
    identical; the files within each group are then compared by content.
    Each set of duplicates is written to stdout as one CSV row of paths.

    With ``--identical-sizes`` the size groups (size -> list of paths) are
    printed as JSON instead and no content comparison is done.

    args: command line arguments, without the program name
    """

    # parse command line options
    parser = DuplicateFilesParser()
    options = parser.parse_args(args)

    # get all file sizes
    sizes = {}
    for dirpath, _dirnames, files in os.walk(options.directory, topdown=True):
        for name in files:
            path = os.path.join(dirpath, name)
            sizes.setdefault(os.path.getsize(path), []).append(path)

    # keep only the sizes shared by more than one file
    identical_sizes = {size: paths for size, paths in sizes.items()
                       if len(paths) > 1}
    if options.identical_sizes:
        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
        # the option is documented as "print ... and exit"
        return

    # now that we've narrowed it down, let's find the identical files
    duplicate_files = []
    for paths in identical_sizes.values():
        pending = list(paths)
        while len(pending) > 1:
            # compare the last pending file against all the others
            ref_file = pending.pop()
            ref = _read_bytes(ref_file)
            matches = []
            different = []
            for path in reversed(pending):
                if _read_bytes(path) == ref:
                    matches.append(path)
                else:
                    different.append(path)
            if matches:
                duplicate_files.append([ref_file] + matches)
            # matched files are done; the rest go another round
            different.reverse()
            pending = different

    # output CSV
    writer = csv.writer(sys.stdout)
    writer.writerows(duplicate_files)
|
711
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
85 |
|
ab831c7621e9
hacky way to note duplicate files
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# run the CLI only when executed as a script, not when imported
if __name__ == "__main__":
    main()
