Mercurial > hg > config
annotate python/find_duplicate_files.py @ 929:7c4be71a560b default tip
remove old aliases
| author | Jeff Hammel <k0scist@gmail.com> | 
|---|---|
| date | Mon, 20 Oct 2025 15:22:19 -0700 | 
| parents | aa9a3850ed56 | 
| children | 
| rev | line source | 
|---|---|
| 711 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 1 #!/usr/bin/env python | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 2 # -*- coding: utf-8 -*- | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 3 | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 4 """ | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 5 find duplicate files in a directory | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 6 """ | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 7 | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 8 # imports | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 9 import argparse | 
| 826 | 10 import csv | 
| 11 import difflib | |
| 12 import json | |
| 711 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 13 import os | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 14 import sys | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 15 | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 16 | 
| 799 
dbd2562cb03e
remove old way of doing things; note TODO on replacing
 Jeff Hammel <k0scist@gmail.com> parents: 
711diff
class DuplicateFilesParser(argparse.ArgumentParser):
    """CLI option parser

    Accepts one positional ``directory`` argument and an optional
    ``--identical-sizes`` flag.  The most recently parsed options are
    kept on ``self.options`` (``None`` until `parse_args` succeeds).
    """

    def __init__(self, **kwargs):
        """Build the parser.

        Keyword arguments are forwarded to `argparse.ArgumentParser`;
        ``description`` defaults to this module's docstring.
        """
        kwargs.setdefault('description', __doc__)
        # go through super() rather than naming the base class so the
        # parser keeps working if it is ever mixed into another hierarchy
        super(DuplicateFilesParser, self).__init__(**kwargs)
        self.add_argument('directory',
                          help="directory to search for duplicate files")
        self.add_argument('--identical-sizes', dest='identical_sizes',
                          action='store_true', default=False,
                          help="print out all matches with identical sizes and exit")
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse and validate the command line.

        Arguments are passed through unchanged to
        `argparse.ArgumentParser.parse_args`.  The resulting namespace is
        validated, stored on ``self.options`` and returned.
        """
        options = super(DuplicateFilesParser, self).parse_args(*args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options

        Reports an error through `self.error` when ``options.directory``
        is not an existing directory.
        """
        if not os.path.isdir(options.directory):
            self.error("Not a directory: {}".format(options.directory))
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 39 | 
| 801 | 40 | 
| 711 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
def _group_identical(paths):
    """Return groups of byte-identical files among `paths`.

    The last path is taken as the reference and compared against the
    remaining ones, last to first; matches are grouped with it and
    removed from further consideration.  This repeats until fewer than
    two candidates remain.  Only groups with at least two members are
    returned.  The input list is not modified.
    """
    # function-scope import: filecmp is not among the module's imports
    import filecmp

    remaining = list(paths)
    groups = []
    while len(remaining) > 1:
        ref_file = remaining.pop()
        duplicates = [ref_file]
        unmatched = []
        for path in reversed(remaining):
            # shallow=False forces a chunked byte-for-byte comparison, so
            # binary files are handled and nothing is loaded whole
            if filecmp.cmp(ref_file, path, shallow=False):
                duplicates.append(path)
            else:
                unmatched.append(path)
        unmatched.reverse()  # restore the original relative order
        remaining = unmatched
        if len(duplicates) > 1:
            groups.append(duplicates)
    return groups


def main(args=sys.argv[1:]):
    """CLI

    Walk the given directory, bucket files by size, and write each set
    of byte-identical files as one CSV row on stdout.  With
    ``--identical-sizes`` the size buckets holding more than one file
    are printed as JSON and nothing else is done.
    """

    # parse command line options
    parser = DuplicateFilesParser()
    options = parser.parse_args(args)

    # get all file sizes
    sizes = {}
    for dirpath, _dirnames, files in os.walk(options.directory, topdown=True):
        for name in files:
            path = os.path.join(dirpath, name)
            try:
                size = os.path.getsize(path)
            except OSError as error:
                # e.g. a dangling symlink or a file removed mid-walk:
                # report it and keep scanning instead of aborting
                sys.stderr.write("Skipping {}: {}\n".format(path, error))
                continue
            sizes.setdefault(size, []).append(path)

    # only sizes shared by several files can contain duplicates
    identical_sizes = {size: paths for size, paths in sizes.items()
                       if len(paths) > 1}
    if options.identical_sizes:
        print(json.dumps(identical_sizes, indent=2, sort_keys=True))
        # the option is documented as "print ... and exit"
        return

    # now that we've narrowed it down, let's find the identical files
    duplicate_files = []
    for paths in identical_sizes.values():
        duplicate_files.extend(_group_identical(paths))

    # output CSV
    writer = csv.writer(sys.stdout)
    writer.writerows(duplicate_files)
| 711 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
changeset | 85 | 
| 
ab831c7621e9
hacky way to note duplicate files
 Jeff Hammel <k0scist@gmail.com> parents: diff
# run the CLI only when executed as a script, not when imported
if __name__ == '__main__':
    main()
