📚 The CoCalc Library - books, templates and other resources
License: OTHER
"""This module contains a code example related to

Think Python, 2nd Edition
by Allen Downey
http://thinkpython2.com

Copyright 2015 Allen Downey

License: http://creativecommons.org/licenses/by/4.0/
"""

from __future__ import print_function, division

import os

try:
    from shlex import quote as _quote
except ImportError:  # Python 2 provides the same helper in the pipes module
    from pipes import quote as _quote


def walk(dirname):
    """Finds the names of all files in dirname and its subdirectories.

    Any directory whose path contains '__pycache__' is skipped.  Entries
    that are neither regular files nor directories (dangling symlinks,
    FIFOs, sockets, ...) are ignored.

    dirname: string name of directory

    Returns: list of string paths, each prefixed with dirname
    """
    names = []
    if '__pycache__' in dirname:
        return names

    for name in os.listdir(dirname):
        path = os.path.join(dirname, name)

        if os.path.isfile(path):
            names.append(path)
        elif os.path.isdir(path):
            # Recurse only into real directories; calling os.listdir on
            # anything else would raise.
            names.extend(walk(path))
    return names


def compute_checksum(filename):
    """Computes the MD5 checksum of the contents of a file.

    Runs the external `md5sum` command on the file.

    filename: string

    Returns (res, stat), the output of md5sum and its exit status
    (see pipe).
    """
    # Quote the name so spaces and shell metacharacters are passed through
    # to md5sum literally instead of being interpreted by the shell.
    cmd = 'md5sum ' + _quote(filename)
    return pipe(cmd)


def check_diff(name1, name2):
    """Computes the difference between the contents of two files.

    Runs the external `diff` command on the two files.

    name1, name2: string filenames

    Returns (res, stat), the output of diff and its exit status
    (see pipe).
    """
    cmd = 'diff %s %s' % (_quote(name1), _quote(name2))
    return pipe(cmd)


def pipe(cmd):
    """Runs a command in a subprocess.

    cmd: string Unix command

    Returns (res, stat), the output of the subprocess and the exit status.
    stat is whatever the pipe's close() returns: None when the command
    succeeded, otherwise a non-None status value.
    """
    # Note: os.popen is deprecated
    # now, which means we are supposed to stop using it and start using
    # the subprocess module.  But for simple cases, I find
    # subprocess more complicated than necessary.  So I am going
    # to keep using os.popen until they take it away.

    fp = os.popen(cmd)
    try:
        res = fp.read()
    finally:
        # Always close the pipe; close() also reports the exit status.
        # A failing status is returned to the caller rather than asserted
        # on, because commands such as diff legitimately exit non-zero.
        stat = fp.close()
    return res, stat


def compute_checksums(dirname, suffix):
    """Computes checksums for all files with the given suffix.

    dirname: string name of directory to search
    suffix: string suffix to match

    Returns: map from checksum to list of files with that checksum

    Raises: RuntimeError if md5sum fails or prints nothing for a file
    """
    d = {}
    for name in walk(dirname):
        if not name.endswith(suffix):
            continue

        res, stat = compute_checksum(name)
        if stat is not None or not res.strip():
            raise RuntimeError('md5sum failed for %r (exit status %r)'
                               % (name, stat))

        # The output is "<checksum>  <filename>"; take only the first field
        # because the filename itself may contain whitespace.  md5sum puts
        # a backslash in front of the checksum when it had to escape
        # characters in the filename, so drop that marker.
        checksum = res.split(None, 1)[0].lstrip('\\')
        d.setdefault(checksum, []).append(name)

    return d


def check_pairs(names):
    """Checks whether any in a list of files differs from the others.

    names: list of string filenames

    Returns: True if diff ran cleanly and printed nothing for every pair,
    False otherwise
    """
    for name1 in names:
        for name2 in names:
            # name1 < name2 visits each unordered pair exactly once.
            if name1 < name2:
                res, stat = check_diff(name1, name2)
                # Any output means the files differ; a failing status with
                # no output means diff could not compare them, so they
                # cannot be reported as identical either.
                if res or stat is not None:
                    return False
    return True


def print_duplicates(d):
    """Checks for duplicate files.

    Reports any files with the same checksum and checks whether they
    are, in fact, identical.

    d: map from checksum to list of files with that checksum
    """
    for names in d.values():
        if len(names) > 1:
            print('The following files have the same checksum:')
            for name in names:
                print(name)

            if check_pairs(names):
                print('And they are identical.')


if __name__ == '__main__':
    d = compute_checksums(dirname='.', suffix='.py')
    print_duplicates(d)