11import os
22import sys
3- import hashlib
3+ # import 5 hash functions from hashlib
4+ import timeit
5+ from hashlib import md5 , sha1 , blake2b
6+ from xxhash import xxh64 , xxh128
47
8+ hashfunc = 0
9+ import_module = "import random"
510
6- def find_duplicates (folders ):
11+ def duplicates (folders ):
712 dup_size = {}
813 for i in folders :
914 if os .path .exists (i ):
@@ -17,14 +22,12 @@ def find_duplicates(folders):
1722 for dup_list in dup_size .values ():
1823 if len (dup_list ) > 1 :
1924 join_dicts (dups , find_duplicate_hash (dup_list ))
20- print_results (dups )
21- return dups
2225
2326
2427def find_duplicate_size (parent_dir ):
2528 dups = {} # format {size:[filepaths]}
2629 for dirName , subdirs , fileList in os .walk (parent_dir ):
27- # print(dirName, subdirs, fileList)
30+ print (dirName , subdirs , fileList )
2831 print ('Scanning %s ' % dirName )
2932 for filename in fileList :
3033 path = os .path .join (dirName , filename )
@@ -62,7 +65,24 @@ def join_dicts(dict1, dict2):
6265
6366def hashfile (path , blocksize = 65536 ):
6467 file = open (path , 'rb' )
65- hasher = hashlib .md5 ()
68+
69+ hasher = md5 ()
70+ # use switch case for hash functions
71+ match hashfunc :
72+ case 0 :
73+ hasher = md5 ()
74+ case 1 :
75+ hasher = sha1 ()
76+ case 2 :
77+ hasher = blake2b ()
78+ case 3 :
79+ hasher = xxh64 ()
80+ case 4 :
81+ hasher = xxh128 ()
82+ case _:
83+ print ("Invalid hash function" )
84+ sys .exit (1 )
85+
6686 buf = file .read (blocksize )
6787 while len (buf ) > 0 :
6888 hasher .update (buf )
@@ -88,21 +108,45 @@ def print_results(dict1):
88108
89109
def find_duplicates(dir):
    """Run the duplicate scan on a single directory.

    Args:
        dir: Path of the directory to scan. (The name shadows the builtin
            ``dir``; it is kept so existing keyword callers keep working.)

    Returns:
        Whatever ``duplicates()`` returns for the one-element folder list.

    NOTE(review): the visible tail of ``duplicates()`` no longer ends with a
    ``return`` statement, so this may currently yield ``None`` -- confirm
    against the full definition.
    """
    # Delegate to duplicates(); calling this function by its own name here
    # would recurse without ever terminating.
    return duplicates([dir])
98113
def remove_duplicates(dups):
    """Delete every file except the first one in each duplicate group.

    Args:
        dups: Mapping whose values are lists of file paths considered
            duplicates of one another. ``None`` or an empty mapping is
            treated as "nothing to delete".

    Returns:
        ``True`` if a non-empty mapping was processed, ``False`` otherwise.

    Raises:
        OSError: Propagated from ``os.remove`` if a file cannot be deleted.
    """
    # Truthiness covers both the empty mapping and None, so a scan that
    # produced no result is reported instead of raising TypeError.
    if not dups:
        print("Duplicates not deleted")
        return False

    for paths in dups.values():
        # The first path in each group is the copy that is kept.
        for path in paths[1:]:
            os.remove(path)
    print("Duplicates deleted")
    return True
107124
108-
def main():
    """Time the duplicate scan over a fixed set of folders per hash function.

    For each hash-function index 0-4 (md5, sha1, blake2b, xxh64, xxh128, in
    the order used by ``hashfile()``), sets the module-level ``hashfunc``
    selector, scans every folder with ``find_duplicates()``, and records the
    elapsed wall-clock time. The timings are printed at the end.

    Side effects:
        Rebinds the module-level ``hashfunc``; it is left at the last index
        when the function returns.
    """
    # Without this declaration the assignment in the loop creates a local
    # variable, hashfile() keeps reading the module-level value, and every
    # timing run silently measures the same hash function.
    global hashfunc

    folders = [
        "D:\\GitHub\\codebrewers-hackathon",
        "D:\\GitHub\\climateview",
        "D:\\GitHub\\js-samples",
        "D:\\GitHub\\MemoryGrid",
        "D:\\GitHub\\portfolio",
    ]
    # Position in this tuple is the selector value understood by hashfile().
    hash_names = ("md5", "sha1", "blake2b", "xxh64", "xxh128")

    results = []
    for index in range(len(hash_names)):
        print("Hash function", index, "is being used")
        hashfunc = index
        starttime = timeit.default_timer()
        for folder in folders:
            find_duplicates(folder)
        results.append(timeit.default_timer() - starttime)

    print("\nTime taken for 5 hash functions: ")
    for name, elapsed in zip(hash_names, results):
        print(name + ": ", elapsed, " seconds")
150+
# Run the timing comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments