-
Notifications
You must be signed in to change notification settings - Fork 2
/
duplicate-file-finder.py
84 lines (67 loc) · 2.66 KB
/
duplicate-file-finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import hashlib
import os
import stat
from collections import defaultdict
from typing import Dict, List, Set, Tuple
def calculate_file_hash(filepath: str, chunk_size: int = 65536) -> str:
    """Return the SHA-256 digest of a file's content as a hex string.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so memory use stays bounded regardless of the file's size.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration. Must be at
            least 1. The digest does not depend on this value.

    Returns:
        The hexadecimal SHA-256 digest of the file's bytes.

    Raises:
        ValueError: If chunk_size is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        # read(0) returns b'' immediately (which would hash nothing) and a
        # negative size reads the whole file at once, defeating chunking.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    hash_sha256 = hashlib.sha256()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel keeps calling read() until it returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
def find_identical_files(root_dir: str) -> Dict[str, List[Tuple[str, Set[str]]]]:
    """
    Find files with identical names and content across subdirectories.

    The tree under root_dir is walked once to group regular files by base
    name. Only files that share both a name and a byte size with at least
    one other file are hashed afterwards, because nothing else can be a
    duplicate. Files that cannot be inspected or read are reported on
    stdout and skipped.

    Args:
        root_dir: Root directory to start the search from

    Returns:
        Dictionary with filename as key and list of tuples containing
        file hash and set of full paths as value. Only hashes shared by
        two or more paths are included, and filenames without any such
        hash are omitted.
    """
    # Pass 1: filename -> [(full_path, size), ...] in walk order.
    candidates_by_name = defaultdict(list)
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            try:
                # os.stat follows symlinks, so a link to a regular file still counts.
                info = os.stat(full_path)
            except OSError as e:
                print(f"Error processing {full_path}: {e}")
                continue
            if not stat.S_ISREG(info.st_mode):
                # Opening a FIFO or device node could block indefinitely.
                continue
            candidates_by_name[filename].append((full_path, info.st_size))

    # Pass 2: hash only the files that could still match another one.
    result = {}
    for filename, candidates in candidates_by_name.items():
        if len(candidates) < 2:
            continue

        size_counts = defaultdict(int)
        for _, size in candidates:
            size_counts[size] += 1

        paths_by_hash = defaultdict(set)
        for full_path, size in candidates:
            if size_counts[size] < 2:
                # A file with a unique size cannot equal any sibling.
                continue
            try:
                file_hash = calculate_file_hash(full_path)
            except OSError as e:
                print(f"Error processing {full_path}: {e}")
                continue
            paths_by_hash[file_hash].add(full_path)

        # Keep only the hashes that really occur at more than one location.
        hash_path_list = [
            (file_hash, paths)
            for file_hash, paths in paths_by_hash.items()
            if len(paths) > 1
        ]
        if hash_path_list:
            result[filename] = hash_path_list
    return result
def display_results(results: Dict[str, List[Tuple[str, Set[str]]]]) -> None:
    """Print a human-readable report of the duplicate groups to stdout.

    Args:
        results: Mapping of filename to a list of (hash, paths) tuples.
            An empty mapping prints a short "nothing found" notice.
    """
    if not results:
        print("No identical files found.")
        return

    section_rule = "-" * 80
    name_rule = "=" * 40

    print("\nFindings:")
    print(section_rule)

    for name, groups in results.items():
        print(f"\nFilename: {name}")
        print(name_rule)

        for digest, locations in groups:
            print(f"\nHash: {digest}")
            print("Locations:")
            # Sets are unordered; sort so the listing is stable between runs.
            for location in sorted(locations):
                print(f" - {location}")

        # Close off this filename's section.
        print(section_rule)
def main() -> None:
    """Scan the current working directory and print the duplicates found.

    The directory is taken from os.getcwd(), announced on stdout, searched
    with find_identical_files, and the outcome is passed to display_results.
    """
    root_dir = os.getcwd()
    print(f"Scanning directory: {root_dir}")
    results = find_identical_files(root_dir)
    display_results(results)
# Run the scan only when this file is executed as a script; importing the
# module just defines the functions without triggering a scan.
if __name__ == "__main__":
    main()