feat(scanner): add duplicate-scanner functionality

Add the duplicate-scanner functionality. It builds on the previously
implemented SHA-256 hashing helper to detect duplicate files and delete
them. Command-line arguments are provided for a dry-run mode and for
selecting the directory to scan.
This commit is contained in:
inference 2025-06-25 21:53:57 +00:00
parent a67a40c571
commit 90482e42d7
Signed by: inference
SSH Key Fingerprint: SHA256:/O3c09/4f1lh4zrhFs2qvQEDda6dZbTwG9xEcj8OfWo

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python
# Duplicate Check
# Version: 0.14.0
# Version: 0.15.0
# Copyright 2025 Jake Winters
# SPDX-License-Identifier: BSD-3-Clause
@ -23,4 +23,23 @@ def hash_file(file_path):
with open(file_path, 'rb') as f:
for byte_block in iter(lambda: f.read(65536), b''):
sha256_hash.update(byte_block)
return sha256_hash.hexdigest()
return sha256_hash.hexdigest()
def find_and_delete_duplicates(directory, dry_run):
    """Delete files in *directory* whose SHA-256 digest duplicates an earlier file.

    Entries are scanned in sorted name order, so the first file (alphabetically)
    with a given content hash is kept and every later duplicate is removed.
    Only regular files directly inside *directory* are considered; it does not
    recurse into subdirectories.

    Parameters:
        directory: path of the directory to scan (absolute or relative).
        dry_run:   when truthy, duplicates are reported but NOT deleted.

    Fix vs. previous revision: the old implementation did os.chdir(directory)
    then os.chdir('..'), which fails to restore the caller's working directory
    for absolute or nested relative paths. Paths are now joined explicitly and
    the process working directory is never changed.
    """
    file_hashes = {}  # SHA-256 hex digest -> path of the file kept for that digest
    # Sorted order makes the surviving file deterministic (os.listdir order is arbitrary).
    for name in sorted(os.listdir(directory)):
        file_path = os.path.abspath(os.path.join(directory, name))
        if not os.path.isfile(file_path):
            continue  # skip subdirectories, symlinks to dirs, etc.
        file_hash = hash_file(file_path)
        if file_hash not in file_hashes:
            file_hashes[file_hash] = file_path
            continue
        # Report before removing so an interrupted run never deletes silently.
        print(f"Duplicate detected: {file_path}")
        if not dry_run:
            os.remove(file_path)
            print(f"Duplicate deleted: {file_path}")
find_and_delete_duplicates(args.directory, args.dry_run)