Merge pull request #40 from sgibson91/improvements
sgibson91 committed Mar 6, 2022
2 parents 112fa62 + f23bdec commit 3a2d8b0
Showing 4 changed files with 30 additions and 116 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -107,7 +107,7 @@ The `compare` command reads in the JSON database generated by running `hash`, th
The command runs a check to test whether the filepath stems are equivalent for all paths that generated a given hash.
If they are, the file is a true duplicate, since both its name and content match.
If they do not match, this implies that the same content is saved under two different filenames.
In this scenario, a `ValueError` is raised and the user is asked to manually investigate these files.
In this scenario, a warning is raised asking the user to manually investigate these files.
If all the filenames for a given hash match, then the shortest filepath is removed from the list and the rest are returned to be deleted.
To delete files, the user needs to run `compare` with the `--purge` flag set.
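As a rough sketch of the check described above (illustrative only; `compare_stems` and the example paths are hypothetical, not part of deduplify's API):

```python
import os
import warnings
from collections import Counter


def compare_stems(file_list):
    """Sketch of the check above: keep one copy of a true duplicate.

    If every path sharing a hash has the same filename stem, drop the
    shortest path (the copy that is kept) and return the rest for deletion.
    Otherwise emit a warning so the user can investigate manually.
    """
    stems = [os.path.splitext(os.path.basename(path))[0] for path in file_list]
    if len(Counter(stems)) == 1:
        # All stems match: the shortest path is kept, the rest are deletable
        file_list.sort(key=len)
        return file_list[1:]
    # Same content saved under different names: ask the user to investigate
    warnings.warn(
        "The following filenames need investigation.\n- " + "\n- ".join(file_list)
    )
    return []


# Hypothetical example: the same photo stored twice under the same name
print(compare_stems(["~/photos/cat.jpg", "~/backup/photos/cat.jpg"]))
# ['~/backup/photos/cat.jpg']
```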
6 changes: 4 additions & 2 deletions deduplify/compare_files.py
@@ -15,6 +15,7 @@

import logging
import os
import warnings
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -71,8 +72,9 @@ def compare_filenames(hash: str, db) -> list:
# but, by coincidence, have the same length
file_list.remove(file_list[0])
else:
raise ValueError(
f"The following filenames need investigation.\n{name_freq}\n{file_list}"
# Hashes are same but filenames are different
warnings.warn(
"The following filenames need investigation.\n- " + "\n- ".join(file_list)
)

return file_list
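Because the hard `ValueError` above is now a `warnings.warn` call, a comparison run keeps going when filenames disagree. A minimal standard-library sketch of how calling code could still collect those messages (the `flaky_comparison` stand-in is hypothetical; it only mimics the warning emitted by `compare_filenames`):

```python
import warnings


def flaky_comparison():
    # Stand-in for compare_filenames: it now warns instead of raising
    # when two files share a hash but not a filename.
    warnings.warn("The following filenames need investigation.\n- a.txt\n- b.txt")
    return []


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # record every warning, even repeats
    flaky_comparison()

for w in caught:
    print(f"Needs manual review:\n{w.message}")
```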
96 changes: 24 additions & 72 deletions deduplify/hash_files.py
@@ -8,44 +8,18 @@
Author: Sarah Gibson
Python version: >=3.7 (developed with 3.8)
"""
import fnmatch
import hashlib
import logging
import os
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

from tinydb import TinyDB, where
from tqdm import tqdm
from tinydb import Query, TinyDB

logger = logging.getLogger()
EXPANDED_USER = os.path.expanduser("~")


def get_total_number_of_files(target_dir: str, file_ext: list = ["*"]) -> int:
"""Count the total number of files of a given extension in a directory.
Args:
target_dir (str): The target directory to search.
file_ext (list[str]): A list of file extensions to search for. Default: all
extensions (['*']).
Returns:
int: The number of files with the matching extension within the tree
of the target directory
"""
logger.info("Calculating number of files that will be hashed in %s" % target_dir)

num_of_files = 0
for ext in file_ext:
num_of_files += len(fnmatch.filter(os.listdir(target_dir), f"*.{ext}"))

logger.info(f"{num_of_files} files to be hashed in {target_dir}")

return num_of_files


def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
"""Calculate the MD5 hash of a given file
@@ -74,41 +48,6 @@ def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
return hasher.hexdigest(), path.replace(EXPANDED_USER, "~")
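The body of `hashfile` is collapsed in this view; for context, the block-wise MD5 pattern its signature and return value imply looks roughly like this (a sketch under that assumption, not the package's exact implementation):

```python
import hashlib
import os

EXPANDED_USER = os.path.expanduser("~")


def md5_in_blocks(path: str, blocksize: int = 65536) -> tuple:
    """Hash a file in fixed-size blocks so large files never sit in memory."""
    hasher = hashlib.md5()
    with open(path, "rb") as f:
        block = f.read(blocksize)
        while block:
            hasher.update(block)
            block = f.read(blocksize)
    # Return the digest alongside a home-relative path, as hashfile does
    return hasher.hexdigest(), path.replace(EXPANDED_USER, "~")
```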


def identify_duplicates(db):
"""Identify duplicated documents in a given TinyDB database based on the whether
the hash key in each document is unique in the whole database.
Args:
db (TinyDB database): The TinyDB database object to be filtered
Returns:
db (TinyDB database): The database updated with the "duplicate" key
containing a Boolean value indicating if the file has a duplicate or not.
"""
logger.info("Filtering the results...")

all_rows = db.all()
all_hashes = [row["hash"] for row in all_rows]
counted_hashes = Counter(all_hashes)

# Add duplicate key to each document in the database indicating
# whether it is a duplicate or not
for k, v in counted_hashes.items():
if v == 1:
db.update({"duplicate": False}, where("hash") == k)
elif v > 1:
db.update({"duplicate": True}, where("hash") == k)

# Calculate number of unique and duplicated files
unique = db.search(where("duplicate") == False)
logger.info("Number of unique files: %s" % len(unique))

duplicates = db.search(where("duplicate") == True)
logger.info("Number of duplicated files: %s" % len(duplicates))

return db


def restart_run(db) -> list:
"""When restarting a hash run, identify which files need to be skipped from the
database
@@ -146,8 +85,7 @@ def run_hash(
raise ValueError("Please provide a known filepath!")

hashes_db = TinyDB(dbfile)

total_file_num = get_total_number_of_files(dir, file_ext)
DBQuery = Query()

if restart:
files_to_skip = restart_run(hashes_db)
@@ -157,23 +95,37 @@
logger.info("Walking structure of: %s" % dir)
logger.info("Generating MD5 hashes for files...")

total = total_file_num - len(files_to_skip)
pbar = tqdm(total=total)

count_files_hashed = 0
for dirName, _, fileList in os.walk(dir):
with ThreadPoolExecutor(max_workers=count) as executor:
futures = [
executor.submit(hashfile, os.path.join(dirName, filename))
for filename in fileList
if filename not in files_to_skip
if os.path.splitext(filename)[1] in file_ext
if os.path.splitext(filename)[1].replace(".", "") in file_ext
or file_ext == ["*"]
]
for future in as_completed(futures):
hash, filepath = future.result()
hashes_db.insert({"hash": hash, "filepath": filepath})

pbar.update(1)
if hashes_db.contains(DBQuery.hash == hash):
hashes_db.insert(
{"hash": hash, "filepath": filepath, "duplicate": True}
)
hashes_db.update({"duplicate": True}, DBQuery.hash == hash)
else:
hashes_db.insert(
{"hash": hash, "filepath": filepath, "duplicate": False}
)

pbar.close()
count_files_hashed += 1
print(f"Total files hashed: {count_files_hashed}", end="\r", flush=True)

hashes_db = identify_duplicates(hashes_db)
# Calculate number of unique and duplicated files
logger.info("Number of files hashed: %s" % len(hashes_db))
logger.info(
"Number of unique files: %s" % hashes_db.count(DBQuery.duplicate == False)
)
logger.info(
"Number of duplicated files: %s" % hashes_db.count(DBQuery.duplicate == True)
)
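The rewritten `run_hash` flags duplicates at insert time with a TinyDB `Query`, replacing the removed post-hoc `identify_duplicates` pass. A self-contained sketch of that pattern (the database path, hashes and file paths below are made up for illustration):

```python
from tinydb import Query, TinyDB

db = TinyDB("hashes_example.json")  # illustrative path; deduplify passes in `dbfile`
DBQuery = Query()


def record_hash(hash_value: str, filepath: str) -> None:
    """Insert a hash, flagging duplicates as they arrive."""
    if db.contains(DBQuery.hash == hash_value):
        # This hash is already recorded: the new entry is a duplicate,
        # and any earlier entries with the same hash must be re-flagged.
        db.insert({"hash": hash_value, "filepath": filepath, "duplicate": True})
        db.update({"duplicate": True}, DBQuery.hash == hash_value)
    else:
        db.insert({"hash": hash_value, "filepath": filepath, "duplicate": False})


record_hash("abc123", "~/photos/cat.jpg")
record_hash("abc123", "~/backup/cat.jpg")
record_hash("def456", "~/docs/report.pdf")

print(len(db))                               # 3 files hashed
print(db.count(DBQuery.duplicate == True))   # 2 duplicates
print(db.count(DBQuery.duplicate == False))  # 1 unique file
```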
42 changes: 1 addition & 41 deletions tests/test_hash.py
@@ -1,26 +1,8 @@
import os
from tempfile import NamedTemporaryFile

from tinydb import TinyDB

from deduplify.hash_files import (
get_total_number_of_files,
hashfile,
identify_duplicates,
restart_run,
)


def test_get_total_number_of_files():
dirpath = os.path.join("tests", "testdir")

output1 = get_total_number_of_files(dirpath)
output2 = get_total_number_of_files(dirpath, file_ext=["txt"])
output3 = get_total_number_of_files(dirpath, file_ext=["txt", "xml"])

assert output1 == 3
assert output2 == 1
assert output3 == 3
from deduplify.hash_files import hashfile, restart_run


def test_hashfile():
@@ -40,25 +22,3 @@ def test_restart_run():
files_to_be_skipped = restart_run(test_db)

assert files_to_be_skipped == expected_list


def test_identify_duplicates():
with NamedTemporaryFile("w") as test_f, NamedTemporaryFile("w") as expected_f:
test_db = TinyDB(test_f.name)
expected_db = TinyDB(expected_f.name)

test_db.insert_multiple(
[
{"hash": "hash1", "filepath": "file1.txt"},
{"hash": "hash1", "filepath": "file2.txt"},
]
)
expected_db.insert_multiple(
[
{"hash": "hash1", "filepath": "file1.txt", "duplicate": True},
{"hash": "hash1", "filepath": "file2.txt", "duplicate": True},
]
)
updated_db = identify_duplicates(test_db)

assert expected_db.all() == updated_db.all()
