Merge pull request #40 from sgibson91/improvements
sgibson91 committed Mar 6, 2022
2 parents 112fa62 + f23bdec commit 3a2d8b0
Showing 4 changed files with 30 additions and 116 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -107,7 +107,7 @@ The `compare` command reads in the JSON database generated by running `hash`, th
The command runs a check to test whether the filepath stems are equivalent for all paths that generated a given hash.
If they are, the file is a true duplicate, since both its name and content match.
If they do not match, this implies that the same content is saved under two different filenames.
In this scenario, a `ValueError` is raised and the user is asked to manually investigate these files.
In this scenario, a warning is raised asking the user to manually investigate these files.
If all the filenames for a given hash match, then the shortest filepath is removed from the list and the rest are returned to be deleted.
To delete files, the user needs to run `compare` with the `--purge` flag set.
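As a rough sketch of the check described above (illustrative only; `compare_stems` and the example paths are hypothetical, not part of deduplify's API):

```python
import os
import warnings
from collections import Counter


def compare_stems(file_list):
    """Sketch of the check above: keep one copy of a true duplicate.

    If every path sharing a hash has the same filename stem, drop the
    shortest path (the copy that is kept) and return the rest for deletion.
    Otherwise emit a warning so the user can investigate manually.
    """
    stems = [os.path.splitext(os.path.basename(path))[0] for path in file_list]
    if len(Counter(stems)) == 1:
        # All stems match: the shortest path is kept, the rest are deletable
        file_list.sort(key=len)
        return file_list[1:]
    # Same content saved under different names: ask the user to investigate
    warnings.warn(
        "The following filenames need investigation.\n- " + "\n- ".join(file_list)
    )
    return []


# Hypothetical example: the same photo stored twice under the same name
print(compare_stems(["~/photos/cat.jpg", "~/backup/photos/cat.jpg"]))
# ['~/backup/photos/cat.jpg']
```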
6 changes: 4 additions & 2 deletions deduplify/compare_files.py
@@ -15,6 +15,7 @@

import logging
import os
import warnings
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -71,8 +72,9 @@ def compare_filenames(hash: str, db) -> list:
# but, by coincidence, have the same length
file_list.remove(file_list[0])
else:
raise ValueError(
f"The following filenames need investigation.\n{name_freq}\n{file_list}"
# Hashes are same but filenames are different
warnings.warn(
"The following filenames need investigation.\n- " + "\n- ".join(file_list)
)

return file_list
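Because the hard `ValueError` above is now a `warnings.warn` call, a comparison run keeps going when filenames disagree. A minimal standard-library sketch of how calling code could still collect those messages (the `flaky_comparison` stand-in is hypothetical; it only mimics the warning emitted by `compare_filenames`):

```python
import warnings


def flaky_comparison():
    # Stand-in for compare_filenames: it now warns instead of raising
    # when two files share a hash but not a filename.
    warnings.warn("The following filenames need investigation.\n- a.txt\n- b.txt")
    return []


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")  # record every warning, even repeats
    flaky_comparison()

for w in caught:
    print(f"Needs manual review:\n{w.message}")
```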
96 changes: 24 additions & 72 deletions deduplify/hash_files.py
@@ -8,44 +8,18 @@
Author: Sarah Gibson
Python version: >=3.7 (developed with 3.8)
"""
import fnmatch
import hashlib
import logging
import os
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

from tinydb import TinyDB, where
from tqdm import tqdm
from tinydb import Query, TinyDB

logger = logging.getLogger()
EXPANDED_USER = os.path.expanduser("~")


def get_total_number_of_files(target_dir: str, file_ext: list = ["*"]) -> int:
"""Count the total number of files of a given extension in a directory.
Args:
target_dir (str): The target directory to search.
file_ext (list[str]): A list of file extensions to search for. Default: all
extensions (['*']).
Returns:
int: The number of files with the matching extension within the tree
of the target directory
"""
logger.info("Calculating number of files that will be hashed in %s" % target_dir)

num_of_files = 0
for ext in file_ext:
num_of_files += len(fnmatch.filter(os.listdir(target_dir), f"*.{ext}"))

logger.info(f"{num_of_files} files to be hashed in {target_dir}")

return num_of_files


def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
"""Calculate the MD5 hash of a given file
@@ -74,41 +48,6 @@ def hashfile(path: str, blocksize: int = 65536) -> Tuple[str, str]:
return hasher.hexdigest(), path.replace(EXPANDED_USER, "~")
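The body of `hashfile` is collapsed in this view; for context, the block-wise MD5 pattern its signature and return value imply looks roughly like this (a sketch under that assumption, not the package's exact implementation):

```python
import hashlib
import os

EXPANDED_USER = os.path.expanduser("~")


def md5_in_blocks(path: str, blocksize: int = 65536) -> tuple:
    """Hash a file in fixed-size blocks so large files never sit in memory."""
    hasher = hashlib.md5()
    with open(path, "rb") as f:
        block = f.read(blocksize)
        while block:
            hasher.update(block)
            block = f.read(blocksize)
    # Return the digest alongside a home-relative path, as hashfile does
    return hasher.hexdigest(), path.replace(EXPANDED_USER, "~")
```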


def identify_duplicates(db):
"""Identify duplicated documents in a given TinyDB database based on the whether
the hash key in each document is unique in the whole database.
Args:
db (TinyDB database): The TinyDB database object to be filtered
Returns:
db (TinyDB database): The database updated with the "duplicate" key
containing a Boolean value indicating if the file has a duplicate or not.
"""
logger.info("Filtering the results...")

all_rows = db.all()
all_hashes = [row["hash"] for row in all_rows]
counted_hashes = Counter(all_hashes)

# Add duplicate key to each document in the database indicating
# whether it is a duplicate or not
for k, v in counted_hashes.items():
if v == 1:
db.update({"duplicate": False}, where("hash") == k)
elif v > 1:
db.update({"duplicate": True}, where("hash") == k)

# Calculate number of unique and duplicated files
unique = db.search(where("duplicate") == False)
logger.info("Number of unique files: %s" % len(unique))

duplicates = db.search(where("duplicate") == True)
logger.info("Number of duplicated files: %s" % len(duplicates))

return db


def restart_run(db) -> list:
"""When restarting a hash run, identify which files need to be skipped from the
database
@@ -146,8 +85,7 @@ def run_hash(
raise ValueError("Please provide a known filepath!")

hashes_db = TinyDB(dbfile)

total_file_num = get_total_number_of_files(dir, file_ext)
DBQuery = Query()

if restart:
files_to_skip = restart_run(hashes_db)
@@ -157,23 +95,37 @@
logger.info("Walking structure of: %s" % dir)
logger.info("Generating MD5 hashes for files...")

total = total_file_num - len(files_to_skip)
pbar = tqdm(total=total)

count_files_hashed = 0
for dirName, _, fileList in os.walk(dir):
with ThreadPoolExecutor(max_workers=count) as executor:
futures = [
executor.submit(hashfile, os.path.join(dirName, filename))
for filename in fileList
if filename not in files_to_skip
if os.path.splitext(filename)[1] in file_ext
if os.path.splitext(filename)[1].replace(".", "") in file_ext
or file_ext == ["*"]
]
for future in as_completed(futures):
hash, filepath = future.result()
hashes_db.insert({"hash": hash, "filepath": filepath})

pbar.update(1)
if hashes_db.contains(DBQuery.hash == hash):
hashes_db.insert(
{"hash": hash, "filepath": filepath, "duplicate": True}
)
hashes_db.update({"duplicate": True}, DBQuery.hash == hash)
else:
hashes_db.insert(
{"hash": hash, "filepath": filepath, "duplicate": False}
)

pbar.close()
count_files_hashed += 1
print(f"Total files hashed: {count_files_hashed}", end="\r", flush=True)

hashes_db = identify_duplicates(hashes_db)
# Calculate number of unique and duplicated files
logger.info("Number of files hashed: %s" % len(hashes_db))
logger.info(
"Number of unique files: %s" % hashes_db.count(DBQuery.duplicate == False)
)
logger.info(
"Number of duplicated files: %s" % hashes_db.count(DBQuery.duplicate == True)
)
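The rewritten `run_hash` flags duplicates at insert time with a TinyDB `Query`, replacing the removed post-hoc `identify_duplicates` pass. A self-contained sketch of that pattern (the database path, hashes and file paths below are made up for illustration):

```python
from tinydb import Query, TinyDB

db = TinyDB("hashes_example.json")  # illustrative path; deduplify passes in `dbfile`
DBQuery = Query()


def record_hash(hash_value: str, filepath: str) -> None:
    """Insert a hash, flagging duplicates as they arrive."""
    if db.contains(DBQuery.hash == hash_value):
        # This hash is already recorded: the new entry is a duplicate,
        # and any earlier entries with the same hash must be re-flagged.
        db.insert({"hash": hash_value, "filepath": filepath, "duplicate": True})
        db.update({"duplicate": True}, DBQuery.hash == hash_value)
    else:
        db.insert({"hash": hash_value, "filepath": filepath, "duplicate": False})


record_hash("abc123", "~/photos/cat.jpg")
record_hash("abc123", "~/backup/cat.jpg")
record_hash("def456", "~/docs/report.pdf")

print(len(db))                               # 3 files hashed
print(db.count(DBQuery.duplicate == True))   # 2 duplicates
print(db.count(DBQuery.duplicate == False))  # 1 unique file
```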
42 changes: 1 addition & 41 deletions tests/test_hash.py
@@ -1,26 +1,8 @@
import os
from tempfile import NamedTemporaryFile

from tinydb import TinyDB

from deduplify.hash_files import (
get_total_number_of_files,
hashfile,
identify_duplicates,
restart_run,
)


def test_get_total_number_of_files():
dirpath = os.path.join("tests", "testdir")

output1 = get_total_number_of_files(dirpath)
output2 = get_total_number_of_files(dirpath, file_ext=["txt"])
output3 = get_total_number_of_files(dirpath, file_ext=["txt", "xml"])

assert output1 == 3
assert output2 == 1
assert output3 == 3
from deduplify.hash_files import hashfile, restart_run


def test_hashfile():
@@ -40,25 +22,3 @@ def test_restart_run():
files_to_be_skipped = restart_run(test_db)

assert files_to_be_skipped == expected_list


def test_identify_duplicates():
with NamedTemporaryFile("w") as test_f, NamedTemporaryFile("w") as expected_f:
test_db = TinyDB(test_f.name)
expected_db = TinyDB(expected_f.name)

test_db.insert_multiple(
[
{"hash": "hash1", "filepath": "file1.txt"},
{"hash": "hash1", "filepath": "file2.txt"},
]
)
expected_db.insert_multiple(
[
{"hash": "hash1", "filepath": "file1.txt", "duplicate": True},
{"hash": "hash1", "filepath": "file2.txt", "duplicate": True},
]
)
updated_db = identify_duplicates(test_db)

assert expected_db.all() == updated_db.all()
