add dynamic language support based on available language dirs (#8)

* implement dynamic language detection * update SafeText for automatic language handling * remove init files containing classes that are no longer needed * delete languages base module * move baseprofanity as profanity checker * update readme * remove languages/init * minor fix * simplify code
safevideo · Dec 11, 2023 · ed8124e · ed8124e
1 parent 4e39bd2
commit ed8124e
Show file tree

Hide file tree

Showing 10 changed files with 157 additions and 156 deletions.
diff --git a/README.md b/README.md
@@ -1,3 +1,14 @@
+<div align="center">
+  <p>
+    <a align="center" href="" target="_blank">
+      <img
+        width="1280"
+        src="https://github.com/safevideo/safetext/assets/44926076/9af66dde-3a93-4c5b-b802-cb31dffcb2e5"
+      >
+    </a>
+  </p>
+</div>
+
 # safetext
 
 Rule-based profanity checking tool for English and Turkish.

diff --git a/safetext/__init__.py b/safetext/__init__.py
@@ -1,10 +1,6 @@
-from safetext.utils import detect_language_from_srt, detect_language_from_text
+import os
 
-from .languages.de import GermanProfanityChecker
-from .languages.en import EnglishProfanityChecker
-from .languages.es import SpanishProfanityChecker
-from .languages.pt import PortugueseProfanityChecker
-from .languages.tr import TurkishProfanityChecker
+from safetext.utils import detect_language_from_srt, detect_language_from_text
 
 __version__ = "0.0.4"
 
@@ -17,20 +13,17 @@ def __init__(self, language="en"):
         if language is not None:
             self.set_language(language)
 
-    def set_language(self, language):
+    def set_language(self, language: str):
+        """
+        Sets the language of the profanity checker.
+
+        The language is accepted only if a `words.txt` file exists for it at the path
+        returned by `_get_words_filepath`.
+
+        Args:
+            language (str): The language to use; it is the name of the directory under
+                `languages/` that holds the word list (e.g. "en").
+
+        Raises:
+            ValueError: If no `words.txt` file exists for `language`.
+        """
+        words_file_path = self._get_words_filepath(language)
+        if not os.path.exists(words_file_path):
+            raise ValueError(f"No profanity word list found for language '{language}'.")
+
         self.language = language
-        if language == "en":
-            self.checker = EnglishProfanityChecker()
-        elif language == "tr":
-            self.checker = TurkishProfanityChecker()
-        elif language == "es":
-            self.checker = SpanishProfanityChecker()
-        elif language == "de":
-            self.checker = GermanProfanityChecker()
-        elif language == "pt":
-            self.checker = PortugueseProfanityChecker()
-        else:
-            raise ValueError("Language not supported")
+        self.checker = ProfanityChecker(language)
+
+    def _get_words_filepath(self, language: str) -> str:
+        """Build the path of the `words.txt` file for `language`, relative to this module's directory."""
+        package_dir = os.path.dirname(__file__)
+        relative_words_path = f"languages/{language}/words.txt"
+        return os.path.join(package_dir, relative_words_path)
 
     def set_language_from_text(self, text):
         """
@@ -76,7 +69,7 @@ def check_profanity(self, text):
                 - end: The end index of the profanity word in the text.
         """
         if self.checker is None:
-            raise ValueError("Language not set")
+            self._auto_set_language(text)
         return self.checker.check(text)
 
     def censor_profanity(self, text):
@@ -90,5 +83,90 @@ def censor_profanity(self, text):
             str: The censored text. The profanity words are replaced with asterisks.
         """
         if self.checker is None:
-            raise ValueError("Language not set")
+            self._auto_set_language(text)
         return self.checker.censor(text)
+
+    def _auto_set_language(self, text: str):
+        """Detect the language of `text` and pass the detected code to `set_language`."""
+        self.set_language(detect_language_from_text(text))
+
+
+class ProfanityChecker:
+    """
+    Base class for profanity checkers.
+
+    Flags the whitespace-separated words of a text that appear (case-insensitively)
+    in the `languages/<language>/words.txt` list located next to this module.
+    """
+
+    def __init__(self, language):
+        """
+        Args:
+            language (str): Name of the directory under `languages/` that holds `words.txt`.
+        """
+        self.language = language
+
+    @property
+    def words_filepath(self):
+        """Get the filepath for the profanity words file."""
+        import pathlib
+
+        return f"{pathlib.Path(__file__).parent.resolve()}/languages/{self.language}/words.txt"
+
+    @property
+    def profanity_words(self):
+        """Get the profanity words for the language (read from disk on first access, then cached)."""
+        if not hasattr(self, "_profanity_words"):
+            self._profanity_words = self._read_words(self.words_filepath)
+
+        return self._profanity_words
+
+    @property
+    def _profanity_word_set(self):
+        """Get `profanity_words` as a cached frozenset so each membership test is O(1)."""
+        if not hasattr(self, "_profanity_word_set_cache"):
+            self._profanity_word_set_cache = frozenset(self.profanity_words)
+
+        return self._profanity_word_set_cache
+
+    def _check(self, text):
+        """
+        Check the text for profanity.
+
+        Words are the maximal runs of non-whitespace characters, so `start` and `end`
+        are real offsets into `text` even when it contains leading whitespace, tabs,
+        newlines or repeated spaces.
+        """
+        import re
+
+        banned_words = self._profanity_word_set
+        profanity_infos = []
+
+        for position, match in enumerate(re.finditer(r"\S+", text), start=1):
+            word = match.group()
+            if word.lower() in banned_words:
+                profanity_infos.append(
+                    {
+                        "word": word,
+                        "index": position,  # 1-based position of the word within the text
+                        "start": match.start(),
+                        "end": match.end(),  # exclusive
+                    }
+                )
+
+        return profanity_infos
+
+    def _read_words(self, filepath):
+        """Read the profanity words from the given file, one word per line."""
+        with open(filepath, encoding="utf8") as f:
+            profanity_words = f.read().splitlines()
+
+        return profanity_words
+
+    def _preprocess(self, text):
+        """Preprocess the text before checking for profanity (the base class returns it unchanged)."""
+        return text
+
+    def check(self, text):
+        """
+        Check the text for profanity.
+
+        Args:
+            text (str): The text to check for profanity.
+
+        Returns:
+            list: A list of profanity infos. Each profanity info is a dict with the following keys:
+                - word: The profanity word as it appears in the text.
+                - index: The 1-based position of the profanity word among the words of the text.
+                - start: The start index of the profanity word in the text.
+                - end: The end index (exclusive) of the profanity word in the text.
+        """
+        return self._check(self._preprocess(text))
+
+    def censor(self, text):
+        """
+        Censor the text.
+
+        Args:
+            text (str): The text to censor.
+
+        Returns:
+            str: The text with every detected profanity word replaced by "***".
+        """
+        detected_profanities = sorted(self.check(text), key=lambda info: info["start"])
+
+        # Rebuild the text from the untouched spans between matches. Substituting by
+        # offset (rather than with str.replace) leaves identical substrings inside
+        # clean words alone and keeps every offset valid while the text is rebuilt.
+        pieces = []
+        cursor = 0
+        for profanity in detected_profanities:
+            pieces.append(text[cursor : profanity["start"]])
+            pieces.append("***")
+            cursor = profanity["end"]
+        pieces.append(text[cursor:])
+
+        return "".join(pieces)
diff --git a/safetext/languages/__init__.py b/safetext/languages/__init__.py
diff --git a/safetext/languages/base.py b/safetext/languages/base.py
diff --git a/safetext/languages/de/__init__.py b/safetext/languages/de/__init__.py
diff --git a/safetext/languages/en/__init__.py b/safetext/languages/en/__init__.py
diff --git a/safetext/languages/es/__init__.py b/safetext/languages/es/__init__.py
diff --git a/safetext/languages/pt/__init__.py b/safetext/languages/pt/__init__.py
diff --git a/safetext/languages/tr/__init__.py b/safetext/languages/tr/__init__.py
diff --git a/safetext/utils.py b/safetext/utils.py
@@ -1,29 +1,62 @@
+import os
+from typing import List
+
+import pysrt
+from lingua import Language, LanguageDetector, LanguageDetectorBuilder
 
-LANGUAGE_TO_CODE = {
-    Language.ENGLISH: "en",
-    Language.TURKISH: "tr",
-    Language.GERMAN: "de",
-    Language.FRENCH: "fr",
-    Language.SPANISH: "es",
-}
-LANGUAGES = [Language.ENGLISH, Language.TURKISH, Language.GERMAN, Language.FRENCH, Language.SPANISH]
-DETECTOR = LanguageDetectorBuilder.from_languages(*LANGUAGES).build()
+
+def available_languages() -> List[Language]:
+    """
+    Lists the languages that have a matching sub-directory in the 'languages' directory.
+
+    A language is included when its lower-cased ISO 639-1 code equals the name of a
+    directory found in 'languages' next to this module.
+
+    Returns:
+        List[Language]: The matching Language enum values, in enum iteration order.
+    """
+    languages_root = os.path.join(os.path.dirname(__file__), "languages")
+
+    # Only directories count as language folders; plain files are ignored.
+    lang_dir_names = [
+        entry for entry in os.listdir(languages_root) if os.path.isdir(os.path.join(languages_root, entry))
+    ]
+
+    return [language for language in Language if language.iso_code_639_1.name.lower() in lang_dir_names]
+
+
+def initialize_detector() -> LanguageDetector:
+    """
+    Dynamically initializes the language detector based on the available languages.
+
+    Returns:
+        LanguageDetector: The detector built for the languages returned by `available_languages()`.
+    """
+    return LanguageDetectorBuilder.from_languages(*available_languages()).build()
 
 
 def detect_language_from_text(text: str) -> str:
     """
-    Detects the language of the given text.
+    Detects the language of the given text using the dynamically initialized language detector.
 
     Args:
         text (str): The text to detect the language of.
 
     Returns:
-        str: The language code of the detected language.
-            (e.g. "en", "tr")
+        str: The lower-cased ISO 639-1 language code of the detected language.
+
+    Raises:
+        ValueError: If the detector cannot determine a language for the text.
     """
-    result = DETECTOR.detect_language_of(text)
-    return LANGUAGE_TO_CODE[result]
+    detector = initialize_detector()
+    detected_language = detector.detect_language_of(text)
+    if detected_language is None:
+        # The detector reports no language (e.g. for empty text); fail with a clear
+        # message instead of an AttributeError on None.
+        raise ValueError("Could not detect the language of the given text.")
+    return detected_language.iso_code_639_1.name.lower()
 
 
 def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str:
@@ -38,8 +71,6 @@ def detect_language_from_srt(srt_file: str, use_first_n_subs: 10) -> str:
         str: The language code of the detected language.
             (e.g. "en", "tr")
     """
-    import pysrt
-
     subs = pysrt.open(srt_file, encoding="utf-8")
     text = " ".join([sub.text_without_tags.replace("\n", " ") for sub in subs[:use_first_n_subs]])