Letter freq

CWU-VKD-LAB · May 17, 2024 · c2de80c · c2de80c
1 parent ac2cf1c
commit c2de80c
Show file tree

Hide file tree

Showing 7 changed files with 81 additions and 6 deletions.
diff --git a/ML_Scripts/MissingGlyphFinder.py b/ML_Scripts/MissingGlyphFinder.py
@@ -0,0 +1,60 @@
+import PyPDF2
+from collections import Counter
+import matplotlib.pyplot as plt
+import matplotlib.font_manager as fm
+from matplotlib.ft2font import FT2Font
+import re
+
+# Extract text from the PDF
+def extract_text_from_pdf(file_path):
+    """Return the text of every page of a PDF, concatenated in page order.
+
+    Args:
+        file_path: Path of the PDF file; it is opened in binary mode.
+
+    Returns:
+        One string holding each page's extracted text back to back, with no
+        separator inserted between pages.
+    """
+    with open(file_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        # Extraction stays inside the `with` block because the reader is
+        # built on the open file handle.
+        # `or ""` keeps the join safe if a page yields no text at all.
+        return "".join(page.extract_text() or "" for page in reader.pages)
+
+# Find unique characters in the text
+def find_unique_characters(text):
+    """Count how often each non-whitespace character occurs in *text*.
+
+    Only whitespace is stripped before counting; punctuation and every other
+    character are kept.
+
+    Returns:
+        A ``Counter`` mapping each remaining character to its number of
+        occurrences.
+    """
+    # Splitting on every single whitespace character and re-joining the
+    # pieces drops all whitespace while leaving the other characters in order.
+    fragments = re.split(r'[\s]', text)
+    return Counter("".join(fragments))
+
+# List missing glyphs
+def list_missing_glyphs(unique_chars, font_paths):
+    """Return the characters that none of the given fonts has a glyph for.
+
+    Args:
+        unique_chars: Iterable of single-character strings to check.
+        font_paths: Paths of the font files to search, tried in order.
+
+    Returns:
+        A list of the characters, in iteration order, for which every font
+        reported glyph index 0 from ``get_char_index``.
+    """
+    # Each font file is parsed at most once and only when first needed, so a
+    # font that is never consulted is never opened.
+    loaded_fonts = {}
+
+    def _font(path):
+        if path not in loaded_fonts:
+            loaded_fonts[path] = FT2Font(path)
+        return loaded_fonts[path]
+
+    # any() stops at the first font that has the glyph, so later fonts are
+    # only consulted for characters the earlier ones lack.
+    return [
+        char
+        for char in unique_chars
+        if not any(_font(path).get_char_index(ord(char)) for path in font_paths)
+    ]
+
+# Paths to the files, relative to the current working directory.
+# Forward slashes are used because they resolve on Windows as well as on
+# POSIX systems, whereas a backslash is only a path separator on Windows.
+pdf_path = "Related_Papers/DCVis_v41.pdf"
+font_paths = [
+    "fonts/NotoSans-Regular.ttf",  # Path to Noto Sans font
+    "fonts/NotoSansOriya-Regular.ttf",  # Path to Noto Sans Oriya font
+    "fonts/NotoSansSymbols-Regular.ttf",  # Path to Noto Sans Symbols font
+    "fonts/NotoSansCJK-Regular.ttc",  # Path to Noto Sans CJK font
+    "fonts/NotoSansMath-Regular.ttf",  # Path to Noto Sans Math font
+]
+
+# Extract text from PDF
+text = extract_text_from_pdf(pdf_path)
+# Count every non-whitespace character
+unique_chars = find_unique_characters(text)
+print("Unique Characters:", unique_chars)
+
+# List the characters that none of the fonts has a glyph for
+missing_glyphs = list_missing_glyphs(unique_chars, font_paths)
+print("Missing Glyphs:", missing_glyphs)
diff --git a/ML_Scripts/visWord.py → ML_Scripts/VisPDFWord.py b/ML_Scripts/visWord.py → ML_Scripts/VisPDFWord.py
@@ -1,6 +1,8 @@
 import PyPDF2
 from collections import Counter
 import matplotlib.pyplot as plt
+import matplotlib.font_manager as fm
+import matplotlib
 
 def load_pdf_and_analyze(file_path):
     # Open the PDF file
@@ -23,11 +25,25 @@ def load_pdf_and_analyze(file_path):
     sorted_chars = sorted(char_counter.items())
     labels, values = zip(*sorted_chars)
 
+    # Set font properties to Noto Sans, Noto Sans Oriya, Noto Sans Symbols, and Noto Sans CJK
+    font_paths = [
+        r"fonts\NotoSans-Regular.ttf",  # Path to Noto Sans font
+        r"fonts\NotoSansOriya-Regular.ttf",  # Path to Noto Sans Oriya font
+        r"fonts\NotoSansSymbols-Regular.ttf",  # Path to Noto Sans Symbols font
+        r"fonts\NotoSansCJK-Regular.ttc",  # Path to Noto Sans CJK font
+        r"fonts\NotoSansMath-Regular.ttf"  # Path to Noto Sans Math font
+    ]
+
+    # NOTE: set_file() replaces the font file instead of adding to it, so after
+    # this loop prop refers only to the last path (Noto Sans Math), not to all fonts
+    prop = fm.FontProperties(fname=font_paths[0])
+    for font_path in font_paths[1:]:
+        prop.set_file(font_path)
+
     plt.figure(figsize=(10, 5))
     plt.bar(labels, values, color='b')
-    plt.xlabel('Characters')
-    plt.ylabel('Frequency')
-    plt.title('Character Frequency in PDF (Alphabetically Ordered)')
+    plt.xlabel('Characters', fontproperties=prop)
+    plt.ylabel('Frequency', fontproperties=prop)
+    plt.title('Character Frequency in PDF (Alphabetically Ordered)', fontproperties=prop)
     plt.show()
 
     # Print the top 10 words
@@ -36,7 +52,6 @@ def load_pdf_and_analyze(file_path):
     for word, count in top_words:
         print(f"{word}: {count}")
 
-
-# Example usage
-file_path = r"ML_Scripts\DCVis_v41.pdf"  # Change this to your PDF file path
+# Load the PDF file and analyze it
+file_path = r"Related_Papers/DCVis_v41.pdf"  # Change this to your PDF file path
 load_pdf_and_analyze(file_path)
diff --git a/fonts/NotoSans-Regular.ttf b/fonts/NotoSans-Regular.ttf
diff --git a/fonts/NotoSansCJK-Regular.ttc b/fonts/NotoSansCJK-Regular.ttc
diff --git a/fonts/NotoSansMath-Regular.ttf b/fonts/NotoSansMath-Regular.ttf
diff --git a/fonts/NotoSansOriya-Regular.ttf b/fonts/NotoSansOriya-Regular.ttf
diff --git a/fonts/NotoSansSymbols-Regular.ttf b/fonts/NotoSansSymbols-Regular.ttf