Skip to content

Commit

Permalink
Letter freq
Browse files Browse the repository at this point in the history
  • Loading branch information
AvaAvarai committed May 17, 2024
1 parent ac2cf1c commit c2de80c
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 6 deletions.
60 changes: 60 additions & 0 deletions ML_Scripts/MissingGlyphFinder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import PyPDF2
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.ft2font import FT2Font
import re

# Extract text from the PDF
def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in the PDF at *file_path*.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        A single string made of each page's ``extract_text()`` result, in
        page order, with no separator inserted between pages.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string with ``+=`` for each page.
        # ``or ""`` is defensive: should a page yield None instead of a
        # string, it contributes nothing rather than raising TypeError.
        # The join runs inside the ``with`` so the file is still open while
        # the pages are being read.
        return "".join(page.extract_text() or "" for page in reader.pages)

# Find unique characters in the text
def find_unique_characters(text):
    """Count how often each non-whitespace character occurs in *text*.

    Only whitespace is discarded; punctuation and every other character are
    counted.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.Counter`` mapping each remaining character to its
        number of occurrences.
    """
    # Strip all whitespace (spaces, tabs, newlines, ...) before counting.
    return Counter(re.sub(r"\s+", "", text))

# List missing glyphs
def list_missing_glyphs(unique_chars, font_paths):
    """Return the characters for which none of the given fonts has a glyph.

    Args:
        unique_chars: Iterable of single-character strings (iterating a
            ``Counter`` or dict yields its keys).
        font_paths: Font file paths, tried in order for each character.

    Returns:
        A list, in iteration order of *unique_chars*, of the characters for
        which ``get_char_index`` returned a falsy value in every font.
    """
    # Each font file is parsed at most once and only when first needed, so a
    # font that is never reached is never opened.
    loaded_fonts = {}

    def has_glyph(font_path, char):
        font = loaded_fonts.get(font_path)
        if font is None:
            font = loaded_fonts[font_path] = FT2Font(font_path)
        # A falsy glyph index is treated as "no glyph in this font".
        return bool(font.get_char_index(ord(char)))

    return [
        char
        for char in unique_chars
        # any() stops at the first font that provides the glyph.
        if not any(has_glyph(font_path, char) for font_path in font_paths)
    ]

# Paths to the input files, relative to the current working directory.
# Forward slashes are used because they are accepted on Windows as well as on
# POSIX systems, whereas a backslash is a path separator only on Windows.
pdf_path = "Related_Papers/DCVis_v41.pdf"
font_paths = [
    "fonts/NotoSans-Regular.ttf",  # Path to Noto Sans font
    "fonts/NotoSansOriya-Regular.ttf",  # Path to Noto Sans Oriya font
    "fonts/NotoSansSymbols-Regular.ttf",  # Path to Noto Sans Symbols font
    "fonts/NotoSansCJK-Regular.ttc",  # Path to Noto Sans CJK font
    "fonts/NotoSansMath-Regular.ttf",  # Path to Noto Sans Math font
]

# Extract text from PDF
text = extract_text_from_pdf(pdf_path)
# Count the non-whitespace characters found in the text
unique_chars = find_unique_characters(text)
print("Unique Characters:", unique_chars)

# List the characters for which none of the fonts provides a glyph
missing_glyphs = list_missing_glyphs(unique_chars, font_paths)
print("Missing Glyphs:", missing_glyphs)
27 changes: 21 additions & 6 deletions ML_Scripts/visWord.py → ML_Scripts/VisPDFWord.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import PyPDF2
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib

def load_pdf_and_analyze(file_path):
# Open the PDF file
Expand All @@ -23,11 +25,25 @@ def load_pdf_and_analyze(file_path):
sorted_chars = sorted(char_counter.items())
labels, values = zip(*sorted_chars)

# Set font properties to Noto Sans, Noto Sans Oriya, Noto Sans Symbols, and Noto Sans CJK
font_paths = [
r"fonts\NotoSans-Regular.ttf", # Path to Noto Sans font
r"fonts\NotoSansOriya-Regular.ttf", # Path to Noto Sans Oriya font
r"fonts\NotoSansSymbols-Regular.ttf", # Path to Noto Sans Symbols font
r"fonts\NotoSansCJK-Regular.ttc", # Path to Noto Sans CJK font
r"fonts\NotoSansMath-Regular.ttf" # Path to Noto Sans Math font
]

# NOTE: each set_file() call replaces the previously set font file, so prop ends up using only the last path in font_paths (no fallback chain is built)
prop = fm.FontProperties(fname=font_paths[0])
for font_path in font_paths[1:]:
prop.set_file(font_path)

plt.figure(figsize=(10, 5))
plt.bar(labels, values, color='b')
plt.xlabel('Characters')
plt.ylabel('Frequency')
plt.title('Character Frequency in PDF (Alphabetically Ordered)')
plt.xlabel('Characters', fontproperties=prop)
plt.ylabel('Frequency', fontproperties=prop)
plt.title('Character Frequency in PDF (Alphabetically Ordered)', fontproperties=prop)
plt.show()

# Print the top 10 words
Expand All @@ -36,7 +52,6 @@ def load_pdf_and_analyze(file_path):
for word, count in top_words:
print(f"{word}: {count}")


# Example usage
file_path = r"ML_Scripts\DCVis_v41.pdf" # Change this to your PDF file path
# Load the PDF file and analyze it
file_path = r"Related_Papers/DCVis_v41.pdf" # Change this to your PDF file path
load_pdf_and_analyze(file_path)
Binary file added fonts/NotoSans-Regular.ttf
Binary file not shown.
Binary file added fonts/NotoSansCJK-Regular.ttc
Binary file not shown.
Binary file added fonts/NotoSansMath-Regular.ttf
Binary file not shown.
Binary file added fonts/NotoSansOriya-Regular.ttf
Binary file not shown.
Binary file added fonts/NotoSansSymbols-Regular.ttf
Binary file not shown.

0 comments on commit c2de80c

Please sign in to comment.