fix sme bugs
zaidalyafeai committed Aug 27, 2020
1 parent 3f27237 commit 7495fcd
Showing 2 changed files with 20 additions and 38 deletions.
setup.py: 2 changes (1 addition & 1 deletion)
@@ -8,7 +8,7 @@
readme = readme_file.read()

setup(name='tkseem',
-      version='0.0.1',
+      version='0.0.2',
url='https://github.com/MagedSaeed/tkseem',
discription="Arabic Tokenization Library",
long_description=readme,
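As a side note (not part of this diff), the version bump means a fresh install should report 0.0.2. A minimal check, assuming the package is installed from this release and Python 3.8+ is available:

    # Hypothetical check, not from the repository: read the installed tkseem
    # version from the package metadata.
    import importlib.metadata

    print(importlib.metadata.version("tkseem"))  # expected: "0.0.2" for this release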
tkseem/_base.py: 56 changes (19 additions & 37 deletions)
@@ -1,17 +1,13 @@
import mmap
import os
import sys
import mmap
import pickle
import itertools
import sys
from collections import Counter, defaultdict
from pathlib import Path

import numpy as np
from farasa.segmenter import FarasaSegmenter
from tqdm import tqdm

from pathlib import Path
from .util import split_on_binary

from collections import Counter, defaultdict

class BaseTokenizer:
"""
@@ -24,15 +20,15 @@ def __init__(
"""Constructor
Args:
-            unk_token (str, optional): reserved token for unknowns. Defaults to "<UNK>".
-            pad_token (str, optional): reserved token for padding. Defaults to "<PAD>".
-            max_tokens (int, optional): max number of vocabulary. Defaults to 10000.
+            unk_token (str, optional): unkown symbol. Defaults to "<UNK>".
+            pad_token (str, optional): pad symbol. Defaults to "<PAD>".
+            vocab_size (int, optional): max vocab size. Defaults to 10000.
special_tokens (list, optional): user defined special tokens. Defaults to [].
"""
self.vocab_size = vocab_size
self.unk_token = unk_token
self.pad_token = pad_token
self.special_tokens = special_tokens

self.rel_path = os.path.dirname(__file__)
cach_dict_path = os.path.join(self.rel_path, "dictionaries/cached.pl")
self.cached = pickle.load(open(cach_dict_path, "rb"))
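For illustration only (not part of the commit), the constructor arguments documented above are what a caller would pass when building a tokenizer; WordTokenizer is assumed here to be one of the BaseTokenizer subclasses the package exports, and the values simply restate the documented defaults:

    # Sketch: construct a tokenizer with the documented constructor arguments.
    import tkseem as tk

    tokenizer = tk.WordTokenizer(
        unk_token="<UNK>",    # reserved symbol for out-of-vocabulary tokens
        pad_token="<PAD>",    # reserved symbol for padding
        vocab_size=10000,     # upper bound on the vocabulary size
        special_tokens=[],    # user-defined special tokens, if any
    )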
@@ -68,17 +64,6 @@ def _get_tokens_frequency_quickly(self, file_path):
pbar.update(1)
return freq

-    def _write_data(self, path, data):
-        """
-        Write the string data to a path
-        Args:
-            file_path (str): the directory of the data to read
-        """
-        # TOCHECK: I think this code will break if the path does not exist.
-        open(path, "w").write(data)

def _get_tokens_frequency(self, file_path):
"""
Get tokens frequency using a dictionary
@@ -292,21 +277,26 @@ def token_to_id(self, piece):
return list(self.vocab.keys()).index(piece)

def id_to_token(self, id):
""" Get tokens list
"""convert id to token
Args:
id (int): input id
Returns:
-            list: tokens
+            str: token
"""
return list(self.vocab.keys())[id]

def tokenize(self, text, use_cache = False, max_cache_size = 1000):
"""Tokenize using the frequency dictionary
"""tokenize
Args:
text (str): input string
text (str): input text
use_cache (bool, optional): speed up using caching. Defaults to False.
max_cache_size (int, optional): max cacne size. Defaults to 1000.
Returns:
list: generated tokens
list: output list of tokens
"""
output_tokens = self._tokenize_from_dict(text, self.vocab, use_cache, max_cache_size = max_cache_size)
return output_tokens
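A minimal round-trip sketch (not in the diff) of the three methods touched above; it assumes `tokenizer` is an already-trained tokenizer instance and the sample text is arbitrary:

    # Sketch: tokenize, map tokens to ids, then map the ids back to tokens.
    tokens = tokenizer.tokenize("السلام عليكم", use_cache=True, max_cache_size=1000)
    ids = [tokenizer.token_to_id(t) for t in tokens]
    restored = [tokenizer.id_to_token(i) for i in ids]
    assert restored == tokens  # id_to_token inverts token_to_id for in-vocabulary tokens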
@@ -348,21 +338,13 @@ def encode(self, text):
encoded = [self.token_to_id(token) for token in tokens]
return encoded

-    def encode_and_save(self):
-        """
-        Encode all the files then save as numpy
-        """
-        Path("data/encoded").mkdir(parents=True, exist_ok=True)
-        for file_path in os.listdir("data/raw/"):
-            ids = self.encode(open(f"data/raw/{file_path}", "r").read())
-            np.save(f"data/encoded/{file_path[:-4]}.npy", ids)

def encode_sentences(self, sentences, boundries=("", ""), out_length=None):
"""
Encode a list of sentences using the trained model
Args:
sentences (list): list of sentences
boundries (tuple): boundries for each sentence.
out_length (int, optional): specify the max length of encodings. Defaults to 100.
Returns:
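Finally, a usage sketch (not in the diff) for encode_sentences as documented above; the sentences, boundary tokens, and out_length value are arbitrary examples, and the return value is omitted because the docstring is truncated in this view:

    # Sketch: batch-encode two sentences with boundary tokens and a fixed length.
    batch = tokenizer.encode_sentences(
        ["السلام عليكم", "كيف الحال"],
        boundries=("<s>", "</s>"),  # parameter name as spelled in the source
        out_length=16,
    )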

