
Commit

Have one instance of the tokenizer and encoder
Instead of instantiating both of those models in different classes,
there will now be only one model object (bert), defined in bert.py.
Pligabue committed Oct 26, 2023
1 parent b6a8751 commit 0b8abfc
Showing 7 changed files with 27 additions and 33 deletions.
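In outline, the change looks like the sketch below: instead of each class loading its own copy of the tokenizer and encoder, a single module-level object is built once and imported everywhere. This is a condensed illustration only; the stand-in class name is hypothetical, while Bert, bert and the checkpoint name are the ones introduced in bert.py further down.

# Pattern being removed: each class loads its own copy, so constructing a
# formatter and a model loads the same checkpoint more than once.
from transformers import AutoTokenizer, TFAutoModel


class PerClassLoader:  # hypothetical stand-in for the old per-class loading
    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
        self.bert = TFAutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")


# Replacement introduced in bert.py: one object, built at import time. Python
# caches imported modules, so every `from ..bert import bert` sees the same
# instance and the checkpoint is loaded once per process.
class Bert:
    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
        self.encoder = TFAutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased").bert


bert = Bert()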
@@ -1,11 +1,10 @@
import itertools
import re
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel, TFBertModel

from typing import Union

from ..predicate_extraction.types import SentenceInput, SentenceInputs, PredicateMasks
from ..bert import bert
from ..constants import (MAX_SENTENCE_SIZE, OBJECT_PATTERN, PREDICATE_PATTERN,
                         SPECIAL_TOKEN_IDS, SUBJECT_PATTERN)

@@ -16,10 +15,6 @@


class DataFormatter():
    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
        self.bert: TFBertModel = TFAutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")

    #################
    # INPUT SECTION #
    #################
@@ -35,7 +30,7 @@ def split_on(self, sentence, pattern) -> list[str]:

    def get_span(self, sentence, pattern) -> Span:
        split_sentence = self.split_on(sentence, pattern)
        token_sets: list = self.tokenizer(split_sentence, add_special_tokens=False)["input_ids"] # type: ignore
        token_sets: list = bert.tokenizer(split_sentence, add_special_tokens=False)["input_ids"] # type: ignore
        start = len(token_sets[0]) + 1 # 1 is added to account for the [CLS] token that is added later
        end = start + len(token_sets[1])
        return (start, end)
@@ -46,8 +41,8 @@ def span_to_mask(self, span):

    def tokenize(self, sen: Union[list[str], str]):
        if isinstance(sen, list):
            return self.tokenizer(sen, padding="max_length", max_length=MAX_SENTENCE_SIZE)["input_ids"]
        return self.tokenizer.encode(sen, padding="max_length", max_length=MAX_SENTENCE_SIZE)
            return bert.tokenizer(sen, padding="max_length", max_length=MAX_SENTENCE_SIZE)["input_ids"]
        return bert.tokenizer.encode(sen, padding="max_length", max_length=MAX_SENTENCE_SIZE)

    def token_to_tag(self, token, index, subject_span: Span, object_span: Span):
        if index in range(*subject_span):
8 changes: 2 additions & 6 deletions triple_extractor_ptbr_pligabue/argument_prediction/model.py
@@ -1,10 +1,10 @@
import tensorflow as tf
from transformers.models.auto.modeling_tf_auto import TFAutoModel
import math

from typing import cast, Optional

from ..constants import MAX_SENTENCE_SIZE, ARGUMENT_PREDICTION_MODEL_DIR, DEFAULT_MODEL_NAME
from ..bert import bert
from ..predicate_extraction.types import ArgPredInputs

from .constants import N_HEADS, ACCEPTANCE_THRESHOLD
@@ -14,8 +14,6 @@

class ArgumentPredictor(DataFormatter):
    def __init__(self, *layer_units: int, name: Optional[str] = None) -> None:
        super().__init__()

        if name:
            self._load_model(name)
        else:
@@ -37,12 +35,10 @@ def _config_model(self, *layer_units: int):
        lstm_units = layer_units[0]
        dense_layer_units = layer_units[1:]

        bert = TFAutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased").bert

        token_ids = tf.keras.layers.Input(MAX_SENTENCE_SIZE, dtype="int32")
        mask = tf.keras.layers.Input(MAX_SENTENCE_SIZE, dtype="bool")

        base_embeddings = bert(token_ids)["last_hidden_state"]
        base_embeddings = bert.encoder(token_ids)["last_hidden_state"] # type: ignore
        predicate_embeddings = tf.ragged.boolean_mask(base_embeddings, mask)
        mean_pred_embedding = tf.math.reduce_mean(predicate_embeddings, axis=1)
        mean_pred_as_matrix = tf.expand_dims(mean_pred_embedding, axis=1)
10 changes: 10 additions & 0 deletions triple_extractor_ptbr_pligabue/bert.py
@@ -0,0 +1,10 @@
from transformers import AutoTokenizer, TFAutoModel, TFBertModel


class Bert:
    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
        self.encoder: TFBertModel = TFAutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased").bert


bert = Bert()
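A minimal usage sketch of the shared instance (a sketch, not repository code): it assumes the package is importable under the name shown in the file paths of this diff, and MAX_SENTENCE_SIZE here is a stand-in for the value defined in the package's constants module.

import tensorflow as tf

from triple_extractor_ptbr_pligabue.bert import bert  # the shared instance defined above

MAX_SENTENCE_SIZE = 128  # stand-in for the constant imported from ..constants in the diff

# Fixed-length token ids, as the data formatters produce them.
token_ids = bert.tokenizer.encode("O gato comeu o peixe.",
                                  padding="max_length", max_length=MAX_SENTENCE_SIZE)

# Contextual embeddings, as the Keras models consume them.
embeddings = bert.encoder(tf.constant([token_ids]))["last_hidden_state"]
print(embeddings.shape)  # (1, MAX_SENTENCE_SIZE, hidden_size)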
9 changes: 4 additions & 5 deletions triple_extractor_ptbr_pligabue/data_formatter.py
@@ -5,22 +5,21 @@

from typing import Union

from .bert import bert

from .predicate_extraction.types import SentenceId, SentenceInput, PredicateMask
from .argument_prediction.types import ArgPredOutput, ArgPredOutputs, SubjectMask, ObjectMask


class DataFormatter():
    COLUMNS = ['confidence', 'subject', 'relation', 'object', 'subject_id', 'object_id']

    def __init__(self, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]) -> None:
        self.tokenizer = tokenizer

    def build_annotation(self, sentence_id: SentenceId, tokens: SentenceInput, pred_mask: PredicateMask,
                         subj_mask: SubjectMask, obj_mask: ObjectMask):
        pred_tokens = [t for t, mask_value in zip(tokens, pred_mask) if mask_value]
        subj_tokens = [t for t, mask_value in zip(tokens, subj_mask) if mask_value]
        obj_tokens = [t for t, mask_value in zip(tokens, obj_mask) if mask_value]
        sentence, pred, subj, obj = self.tokenizer.batch_decode(
        sentence, pred, subj, obj = bert.tokenizer.batch_decode(
            [tokens, pred_tokens, subj_tokens, obj_tokens],
            skip_special_tokens=True
        )
@@ -56,7 +55,7 @@ def build_row(self, output: ArgPredOutput, id_prefix=''):
        pred_tokens = [t for t, mask_value in zip(tokens, pred_mask) if mask_value]
        subj_tokens = [t for t, mask_value in zip(tokens, subj_mask) if mask_value]
        obj_tokens = [t for t, mask_value in zip(tokens, obj_mask) if mask_value]
        pred, subj, obj = self.tokenizer.batch_decode(
        pred, subj, obj = bert.tokenizer.batch_decode(
            [pred_tokens, subj_tokens, obj_tokens],
            skip_special_tokens=True
        )
2 changes: 0 additions & 2 deletions triple_extractor_ptbr_pligabue/model.py
@@ -22,8 +22,6 @@ def __init__(self, pe_layers_or_name: Union[tuple[int], str], ap_layers_or_name:
        else:
            self.argument_predictor = ArgumentPredictor(*ap_layers_or_name)

        super().__init__(self.predicate_extractor.tokenizer)

    @classmethod
    def load(cls, name: str = DEFAULT_MODEL_NAME):
        return cls(name, name)
@@ -6,6 +6,7 @@

from typing import Union

from ..bert import bert
from ..constants import (MAX_SENTENCE_SIZE, OBJECT_PATTERN, PREDICATE_PATTERN,
                         SPECIAL_TOKEN_IDS, SUBJECT_PATTERN)

@@ -15,9 +16,6 @@


class DataFormatter():
    def __init__(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

    #################
    # INPUT SECTION #
    #################
@@ -30,7 +28,7 @@ def split_on_predicate(self, sentence) -> list[str]:
        return trimmed_split

    def format_input(self, sentence: str) -> SentenceInput:
        return self.tokenizer.encode(sentence, padding="max_length", max_length=MAX_SENTENCE_SIZE)
        return bert.tokenizer.encode(sentence, padding="max_length", max_length=MAX_SENTENCE_SIZE)

    def format_inputs(self, sentences: list[str]) -> SentenceInputs:
        return [self.format_input(sentence) for sentence in sentences]
@@ -56,7 +54,7 @@ def add_training_sentence_data(self, sentence: str, sentence_map: SentenceMap, m
        else:
            input_tokens = sentence_map[key]["input"]

        token_sets = [self.tokenizer.encode(chunk, add_special_tokens=False) for chunk in split_sentence]
        token_sets = [bert.tokenizer.encode(chunk, add_special_tokens=False) for chunk in split_sentence]
        predicate_start = len(token_sets[0]) + 1 # 1 is added to account for the [CLS] token that is added later
        predicate_end = predicate_start + len(token_sets[1])

@@ -120,8 +118,8 @@ def print_elements(self, tokens: list[str], tags: list[BIO], scores: Union[list[
        print()

    def print_annotated_sentence(self, sentence: str, sentence_output: tf.Tensor, o_threshold=0.0, show_scores=False):
        token_ids = self.tokenizer.encode(sentence)
        tokens: list[str] = self.tokenizer.convert_ids_to_tokens(token_ids) # type: ignore
        token_ids = bert.tokenizer.encode(sentence)
        tokens: list[str] = bert.tokenizer.convert_ids_to_tokens(token_ids) # type: ignore
        formatted_sentence_output = self.format_output(sentence_output)

        tags = []
6 changes: 2 additions & 4 deletions triple_extractor_ptbr_pligabue/predicate_extraction/model.py
@@ -1,10 +1,10 @@
import math
import tensorflow as tf
from transformers.models.auto.modeling_tf_auto import TFAutoModel

from typing import cast, Optional

from ..constants import MAX_SENTENCE_SIZE, PREDICATE_EXTRACTION_MODEL_DIR, DEFAULT_MODEL_NAME
from ..bert import bert
from .constants import ACCEPTANCE_THRESHOLD, O_THRESHOLD
from .data_formatter import DataFormatter

@@ -37,10 +37,8 @@ def _load_model(self, name: str):
            raise Exception(f"Model {str} does not exist.")

    def _config_model(self, *dense_layer_units: int):
        bert = TFAutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased").bert

        token_ids = tf.keras.layers.Input(MAX_SENTENCE_SIZE, dtype="int32")
        embeddings = bert(token_ids)["last_hidden_state"]
        embeddings = bert.encoder(token_ids)["last_hidden_state"] # type: ignore

        dense_layers = embeddings
        for layer_units in dense_layer_units: