Add MT tutorial
ddaspit committed Oct 16, 2024
1 parent b7c06c8 commit 764ad87
Showing 12 changed files with 1,591 additions and 80 deletions.
17 changes: 4 additions & 13 deletions .vscode/settings.json
@@ -1,24 +1,15 @@
{
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "explicit",
"source.organizeImports": "explicit"
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.analysis.extraPaths": [
"tests"
],
"python.analysis.extraPaths": ["tests"],
"python.analysis.importFormat": "relative",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"black-formatter.path": [
"poetry",
"run",
"black"
],
"python.analysis.extraPaths": [
"./tests"
]
}
"black-formatter.path": ["poetry", "run", "black"]
}
1 change: 1 addition & 0 deletions README.md
@@ -17,3 +17,4 @@ If you would like to find out more about how to use Machine, check out the tutor
- [Tokenization](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/tokenization.ipynb)
- [Text Corpora](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/corpora.ipynb)
- [Word Alignment](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/word_alignment.ipynb)
- [Machine Translation](https://githubtocolab.com/sillsdev/machine.py/blob/main/samples/machine_translation.ipynb)
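
For a quick taste of what the new machine translation notebook covers, here is a minimal sketch of translating one sentence with a pretrained NLLB checkpoint through `HuggingFaceNmtEngine`. The checkpoint name, FLORES-200 language codes, and example sentence are illustrative assumptions rather than lines taken from the notebook; the notebook itself walks through the full workflow.

```python
# A minimal sketch, assuming the HuggingFaceNmtEngine API and an NLLB
# checkpoint; the model ID and language codes below are illustrative only.
from machine.translation.huggingface import HuggingFaceNmtEngine

engine = HuggingFaceNmtEngine(
    "facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="fra_Latn",
)
result = engine.translate("I would like to translate this sentence.")
print(result.translation)
```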
14 changes: 13 additions & 1 deletion machine/jobs/huggingface/hugging_face_nmt_model_factory.py
@@ -3,6 +3,8 @@
from pathlib import Path
from typing import Any, cast

import datasets.utils.logging as datasets_logging
import transformers.utils.logging as transformers_logging
from transformers import AutoConfig, AutoModelForSeq2SeqLM, HfArgumentParser, PreTrainedModel, Seq2SeqTrainingArguments
from transformers.integrations import ClearMLCallback
from transformers.tokenization_utils import TruncationStrategy
@@ -39,6 +41,16 @@ def __init__(self, config: Any) -> None:
):
self._training_args.report_to.remove("clearml")

# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers_logging.set_verbosity_info()

log_level = self._training_args.get_process_log_level()
logger.setLevel(log_level)
datasets_logging.set_verbosity(log_level)
transformers_logging.set_verbosity(log_level)
transformers_logging.enable_default_handler()
transformers_logging.enable_explicit_format()

@property
def train_tokenizer(self) -> bool:
return False
@@ -67,7 +79,7 @@ def create_model_trainer(self, corpus: ParallelTextCorpus) -> Trainer:
src_lang=self._config.src_lang,
tgt_lang=self._config.trg_lang,
add_unk_src_tokens=self._config.huggingface.tokenizer.add_unk_src_tokens,
add_unk_trg_tokens=self._config.huggingface.tokenizer.add_unk_trg_tokens,
add_unk_tgt_tokens=self._config.huggingface.tokenizer.add_unk_tgt_tokens,
)

def create_engine(self) -> TranslationEngine:
2 changes: 1 addition & 1 deletion machine/jobs/settings.yaml
@@ -24,7 +24,7 @@ default:
oom_batch_size_backoff_mult: 0.5
tokenizer:
add_unk_src_tokens: true
add_unk_trg_tokens: true
add_unk_tgt_tokens: true
thot_mt:
word_alignment_model_type: hmm
tokenizer: latin
2 changes: 1 addition & 1 deletion machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -52,7 +52,7 @@ def __init__(
self._tokenizer = AutoTokenizer.from_pretrained(self._model.name_or_path, use_fast=True)
if isinstance(self._tokenizer, (NllbTokenizer, NllbTokenizerFast)):
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
else:
self._mpn = None

47 changes: 17 additions & 30 deletions machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -6,9 +6,7 @@
from pathlib import Path
from typing import Any, Callable, List, Optional, Union, cast

import datasets.utils.logging as datasets_logging
import torch # pyright: ignore[reportMissingImports]
import transformers.utils.logging as transformers_logging
from datasets.arrow_dataset import Dataset
from sacremoses import MosesPunctNormalizer
from torch import Tensor # pyright: ignore[reportMissingImports]
@@ -84,10 +82,10 @@ def __init__(
corpus: Union[ParallelTextCorpus, Dataset],
src_lang: Optional[str] = None,
tgt_lang: Optional[str] = None,
max_source_length: Optional[int] = None,
max_target_length: Optional[int] = None,
max_src_length: Optional[int] = None,
max_tgt_length: Optional[int] = None,
add_unk_src_tokens: bool = False,
add_unk_trg_tokens: bool = True,
add_unk_tgt_tokens: bool = True,
) -> None:
self._model = model
self._training_args = training_args
@@ -96,12 +94,12 @@ def __init__(
self._tgt_lang = tgt_lang
self._trainer: Optional[Seq2SeqTrainer] = None
self._metrics = {}
self.max_source_length = max_source_length
self.max_target_length = max_target_length
self.max_src_length = max_src_length
self.max_tgt_length = max_tgt_length
self._add_unk_src_tokens = add_unk_src_tokens
self._add_unk_trg_tokens = add_unk_trg_tokens
self._add_unk_tgt_tokens = add_unk_tgt_tokens
self._mpn = MosesPunctNormalizer()
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions]
self._mpn.substitutions = [(re.compile(r), sub) for r, sub in self._mpn.substitutions] # type: ignore
self._stats = TrainStats()

@property
@@ -113,17 +111,6 @@ def train(
progress: Optional[Callable[[ProgressStatus], None]] = None,
check_canceled: Optional[Callable[[], None]] = None,
) -> None:
if self._training_args.should_log:
# The default of training_args.log_level is passive, so we set log level at info here to have that default.
transformers_logging.set_verbosity_info()

log_level = self._training_args.get_process_log_level()
logger.setLevel(log_level)
datasets_logging.set_verbosity(log_level)
transformers_logging.set_verbosity(log_level)
transformers_logging.enable_default_handler()
transformers_logging.enable_explicit_format()

last_checkpoint = None
if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
@@ -203,7 +190,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
logger.info(f"Added {len(missing_tokens)} tokens to the tokenizer: {missing_tokens}")
return AutoTokenizer.from_pretrained(str(tokenizer_dir), use_fast=True)

if self._add_unk_src_tokens or self._add_unk_trg_tokens:
if self._add_unk_src_tokens or self._add_unk_tgt_tokens:
logger.info("Checking for missing tokens")
if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
@@ -217,7 +204,7 @@ def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
)
# using unofficially supported behavior to set the normalizer
tokenizer.backend_tokenizer.normalizer = norm_tok.backend_tokenizer.normalizer # type: ignore
if self._add_unk_src_tokens and self._add_unk_trg_tokens:
if self._add_unk_src_tokens and self._add_unk_tgt_tokens:
lang_codes = [src_lang, tgt_lang]
elif self._add_unk_src_tokens:
lang_codes = [src_lang]
@@ -293,12 +280,12 @@ def add_lang_code_to_tokenizer(tokenizer: Any, lang_code: str):
if model.name_or_path.startswith("t5-") or model.name_or_path.startswith("google/mt5-"):
prefix = f"translate {self._src_lang} to {self._tgt_lang}: "

max_source_length = self.max_source_length
if max_source_length is None:
max_source_length = model.config.max_length
max_target_length = self.max_target_length
if max_target_length is None:
max_target_length = model.config.max_length
max_src_length = self.max_src_length
if max_src_length is None:
max_src_length = model.config.max_length
max_tgt_length = self.max_tgt_length
if max_tgt_length is None:
max_tgt_length = model.config.max_length

if self._training_args.label_smoothing_factor > 0 and not hasattr(
model, "prepare_decoder_input_ids_from_labels"
@@ -317,9 +304,9 @@ def preprocess_function(examples):
inputs = [prefix + ex[src_lang] for ex in examples["translation"]]
targets = [ex[tgt_lang] for ex in examples["translation"]]

model_inputs = tokenizer(inputs, max_length=max_source_length, truncation=True)
model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True)
# Tokenize targets with the `text_target` keyword argument
labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
labels = tokenizer(text_target=targets, max_length=max_tgt_length, truncation=True)

model_inputs["labels"] = labels["input_ids"]
return model_inputs
19 changes: 18 additions & 1 deletion machine/translation/translation_suggester.py
@@ -1,8 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable, Sequence
from typing import Iterable, Optional, Sequence

from .interactive_translator import InteractiveTranslator
from .translation_result import TranslationResult
from .translation_suggestion import TranslationSuggestion
from .truecaser import Truecaser


class TranslationSuggester(ABC):
@@ -14,3 +16,18 @@ def __init__(self, confidence_threshold: float = 0, break_on_punctuation: bool =
def get_suggestions(
self, n: int, prefix_count: int, is_last_word_complete: bool, results: Iterable[TranslationResult]
) -> Sequence[TranslationSuggestion]: ...

def get_suggestions_from_translator(
self, n: int, translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
) -> Sequence[TranslationSuggestion]:
results = translator.get_current_results()
if truecaser is not None:
results = (
truecaser.truecase_translation_result(result, translator.target_detokenizer) for result in results
)
return self.get_suggestions(
n,
len(translator.prefix_word_ranges),
translator.is_last_word_complete,
results
)
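
To show how the new `get_suggestions_from_translator` helper might be called, here is a hedged sketch. It assumes `InteractiveTranslator`, `PhraseTranslationSuggester`, `TranslationSuggestion`, and `Truecaser` are re-exported from `machine.translation`, and it leaves constructing the interactive translator to the caller.

```python
# A hedged sketch of calling the new helper; the package-level imports and the
# confidence threshold are assumptions, and `translator` must be created
# elsewhere from an interactive translation engine.
from typing import Optional, Sequence

from machine.translation import (
    InteractiveTranslator,
    PhraseTranslationSuggester,
    TranslationSuggestion,
    Truecaser,
)


def suggest_completions(
    translator: InteractiveTranslator, truecaser: Optional[Truecaser] = None
) -> Sequence[TranslationSuggestion]:
    # Build one suggestion from the translator's current results; when a
    # truecaser is supplied, casing is restored before suggestions are taken.
    suggester = PhraseTranslationSuggester(confidence_threshold=0.2)
    return suggester.get_suggestions_from_translator(1, translator, truecaser)
```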
21 changes: 20 additions & 1 deletion machine/translation/truecaser.py
@@ -1,8 +1,11 @@
from abc import ABC, abstractmethod
from typing import Sequence
from typing import Optional, Sequence

from ..corpora.text_corpus import TextCorpus
from ..tokenization.detokenizer import Detokenizer
from ..tokenization.whitespace_detokenizer import WHITESPACE_DETOKENIZER
from .trainer import Trainer
from .translation_result import TranslationResult


class Truecaser(ABC):
@@ -15,5 +18,21 @@ def train_segment(self, segment: Sequence[str], sentence_start: bool = True) ->
@abstractmethod
def truecase(self, segment: Sequence[str]) -> Sequence[str]: ...

def truecase_translation_result(
self, result: TranslationResult, detokenizer: Optional[Detokenizer] = None
) -> TranslationResult:
if detokenizer is None:
detokenizer = WHITESPACE_DETOKENIZER
target_tokens = self.truecase(result.target_tokens)
return TranslationResult(
detokenizer.detokenize(target_tokens),
result.source_tokens,
target_tokens,
result.confidences,
result.sources,
result.alignment,
result.phrases,
)

@abstractmethod
def save(self) -> None: ...
29 changes: 29 additions & 0 deletions samples/data/smt.cfg
@@ -0,0 +1,29 @@
# Translation model prefix
-tm tm/src_trg

# Language model
-lm lm/trg.lm

# W parameter (maximum number of translation options to be considered for each source phrase)
-W 10

# S parameter (maximum number of hypotheses that can be stored in each stack)
-S 10

# A parameter (Maximum length in words of the source phrases to be translated)
-A 7

# Degree of non-monotonicity
-nomon 0

# Heuristic function used
-h 6

# Best-first search flag
-be

# Translation model weights
-tmw 0 0.5 1 1 1 1 0 1

# Set online learning parameters (ol_alg, lr_policy, l_stepsize, em_iters, e_par, r_par)
-olp 0 0 1 5 1 0
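
A config like this is what the tutorial's Thot SMT example points the decoder at. Below is a rough, hedged sketch of how it might be loaded; the `ThotSmtModel` call, the alignment model type, and the example sentence are assumptions, and the `tm/` and `lm/` files referenced above must already have been trained.

```python
# A hedged sketch only: load the sample decoder config and translate one
# tokenized sentence. The translate() call and paths are assumptions; the
# machine_translation notebook shows the exact usage.
from machine.translation.thot import ThotSmtModel, ThotWordAlignmentModelType

model = ThotSmtModel(ThotWordAlignmentModelType.HMM, "samples/data/smt.cfg")
result = model.translate("esta es una frase de ejemplo".split())
print(result.translation)
```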
