Tokenizer special token map update (guillaume-be#330)

* Updates for compatibility with tokenizers special token rework
* Updated mask pipeline methods
* Bumped version
* Fix clippy warnings
SpirosMakris · Jan 30, 2023 · 84561ec · 84561ec
1 parent 80e0197
commit 84561ec
Show file tree

Hide file tree

Showing 57 changed files with 535 additions and 658 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
+## Changed
+- Bumped the tokenizers dependency from 7.x to 8.x, exposing additional options for special token mapping and adding the NLLBTokenizer.
 
 ## [0.20.0] - 2023-01-21
 ## Added

diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rust-bert"
-version = "0.20.0"
+version = "0.20.1-alpha"
 authors = ["Guillaume Becquin <guillaume.becquin@gmail.com>"]
 edition = "2018"
 description = "Ready-to-use NLP pipelines and language models"
@@ -69,7 +69,7 @@ remote = ["cached-path", "dirs", "lazy_static"]
 features = ["doc-only"]
 
 [dependencies]
-rust_tokenizers = "~7.0.2"
+rust_tokenizers = "8.0.0"
 tch = "~0.10.1"
 serde_json = "1"
 serde = { version = "1", features = ["derive"] }

diff --git a/examples/async-sentiment.rs b/examples/async-sentiment.rs
@@ -16,7 +16,7 @@ async fn main() -> Result<()> {
         "Classify this negative text".to_owned(),
     ];
     let sentiments = classifier.predict(texts).await?;
-    println!("Results: {:?}", sentiments);
+    println!("Results: {sentiments:?}");
 
     Ok(())
 }

diff --git a/examples/codebert.rs b/examples/codebert.rs
@@ -50,7 +50,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = sequence_classification_model.predict(input);
     for label in output {
-        println!("{:?}", label);
+        println!("{label:?}");
     }
 
     // Masked language model
@@ -78,7 +78,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = mask_language_model.predict(input)?;
     for sentence_output in output {
-        println!("{:?}", sentence_output);
+        println!("{sentence_output:?}");
     }
 
     Ok(())

diff --git a/examples/conversation.rs b/examples/conversation.rs
@@ -31,7 +31,7 @@ fn main() -> anyhow::Result<()> {
 
     let output = conversation_model.generate_responses(&mut conversation_manager);
 
-    println!("{:?}", output);
+    println!("{output:?}");
 
     let _ = conversation_manager
         .get(&conversation_1_id)
@@ -40,11 +40,11 @@ fn main() -> anyhow::Result<()> {
 
     let output = conversation_model.generate_responses(&mut conversation_manager);
 
-    println!("{:?}", output);
+    println!("{output:?}");
 
     let output = conversation_model.generate_responses(&mut conversation_manager);
 
-    println!("{:?}", output);
+    println!("{output:?}");
 
     Ok(())
 }
diff --git a/examples/generation_gpt2.rs b/examples/generation_gpt2.rs
@@ -33,7 +33,7 @@ fn main() -> anyhow::Result<()> {
     let output = model.generate(&[input_context], None);
 
     for sentence in output {
-        println!("{:?}", sentence);
+        println!("{sentence:?}");
     }
     Ok(())
 }
diff --git a/examples/generation_gpt_neo.rs b/examples/generation_gpt_neo.rs
@@ -60,7 +60,7 @@ fn main() -> anyhow::Result<()> {
     let output = model.generate(&[input_context_1, input_context_2], None);
 
     for sentence in output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/generation_reformer.rs b/examples/generation_reformer.rs
@@ -55,7 +55,7 @@ fn main() -> anyhow::Result<()> {
     let output = model.generate(&[input_context_1, input_context_2], None);
 
     for sentence in output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/generation_xlnet.rs b/examples/generation_xlnet.rs
@@ -50,7 +50,7 @@ fn main() -> anyhow::Result<()> {
     let output = model.generate(&[input_context], None);
 
     for sentence in output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/masked_language.rs b/examples/masked_language.rs
@@ -39,7 +39,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = mask_language_model.predict(input)?;
     for sentence_output in output {
-        println!("{:?}", sentence_output);
+        println!("{sentence_output:?}");
     }
 
     Ok(())

diff --git a/examples/named_entities_recognition.rs b/examples/named_entities_recognition.rs
@@ -28,7 +28,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = ner_model.predict_full_entities(&input);
     for entity in output {
-        println!("{:?}", entity);
+        println!("{entity:?}");
     }
 
     Ok(())

diff --git a/examples/part_of_speech_tagging.rs b/examples/part_of_speech_tagging.rs
@@ -24,7 +24,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = pos_model.predict(&input);
     for (pos, pos_tag) in output[0].iter().enumerate() {
-        println!("{} - {:?}", pos, pos_tag);
+        println!("{pos} - {pos_tag:?}");
     }
 
     Ok(())

diff --git a/examples/question_answering.rs b/examples/question_answering.rs
@@ -34,6 +34,6 @@ fn main() -> anyhow::Result<()> {
 
     //    Get answer
     let answers = qa_model.predict(&[qa_input_1, qa_input_2], 1, 32);
-    println!("{:?}", answers);
+    println!("{answers:?}");
     Ok(())
 }
diff --git a/examples/question_answering_bert.rs b/examples/question_answering_bert.rs
@@ -50,6 +50,6 @@ fn main() -> anyhow::Result<()> {
 
     //    Get answer
     let answers = qa_model.predict(&[qa_input_1, qa_input_2], 1, 32);
-    println!("{:?}", answers);
+    println!("{answers:?}");
     Ok(())
 }
diff --git a/examples/question_answering_longformer.rs b/examples/question_answering_longformer.rs
@@ -55,6 +55,6 @@ fn main() -> anyhow::Result<()> {
 
     //    Get answer
     let answers = qa_model.predict(&[qa_input_1, qa_input_2], 1, 32);
-    println!("{:?}", answers);
+    println!("{answers:?}");
     Ok(())
 }
diff --git a/examples/sentence_embeddings.rs b/examples/sentence_embeddings.rs
@@ -12,6 +12,6 @@ fn main() -> anyhow::Result<()> {
 
     // Generate Embeddings
     let embeddings = model.encode(&sentences)?;
-    println!("{:?}", embeddings);
+    println!("{embeddings:?}");
     Ok(())
 }
diff --git a/examples/sentence_embeddings_local.rs b/examples/sentence_embeddings_local.rs
@@ -32,6 +32,6 @@ fn main() -> anyhow::Result<()> {
 
     // Generate Embeddings
     let embeddings = model.encode(&sentences)?;
-    println!("{:?}", embeddings);
+    println!("{embeddings:?}");
     Ok(())
 }
diff --git a/examples/sentiment_analysis.rs b/examples/sentiment_analysis.rs
@@ -28,7 +28,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = sentiment_classifier.predict(input);
     for sentiment in output {
-        println!("{:?}", sentiment);
+        println!("{sentiment:?}");
     }
 
     Ok(())

diff --git a/examples/sentiment_analysis_fnet.rs b/examples/sentiment_analysis_fnet.rs
@@ -49,7 +49,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = sentiment_classifier.predict(input);
     for sentiment in output {
-        println!("{:?}", sentiment);
+        println!("{sentiment:?}");
     }
 
     Ok(())

diff --git a/examples/sequence_classification.rs b/examples/sequence_classification.rs
@@ -28,7 +28,7 @@ fn main() -> anyhow::Result<()> {
     //    Run model
     let output = sequence_classification_model.predict(input);
     for label in output {
-        println!("{:?}", label);
+        println!("{label:?}");
     }
 
     Ok(())

diff --git a/examples/sequence_classification_multilabel.rs b/examples/sequence_classification_multilabel.rs
@@ -29,7 +29,7 @@ fn main() -> anyhow::Result<()> {
     let output = sequence_classification_model.predict_multilabel(&input, 0.05);
     if let Ok(labels) = output {
         for label in labels {
-            println!("{:?}", label);
+            println!("{label:?}");
         }
     }
 

diff --git a/examples/summarization_bart.rs b/examples/summarization_bart.rs
@@ -73,7 +73,7 @@ about exoplanets like K2-18b."];
     //    Credits: WikiNews, CC BY 2.5 license (https://en.wikinews.org/wiki/Astronomers_find_water_vapour_in_atmosphere_of_exoplanet_K2-18b)
     let _output = summarization_model.summarize(&input);
     for sentence in _output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
 
     Ok(())

diff --git a/examples/summarization_pegasus.rs b/examples/summarization_pegasus.rs
@@ -68,7 +68,7 @@ about exoplanets like K2-18b."];
     //    Credits: WikiNews, CC BY 2.5 license (https://en.wikinews.org/wiki/Astronomers_find_water_vapour_in_atmosphere_of_exoplanet_K2-18b)
     let _output = summarization_model.summarize(&input);
     for sentence in _output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
 
     Ok(())

diff --git a/examples/summarization_prophetnet.rs b/examples/summarization_prophetnet.rs
@@ -70,7 +70,7 @@ about exoplanets like K2-18b."];
     //    Credits: WikiNews, CC BY 2.5 license (https://en.wikinews.org/wiki/Astronomers_find_water_vapour_in_atmosphere_of_exoplanet_K2-18b)
     let _output = summarization_model.summarize(&input);
     for sentence in _output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
 
     Ok(())

diff --git a/examples/summarization_t5.rs b/examples/summarization_t5.rs
@@ -56,7 +56,7 @@ about exoplanets like K2-18b."];
     //    Credits: WikiNews, CC BY 2.5 license (https://en.wikinews.org/wiki/Astronomers_find_water_vapour_in_atmosphere_of_exoplanet_K2-18b)
     let _output = summarization_model.summarize(&input);
     for sentence in _output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
 
     Ok(())

diff --git a/examples/token_classification.rs b/examples/token_classification.rs
@@ -41,7 +41,7 @@ fn main() -> anyhow::Result<()> {
     let token_outputs = token_classification_model.predict(&input);
 
     for token in token_outputs {
-        println!("{:?}", token);
+        println!("{token:?}");
     }
 
     Ok(())

diff --git a/examples/translation_builder.rs b/examples/translation_builder.rs
@@ -32,7 +32,7 @@ fn main() -> anyhow::Result<()> {
     let output = model.translate(&[input_context_1, input_context_2], None, Language::Spanish)?;
 
     for sentence in output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/translation_m2m100.rs b/examples/translation_m2m100.rs
@@ -50,7 +50,7 @@ fn main() -> anyhow::Result<()> {
     outputs.extend(model.translate(&[source_sentence], Language::English, Language::Hindi)?);
 
     for sentence in outputs {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/translation_marian.rs b/examples/translation_marian.rs
@@ -49,7 +49,7 @@ fn main() -> anyhow::Result<()> {
     let output = model.translate(&[input_context_1, input_context_2], None, None)?;
 
     for sentence in output {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/translation_mbart.rs b/examples/translation_mbart.rs
@@ -50,7 +50,7 @@ fn main() -> anyhow::Result<()> {
     outputs.extend(model.translate(&[source_sentence], Language::English, Language::Hindi)?);
 
     for sentence in outputs {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/translation_t5.rs b/examples/translation_t5.rs
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
     outputs.extend(model.translate(&[source_sentence], Language::English, Language::Romanian)?);
 
     for sentence in outputs {
-        println!("{}", sentence);
+        println!("{sentence}");
     }
     Ok(())
 }
diff --git a/examples/zero_shot_classification.rs b/examples/zero_shot_classification.rs
@@ -27,13 +27,13 @@ fn main() -> anyhow::Result<()> {
             [input_sentence, input_sequence_2],
             candidate_labels,
             Some(Box::new(|label: &str| {
-                format!("This example is about {}.", label)
+                format!("This example is about {label}.")
             })),
             128,
         )
         .unwrap();
 
-    println!("{:?}", output);
+    println!("{output:?}");
 
     Ok(())
 }
diff --git a/src/bart/bart_model.rs b/src/bart/bart_model.rs
@@ -26,7 +26,7 @@ use crate::pipelines::generation_utils::{
 };
 use crate::{Config, RustBertError};
 use rust_tokenizers::tokenizer::{RobertaTokenizer, TruncationStrategy};
-use rust_tokenizers::vocab::{RobertaVocab, Vocab};
+use rust_tokenizers::vocab::RobertaVocab;
 use serde::{Deserialize, Serialize};
 use std::borrow::Borrow;
 use std::collections::HashMap;
@@ -1263,9 +1263,7 @@ impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, Robert
 
         let pad_token = match pad_token_id {
             Some(value) => value,
-            None => self
-                ._get_tokenizer()
-                .convert_tokens_to_ids(&[RobertaVocab::unknown_value()])[0],
+            None => self._get_tokenizer().get_unk_id(),
         };
 
         let token_ids = token_ids

diff --git a/src/common/kind.rs b/src/common/kind.rs
@@ -14,8 +14,7 @@ pub(crate) fn get_positive_infinity(kind: Kind) -> Result<Scalar, RustBertError>
         Kind::Double => Scalar::float(f64::INFINITY),
         _ => {
             return Err(RustBertError::ValueError(format!(
-                "Type not supported: attempted to get positive infinity for {:?}",
-                kind
+                "Type not supported: attempted to get positive infinity for {kind:?}",
             )))
         }
     })
@@ -34,8 +33,7 @@ pub(crate) fn get_negative_infinity(kind: Kind) -> Result<Scalar, RustBertError>
         Kind::Double => Scalar::float(f64::NEG_INFINITY),
         _ => {
             return Err(RustBertError::ValueError(format!(
-                "Type not supported: attempted to get negative infinity for {:?}",
-                kind
+                "Type not supported: attempted to get negative infinity for {kind:?}",
             )))
         }
     })

diff --git a/src/deberta/deberta_model.rs b/src/deberta/deberta_model.rs
@@ -111,8 +111,7 @@ impl FromStr for PositionAttentionType {
             "c2p" => Ok(PositionAttentionType::c2p),
             "p2p" => Ok(PositionAttentionType::p2p),
             _ => Err(RustBertError::InvalidConfigurationError(format!(
-                "Position attention type `{}` not in accepted variants (`p2c`, `c2p`, `p2p`)",
-                s
+                "Position attention type `{s}` not in accepted variants (`p2c`, `c2p`, `p2p`)",
             ))),
         }
     }

diff --git a/src/deberta_v2/deberta_v2_model.rs b/src/deberta_v2/deberta_v2_model.rs
@@ -118,8 +118,7 @@ impl FromStr for NormRelEmbedType {
         match s {
             "layer_norm" => Ok(NormRelEmbedType::layer_norm),
             _ => Err(RustBertError::InvalidConfigurationError(format!(
-                "Layer normalization type `{}` not in accepted variants (`layer_norm`)",
-                s
+                "Layer normalization type `{s}` not in accepted variants (`layer_norm`)",
             ))),
         }
     }