Merge remote-tracking branch 'origin/master' into additional_runtime_generation_options

# Conflicts:
#	CHANGELOG.md
#	src/pipelines/generation_utils.rs
#	src/pipelines/summarization.rs
#	src/pipelines/text_generation.rs
#	src/pipelines/translation/translation_pipeline.rs
guillaume-be committed Nov 10, 2021
2 parents f50a1a4 + 12d09c9 commit a97b657
Showing 80 changed files with 733 additions and 478 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -2,8 +2,13 @@
All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]
+## Changed
+- Updated to `tch` 1.6.0 (libtorch 1.10)
+- (BREAKING) Simplified the generics for multiple library traits: as a rule, inputs are now `&[S]` with `S: AsRef<str>`, or a plain `&str`; the owned types `Vec` and `String` are no longer accepted (see the first sketch after this excerpt)

## Added
+- (BREAKING) Support for `bad_word_ids` generation, allowing a set of word ids to be banned for all models supporting text generation
+- Support for half-precision mode for all models (reducing the memory footprint). A model can be converted to half-precision by calling the `half()` method on the `VarStore` it is currently stored in. Half-precision Torch kernels are not available for CPU (they are limited to CUDA devices); see the second sketch after this excerpt
- (BREAKING) Extension of the generation options that can be provided at runtime (after a model has been instantiated with a `GenerateConfig`), allowing the generation options to be updated from one text generation call to the next with the same model. This feature is implemented at the `LanguageGenerator` trait level; the high-level `TextGeneration` pipeline API remains unchanged.

## [0.16.0] - 2021-08-24
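
To illustrate the generics change above: callers now pass borrowed slices of string-like values, exactly as the updated examples further down this diff do. A minimal sketch under the new contract (model construction elided; the precise trait bounds are inferred from the `encode_prompt_text` change below, not quoted from the docs):

```rust
use rust_bert::pipelines::translation::{Language, TranslationModel};

// `model` construction elided; see examples/translation_m2m100.rs in this diff.
fn translate_demo(model: &TranslationModel) -> anyhow::Result<()> {
    let source = "This sentence will be translated in multiple languages.";

    // New API: a borrowed slice of string-like values.
    let outputs = model.translate(&[source], Language::English, Language::French)?;

    // Owned collections are passed by reference rather than by value.
    let owned: Vec<String> = vec![source.to_owned()];
    let more = model.translate(&owned, Language::English, Language::Spanish)?;

    for sentence in outputs.into_iter().chain(more) {
        println!("{}", sentence);
    }
    Ok(())
}
```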
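And for the half-precision and runtime-options entries: only the `set_device` call below is taken verbatim from the updated `generation_gpt_neo` example in this diff; the commented `half()` line paraphrases the changelog entry, and how the `VarStore` is reached from a pipeline is an assumption, not a documented API.

```rust
use rust_bert::pipelines::text_generation::{TextGenerationConfig, TextGenerationModel};
use tch::Device;

fn generation_demo() -> anyhow::Result<()> {
    let config = TextGenerationConfig {
        max_length: 32,
        ..Default::default()
    };
    let mut model = TextGenerationModel::new(config)?;

    // Runtime option added in this release: move the model to another device
    // after instantiation (shown in examples/generation_gpt_neo.rs below).
    model.set_device(Device::cuda_if_available());

    // Half precision, per the changelog: call `half()` on the VarStore the
    // model is stored in. Half-precision kernels are CUDA-only.
    // model_var_store.half(); // illustrative only; accessor not shown in this diff

    let output = model.generate(&["It was a very nice and sunny"], None);
    println!("{:?}", output);
    Ok(())
}
```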
19 changes: 10 additions & 9 deletions Cargo.toml
@@ -57,20 +57,21 @@ all-tests = []
features = ["doc-only"]

[dependencies]
-rust_tokenizers = "~6.2.4"
-tch = "~0.5.0"
-serde_json = "1.0.66"
-serde = { version = "1.0.129", features = ["derive"] }
-dirs = "3.0.2"
-ordered-float = "2.7.0"
+rust_tokenizers = "~7.0.0"
+tch = "~0.6.1"
+serde_json = "1.0.68"
+serde = { version = "1.0.130", features = ["derive"] }
+dirs = "4.0.0"
+ordered-float = "2.8.0"
cached-path = "0.5.1"
lazy_static = "1.4.0"
uuid = { version = "0.8.2", features = ["v4"] }
-thiserror = "1.0.26"
+thiserror = "1.0.30"
+half = "1.7.1"

[dev-dependencies]
-anyhow = "1.0.43"
+anyhow = "1.0.44"
csv = "1.1.6"
criterion = "0.3.5"
-torch-sys = "0.5.0"
+torch-sys = "~0.6.1"
tempfile = "3.2.0"
4 changes: 2 additions & 2 deletions README.md
@@ -71,8 +71,8 @@ This cache location defaults to `~/.cache/.rustbert`, but can be changed by sett

### Manual installation (recommended)

-1. Download `libtorch` from https://pytorch.org/get-started/locally/. This package requires `v1.9.0`: if this version is no longer available on the "get started" page,
-the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu111/libtorch-shared-with-deps-1.9.0%2Bcu111.zip` for a Linux version with CUDA11.
+1. Download `libtorch` from https://pytorch.org/get-started/locally/. This package requires `v1.10.0`: if this version is no longer available on the "get started" page,
+the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu111/libtorch-shared-with-deps-1.10.0%2Bcu111.zip` for a Linux version with CUDA11.
2. Extract the library to a location of your choice
3. Set the following environment variables
##### Linux:
6 changes: 3 additions & 3 deletions benches/tensor_operations_benchmark.rs
@@ -3,7 +3,7 @@ extern crate criterion;

use criterion::{black_box, Criterion};
use std::time::{Duration, Instant};
-use tch::kind::Kind::Float;
+use tch::kind::Kind;
use tch::{Device, Tensor};

fn matrix_multiply(iters: u64, input: &Tensor, weights: &Tensor) -> Duration {
@@ -21,8 +21,8 @@ fn bench_tensor_ops(c: &mut Criterion) {
unsafe {
torch_sys::dummy_cuda_dependency();
}
-let input = Tensor::rand(&[32, 128, 512], (Float, Device::cuda_if_available()));
-let weights = Tensor::rand(&[512, 512], (Float, Device::cuda_if_available()));
+let input = Tensor::rand(&[32, 128, 512], (Kind::Float, Device::cuda_if_available()));
+let weights = Tensor::rand(&[512, 512], (Kind::Float, Device::cuda_if_available()));

let _ = &input.matmul(&weights);
c.bench_function("Matrix multiply ", |b| {
11 changes: 6 additions & 5 deletions examples/generation_gpt_neo.rs
@@ -25,16 +25,16 @@ use tch::Device;
fn main() -> anyhow::Result<()> {
// Set-up model resources
let config_resource = Resource::Remote(RemoteResource::from_pretrained(
-GptNeoConfigResources::GPT_NEO_1_3B,
+GptNeoConfigResources::GPT_NEO_125M,
));
let vocab_resource = Resource::Remote(RemoteResource::from_pretrained(
-GptNeoVocabResources::GPT_NEO_1_3B,
+GptNeoVocabResources::GPT_NEO_125M,
));
let merges_resource = Resource::Remote(RemoteResource::from_pretrained(
-GptNeoMergesResources::GPT_NEO_1_3B,
+GptNeoMergesResources::GPT_NEO_125M,
));
let model_resource = Resource::Remote(RemoteResource::from_pretrained(
-GptNeoModelResources::GPT_NEO_1_3B,
+GptNeoModelResources::GPT_NEO_125M,
));
let generate_config = TextGenerationConfig {
model_type: ModelType::GPTNeo,
@@ -52,7 +52,8 @@ fn main() -> anyhow::Result<()> {
..Default::default()
};

-let model = TextGenerationModel::new(generate_config)?;
+let mut model = TextGenerationModel::new(generate_config)?;
+model.set_device(Device::cuda_if_available());

let input_context_1 = "It was a very nice and sunny";
let input_context_2 = "It was a gloom winter night, and";
3 changes: 1 addition & 2 deletions examples/generation_xlnet.rs
@@ -20,7 +20,6 @@ use rust_bert::resources::{RemoteResource, Resource};
use rust_bert::xlnet::{XLNetConfigResources, XLNetModelResources, XLNetVocabResources};

fn main() -> anyhow::Result<()> {
-// Set-up model
// Resources paths
let config_resource = Resource::Remote(RemoteResource::from_pretrained(
XLNetConfigResources::XLNET_BASE_CASED,
Expand All @@ -42,7 +41,7 @@ fn main() -> anyhow::Result<()> {
vocab_resource,
merges_resource,
max_length: 32,
-do_sample: true,
+do_sample: false,
num_beams: 3,
temperature: 1.0,
num_return_sequences: 1,
2 changes: 0 additions & 2 deletions examples/summarization_t5.rs
@@ -18,8 +18,6 @@ use rust_bert::resources::{RemoteResource, Resource};
use rust_bert::t5::{T5ConfigResources, T5ModelResources, T5VocabResources};

fn main() -> anyhow::Result<()> {
-// let summarization_model = SummarizationModel::new(Default::default())?;
-
let config_resource =
Resource::Remote(RemoteResource::from_pretrained(T5ConfigResources::T5_SMALL));
let vocab_resource =
6 changes: 3 additions & 3 deletions examples/translation_m2m100.rs
@@ -53,9 +53,9 @@ fn main() -> anyhow::Result<()> {
let source_sentence = "This sentence will be translated in multiple languages.";

let mut outputs = Vec::new();
-outputs.extend(model.translate([source_sentence], Language::English, Language::French)?);
-outputs.extend(model.translate([source_sentence], Language::English, Language::Spanish)?);
-outputs.extend(model.translate([source_sentence], Language::English, Language::Hindi)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::French)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::Spanish)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::Hindi)?);

for sentence in outputs {
println!("{}", sentence);
6 changes: 3 additions & 3 deletions examples/translation_mbart.rs
@@ -53,9 +53,9 @@ fn main() -> anyhow::Result<()> {
let source_sentence = "This sentence will be translated in multiple languages.";

let mut outputs = Vec::new();
-outputs.extend(model.translate([source_sentence], Language::English, Language::French)?);
-outputs.extend(model.translate([source_sentence], Language::English, Language::Spanish)?);
-outputs.extend(model.translate([source_sentence], Language::English, Language::Hindi)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::French)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::Spanish)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::Hindi)?);

for sentence in outputs {
println!("{}", sentence);
6 changes: 3 additions & 3 deletions examples/translation_t5.rs
@@ -56,9 +56,9 @@ fn main() -> anyhow::Result<()> {
let source_sentence = "This sentence will be translated in multiple languages.";

let mut outputs = Vec::new();
-outputs.extend(model.translate([source_sentence], Language::English, Language::French)?);
-outputs.extend(model.translate([source_sentence], Language::English, Language::German)?);
-outputs.extend(model.translate([source_sentence], Language::English, Language::Romanian)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::French)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::German)?);
+outputs.extend(model.translate(&[source_sentence], Language::English, Language::Romanian)?);

for sentence in outputs {
println!("{}", sentence);
9 changes: 5 additions & 4 deletions src/albert/albert_model.rs
@@ -221,10 +221,6 @@ impl AlbertModel {
};
let mask = mask.unwrap_or_else(|| calc_mask.as_ref().unwrap());

-let extended_attention_mask = mask.unsqueeze(1).unsqueeze(2);
-let extended_attention_mask: Tensor =
-(extended_attention_mask.ones_like() - extended_attention_mask) * -10000.0;
-
let embedding_output = self.embeddings.forward_t(
input_ids,
token_type_ids,
@@ -233,6 +229,11 @@
train,
)?;

+let extended_attention_mask = mask.unsqueeze(1).unsqueeze(2);
+let extended_attention_mask: Tensor =
+((extended_attention_mask.ones_like() - extended_attention_mask) * -10000.0)
+.to_kind(embedding_output.kind());
+
let transformer_output =
self.encoder
.forward_t(&embedding_output, Some(extended_attention_mask), train);
9 changes: 6 additions & 3 deletions src/albert/attention.rs
@@ -14,7 +14,6 @@
use crate::albert::AlbertConfig;
use crate::common::dropout::Dropout;
use std::borrow::Borrow;
-use tch::kind::Kind::Float;
use tch::{nn, Tensor};

#[derive(Debug)]
@@ -119,7 +118,10 @@ impl AlbertSelfAttention {
query_layer.matmul(&key_layer.transpose(-1, -2))
};

-let weights = scores.softmax(-1, Float).apply_t(&self.dropout, train);
+let weights = scores
+.softmax(-1, scores.kind())
+.apply_t(&self.dropout, train);
+
let context = weights.matmul(&value_layer).transpose(1, 2).contiguous();

let w = self.dense.ws.transpose(0, 1).view((
Expand All @@ -128,7 +130,8 @@ impl AlbertSelfAttention {
self.hidden_size,
));

-let context: Tensor = Tensor::einsum("bfnd,ndh->bfh", &[context, w]) + &self.dense.bs;
+let context: Tensor =
+Tensor::einsum("bfnd,ndh->bfh", &[context, w]) + self.dense.bs.as_ref().unwrap();
let context = (input_ids + context.apply_t(&self.dropout, train)).apply(&self.layer_norm);

if !self.output_attentions {
3 changes: 1 addition & 2 deletions src/bart/attention.rs
@@ -13,7 +13,6 @@

use crate::common::dropout::Dropout;
use std::borrow::Borrow;
-use tch::kind::Kind::Float;
use tch::{nn, Tensor};

#[derive(Debug)]
@@ -164,7 +163,7 @@ impl BartAttention {
attention_weights.view([bs * self.num_heads, target_length, source_length]);
};

-attention_weights = attention_weights.softmax(-1, Float);
+attention_weights = attention_weights.softmax(-1, attention_weights.kind());

let saved_attention_weights = if self.output_attentions {
Some(attention_weights.view((bs, self.num_heads, target_length, source_length)))
38 changes: 24 additions & 14 deletions src/bart/bart_model.rs
@@ -16,6 +16,7 @@ use crate::bart::decoder::BartDecoder;
use crate::bart::encoder::BartEncoder;
use crate::common::activations::Activation;
use crate::common::dropout::Dropout;
+use crate::common::kind::get_negative_infinity;
use crate::common::resources::{RemoteResource, Resource};
use crate::gpt2::{
Gpt2ConfigResources, Gpt2MergesResources, Gpt2ModelResources, Gpt2VocabResources,
@@ -33,7 +34,6 @@ use rust_tokenizers::vocab::{RobertaVocab, Vocab};
use serde::{Deserialize, Serialize};
use std::borrow::Borrow;
use std::collections::HashMap;
-use tch::kind::Kind::Int64;
use tch::nn::{embedding, EmbeddingConfig};
use tch::{nn, Device, Kind, Tensor};

Expand Down Expand Up @@ -235,7 +235,7 @@ pub(crate) fn _make_causal_mask(

let mut mask = Tensor::full(
&[target_length, target_length],
-f64::NEG_INFINITY,
+get_negative_infinity(dtype).unwrap(),
(dtype, device),
);
let mask_cond = Tensor::arange(target_length, (dtype, device));
@@ -264,16 +264,19 @@ pub(crate) fn _make_causal_mask(
)
}

-pub(crate) fn _expand_mask(mask: &Tensor, target_length: Option<i64>) -> Tensor {
+pub(crate) fn _expand_mask(mask: &Tensor, target_length: Option<i64>, dtype: Kind) -> Tensor {
let (batch_size, source_length) = mask.size2().unwrap();
let target_length = target_length.unwrap_or(source_length);
let expanded_mask = mask
.unsqueeze(1)
.unsqueeze(1)
.expand(&[batch_size, 1, target_length, source_length], true)
-.totype(Kind::Float);
+.totype(dtype);
let inverted_mask: Tensor = 1 - expanded_mask;
-inverted_mask.masked_fill(&inverted_mask.to_kind(Kind::Bool), f64::NEG_INFINITY)
+inverted_mask.masked_fill(
+&inverted_mask.to_kind(Kind::Bool),
+get_negative_infinity(dtype).unwrap(),
+)
}

pub(crate) fn _prepare_decoder_attention_mask(
@@ -294,8 +297,12 @@ pub(crate) fn _prepare_decoder_attention_mask(
None
};

-if let Some(attention_mask) = &attention_mask {
-let expanded_attention_mask = _expand_mask(attention_mask, Some(last_input_shape_dim));
+if let Some(attention_mask) = attention_mask {
+let expanded_attention_mask = _expand_mask(
+attention_mask,
+Some(last_input_shape_dim),
+input_embeds.kind(),
+);
combined_attention_mask = match combined_attention_mask {
Some(value) => Some(value + expanded_attention_mask),
None => Some(expanded_attention_mask),
@@ -308,9 +315,9 @@
fn _shift_tokens_right(input_ids: &Tensor, pad_token_id: i64) -> Tensor {
let index_eos: Tensor = input_ids
.ne(pad_token_id)
-.sum_dim_intlist(&[-1], true, Int64)
+.sum_dim_intlist(&[-1], true, Kind::Int64)
- 1;
-let output = input_ids.empty_like().to_kind(Int64);
+let output = input_ids.empty_like().to_kind(Kind::Int64);
output
.select(1, 0)
.copy_(&input_ids.gather(1, &index_eos, true).squeeze());
@@ -812,7 +819,7 @@ impl BartForSequenceClassification {
train,
);
let eos_mask = input_ids.eq(self.eos_token_id);
-let reshape = eos_mask.sum_dim_intlist(&[1], true, Int64);
+let reshape = eos_mask.sum_dim_intlist(&[1], true, input_ids.kind());
let sentence_representation = base_model_output
.decoder_output
.permute(&[2, 0, 1])
@@ -1121,6 +1128,9 @@ impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, Robert
fn get_var_store(&self) -> &nn::VarStore {
&self.var_store
}
+fn get_var_store_mut(&mut self) -> &mut nn::VarStore {
+&mut self.var_store
+}
fn get_config(&self) -> &GenerateConfig {
&self.generate_config
}
@@ -1195,17 +1205,17 @@ impl PrivateLanguageGenerator<BartForConditionalGeneration, RobertaVocab, Robert
}
}

-fn encode_prompt_text<'a, S>(
+fn encode_prompt_text<S>(
&self,
-prompt_text: S,
+prompt_text: &[S],
max_len: i64,
pad_token_id: Option<i64>,
) -> Tensor
where
-S: AsRef<[&'a str]>,
+S: AsRef<str> + Sync,
{
let tokens = self._get_tokenizer().encode_list(
-prompt_text.as_ref(),
+prompt_text,
max_len as usize,
&TruncationStrategy::LongestFirst,
0,
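The kind-aware masking in `bart_model.rs` above goes through the new `get_negative_infinity` helper from `src/common/kind.rs`, whose body is not part of this excerpt. A minimal sketch of such a helper, consistent with the `get_negative_infinity(dtype).unwrap()` call sites — an assumption, not the crate's actual implementation:

```rust
use tch::Kind;

/// Largest-magnitude negative value usable as an attention-mask fill for
/// `kind`; `None` for kinds with no sensible fill (e.g. integer kinds).
/// A finite minimum can be preferable for reduced-precision kinds, where
/// arithmetic on an -inf fill more easily degenerates into NaN.
fn get_negative_infinity(kind: Kind) -> Option<f64> {
    match kind {
        Kind::Half => Some(-65504.0),       // minimum finite f16
        Kind::BFloat16 => Some(-3.3895e38), // approx. minimum finite bf16
        Kind::Float | Kind::Double => Some(f64::NEG_INFINITY),
        _ => None,
    }
}
```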
2 changes: 1 addition & 1 deletion src/bart/decoder.rs
@@ -273,7 +273,7 @@ impl BartDecoder {
);

let encoder_attention_mask = encoder_attention_mask
-.map(|mask| _expand_mask(mask, Some(*input_ids.size().last().unwrap())));
+.map(|mask| _expand_mask(mask, Some(*input_ids.size().last().unwrap()), x.kind()));

let x = if let Some(layer_norm_embedding) = &self.layer_norm_embedding {
x.apply(layer_norm_embedding)
7 changes: 3 additions & 4 deletions src/bart/embeddings.rs
@@ -12,9 +12,8 @@
// limitations under the License.

use std::borrow::Borrow;
-use tch::kind::Kind::Int64;
use tch::nn::embedding;
-use tch::{nn, Tensor};
+use tch::{nn, Kind, Tensor};

/// # Abstraction that holds a embeddings configuration
pub enum EmbeddingOption {
@@ -67,7 +66,7 @@ impl LearnedPositionalEmbedding {
let positions = Tensor::arange_start(
past_key_values_length,
past_key_values_length + sequence_length,
-(Int64, input.device()),
+(Kind::Int64, input.device()),
) + self.offset;
positions.apply(&self.embedding)
}
@@ -102,7 +101,7 @@ impl SinusoidalPositionalEmbedding {
let positions = Tensor::arange_start(
past_key_values_length,
past_key_values_length + sequence_length,
-(Int64, input.device()),
+(Kind::Int64, input.device()),
);
positions.apply(&self.embedding)
}