
Commit

Addition of CI pipeline (#2)
guillaume-be authored Feb 16, 2020
1 parent a7264ff commit 0271396
Showing 5 changed files with 155 additions and 16 deletions.
20 changes: 20 additions & 0 deletions .travis.yml
@@ -0,0 +1,20 @@
language: rust
cache:
  - cargo
  - pip
  - directories:
      - $HOME/.cache/pip
      - $HOME/.cache/torch
      - $HOME/rustbert

jobs:
  include:
    - script:
        - cargo build --verbose
    - before_script:
        - sudo apt-get install python3-pip python3-setuptools
        - pip3 install --upgrade pip
        - pip3 install -r requirements.txt --progress-bar off
        - ls ./utils/*.py|xargs -n 1 python3
      script:
        - cargo test
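
In short: the pipeline caches cargo, pip and the files downloaded under $HOME/rustbert, runs one job that only builds the crate with cargo build --verbose, and a second job that installs the Python requirements, runs every script under utils/ (which download the pretrained weights and convert them for the tests below), and then runs cargo test.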
4 changes: 2 additions & 2 deletions src/distilbert/sentiment.rs
@@ -14,8 +14,8 @@ pub enum SentimentPolarity {

#[derive(Debug)]
pub struct Sentiment {
-    polarity: SentimentPolarity,
-    score: f64,
+    pub polarity: SentimentPolarity,
+    pub score: f64,
}

pub struct SentimentClassifier {
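
With polarity and score now public, callers of SentimentClassifier::predict can read the results directly. A minimal sketch of a consumer, assuming Sentiment is exported alongside SentimentClassifier and SentimentPolarity (the helper below is illustrative and not part of this commit):

use rust_bert::{Sentiment, SentimentPolarity};

// Illustrative helper (not from this commit): print each prediction using the
// newly public fields.
fn print_sentiments(sentiments: &[Sentiment]) {
    for sentiment in sentiments {
        let label = match sentiment.polarity {
            SentimentPolarity::Positive => "Positive",
            SentimentPolarity::Negative => "Negative",
        };
        println!("{} (score: {:.4})", label, sentiment.score);
    }
}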
111 changes: 111 additions & 0 deletions tests/distilbert.rs
@@ -0,0 +1,111 @@
use std::path::PathBuf;
use tch::{Device, Tensor, nn, no_grad};
use rust_bert::distilbert::distilbert::{DistilBertModelMaskedLM, DistilBertConfig};
use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::{Tokenizer, TruncationStrategy};
use rust_tokenizers::bert_tokenizer::BertTokenizer;
use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;
use rust_bert::{SentimentClassifier, SentimentPolarity};

extern crate failure;
extern crate dirs;

#[test]
fn sentiment_classifier() -> failure::Fallible<()> {

    // Resources paths
    let mut home: PathBuf = dirs::home_dir().unwrap();
    home.push("rustbert");
    home.push("distilbert_sst2");
    let config_path = &home.as_path().join("config.json");
    let vocab_path = &home.as_path().join("vocab.txt");
    let weights_path = &home.as_path().join("model.ot");

    // Set-up classifier
    let device = Device::cuda_if_available();
    let sentiment_classifier = SentimentClassifier::new(vocab_path,
                                                        config_path,
                                                        weights_path, device)?;

    // Get sentiments
    let input = [
        "Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring.",
        "This film tried to be too many things all at once: stinging political satire, Hollywood blockbuster, sappy romantic comedy, family values promo...",
        "If you like original gut wrenching laughter you will like this movie. If you are young or old then you will love this movie, hell even my mom liked it.",
    ];

    let output = sentiment_classifier.predict(input.to_vec());

    assert_eq!(output.len(), 3 as usize);
    assert_eq!(output[0].polarity, SentimentPolarity::Positive);
    assert!((output[0].score - 0.9981).abs() < 1e-4);
    assert_eq!(output[1].polarity, SentimentPolarity::Negative);
    assert!((output[1].score - 0.9927).abs() < 1e-4);
    assert_eq!(output[2].polarity, SentimentPolarity::Positive);
    assert!((output[2].score - 0.9997).abs() < 1e-4);

    Ok(())
}



#[test]
fn distilbert_masked_lm() -> failure::Fallible<()> {

    // Resources paths
    let mut home: PathBuf = dirs::home_dir().unwrap();
    home.push("rustbert");
    home.push("distilbert");
    let config_path = &home.as_path().join("config.json");
    let vocab_path = &home.as_path().join("vocab.txt");
    let weights_path = &home.as_path().join("model.ot");

    // Set-up masked LM model
    let device = Device::cuda_if_available();
    let mut vs = nn::VarStore::new(device);
    let tokenizer: BertTokenizer = BertTokenizer::from_file(vocab_path.to_str().unwrap());
    let config = DistilBertConfig::from_file(config_path);
    let distil_bert_model = DistilBertModelMaskedLM::new(&vs.root(), &config);
    vs.load(weights_path)?;

    // Define input
    let input = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let tokenized_input = tokenizer.encode_list(input.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let max_len = tokenized_input.iter().map(|input| input.token_ids.len()).max().unwrap();
    let mut tokenized_input = tokenized_input
        .iter()
        .map(|input| input.token_ids.clone())
        .map(|mut input| {
            input.extend(vec![0; max_len - input.len()]);
            input
        })
        .collect::<Vec<_>>();

    // Masking the token [thing] of sentence 1 and [oranges] of sentence 2
    tokenized_input[0][4] = 103;
    tokenized_input[1][6] = 103;
    let tokenized_input = tokenized_input
        .iter()
        .map(|input| Tensor::of_slice(&(input)))
        .collect::<Vec<_>>();
    let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device);

    // Forward pass
    let (output, _, _) = no_grad(|| {
        distil_bert_model
            .forward_t(Some(input_tensor), None, None, false)
            .unwrap()
    });

    // Print masked tokens
    let index_1 = output.get(0).get(4).argmax(0, false);
    let index_2 = output.get(1).get(6).argmax(0, false);
    let word_1 = tokenizer.vocab().id_to_token(&index_1.int64_value(&[]));
    let word_2 = tokenizer.vocab().id_to_token(&index_2.int64_value(&[]));

    assert_eq!("person", word_1); // Outputs "person" : "Looks like one [person] is missing"
    assert_eq!("pear", word_2); // Outputs "pear" : "It\'s like comparing [pear] to apples"

    Ok(())
}
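
The padding block in the masked LM test right-pads every token id sequence with 0 up to the longest sequence in the batch before the tensors are stacked, and id 103 is the [MASK] token in the standard BERT vocabulary. A self-contained sketch of that padding step (the function name is illustrative, not from the commit):

// Illustrative sketch: right-pad each sequence with 0 up to the batch maximum length.
fn pad_to_longest(mut batch: Vec<Vec<i64>>) -> Vec<Vec<i64>> {
    let max_len = batch.iter().map(|ids| ids.len()).max().unwrap_or(0);
    for ids in batch.iter_mut() {
        ids.extend(std::iter::repeat(0).take(max_len - ids.len()));
    }
    batch
}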
18 changes: 11 additions & 7 deletions utils/download-dependencies_distilbert.py
@@ -18,12 +18,17 @@
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(weights_path)

-os.makedirs(target_path, exist_ok=True)
-shutil.copy(temp_config, target_path / 'config.json')
-shutil.copy(temp_vocab, target_path / 'vocab.txt')
-shutil.copy(temp_weights, target_path / 'model.bin')
-
-weights = torch.load(temp_weights)
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'vocab.txt')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    nps[k] = v.cpu().numpy()
@@ -36,5 +41,4 @@
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()

subprocess.call(
-    ['cargo', '+nightly', 'run', '--bin=convert-tensor', f'--manifest-path={toml_location}', '--', source,
-     target])
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
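
Two behavioural changes in this script: torch.load now passes map_location='cpu' so the weights can be loaded on CPU-only CI machines, and the conversion step calls plain cargo run (with %-formatting instead of an f-string), so the nightly toolchain is no longer required for the conversion.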
18 changes: 11 additions & 7 deletions utils/download-dependencies_sst2_sentiment.py
@@ -18,12 +18,17 @@
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(weights_path)

-os.makedirs(target_path, exist_ok=True)
-shutil.copy(temp_config, target_path / 'config.json')
-shutil.copy(temp_vocab, target_path / 'vocab.txt')
-shutil.copy(temp_weights, target_path / 'model.bin')
-
-weights = torch.load(temp_weights)
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'vocab.txt')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    nps[k] = v.cpu().numpy()
@@ -36,5 +41,4 @@
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()

subprocess.call(
-    ['cargo', '+nightly', 'run', '--bin=convert-tensor', f'--manifest-path={toml_location}', '--', source,
-     target])
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
