
Commit

Addition of CI pipeline (#2)
guillaume-be authored Feb 16, 2020
1 parent a7264ff commit 0271396
Showing 5 changed files with 155 additions and 16 deletions.
20 changes: 20 additions & 0 deletions .travis.yml
@@ -0,0 +1,20 @@
language: rust
cache:
  - cargo
  - pip
  - directories:
      - $HOME/.cache/pip
      - $HOME/.cache/torch
      - $HOME/rustbert

jobs:
  include:
    - script:
        - cargo build --verbose
    - before_script:
        - sudo apt-get install python3-pip python3-setuptools
        - pip3 install --upgrade pip
        - pip3 install -r requirements.txt --progress-bar off
        - ls ./utils/*.py|xargs -n 1 python3
      script:
        - cargo test
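
In short: the pipeline caches cargo, pip and the files downloaded under $HOME/rustbert, runs one job that only builds the crate with cargo build --verbose, and a second job that installs the Python requirements, runs every script under utils/ (which download the pretrained weights and convert them for the tests below), and then runs cargo test.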
4 changes: 2 additions & 2 deletions src/distilbert/sentiment.rs
@@ -14,8 +14,8 @@ pub enum SentimentPolarity {

#[derive(Debug)]
pub struct Sentiment {
-    polarity: SentimentPolarity,
-    score: f64,
+    pub polarity: SentimentPolarity,
+    pub score: f64,
}

pub struct SentimentClassifier {
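
With polarity and score now public, callers of SentimentClassifier::predict can read the results directly. A minimal sketch of a consumer, assuming Sentiment is exported alongside SentimentClassifier and SentimentPolarity (the helper below is illustrative and not part of this commit):

use rust_bert::{Sentiment, SentimentPolarity};

// Illustrative helper (not from this commit): print each prediction using the
// newly public fields.
fn print_sentiments(sentiments: &[Sentiment]) {
    for sentiment in sentiments {
        let label = match sentiment.polarity {
            SentimentPolarity::Positive => "Positive",
            SentimentPolarity::Negative => "Negative",
        };
        println!("{} (score: {:.4})", label, sentiment.score);
    }
}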
111 changes: 111 additions & 0 deletions tests/distilbert.rs
@@ -0,0 +1,111 @@
use std::path::PathBuf;
use tch::{Device, Tensor, nn, no_grad};
use rust_bert::distilbert::distilbert::{DistilBertModelMaskedLM, DistilBertConfig};
use rust_tokenizers::preprocessing::tokenizer::base_tokenizer::{Tokenizer, TruncationStrategy};
use rust_tokenizers::bert_tokenizer::BertTokenizer;
use rust_tokenizers::preprocessing::vocab::base_vocab::Vocab;
use rust_bert::{SentimentClassifier, SentimentPolarity};

extern crate failure;
extern crate dirs;

#[test]
fn sentiment_classifier() -> failure::Fallible<()> {

    // Resources paths
    let mut home: PathBuf = dirs::home_dir().unwrap();
    home.push("rustbert");
    home.push("distilbert_sst2");
    let config_path = &home.as_path().join("config.json");
    let vocab_path = &home.as_path().join("vocab.txt");
    let weights_path = &home.as_path().join("model.ot");

    // Set-up classifier
    let device = Device::cuda_if_available();
    let sentiment_classifier = SentimentClassifier::new(vocab_path,
                                                        config_path,
                                                        weights_path, device)?;

    // Get sentiments
    let input = [
        "Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring.",
        "This film tried to be too many things all at once: stinging political satire, Hollywood blockbuster, sappy romantic comedy, family values promo...",
        "If you like original gut wrenching laughter you will like this movie. If you are young or old then you will love this movie, hell even my mom liked it.",
    ];

    let output = sentiment_classifier.predict(input.to_vec());

    assert_eq!(output.len(), 3 as usize);
    assert_eq!(output[0].polarity, SentimentPolarity::Positive);
    assert!((output[0].score - 0.9981).abs() < 1e-4);
    assert_eq!(output[1].polarity, SentimentPolarity::Negative);
    assert!((output[1].score - 0.9927).abs() < 1e-4);
    assert_eq!(output[2].polarity, SentimentPolarity::Positive);
    assert!((output[2].score - 0.9997).abs() < 1e-4);

    Ok(())
}



#[test]
fn distilbert_masked_lm() -> failure::Fallible<()> {

    // Resources paths
    let mut home: PathBuf = dirs::home_dir().unwrap();
    home.push("rustbert");
    home.push("distilbert");
    let config_path = &home.as_path().join("config.json");
    let vocab_path = &home.as_path().join("vocab.txt");
    let weights_path = &home.as_path().join("model.ot");

    // Set-up masked LM model
    let device = Device::cuda_if_available();
    let mut vs = nn::VarStore::new(device);
    let tokenizer: BertTokenizer = BertTokenizer::from_file(vocab_path.to_str().unwrap());
    let config = DistilBertConfig::from_file(config_path);
    let distil_bert_model = DistilBertModelMaskedLM::new(&vs.root(), &config);
    vs.load(weights_path)?;

    // Define input
    let input = ["Looks like one thing is missing", "It\'s like comparing oranges to apples"];
    let tokenized_input = tokenizer.encode_list(input.to_vec(), 128, &TruncationStrategy::LongestFirst, 0);
    let max_len = tokenized_input.iter().map(|input| input.token_ids.len()).max().unwrap();
    let mut tokenized_input = tokenized_input
        .iter()
        .map(|input| input.token_ids.clone())
        .map(|mut input| {
            input.extend(vec![0; max_len - input.len()]);
            input
        })
        .collect::<Vec<_>>();

    // Masking the token [thing] of sentence 1 and [oranges] of sentence 2
    tokenized_input[0][4] = 103;
    tokenized_input[1][6] = 103;
    let tokenized_input = tokenized_input
        .iter()
        .map(|input| Tensor::of_slice(&(input)))
        .collect::<Vec<_>>();
    let input_tensor = Tensor::stack(tokenized_input.as_slice(), 0).to(device);

    // Forward pass
    let (output, _, _) = no_grad(|| {
        distil_bert_model
            .forward_t(Some(input_tensor), None, None, false)
            .unwrap()
    });

    // Print masked tokens
    let index_1 = output.get(0).get(4).argmax(0, false);
    let index_2 = output.get(1).get(6).argmax(0, false);
    let word_1 = tokenizer.vocab().id_to_token(&index_1.int64_value(&[]));
    let word_2 = tokenizer.vocab().id_to_token(&index_2.int64_value(&[]));

    assert_eq!("person", word_1); // Outputs "person" : "Looks like one [person] is missing"
    assert_eq!("pear", word_2); // Outputs "pear" : "It\'s like comparing [pear] to apples"

    Ok(())
}
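
The padding block in the masked LM test right-pads every token id sequence with 0 up to the longest sequence in the batch before the tensors are stacked, and id 103 is the [MASK] token in the standard BERT vocabulary. A self-contained sketch of that padding step (the function name is illustrative, not from the commit):

// Illustrative sketch: right-pad each sequence with 0 up to the batch maximum length.
fn pad_to_longest(mut batch: Vec<Vec<i64>>) -> Vec<Vec<i64>> {
    let max_len = batch.iter().map(|ids| ids.len()).max().unwrap_or(0);
    for ids in batch.iter_mut() {
        ids.extend(std::iter::repeat(0).take(max_len - ids.len()));
    }
    batch
}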
18 changes: 11 additions & 7 deletions utils/download-dependencies_distilbert.py
@@ -18,12 +18,17 @@
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(weights_path)

-os.makedirs(target_path, exist_ok=True)
-shutil.copy(temp_config, target_path / 'config.json')
-shutil.copy(temp_vocab, target_path / 'vocab.txt')
-shutil.copy(temp_weights, target_path / 'model.bin')
-
-weights = torch.load(temp_weights)
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'vocab.txt')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    nps[k] = v.cpu().numpy()
@@ -36,5 +41,4 @@
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()

subprocess.call(
-    ['cargo', '+nightly', 'run', '--bin=convert-tensor', f'--manifest-path={toml_location}', '--', source,
-     target])
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
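
Two behavioural changes in this script: torch.load now passes map_location='cpu' so the weights can be loaded on CPU-only CI machines, and the conversion step calls plain cargo run (with %-formatting instead of an f-string), so the nightly toolchain is no longer required for the conversion.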
18 changes: 11 additions & 7 deletions utils/download-dependencies_sst2_sentiment.py
@@ -18,12 +18,17 @@
temp_vocab = get_from_cache(vocab_path)
temp_weights = get_from_cache(weights_path)

-os.makedirs(target_path, exist_ok=True)
-shutil.copy(temp_config, target_path / 'config.json')
-shutil.copy(temp_vocab, target_path / 'vocab.txt')
-shutil.copy(temp_weights, target_path / 'model.bin')
-
-weights = torch.load(temp_weights)
+os.makedirs(str(target_path), exist_ok=True)
+
+config_path = str(target_path / 'config.json')
+vocab_path = str(target_path / 'vocab.txt')
+model_path = str(target_path / 'model.bin')
+
+shutil.copy(temp_config, config_path)
+shutil.copy(temp_vocab, vocab_path)
+shutil.copy(temp_weights, model_path)
+
+weights = torch.load(temp_weights, map_location='cpu')
nps = {}
for k, v in weights.items():
    nps[k] = v.cpu().numpy()
@@ -36,5 +41,4 @@
toml_location = (Path(__file__).resolve() / '..' / '..' / 'Cargo.toml').resolve()

subprocess.call(
-    ['cargo', '+nightly', 'run', '--bin=convert-tensor', f'--manifest-path={toml_location}', '--', source,
-     target])
+    ['cargo', 'run', '--bin=convert-tensor', '--manifest-path=%s' % toml_location, '--', source, target])
