diff --git a/CHANGELOG.md b/CHANGELOG.md
index 15875aa8..0143edf9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
 All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
+
+## [0.22.0] - 2024-01-20
 ## Added
 - Addition of `new_with_tokenizer` constructor for `SentenceEmbeddingsModel` allowing passing custom tokenizers for sentence embeddings pipelines.
 - Support for [Tokenizers](https://github.com/huggingface/tokenizers) in pipelines, allowing loading `tokenizer.json` and `special_token_map.json` tokenizer files.
diff --git a/Cargo.toml b/Cargo.toml
index 162cf097..78cc82c5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rust-bert"
-version = "0.21.0"
+version = "0.22.0"
 authors = ["Guillaume Becquin <guillaume.becquin@gmail.com>"]
 edition = "2018"
 description = "Ready-to-use NLP pipelines and language models"
@@ -86,19 +86,19 @@
 half = "2"
 regex = "1.6"
 cached-path = { version = "0.6", default-features = false, optional = true }
-dirs = { version = "4", optional = true }
+dirs = { version = "5", optional = true }
 lazy_static = { version = "1", optional = true }
 ort = {version="~1.15.2", optional = true, default-features = false, features = ["half"]}
 ndarray = {version="0.15", optional = true}
-tokenizers = {version="0.13.3", optional=true, default-features = false, features = ["onig"]}
+tokenizers = {version="0.15", optional=true, default-features = false, features = ["onig"]}
 
 [dev-dependencies]
 anyhow = "1"
 csv = "1"
-criterion = "0.4"
-tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] }
+criterion = "0.5"
+tokio = { version = "1.35", features = ["sync", "rt-multi-thread", "macros"] }
 torch-sys = "0.14.0"
 tempfile = "3"
-itertools = "0.10"
+itertools = "0.12"
 tracing-subscriber = { version = "0.3", default-features = false, features = [ "env-filter", "fmt" ] }
-ort = {version="~1.15.2", features = ["load-dynamic"]}
\ No newline at end of file
+ort = {version="~1.15.5", features = ["load-dynamic"]}
\ No newline at end of file
diff --git a/src/pipelines/sentence_embeddings/config.rs b/src/pipelines/sentence_embeddings/config.rs
index f6f6cb2b..9c8b2cec 100644
--- a/src/pipelines/sentence_embeddings/config.rs
+++ b/src/pipelines/sentence_embeddings/config.rs
@@ -309,7 +309,7 @@ impl Config for SentenceEmbeddingsModulesConfig {}
 
 impl SentenceEmbeddingsModulesConfig {
     pub fn validate(self) -> Result<Self, RustBertError> {
-        match self.get(0) {
+        match self.first() {
             Some(SentenceEmbeddingsModuleConfig {
                 module_type: SentenceEmbeddingsModuleType::Transformer,
                 ..
@@ -347,7 +347,7 @@ impl SentenceEmbeddingsModulesConfig {
     }
 
     pub fn transformer_module(&self) -> &SentenceEmbeddingsModuleConfig {
-        self.get(0).as_ref().unwrap()
+        self.first().as_ref().unwrap()
     }
 
     pub fn pooling_module(&self) -> &SentenceEmbeddingsModuleConfig {
diff --git a/tests/gpt_j.rs b/tests/gpt_j.rs
index f2efc1d4..0cd8be84 100644
--- a/tests/gpt_j.rs
+++ b/tests/gpt_j.rs
@@ -7,7 +7,6 @@ use rust_bert::resources::{load_weights, RemoteResource, ResourceProvider};
 use rust_bert::Config;
 use rust_tokenizers::tokenizer::{Gpt2Tokenizer, Tokenizer};
 use rust_tokenizers::vocab::Vocab;
-use std::convert::TryFrom;
 use tch::{nn, Device, Kind, Tensor};
 
 /// Equivalent Python code:
@@ -107,7 +106,7 @@ fn gpt_j_correctness() -> anyhow::Result<()> {
         Tensor::from_slice(
             &input
                 .iter()
-                .map(|&e| i64::try_from(e != pad_token).unwrap())
+                .map(|&e| i64::from(e != pad_token))
                 .collect::<Vec<_>>(),
         )
         .to(device)
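
Note on the `self.get(0)` -> `self.first()` changes in `config.rs`: the two calls are equivalent (both return `Option<&T>` and never panic), so this resolves clippy's `get_first` lint without changing behavior. A minimal standalone sketch of the equivalence, using made-up module names rather than rust-bert's actual config types:

```rust
fn main() {
    let modules = vec!["transformer", "pooling"];

    // `get(0)` and `first()` both return Option<&T>; clippy's `get_first`
    // lint prefers `first()` because it states the intent directly.
    assert_eq!(modules.get(0), modules.first());

    match modules.first() {
        Some(module) => println!("first module: {module}"),
        None => println!("empty module list"),
    }
}
```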
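Note on the `tests/gpt_j.rs` change: `e != pad_token` is a `bool`, and the integer types implement `From<bool>` (`false` -> 0, `true` -> 1), so the infallible `i64::from` replaces `i64::try_from(...).unwrap()` and makes the `std::convert::TryFrom` import unnecessary. A minimal sketch of the attention-mask construction with hypothetical token ids, leaving out the `Tensor::from_slice` step from the test:

```rust
fn main() {
    let pad_token: i64 = 50256; // hypothetical pad id
    let input: Vec<i64> = vec![464, 3290, 50256, 50256]; // two tokens, two pads

    // bool -> i64 via the infallible From impl: non-pad positions become 1,
    // pad positions become 0.
    let attention_mask: Vec<i64> = input
        .iter()
        .map(|&e| i64::from(e != pad_token))
        .collect();

    assert_eq!(attention_mask, vec![1, 1, 0, 0]);
}
```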