Commit

Merge branch 'main' into patch-1
guillaume-be authored Aug 18, 2024
2 parents dc0416d + 9af98f8 commit 8b545d3
Showing 228 changed files with 9,905 additions and 4,138 deletions.
35 changes: 31 additions & 4 deletions .github/workflows/continuous-integration.yml
@@ -1,8 +1,8 @@
on:
push:
-branches: [ master ]
+branches: [ main ]
pull_request:
-branches: [ master ]
+branches: [ main ]

name: Build

@@ -20,6 +20,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: build
+args: --features download-libtorch

build-no-defaults:
name: Build no defaults
@@ -34,7 +35,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: build
-args: --no-default-features
+args: --no-default-features --features download-libtorch

build-windows:
name: Build Windows
@@ -49,6 +50,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: build
+args: --features download-libtorch

build-mac-os:
name: Build macOS
@@ -63,6 +65,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: build
+args: --features download-libtorch

test-batch-0:
name: Integration tests (batch 0)
@@ -89,6 +92,7 @@ jobs:
--test fnet
--test deberta
--test deberta_v2
+--features download-libtorch

test-batch-1:
name: Integration tests (batch 1)
@@ -114,6 +118,7 @@ jobs:
--test longformer
--test pegasus
--test gpt_neo
+--features download-libtorch

test-batch-2:
name: Integration tests (batch 2)
@@ -132,6 +137,28 @@ jobs:
--test sentence_embeddings
--test longt5
--test gpt_j
+--test nllb
+--features download-libtorch

+test-opt-features:
+name: Integration tests (Optional features)
+runs-on: ubuntu-latest
+steps:
+- uses: actions/checkout@v2
+- uses: actions-rs/toolchain@v1
+with:
+profile: minimal
+toolchain: stable
+override: true
+- uses: actions-rs/cargo@v1
+with:
+command: test
+args: --package rust-bert
+--features onnx
+--features hf-tokenizers
+--test onnx
+--test hf_tokenizers
+--features download-libtorch

convert-model:
name: Model conversion test
@@ -147,7 +174,7 @@ jobs:
with:
python-version: '3.10'
- run: |
-pip install -r requirements.txt --progress-bar off
+pip install -r ./utils/requirements.txt --progress-bar off
python ./utils/download-dependencies_distilbert.py
fmt:
4 changes: 3 additions & 1 deletion .gitignore
@@ -17,4 +17,6 @@ Cargo.lock

/target
#**/*.rs.bk
-/resources/
+/models/
+/.venv/
+convert_model.log
44 changes: 42 additions & 2 deletions CHANGELOG.md
@@ -2,14 +2,54 @@
All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

## Changed
- (BREAKING) Upgraded to `torch` 2.2 (via `tch` 0.15.0).

## [0.22.0] - 2024-01-20
## Added
- Addition of a `new_with_tokenizer` constructor for `SentenceEmbeddingsModel`, allowing custom tokenizers to be passed to sentence embeddings pipelines.
- Support for [Tokenizers](https://github.com/huggingface/tokenizers) in pipelines, allowing `tokenizer.json` and `special_tokens_map.json` tokenizer files to be loaded.
- (BREAKING) Most model configurations can now take an optional `kind` parameter to specify the model weight precision. If not provided, this defaults to full precision on CPU, or to the serialized weights' precision otherwise.
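The precision-resolution rule described in the entry above can be sketched as follows. This is a minimal illustration, not the actual rust-bert implementation: `Kind`, `Device` and `resolve_kind` are local stand-ins (the real library uses the `tch` types of the same names).

```rust
// Stand-ins for tch::Kind and tch::Device, reduced to the cases we need.
#[derive(Clone, Copy, Debug, PartialEq)]
enum Kind {
    Float, // full (fp32) precision
    Half,  // fp16 precision
}

#[derive(Clone, Copy, Debug, PartialEq)]
enum Device {
    Cpu,
    Cuda,
}

// Hypothetical helper mirroring the documented default: an explicit `kind`
// wins; otherwise fall back to full precision on CPU, or keep the precision
// the weights were serialized with on other devices.
fn resolve_kind(requested: Option<Kind>, device: Device, serialized: Kind) -> Kind {
    match (requested, device) {
        (Some(kind), _) => kind,
        (None, Device::Cpu) => Kind::Float,
        (None, _) => serialized,
    }
}

fn main() {
    // No kind requested: CPU forces full precision, CUDA keeps the stored one.
    assert_eq!(resolve_kind(None, Device::Cpu, Kind::Half), Kind::Float);
    assert_eq!(resolve_kind(None, Device::Cuda, Kind::Half), Kind::Half);
    // An explicit kind always wins.
    assert_eq!(resolve_kind(Some(Kind::Half), Device::Cpu, Kind::Float), Kind::Half);
    println!("kind resolution ok");
}
```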

## Fixed
- (BREAKING) Fixed the keyword extraction pipeline for n-gram sizes > 2. Added a new configuration option `tokenizer_forbidden_ngram_chars` to specify characters that should be excluded from n-grams (allows filtering n-grams spanning multiple sentences).
- Improved MPS device compatibility by setting the `sparse_grad` flag to false for `gather` operations
- Updated ONNX runtime backend version to 1.15.x
- Issue with incorrect results for QA models with a tokenizer not using segment ids
- Issue with GPT-J that was incorrectly tracking the gradients for the attention bias

## Changed
- (BREAKING) Upgraded to `torch` 2.1 (via `tch` 0.14.0).
- (BREAKING) Text generation traits and pipelines (including conversation, summarization and translation) now return a `Result` for improved error handling
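The fallible-generation change above can be illustrated with a small sketch. The types below are simplified stand-ins, not the rust-bert API: a hypothetical `TextGenerationModel::generate` that returns a `Result` so callers propagate failures with `?` instead of hitting a panic inside the pipeline.

```rust
// Stand-in for rust-bert's error type (the real one has many more variants).
#[derive(Debug)]
enum RustBertError {
    ValueError(String),
}

// Stand-in model; the real pipelines load weights and run inference.
struct TextGenerationModel;

impl TextGenerationModel {
    // Generation now returns a Result instead of panicking on failure.
    fn generate(&self, prompts: &[&str]) -> Result<Vec<String>, RustBertError> {
        if prompts.is_empty() {
            return Err(RustBertError::ValueError("empty prompt batch".into()));
        }
        // Placeholder "generation": echo each prompt with a continuation marker.
        Ok(prompts.iter().map(|p| format!("{p} ...")).collect())
    }
}

fn main() -> Result<(), RustBertError> {
    let model = TextGenerationModel;
    // Errors propagate to the caller via `?` rather than unwinding.
    let output = model.generate(&["The quick brown fox"])?;
    println!("{}", output[0]);
    Ok(())
}
```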

## [0.21.0] - 2023-06-03
## Added
- Addition of the [LongT5](https://arxiv.org/abs/2112.07916) model architecture and pretrained weights.
- Addition of `add_tokens` and `add_extra_ids` interface methods to the `TokenizerOption`. Allows building most pipelines with a custom tokenizer via `new_with_tokenizer`.
- Addition of `get_tokenizer` and `get_tokenizer_mut` methods to all pipelines, allowing access to a (mutable) reference to the pipeline tokenizer.
- Addition of a `get_embedding_dim` method to get the dimension of the embeddings for sentence embeddings pipelines
- `get_vocab_size`, `get_decoder_start_token_id` and `get_prefix_and_forced_bos_id` for the `TokenizerOption` in pipelines
- Addition of the [GPT-J](https://www.eleuther.ai/artifacts/gpt-j) model architecture
- Addition of the [NLLB](https://arxiv.org/abs/2207.04672) model architecture and pretrained weights
- Addition of support for ONNX models (encoder, decoders, encoder-decoders) via the [ort](https://github.com/pykeio/ort) onnxruntime bindings
- Integration of ONNX models into the sequence classification, token classification, question answering, zero-shot classification, text generation, summarization and translation pipelines

## Changed
-- Bumped the tokenizers dependency from 7.x to 8.x, exposing additional options for special token mapping and adding the NLLBTokenizer.
+- Bumped the tokenizers dependency from 7.x to 8.x, exposing additional options for special token mapping and adding the NLLBTokenizer
- (BREAKING) Simplified the generation traits (removal of LMHeadModel and elimination of unnecessary specification for LanguageGenerator)
- (BREAKING) Upgraded to `torch` 2.0 (via `tch` 0.13.0). The process to automatically download the dependencies has changed; it must now be enabled via the `download-libtorch` feature flag.
- Read the `decoder_start_token_id` from the provided configuration rather than using a hard-coded default value
- (BREAKING) Changed the return type of the `LanguageGenerator` and pipelines functions `float`, `half`, `set_device` to `Result<(), RustBertError>` as these become fallible for ONNX models
- (BREAKING) Wrapped the model resources specification for the pipeline `Config` objects into an `Enum` to allow handling both torch-based and ONNX models.
The `model_resources` field now needs to be wrapped in the corresponding enum variant, e.g. `model_resources: ModelResources::TORCH(model_resource)` for Torch-based models
- (BREAKING) Added the `forced_bos_token_id` and `forced_eos_token_id` fields to text generation models.
If these are not None, they force the generation of a BOS token at the first position and an EOS token at the `max_length` position (aligns with the Pytorch Transformers library)
- Project structure refactoring (torch-based models moved under common module). Non-breaking change via re-exports.
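The `ModelResources` enum wrapping described in the list above can be sketched with stand-in types. This is an illustration of the pattern only: `LocalResource`, `GenerationConfig` and `backend_name` below are simplified local definitions, not the rust-bert originals (only the `ModelResources::TORCH`/`ONNX` variant names follow the entry above).

```rust
// Stand-in for rust-bert's LocalResource (a path to model weights).
struct LocalResource {
    path: String,
}

// Wrapping the resource in an enum lets a single config field describe
// either a torch-based or an ONNX model, as the changelog entry explains.
#[allow(non_camel_case_types)]
enum ModelResources {
    TORCH(LocalResource),
    ONNX(LocalResource),
}

// Stand-in pipeline config holding the wrapped resource.
struct GenerationConfig {
    model_resources: ModelResources,
}

// Hypothetical helper: dispatch on the enum variant to pick a backend.
fn backend_name(config: &GenerationConfig) -> &'static str {
    match &config.model_resources {
        ModelResources::TORCH(_) => "torch",
        ModelResources::ONNX(_) => "onnx",
    }
}

fn main() {
    // Torch-based models now wrap their resource in the TORCH variant,
    // e.g. `model_resources: ModelResources::TORCH(model_resource)`.
    let config = GenerationConfig {
        model_resources: ModelResources::TORCH(LocalResource {
            path: "model.ot".to_string(),
        }),
    };
    println!("{}", backend_name(&config)); // prints "torch"
}
```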

## Fixed
- MIN/MAX computation for float-like types (was set to infinity instead of the type's min/max)
- Removed the (unused) pooler from the set of weights for the BERT Masked LM architecture

## [0.20.0] - 2023-01-21
## Added
@@ -412,4 +452,4 @@ All notable changes to this project will be documented in this file. The format

- Tensor conversion tools from Pytorch to Libtorch format
- DistilBERT model architecture
-- Ready-to-use `SentimentClassifier` using a DistilBERT model fine-tuned on SST2
+- Ready-to-use `SentimentClassifier` using a DistilBERT model fine-tuned on SST2
67 changes: 56 additions & 11 deletions Cargo.toml
@@ -1,13 +1,14 @@
[package]
name = "rust-bert"
-version = "0.20.1-alpha"
+version = "0.22.0"
authors = ["Guillaume Becquin <[email protected]>"]
edition = "2018"
description = "Ready-to-use NLP pipelines and language models"
repository = "https://github.com/guillaume-be/rust-bert"
documentation = "https://docs.rs/rust-bert"
license = "Apache-2.0"
readme = "README.md"
+build = "build.rs"
keywords = [
"nlp",
"deep-learning",
@@ -60,34 +61,78 @@ harness = false
opt-level = 3

[features]
-default = ["remote"]
+default = ["remote", "default-tls"]
doc-only = ["tch/doc-only"]
all-tests = []
remote = ["cached-path", "dirs", "lazy_static"]
+download-libtorch = ["tch/download-libtorch"]
+onnx = ["ort", "ndarray"]
+rustls-tls = ["cached-path/rustls-tls"]
+default-tls = ["cached-path/default-tls"]
+hf-tokenizers = ["tokenizers"]

[package.metadata.docs.rs]
features = ["doc-only"]

[dependencies]
-rust_tokenizers = "8.0.0"
-tch = "~0.10.1"
+rust_tokenizers = "8.1.1"
+tch = { version = "0.16.0", features = ["download-libtorch"] }
serde_json = "1"
serde = { version = "1", features = ["derive"] }
-ordered-float = "3"
+ordered-float = "4.2.0"
uuid = { version = "1", features = ["v4"] }
thiserror = "1"
half = "2"
regex = "1.6"

-cached-path = { version = "0.6", optional = true }
-dirs = { version = "4", optional = true }
+cached-path = { version = "0.6", default-features = false, optional = true }
+dirs = { version = "5", optional = true }
lazy_static = { version = "1", optional = true }
+ort = { version = "1.16.3", optional = true, default-features = false, features = [
+"half",
+] }
+ndarray = { version = "0.15", optional = true }
+tokenizers = { version = "0.19.1", optional = true, default-features = false, features = [
+"onig",
+] }

[dev-dependencies]
anyhow = "1"
csv = "1"
-criterion = "0.4"
-tokio = { version = "1.24", features = ["sync", "rt-multi-thread", "macros"] }
-torch-sys = "=0.10.0"
+criterion = "0.5"
+tokio = { version = "1.35", features = ["sync", "rt-multi-thread", "macros"] }
tempfile = "3"
-itertools = "0.10"
+itertools = "0.13.0"
+tracing-subscriber = { version = "0.3", default-features = false, features = [
+"env-filter",
+"fmt",
+] }
+ort = { version = "1.16.3", features = ["load-dynamic"] }

+[[example]]
+name = "onnx-masked-lm"
+required-features = ["onnx"]
+
+[[example]]
+name = "onnx-question-answering"
+required-features = ["onnx"]
+
+[[example]]
+name = "onnx-sequence-classification"
+required-features = ["onnx"]
+
+[[example]]
+name = "onnx-text-generation"
+required-features = ["onnx"]
+
+[[example]]
+name = "onnx-token-classification"
+required-features = ["onnx"]
+
+[[example]]
+name = "onnx-translation"
+required-features = ["onnx"]
+
+[[example]]
+name = "generation_gpt2_hf_tokenizers"
+required-features = ["hf-tokenizers"]
