diff --git a/src/lib.rs b/src/lib.rs
index 45f7ef06..3363374e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -91,7 +91,7 @@
 //! ### Manual installation (recommended)
 //!
 //! 1. Download `libtorch` from <https://pytorch.org/get-started/locally/>. This package requires `v2.2`: if this version is no longer available on the "get started" page,
-//! the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.0%2Bcu121.zip` for a Linux version with CUDA12.
+//!     the file should be accessible by modifying the target link, for example `https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.2.0%2Bcu121.zip` for a Linux version with CUDA 12.
 //! 2. Extract the library to a location of your choice
 //! 3. Set the following environment variables
 //! ##### Linux:
diff --git a/src/models/albert/mod.rs b/src/models/albert/mod.rs
index a2a48c54..53e13d36 100644
--- a/src/models/albert/mod.rs
+++ b/src/models/albert/mod.rs
@@ -16,6 +16,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `BertTokenizer` using a `vocab.txt` vocabulary
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/bart/bart_model.rs b/src/models/bart/bart_model.rs
index cdc23d36..42fb1794 100644
--- a/src/models/bart/bart_model.rs
+++ b/src/models/bart/bart_model.rs
@@ -369,7 +369,7 @@ fn _shift_tokens_right(input_ids: &Tensor, pad_token_id: i64) -> Tensor {
 /// It is made of the following blocks:
 /// - `encoder`: `BartEncoder` (transformer) made of a vector of encoding layers
 /// - `decoder`: `BartDecoder` (transformer)  made of a vector of decoding layers with self attention and encoder cross-attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 /// - `pad_token_id`: padding token id
 pub struct BartModel {
     pub(crate) encoder: BartEncoder,
@@ -437,7 +437,7 @@ impl BartModel {
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
     ///
@@ -597,7 +597,7 @@ impl BartForConditionalGeneration {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
@@ -798,7 +798,7 @@ impl BartForSequenceClassification {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
diff --git a/src/models/bart/mod.rs b/src/models/bart/mod.rs
index 09631991..a6e51b1f 100644
--- a/src/models/bart/mod.rs
+++ b/src/models/bart/mod.rs
@@ -11,6 +11,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `RobertaTokenizer` using a `vocab.txt` vocabulary and `merges.txt` 2-gram merges
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/bert/mod.rs b/src/models/bert/mod.rs
index 7a6672b5..df2ad44a 100644
--- a/src/models/bert/mod.rs
+++ b/src/models/bert/mod.rs
@@ -16,6 +16,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `BertTokenizer` using a `vocab.txt` vocabulary
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/deberta/mod.rs b/src/models/deberta/mod.rs
index ced39a9b..f5fbf624 100644
--- a/src/models/deberta/mod.rs
+++ b/src/models/deberta/mod.rs
@@ -12,6 +12,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `DebertaTokenizer` using a `vocab.json` vocabulary and `merges.txt` merges file
+//!
 //! Pretrained models for a number of language pairs are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/deberta_v2/mod.rs b/src/models/deberta_v2/mod.rs
index b90486b8..45431d66 100644
--- a/src/models/deberta_v2/mod.rs
+++ b/src/models/deberta_v2/mod.rs
@@ -12,6 +12,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `DebertaV2Tokenizer` using a `spiece.model` SentencePiece model file
+//!
 //! Pretrained models for a number of language pairs are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/distilbert/mod.rs b/src/models/distilbert/mod.rs
index 9f389097..29f6ddce 100644
--- a/src/models/distilbert/mod.rs
+++ b/src/models/distilbert/mod.rs
@@ -14,6 +14,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `BertTokenizer` using a `vocab.txt` vocabulary
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/electra/mod.rs b/src/models/electra/mod.rs
index f24a9662..fae28fde 100644
--- a/src/models/electra/mod.rs
+++ b/src/models/electra/mod.rs
@@ -19,6 +19,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `BertTokenizer` using a `vocab.txt` vocabulary
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/fnet/mod.rs b/src/models/fnet/mod.rs
index f6046b3e..5209f24e 100644
--- a/src/models/fnet/mod.rs
+++ b/src/models/fnet/mod.rs
@@ -14,6 +14,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `FNetTokenizer` using a `spiece.model` SentencePiece (BPE) model file
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/gpt2/mod.rs b/src/models/gpt2/mod.rs
index 4e048ff0..c5c0bf42 100644
--- a/src/models/gpt2/mod.rs
+++ b/src/models/gpt2/mod.rs
@@ -11,6 +11,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `Gpt2Tokenizer` using a `vocab.txt` vocabulary and `merges.txt` 2-gram merges
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/longt5/longt5_model.rs b/src/models/longt5/longt5_model.rs
index 69dff618..1cdb821b 100644
--- a/src/models/longt5/longt5_model.rs
+++ b/src/models/longt5/longt5_model.rs
@@ -174,7 +174,7 @@ impl From<&LongT5Config> for T5Config {
 /// It is made of the following blocks:
 /// - `encoder`: `T5Stack` (transformer) made of a vector of encoding layers
 /// - `decoder`: `T5Stack` (transformer)  made of a vector of decoding layers with self attention and encoder cross-attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 /// - `embeddings`: `nn::Embedding` Shared embeddings for the encoder and decoder.
 pub struct LongT5Model {
     pub(crate) encoder: LongT5Stack,
@@ -248,7 +248,7 @@ impl LongT5Model {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). This or `input_embeds` must be provided.
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). This or `decoder_input_embeds` must be provided.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `input_embeds` - Optional input tensor of shape (*batch size*, *source_sequence_length*, *embeddings dimension*). This or `input_ids` must be provided.
@@ -436,7 +436,7 @@ impl LongT5ForConditionalGeneration {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). This or `input_embeds` must be provided.
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). This or `decoder_input_embeds` must be provided.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `input_embeds` - Optional input tensor of shape (*batch size*, *source_sequence_length*, *embeddings dimension*). This or `input_ids` must be provided.
diff --git a/src/models/m2m_100/m2m_100_model.rs b/src/models/m2m_100/m2m_100_model.rs
index c86ce604..ff5a08aa 100644
--- a/src/models/m2m_100/m2m_100_model.rs
+++ b/src/models/m2m_100/m2m_100_model.rs
@@ -126,7 +126,7 @@ fn _shift_tokens_right(
 /// It is made of the following blocks:
 /// - `encoder`: `M2M100Encoder` (transformer) made of a vector of encoding layers
 /// - `decoder`: `M2M100Decoder` (transformer)  made of a vector of decoding layers with self attention and encoder cross-attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 /// - `pad_token_id`: padding token id
 pub struct M2M100Model {
     pub(crate) encoder: M2M100Encoder,
@@ -197,7 +197,7 @@ impl M2M100Model {
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
     ///
@@ -365,7 +365,7 @@ impl M2M100ForConditionalGeneration {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
diff --git a/src/models/m2m_100/mod.rs b/src/models/m2m_100/mod.rs
index 850b920e..36e56a74 100644
--- a/src/models/m2m_100/mod.rs
+++ b/src/models/m2m_100/mod.rs
@@ -12,6 +12,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `M2M100Tokenizer` using a `config.json` vocabulary and a `spiece.model` SentencePiece BPE model
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/marian/marian_model.rs b/src/models/marian/marian_model.rs
index 5368037c..bee21e25 100644
--- a/src/models/marian/marian_model.rs
+++ b/src/models/marian/marian_model.rs
@@ -579,7 +579,7 @@ impl MarianForConditionalGeneration {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
diff --git a/src/models/mbart/mbart_model.rs b/src/models/mbart/mbart_model.rs
index 0de1d0b5..02abe007 100644
--- a/src/models/mbart/mbart_model.rs
+++ b/src/models/mbart/mbart_model.rs
@@ -229,7 +229,7 @@ impl MBartClassificationHead {
 /// It is made of the following blocks:
 /// - `encoder`: `MBartEncoder` (transformer) made of a vector of encoding layers
 /// - `decoder`: `MBartDecoder` (transformer)  made of a vector of decoding layers with self attention and encoder cross-attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 /// - `pad_token_id`: padding token id
 pub struct MBartModel {
     pub(crate) encoder: MBartEncoder,
@@ -297,7 +297,7 @@ impl MBartModel {
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
     ///
@@ -470,7 +470,7 @@ impl MBartForConditionalGeneration {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
@@ -621,7 +621,7 @@ impl MBartForSequenceClassification {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
diff --git a/src/models/mbart/mod.rs b/src/models/mbart/mod.rs
index 93ebcff9..6df59058 100644
--- a/src/models/mbart/mod.rs
+++ b/src/models/mbart/mod.rs
@@ -11,6 +11,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `MBart50Tokenizer` using a `spiece.model` SentencePiece model
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/mobilebert/mod.rs b/src/models/mobilebert/mod.rs
index 92e3a6a1..74be7f0b 100644
--- a/src/models/mobilebert/mod.rs
+++ b/src/models/mobilebert/mod.rs
@@ -13,6 +13,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `BertTokenizer` using a `vocab.txt` vocabulary
+//!
 //! Pretrained models for a number of language pairs are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/openai_gpt/mod.rs b/src/models/openai_gpt/mod.rs
index e97a16c0..0a0f12f9 100644
--- a/src/models/openai_gpt/mod.rs
+++ b/src/models/openai_gpt/mod.rs
@@ -10,6 +10,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `GptTokenizer` using a `vocab.txt` vocabulary and `merges.txt` 2-gram merges
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/pegasus/mod.rs b/src/models/pegasus/mod.rs
index 67876891..3bae6aad 100644
--- a/src/models/pegasus/mod.rs
+++ b/src/models/pegasus/mod.rs
@@ -11,6 +11,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `PegasusTokenizer` using a `spiece.model` vocabulary and unigram model.
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/pegasus/pegasus_model.rs b/src/models/pegasus/pegasus_model.rs
index 4fd9b6ba..52e816da 100644
--- a/src/models/pegasus/pegasus_model.rs
+++ b/src/models/pegasus/pegasus_model.rs
@@ -87,7 +87,7 @@ fn _shift_tokens_right(
 /// It is made of the following blocks:
 /// - `encoder`: `PegasusEncoder` (transformer) made of a vector of encoding layers
 /// - `decoder`: `PegasusDecoder` (transformer)  made of a vector of decoding layers with self attention and encoder cross-attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 pub struct PegasusModel {
     pub(crate) encoder: PegasusEncoder,
     decoder: PegasusDecoder,
@@ -152,7 +152,7 @@ impl PegasusModel {
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
     ///
@@ -322,7 +322,7 @@ impl PegasusForConditionalGeneration {
     /// * `input_ids` - Optional input tensor of shape (*batch size*, *source_sequence_length*). Must be provided when not running in generation mode
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
diff --git a/src/models/prophetnet/prophetnet_model.rs b/src/models/prophetnet/prophetnet_model.rs
index e27aaf27..6082a267 100644
--- a/src/models/prophetnet/prophetnet_model.rs
+++ b/src/models/prophetnet/prophetnet_model.rs
@@ -224,7 +224,7 @@ impl ProphetNetModel {
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_hidden_states` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) corresponding to pre-calculated encoder hidden states (useful for conditional generation)
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `old_layer_states` - Optional Vector `Option<Vec<Option<&LayerState>, Option<&LayerState>>>` of length *n_layer* containing tuples with the past keys and values for both the self attention and the encoder cross attention of each layer of the decoder.
     /// * `decoder_input_embeds` - Optional input tensor of shape (*batch size*, *target_sequence_length*, *embeddings dimension*). This or `decoder_input_ids` must be provided.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
@@ -431,7 +431,7 @@ impl ProphetNetForConditionalGeneration {
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). Must be provided when running in generation mode (e.g. initialized with a BOS token)
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `encoder_hidden_states` - Optional tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) corresponding to pre-calculated encoder hidden states (useful for conditional generation)
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `old_layer_states` - Optional Vector `Option<Vec<Option<&LayerState>, Option<&LayerState>>>` of length *n_layer* containing tuples with the past keys and values for both the self attention and the encoder cross attention of each layer of the decoder.
     /// * `decoder_input_embeds` - Optional input tensor of shape (*batch size*, *target_sequence_length*, *embeddings dimension*). This or `decoder_input_ids` must be provided.
     /// * `train` - boolean flag to turn on/off the dropout layers in the model. Should be set to false for inference.
diff --git a/src/models/reformer/mod.rs b/src/models/reformer/mod.rs
index 5a076771..021b24a4 100644
--- a/src/models/reformer/mod.rs
+++ b/src/models/reformer/mod.rs
@@ -11,6 +11,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `ReformerTokenizer` using a `spiece.model` BPE model
+//!
 //! Pretrained models on "Crime and Punishment" (Dostoevsky) are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/reformer/reformer_model.rs b/src/models/reformer/reformer_model.rs
index 4958a522..b3bbb75c 100644
--- a/src/models/reformer/reformer_model.rs
+++ b/src/models/reformer/reformer_model.rs
@@ -221,7 +221,7 @@ pub struct PaddedReformerInput {
 /// It is made of the following blocks:
 /// - `embeddings`: `ReformerEmbeddings` Reformer embeddings, combining word and position embeddings
 /// - `encoder`: `ReformerEncoder` (transformer) made of a vector of Reformer layer with local or LSH attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 /// - `least_common_mult_chunk_length`: least common chunk length for all attention layers
 /// - `min_chunk_length`: minimum chunk length for all attention layers
 /// - `pad_token_id`: padding token id used to pad to chunk length multiple if input is long enough to be chunked.
diff --git a/src/models/roberta/mod.rs b/src/models/roberta/mod.rs
index 1471f334..5e5e1b7b 100644
--- a/src/models/roberta/mod.rs
+++ b/src/models/roberta/mod.rs
@@ -15,6 +15,7 @@
 //! - Configuration file expected to have a structure following the [Transformers library](https://github.com/huggingface/transformers)
 //! - Model weights are expected to have a structure and parameter names following the [Transformers library](https://github.com/huggingface/transformers). A conversion using the Python utility scripts is required to convert the `.bin` weights to the `.ot` format.
 //! - `RobertaTokenizer` using a `vocab.txt` vocabulary and `merges.txt` 2-gram merges
+//!
 //! Pretrained models are available and can be downloaded using RemoteResources.
 //!
 //! ```no_run
diff --git a/src/models/t5/t5_model.rs b/src/models/t5/t5_model.rs
index c715815e..c09d7683 100644
--- a/src/models/t5/t5_model.rs
+++ b/src/models/t5/t5_model.rs
@@ -237,7 +237,7 @@ impl Default for T5Config {
 /// It is made of the following blocks:
 /// - `encoder`: `T5Stack` (transformer) made of a vector of encoding layers
 /// - `decoder`: `T5Stack` (transformer)  made of a vector of decoding layers with self attention and encoder cross-attention.
-/// caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values)
+///     Caching is implemented for the decoder to avoid recalculating static states (encoder key/values and previously calculated decoder key/values).
 /// - `embeddings`: `nn::Embedding` Shared embeddings for the encoder and decoder.
 pub struct T5Model {
     pub(crate) encoder: T5Stack,
@@ -312,7 +312,7 @@ impl T5Model {
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). This or `decoder_input_embeds` must be provided.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `input_embeds` - Optional input tensor of shape (*batch size*, *source_sequence_length*, *embeddings dimension*). This or `input_ids` must be provided.
     /// * `decoder_input_embeds` - Optional input tensor of shape (*batch size*, *target_sequence_length*, *embeddings dimension*). This or `decoder_input_ids` must be provided.
@@ -509,7 +509,7 @@ impl T5ForConditionalGeneration {
     /// * `attention_mask` - Optional attention mask of shape (*batch size*, *source_sequence_length*) for the encoder positions. Positions with a mask with value 0 will be masked.
     /// * `decoder_input_ids` - Optional input tensor of shape (*batch size*, *target_sequence_length*). This or `decoder_input_embeds` must be provided.
     /// * `encoder_outputs` - Optional tuple made of a tensor of shape (*batch size*, *source_sequence_length*, *encoder_hidden_dim*) and optional vectors of tensors of length *num_encoder_layers* with shape (*batch size*, *source_sequence_length*, *hidden_size*).
-    /// These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
+    ///     These correspond to the encoder last hidden state and optional hidden states/attention weights for encoder layers. When provided, the encoder hidden state will not be recalculated. Useful for generation tasks.
     /// * `decoder_attention_mask` - Optional attention mask of shape (*batch size*, *target_sequence_length*) for the decoder positions. Positions with a mask with value 0 will be masked.
     /// * `input_embeds` - Optional input tensor of shape (*batch size*, *source_sequence_length*, *embeddings dimension*). This or `input_ids` must be provided.
     /// * `decoder_input_embeds` - Optional input tensor of shape (*batch size*, *target_sequence_length*, *embeddings dimension*). This or `decoder_input_ids` must be provided.
diff --git a/src/pipelines/conversation.rs b/src/pipelines/conversation.rs
index 60dc6981..7918d525 100644
--- a/src/pipelines/conversation.rs
+++ b/src/pipelines/conversation.rs
@@ -421,6 +421,7 @@ impl Conversation {
     /// # Arguments
     /// - texts: sequence of strings, alternating between past user inputs and past generated responses.
     /// - ids: sequence of sequence of ids, alternating between past user inputs and past generated responses.
+    ///
     /// These can be generated via a `ConversationModel`'s `encode_prompts`.
     ///
     /// # Example:
diff --git a/src/pipelines/keywords_extraction/stopwords.rs b/src/pipelines/keywords_extraction/stopwords.rs
index 74b874bf..3fbf823d 100644
--- a/src/pipelines/keywords_extraction/stopwords.rs
+++ b/src/pipelines/keywords_extraction/stopwords.rs
@@ -1,3 +1,4 @@
+#[allow(clippy::doc_lazy_continuation)]
 /// BSD 3-Clause License
 ///
 /// Copyright (c) 2007-2022 The scikit-learn developers.
diff --git a/src/pipelines/ner.rs b/src/pipelines/ner.rs
index 5c02ec8f..cca38a8e 100644
--- a/src/pipelines/ner.rs
+++ b/src/pipelines/ner.rs
@@ -23,7 +23,7 @@
 //! All resources for this model can be downloaded using the Python utility script included in this repository.
 //! 1. Set-up a Python virtual environment and install dependencies (in ./requirements.txt)
 //! 2. Run the conversion script python /utils/download-dependencies_bert_ner.py.
-//! The dependencies will be downloaded to the user's home directory, under ~/rustbert/bert-ner
+//!     The dependencies will be downloaded to the user's home directory, under ~/rustbert/bert-ner
 //!
 //! The example below illustrate how to run the model for the default English NER model
 //! ```no_run
diff --git a/src/pipelines/onnx/mod.rs b/src/pipelines/onnx/mod.rs
index ed3c6f84..2b0b6c21 100644
--- a/src/pipelines/onnx/mod.rs
+++ b/src/pipelines/onnx/mod.rs
@@ -7,7 +7,7 @@
 //! installation is to use dynamic linking by pointing to an existing library location:
 //! - Use the `load-dynamic` cargo feature for `ort`
 //! - set the `ORT_DYLIB_PATH` to point to the location of downloaded onnxruntime library (`onnxruntime.dll`/`libonnxruntime.so`/`libonnxruntime.dylib`
-//! depending on the operating system). These can be downloaded from the [release page](https://github.com/microsoft/onnxruntime/releases) of the onnxruntime project
+//!     depending on the operating system). These can be downloaded from the [release page](https://github.com/microsoft/onnxruntime/releases) of the onnxruntime project
 //!
 //! For troubleshooting  issues when using an ONNX model, it is recommended to add the `tracing-subscriber = { version = "0.3", default-features = false, features = [ "env-filter", "fmt" ] }`
 //! dependency, and use the `tracing_subscriber::fmt::init();` instruction in the `main` binary.
diff --git a/src/pipelines/text_generation.rs b/src/pipelines/text_generation.rs
index 773d641f..11ec09bc 100644
--- a/src/pipelines/text_generation.rs
+++ b/src/pipelines/text_generation.rs
@@ -25,8 +25,8 @@
 //! Two APIs exist to build text generation models:
 //! - `TextGenerationModel` is a high-level module that exposes text generation capabilities with a set of reasonable defaults
 //! - the `LanguageGenerator` trait exposes lower-level text generation capabilities allowing the user to provide additional
-//! generation options when building the model (via `GenerateConfig`) and at each query (via `GenerateOptions`). Please check the
-//! [`generation_utils` module](../generation_utils/index.html) for more details
+//!     generation options when building the model (via `GenerateConfig`) and at each query (via `GenerateOptions`). Please check the
+//!     [`generation_utils` module](../generation_utils/index.html) for more details
 //!
 //!
 //! Customized text generation models models can be loaded by overwriting the resources in the configuration.
diff --git a/src/pipelines/translation/translation_builder.rs b/src/pipelines/translation/translation_builder.rs
index a8ebd07e..bede8094 100644
--- a/src/pipelines/translation/translation_builder.rs
+++ b/src/pipelines/translation/translation_builder.rs
@@ -30,12 +30,12 @@ enum ModelSize {
 /// The logic for selecting the most appropriate model is as follows:
 /// - If not specified, the model will be executed on a CUDA device if available, otherwise on the CPU
 /// - If the model type is specified (e.g. `Marian`), a model with this architecture will be created. The compatibility of the model
-/// with the source and target languages will be verified, and the builder will error if the settings provided are not supported.
+///     with the source and target languages will be verified, and the builder will error if the settings provided are not supported.
 /// - If the model size is specified, a model of the corresponding size class (computational budget) will be created. The compatibility of the model
-/// with the source and target languages will be verified, and the builder will error if the settings provided are not supported.
+///     with the source and target languages will be verified, and the builder will error if the settings provided are not supported.
 /// - If no source or target languages are provided, a multilingual M2M100 model will be returned
 /// - If no model type is provided, an average sized-model (Marian) will be returned if a pretrained model exists that covers the requested source/target languages provided.
-/// Otherwise a M2M100 multi-lingual model will be returned.
+///     Otherwise, an M2M100 multilingual model will be returned.
 ///
 /// The options for the builder are provided with dedicated "builder function", the call to `create_model()` creates a model
 /// from the builder.