I am trying to speed up inference using a quantized version of the llm2vec models. I have trained a Gemma-2B model on custom data. This is my inference code:
import time

import torch
from transformers import BitsAndBytesConfig
from llm2vec import LLM2Vec
# Subclass LLM2Vec so each input is wrapped in Gemma's chat template
# before tokenization.
class CustomModel(LLM2Vec):
    def prepare_for_tokenization(self, text):
        text = (
            "<start_of_turn>user " + text.strip() + "<end_of_turn> \n\n"
            + "<start_of_turn>model "
        )
        return text
def main():
    base_model = "path to base model"
    lora_model = "path to contrastive lora checkpoint"

    # 4-bit NF4 quantization with double quantization and bf16 compute.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load the quantized base model with the contrastive LoRA adapter.
    custom_model = CustomModel.from_pretrained(
        base_model,
        peft_model_name_or_path=lora_model,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        quantization_config=nf4_config,
    )

    # Encode queries as [instruction, query] pairs.
    instruction = (
        "Given a web search query, retrieve relevant passages that answer the query:"
    )
    queries = [
        [instruction, "How much protein should a female eat daily"],
    ]

    start = time.time()
    q_reps = custom_model.encode(queries)
    print(f"Time taken: {time.time() - start:.2f} seconds")


if __name__ == "__main__":
    main()
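For illustration, calling the prepare_for_tokenization override directly on the query text shows the Gemma-formatted string the model actually sees (verbatim, including the trailing spaces):

<start_of_turn>user How much protein should a female eat daily<end_of_turn> 

<start_of_turn>model 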
The error I am getting is:
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.61s/it]
Batches: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 59, in
main()
File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 54, in main
q_reps = custom_model.encode(queries)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/sandeep.pandey/.local/lib/python3.12/site-packages/llm2vec/llm2vec.py", line 403, in encode
all_embeddings = [result.get() for result in results]
^^^^^^^^^^^^
File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 774, in get
raise self._value
File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 540, in _handle_tasks
put(task)
File "/opt/miniconda/lib/python3.12/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/miniconda/lib/python3.12/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
AttributeError: Can't pickle local object 'add_hook_to_module.<locals>.new_forward'
Batches: 0%| | 0/1 [00:00<?, ?it/s]
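From the traceback, the failure is not in the quantization itself: llm2vec/llm2vec.py (line 403) collects results from a multiprocessing.Pool, and putting a task on that pool pickles the model. The hooks that accelerate attaches to each module when loading with device_map / 4-bit quantization are local closures (add_hook_to_module.<locals>.new_forward), and local closures cannot be pickled. A minimal sketch of a workaround, assuming (my reading of the library, not confirmed) that encode() only spawns the pool when it sees more than one CUDA device:

import os

# Hide all but one GPU *before* torch initializes CUDA, so encode()
# should take its single-process path instead of pickling the model
# into multiprocessing workers. Device index 0 is an assumption;
# pick whichever GPU you want to use.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # import only after the environment variable is set
from llm2vec import LLM2Vec  # the CustomModel subclass above is unchanged

Failing that, tokenizing the wrapped queries and running the model's forward pass directly in the main process would avoid pickling entirely, at the cost of re-doing the batching that encode() provides.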