I am trying to speed up inference using a quantized version of the llm2vec models. I have trained a Gemma-2B model on custom data. This is my inference code:
import time

import torch
from transformers import BitsAndBytesConfig
from llm2vec import LLM2Vec
# Subclass LLM2Vec so each input is wrapped in Gemma's chat template
# before tokenization.
class CustomModel(LLM2Vec):
    def prepare_for_tokenization(self, text):
        text = (
            "<start_of_turn>user " + text.strip() + "<end_of_turn> \n\n"
            + "<start_of_turn>model "
        )
        return text
def main():
    base_model = "path to base model"
    lora_model = "path to contrastive lora checkpoint"

    # 4-bit NF4 quantization with double quantization and bf16 compute.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load the quantized base model with the contrastive LoRA adapter.
    custom_model = CustomModel.from_pretrained(
        base_model,
        peft_model_name_or_path=lora_model,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        quantization_config=nf4_config,
    )

    # Encode queries as [instruction, query] pairs.
    instruction = (
        "Given a web search query, retrieve relevant passages that answer the query:"
    )
    queries = [
        [instruction, "How much protein should a female eat daily"],
    ]

    start = time.time()
    q_reps = custom_model.encode(queries)
    print(f"Time taken: {time.time() - start:.2f} seconds")


if __name__ == "__main__":
    main()
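For illustration, calling the prepare_for_tokenization override directly on the query text shows the Gemma-formatted string the model actually sees (verbatim, including the trailing spaces):

<start_of_turn>user How much protein should a female eat daily<end_of_turn> 

<start_of_turn>model 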
The error I am getting is:
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.61s/it]
Batches: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 59, in
main()
File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 54, in main
q_reps = custom_model.encode(queries)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/sandeep.pandey/.local/lib/python3.12/site-packages/llm2vec/llm2vec.py", line 403, in encode
all_embeddings = [result.get() for result in results]
^^^^^^^^^^^^
File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 774, in get
raise self._value
File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 540, in _handle_tasks
put(task)
File "/opt/miniconda/lib/python3.12/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/miniconda/lib/python3.12/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
AttributeError: Can't pickle local object 'add_hook_to_module.<locals>.new_forward'
Batches: 0%| | 0/1 [00:00<?, ?it/s]
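From the traceback, the failure is not in the quantization itself: llm2vec/llm2vec.py (line 403) collects results from a multiprocessing.Pool, and putting a task on that pool pickles the model. The hooks that accelerate attaches to each module when loading with device_map / 4-bit quantization are local closures (add_hook_to_module.<locals>.new_forward), and local closures cannot be pickled. A minimal sketch of a workaround, assuming (my reading of the library, not confirmed) that encode() only spawns the pool when it sees more than one CUDA device:

import os

# Hide all but one GPU *before* torch initializes CUDA, so encode()
# should take its single-process path instead of pickling the model
# into multiprocessing workers. Device index 0 is an assumption;
# pick whichever GPU you want to use.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # import only after the environment variable is set
from llm2vec import LLM2Vec  # the CustomModel subclass above is unchanged

Failing that, tokenizing the wrapped queries and running the model's forward pass directly in the main process would avoid pickling entirely, at the cost of re-doing the batching that encode() provides.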