Add support for Quantized StableLM-3B model
vivekkhandelwal1 committed Dec 7, 2023
1 parent 8caf874 · commit 0cbd292
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion apps/language_models/src/pipelines/stablelm_pipeline.py
@@ -73,6 +73,14 @@ def __init__(
         super().__init__(model_name, hf_model_path, max_num_tokens)
         self.max_sequence_len = 256
         self.device = device
+        if precision != "int4" and args.hf_auth_token is None:
+            raise ValueError(
+                """HF auth token required for StableLM-3B. Pass it using the
+                --hf_auth_token flag. You can request access to the model
+                here: https://huggingface.co/stabilityai/stablelm-3b-4e1t"""
+            )
+        self.hf_auth_token = args.hf_auth_token
+
         self.precision = precision
         self.debug = debug
         self.tokenizer = self.get_tokenizer()
@@ -86,11 +94,20 @@ def shouldStop(self, tokens):
         return False
 
     def get_src_model(self):
+        kwargs = {}
+        if self.precision == "int4":
+            self.hf_model_path = "yichunkuo/stablelm-3b-4e1t-gptq"
+            from transformers import GPTQConfig
+
+            quantization_config = GPTQConfig(bits=4, disable_exllama=True)
+            kwargs["quantization_config"] = quantization_config
+            kwargs["device_map"] = "cpu"
         model = AutoModelForCausalLM.from_pretrained(
             self.hf_model_path,
             trust_remote_code=True,
             torch_dtype=torch.float32,
-            use_auth_token="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",
+            use_auth_token=self.hf_auth_token,
+            **kwargs,
         )
         return model
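
For context, a minimal standalone sketch of the quantized load path this commit adds, assuming a transformers build with GPTQ support and the optimum/auto-gptq backends installed. The checkpoint name and the GPTQConfig arguments come from the diff above; everything else is illustrative:

import torch
from transformers import AutoModelForCausalLM, GPTQConfig

# Pre-quantized 4-bit GPTQ checkpoint referenced by the pipeline.
hf_model_path = "yichunkuo/stablelm-3b-4e1t-gptq"

# disable_exllama=True avoids the CUDA-only exllama kernels so the
# checkpoint can be loaded on CPU (matching device_map="cpu").
quantization_config = GPTQConfig(bits=4, disable_exllama=True)

model = AutoModelForCausalLM.from_pretrained(
    hf_model_path,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    quantization_config=quantization_config,
    device_map="cpu",
)

Note that the int4 path skips the new auth-token check, presumably because the pre-quantized checkpoint is not gated; every other precision loads the base StableLM-3B weights and therefore requires --hf_auth_token.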

