Add StreamingLLM support to studio2 chat #2060

Merged: 23 commits, Jan 19, 2024
Changes from 3 commits
48 changes: 38 additions & 10 deletions apps/shark_studio/api/llm.py
@@ -1,4 +1,5 @@
from turbine_models.custom_models import stateless_llama
from turbine_models.gen_external_params.gen_external_params import gen_external_params
import time
from shark.iree_utils.compile_utils import (
get_iree_compiled_module,
@@ -28,6 +29,7 @@
"system_prompt": """<s>[INST] <<SYS>>Be concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. <</SYS>>""",
},
}

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<s>", "</s>"

@@ -36,7 +38,6 @@ def append_user_prompt(history, input_prompt):
history += user_prompt
return history


def append_bot_prompt(history, input_prompt):
user_prompt = f"{B_SYS} {input_prompt}{E_SYS} {E_SYS}"
history += user_prompt
@@ -58,25 +59,50 @@ def __init__(
self.hf_model_name = llm_model_map[model_name]["hf_model_name"]
self.device = device.split("=>")[-1].strip()
self.driver = self.device.split("://")[0]
print(f" Selected {self.driver} as device driver")
self.precision = "fp32" if "cpu" in self.driver else "fp16"
print(f"Selected {self.driver} as device driver")
self.precision = "f32" if "cpu" in self.driver else "f16"
self.quantization = quantization
self.tempfile_name = get_resource_path(f"llm_{self.precision}_{self.quantization}.tempfile")
#TODO: find a programmatic solution for model arch spec instead of hardcoding llama2
self.file_spec = "_".join([
"llama2",
"streaming" if streaming_llm else "chat",
self.precision,
self.quantization,
])
self.tempfile_name = get_resource_path(f"{self.file_spec}.tempfile")
#TODO: Tag vmfb with target triple of device instead of HAL backend
self.vmfb_name = get_resource_path(f"llm_{self.precision}_{self.quantization}_{self.driver}.vmfb.tempfile")
self.safe_name = self.hf_model_name.strip("/").replace("/", "_")
self.vmfb_name = get_resource_path(f"{self.file_spec}_{self.driver}.vmfb.tempfile")
self.safe_name = self.hf_model_name.split("/")[-1].replace("-", "_")
self.max_tokens = llm_model_map[model_name]["max_tokens"]
self.iree_module_dict = None
self.external_weight_file = None
self.streaming_llm = streaming_llm
if external_weights is not None:
self.external_weight_file = get_resource_path(
self.safe_name + "." + external_weights
self.safe_name
+ "_" + self.precision
+ "_" + self.quantization
+ "." + external_weights
)
self.use_system_prompt = use_system_prompt
self.global_iter = 0
self.prev_token_len = 0

if self.external_weight_file is not None:
if not os.path.exists(self.external_weight_file):
print(
f"External weight file {self.external_weight_file} does not exist. Generating..."
)
gen_external_params(
hf_model_name=self.hf_model_name,
quantization=self.quantization,
weight_path=self.external_weight_file,
hf_auth_token=hf_auth_token,
precision=self.precision,
)
else:
print(
f"External weight file {self.external_weight_file} found for {self.vmfb_name}"
)
if os.path.exists(self.vmfb_name) and (
external_weights is None or os.path.exists(str(self.external_weight_file))
):
@@ -210,7 +236,7 @@ def format_out(results):
else:
if self.streaming_llm and self.iree_module_dict["vmfb"]["get_seq_step"]() > 600:
print("Evicting cache space!")
self.model["evict_kvcache_space"]()
self.iree_module_dict["vmfb"]["evict_kvcache_space"]()
device_inputs = [
ireert.asdevicearray(
self.iree_module_dict["config"].device,
@@ -222,7 +248,9 @@
total_time = time.time() - st_time
history.append(format_out(token))
self.prev_token_len = token_len + len(history)
yield self.tokenizer.decode(history), total_time
res = self.tokenizer.decode(history, skip_special_tokens=True)
prompt = append_bot_prompt(prompt, res)
yield prompt, total_time

if format_out(token) == llm_model_map["llama2_7b"]["stop_token"]:
break
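
For context on the streaming decode path added above: the generate loop checks `get_seq_step()` and calls `evict_kvcache_space()` on the compiled vmfb once the sequence step passes 600. The following is a minimal, self-contained sketch of that eviction pattern, assuming only a module exposing those two entry points; the `StubModule` class and `decode` helper are hypothetical stand-ins for `iree_module_dict["vmfb"]` and the real decode loop.

```python
# Minimal sketch of the StreamingLLM decode-loop eviction pattern.
# StubModule is a hypothetical stand-in for iree_module_dict["vmfb"];
# the threshold (600) and method names mirror the diff above.
class StubModule:
    def __init__(self):
        self.seq_step = 0

    def get_seq_step(self):
        return self.seq_step

    def evict_kvcache_space(self):
        # In the real module this frees rolling kv-cache slots.
        self.seq_step = 0

    def run_forward(self, token):
        self.seq_step += 1
        return token + 1  # placeholder "next token"


def decode(module, first_token, max_new_tokens=16, evict_threshold=600):
    token = first_token
    history = []
    for _ in range(max_new_tokens):
        # Mirror of the check in generate(): evict once the step count
        # grows past the threshold, then keep decoding.
        if module.get_seq_step() > evict_threshold:
            print("Evicting cache space!")
            module.evict_kvcache_space()
        token = module.run_forward(token)
        history.append(token)
    return history


if __name__ == "__main__":
    print(decode(StubModule(), first_token=0))
```
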
32 changes: 16 additions & 16 deletions apps/shark_studio/api/utils.py
@@ -140,22 +140,22 @@ def get_output_value(dev_dict):

def get_opt_flags(model, precision="fp16"):
iree_flags = []
if len(cmd_opts.iree_vulkan_target_triple) > 0:
iree_flags.append(
f"-iree-vulkan-target-triple={cmd_opts.iree_vulkan_target_triple}"
)
if "rocm" in cmd_opts.device:
from shark.iree_utils.gpu_utils import get_iree_rocm_args

rocm_args = get_iree_rocm_args()
iree_flags.extend(rocm_args)
if cmd_opts.iree_constant_folding == False:
iree_flags.append("--iree-opt-const-expr-hoisting=False")
iree_flags.append(
"--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
)
if cmd_opts.data_tiling == False:
iree_flags.append("--iree-opt-data-tiling=False")
# if len(cmd_opts.iree_vulkan_target_triple) > 0:
# iree_flags.append(
# f"-iree-vulkan-target-triple={cmd_opts.iree_vulkan_target_triple}"
# )
# if "rocm" in cmd_opts.device:
# from shark.iree_utils.gpu_utils import get_iree_rocm_args

# rocm_args = get_iree_rocm_args()
# iree_flags.extend(rocm_args)
# if cmd_opts.iree_constant_folding == False:
# iree_flags.append("--iree-opt-const-expr-hoisting=False")
# iree_flags.append(
# "--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
# )
# if cmd_opts.data_tiling == False:
# iree_flags.append("--iree-opt-data-tiling=False")

if "vae" not in model:
# Due to lack of support for multi-reduce, we always collapse reduction
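
For reference, the option-to-flag mapping that the commented-out block in `get_opt_flags` implemented, as a standalone sketch. The flag strings are copied from the diff; the `opts` namespace and its attribute values are illustrative stand-ins for `cmd_opts`, and the ROCm branch (which imports `get_iree_rocm_args`) is omitted to keep the sketch self-contained.

```python
# Standalone sketch of the flag assembly disabled above. Flag strings come
# from the diff; the opts object is an illustrative stand-in for cmd_opts.
from types import SimpleNamespace


def build_iree_flags(opts):
    flags = []
    if opts.iree_vulkan_target_triple:
        flags.append(
            f"-iree-vulkan-target-triple={opts.iree_vulkan_target_triple}"
        )
    if not opts.iree_constant_folding:
        flags.append("--iree-opt-const-expr-hoisting=False")
        flags.append(
            "--iree-codegen-linalg-max-constant-fold-elements=9223372036854775807"
        )
    if not opts.data_tiling:
        flags.append("--iree-opt-data-tiling=False")
    return flags


if __name__ == "__main__":
    opts = SimpleNamespace(
        iree_vulkan_target_triple="",  # illustrative; normally from cmd_opts
        iree_constant_folding=False,
        data_tiling=False,
    )
    print(build_iree_flags(opts))
```
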
97 changes: 1 addition & 96 deletions apps/shark_studio/web/ui/chat.py
@@ -13,9 +13,9 @@
LanguageModel,
)


def user(message, history):
# Append the user's message to the conversation history
#message = f"{B_INST} {message} {E_INST}"
return "", history + [[message, ""]]


@@ -76,101 +76,6 @@ def chat_fn(
yield history, f"Prefill: {prefill_time:.2f} seconds\n Decode: {tokens_per_sec:.2f} tokens/sec"


def llm_chat_api(InputData: dict):
return None
print(f"Input keys : {InputData.keys()}")
# print(f"model : {InputData['model']}")
is_chat_completion_api = (
"messages" in InputData.keys()
) # else it is the legacy `completion` api
# For Debugging input data from API
# if is_chat_completion_api:
# print(f"message -> role : {InputData['messages'][0]['role']}")
# print(f"message -> content : {InputData['messages'][0]['content']}")
# else:
# print(f"prompt : {InputData['prompt']}")
# print(f"max_tokens : {InputData['max_tokens']}") # Default to 128 for now
global vicuna_model
model_name = InputData["model"] if "model" in InputData.keys() else "codegen"
model_path = llm_model_map[model_name]
device = "cpu-task"
precision = "fp16"
max_toks = None if "max_tokens" not in InputData.keys() else InputData["max_tokens"]
if max_toks is None:
max_toks = 128 if model_name == "codegen" else 512

# make it work for codegen first
from apps.language_models.scripts.vicuna import (
UnshardedVicuna,
)

device_id = None
if vicuna_model == 0:
if "cuda" in device:
device = "cuda"
elif "sync" in device:
device = "cpu-sync"
elif "task" in device:
device = "cpu-task"
elif "vulkan" in device:
device_id = int(device.split("://")[1])
device = "vulkan"
else:
print("unrecognized device")

vicuna_model = UnshardedVicuna(
model_name,
hf_model_path=model_path,
device=device,
precision=precision,
max_num_tokens=max_toks,
download_vmfb=True,
load_mlir_from_shark_tank=True,
device_id=device_id,
)

# TODO: add role dict for different models
if is_chat_completion_api:
# TODO: add functionality for multiple messages
prompt = create_prompt(model_name, [(InputData["messages"][0]["content"], "")])
else:
prompt = InputData["prompt"]
print("prompt = ", prompt)

res = vicuna_model.generate(prompt)
res_op = None
for op in res:
res_op = op

if is_chat_completion_api:
choices = [
{
"index": 0,
"message": {
"role": "assistant",
"content": res_op, # since we are yeilding the result
},
"finish_reason": "stop", # or length
}
]
else:
choices = [
{
"text": res_op,
"index": 0,
"logprobs": None,
"finish_reason": "stop", # or length
}
]
end_time = dt.now().strftime("%Y%m%d%H%M%S%f")
return {
"id": end_time,
"object": "chat.completion" if is_chat_completion_api else "text_completion",
"created": int(end_time),
"choices": choices,
}


def view_json_file(file_obj):
content = ""
with open(file_obj.name, "r") as fopen:
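
For readers unfamiliar with the endpoint being stubbed out above (the PR makes `llm_chat_api` return `None` immediately), here is a rough sketch of the request and response shapes the old handler dealt with, inferred from the deleted code. All payload values are made up for illustration.

```python
# Illustrative request/response shapes for the stubbed-out llm_chat_api.
# Inferred from the deleted code above; values are placeholders.

# OpenAI-style chat-completion request (has a "messages" key).
chat_completion_request = {
    "model": "codegen",
    "messages": [{"role": "user", "content": "Write a hello-world in Python."}],
    "max_tokens": 128,
}

# Legacy completion request (has a "prompt" key instead).
legacy_completion_request = {
    "model": "codegen",
    "prompt": "Write a hello-world in Python.",
    "max_tokens": 128,
}

# Shape of the response the old handler built (chat-completion variant);
# the id/created fields were derived from a timestamp string.
example_response = {
    "id": "20240119120000000000",
    "object": "chat.completion",  # or "text_completion" for the legacy path
    "created": 20240119120000000000,
    "choices": [
        {
            "index": 0,
            "message": {"role": "assistant", "content": "print('hello world')"},
            "finish_reason": "stop",  # or "length"
        }
    ],
}
```
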