diff --git a/apps/language_models/scripts/stablelm.py b/apps/language_models/scripts/stablelm.py
index 223760374a..bb637ede64 100644
--- a/apps/language_models/scripts/stablelm.py
+++ b/apps/language_models/scripts/stablelm.py
@@ -49,7 +49,6 @@ def compile_stableLM(
 ):
     from shark.shark_inference import SharkInference
 
-    # device = "cuda"  # "cpu"
     # TODO: vmfb and mlir name should include precision and device
     vmfb_path = (
         Path(model_name + f"_{device}.vmfb")
@@ -129,14 +128,6 @@ def get_tokenizer():
     print("Sucessfully loaded the tokenizer to the memory")
     return tok
 
-
-# sharkStableLM = compile_stableLM
-# (
-#    None,
-#    tuple([input_ids, attention_mask]),
-#    "stableLM_linalg_f32_seqLen256",
-#    "/home/shark/vivek/stableLM_shark_f32_seqLen256"
-# )
 def generate(
     new_text,
     max_new_tokens,
@@ -148,18 +139,8 @@
     # Construct the input message string for the model by
    # concatenating the current system message and conversation history
     # Tokenize the messages string
-    # sharkStableLM = compile_stableLM
-    # (
-    #    None,
-    #    tuple([input_ids, attention_mask]),
-    #    "stableLM_linalg_f32_seqLen256",
-    #    "/home/shark/vivek/stableLM_shark_f32_seqLen256"
-    # )
     words_list = []
     for i in range(max_new_tokens):
-        # numWords = len(new_text.split())
-        # if(numWords>220):
-        #     break
         params = {
             "new_text": new_text,
         }
@@ -188,7 +169,6 @@ def generate_new_token(shark_model, tokenizer, params):
         return_tensors="pt",
     )
     sum_attentionmask = torch.sum(model_inputs.attention_mask)
-    # sharkStableLM = compile_stableLM(None, tuple([input_ids, attention_mask]), "stableLM_linalg_f32_seqLen256", "/home/shark/vivek/stableLM_shark_f32_seqLen256")
     output = shark_model(
         "forward", [model_inputs.input_ids, model_inputs.attention_mask]
     )
diff --git a/apps/language_models/src/pipelines/stablelm_pipeline.py b/apps/language_models/src/pipelines/stablelm_pipeline.py
index d765b72fce..b54a94b7c6 100644
--- a/apps/language_models/src/pipelines/stablelm_pipeline.py
+++ b/apps/language_models/src/pipelines/stablelm_pipeline.py
@@ -63,7 +63,6 @@ def compile(self):
             f"stableLM_linalg_{self.precision}_seqLen{self.max_sequence_len}"
         )
 
-        # device = "cuda"  # "cpu"
         # TODO: vmfb and mlir name should include precision and device
         model_vmfb_name = None
         vmfb_path = (
@@ -120,7 +119,6 @@ def compile(self):
     def get_tokenizer(self):
         tok = AutoTokenizer.from_pretrained(self.hf_model_path)
         tok.add_special_tokens({"pad_token": ""})
-        # print("[DEBUG] Sucessfully loaded the tokenizer to the memory")
         return tok
 
     def generate(self, prompt):