Merge branch 'nod-ai:main' into merge-fixes
cqtqlyst authored Jun 23, 2023
2 parents 06980ea + 28f4d44 commit 396b70a
Showing 24 changed files with 1,149 additions and 124 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/test-models.yml
@@ -35,6 +35,8 @@ jobs:
include:
- os: ubuntu-latest
suite: lint
- os: MacStudio
suite: metal
exclude:
- os: ubuntu-latest
suite: vulkan
@@ -46,6 +48,8 @@ jobs:
suite: cuda
- os: MacStudio
suite: cpu
- os: MacStudio
suite: vulkan
- os: icelake
suite: vulkan
- os: icelake
@@ -125,15 +129,14 @@ jobs:
# python build_tools/stable_diffusion_testing.py --device=cuda
- name: Validate Vulkan Models (MacOS)
if: matrix.suite == 'vulkan' && matrix.os == 'MacStudio'
if: matrix.suite == 'metal' && matrix.os == 'MacStudio'
run: |
cd $GITHUB_WORKSPACE
PYTHON=python${{ matrix.python-version }} ./setup_venv.sh
source shark.venv/bin/activate
export DYLD_LIBRARY_PATH=/usr/local/lib/
echo $PATH
pip list | grep -E "torch|iree"
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k vulkan
pytest --ci --ci_sha=${SHORT_SHA} --local_tank_cache="/Volumes/builder/anush/shark_cache" --tank_url="gs://shark_tank/nightly/" -k metal
- name: Validate Vulkan Models (a100)
if: matrix.suite == 'vulkan' && matrix.os == 'a100'
4 changes: 3 additions & 1 deletion .gitignore
@@ -2,6 +2,8 @@
__pycache__/
*.py[cod]
*$py.class
*.mlir
*.vmfb

# C extensions
*.so
@@ -157,7 +159,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# vscode related
.vscode
32 changes: 22 additions & 10 deletions apps/language_models/scripts/vicuna.py
@@ -3,6 +3,10 @@
from apps.language_models.src.pipelines import vicuna_pipeline as vp
from apps.language_models.src.pipelines import vicuna_sharded_pipeline as vsp
import torch
import json

if __name__ == "__main__":
import gc


parser = argparse.ArgumentParser(
@@ -55,35 +59,38 @@
help="Run model in cli mode",
)

parser.add_argument(
"--config",
default=None,
help="configuration file",
)

if __name__ == "__main__":
args, unknown = parser.parse_known_args()

vic = None
if not args.sharded:
first_vic_mlir_path = (
Path(f"first_vicuna_{args.precision}.mlir")
None
if args.first_vicuna_mlir_path is None
else Path(args.first_vicuna_mlir_path)
)
second_vic_mlir_path = (
Path(f"second_vicuna_{args.precision}.mlir")
None
if args.second_vicuna_mlir_path is None
else Path(args.second_vicuna_mlir_path)
)
first_vic_vmfb_path = (
Path(
f"first_vicuna_{args.precision}_{args.device.replace('://', '_')}.vmfb"
)
None
if args.first_vicuna_vmfb_path is None
else Path(args.first_vicuna_vmfb_path)
)
second_vic_vmfb_path = (
Path(
f"second_vicuna_{args.precision}_{args.device.replace('://', '_')}.vmfb"
)
None
if args.second_vicuna_vmfb_path is None
else Path(args.second_vicuna_vmfb_path)
)

vic = vp.Vicuna(
"vicuna",
device=args.device,
@@ -95,16 +102,21 @@
load_mlir_from_shark_tank=args.load_mlir_from_shark_tank,
)
else:
if args.config is not None:
config_file = open(args.config)
config_json = json.load(config_file)
config_file.close()
else:
config_json = None
vic = vsp.Vicuna(
"vicuna",
device=args.device,
precision=args.precision,
config_json=config_json,
)
prompt_history = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
prologue_prompt = "ASSISTANT:\n"

import gc

while True:
# TODO: Add break condition from user input
user_prompt = input("User: ")
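
The sharded branch of vicuna.py now reads an optional JSON configuration (the new --config flag) before constructing vsp.Vicuna. A minimal sketch of that pattern, assuming only what the diff shows; it uses a context manager instead of the explicit open/close, and the schema of the config file is not specified here:

    import json

    def load_optional_config(path):
        """Return the parsed JSON config, or None when no --config was given."""
        if path is None:
            return None
        with open(path) as f:  # closes the file even if json.load raises
            return json.load(f)

    # e.g. config_json = load_optional_config(args.config)
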
74 changes: 73 additions & 1 deletion apps/language_models/src/model_wrappers/vicuna_sharded_model.py
@@ -145,7 +145,7 @@ def forward(


class ShardedVicunaModel(torch.nn.Module):
def __init__(self, model, layers0, layers1):
def __init__(self, model, layers0, layers1, lmhead, embedding, norm):
super().__init__()
self.model = model
assert len(layers0) == len(model.model.layers)
@@ -154,6 +154,12 @@ def __init__(self, layers0, layers1):
self.model.model.config.output_attentions = False
self.layers0 = layers0
self.layers1 = layers1
self.norm = norm
self.embedding = embedding
self.lmhead = lmhead
self.model.model.norm = self.norm
self.model.model.embed_tokens = self.embedding
self.model.lm_head = self.lmhead

def forward(
self,
@@ -176,3 +182,69 @@ def forward(
attention_mask=attention_mask,
past_key_values=past_key_values,
)


class LMHead(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model

def forward(self, hidden_states):
output = self.model(hidden_states)
return output


class LMHeadCompiled(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module

def forward(self, hidden_states):
hidden_states = hidden_states.detach()
output = self.model("forward", (hidden_states,))
output = torch.tensor(output)
return output


class VicunaNorm(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model

def forward(self, hidden_states):
output = self.model(hidden_states)
return output


class VicunaNormCompiled(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module

def forward(self, hidden_states):
hidden_states.detach()
output = self.model("forward", (hidden_states,))
output = torch.tensor(output)
return output


class VicunaEmbedding(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model

def forward(self, input_ids):
output = self.model(input_ids)
return output


class VicunaEmbeddingCompiled(torch.nn.Module):
def __init__(self, shark_module):
super().__init__()
self.model = shark_module

def forward(self, input_ids):
input_ids.detach()
output = self.model("forward", (input_ids,))
output = torch.tensor(output)
return output
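
The four *Compiled classes added above share one bridge pattern between PyTorch and a compiled SHARK module. A generalized sketch of that pattern, illustrative only (the diff deliberately keeps a separate class per component):

    import torch

    class CompiledWrapper(torch.nn.Module):
        """Detach the input from autograd, invoke the compiled module's
        "forward" entry point, and wrap the result back into a torch.Tensor
        so the surrounding PyTorch model can keep using it."""

        def __init__(self, shark_module):
            super().__init__()
            self.module = shark_module

        def forward(self, x):
            x = x.detach()                      # compiled module expects plain data
            out = self.module("forward", (x,))  # call by entry-point name, as in the diff
            return torch.tensor(out)
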
62 changes: 47 additions & 15 deletions apps/language_models/src/pipelines/vicuna_pipeline.py
@@ -28,11 +28,12 @@ def __init__(
max_num_tokens=512,
device="cuda",
precision="fp32",
first_vicuna_mlir_path=Path("first_vicuna.mlir"),
second_vicuna_mlir_path=Path("second_vicuna.mlir"),
first_vicuna_vmfb_path=Path("first_vicuna.vmfb"),
second_vicuna_vmfb_path=Path("second_vicuna.vmfb"),
first_vicuna_mlir_path=None,
second_vicuna_mlir_path=None,
first_vicuna_vmfb_path=None,
second_vicuna_vmfb_path=None,
load_mlir_from_shark_tank=True,
low_device_memory=False,
) -> None:
super().__init__(model_name, hf_model_path, max_num_tokens)
self.max_sequence_length = 256
@@ -42,9 +43,30 @@ def __init__(
self.second_vicuna_vmfb_path = second_vicuna_vmfb_path
self.first_vicuna_mlir_path = first_vicuna_mlir_path
self.second_vicuna_mlir_path = second_vicuna_mlir_path
self.load_mlir_from_shark_tank = load_mlir_from_shark_tank
self.low_device_memory = low_device_memory
self.first_vic = None
self.second_vic = None
if self.first_vicuna_mlir_path == None:
self.first_vicuna_mlir_path = self.get_model_path()
if self.second_vicuna_mlir_path == None:
self.second_vicuna_mlir_path = self.get_model_path("second")
if self.first_vicuna_vmfb_path == None:
self.first_vicuna_vmfb_path = self.get_model_path(suffix="vmfb")
if self.second_vicuna_vmfb_path == None:
self.second_vicuna_vmfb_path = self.get_model_path(
"second", "vmfb"
)
self.tokenizer = self.get_tokenizer()
self.shark_model = self.compile()
self.load_mlir_from_shark_tank = load_mlir_from_shark_tank

def get_model_path(self, model_number="first", suffix="mlir"):
safe_device = "_".join(self.device.split("-"))
if suffix == "mlir":
return Path(f"{model_number}_vicuna_{self.precision}.{suffix}")
return Path(
f"{model_number}_vicuna_{self.precision}_{safe_device}.{suffix}"
)

def get_tokenizer(self):
tokenizer = AutoTokenizer.from_pretrained(
@@ -69,7 +91,7 @@ def compile_first_vicuna(self):
# Compilation path needs some more work before it is functional

print(
f"[DEBUG] vmfb not found at {self.first_vicuna_vmfb_path.absolute()}. Trying to work with"
f"[DEBUG] vmfb not found at {self.first_vicuna_vmfb_path.absolute()}. Trying to work with\n"
f"[DEBUG] mlir path { self.first_vicuna_mlir_path} {'exists' if self.first_vicuna_mlir_path.exists() else 'does not exist'}"
)
if self.first_vicuna_mlir_path.exists():
@@ -418,12 +440,19 @@ def generate(self, prompt, cli=False):
# TODO: refactor for cleaner integration
import gc

if not self.low_device_memory:
if self.first_vic == None:
self.first_vic = self.compile_first_vicuna()
if self.second_vic == None:
self.second_vic = self.compile_second_vicuna()
res = []
res_tokens = []
params = {
"prompt": prompt,
"is_first": True,
"fv": self.compile_first_vicuna(),
"fv": self.compile_first_vicuna()
if self.first_vic == None
else self.first_vic,
}

generated_token_op = self.generate_new_token(params=params)
@@ -439,18 +468,20 @@ def generate(self, prompt, cli=False):
print(f"Assistant: {detok}", end=" ", flush=True)

# Clear First Vic from Memory (main and cuda)
del params
torch.cuda.empty_cache()
gc.collect()
if self.low_device_memory:
del params
torch.cuda.empty_cache()
gc.collect()

sec_vic = self.compile_second_vicuna()
for _ in range(self.max_num_tokens - 2):
params = {
"prompt": None,
"is_first": False,
"logits": logits,
"pkv": pkv,
"sv": sec_vic,
"sv": self.compile_second_vicuna()
if self.second_vic == None
else self.second_vic,
}

generated_token_op = self.generate_new_token(params=params)
@@ -471,9 +502,10 @@ def generate(self, prompt, cli=False):
res.append(detok)
if cli:
print(f"{detok}", end=" ", flush=True)
del sec_vic, pkv, logits
torch.cuda.empty_cache()
gc.collect()
if self.device == "cuda":
del sec_vic, pkv, logits
torch.cuda.empty_cache()
gc.collect()

for i in range(len(res_tokens)):
if type(res_tokens[i]) != int:
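
The new get_model_path helper above decides where the pipeline looks for its default MLIR and VMFB artifacts when no explicit paths are given. A stand-alone rendition for illustration, with precision and device passed in as arguments; the example values below are hypothetical, not taken from the diff:

    from pathlib import Path

    def get_model_path(precision, device, model_number="first", suffix="mlir"):
        # Same naming scheme as Vicuna.get_model_path: MLIR names omit the
        # device, VMFB names append it with "-" replaced by "_".
        safe_device = "_".join(device.split("-"))
        if suffix == "mlir":
            return Path(f"{model_number}_vicuna_{precision}.{suffix}")
        return Path(f"{model_number}_vicuna_{precision}_{safe_device}.{suffix}")

    print(get_model_path("fp16", "metal"))                    # first_vicuna_fp16.mlir
    print(get_model_path("fp16", "metal", "second", "vmfb"))  # second_vicuna_fp16_metal.vmfb
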