ROCm support #252

Open · wants to merge 4 commits into base: main
31 changes: 31 additions & 0 deletions docker-compose.rocm.yml
@@ -0,0 +1,31 @@
version: "3.9"
services:
  refact_self_hosted:
    # TODO: figure out how to pass the GPU to docker builds, so there is no need to install deepspeed at runtime
    command: >
      /bin/bash -c 'pip install deepspeed --no-cache-dir
      && python -m self_hosting_machinery.watchdog.docker_watchdog'
    image: refact_self_hosting_rocm
    build:
      dockerfile: rocm.Dockerfile
@takov751 commented on Jan 1, 2024

    build:
+      context: .
      dockerfile: rocm.Dockerfile

This was the only issue I found with this build so far :D I am testing it right now, just waiting for the models to download.

After some building and testing I encountered a bigger issue:

refact_self_hosted_1  | -- 11 -- 20240102 00:08:39 MODEL STATUS loading model
refact_self_hosted_1  | -- 11 -- 20240102 00:08:39 MODEL loading model local_files_only=1
refact_self_hosted_1  | -- 11 -- 20240102 00:08:40 MODEL Exllama kernel is not installed, reset disable_exllama to True. This may because you installed auto_gptq using a pre-build wheel on Windows, in which exllama_kernels are not compiled. To use exllama_kernels to further speedup inference, you can re-install auto_gptq from source.
refact_self_hosted_1  | -- 11 -- 20240102 00:08:40 MODEL CUDA kernels for auto_gptq are not installed, this will result in very slow inference speed. This may because:
refact_self_hosted_1  | -- 11 -- 1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
refact_self_hosted_1  | -- 11 -- 2. You are using pytorch without CUDA support.
refact_self_hosted_1  | -- 11 -- 3. CUDA and nvcc are not installed in your device.
refact_self_hosted_1  | -- 11 -- 20240102 00:08:40 MODEL lm_head not been quantized, will be ignored when make_quant.
refact_self_hosted_1  | -- 11 -- 20240102 00:08:40 MODEL CUDA extension not installed.

After some testing today I can say that, sadly, we need to wait a bit longer to make this happen. For example, flash_attention will probably only work from ROCm 5.7 once it gets a stable release. I saw that you tried some workarounds, but I believe they did not work due to ROCm library differences (a quick way to confirm which PyTorch build ended up in the container is the diagnostic sketch after this file's diff).

So far, even when it built and started, most of the time I just got a timeout error and the model was not loaded properly.

    shm_size: "32gb"
    devices:
      - "/dev/kfd"
      - "/dev/dri"
    group_add:
      - "video"
    security_opt:
      - seccomp:unconfined
    volumes:
      - perm_storage:/perm_storage
    ports:
      - 8008:8008
  nginx:
    image: nginx
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/conf.d/default.conf:ro

volumes:
  perm_storage:
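
The auto_gptq warnings quoted in the comment above ("CUDA extension not installed", "You are using pytorch without CUDA support") typically mean the container ended up with a CUDA-only or CPU-only PyTorch wheel rather than a ROCm build. A minimal diagnostic sketch, not part of this PR and using only stock torch attributes, to check which build is installed inside the container:

import torch

print("torch version:", torch.__version__)
# On ROCm wheels torch.version.hip is a version string; on CUDA-only wheels it is None.
print("hip runtime:", torch.version.hip)
# ROCm builds still expose GPUs through the torch.cuda API.
print("device visible:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device 0:", torch.cuda.get_device_name(0))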
68 changes: 68 additions & 0 deletions rocm.Dockerfile
@@ -0,0 +1,68 @@
FROM ocelot88/rocm-pytorch-slim:rocm-5.7.1-dev-torch-2.3
RUN apt-get update
RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y \
curl \
git \
htop \
tmux \
file \
vim \
expect \
mpich \
libmpich-dev \
python3 python3-pip \
&& rm -rf /var/lib/{apt,dpkg,cache,log}


RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1

# linguist requisites
RUN apt-get update
RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y \
expect \
ruby-full \
ruby-bundler \
build-essential \
cmake \
pkg-config \
libicu-dev \
zlib1g-dev \
libcurl4-openssl-dev \
libssl-dev
RUN git clone https://github.com/smallcloudai/linguist.git /tmp/linguist \
&& cd /tmp/linguist \
&& bundle install \
&& rake build_gem

ENV PATH="${PATH}:/tmp/linguist/bin"

RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y python3-packaging

ENV INSTALL_OPTIONAL=TRUE
ENV BUILD_CUDA_EXT=1
ENV USE_ROCM=1
ENV GITHUB_ACTIONS=true
ENV AMDGPU_TARGETS="gfx1030"
ENV FLASH_ATTENTION_FORCE_BUILD=TRUE
ENV MAX_JOBS=8
COPY . /tmp/app
RUN pip install --upgrade pip ninja packaging
RUN DEBIAN_FRONTEND=noninteractive apt-get install python3-mpi4py -y
ENV PYTORCH_ROCM_ARCH="gfx1030"
ENV ROCM_TARGET="gfx1030"
ENV ROCM_HOME=/opt/rocm-5.7.1
# TODO: remove this layer when https://github.com/TimDettmers/bitsandbytes/pull/756 is merged
RUN git clone https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6 && \
cd bitsandbytes-rocm-5.6 && \
make hip && pip install . && \
cd .. && rm -rf bitsandbytes-rocm-5.6
RUN pip install /tmp/app -v --no-build-isolation && rm -rf /tmp/app
RUN ln -s ${ROCM_HOME} /opt/rocm
ENV REFACT_PERM_DIR "/perm_storage"
ENV REFACT_TMP_DIR "/tmp"
ENV RDMAV_FORK_SAFE 0
ENV RDMAV_HUGEPAGES_SAFE 0

EXPOSE 8008

CMD ["python", "-m", "self_hosting_machinery.watchdog.docker_watchdog"]
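
The image above pins gfx1030 (RDNA2) through AMDGPU_TARGETS, PYTORCH_ROCM_ARCH and ROCM_TARGET, so other AMD GPUs need those values changed before building. A hypothetical helper, assuming rocminfo is on PATH and prints agent/ISA names containing a gfx identifier, to find the right value for the local card:

import re
import subprocess

def detect_gfx_targets():
    # rocminfo is expected to list GPU agents with arch strings such as "gfx1030"
    out = subprocess.check_output(["rocminfo"], text=True)
    return sorted(set(re.findall(r"\bgfx[0-9a-f]+\b", out)))

if __name__ == "__main__":
    targets = detect_gfx_targets()
    print("detected gfx targets:", targets or "none found (is this a ROCm host?)")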
37 changes: 36 additions & 1 deletion self_hosting_machinery/scripts/enum_gpus.py
@@ -9,6 +9,38 @@

from self_hosting_machinery import env

def query_rocm_smi():
    rocm_smi_output = "- no output -"
    descriptions = []
    try:
        rocm_smi_output = subprocess.check_output([
            "/opt/rocm/bin/rocm-smi",
            "--showbus",
            "--showproductname",
            "--showtemp",
            "--showmeminfo", "vram",
            "--json"])
        logging.info(rocm_smi_output)
        smi_output_dict = json.loads(rocm_smi_output)
        for gpu_id, props in smi_output_dict.items():
            descriptions.append({
                "id": props.get("PCI Bus"),
                "name": props.get("Card model", "AMD GPU"),
                "mem_used_mb": bytes_to_mb(int(props.get("VRAM Total Used Memory (B)", 0))),
                "mem_total_mb": bytes_to_mb(int(props.get("VRAM Total Memory (B)", 0))),
                "temp_celsius": props.get("Temperature (Sensor junction) (C)", -1),
            })
    except Exception:
        logging.warning("rocm-smi does not work, that's especially bad for initial setup.")
        logging.warning(traceback.format_exc())
        # log the raw output, which is defined even when check_output or json parsing fails
        logging.warning(f"output was:\n{rocm_smi_output}")

    return {"gpus": descriptions}

def bytes_to_mb(bytes_size):
    return bytes_size / (1024 ** 2)


def query_nvidia_smi():
    nvidia_smi_output = "- no output -"
@@ -42,7 +74,10 @@ def query_nvidia_smi():


def enum_gpus():
-    result = query_nvidia_smi()
+    if os.environ.get('USE_ROCM'):
+        result = query_rocm_smi()
+    else:
+        result = query_nvidia_smi()
    with open(env.CONFIG_ENUM_GPUS + ".tmp", 'w') as f:
        json.dump(result, f, indent=4)
    os.rename(env.CONFIG_ENUM_GPUS + ".tmp", env.CONFIG_ENUM_GPUS)
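query_rocm_smi() above assumes `rocm-smi --json` returns one object per card (card0, card1, …) with exactly the field labels it reads via props.get(...). An illustration of that assumed shape and the resulting GPU description — all values below are hypothetical, and labels can differ between ROCm versions:

# Illustration only: the rocm-smi --json shape that query_rocm_smi() expects.
sample = {
    "card0": {
        "PCI Bus": "0000:0B:00.0",
        "Card model": "AMD Radeon GPU",
        "VRAM Total Memory (B)": "17179869184",
        "VRAM Total Used Memory (B)": "289406976",
        "Temperature (Sensor junction) (C)": "41.0",
    }
}

descriptions = [
    {
        "id": props.get("PCI Bus"),
        "name": props.get("Card model", "AMD GPU"),
        "mem_used_mb": int(props.get("VRAM Total Used Memory (B)", 0)) / (1024 ** 2),
        "mem_total_mb": int(props.get("VRAM Total Memory (B)", 0)) / (1024 ** 2),
        "temp_celsius": props.get("Temperature (Sensor junction) (C)", -1),
    }
    for props in sample.values()
]
print(descriptions)  # one dict per card, memory converted to MB (276.0 used / 16384.0 total here)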
36 changes: 31 additions & 5 deletions setup.py
@@ -9,6 +9,7 @@

setup_package = os.environ.get("SETUP_PACKAGE", None)
install_optional = os.environ.get("INSTALL_OPTIONAL", "FALSE")
use_rocm = os.environ.get("USE_ROCM", "FALSE").upper() not in ("FALSE", "0", "")  # treat unset/FALSE/0 as "no ROCm"


@dataclass
@@ -44,12 +45,24 @@ class PyPackage:
    "self_hosting_machinery": PyPackage(
        requires=["aiohttp", "aiofiles", "cryptography", "fastapi==0.100.0", "giturlparse", "pydantic==1.10.13",
                  "starlette==0.27.0", "uvicorn", "uvloop", "python-multipart", "auto-gptq==0.4.2", "accelerate",
-                 "termcolor", "torch", "transformers==4.34.0", "bitsandbytes", "safetensors", "peft", "triton",
-                 "torchinfo", "mpi4py", "deepspeed==0.11.1"],
-       optional=["ninja", "flash_attn @ git+https://github.com/smallcloudai/flash-attention@feat/alibi"],
+                 "termcolor", "torch", "transformers==4.34.0", "bitsandbytes", "safetensors", "peft",
+                 "torchinfo"],
+       optional=["ninja"],
        requires_packages=["refact_scratchpads", "refact_scratchpads_no_gpu",
                           "known_models_db", "refact_data_pipeline"],
        data=["webgui/static/*", "webgui/static/js/*", "webgui/static/components/modals/*", "watchdog/watchdog.d/*"]),
    "rocm": PyPackage(
        requires=[
            # "bitsandbytes",  # TODO: bitsandbytes does not support ROCm yet, so we build it from source, see: https://github.com/TimDettmers/bitsandbytes/pull/756
            # "deepspeed",     # TODO: figure out how to install deepspeed at build time, see: docker-compose.rocm.yml
            # "flash_attn",    # TODO: flash_attn has limited GPU support on ROCm, see: https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm2
            "pytorch-triton-rocm",
        ]
    ),
    "cuda": PyPackage(
        requires=["mpi4py", "deepspeed==0.11.1", "triton"],
        optional=["flash_attn @ git+https://github.com/smallcloudai/flash-attention@feat/alibi"],
    ),
}


@@ -66,17 +79,30 @@ def find_required_packages(packages: Set[str]) -> Set[str]:
def get_install_requires(packages):
    install_requires = list({
        required_package
-       for py_package in packages.values()
+       for key, py_package in packages.items()
        for required_package in py_package.requires
+       if key not in ("rocm", "cuda")
    })
    if install_optional.upper() == "TRUE":
        install_requires.extend(list({
            required_package
-           for py_package in packages.values()
+           for key, py_package in packages.items()
            for required_package in py_package.optional
+           if key not in ("rocm", "cuda")
        }))
    install_requires.extend(get_runtime_dependent_dependencies(packages))
    return install_requires


def get_runtime_dependent_dependencies(packages):
    required = []
    runtime_key = "rocm" if use_rocm else "cuda"
    if use_rocm:
        required.extend(package for package in packages.get(runtime_key).requires)
    if install_optional.upper() == "TRUE":
        required.extend(package for package in packages.get(runtime_key).optional)
    return required



if setup_package is not None:
    if setup_package not in all_refact_packages:
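A toy walk-through of the dependency selection added above, mirroring get_runtime_dependent_dependencies() with made-up package lists: the ROCm requirements are only pulled in when USE_ROCM is set, while the CUDA-only extras flow through the INSTALL_OPTIONAL branch.

# Toy sketch of the runtime-dependent selection (not the real package lists).
toy_packages = {
    "rocm": {"requires": ["pytorch-triton-rocm"], "optional": []},
    "cuda": {"requires": ["mpi4py", "deepspeed==0.11.1", "triton"], "optional": ["flash_attn"]},
}

def runtime_deps(use_rocm, install_optional):
    key = "rocm" if use_rocm else "cuda"
    deps = []
    if use_rocm:              # mirrors the guard in get_runtime_dependent_dependencies()
        deps += toy_packages[key]["requires"]
    if install_optional:
        deps += toy_packages[key]["optional"]
    return deps

print(runtime_deps(use_rocm=True, install_optional=False))   # ['pytorch-triton-rocm']
print(runtime_deps(use_rocm=False, install_optional=True))   # ['flash_attn']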