mpi : add support for distributed inference via MPI (ggerganov#2099)
* MPI support, first cut

* fix warnings, update README

* fixes

* wrap includes

* PR comments

* Update CMakeLists.txt

* Add GH workflow, fix test

* Add info to README

* mpi : trying to move more MPI stuff into ggml-mpi (WIP) (ggerganov#2099)

* mpi : add names for layer inputs + prep ggml_mpi_graph_compute()

* mpi : move all MPI logic into ggml-mpi

Not tested yet

* mpi : various fixes - communication now works but results are wrong

* mpi : fix output tensor after MPI compute (still not working)

* mpi : fix inference

* mpi : minor

* Add OpenMPI to GH action

* [mpi] continue-on-error: true

* mpi : fix after master merge

* [mpi] Link MPI C++ libraries to fix OpenMPI

* tests : fix new llama_backend API

* [mpi] use MPI_INT32_T

* mpi : factor out recv / send in functions and reuse

* mpi : extend API to allow usage with outer backends (e.g. Metal)

---------

Co-authored-by: Georgi Gerganov <[email protected]>
evanmiller and ggerganov authored Jul 10, 2023
1 parent 1d16309 commit 5656d10
Showing 18 changed files with 460 additions and 35 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/build.yml
@@ -104,6 +104,40 @@ jobs:
          cd build
          ctest --verbose --timeout 900

  ubuntu-latest-cmake-mpi:
    runs-on: ubuntu-latest

    continue-on-error: true

    strategy:
      matrix:
        mpi_library: [mpich, libopenmpi-dev]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ${{ matrix.mpi_library }}
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake -DLLAMA_MPI=ON ..
          cmake --build . --config Release
      - name: Test
        id: cmake_test
        run: |
          cd build
          ctest --verbose
  macOS-latest-make:
    runs-on: macos-latest

1 change: 1 addition & 0 deletions .gitignore
@@ -20,6 +20,7 @@ build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
24 changes: 24 additions & 0 deletions CMakeLists.txt
@@ -75,6 +75,7 @@ option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -308,6 +309,28 @@ if (LLAMA_METAL)
)
endif()

if (LLAMA_MPI)
    cmake_minimum_required(VERSION 3.10)
    find_package(MPI)
    if (MPI_C_FOUND)
        message(STATUS "MPI found")
        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
        add_compile_definitions(GGML_USE_MPI)
        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
        set(cxx_flags ${cxx_flags} -Wno-cast-qual)
        set(c_flags ${c_flags} -Wno-cast-qual)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
        # Even if you're only using the C header, C++ programs may bring in MPI
        # C++ functions, so more linkage is needed
        if (MPI_CXX_FOUND)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
        endif()
    else()
        message(WARNING "MPI not found")
    endif()
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -476,6 +499,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}
${GGML_SOURCES_MPI}
${GGML_SOURCES_EXTRA}
)

9 changes: 9 additions & 0 deletions Makefile
@@ -147,6 +147,15 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE

ifdef LLAMA_MPI
	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
	OBJS += ggml-mpi.o

ggml-mpi.o: ggml-mpi.c ggml-mpi.h
	$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifdef LLAMA_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
39 changes: 39 additions & 0 deletions README.md
@@ -268,6 +268,45 @@ Any value larger than 0 will offload the computation to the GPU. For example:
./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
```

### MPI Build

MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.

First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
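
For example, on Debian/Ubuntu or on macOS with Homebrew, installation might look like the following (package names are illustrative and vary by platform and version):

```bash
# Debian/Ubuntu: pick one MPI implementation
sudo apt install libopenmpi-dev   # OpenMPI
# sudo apt install mpich          # or MPICH

# macOS with Homebrew
brew install open-mpi             # or: brew install mpich
```
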
Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):

- Using `make`:

```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```

- Using `CMake`:

```bash
cmake -S . -B build -DLLAMA_MPI=ON
```

Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
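
One way to do this (a sketch only; the host name and destination directory below are placeholders for your own setup) is to copy the converted weights over SSH:

```bash
# placeholders: "malvolio.local" and the llama.cpp path are examples only
ssh malvolio.local 'mkdir -p llama.cpp/models/7B'
rsync -av ./models/7B/ggml-model-q4_0.bin malvolio.local:llama.cpp/models/7B/
```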

Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
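
If you haven't set up key-based SSH before, it is usually a matter of generating a key on the primary host and installing it on each worker (the hostnames below are just examples):

```bash
# on the primary host: create a key if you don't already have one
ssh-keygen -t ed25519

# install the public key on every machine that appears in the hostfile
ssh-copy-id 192.168.0.1
ssh-copy-id malvolio.local
```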

Here is an example hostfile:

```
192.168.0.1:2
malvolio.local:1
```
The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.

Finally, you're ready to run a computation using `mpirun`:

```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```
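
Before involving the whole cluster, it can be worth confirming the MPI-enabled binary runs with a single local process, for example:

```bash
# single-process sanity check; all layers stay on the local machine
mpirun -n 1 ./main -m ./models/7B/ggml-model-q4_0.bin -p "Hello" -n 16
```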

### BLAS Build

Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
2 changes: 1 addition & 1 deletion examples/embd-input/embd-input-lib.cpp
@@ -34,7 +34,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
4 changes: 3 additions & 1 deletion examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -93,5 +93,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/main/main.cpp
@@ -105,7 +105,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -671,5 +671,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/perplexity/perplexity.cpp
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
params.prompt = gpt_random_prompt(rng);
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -172,5 +172,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/quantize/quantize.cpp
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
usage(argv[0]);
}

llama_init_backend(false);
llama_backend_init(false);

// parse command line arguments
const std::string fname_inp = argv[arg_idx];
@@ -257,5 +257,7 @@ int main(int argc, char ** argv) {
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
}

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/server/server.cpp
@@ -1079,7 +1079,7 @@ int main(int argc, char **argv)
params.model_alias = params.model;
}

llama_init_backend(params.numa);
llama_backend_init(params.numa);

LOG_INFO("build info", {{"build", BUILD_NUMBER},
{"commit", BUILD_COMMIT}});
@@ -1309,5 +1309,7 @@ int main(int argc, char **argv)
return 1;
}

llama_backend_free();

return 0;
}
4 changes: 3 additions & 1 deletion examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
// Init LLM :
//---------------------------------

llama_init_backend(params.numa);
llama_backend_init(params.numa);

llama_model * model;
llama_context * ctx;
@@ -173,6 +173,8 @@ int main(int argc, char ** argv)
llama_free( ctx );
llama_free_model( model );

llama_backend_free();

return 0;
}

1 change: 1 addition & 0 deletions ggml-metal.m
@@ -450,6 +450,7 @@ void ggml_metal_graph_compute(
//}

switch (dst->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE: