Update docs, pin REST API requirements, use modelId for LiteLLM
These changes address pull request feedback by adding comments for inter-file dependencies and
updating the README file. They also modify the LiteLLM configuration to use the user-provided
modelId if it exists, falling back to the model name otherwise. This allows users to deploy the
same model and weights in two different containers.
petermuller committed May 31, 2024
1 parent ed42a82 commit 2ae1596
Showing 8 changed files with 86 additions and 18 deletions.
58 changes: 58 additions & 0 deletions README.md
@@ -88,6 +88,64 @@ We also will need `.safetensors`. In order to reduce the startup time we will do

Note: we have primarily designed and tested this with HuggingFace models in mind. Any models outside of this format will require you to create and upload safetensors manually.

### LiteLLM Configuration

With the models that are hosted using the process above, we automatically add them to a LiteLLM configuration,
utilizing the [OpenAI specification](https://platform.openai.com/docs/api-reference) for programmatic access. We expose
the [LiteLLM configuration](https://litellm.vercel.app/docs/proxy/configs) file directly within the LISA config.yaml
file, so any options defined there can be set directly in the LISA config file under the `litellmConfig` option.
This also means that LISA supports calling other existing models that your VPC configuration allows it to reach. For
more information about adding models, please see the LiteLLM docs [here](https://litellm.vercel.app/docs/proxy/configs).

#### SageMaker Endpoints and Bedrock Models

We support adding existing SageMaker Endpoints and Bedrock Models to the LiteLLM configuration, and as long as the
services you use are in the same region as the LISA installation, LISA will be able to use those models alongside any
other models you have deployed. If you install LISA without referencing an existing SageMaker Endpoint, create a
SageMaker Model that uses the private subnets of the LISA deployment; this allows the REST API container to reach any
Endpoint that uses that Model. Then, to invoke the SageMaker Endpoints or Bedrock Models, add the following permissions
to the "REST-Role" that was created in the IAM stack:

```
"bedrock:InvokeModel",
"bedrock:InvokeModelWithResponseStream",
"sagemaker:InvokeEndpoint",
"sagemaker:InvokeEndpointWithResponseStream"
```

After adding those permissions and the required VPC access, LiteLLM will be able to route traffic to those entities,
and they will be accessible through the LISA ALB using the OpenAI specification for programmatic access.
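
As an illustration, the following is a minimal sketch of attaching those permissions as an inline policy with boto3.
The role name, policy name, and resource scope are placeholders; look up the exact name of the "REST-Role" created by
your IAM stack (or attach the equivalent statement through the IAM console) before using it.

```python
import json

import boto3

# Placeholder values -- substitute the actual role name created by the LISA IAM stack.
REST_ROLE_NAME = "<your-REST-Role-name>"
POLICY_NAME = "lisa-external-model-invoke"

policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "bedrock:InvokeModel",
                "bedrock:InvokeModelWithResponseStream",
                "sagemaker:InvokeEndpoint",
                "sagemaker:InvokeEndpointWithResponseStream",
            ],
            # Scope this down to specific Endpoints/Models for anything beyond a quick test.
            "Resource": "*",
        }
    ],
}

iam = boto3.client("iam")
iam.put_role_policy(
    RoleName=REST_ROLE_NAME,
    PolicyName=POLICY_NAME,
    PolicyDocument=json.dumps(policy_document),
)
```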

#### Recommended Configuration Options

There is no one-size-fits-all configuration, especially when it comes to referencing models that you've deployed outside
the scope of LISA, but we do recommend the following settings for a minimal setup, assuming a SageMaker Endpoint called
"test-endpoint," access to the "amazon.titan-text-express-v1" Bedrock Model, and a self-hosted OpenAI-compatible model
with an endpoint you can access from the VPC. The SageMaker Endpoint and Bedrock Model must be in the same region as the
LISA installation.

```yaml
dev:
  litellmConfig:
    litellm_settings:
      telemetry: false # Don't attempt to send telemetry to LiteLLM servers from within VPC
      drop_params: true # Don't fail if params not recognized, instead ignore unrecognized params
    model_list:
      - model_name: test-endpoint # Human-readable name, can be anything and will be used for OpenAI API calls
        litellm_params:
          model: sagemaker/test-endpoint # Prefix required for SageMaker Endpoints and "test-endpoint" matches Endpoint name
          api_key: ignored # Provide an ignorable placeholder key to avoid LiteLLM deployment failures
      - model_name: bedrock-titan-express # Human-readable name for future OpenAI API calls
        litellm_params:
          model: bedrock/amazon.titan-text-express-v1 # Prefix required for Bedrock Models, and exact name of Model to use
          api_key: ignored # Provide an ignorable placeholder key to avoid LiteLLM deployment failures
      - model_name: custom-openai-model # Used in future OpenAI-compatible calls to LiteLLM
        litellm_params:
          model: openai/modelProvider/modelName # Prefix required for OpenAI-compatible models followed by model provider and name details
          api_base: https://your-domain-here:443/v1 # Your model's base URI
          api_key: ignored # Provide an ignorable placeholder key to avoid LiteLLM deployment failures
```
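
Once deployed, the models above can be called with any OpenAI-compatible client. Below is a minimal sketch; the base
URL and token are placeholders for your deployment's ALB endpoint and credentials, and `bedrock-titan-express` refers
to one of the `model_name` entries defined in the configuration above.

```python
import os

from openai import OpenAI

# Placeholder values -- substitute the OpenAI-compatible base URL exposed by your LISA ALB
# and a token that is valid for your deployment.
client = OpenAI(
    base_url=os.environ["LISA_OPENAI_API_BASE"],
    api_key=os.environ.get("LISA_API_TOKEN", "ignored"),
)

# "bedrock-titan-express" matches one of the model_name entries from the config above.
response = client.chat.completions.create(
    model="bedrock-titan-express",
    messages=[{"role": "user", "content": "Summarize what LISA does in one sentence."}],
)
print(response.choices[0].message.content)
```
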
### DEV ONLY: Create Self-Signed Certificates for ALB
**WARNING: THIS IS FOR DEV ONLY**
3 changes: 3 additions & 0 deletions lib/schema.ts
@@ -73,12 +73,14 @@ export enum EcsSourceType {
*
* @property {string} provider - Model provider, of the form <engine>.<type>.
* @property {string} modelName - The unique name that identifies the model.
* @property {string} modelId - The unique user-provided name for the model.
* @property {ModelType} modelType - Specifies the type of model (e.g., 'textgen', 'embedding').
* @property {string} endpointUrl - The URL endpoint where the model can be accessed or invoked.
* @property {boolean} streaming - Indicates whether the model supports streaming capabilities.
*/
export interface RegisteredModel {
provider: string;
modelId: string;
modelName: string;
modelType: ModelType;
endpointUrl: string;
@@ -697,6 +699,7 @@ const LiteLLMConfig = z.object({
litellm_settings: z.object({
// ALL (https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py)
telemetry: z.boolean().default(false).optional(),
drop_params: z.boolean().default(true).optional(),
}),
general_settings: z
.object({
2 changes: 2 additions & 0 deletions lib/serve/index.ts
@@ -101,6 +101,8 @@ export class LisaServeApplicationStack extends Stack {
// Create metadata to register model in parameter store
const registeredModel: RegisteredModel = {
provider: `${modelConfig.modelHosting}.${modelConfig.modelType}.${modelConfig.inferenceContainer}`,
// modelId is used for LiteLLM config to differentiate the same model deployed with two different containers
modelId: modelConfig.modelId ? modelConfig.modelId : modelConfig.modelName,
modelName: modelConfig.modelName,
modelType: modelConfig.modelType,
endpointUrl: ecsModel.endpointUrl,
3 changes: 2 additions & 1 deletion lib/serve/rest-api/Dockerfile
@@ -20,7 +20,8 @@ COPY src/requirements.txt .
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy LiteLLM config directly to container, it will be updated at runtime
# with LISA-hosted models.
# with LISA-hosted models. This filename is expected in the entrypoint.sh file, so do not modify
# the filename unless you modify it in the entrypoint.sh file too.
RUN echo "$LITELLM_CONFIG" > litellm_config.yaml

# Copy the source code into the container
lib/serve/rest-api/src/api/endpoints/v2/litellm_passthrough.py
@@ -22,7 +22,8 @@
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse

# Local LiteLLM installation URL
# Local LiteLLM installation URL. By default, LiteLLM runs on port 4000. Change the port here if the
# port was changed as part of the LiteLLM startup in entrypoint.sh
LITELLM_URL = "http://localhost:4000"

logger = logging.getLogger(__name__)
4 changes: 4 additions & 0 deletions lib/serve/rest-api/src/entrypoint.sh
@@ -8,9 +8,13 @@ PORT="8080"
# Update LiteLLM config that was already copied from config.yaml with runtime-deployed models.
# Depends on SSM Parameter for registered models.
echo "Configuring and starting LiteLLM"
# litellm_config.yaml is generated from the REST API Dockerfile from the LISA config.yaml.
# Do not modify the litellm_config.yaml name unless you change it in the Dockerfile and in the `litellm` command below.
python ./src/utils/generate_litellm_config.py -f litellm_config.yaml

# Start LiteLLM in the background, default port 4000, not exposed outside of container.
# If you need to change the port, you can specify the --port option, and then the port needs to be updated in
# src/api/endpoints/v2/litellm_passthrough.py for the LiteLLM URI
litellm -c litellm_config.yaml &

echo "Starting Gunicorn with $THREADS workers..."
29 changes: 14 additions & 15 deletions lib/serve/rest-api/src/requirements.txt
@@ -1,15 +1,14 @@
aiobotocore
aioboto3
aiohttp
boto3
click
cryptography
fastapi
fastapi_utils
gunicorn
litellm[proxy]
loguru
pydantic<2
PyJWT
text-generation
uvicorn
aioboto3==13.0.0
aiohttp==3.9.5
boto3==1.34.106
click==8.1.7
cryptography==42.0.7
fastapi==0.111.0
fastapi_utils==0.6.0
gunicorn==22.0.0
litellm[proxy]==1.38.12
loguru==0.7.2
pydantic==1.10.15 # must be <2
PyJWT==2.8.0
text-generation==0.6.1
uvicorn==0.22.0
2 changes: 1 addition & 1 deletion lib/serve/rest-api/src/utils/generate_litellm_config.py
@@ -38,7 +38,7 @@ def generate_config(filepath: str) -> None:
# Generate model definitions for each of the LISA-deployed models
litellm_model_params = [
{
"model_name": model["modelName"],
"model_name": model["modelId"], # Use user-provided name if one given, otherwise it is the model name.
"litellm_params": {
"model": f"openai/{model['modelName']}",
"api_base": model["endpointUrl"] + "/v1", # Local containers require the /v1 for OpenAI API routing.
