Merge pull request #49 from DARPA-ASKEM/4432-bug-configuration-from-dataset

4432 bug configuration from dataset
DARPA-ASKEM · Sep 4, 2024 · 530980f · 530980f
2 parents d510b56 + 39df364
commit 530980f
Show file tree

Hide file tree

Showing 7 changed files with 178 additions and 108 deletions.
diff --git a/api/run.py b/api/run.py
@@ -1,8 +1,8 @@
 from contextlib import contextmanager
 from fastapi import FastAPI, HTTPException
 import json
-from gollm.openai.tool_utils import model_config_chain, amodel_card_chain
-from gollm.entities import ConfigureModel, ModelCardModel
+from gollm.openai.tool_utils import model_config_from_document, amodel_card_chain
+from gollm.entities import ConfigureModelDocument, ModelCardModel
 
 app = FastAPI()
 
@@ -22,12 +22,12 @@ async def root():
 
 
 @app.post("/configure")
-async def configure_model(input_model: ConfigureModel):
+async def configure_model_from_document(input_model: ConfigureModelDocument):
     with handle_http_exception():
         print("Received request to configure model from paper..")
         amr = json.dumps(input_model.amr, separators=(",", ":"))
         research_paper = input_model.research_paper
-        response = model_config_chain(research_paper=research_paper, amr=amr)
+        response = model_config_from_document(research_paper=research_paper, amr=amr)
         response = {"response": response}
         return json.dumps(response)
 

diff --git a/gollm/entities.py b/gollm/entities.py
@@ -4,15 +4,14 @@
 from typing import List, Callable, Type, Dict
 
 
-class ConfigureModel(BaseModel):
+class ConfigureModelDocument(BaseModel):
     research_paper: str
-    amr: Dict  # expects AMR in JSON format
+    amr: str  # expects AMR in a stringified JSON object
 
 
 class ConfigureModelDataset(BaseModel):
-    datasets: List[str]
-    amr: Dict  # expects AMR in JSON format
-    matrix_str: str
+    dataset: List[str]
+    amr: str  # expects AMR in a stringified JSON object
 
 
 class ModelCardModel(BaseModel):

diff --git a/gollm/openai/prompts/config_from_dataset.py b/gollm/openai/prompts/config_from_dataset.py
@@ -0,0 +1,111 @@
+CONFIGURE_FROM_DATASET_PROMPT = """
+You are a helpful agent designed to create a model configuration for a given AMR model from a set of user-supplied CSV datasets.
+
+Create a condition for each dataset.
+
+The user-supplied datasets may include both time-series datasets and model-mapping datasets.
+One of your key tasks is to determine the type of dataset supplied. This can be done by examining the column headers in the first row and the values in the first column of the user-supplied CSV dataset.
+    - Model-mapping datasets have the first row and column containing labels and the rest containing numerical values. Often, the first cell in the CSV is empty.
+    - Time-series datasets usually have the first row as labels and a column representing sequential time steps. You can use the column headers to determine which column represents time steps. If the dataset does not have header information, look for columns with date strings or incrementally increasing timestamps or numbers. The other columns will represent the values of the AMR model's states.
+
+If the dataset is time-series, follow the instructions in the TIME-SERIES EXTRACTION section.
+If the dataset is model-mapping, follow the instructions in the MODEL-MAPPING EXTRACTION section.
+
+Do not respond in full sentences; only create a JSON object that satisfies the JSON schema specified in the response format.
+
+"""
+
+CONFIGURE_FROM_DATASET_MAPPING_PROMPT = """
+---MODEL-MAPPING EXTRACTION START---
+
+Pay attention to the following example and use it to understand how to extract values from a model-mapping dataset:
+
+---EXAMPLE START---
+
+---MODEL MAPPING START---
+
+...
+subject-controllers of f
+
+, S_1, S_2, S_3
+I_1, f_0, f_1, f_2
+I_2, f_4, f_3, f_5
+I_3, f_7, f_8, f_6
+
+...
+
+---MODEL MAPPING END---
+
+---SAMPLE DATASET START---
+
+, I_1, I_2, I_3
+S_1, 38.6, 20.5, 6.1
+S_2, 20.5, 28.2, 11.5
+S_3, 6.1, 11.5, 20.0
+
+---SAMPLE DATASET END---
+
+Since the subject controller of f_0 is I_1, S_1, we want to map the value from the dataset cell S_1, I_1 to f_0 which will be 38.6.
+
+Based on this information, we do not know the initial values for I_1 and S_1. Do not misinterpret these interaction values as initials.
+
+---EXAMPLE END---
+
+If the user-supplied dataset is a model-mapping dataset, you must create a model configuration JSON object that satisfies the JSON schema specified in the response format. To do this, follow the instructions below:
+    1.	Using metadata from the AMR model and the user-supplied dataset, create values for `name` and `description`.
+    2.	Provide a long-form description for the description. Set it to an empty string if it cannot be created from the provided metadata.
+    3.	`model_id` is a UUID. If the AMR model has an id, you can use it. Otherwise, set it to the nil UUID "00000000-0000-0000-0000-000000000000".
+    4.	Create a parameter semantic object for each parameter specified in the AMR model ODE semantics. Do not create new parameter semantic objects if they are not included in the original AMR model. You should set parameter semantic object fields using the following rules:
+        a.	`reference_id` should reference the id of the parameter.
+        b.	`source` should reference the title or file name of the user-supplied dataset.
+        c.	`type` should be set to "parameter".
+        d.	Be sure to extract parameter values from the user-supplied dataset, and do not use the default values from the AMR model. Set the parameter `value` to the constant value and set `type` to "Constant".
+    5.	Create an initial semantic object for each initial specified in the AMR model ODE semantics. Use the default values found in the AMR model. Do not try to create new values.
+    6. `observableSemanticList` should be an empty list.
+    7. `inferredParameterList` should be an empty list.
+
+---MODEL-MAPPING EXTRACTION END---
+
+"""
+
+CONFIGURE_FROM_DATASET_TIMESERIES_PROMPT = """
+---TIME-SERIES EXTRACTION START---
+
+If the user-supplied dataset is a time-series dataset, you must create a model configuration JSON object that satisfies the JSON schema specified in the response format. To do this, follow the instructions below:
+    1.	Using metadata from the AMR model and the user-supplied dataset, create values for `name` and `description`.
+    2.	Provide a long-form description for the description. If it cannot be created from the provided metadata, set it to an empty string.
+    3.	`model_id` is a UUID. If the AMR model has an id, you can use it. Otherwise, set it to the nil UUID "00000000-0000-0000-0000-000000000000".
+    4.	Create an initial semantic object for each initial specified in the AMR model ODE semantics. Do not create new initial semantic objects if they are not included in the original AMR model. You should set initial semantic object fields using the following rules:
+        a.	`target` should reference the id of the initial variable from the AMR model ODE semantics.
+        b.	`source` should reference the title or file name of the user-supplied dataset.
+        c.	`type` should be set to "initial".
+        d.	Find the value for `expression` in the user-supplied dataset that aligns with timepoint 0 or the earliest available timepoint.
+        e.	`expression_mathml` should be the value of `expression` written in MathML format.
+    5.	Create a parameter semantic object for each parameter specified in the AMR model ODE semantics. Use the default values found in the AMR model. Do not try to create new values. If the default value is a constant type, set the parameter `value` to the constant value and set `type` to "Constant". If the default value is a distribution with a maximum and minimum value, set `type` to only "Uniform" and populate the `minimum` and `maximum` fields.
+    6. `observableSemanticList` should be an empty list.
+    7. `inferredParameterList` should be an empty list.
+
+---TIME-SERIES EXTRACTION END---
+
+"""
+
+CONFIGURE_FROM_DATASET_DATASET_PROMPT = """
+Use the following user-supplied datasets to answer the query:
+
+---START USER-SUPPLIED CSV DATASETS---
+
+{data}
+---END USER-SUPPLIED CSV DATASETS---
+
+"""
+
+CONFIGURE_FROM_DATASET_AMR_PROMPT = """
+Use the following JSON representation of an AMR model as a reference:
+
+---START AMR MODEL JSON---
+
+{amr}
+
+---END AMR MODEL JSON---
+
+"""
diff --git a/gollm/openai/prompts/petrinet_config.py → gollm/openai/prompts/config_from_document.py b/gollm/openai/prompts/petrinet_config.py → gollm/openai/prompts/config_from_document.py
@@ -1,11 +1,11 @@
-PETRINET_PROMPT = """
-You are a helpful agent designed to find multiple model configurations for a given Petri net model of various conditions described in a research paper.
+CONFIGURE_FROM_DOCUMENT_PROMPT = """
+You are a helpful agent designed to find multiple model configurations for a given AMR model of various conditions described in a research paper.
 
-Use the following Petri net JSON file as a reference:
+Use the following AMR model JSON file as a reference:
 
----START PETRI NET MODEL JSON---
-{petrinet}
----END PETRI NET MODEL JSON---
+---START AMR MODEL JSON---
+{amr}
+---END AMR MODEL JSON---
 
 Use the following user-provided text as the research paper to answer the query:
 
@@ -14,27 +14,28 @@
 ---END USER-PROVIDED TEXT---
 
 Assume that the user-provided text describes multiple conditions to which the model can be applied. Create a model configuration for each condition.
-Be sure to extract parameter values and initial values from the user-provided text, and do not use the default values from the Petri net model.
+Be sure to extract parameter values and initial values from the user-provided text, and do not use the default values from the AMR model.
 Be sure to use consistent naming conventions for the conditions. Instead of 'condition_1' and 'condition_2', use descriptive names.
 
 For each condition, create a model configuration JSON object that satisfies the JSON schema specified in the response format. To do this, follow the instructions below:
 1.	Create a value for `name` and `description` from the user-provided text.
 2.	For the description, provide a long-form description of the condition. If the description cannot be created from the user-provided text, set it to an empty string.
-3.	`model_id` should reference the id of the Petri net model.
-4.	For each initial specified in the Petri net model ODE semantics, create an initial semantic object. Do not create new initial semantic objects if they are not included in the original Petri net model. You should set initial semantic object fields using the following rules:
-    a.	`target` should reference the id of the initial variable from the Petri net model ODE semantics.
+3.	`model_id` is a UUID. If the AMR model has an id, you can use it. Otherwise, set it to the nil UUID "00000000-0000-0000-0000-000000000000".
+4.	For each initial specified in the AMR model ODE semantics, create an initial semantic object. Do not create new initial semantic objects if they are not included in the original AMR model. You should set initial semantic object fields using the following rules:
+    a.	`target` should reference the id of the initial variable from the AMR model ODE semantics.
     b.	`source` should reference the title or file name of the research paper.
     c.	`type` should be set to "initial".
-    d.	`expression` should be written in LaTeX format.
-    e.	`expression_mathml` should be written in MathML format.
-    f.	For `expression` and `expression_mathml`, Ensure both are valid and represent the same unit. If the unit is not found or not valid, omit the units field.
-5.	For each parameter specified in the Petri net model ODE semantics, create a parameter semantic object. Do not create new parameter semantic objects if they are not included in the original Petri net model. You should set parameter semantic object fields using the following rules:
+    d.	You should extract a numerical value or an expression of the initial state from the user-provided text if possible and add it to `expression`.
+    e.	`expression_mathml` should be `expression` written in MathML format.
+5.	For each parameter specified in the AMR model ODE semantics, create a parameter semantic object. Do not create new parameter semantic objects if they are not included in the original AMR model. You should set parameter semantic object fields using the following rules:
     a.	`reference_id` should reference the id of the parameter.
     b.	`source` should reference the title or file name of the research paper.
     c.	`type` should be set to "parameter".
-    d.	Be sure to extract parameter values from the user-provided text, and do not use the default values from the Petri net model.
+    d.	Be sure to extract parameter values from the user-provided text, and do not use the default values from the AMR model.
         -	If the extracted parameter value is a single constant value, set the parameter `value` to the constant value and set `type` to "Constant".
         -	If the extracted parameter value is a distribution with a maximum and minimum value, set `type` to only "Uniform" and populate the `minimum` and `maximum` fields.
+6. `observableSemanticList` should be an empty list.
+7. `inferredParameterList` should be an empty list.
 
 Do not respond in full sentences; only create a JSON object that satisfies the JSON schema specified in the response format.
 

diff --git a/gollm/openai/prompts/dataset_config.py b/gollm/openai/prompts/dataset_config.py