evaluate.py

import sys
import os
import subprocess
import platform
import base64
import json
import openai
import argparse

from dotenv import load_dotenv

# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
TEST_CASES = {
    "Go to Github.com": "A Github page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}

EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You must respond in the following format ONLY. Do not add anything else:
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
reason must be a string containing a justification for your decision.

Guideline: {guideline}
"""

SCREENSHOT_PATH = os.path.join('screenshots', 'screenshot.png')

# Check if on a windows terminal that supports ANSI escape codes
def supports_ansi():
    """
    Check if the terminal supports ANSI escape codes
    """
    plat = platform.system()
    supported_platform = plat != "Windows" or "ANSICON" in os.environ
    is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    return supported_platform and is_a_tty

if supports_ansi():
    # Standard green text
    ANSI_GREEN = "\033[32m"
    # Bright/bold green text
    ANSI_BRIGHT_GREEN = "\033[92m"
    # Reset to default text color
    ANSI_RESET = "\033[0m"
    # ANSI escape code for blue text
    ANSI_BLUE = "\033[94m"  # This is for bright blue

    # Standard yellow text
    ANSI_YELLOW = "\033[33m"

    ANSI_RED = "\033[31m"

    # Bright magenta text
    ANSI_BRIGHT_MAGENTA = "\033[95m"
else:
    ANSI_GREEN = ""
    ANSI_BRIGHT_GREEN = ""
    ANSI_RESET = ""
    ANSI_BLUE = ""
    ANSI_YELLOW = ""
    ANSI_RED = ""
    ANSI_BRIGHT_MAGENTA = ""
    
    
def format_evaluation_prompt(guideline):
    prompt = EVALUATION_PROMPT.format(guideline=guideline)
    return prompt


def parse_eval_content(content):
    try:
        res = json.loads(content)
        
        print(res["reason"])
        
        return res["guideline_met"]
    except:
        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
        exit(1)


def evaluate_final_screenshot(guideline):
    '''Load the final screenshot and return True or False if it meets the given guideline.'''
    with open(SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

        eval_message = [{
            "role": "user",
            "content": [
                {"type": "text", "text": format_evaluation_prompt(guideline)},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
                },
            ],
        }]
        
        response = openai.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=eval_message,
            presence_penalty=1,
            frequency_penalty=1,
            temperature=0.7,
            max_tokens=300,
        )

        eval_content = response.choices[0].message.content
        
        return parse_eval_content(eval_content)


def run_test_case(objective, guideline, model):
    '''Returns True if the result of the test with the given prompt meets the given guideline for the given model.'''
    # Run `operate` with the model to evaluate and the test case prompt
    subprocess.run(['operate', '-m', model, '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)
    
    try:
        result = evaluate_final_screenshot(guideline)
    except(OSError):
        print("[Error] Couldn't open the screenshot for evaluation")
        return False
    
    return result


def get_test_model():
    parser = argparse.ArgumentParser(
        description="Run the self-operating-computer with a specified model."
    )
    
    parser.add_argument(
        "-m",
        "--model",
        help="Specify the model to evaluate.",
        required=False,
        default="gpt-4-with-ocr",
    )
    
    return parser.parse_args().model


def main():
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")
    
    model = get_test_model()
    
    print(f"{ANSI_BLUE}[EVALUATING MODEL `{model}`]{ANSI_RESET}")
    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed = 0; failed = 0
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")
        
        result = run_test_case(objective, guideline, model)
        if result:
            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
            passed += 1
        else:
            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
            failed += 1

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} test{'' if passed == 1 else 's'} passed, {failed} test{'' if failed == 1 else 's'} failed"
    )

if __name__ == "__main__":
    main()