import sys
import os
import subprocess
import platform
import base64
import json

import openai
from dotenv import load_dotenv

# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
TEST_CASES = {
    "Go to Github.com": "The Github home page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}

EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You must respond in the following format ONLY. Do not add anything else:
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
reason must be a string containing a justification for your decision.

Guideline: {guideline}
"""

SUMMARY_SCREENSHOT_PATH = os.path.join("screenshots", "summary_screenshot.png")


def supports_ansi():
    """
    Check if the terminal supports ANSI escape codes
    """
    plat = platform.system()
    supported_platform = plat != "Windows" or "ANSICON" in os.environ
    is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    return supported_platform and is_a_tty


if supports_ansi():
    # Standard green text
    ANSI_GREEN = "\033[32m"
    # Bright/bold green text
    ANSI_BRIGHT_GREEN = "\033[92m"
    # Reset to default text color
    ANSI_RESET = "\033[0m"
    # Bright blue text
    ANSI_BLUE = "\033[94m"
    # Standard yellow text
    ANSI_YELLOW = "\033[33m"
    # Standard red text
    ANSI_RED = "\033[31m"
    # Bright magenta text
    ANSI_BRIGHT_MAGENTA = "\033[95m"
else:
    ANSI_GREEN = ""
    ANSI_BRIGHT_GREEN = ""
    ANSI_RESET = ""
    ANSI_BLUE = ""
    ANSI_YELLOW = ""
    ANSI_RED = ""
    ANSI_BRIGHT_MAGENTA = ""


def format_evaluation_prompt(guideline):
    prompt = EVALUATION_PROMPT.format(guideline=guideline)
    return prompt


def parse_eval_content(content):
    """Parse the model's JSON evaluation and return its guideline_met boolean."""
    try:
        res = json.loads(content)

        print(res["reason"])

        return res["guideline_met"]
    except (json.JSONDecodeError, KeyError):
        print("The model's evaluation response couldn't be parsed. Exiting...")
        sys.exit(1)


def evaluate_summary_screenshot(guideline):
    '''Load the summary screenshot and return True if it meets the given guideline, False otherwise.'''
    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    eval_message = [{
        "role": "user",
        "content": [
            {"type": "text", "text": format_evaluation_prompt(guideline)},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{img_base64}"},
            },
        ],
    }]

    response = openai.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
        max_tokens=300,
    )

    eval_content = response.choices[0].message.content

    return parse_eval_content(eval_content)


def run_test_case(objective, guideline):
    '''Return True if the result of running the given objective meets the given guideline.'''
    # Run `operate` with the test case objective as its prompt
    subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)

    try:
        result = evaluate_summary_screenshot(guideline)
    except OSError:
        print("Couldn't open the summary screenshot")
        return False

    return result


def main():
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed = 0
    failed = 0
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        result = run_test_case(objective, guideline)
        if result:
            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
            passed += 1
        else:
            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
            failed += 1

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"
    )


if __name__ == "__main__":
    main()