
Commit 82829cf

Merge pull request OthersideAI#98 from michaelhhogue/evaluator
Create automated evaluator
2 parents 9459e9f + 979840c commit 82829cf

File tree: CONTRIBUTING.md, evaluate.py, operate/main.py

3 files changed, +177 -13 lines changed

CONTRIBUTING.md

Lines changed: 0 additions & 1 deletion
@@ -14,7 +14,6 @@ We appreciate your contributions!
 3. Run `operate` to test your changes
 
 ## Contribution Ideas
-- **Develop an Automated End-to-End Testing System**: Build an automated testing framework that can be run before merging PRs to `main` to confirm no test cases broke. An example of such a test case would be "go to google docs and write a poem". This testing system should be flexible to add new test cases in the future and reduce the time spent on manually testing each PR.
 - **Improve performance by finding optimal screenshot grid**: A primary element of the framework is that it overlays a percentage grid on the screenshot which GPT-4v uses to estimate click locations. If someone is able to find the optimal grid and some evaluation metrics to confirm it is an improvement on the current method then we will merge that PR.
 - **Improve the `SUMMARY_PROMPT`**
 - **Improve Linux and Windows compatibility**: There are still some issues with Linux and Windows compatibility. PRs to fix the issues are encouraged.
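
The bullet removed above (the automated end-to-end testing idea) is what this PR implements in the new evaluate.py below. As a rough, hypothetical illustration, the example case mentioned in that bullet could be expressed as one more entry in that file's TEST_CASES dictionary; the guideline string here is invented for illustration:

# Hypothetical extension of evaluate.py's TEST_CASES dictionary.
# The new guideline string is a made-up example of what GPT-4v would be asked to verify.
TEST_CASES = {
    "Go to Github.com": "The Github home page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
    # New case, mirroring the example in the removed CONTRIBUTING.md bullet:
    "Go to Google Docs and write a poem": "A Google Docs document containing a poem is visible.",
}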

evaluate.py

Lines changed: 150 additions & 0 deletions
import sys
import os
import subprocess
import platform
import base64
import json
import openai

from dotenv import load_dotenv

# "Objective for `operate`" : "Guideline for passing this test case given to GPT-4v"
TEST_CASES = {
    "Go to Github.com": "The Github home page is visible.",
    "Go to Youtube.com and play a video": "The YouTube video player is visible.",
}

EVALUATION_PROMPT = """
Your job is to look at the given screenshot and determine if the following guideline is met in the image.
You must respond in the following format ONLY. Do not add anything else:
{{ "guideline_met": (true|false), "reason": "Explanation for why guideline was or wasn't met" }}
guideline_met must be set to a JSON boolean. True if the image meets the given guideline.
reason must be a string containing a justification for your decision.

Guideline: {guideline}
"""

SUMMARY_SCREENSHOT_PATH = os.path.join('screenshots', 'summary_screenshot.png')

# Check if on a windows terminal that supports ANSI escape codes
def supports_ansi():
    """
    Check if the terminal supports ANSI escape codes
    """
    plat = platform.system()
    supported_platform = plat != "Windows" or "ANSICON" in os.environ
    is_a_tty = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    return supported_platform and is_a_tty

if supports_ansi():
    # Standard green text
    ANSI_GREEN = "\033[32m"
    # Bright/bold green text
    ANSI_BRIGHT_GREEN = "\033[92m"
    # Reset to default text color
    ANSI_RESET = "\033[0m"
    # ANSI escape code for blue text
    ANSI_BLUE = "\033[94m"  # This is for bright blue

    # Standard yellow text
    ANSI_YELLOW = "\033[33m"

    ANSI_RED = "\033[31m"

    # Bright magenta text
    ANSI_BRIGHT_MAGENTA = "\033[95m"
else:
    ANSI_GREEN = ""
    ANSI_BRIGHT_GREEN = ""
    ANSI_RESET = ""
    ANSI_BLUE = ""
    ANSI_YELLOW = ""
    ANSI_RED = ""
    ANSI_BRIGHT_MAGENTA = ""


def format_evaluation_prompt(guideline):
    prompt = EVALUATION_PROMPT.format(guideline=guideline)
    return prompt


def parse_eval_content(content):
    try:
        res = json.loads(content)

        print(res["reason"])

        return res["guideline_met"]
    except:
        print("The model gave a bad evaluation response and it couldn't be parsed. Exiting...")
        exit(1)


def evaluate_summary_screenshot(guideline):
    '''Load the summary screenshot and return True or False if it meets the given guideline.'''
    with open(SUMMARY_SCREENSHOT_PATH, "rb") as img_file:
        img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

    eval_message = [{
        "role": "user",
        "content": [
            {"type": "text", "text": format_evaluation_prompt(guideline)},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
            },
        ],
    }]

    response = openai.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=eval_message,
        presence_penalty=1,
        frequency_penalty=1,
        temperature=0.7,
        max_tokens=300,
    )

    eval_content = response.choices[0].message.content

    return parse_eval_content(eval_content)


def run_test_case(objective, guideline):
    '''Returns True if the result of the test with the given prompt meets the given guideline.'''
    # Run `operate` with the test case prompt
    subprocess.run(['operate', '--prompt', f'"{objective}"'], stdout=subprocess.DEVNULL)

    try:
        result = evaluate_summary_screenshot(guideline)
    except OSError:
        print("Couldn't open the summary screenshot")
        return False

    return result


def main():
    load_dotenv()
    openai.api_key = os.getenv("OPENAI_API_KEY")

    print(f"{ANSI_BRIGHT_MAGENTA}[STARTING EVALUATION]{ANSI_RESET}")

    passed = 0; failed = 0
    for objective, guideline in TEST_CASES.items():
        print(f"{ANSI_BLUE}[EVALUATING]{ANSI_RESET} '{objective}'")

        result = run_test_case(objective, guideline)
        if result:
            print(f"{ANSI_GREEN}[PASSED]{ANSI_RESET} '{objective}'")
            passed += 1
        else:
            print(f"{ANSI_RED}[FAILED]{ANSI_RESET} '{objective}'")
            failed += 1

    print(
        f"{ANSI_BRIGHT_MAGENTA}[EVALUATION COMPLETE]{ANSI_RESET} {passed} tests passed, {failed} tests failed"
    )

if __name__ == "__main__":
    main()
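
For context, parse_eval_content above expects the model to answer in exactly the JSON shape requested by EVALUATION_PROMPT. A minimal, self-contained sketch of that contract, using a made-up reply string:

import json

# Hypothetical GPT-4v reply, in the exact format EVALUATION_PROMPT asks for.
sample_reply = '{ "guideline_met": true, "reason": "The Github home page is visible." }'

parsed = json.loads(sample_reply)
print(parsed["reason"])         # -> The Github home page is visible.
print(parsed["guideline_met"])  # -> True (a JSON boolean, parsed to a Python bool)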

operate/main.py

Lines changed: 27 additions & 12 deletions
@@ -133,7 +133,6 @@
 """
 
 
-
 class ModelNotRecognizedException(Exception):
     """Exception raised for unrecognized models."""
 
@@ -195,15 +194,12 @@ def supports_ansi():
     ANSI_BRIGHT_MAGENTA = ""
 
 
-def main(model, accurate_mode, voice_mode=False):
+def main(model, accurate_mode, terminal_prompt, voice_mode=False):
     """
     Main function for the Self-Operating Computer
     """
     mic = None
     # Initialize WhisperMic if voice_mode is True if voice_mode is True
-    """
-    Main function for the Self-Operating Computer
-    """
     if voice_mode:
         try:
             from whisper_mic import WhisperMic
@@ -216,11 +212,15 @@ def main(model, accurate_mode, voice_mode=False):
             )
             sys.exit(1)
 
-    message_dialog(
-        title="Self-Operating Computer",
-        text="Ask a computer to do anything.",
-        style=style,
-    ).run()
+    # Skip message dialog if prompt was given directly
+    if not terminal_prompt:
+        message_dialog(
+            title="Self-Operating Computer",
+            text="Ask a computer to do anything.",
+            style=style,
+        ).run()
+    else:
+        print("Running direct prompt...")
 
     print("SYSTEM", platform.system())
     # Clear the console
@@ -229,7 +229,9 @@
     else:
         print("\033c", end="")
 
-    if voice_mode:
+    if terminal_prompt:  # Skip objective prompt if it was given as an argument
+        objective = terminal_prompt
+    elif voice_mode:
         print(
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RESET} Listening for your command... (speak now)"
         )
@@ -838,9 +840,22 @@ def main_entry():
         required=False,
     )
 
+    # Allow for direct input of prompt
+    parser.add_argument(
+        "--prompt",
+        help="Directly input the objective prompt",
+        type=str,
+        required=False,
+    )
+
     try:
         args = parser.parse_args()
-        main(args.model, accurate_mode=args.accurate, voice_mode=args.voice)
+        main(
+            args.model,
+            accurate_mode=args.accurate,
+            terminal_prompt=args.prompt,
+            voice_mode=args.voice,
+        )
     except KeyboardInterrupt:
         print(f"\n{ANSI_BRIGHT_MAGENTA}Exiting...")
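
A minimal sketch of how the new flag flows from the command line into main(): this is a standalone argparse example, not the real main_entry(), and the simulated invocation mirrors how evaluate.py calls `operate --prompt`:

import argparse

# Standalone sketch mirroring the --prompt argument added above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--prompt",
    help="Directly input the objective prompt",
    type=str,
    required=False,
)

# Simulate `operate --prompt "Go to Github.com"`, as evaluate.py invokes it.
args = parser.parse_args(["--prompt", "Go to Github.com"])
print(args.prompt)  # -> "Go to Github.com"; forwarded to main() as terminal_prompt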

0 commit comments
