main.py
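
# Crucible prompt evaluation: run every model x prompt x variable case,
# grade each response, and print per-dimension summaries.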
import uuid
import time

import ollama

from eval_prompts import prompts
from eval_variables import variables
from eval_models import models
from utils.my_types import Result
from utils.llm import query
from utils.grading import grade_response
from utils.display import print_data, generate_and_show_summary
from utils.validation import check_exists
from utils.io import save_logs
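
# Run configuration.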
MODELS = models
PROMPTS = prompts
VARIABLES = variables
TEMPERATURE = 0.0
DANGER_MODE = True  # does not ask permission about prices; use with care.
GRADING_TYPE = "json"  # qualitative grading uses GPT-4o; use with care.


def main():
    start_time = time.perf_counter()
    run_id = time.strftime("%Y%m%d%H%M%S")

    print()
    print(f"CRUCIBLE PROMPT EVALUATION {run_id}")
    print()
    print("models: " + ", ".join(str(x.id) for x in MODELS))
    print("prompts: " + ", ".join(str(x.id) for x in PROMPTS))
    print("variables: " + ", ".join(str(x.id) for x in VARIABLES))
    total_cases = len(MODELS) * len(PROMPTS) * len(VARIABLES)
    print(f"total cases: {total_cases}")
    print()

    check_exists(MODELS, PROMPTS, VARIABLES)

    # Table header row for the per-case output.
    print_data(
        "case",
        "model",
        "prompt",
        "variable",
        "case_id",
        "grade",
        MODELS,
        PROMPTS,
        VARIABLES,
    )
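
    # Evaluate the full grid: one case per (model, prompt, variable) triple.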
    outputs: list[Result] = []
    counter = 0
    for model in MODELS:
        for prompt in PROMPTS:
            for variable in VARIABLES:
                start_query_time = time.perf_counter()
                case_id = uuid.uuid4().hex
                result = Result(
                    id=case_id,
                    model=model.id,
                    prompt_id=prompt.id,
                    variable_id=variable.id,
                    expected=variable.expected,
                )
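                # Query the model, grade the response, and report the case.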
                try:
                    response = query(model, prompt, variable, TEMPERATURE, DANGER_MODE)
                    grade = grade_response(response, variable, GRADING_TYPE)
                    result.response = response
                    result.grade = grade
                    print_data(
                        f"{counter + 1}/{total_cases}",
                        model.id,
                        prompt.id,
                        variable.id,
                        case_id,
                        str(grade),
                        MODELS,
                        PROMPTS,
                        VARIABLES,
                    )
                except ollama.ResponseError as e:
                    # query() raised, so there is no response to record.
                    result.grade = 0
                    result.error = e.error
                    print("Error:", e.error)
                finally:
                    # Runs on success and on error: record timing, persist, advance.
                    result.time_elapsed = round(time.perf_counter() - start_query_time, 2)
                    outputs.append(result)
                    save_logs(outputs, run_id)
                    counter += 1
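
    # Aggregate grades along each dimension of the grid.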
    print()
    print("SUMMARY")
    print()
    print("BY MODEL")
    generate_and_show_summary(outputs, MODELS, GRADING_TYPE)
    print()
    print("BY PROMPT")
    generate_and_show_summary(outputs, PROMPTS, GRADING_TYPE)
    print()
    print("BY VARIABLE")
    generate_and_show_summary(outputs, VARIABLES, GRADING_TYPE)
    print()
    print("BY EXPECTED")
    print("todo")
    # generate_and_show_summary(outputs, VARIABLES)
    print()
    print(f"Time taken: {time.perf_counter() - start_time:.0f} seconds")
    print(f"Saved logs to: outputs/{run_id}")


if __name__ == "__main__":
    main()