Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MATH evaluation #135

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ We provide the scripts for running evaluation of Huggingface/OpenAI models on a

- [MMLU](https://github.com/hendrycks/test)
- [Grade School Math (GSM)](https://github.com/openai/grade-school-math)
- [MATH](https://github.com/hendrycks/math)
- [Big-Bench Hard (BBH)](https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main)
- [TydiQA](https://github.com/google-research-datasets/tydiqa)
- [Codex HumanEval](https://github.com/openai/human-eval/tree/master)
Expand Down
271 changes: 271 additions & 0 deletions eval/MATH/answer_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,271 @@
import re
import regex

def _fix_fracs(string):
substrs = string.split("\\frac")
new_str = substrs[0]
if len(substrs) > 1:
substrs = substrs[1:]
for substr in substrs:
new_str += "\\frac"
if len(substr) > 0 and substr[0] == "{":
new_str += substr
else:
try:
assert len(substr) >= 2
except:
return string
a = substr[0]
b = substr[1]
if b != "{":
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}{" + b + "}" + post_substr
else:
new_str += "{" + a + "}{" + b + "}"
else:
if len(substr) > 2:
post_substr = substr[2:]
new_str += "{" + a + "}" + b + post_substr
else:
new_str += "{" + a + "}" + b
string = new_str
return string


def _fix_a_slash_b(string):
if len(string.split("/")) != 2:
return string
a = string.split("/")[0]
b = string.split("/")[1]
try:
if "sqrt" not in a:
a = int(a)
if "sqrt" not in b:
b = int(b)
assert string == "{}/{}".format(a, b)
new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
return new_string
except:
return string


def _fix_sqrt(string):
_string = re.sub(r"\\sqrt(-?[0-9.a-zA-Z]+)", r"\\sqrt{\1}", string)
_string = re.sub(r"\\sqrt\s+(\w+)$", r"\\sqrt{\1}", _string)
return _string


def _fix_tan(string):
_string = re.sub(r"\\tan(-?[0-9.a-zA-Z]+)", r"\\tan{\1}", string)
_string = re.sub(r"\\tan\s+(\w+)$", r"\\tan{\1}", _string)
return _string


def strip_string(string):
    """Normalize an answer string so equivalent answers compare equal.

    The input is coerced with ``str()`` and then put through a fixed
    sequence of textual clean-ups (the order matters, since later steps see
    the output of earlier ones): whitespace/linebreak and trailing-period
    removal, unwrapping of ``\\text{...}``, ``\\tfrac``/``\\dfrac``/``\\cfrac``
    -> ``\\frac``, removal of ``\\left``/``\\right``, trailing ``\\text{...}``
    units, degree marks, dollar signs, ``\\cdot``, ``\\mathbf``/``\\mathrm``,
    ``\\mbox{...}`` and quote characters, infinity spelling, ``j`` -> ``i``,
    trailing ``.000`` on numbers, and finally brace fixing for ``\\sqrt``,
    ``\\tan``, ``\\frac`` and simple ``a/b`` fractions.

    Args:
        string: The raw answer (any object; it is converted with ``str``).

    Returns:
        The normalized string.  An input that normalizes to nothing yields
        ``""``.
    """
    string = str(string).strip()
    # linebreaks
    string = string.replace("\n", "")

    # trailing "."
    string = string.rstrip(".")

    # remove LaTeX negative thin spaces (\!)
    string = string.replace("\\!", "")

    # unwrap an answer that is wrapped in \text{...}
    if string.startswith("\\text{") and string.endswith("}"):
        string = string.split("{", 1)[1][:-1]

    # replace tfrac, dfrac and cfrac with frac
    string = string.replace("tfrac", "frac")
    string = string.replace("dfrac", "frac")
    string = string.replace("cfrac", "frac")

    # remove \left and \right
    string = string.replace("\\left", "")
    string = string.replace("\\right", "")

    # Remove a trailing \text{...} unit (miles, dollars, ...), but only when
    # something is left over afterwards.
    without_unit = re.sub(r"\\text{.*?}$", "", string).strip()
    if without_unit != "" and without_unit != string:
        string = without_unit

    # Remove circ (degrees)
    string = string.replace("^{\\circ}", "").strip()
    string = string.replace("^\\circ", "").strip()

    # Drop "{m}", "{cm}", "{mm}" (optionally followed by ^2 / ^3), a trailing
    # "p.m.", and a trailing "t" that follows a digit.
    # These patterns need nothing beyond the stdlib `re` module.
    string = re.sub(r"\{(c|m)?m\}(\^(2|3))?", "", string).strip()
    string = re.sub(r"p\.m\.$", "", string).strip()
    string = re.sub(r"(\d)\s*t$", r"\1", string).strip()

    # remove dollar signs
    string = string.replace("\\$", "")
    string = string.replace("$", "")

    string = string.replace("x\\in", "")

    # unescape percent signs (the "%" itself is kept)
    string = string.replace("\\%", "%")

    # " 0." equivalent to " ." and "{0." equivalent to "{."
    string = string.replace(" .", " 0.")
    string = string.replace("{.", "{0.")

    # cdot
    string = string.replace("\\cdot", "")

    # inf
    string = string.replace("infinity", "\\infty")
    if "\\infty" not in string:
        string = string.replace("inf", "\\infty")
    # An explicit plus sign on infinity is redundant.
    string = string.replace("+\\infty", "\\infty")

    string = string.replace("\\mathbf", "")
    string = string.replace("\\mathrm", "")

    # remove \mbox{...}
    string = re.sub(r"\\mbox{.*?}", "", string)

    # Remove quote characters.  str.replace returns a new string, so the
    # result has to be assigned back.
    string = string.replace("'", "")
    string = string.replace("\"", "")

    # i, j: treat a lone "j" as the imaginary unit "i"
    if "j" in string and "i" not in string:
        string = string.replace("j", "i")

    # replace a.000b where b is not a digit or b is the end, with ab
    string = re.sub(r"(\d+)\.0+([^\d])", r"\1\2", string)
    string = re.sub(r"(\d+)\.0+$", r"\1", string)

    # if empty, return empty string
    if len(string) == 0:
        return string
    # ".5" -> "0.5"
    if string[0] == ".":
        string = "0" + string

    string = _fix_sqrt(string)
    string = _fix_tan(string)
    string = string.replace(" ", "")

    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
    # Even works with \frac1{72} (but not \frac{72}1).
    string = _fix_fracs(string)

    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in
    # case the model output is X/Y
    string = _fix_a_slash_b(string)

    # strip trailing backslashes, commas and periods
    string = re.sub(r"(\\|,|\.)+$", "", string)

    return string

def extract_boxed_answers(text):
    """Collect the contents of every ``boxed{...}`` group in ``text``.

    The text is split on the literal ``boxed{``; for each following segment
    the brace depth is tracked until the brace that closes the box is found.
    Segments whose box is never closed contribute nothing.

    Returns:
        A list of extracted strings in order of appearance (possibly empty).
    """
    found = []
    segments = text.split('boxed{')
    # segments[0] is whatever precedes the first box, so it is skipped.
    for segment in segments[1:]:
        depth = 0
        for pos, ch in enumerate(segment):
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth < 0:
                    # This brace closes the box itself.
                    # NOTE(review): when the box is directly followed by '%',
                    # the slice also keeps the closing brace (e.g. "50}") --
                    # confirm whether "50%" was the intent.
                    follower = segment[pos + 1:pos + 2]
                    end = pos + 1 if follower == '%' else pos
                    found.append(segment[:end])
                    break
    return found

def extract_program_output(pred_str):
    """
    Return the stripped text that follows the last "```output" marker, cut
    off at the next "```" fence if one exists.  Returns "" when the marker
    does not occur at all.
    """
    marker = "```output"
    if marker not in pred_str:
        return ""
    # Text after the final marker ...
    tail = pred_str.rpartition(marker)[2]
    # ... up to (not including) the fence that closes the block, if any.
    body = tail.partition("```")[0]
    return body.strip()

def extract_answer(pred_str, exhaust=False):
    """Extract the answer(s) from a model's free-form solution text.

    Candidate answers are located with the first strategy that applies:
      1. the text between ``final answer is $`` and ``$. I hope``;
      2. every ``boxed{...}`` group, if ``boxed`` occurs in the text;
      3. whatever follows the last ``he answer is``;
      4. the last ```` ```output ```` block;
      5. the last number in the text (commas removed first).

    Each candidate is then cut to its first line, stripped of a leading
    ``:`` and trailing ``.`` / ``/``, and normalized with ``strip_string``.

    Args:
        pred_str: The model output to search.
        exhaust: When true, return every normalized candidate; otherwise
            only the last one.

    Returns:
        A list of strings when ``exhaust`` is true; otherwise a single string,
        which is ``""`` if no candidate was found.
    """
    pred = []
    if 'final answer is $' in pred_str and '$. I hope' in pred_str:
        tmp = pred_str.split('final answer is $', 1)[1]
        pred = [tmp.split('$. I hope', 1)[0].strip()]
    elif 'boxed' in pred_str:
        pred = extract_boxed_answers(pred_str)
    elif 'he answer is' in pred_str:
        pred = [pred_str.split('he answer is')[-1].strip()]
    else:
        program_output = extract_program_output(pred_str)
        if program_output != "":
            # fall back to program
            pred.append(program_output)
        else:
            # Use the last number.  Raw string: "\d" in a normal literal is an
            # invalid escape sequence on current Python versions.
            numbers = re.findall(r'-?\d*\.?\d+', pred_str.replace(",", ""))
            if numbers:
                pred.append(numbers[-1])

    # A candidate may span multiple lines; only its first line is kept.
    _pred = []
    for ans in pred:
        ans = ans.strip().split("\n")[0]
        ans = ans.lstrip(":")
        ans = ans.rstrip(".")
        ans = ans.rstrip("/")
        _pred.append(strip_string(ans))

    if exhaust:
        return _pred
    return _pred[-1] if _pred else ""

def extract_math_answer(question, reasoning, task):
    """Extract all answers from ``reasoning`` and split multi-part ones.

    Every candidate returned by ``extract_answer(..., exhaust=True)`` is
    handled as follows:
      * if the question contains "separated by commas" and the candidate has
        none of ``( ) [ ]``, it is split on commas;
      * otherwise, if it contains ``\\text{and}`` (whitespace allowed inside
        the braces), it is split at each occurrence;
      * otherwise it is kept whole.

    ``task`` is accepted but not used by this function.

    Returns:
        A flat list of whitespace-stripped answer strings.
    """
    and_separator = r"\\text\{\s*and\s*\}"
    expects_list = 'separated by commas' in question
    answer = []
    for ans in extract_answer(reasoning, exhaust=True):
        has_bracket = any(ch in ans for ch in '()[]')
        if expects_list and not has_bracket:
            parts = ans.split(",")
        elif regex.search(and_separator, ans):
            parts = regex.sub(and_separator, "[SEP]", ans).split("[SEP]")
        else:
            parts = [ans]
        answer.extend(part.strip() for part in parts)
    return answer

def extract_math_few_shot_cot_answer(question, reasoning, task):
    """Extract answers from a few-shot completion.

    Anything from the first ``Problem:`` onwards is discarded before the
    remaining text is passed to ``extract_math_answer``.
    """
    # partition() yields the whole string when the marker is absent.
    leading_part = reasoning.partition("Problem:")[0]
    return extract_math_answer(question, leading_part, task)

def extract_last_single_answer(question, reasoning, task):
    """Return only the last answer that ``extract_answer`` finds in ``reasoning``.

    ``question`` and ``task`` are accepted but not used by this function.
    With ``exhaust=False``, ``extract_answer`` returns a single string
    (``""`` when nothing is found).
    """
    return extract_answer(reasoning, exhaust=False)

def extract_gsm_few_shot_cot_answer(question, reasoning, task):
    """Return the last number in a few-shot GSM completion.

    Anything from the first ``"Q: "`` onwards is discarded; the last match
    of ``-?\\d+\\.?\\d*`` in what remains is the answer.

    ``question`` and ``task`` are accepted but not used by this function.

    Returns:
        The matched number as a string, or ``"[invalid]"`` when the text
        contains no number.
    """
    # split(..., 1)[0] is the whole string when the marker is absent.
    leading_part = reasoning.split("Q: ", 1)[0]
    # Plain pattern: the stdlib `re` module is sufficient here.
    numbers = re.findall(r'-?\d+\.?\d*', leading_part)
    return numbers[-1] if numbers else "[invalid]"
23 changes: 23 additions & 0 deletions eval/MATH/examplars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# These exemplars (few-shot examples) are from the DeepSeekMath GitHub repository
# (https://github.com/deepseek-ai/DeepSeek-Math/tree/main/evaluation/few_shot_prompts).
# Each entry is a dict with three string fields:
#   "question"     - the problem statement
#   "cot_answer"   - a worked solution whose final result is wrapped in \boxed{...}
#   "short_answer" - the final answer on its own
EXAMPLARS = [
    {
        # NOTE(review): the question text ends with a stray "}" after the
        # period -- confirm whether it is intentional before changing it,
        # since it is part of the prompt text.
        "question": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
        "cot_answer": "The expressions inside each square root must be non-negative.\nTherefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$.\nAlso, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$.\nTherefore, the domain of the expression is $\\boxed{[2,5)}$.",
        "short_answer": "[2,5)"
    },
    {
        "question": "If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$",
        "cot_answer": "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$",
        "short_answer": "24"
    },
    {
        "question": "Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?",
        "cot_answer": "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{align*}\n30n&=480\\\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}",
        "short_answer": "16"
    },
    {
        "question": "If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.",
        "cot_answer": "If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$",
        "short_answer": "-\\frac{2}{3}"
    }
]
Loading