diff --git a/.gitignore b/.gitignore
index dd44983..812dd12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,5 +5,4 @@ data/datasets
 optimized
 */__pycache__
 *.pyc
-/workspace
 *.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..3b6c1ba
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,39 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+      - id: black
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+
+  - repo: https://github.com/PyCQA/autoflake
+    rev: v2.0.1
+    hooks:
+      - id: autoflake
+        args: [
+          --remove-all-unused-imports,
+          --ignore-init-module-imports,
+          --expand-star-imports,
+          --remove-duplicate-keys,
+          --remove-unused-variables,
+          --recursive,
+          --in-place,
+          --exclude=__init__.py,
+        ]
+        files: \.py$
+
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: [
+          "--profile", "black",
+          "--filter-files",
+          "--lines-after-imports=2",
+        ]
diff --git a/run_baseline.py b/run_baseline.py
index e7fd043..28f0df3 100644
--- a/run_baseline.py
+++ b/run_baseline.py
@@ -15,8 +15,8 @@ from scripts.async_llm import create_llm_instance
 
 # Import the new Workflow from MBPP workspace
-import workspace.MBPP.workflows.round_8.graph as mbpp_workflow
-import workspace.LiveCodeBench.workflows.round_2.graph as livecodebench_workflow
+import workspace.MBPP.workflows.round_1.graph as mbpp_workflow
+import workspace.LiveCodeBench.workflows.round_1.graph as livecodebench_workflow
 import workspace.MBPP.workflows.template.operator as mbpp_operator
 from scripts.evaluator import DatasetType
 
 
@@ -77,7 +77,7 @@ async def main():
     # log_path refer to the folder of output csv.
     # test_hotpotqa_benchmark = HotpotQABenchmark(name="HotpotQA", file_path="data/hotpotqa_validate.jsonl", log_path="")
     test_mbpp_benchmark = MBPPBenchmark(name="MBPP", file_path="data/datasets/mbpp_test.jsonl", log_path="")
-    test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_raw_validate.jsonl", log_path="experiments/lcb")
+    test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_validate.jsonl", log_path="experiments/lcb")
     # test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_validate.jsonl", log_path="")
 
     # results = await test_mbpp_benchmark.run_baseline(mbpp_test_workflow)
diff --git a/workspace/LiveCodeBench/workflows/round_1/__init__.py b/workspace/LiveCodeBench/workflows/round_1/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/workspace/LiveCodeBench/workflows/round_1/graph.py b/workspace/LiveCodeBench/workflows/round_1/graph.py
new file mode 100644
index 0000000..a40d4ad
--- /dev/null
+++ b/workspace/LiveCodeBench/workflows/round_1/graph.py
@@ -0,0 +1,39 @@
+import workspace.LiveCodeBench.workflows.template.operator as operator
+from scripts.async_llm import create_llm_instance
+from scripts.evaluator import DatasetType
+from scripts.logs import logger
+
+
+class Workflow:
+    def __init__(
+        self,
+        name: str,
+        llm_config,
+        dataset: DatasetType,
+    ) -> None:
+        self.name = name
+        self.dataset = dataset
+        self.llm = create_llm_instance(llm_config)
+        self.custom = operator.Custom(self.llm)
+        self.custom_code_generate = operator.CustomCodeGenerate(self.llm)
+        self.test = operator.Test(self.llm)  # NEW: enable automatic verification
+
+    async def __call__(self, problem: str, entry_point: str, question_id: str):
+        """
+        Implementation of the workflow
+        """
+        solution = await self.custom_code_generate(
+            problem=problem, entry_point=entry_point, instruction=""
+        )
+        # NEW: run public tests and, if modified, use the repaired solution
+        test_result = await self.test(
+            problem=problem,
+            solution=solution["response"],
+            entry_point=entry_point,
+            question_id=question_id,
+        )
+        final_solution = test_result.get("solution", solution["response"])
+        logger.info("-------- test result --------")
+        logger.info(final_solution)
+        logger.info("-------- test result --------")
+        return final_solution, self.llm.get_usage_summary()["total_cost"]
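
For orientation, here is a minimal sketch of how the new round_1 LiveCodeBench Workflow might be driven on its own, outside run_baseline.py. The llm_config contents, the sample problem, and the question_id are invented placeholders (real values come from the project's LLM configuration and the LiveCodeBench jsonl), so treat this as an illustration of the call signature rather than a supported entry point.

    # Hypothetical standalone driver for the round_1 workflow (sketch only).
    # The llm_config shape, sample problem, and question_id are placeholders.
    import asyncio

    import workspace.LiveCodeBench.workflows.round_1.graph as lcb_workflow


    async def demo() -> None:
        workflow = lcb_workflow.Workflow(
            name="LiveCodeBench",
            llm_config={"model": "gpt-4o-mini", "temperature": 0},  # placeholder config
            dataset="LiveCodeBench",
        )
        # __call__ returns (final_solution, total_cost), as defined in graph.py
        solution, cost = await workflow(
            problem="Read an integer n from stdin and print n * 2.",
            entry_point="",           # stdio-style problems carry no entry function
            question_id="demo_0001",  # placeholder id; must exist in the test-case jsonl
        )
        print(solution, cost)


    if __name__ == "__main__":
        asyncio.run(demo())
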
diff --git a/workspace/LiveCodeBench/workflows/round_1/prompt.py b/workspace/LiveCodeBench/workflows/round_1/prompt.py
new file mode 100644
index 0000000..4a3b247
--- /dev/null
+++ b/workspace/LiveCodeBench/workflows/round_1/prompt.py
@@ -0,0 +1,5 @@
+# XXX_PROMPT = """
+#
+# Solve it.
+#
+# """
diff --git a/workspace/LiveCodeBench/workflows/template/op_prompt.py b/workspace/LiveCodeBench/workflows/template/op_prompt.py
new file mode 100644
index 0000000..6d9f4ae
--- /dev/null
+++ b/workspace/LiveCodeBench/workflows/template/op_prompt.py
@@ -0,0 +1,27 @@
+SC_ENSEMBLE_PROMPT = """
+Given the question described as follows: {problem}
+Several solutions have been generated to address the given question. They are as follows:
+{solutions}
+
+Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.
+
+In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
+"""
+
+
+REFLECTION_ON_PUBLIC_TEST_PROMPT = """
+Given a code problem and a python code solution which failed to pass the tests or to execute, you need to analyze the reason for the failure and propose a better code solution:
+### problem
+{problem}
+
+### Code Solution
+{solution}
+
+### Execution Result
+{exec_pass}
+
+#### Failed Test Case
+{test_fail}
+
+Please provide a reflection on the failed test cases and code solution, followed by a better code solution without any additional text or test cases.
+"""
diff --git a/workspace/LiveCodeBench/workflows/template/operator.py b/workspace/LiveCodeBench/workflows/template/operator.py
new file mode 100644
index 0000000..d29435c
--- /dev/null
+++ b/workspace/LiveCodeBench/workflows/template/operator.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+# @Date : 6/27/2024 17:36 PM
+# @Author : didi
+# @Desc : operator demo of aflow
+from typing import List
+
+from scripts.async_llm import AsyncLLM
+from scripts.logs import logger
+from scripts.operators import Operator
+from scripts.utils.code import extract_test_cases_from_jsonl
+from scripts.utils.lcb_runner import grade_stdio
+from workspace.MBPP.workflows.template.op_prompt import *
+from workspace.MBPP.workflows.template.operator_an import *
+
+
+class Custom(Operator):
+    def __init__(self, llm: AsyncLLM, name: str = "Custom"):
+        super().__init__(llm, name)
+
+    async def __call__(self, input, instruction):
+        prompt = instruction + input
+        response = await self._fill_node(GenerateOp, prompt, mode="single_fill")
+        return response
+
+
+class CustomCodeGenerate(Operator):
+    def __init__(self, llm: AsyncLLM, name: str = "CustomCodeGenerate"):
+        super().__init__(llm, name)
+
+    async def __call__(self, problem, entry_point, instruction):
+        prompt = instruction + problem
+        response = await self._fill_node(
+            GenerateOp, prompt, mode="code_fill", function_name=entry_point
+        )
+        return response
+
+
+class ScEnsemble(Operator):
+    """
+    Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
+    Link: https://arxiv.org/abs/2203.11171
+    Paper: Universal Self-Consistency for Large Language Model Generation
+    Link: https://arxiv.org/abs/2311.17311
+    """
+
+    def __init__(self, llm: AsyncLLM, name: str = "ScEnsemble"):
+        super().__init__(llm, name)
+
+    async def __call__(self, solutions: List[str], problem: str):
+        answer_mapping = {}
+        solution_text = ""
+        for index, solution in enumerate(solutions):
+            answer_mapping[chr(65 + index)] = index
+            solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"
+
+        prompt = SC_ENSEMBLE_PROMPT.format(problem=problem, solutions=solution_text)
+        response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")
+
+        answer = response.get("solution_letter", "")
+        answer = answer.strip().upper()
+
+        return {"response": solutions[answer_mapping[answer]]}
+
+
+class Test(Operator):
+    def __init__(self, llm: AsyncLLM, name: str = "Test"):
+        super().__init__(llm, name)
+
+    def exec_code(self, solution, entry_point, question_id=""):
+        """
+        Execute code using LiveCodeBench runner for consistency with official evaluation
+        """
+        import json
+
+        # For LiveCodeBench, use question_id to find test cases
+        search_key = question_id if question_id else entry_point
+        test_cases = extract_test_cases_from_jsonl(search_key, dataset="LiveCodeBench")
+
+        # Handle case where no test cases are found
+        if test_cases is None:
+            return {"exec_fail_case": f"No test cases found for {search_key}"}
+
+        try:
+            # Parse test cases - they should be in JSON format for LiveCodeBench
+            if isinstance(test_cases, str):
+                test_cases = json.loads(test_cases)
+
+            # Extract inputs and outputs for lcb_runner
+            inputs = []
+            outputs = []
+
+            for test_case in test_cases:
+                if isinstance(test_case, dict):
+                    inputs.append(test_case.get("input", ""))
+                    outputs.append(test_case.get("output", ""))
+            print(inputs)
+            print(outputs)
+
+            # Use grade_stdio directly to avoid multiprocessing issues
+            results, metadata = grade_stdio(
+                code=solution, all_inputs=inputs, all_outputs=outputs, timeout=6
+            )
+
+            logger.info(f"results: {results} {metadata}")
+
+            # Check if all tests passed
+            if isinstance(results, list) and all(r == True or r == 1 for r in results):
+                return "no error"
+            else:
+                # Return error information
+                return {"exec_fail_case": f"Test failed: {metadata}"}
+
+        except Exception as e:
+            return {"exec_fail_case": f"Error executing tests: {str(e)}"}
+
+    async def __call__(
+        self, problem, solution, entry_point, test_loop: int = 3, question_id: str = ""
+    ):
+        """
+        "Test": {
+            "description": "Test the solution with test cases, if the solution is correct, return 'no error', if the solution is incorrect, return a reflection on the solution and the error information",
+            "interface": "test(problem: str, solution: str, entry_point: str) -> str"
+        }
+        """
+        for _ in range(test_loop):
+            result = self.exec_code(solution, entry_point, question_id)
+            if result == "no error":
+                return {"result": True, "solution": solution}
+            elif "exec_fail_case" in result:
+                result = result["exec_fail_case"]
+                prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
+                    problem=problem,
+                    solution=solution,
+                    exec_pass=f"executed unsuccessfully, error: \n {result}",
+                    test_fail="executed unsuccessfully",
+                )
+                response = await self._fill_node(
+                    ReflectionTestOp, prompt, mode="code_fill"
+                )
+                solution = response["response"]
+            else:
+                prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
+                    problem=problem,
+                    solution=solution,
+                    exec_pass="executed successfully",
+                    test_fail=result,
+                )
+                response = await self._fill_node(
+                    ReflectionTestOp, prompt, mode="code_fill"
+                )
+                solution = response["response"]
+
+        result = self.exec_code(solution, entry_point, question_id)
+        if result == "no error":
+            return {"result": True, "solution": solution}
+        else:
+            return {"result": False, "solution": solution}
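
A note on the data shape Test.exec_code assumes: extract_test_cases_from_jsonl is expected to yield, possibly as a JSON string, a list of dicts whose "input" is fed to the program's stdin and whose "output" is the expected stdout handed to grade_stdio. The values below are invented purely to illustrate that shape:

    # Illustrative test-case payload for exec_code (values are made up).
    import json

    example_payload = json.dumps(
        [
            {"input": "3\n", "output": "6\n"},
            {"input": "10\n", "output": "20\n"},
        ]
    )

    # This mirrors the parsing exec_code performs before calling grade_stdio.
    parsed = json.loads(example_payload)
    inputs = [case.get("input", "") for case in parsed]
    outputs = [case.get("output", "") for case in parsed]
    assert inputs == ["3\n", "10\n"] and outputs == ["6\n", "20\n"]
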
{"exec_fail_case": f"No test cases found for {search_key}"} + + try: + # Parse test cases - they should be in JSON format for LiveCodeBench + if isinstance(test_cases, str): + test_cases = json.loads(test_cases) + + # Extract inputs and outputs for lcb_runner + inputs = [] + outputs = [] + + for test_case in test_cases: + if isinstance(test_case, dict): + inputs.append(test_case.get("input", "")) + outputs.append(test_case.get("output", "")) + print(inputs) + print(outputs) + + # Use grade_stdio directly to avoid multiprocessing issues + results, metadata = grade_stdio( + code=solution, all_inputs=inputs, all_outputs=outputs, timeout=6 + ) + + logger.info(f"results: {results} {metadata}") + + # Check if all tests passed + if isinstance(results, list) and all(r == True or r == 1 for r in results): + return "no error" + else: + # Return error information + return {"exec_fail_case": f"Test failed: {metadata}"} + + except Exception as e: + return {"exec_fail_case": f"Error executing tests: {str(e)}"} + + async def __call__( + self, problem, solution, entry_point, test_loop: int = 3, question_id: str = "" + ): + """ + "Test": { + "description": "Test the solution with test cases, if the solution is correct, return 'no error', if the solution is incorrect, return reflect on the soluion and the error information", + "interface": "test(problem: str, solution: str, entry_point: str) -> str" + } + """ + for _ in range(test_loop): + result = self.exec_code(solution, entry_point, question_id) + if result == "no error": + return {"result": True, "solution": solution} + elif "exec_fail_case" in result: + result = result["exec_fail_case"] + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem=problem, + solution=solution, + exec_pass=f"executed unsuccessfully, error: \n {result}", + test_fail="executed unsucessfully", + ) + response = await self._fill_node( + ReflectionTestOp, prompt, mode="code_fill" + ) + solution = response["response"] + else: + prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format( + problem=problem, + solution=solution, + exec_pass="executed successfully", + test_fail=result, + ) + response = await self._fill_node( + ReflectionTestOp, prompt, mode="code_fill" + ) + solution = response["response"] + + result = self.exec_code(solution, entry_point, question_id) + if result == "no error": + return {"result": True, "solution": solution} + else: + return {"result": False, "solution": solution} diff --git a/workspace/LiveCodeBench/workflows/template/operator_an.py b/workspace/LiveCodeBench/workflows/template/operator_an.py new file mode 100644 index 0000000..9b97d9e --- /dev/null +++ b/workspace/LiveCodeBench/workflows/template/operator_an.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# @Date : 6/27/2024 19:46 PM +# @Author : didi +# @Desc : action nodes for operator + +from pydantic import BaseModel, Field + + +class GenerateOp(BaseModel): + response: str = Field(default="", description="Your solution for this problem") + + +class ScEnsembleOp(BaseModel): + thought: str = Field( + default="", description="The thought of the most consistent solution." + ) + solution_letter: str = Field( + default="", description="The letter of most consistent solution." + ) + + +class ReflectionTestOp(BaseModel): + reflection_and_solution: str = Field( + default="", + description="Corrective solution for code execution errors or test case failures", + )