1 change: 0 additions & 1 deletion .gitignore
@@ -5,5 +5,4 @@ data/datasets
optimized
*/__pycache__
*.pyc
-/workspace
*.json
39 changes: 39 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,39 @@
repos:
- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files

- repo: https://github.com/PyCQA/autoflake
rev: v2.0.1
hooks:
- id: autoflake
args: [
--remove-all-unused-imports,
--ignore-init-module-imports,
--expand-star-imports,
--remove-duplicate-keys,
--remove-unused-variables,
--recursive,
--in-place,
--exclude=__init__.py,
]
files: \.py$

- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: [
"--profile", "black",
"--filter-files",
"--lines-after-imports=2",
]
6 changes: 3 additions & 3 deletions run_baseline.py
@@ -15,8 +15,8 @@
from scripts.async_llm import create_llm_instance

# Import the new Workflow from MBPP workspace
-import workspace.MBPP.workflows.round_8.graph as mbpp_workflow
-import workspace.LiveCodeBench.workflows.round_2.graph as livecodebench_workflow
+import workspace.MBPP.workflows.round_1.graph as mbpp_workflow
+import workspace.LiveCodeBench.workflows.round_1.graph as livecodebench_workflow
import workspace.MBPP.workflows.template.operator as mbpp_operator

from scripts.evaluator import DatasetType
@@ -77,7 +77,7 @@ async def main():
    # log_path refers to the folder of the output CSV.
# test_hotpotqa_benchmark = HotpotQABenchmark(name="HotpotQA", file_path="data/hotpotqa_validate.jsonl", log_path="")
test_mbpp_benchmark = MBPPBenchmark(name="MBPP", file_path="data/datasets/mbpp_test.jsonl", log_path="")
-    test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_raw_validate.jsonl", log_path="experiments/lcb")
+    test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_validate.jsonl", log_path="experiments/lcb")
# test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_validate.jsonl", log_path="")

# results = await test_mbpp_benchmark.run_baseline(mbpp_test_workflow)
Empty file.
39 changes: 39 additions & 0 deletions workspace/LiveCodeBench/workflows/round_1/graph.py
@@ -0,0 +1,39 @@
import workspace.LiveCodeBench.workflows.template.operator as operator
from scripts.async_llm import create_llm_instance
from scripts.evaluator import DatasetType
from scripts.logs import logger


class Workflow:
def __init__(
self,
name: str,
llm_config,
dataset: DatasetType,
) -> None:
self.name = name
self.dataset = dataset
self.llm = create_llm_instance(llm_config)
self.custom = operator.Custom(self.llm)
self.custom_code_generate = operator.CustomCodeGenerate(self.llm)
self.test = operator.Test(self.llm) # NEW: enable automatic verification

async def __call__(self, problem: str, entry_point: str, question_id: str):
"""
Implementation of the workflow
"""
solution = await self.custom_code_generate(
problem=problem, entry_point=entry_point, instruction=""
)
# NEW: run public tests and, if modified, use the repaired solution
test_result = await self.test(
problem=problem,
solution=solution["response"],
entry_point=entry_point,
question_id=question_id,
)
final_solution = test_result.get("solution", solution["response"])
logger.info("-------- test result --------")
logger.info(final_solution)
logger.info("-------- test result --------")
return final_solution, self.llm.get_usage_summary()["total_cost"]
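For orientation, a minimal driver for this workflow might look like the sketch below. It is not part of the PR; the llm_config contents, the dataset value, and the sample problem, entry_point, and question_id are hypothetical placeholders.

# Minimal sketch (not from this PR): run the round_1 LiveCodeBench workflow once.
import asyncio

from workspace.LiveCodeBench.workflows.round_1.graph import Workflow


async def run_once():
    llm_config = {"model": "gpt-4o-mini", "temperature": 0}  # hypothetical config shape
    workflow = Workflow(name="lcb_round_1", llm_config=llm_config, dataset="LiveCodeBench")  # dataset value assumed
    solution, cost = await workflow(
        problem="Read an integer n from stdin and print n * 2.",  # hypothetical problem
        entry_point="main",  # hypothetical entry point
        question_id="sample_0001",  # hypothetical question id
    )
    print(solution, cost)


if __name__ == "__main__":
    asyncio.run(run_once())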
5 changes: 5 additions & 0 deletions workspace/LiveCodeBench/workflows/round_1/prompt.py
@@ -0,0 +1,5 @@
# XXX_PROMPT = """
#
# Solve it.
#
# """
27 changes: 27 additions & 0 deletions workspace/LiveCodeBench/workflows/template/op_prompt.py
@@ -0,0 +1,27 @@
SC_ENSEMBLE_PROMPT = """
Given the question described as follows: {problem}
Several solutions have been generated to address the given question. They are as follows:
{solutions}

Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.

In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
"""


REFLECTION_ON_PUBLIC_TEST_PROMPT = """
Given a code problem and a Python code solution which failed to pass the tests or failed to execute, you need to analyze the reason for the failure and propose a better code solution:
### Problem
{problem}

### Code Solution
{solution}

### Execution Result
{exec_pass}

### Failed Test Case
{test_fail}

Please provide a reflection on the failed test cases and code solution, followed by a better code solution without any additional text or test cases.
"""
157 changes: 157 additions & 0 deletions workspace/LiveCodeBench/workflows/template/operator.py
@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
# @Date : 6/27/2024 17:36 PM
# @Author : didi
# @Desc : operator demo of aflow
from typing import List

from scripts.async_llm import AsyncLLM
from scripts.logs import logger
from scripts.operators import Operator
from scripts.utils.code import extract_test_cases_from_jsonl
from scripts.utils.lcb_runner import grade_stdio
from workspace.LiveCodeBench.workflows.template.op_prompt import *
from workspace.LiveCodeBench.workflows.template.operator_an import *


class Custom(Operator):
def __init__(self, llm: AsyncLLM, name: str = "Custom"):
super().__init__(llm, name)

async def __call__(self, input, instruction):
prompt = instruction + input
response = await self._fill_node(GenerateOp, prompt, mode="single_fill")
return response


class CustomCodeGenerate(Operator):
def __init__(self, llm: AsyncLLM, name: str = "CustomCodeGenerate"):
super().__init__(llm, name)

async def __call__(self, problem, entry_point, instruction):
prompt = instruction + problem
response = await self._fill_node(
GenerateOp, prompt, mode="code_fill", function_name=entry_point
)
return response


class ScEnsemble(Operator):
"""
Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
Link: https://arxiv.org/abs/2203.11171
Paper: Universal Self-Consistency for Large Language Model Generation
Link: https://arxiv.org/abs/2311.17311
"""

def __init__(self, llm: AsyncLLM, name: str = "ScEnsemble"):
super().__init__(llm, name)

async def __call__(self, solutions: List[str], problem: str):
answer_mapping = {}
solution_text = ""
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

prompt = SC_ENSEMBLE_PROMPT.format(problem=problem, solutions=solution_text)
response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")

answer = response.get("solution_letter", "")
answer = answer.strip().upper()

return {"response": solutions[answer_mapping[answer]]}


class Test(Operator):
def __init__(self, llm: AsyncLLM, name: str = "Test"):
super().__init__(llm, name)

def exec_code(self, solution, entry_point, question_id=""):
"""
Execute code using LiveCodeBench runner for consistency with official evaluation
"""
import json

# For LiveCodeBench, use question_id to find test cases
search_key = question_id if question_id else entry_point
test_cases = extract_test_cases_from_jsonl(search_key, dataset="LiveCodeBench")

# Handle case where no test cases are found
if test_cases is None:
return {"exec_fail_case": f"No test cases found for {search_key}"}

try:
# Parse test cases - they should be in JSON format for LiveCodeBench
if isinstance(test_cases, str):
test_cases = json.loads(test_cases)

# Extract inputs and outputs for lcb_runner
inputs = []
outputs = []

for test_case in test_cases:
if isinstance(test_case, dict):
inputs.append(test_case.get("input", ""))
outputs.append(test_case.get("output", ""))
            logger.debug(f"parsed test inputs: {inputs}")
            logger.debug(f"parsed test outputs: {outputs}")

# Use grade_stdio directly to avoid multiprocessing issues
results, metadata = grade_stdio(
code=solution, all_inputs=inputs, all_outputs=outputs, timeout=6
)

logger.info(f"results: {results} {metadata}")

# Check if all tests passed
            if isinstance(results, list) and all(r in (True, 1) for r in results):
return "no error"
else:
# Return error information
return {"exec_fail_case": f"Test failed: {metadata}"}

except Exception as e:
return {"exec_fail_case": f"Error executing tests: {str(e)}"}

async def __call__(
self, problem, solution, entry_point, test_loop: int = 3, question_id: str = ""
):
"""
"Test": {
"description": "Test the solution with test cases, if the solution is correct, return 'no error', if the solution is incorrect, return reflect on the soluion and the error information",
"interface": "test(problem: str, solution: str, entry_point: str) -> str"
}
"""
for _ in range(test_loop):
result = self.exec_code(solution, entry_point, question_id)
if result == "no error":
return {"result": True, "solution": solution}
elif "exec_fail_case" in result:
result = result["exec_fail_case"]
prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
problem=problem,
solution=solution,
exec_pass=f"executed unsuccessfully, error: \n {result}",
test_fail="executed unsucessfully",
)
response = await self._fill_node(
ReflectionTestOp, prompt, mode="code_fill"
)
solution = response["response"]
else:
prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
problem=problem,
solution=solution,
exec_pass="executed successfully",
test_fail=result,
)
response = await self._fill_node(
ReflectionTestOp, prompt, mode="code_fill"
)
solution = response["response"]

result = self.exec_code(solution, entry_point, question_id)
if result == "no error":
return {"result": True, "solution": solution}
else:
return {"result": False, "solution": solution}
26 changes: 26 additions & 0 deletions workspace/LiveCodeBench/workflows/template/operator_an.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# @Date : 6/27/2024 19:46 PM
# @Author : didi
# @Desc : action nodes for operator

from pydantic import BaseModel, Field


class GenerateOp(BaseModel):
response: str = Field(default="", description="Your solution for this problem")


class ScEnsembleOp(BaseModel):
thought: str = Field(
default="", description="The thought of the most consistent solution."
)
solution_letter: str = Field(
default="", description="The letter of most consistent solution."
)


class ReflectionTestOp(BaseModel):
reflection_and_solution: str = Field(
default="",
description="Corrective solution for code execution errors or test case failures",
)
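Finally, a small usage illustration (not from this PR) of the action-node models above; the field values are hypothetical, and model_dump() assumes pydantic v2 (on v1 it would be .dict()).

# Hypothetical usage of the action-node models; values are illustrative only.
from workspace.LiveCodeBench.workflows.template.operator_an import GenerateOp, ScEnsembleOp

op = GenerateOp(response="def add(a, b):\n    return a + b")
print(op.response)

vote = ScEnsembleOp(thought="Candidates B and C produce the same output.", solution_letter="B")
print(vote.model_dump())  # {'thought': ..., 'solution_letter': 'B'} on pydantic v2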