1 change: 0 additions & 1 deletion .gitignore
@@ -5,5 +5,4 @@ data/datasets
optimized
*/__pycache__
*.pyc
-/workspace
*.json
39 changes: 39 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,39 @@
repos:
- repo: https://github.com/psf/black
rev: 23.1.0
hooks:
- id: black

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files

- repo: https://github.com/PyCQA/autoflake
rev: v2.0.1
hooks:
- id: autoflake
args: [
--remove-all-unused-imports,
--ignore-init-module-imports,
--expand-star-imports,
--remove-duplicate-keys,
--remove-unused-variables,
--recursive,
--in-place,
--exclude=__init__.py,
]
files: \.py$

- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: [
"--profile", "black",
"--filter-files",
"--lines-after-imports=2",
]
6 changes: 3 additions & 3 deletions run_baseline.py
@@ -15,8 +15,8 @@
from scripts.async_llm import create_llm_instance

# Import the new Workflow from MBPP workspace
-import workspace.MBPP.workflows.round_8.graph as mbpp_workflow
-import workspace.LiveCodeBench.workflows.round_2.graph as livecodebench_workflow
+import workspace.MBPP.workflows.round_1.graph as mbpp_workflow
+import workspace.LiveCodeBench.workflows.round_1.graph as livecodebench_workflow
import workspace.MBPP.workflows.template.operator as mbpp_operator

from scripts.evaluator import DatasetType
@@ -77,7 +77,7 @@ async def main():
    # log_path refers to the folder of the output CSV.
# test_hotpotqa_benchmark = HotpotQABenchmark(name="HotpotQA", file_path="data/hotpotqa_validate.jsonl", log_path="")
test_mbpp_benchmark = MBPPBenchmark(name="MBPP", file_path="data/datasets/mbpp_test.jsonl", log_path="")
-    test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_raw_validate.jsonl", log_path="experiments/lcb")
+    test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_validate.jsonl", log_path="experiments/lcb")
# test_livecodebench = LiveCodeBench(name="LiveCodeBench", file_path="data/datasets/livecodebench_validate.jsonl", log_path="")

# results = await test_mbpp_benchmark.run_baseline(mbpp_test_workflow)
Empty file.
39 changes: 39 additions & 0 deletions workspace/LiveCodeBench/workflows/round_1/graph.py
@@ -0,0 +1,39 @@
import workspace.LiveCodeBench.workflows.template.operator as operator
from scripts.async_llm import create_llm_instance
from scripts.evaluator import DatasetType
from scripts.logs import logger


class Workflow:
def __init__(
self,
name: str,
llm_config,
dataset: DatasetType,
) -> None:
self.name = name
self.dataset = dataset
self.llm = create_llm_instance(llm_config)
self.custom = operator.Custom(self.llm)
self.custom_code_generate = operator.CustomCodeGenerate(self.llm)
self.test = operator.Test(self.llm) # NEW: enable automatic verification

async def __call__(self, problem: str, entry_point: str, question_id: str):
"""
Implementation of the workflow
"""
solution = await self.custom_code_generate(
problem=problem, entry_point=entry_point, instruction=""
)
# NEW: run public tests and, if modified, use the repaired solution
test_result = await self.test(
problem=problem,
solution=solution["response"],
entry_point=entry_point,
question_id=question_id,
)
final_solution = test_result.get("solution", solution["response"])
logger.info("-------- test result --------")
logger.info(final_solution)
logger.info("-------- test result --------")
return final_solution, self.llm.get_usage_summary()["total_cost"]
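For orientation, a minimal driver for this workflow might look like the sketch below. It is not part of the PR; the llm_config contents, the dataset value, and the sample problem, entry_point, and question_id are hypothetical placeholders.

# Minimal sketch (not from this PR): run the round_1 LiveCodeBench workflow once.
import asyncio

from workspace.LiveCodeBench.workflows.round_1.graph import Workflow


async def run_once():
    llm_config = {"model": "gpt-4o-mini", "temperature": 0}  # hypothetical config shape
    workflow = Workflow(name="lcb_round_1", llm_config=llm_config, dataset="LiveCodeBench")  # dataset value assumed
    solution, cost = await workflow(
        problem="Read an integer n from stdin and print n * 2.",  # hypothetical problem
        entry_point="main",  # hypothetical entry point
        question_id="sample_0001",  # hypothetical question id
    )
    print(solution, cost)


if __name__ == "__main__":
    asyncio.run(run_once())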
5 changes: 5 additions & 0 deletions workspace/LiveCodeBench/workflows/round_1/prompt.py
@@ -0,0 +1,5 @@
# XXX_PROMPT = """
#
# Solve it.
#
# """
27 changes: 27 additions & 0 deletions workspace/LiveCodeBench/workflows/template/op_prompt.py
@@ -0,0 +1,27 @@
SC_ENSEMBLE_PROMPT = """
Given the question described as follows: {problem}
Several solutions have been generated to address the given question. They are as follows:
{solutions}

Carefully evaluate these solutions and identify the answer that appears most frequently across them. This consistency in answers is crucial for determining the most reliable solution.

In the "thought" field, provide a detailed explanation of your thought process. In the "solution_letter" field, output only the single letter ID (A, B, C, etc.) corresponding to the most consistent solution. Do not include any additional text or explanation in the "solution_letter" field.
"""


REFLECTION_ON_PUBLIC_TEST_PROMPT = """
Given a code problem and a Python code solution which failed to pass the tests or failed to execute, you need to analyze the reason for the failure and propose a better code solution:
### Problem
{problem}

### Code Solution
{solution}

### Execution Result
{exec_pass}

### Failed Test Case
{test_fail}

Please provide a reflection on the failed test cases and code solution, followed by a better code solution without any additional text or test cases.
"""
157 changes: 157 additions & 0 deletions workspace/LiveCodeBench/workflows/template/operator.py
@@ -0,0 +1,157 @@
# -*- coding: utf-8 -*-
# @Date : 6/27/2024 17:36 PM
# @Author : didi
# @Desc : operator demo of aflow
from typing import List

from scripts.async_llm import AsyncLLM
from scripts.logs import logger
from scripts.operators import Operator
from scripts.utils.code import extract_test_cases_from_jsonl
from scripts.utils.lcb_runner import grade_stdio
from workspace.LiveCodeBench.workflows.template.op_prompt import *
from workspace.LiveCodeBench.workflows.template.operator_an import *


class Custom(Operator):
def __init__(self, llm: AsyncLLM, name: str = "Custom"):
super().__init__(llm, name)

async def __call__(self, input, instruction):
prompt = instruction + input
response = await self._fill_node(GenerateOp, prompt, mode="single_fill")
return response


class CustomCodeGenerate(Operator):
def __init__(self, llm: AsyncLLM, name: str = "CustomCodeGenerate"):
super().__init__(llm, name)

async def __call__(self, problem, entry_point, instruction):
prompt = instruction + problem
response = await self._fill_node(
GenerateOp, prompt, mode="code_fill", function_name=entry_point
)
return response


class ScEnsemble(Operator):
"""
Paper: Self-Consistency Improves Chain of Thought Reasoning in Language Models
Link: https://arxiv.org/abs/2203.11171
Paper: Universal Self-Consistency for Large Language Model Generation
Link: https://arxiv.org/abs/2311.17311
"""

def __init__(self, llm: AsyncLLM, name: str = "ScEnsemble"):
super().__init__(llm, name)

async def __call__(self, solutions: List[str], problem: str):
answer_mapping = {}
solution_text = ""
for index, solution in enumerate(solutions):
answer_mapping[chr(65 + index)] = index
solution_text += f"{chr(65 + index)}: \n{str(solution)}\n\n\n"

prompt = SC_ENSEMBLE_PROMPT.format(problem=problem, solutions=solution_text)
response = await self._fill_node(ScEnsembleOp, prompt, mode="xml_fill")

answer = response.get("solution_letter", "")
answer = answer.strip().upper()

return {"response": solutions[answer_mapping[answer]]}


class Test(Operator):
def __init__(self, llm: AsyncLLM, name: str = "Test"):
super().__init__(llm, name)

def exec_code(self, solution, entry_point, question_id=""):
"""
Execute code using LiveCodeBench runner for consistency with official evaluation
"""
import json

# For LiveCodeBench, use question_id to find test cases
search_key = question_id if question_id else entry_point
test_cases = extract_test_cases_from_jsonl(search_key, dataset="LiveCodeBench")

# Handle case where no test cases are found
if test_cases is None:
return {"exec_fail_case": f"No test cases found for {search_key}"}

try:
# Parse test cases - they should be in JSON format for LiveCodeBench
if isinstance(test_cases, str):
test_cases = json.loads(test_cases)

# Extract inputs and outputs for lcb_runner
inputs = []
outputs = []

for test_case in test_cases:
if isinstance(test_case, dict):
inputs.append(test_case.get("input", ""))
outputs.append(test_case.get("output", ""))
            logger.debug(f"parsed test inputs: {inputs}")
            logger.debug(f"parsed test outputs: {outputs}")

# Use grade_stdio directly to avoid multiprocessing issues
results, metadata = grade_stdio(
code=solution, all_inputs=inputs, all_outputs=outputs, timeout=6
)

logger.info(f"results: {results} {metadata}")

# Check if all tests passed
            if isinstance(results, list) and all(r in (True, 1) for r in results):
return "no error"
else:
# Return error information
return {"exec_fail_case": f"Test failed: {metadata}"}

except Exception as e:
return {"exec_fail_case": f"Error executing tests: {str(e)}"}

async def __call__(
self, problem, solution, entry_point, test_loop: int = 3, question_id: str = ""
):
"""
"Test": {
"description": "Test the solution with test cases, if the solution is correct, return 'no error', if the solution is incorrect, return reflect on the soluion and the error information",
"interface": "test(problem: str, solution: str, entry_point: str) -> str"
}
"""
for _ in range(test_loop):
result = self.exec_code(solution, entry_point, question_id)
if result == "no error":
return {"result": True, "solution": solution}
elif "exec_fail_case" in result:
result = result["exec_fail_case"]
prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
problem=problem,
solution=solution,
exec_pass=f"executed unsuccessfully, error: \n {result}",
test_fail="executed unsucessfully",
)
response = await self._fill_node(
ReflectionTestOp, prompt, mode="code_fill"
)
solution = response["response"]
else:
prompt = REFLECTION_ON_PUBLIC_TEST_PROMPT.format(
problem=problem,
solution=solution,
exec_pass="executed successfully",
test_fail=result,
)
response = await self._fill_node(
ReflectionTestOp, prompt, mode="code_fill"
)
solution = response["response"]

result = self.exec_code(solution, entry_point, question_id)
if result == "no error":
return {"result": True, "solution": solution}
else:
return {"result": False, "solution": solution}
26 changes: 26 additions & 0 deletions workspace/LiveCodeBench/workflows/template/operator_an.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
# @Date : 6/27/2024 19:46 PM
# @Author : didi
# @Desc : action nodes for operator

from pydantic import BaseModel, Field


class GenerateOp(BaseModel):
response: str = Field(default="", description="Your solution for this problem")


class ScEnsembleOp(BaseModel):
thought: str = Field(
default="", description="The thought of the most consistent solution."
)
solution_letter: str = Field(
default="", description="The letter of most consistent solution."
)


class ReflectionTestOp(BaseModel):
reflection_and_solution: str = Field(
default="",
description="Corrective solution for code execution errors or test case failures",
)
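Finally, a small usage illustration (not from this PR) of the action-node models above; the field values are hypothetical, and model_dump() assumes pydantic v2 (on v1 it would be .dict()).

# Hypothetical usage of the action-node models; values are illustrative only.
from workspace.LiveCodeBench.workflows.template.operator_an import GenerateOp, ScEnsembleOp

op = GenerateOp(response="def add(a, b):\n    return a + b")
print(op.response)

vote = ScEnsembleOp(thought="Candidates B and C produce the same output.", solution_letter="B")
print(vote.model_dump())  # {'thought': ..., 'solution_letter': 'B'} on pydantic v2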