Commit cea8acd

Merge pull request #234 from RobotSail/add-leaderboard
Implement leaderboard as a benchmark
2 parents 96c8f49 + 66fb8bb commit cea8acd

File tree

9 files changed: +993 additions, -1 deletion

README.md

Lines changed: 10 additions & 0 deletions

````diff
@@ -29,6 +29,16 @@ the phase. At the end of each phase, we evaluate all the checkpoints in order to
 Once training is complete, and we have picked the best checkpoint from the output of the final phase, we can run full-scale evaluation suite which runs MT-Bench, MMLU,
 MT-Bench Branch and MMLU Branch.
 
+### Leaderboard Evaluation
+
+For cases when you want to run the full Open LLM Leaderboard v2 evaluation suite, we provide an optional dependency package for the leaderboard tasks. This includes additional benchmarks like GPQA, IFEVAL, BBH, MMLU-PRO, MUSR, and MATH-HARD.
+
+To install the optional leaderboard dependencies, use:
+
+```bash
+pip install instructlab-eval[leaderboard]
+```
+
 ## Methods of Evaluation
 
 Below are more in-depth explanations of the suite of benchmarks we are using as methods for evaluation of models.
````
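One note on the install command added above: some shells, notably zsh, treat the square brackets in `instructlab-eval[leaderboard]` as a glob pattern, so quoting the argument (`pip install "instructlab-eval[leaderboard]"`) avoids a "no matches found" error.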

pyproject.toml

Lines changed: 2 additions & 0 deletions

```diff
@@ -42,6 +42,7 @@ issues = "https://github.com/instructlab/eval/issues"
 "mmlu_branch" = "instructlab.eval.mmlu:MMLUBranchEvaluator"
 "mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator"
 "mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator"
+"leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator"
 
 [tool.setuptools_scm]
 version_file = "src/instructlab/eval/_version.py"
@@ -53,6 +54,7 @@ package-dir = {"" = "src"}
 
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
+optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}}
 
 [tool.setuptools.packages.find]
 where = ["src"]
```
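The first hunk registers the new evaluator alongside the existing benchmark entry points, and the second wires the `leaderboard` extra to `requirements-leaderboard.txt`. For readers unfamiliar with entry points, here is a rough sketch of how such a registration can be discovered at runtime; the entry-point group name is not visible in this hunk, so the one used below is only a placeholder assumption:

```python
# Sketch of entry-point discovery for the newly registered "leaderboard_v2"
# evaluator. The group name below is a placeholder assumption; the real group
# is defined earlier in pyproject.toml, outside this hunk.
from importlib.metadata import entry_points

for ep in entry_points(group="instructlab.eval.evaluator"):  # hypothetical group name
    if ep.name == "leaderboard_v2":
        evaluator_cls = ep.load()  # imports instructlab.eval.leaderboard lazily
        print(f"Loaded {ep.name} -> {evaluator_cls.__name__}")
```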

requirements-leaderboard.txt (new file)

Lines changed: 10 additions & 0 deletions

```text
lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4

# vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct
vllm<=0.7.3
torch<=2.5.1

# XXX(osilkin): We use StrEnum in leaderboard, but Python 3.10 doesn't have it as part of
# the standard library, so we have to install it from the backport library.
strenum>=0.4.15; python_version < '3.11'
typing-extensions>=4.0.0; python_version < '3.11'
```
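The `strenum` and `typing-extensions` pins back-fill standard-library features missing before Python 3.11. A minimal sketch of the conditional import this enables is below; the actual import logic inside `instructlab.eval.leaderboard` is not shown in this diff, and the enum members are illustrative:

```python
# Minimal sketch: prefer the stdlib StrEnum on Python 3.11+, otherwise fall
# back to the strenum backport pinned in requirements-leaderboard.txt.
import sys

if sys.version_info >= (3, 11):
    from enum import StrEnum
else:  # Python 3.10: StrEnum is not in the stdlib yet
    from strenum import StrEnum


class LeaderboardTask(StrEnum):
    # Hypothetical members; the real task names live in instructlab.eval.leaderboard.
    GPQA = "gpqa"
    IFEVAL = "ifeval"


print(LeaderboardTask.GPQA == "gpqa")  # True: StrEnum members compare equal to plain strings
```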

requirements.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -8,6 +8,7 @@ transformers
 accelerate
 pandas
 pandas-stubs
+# Base lm-eval dependency
 lm-eval>=0.4.4
 httpx
 ragas>=0.2.11
```
scripts/evaluate_best_checkpoint.py (new file)

Lines changed: 84 additions & 0 deletions

```python
#!/usr/bin/env python3

"""
Example usage:
    python scripts/evaluate_best_checkpoint.py \
        /path/to/checkpoint_dir \
        --output-file /path/to/output_file
"""

# Standard
from pathlib import Path
from typing import Optional
import json

# Third Party
import typer

app = typer.Typer()


@app.command()
def main(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
):
    """
    Process files in the input directory and optionally save results to an output file.
    """
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)

    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

    checkpoint_dirs = list(input_dir.glob("hf_format/samples_*"))
    typer.echo(f"Found {len(checkpoint_dirs)} checkpoint directories")

    if not checkpoint_dirs:
        typer.echo(
            f"No checkpoint directories found in the input directory: {input_dir}"
        )
        raise typer.Exit(1)

    # Deferred import: the evaluator pulls in the heavy lm-eval/vLLM stack.
    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
    # First Party
    from instructlab.eval.leaderboard import LeaderboardV2Evaluator

    checkpoint_results = {}
    for checkpoint in checkpoint_dirs:
        typer.echo(f"Processing checkpoint: {checkpoint}")
        ckpt_output_file = checkpoint / "leaderboard_results.json"
        evaluator = LeaderboardV2Evaluator(
            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
        )
        result = evaluator.run()
        checkpoint_results[checkpoint.name] = result
        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")

    # Sort checkpoints by score, best first
    sorted_checkpoints = sorted(
        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
    )
    typer.echo("Sorted checkpoints by score:")
    for checkpoint_name, result in sorted_checkpoints:
        typer.echo(f"{'=' * 100}")
        typer.echo(json.dumps(result, indent=2))

    typer.echo(f"{'=' * 100}")
    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")

    if output_file:
        typer.echo(f"Output will be saved to: {output_file}")
        with open(output_file, "w") as f:
            json.dump(checkpoint_results, f, indent=2)

    typer.echo("Processing complete!")


if __name__ == "__main__":
    app()
```
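Two details worth noting in this script: the `LeaderboardV2Evaluator` import is deferred until after the argument checks, presumably because importing it drags in the heavy lm-eval/vLLM dependencies (hence the "this may take a while" message), and each checkpoint gets its own `leaderboard_results.json` written beside it via `output_file`, while the optional top-level `--output-file` aggregates every result into a single JSON document keyed by checkpoint name.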

scripts/test_leaderboard.py (new file)

Lines changed: 22 additions & 0 deletions

```python
#!/usr/bin/env python
# SPDX-License-Identifier: Apache-2.0

# NOTE: This script requires the leaderboard optional dependencies.
# Install with: pip install instructlab-eval[leaderboard]

# Standard
import json

# First Party
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

if __name__ == "__main__":
    evaluator = LeaderboardV2Evaluator(
        model_path="ibm-granite/granite-3.1-8b-base",
        eval_config={
            "apply_chat_template": False,
        },
    )
    results = evaluator.run()
    print("got results from leaderboard v2")
    print(json.dumps(results, indent=2))
```
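Here `apply_chat_template` is disabled, presumably because `ibm-granite/granite-3.1-8b-base` is a base (non-instruct) model with no chat template to apply; instruct-tuned checkpoints would typically leave it enabled.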
