Commit cea8acd

Merge pull request #234 from RobotSail/add-leaderboard
Implement leaderboard as a benchmark
2 parents 96c8f49 + 66fb8bb commit cea8acd

File tree

9 files changed: +993 additions, -1 deletion

README.md

Lines changed: 10 additions & 0 deletions

````diff
@@ -29,6 +29,16 @@ the phase. At the end of each phase, we evaluate all the checkpoints in order to
 Once training is complete, and we have picked the best checkpoint from the output of the final phase, we can run full-scale evaluation suite which runs MT-Bench, MMLU,
 MT-Bench Branch and MMLU Branch.
 
+### Leaderboard Evaluation
+
+For cases when you want to run the full Open LLM Leaderboard v2 evaluation suite, we provide an optional dependency package for the leaderboard tasks. This includes additional benchmarks like GPQA, IFEVAL, BBH, MMLU-PRO, MUSR, and MATH-HARD.
+
+To install the optional leaderboard dependencies, use:
+
+```bash
+pip install instructlab-eval[leaderboard]
+```
+
 ## Methods of Evaluation
 
 Below are more in-depth explanations of the suite of benchmarks we are using as methods for evaluation of models.
````
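One note on the install command added above: some shells, notably zsh, treat the square brackets in `instructlab-eval[leaderboard]` as a glob pattern, so quoting the argument (`pip install "instructlab-eval[leaderboard]"`) avoids a "no matches found" error.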

pyproject.toml

Lines changed: 2 additions & 0 deletions

```diff
@@ -42,6 +42,7 @@ issues = "https://github.com/instructlab/eval/issues"
 "mmlu_branch" = "instructlab.eval.mmlu:MMLUBranchEvaluator"
 "mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator"
 "mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator"
+"leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator"
 
 [tool.setuptools_scm]
 version_file = "src/instructlab/eval/_version.py"
@@ -53,6 +54,7 @@ package-dir = {"" = "src"}
 
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
+optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}}
 
 [tool.setuptools.packages.find]
 where = ["src"]
```
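The first hunk registers the new evaluator alongside the existing benchmark entry points, and the second wires the `leaderboard` extra to `requirements-leaderboard.txt`. For readers unfamiliar with entry points, here is a rough sketch of how such a registration can be discovered at runtime; the entry-point group name is not visible in this hunk, so the one used below is only a placeholder assumption:

```python
# Sketch of entry-point discovery for the newly registered "leaderboard_v2"
# evaluator. The group name below is a placeholder assumption; the real group
# is defined earlier in pyproject.toml, outside this hunk.
from importlib.metadata import entry_points

for ep in entry_points(group="instructlab.eval.evaluator"):  # hypothetical group name
    if ep.name == "leaderboard_v2":
        evaluator_cls = ep.load()  # imports instructlab.eval.leaderboard lazily
        print(f"Loaded {ep.name} -> {evaluator_cls.__name__}")
```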

requirements-leaderboard.txt (new file)

Lines changed: 10 additions & 0 deletions

```text
lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4

# vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct
vllm<=0.7.3
torch<=2.5.1

# XXX(osilkin): We use StrEnum in leaderboard, but Python 3.10 doesn't have it as part of
# the standard library, so we have to install it from the backport library.
strenum>=0.4.15; python_version < '3.11'
typing-extensions>=4.0.0; python_version < '3.11'
```
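The `strenum` and `typing-extensions` pins back-fill standard-library features missing before Python 3.11. A minimal sketch of the conditional import this enables is below; the actual import logic inside `instructlab.eval.leaderboard` is not shown in this diff, and the enum members are illustrative:

```python
# Minimal sketch: prefer the stdlib StrEnum on Python 3.11+, otherwise fall
# back to the strenum backport pinned in requirements-leaderboard.txt.
import sys

if sys.version_info >= (3, 11):
    from enum import StrEnum
else:  # Python 3.10: StrEnum is not in the stdlib yet
    from strenum import StrEnum


class LeaderboardTask(StrEnum):
    # Hypothetical members; the real task names live in instructlab.eval.leaderboard.
    GPQA = "gpqa"
    IFEVAL = "ifeval"


print(LeaderboardTask.GPQA == "gpqa")  # True: StrEnum members compare equal to plain strings
```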

requirements.txt

Lines changed: 1 addition & 0 deletions

```diff
@@ -8,6 +8,7 @@ transformers
 accelerate
 pandas
 pandas-stubs
+# Base lm-eval dependency
 lm-eval>=0.4.4
 httpx
 ragas>=0.2.11
```
scripts/evaluate_best_checkpoint.py (new file)

Lines changed: 84 additions & 0 deletions

```python
#!/usr/bin/env python3

"""
Example usage:
    python scripts/evaluate_best_checkpoint.py \
        /path/to/checkpoint_dir \
        --output-file /path/to/output_file
"""

# Standard
from pathlib import Path
from typing import Optional
import json

# Third Party
import typer

app = typer.Typer()


@app.command()
def main(
    input_dir: Path = typer.Argument(..., help="Input directory to process"),
    output_file: Optional[Path] = typer.Option(None, help="Optional output file path"),
):
    """
    Process files in the input directory and optionally save results to an output file.
    """
    if not input_dir.exists():
        typer.echo(f"Error: Input directory '{input_dir}' does not exist")
        raise typer.Exit(1)

    if not input_dir.is_dir():
        typer.echo(f"Error: '{input_dir}' is not a directory")
        raise typer.Exit(1)

    checkpoint_dirs = list(input_dir.glob("hf_format/samples_*"))
    typer.echo(f"Found {len(checkpoint_dirs)} checkpoint directories")

    if not checkpoint_dirs:
        typer.echo(
            f"No checkpoint directories found in the input directory: {input_dir}"
        )
        raise typer.Exit(1)

    # Deferred import: the evaluator pulls in the heavy lm-eval/vLLM stack.
    typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
    # First Party
    from instructlab.eval.leaderboard import LeaderboardV2Evaluator

    checkpoint_results = {}
    for checkpoint in checkpoint_dirs:
        typer.echo(f"Processing checkpoint: {checkpoint}")
        ckpt_output_file = checkpoint / "leaderboard_results.json"
        evaluator = LeaderboardV2Evaluator(
            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
        )
        result = evaluator.run()
        checkpoint_results[checkpoint.name] = result
        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")

    # Sort checkpoints by score, best first
    sorted_checkpoints = sorted(
        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
    )
    typer.echo("Sorted checkpoints by score:")
    for checkpoint_name, result in sorted_checkpoints:
        typer.echo(f"{'=' * 100}")
        typer.echo(json.dumps(result, indent=2))

    typer.echo(f"{'=' * 100}")
    typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}")

    if output_file:
        typer.echo(f"Output will be saved to: {output_file}")
        with open(output_file, "w") as f:
            json.dump(checkpoint_results, f, indent=2)

    typer.echo("Processing complete!")


if __name__ == "__main__":
    app()
```
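Two details worth noting in this script: the `LeaderboardV2Evaluator` import is deferred until after the argument checks, presumably because importing it drags in the heavy lm-eval/vLLM dependencies (hence the "this may take a while" message), and each checkpoint gets its own `leaderboard_results.json` written beside it via `output_file`, while the optional top-level `--output-file` aggregates every result into a single JSON document keyed by checkpoint name.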

scripts/test_leaderboard.py (new file)

Lines changed: 22 additions & 0 deletions

```python
#!/usr/bin/env python
# SPDX-License-Identifier: Apache-2.0

# NOTE: This script requires the leaderboard optional dependencies.
# Install with: pip install instructlab-eval[leaderboard]

# Standard
import json

# First Party
from instructlab.eval.leaderboard import LeaderboardV2Evaluator

if __name__ == "__main__":
    evaluator = LeaderboardV2Evaluator(
        model_path="ibm-granite/granite-3.1-8b-base",
        eval_config={
            "apply_chat_template": False,
        },
    )
    results = evaluator.run()
    print("got results from leaderboard v2")
    print(json.dumps(results, indent=2))
```
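Here `apply_chat_template` is disabled, presumably because `ibm-granite/granite-3.1-8b-base` is a base (non-instruct) model with no chat template to apply; instruct-tuned checkpoints would typically leave it enabled.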
