Commit 66fb8bb

add requirements file for leaderboard

Signed-off-by: Oleg Silkin <[email protected]>
1 parent cd47eaa, commit 66fb8bb

File tree

6 files changed: +88 −55 lines

requirements-leaderboard.txt

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4
+
+# vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct
+vllm<=0.7.3
+torch<=2.5.1
+
+# XXX(osilkin): We use StrEnum in leaderboard, but Python3.10 doesn't have it as part of
+# the standard library, so we have to install it from the older library.
+strenum>=0.4.15; python_version < '3.11'
+typing-extensions>=4.0.0; python_version < '3.11'
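
The two version-gated pins at the bottom only install on interpreters older than 3.11; pip decides this by evaluating the `python_version < '3.11'` environment marker at install time. A minimal sketch of that evaluation, assuming the third-party `packaging` library is available (it is not part of this commit):

# Illustrative only: how pip-style environment markers like the ones above are evaluated.
# Assumes the third-party `packaging` library is installed; not part of this commit.
import sys

from packaging.markers import Marker

marker = Marker("python_version < '3.11'")

# True on Python 3.10 (the strenum/typing-extensions backports get installed),
# False on 3.11+ (the standard library already provides StrEnum and NotRequired).
print(sys.version_info[:2], marker.evaluate())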

scripts/evaluate_best_checkpoint.py

Lines changed: 8 additions & 5 deletions
@@ -7,11 +7,13 @@
     --output-file /path/to/output_file
 """
 
-import json
-import typer
+# Standard
 from pathlib import Path
 from typing import Optional
+import json
 
+# Third Party
+import typer
 
 app = typer.Typer()
 
@@ -42,22 +44,23 @@ def main(
         raise typer.Exit(1)
 
     typer.echo("importing LeaderboardV2Evaluator, this may take a while...")
+    # First Party
     from instructlab.eval.leaderboard import LeaderboardV2Evaluator
 
     checkpoint_results = {}
     for checkpoint in checkpoint_dirs:
         typer.echo(f"Processing checkpoint: {checkpoint}")
         ckpt_output_file = checkpoint / "leaderboard_results.json"
         evaluator = LeaderboardV2Evaluator(
-            model_path=str(checkpoint), output_file=ckpt_output_file
+            model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8
         )
         result = evaluator.run()
         checkpoint_results[checkpoint.name] = result
-        typer.echo(f"Checkpoint {checkpoint.name} results: {result['score']}")
+        typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}")
 
     # Sort checkpoints by score
     sorted_checkpoints = sorted(
-        checkpoint_results.items(), key=lambda x: x[1]["score"], reverse=True
+        checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True
     )
     typer.echo("Sorted checkpoints by score:")
     for checkpoint_name, result in sorted_checkpoints:
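
Note that the script now reads `overall_score` rather than `score`, matching the `LeaderboardV2EvalResult` shape introduced in `src/instructlab/eval/leaderboard.py` below. A hedged sketch of consuming such result dicts (the checkpoint names and scores are made-up placeholders, not real output):

# Sketch: choosing the best checkpoint once each one has a LeaderboardV2 result dict.
# The names and values below are placeholders for illustration only.
checkpoint_results = {
    "checkpoint-1000": {"overall_score": 0.41},
    "checkpoint-2000": {"overall_score": 0.44},
}

best_name, best_result = max(
    checkpoint_results.items(), key=lambda item: item[1]["overall_score"]
)
print(f"Best checkpoint: {best_name} ({best_result['overall_score']:.3f})")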

scripts/test_leaderboard.py

Lines changed: 3 additions & 1 deletion
@@ -4,8 +4,10 @@
 # NOTE: This script requires the leaderboard optional dependencies.
 # Install with: pip install instructlab-eval[leaderboard]
 
-# First Party
+# Standard
 import json
+
+# First Party
 from instructlab.eval.leaderboard import LeaderboardV2Evaluator
 
 if __name__ == "__main__":

src/instructlab/eval/leaderboard.py

Lines changed: 62 additions & 48 deletions
@@ -1,11 +1,10 @@
 # Standard
-from enum import StrEnum
+from copy import deepcopy
 from pathlib import Path
 import gc
 import json
 import os
 import typing as t
-from copy import deepcopy
 
 # Third Party
 from accelerate import Accelerator
@@ -18,24 +17,40 @@
 # Local
 from .evaluator import Evaluator
 
+# Since StrEnum wasn't part of the STL until Python3.11, we must do this
+try:
+    # Standard
+    from enum import StrEnum
+except ImportError:
+    # Third Party
+    from strenum import StrEnum  # type: ignore[no-redef]
+
+# And do the same thing to bring in NotRequired from typing
+try:
+    # Standard
+    from typing import NotRequired
+except ImportError:
+    # Third Party
+    from typing_extensions import NotRequired
+
 
 class ParsedScores(t.TypedDict):
     """
     Just an ordinary dict that contains both the overall score as well as per-subtask scores.
     """
 
     score: float
-    subtasks: t.NotRequired[t.Dict[str, float]]
+    subtasks: NotRequired[t.Dict[str, float]]
 
 
 class LeaderboardV2EvalResult(t.TypedDict):
     overall_score: float
-    leaderboard_gpqa: t.NotRequired[ParsedScores]
-    leaderboard_ifeval: t.NotRequired[ParsedScores]
-    leaderboard_bbh: t.NotRequired[ParsedScores]
-    leaderboard_mmlu_pro: t.NotRequired[ParsedScores]
-    leaderboard_musr: t.NotRequired[ParsedScores]
-    leaderboard_math_hard: t.NotRequired[ParsedScores]
+    leaderboard_gpqa: NotRequired[ParsedScores]
+    leaderboard_ifeval: NotRequired[ParsedScores]
+    leaderboard_bbh: NotRequired[ParsedScores]
+    leaderboard_mmlu_pro: NotRequired[ParsedScores]
+    leaderboard_musr: NotRequired[ParsedScores]
+    leaderboard_math_hard: NotRequired[ParsedScores]
 
 
 class LeaderboardV2Tasks(StrEnum):
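
The try/except fallback above works because the `strenum` backport mirrors the 3.11 `enum.StrEnum` behaviour: members are `str` subclasses that compare equal to their values, which is what lets `LeaderboardV2Tasks` members be used directly as task-name strings. A standalone sketch of that property (not part of the commit):

# Sketch: StrEnum members behave like plain strings under either import path.
try:
    from enum import StrEnum  # Python 3.11+
except ImportError:
    from strenum import StrEnum  # backport pinned in requirements-leaderboard.txt

class Benchmark(StrEnum):
    IFEVAL = "leaderboard_ifeval"

# Members are str instances, so they can be passed wherever a task name is expected.
assert isinstance(Benchmark.IFEVAL, str)
assert Benchmark.IFEVAL == "leaderboard_ifeval"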
@@ -94,7 +109,7 @@ class TaskGrouping(t.TypedDict):
 }
 
 # 1. Add OpenAI configuration defaults
-DEFAULT_OPENAI_CONFIG = {
+DEFAULT_OPENAI_CONFIG: t.Dict[str, t.Any] = {
     "max_tokens": 768,
     "temperature": 0.0,
     "seed": 1337,
@@ -194,9 +209,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
 def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
     # we need to use torch.multiprocessing to run each task in a separate process,
     # and then combine the results
-    # Third Party
-    import torch.multiprocessing as mp
-
     num_processes = args["num_gpus"]
 
     # Create the context and queue within the same context
@@ -222,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         p.join()
 
     # extract the result which is not None
-    assert len([res for res in results.values() if res is not None]) == 1, (
-        "we expect exactly 1 process to return a results dict properly"
-    )
+    assert (
+        len([res for res in results.values() if res is not None]) == 1
+    ), "we expect exactly 1 process to return a results dict properly"
     results_dict = [res for res in results.values() if res is not None][0]
     return results_dict
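
For readers unfamiliar with the pattern `evaluate_with_hf` relies on: `torch.multiprocessing` keeps the standard `multiprocessing` API, so the usual spawn-context/queue recipe applies, with exactly one worker expected to publish the combined results dict. A generic sketch of that recipe, assuming one process per GPU; it does not reproduce the function's actual code:

# Generic sketch of the spawn-context worker pattern (illustrative, not the real evaluate_with_hf).
import torch.multiprocessing as mp

def _worker(rank: int, world_size: int, result_queue) -> None:
    # A real worker would bind to GPU `rank` and run its share of the leaderboard tasks;
    # here it just reports a placeholder payload.
    result_queue.put((rank, {"rank": rank, "ok": True}))

def run_workers(world_size: int) -> dict:
    ctx = mp.get_context("spawn")  # create the queue and processes from the same context
    result_queue = ctx.Queue()
    procs = [
        ctx.Process(target=_worker, args=(rank, world_size, result_queue))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    # Drain one message per process before joining so the queue never blocks the workers.
    results = dict(result_queue.get() for _ in procs)
    for p in procs:
        p.join()
    return results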

@@ -290,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 24, (
-        "there should be 24 subtasks of bbh run"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 24
+    ), "there should be 24 subtasks of bbh run"
     return parsed_scores
 
 
@@ -343,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
             scores.append(value)
             target_metrics.remove(metric)
 
-    assert len(scores) == 2, (
-        f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-    )
+    assert (
+        len(scores) == 2
+    ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
     return {
         "score": sum(scores) / 2,
     }
@@ -369,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
     )
-    assert len(parsed_scores["subtasks"]) == 3, (
-        f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 3
+    ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -382,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     parsed_scores = parse_multitask_results(
         result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
     )
-    assert len(parsed_scores["subtasks"]) == 7, (
-        f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-    )
+    assert (
+        len(parsed_scores["subtasks"]) == 7
+    ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
     return parsed_scores
 
 
@@ -451,9 +463,9 @@ def get_scores_from_result_dicts(
     # this is just a sanity check step
     benchmarks_already_covered = set(parsed_scores.keys())
     overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-    assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-        f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-    )
+    assert (
+        len(benchmarks_already_covered & benchmarks_to_parse) == 0
+    ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
 
     # now actually add them
     for benchmark in benchmarks_to_parse:
@@ -486,12 +498,15 @@ def validate_output_path(output_file: str) -> None:
 
         # Test if we can write to the file by opening it in append mode
         # We don't actually write anything
-        output_path.open("a").close()
+        with output_path.open("a", encoding="utf-8") as _:
+            pass
 
-    except PermissionError:
-        raise ValueError(f"Permission denied: Cannot write to {output_file}")
-    except OSError as e:
-        raise ValueError(f"Invalid output path: {output_file}. Error: {str(e)}")
+    except PermissionError as pe:
+        raise ValueError(f"Permission denied: Cannot write to {output_file}") from pe
+    except OSError as ose:
+        raise ValueError(
+            f"Invalid output path: {output_file}. Error: {str(ose)}"
+        ) from ose
 
 
 def validate_leaderboard_v2_tasks(tasks: t.List[str]):
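
The switch to `raise ... from ...` above preserves the original exception as `__cause__`, so callers that catch the `ValueError` can still inspect the underlying `OSError`. A small standalone sketch of that behaviour (the path is hypothetical):

# Sketch: exception chaining keeps the original error attached as __cause__.
def check_writable(path: str) -> None:
    try:
        with open(path, "a", encoding="utf-8"):
            pass
    except OSError as ose:
        raise ValueError(f"Invalid output path: {path}. Error: {ose}") from ose

try:
    check_writable("nonexistent_dir/output.json")  # hypothetical path; parent dir is missing
except ValueError as err:
    print(type(err.__cause__).__name__)  # prints FileNotFoundError, the chained OSError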
@@ -658,7 +673,7 @@ def save_to_file(self, output_file: t.Optional[str] = None) -> None:
         output_dir = os.path.dirname(output_file)
         if output_dir:
             os.makedirs(output_dir, exist_ok=True)
-        with open(output_file, "w") as f:
+        with open(output_file, "w", encoding="utf-8") as f:
             json.dump(self._results, f, indent=2)
 
     def run(
@@ -739,15 +754,6 @@ def run(
         # validation logic
        validate_leaderboard_v2_tasks(tasks)
 
-        # Only validate GPU requirements when not using an API endpoint
-        if not api_endpoint:
-            if not num_gpus:
-                num_gpus = cuda.device_count()
-            if num_gpus <= 0 or num_gpus > cuda.device_count():
-                raise ValueError(
-                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-                )
-
         if output_file:
             validate_output_path(output_file)
 
@@ -767,6 +773,14 @@ def run(
             openai_results = evaluate_with_openai(args_openai)
             self._lm_eval_results.append(openai_results)
         else:
+            # Only validate GPU requirements when not using an API endpoint
+            if not num_gpus:
+                num_gpus = cuda.device_count()
+            if num_gpus <= 0 or num_gpus > cuda.device_count():
+                raise ValueError(
+                    f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                )
+
             # Only run local evaluation if not using OpenAI API
             if vllm_tasks := grouped_tasks["vllm"]:
                 args_vllm: LeaderboardArgs = {
@@ -823,11 +837,11 @@ def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
 
     # Add base_url if provided
     if base_url:
-        model_args["base_url"] = base_url
+        model_args.update({"base_url": base_url})
 
     # Add API key if provided
     if api_key:
-        model_args["api_key"] = api_key
+        model_args.update({"api_key": api_key})
 
     # Add any remaining backend config options
     model_args.update(backend_config)

tests/test_project.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 # First Party
 from instructlab.eval.evaluator import Evaluator
+from instructlab.eval.leaderboard import LeaderboardV2Evaluator
 from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator
 
@@ -14,6 +15,7 @@ def test_evaluator_eps():
         "mmlu_branch": MMLUBranchEvaluator,
         "mt_bench": MTBenchEvaluator,
         "mt_bench_branch": MTBenchBranchEvaluator,
+        "leaderboard_v2": LeaderboardV2Evaluator,
     }
     eps = entry_points(group="instructlab.eval.evaluator")
     found = {}
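
The new `leaderboard_v2` entry in this test assumes the package advertises `LeaderboardV2Evaluator` under the `instructlab.eval.evaluator` entry-point group. A hedged sketch of how such entry points are discovered at runtime (requires instructlab-eval to be installed so its entry points are registered):

# Sketch: discovering evaluator plugins through the entry-point group used in the test.
from importlib.metadata import entry_points

for ep in entry_points(group="instructlab.eval.evaluator"):
    # e.g. the name "leaderboard_v2" should load LeaderboardV2Evaluator
    print(ep.name, ep.load())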

tox.ini

Lines changed: 3 additions & 1 deletion
@@ -19,7 +19,9 @@ setenv =
 package = wheel
 wheel_build_env = pkg
 # equivalent to `pip install instructlab[cpu]`
-extras = cpu
+extras =
+    cpu
+    leaderboard
 deps =
     pytest
     pytest-asyncio
