@@ -1,11 +1,10 @@
# Standard
- from enum import StrEnum
+ from copy import deepcopy
from pathlib import Path
import gc
import json
import os
import typing as t
- from copy import deepcopy

# Third Party
from accelerate import Accelerator
@@ -18,24 +17,40 @@
# Local
from .evaluator import Evaluator

+ # Since StrEnum wasn't part of the STL until Python3.11, we must do this
+ try:
+     # Standard
+     from enum import StrEnum
+ except ImportError:
+     # Third Party
+     from strenum import StrEnum  # type: ignore[no-redef]
+
+ # And do the same thing to bring in NotRequired from typing
+ try:
+     # Standard
+     from typing import NotRequired
+ except ImportError:
+     # Third Party
+     from typing_extensions import NotRequired
+

class ParsedScores(t.TypedDict):
    """
    Just an ordinary dict that contains both the overall score as well as per-subtask scores.
    """

    score: float
-     subtasks: t.NotRequired[t.Dict[str, float]]
+     subtasks: NotRequired[t.Dict[str, float]]


class LeaderboardV2EvalResult(t.TypedDict):
    overall_score: float
-     leaderboard_gpqa: t.NotRequired[ParsedScores]
-     leaderboard_ifeval: t.NotRequired[ParsedScores]
-     leaderboard_bbh: t.NotRequired[ParsedScores]
-     leaderboard_mmlu_pro: t.NotRequired[ParsedScores]
-     leaderboard_musr: t.NotRequired[ParsedScores]
-     leaderboard_math_hard: t.NotRequired[ParsedScores]
+     leaderboard_gpqa: NotRequired[ParsedScores]
+     leaderboard_ifeval: NotRequired[ParsedScores]
+     leaderboard_bbh: NotRequired[ParsedScores]
+     leaderboard_mmlu_pro: NotRequired[ParsedScores]
+     leaderboard_musr: NotRequired[ParsedScores]
+     leaderboard_math_hard: NotRequired[ParsedScores]


class LeaderboardV2Tasks(StrEnum):
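
The try/except imports above keep the module importable on interpreters older than Python 3.11 by falling back to the strenum and typing_extensions backports. A minimal, self-contained sketch of the same pattern (class names here are illustrative, and the backports are assumed to be installed on older interpreters):

# Standard library on Python 3.11+, backport packages otherwise
try:
    from enum import StrEnum
except ImportError:
    from strenum import StrEnum  # type: ignore[no-redef]

try:
    from typing import NotRequired, TypedDict
except ImportError:
    from typing_extensions import NotRequired, TypedDict


class Scores(TypedDict):
    score: float
    subtasks: NotRequired[dict]  # optional key on any supported version


class Task(StrEnum):
    GPQA = "leaderboard_gpqa"


print(Task.GPQA == "leaderboard_gpqa")  # True: StrEnum members compare equal to plain strings
print(Scores(score=1.0))                # {'score': 1.0}; "subtasks" may be omitted
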
@@ -94,7 +109,7 @@ class TaskGrouping(t.TypedDict):
}

# 1. Add OpenAI configuration defaults
- DEFAULT_OPENAI_CONFIG = {
+ DEFAULT_OPENAI_CONFIG: t.Dict[str, t.Any] = {
    "max_tokens": 768,
    "temperature": 0.0,
    "seed": 1337,
@@ -194,9 +209,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
    # we need to use torch.multiprocessing to run each task in a separate process,
    # and then combine the results
-     # Third Party
-     import torch.multiprocessing as mp
-
    num_processes = args["num_gpus"]

    # Create the context and queue within the same context
@@ -222,9 +234,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
        p.join()

    # extract the result which is not None
-     assert len([res for res in results.values() if res is not None]) == 1, (
-         "we expect exactly 1 process to return a results dict properly"
-     )
+     assert (
+         len([res for res in results.values() if res is not None]) == 1
+     ), "we expect exactly 1 process to return a results dict properly"
    results_dict = [res for res in results.values() if res is not None][0]
    return results_dict

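
The worker/queue plumbing elided from this hunk follows a standard pattern: spawn one process per GPU, have every worker put a value on a shared queue, and keep the single non-None result. A rough sketch of that pattern using plain multiprocessing (the worker signature and values are illustrative, not the module's actual API):

import multiprocessing as mp


def worker(rank: int, world_size: int, result_queue) -> None:
    # Only rank 0 reports the merged results; the others put None so the
    # parent can drain exactly world_size items from the queue.
    result_queue.put({"overall_score": 0.5} if rank == 0 else None)


if __name__ == "__main__":
    world_size = 2
    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    procs = [
        ctx.Process(target=worker, args=(rank, world_size, queue))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    results = [queue.get() for _ in procs]  # drain before join to avoid blocking
    for p in procs:
        p.join()

    non_null = [res for res in results if res is not None]
    assert len(non_null) == 1, "we expect exactly 1 process to return a results dict"
    print(non_null[0])
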
@@ -290,9 +302,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
    )
-     assert len(parsed_scores["subtasks"]) == 24, (
-         "there should be 24 subtasks of bbh run"
-     )
+     assert (
+         len(parsed_scores["subtasks"]) == 24
+     ), "there should be 24 subtasks of bbh run"
    return parsed_scores


@@ -343,9 +355,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
            scores.append(value)
            target_metrics.remove(metric)

-     assert len(scores) == 2, (
-         f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-     )
+     assert (
+         len(scores) == 2
+     ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
    return {
        "score": sum(scores) / 2,
    }
@@ -369,9 +381,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
    )
-     assert len(parsed_scores["subtasks"]) == 3, (
-         f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-     )
+     assert (
+         len(parsed_scores["subtasks"]) == 3
+     ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
    return parsed_scores


@@ -382,9 +394,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
    )
-     assert len(parsed_scores["subtasks"]) == 7, (
-         f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-     )
+     assert (
+         len(parsed_scores["subtasks"]) == 7
+     ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
    return parsed_scores


@@ -451,9 +463,9 @@ def get_scores_from_result_dicts(
        # this is just a sanity check step
        benchmarks_already_covered = set(parsed_scores.keys())
        overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-         assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-             f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-         )
+         assert (
+             len(benchmarks_already_covered & benchmarks_to_parse) == 0
+         ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"

        # now actually add them
        for benchmark in benchmarks_to_parse:
@@ -486,12 +498,15 @@ def validate_output_path(output_file: str) -> None:

        # Test if we can write to the file by opening it in append mode
        # We don't actually write anything
-         output_path.open("a").close()
+         with output_path.open("a", encoding="utf-8") as _:
+             pass

-     except PermissionError:
-         raise ValueError(f"Permission denied: Cannot write to {output_file}")
-     except OSError as e:
-         raise ValueError(f"Invalid output path: {output_file}. Error: {str(e)}")
+     except PermissionError as pe:
+         raise ValueError(f"Permission denied: Cannot write to {output_file}") from pe
+     except OSError as ose:
+         raise ValueError(
+             f"Invalid output path: {output_file}. Error: {str(ose)}"
+         ) from ose


def validate_leaderboard_v2_tasks(tasks: t.List[str]):
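
Pulled out on its own, the rewritten writability probe and exception chaining in validate_output_path look roughly like this (a sketch; the helper name and the sample path are hypothetical):

import os
import tempfile
from pathlib import Path


def check_output_writable(output_file: str) -> None:
    output_path = Path(output_file)
    try:
        # Probe append mode so an existing file is never truncated.
        with output_path.open("a", encoding="utf-8") as _:
            pass
    except PermissionError as pe:
        raise ValueError(f"Permission denied: Cannot write to {output_file}") from pe
    except OSError as ose:
        raise ValueError(f"Invalid output path: {output_file}. Error: {ose}") from ose


# Leaves an empty file behind in the temp directory if it did not already exist.
check_output_writable(os.path.join(tempfile.gettempdir(), "leaderboard_v2.json"))
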
@@ -658,7 +673,7 @@ def save_to_file(self, output_file: t.Optional[str] = None) -> None:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
-         with open(output_file, "w") as f:
+         with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)

    def run(
@@ -739,15 +754,6 @@ def run(
        # validation logic
        validate_leaderboard_v2_tasks(tasks)

-         # Only validate GPU requirements when not using an API endpoint
-         if not api_endpoint:
-             if not num_gpus:
-                 num_gpus = cuda.device_count()
-             if num_gpus <= 0 or num_gpus > cuda.device_count():
-                 raise ValueError(
-                     f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-                 )
-
        if output_file:
            validate_output_path(output_file)

@@ -767,6 +773,14 @@ def run(
            openai_results = evaluate_with_openai(args_openai)
            self._lm_eval_results.append(openai_results)
        else:
+             # Only validate GPU requirements when not using an API endpoint
+             if not num_gpus:
+                 num_gpus = cuda.device_count()
+             if num_gpus <= 0 or num_gpus > cuda.device_count():
+                 raise ValueError(
+                     f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                 )
+
            # Only run local evaluation if not using OpenAI API
            if vllm_tasks := grouped_tasks["vllm"]:
                args_vllm: LeaderboardArgs = {
@@ -823,11 +837,11 @@ def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:

    # Add base_url if provided
    if base_url:
-         model_args["base_url"] = base_url
+         model_args.update({"base_url": base_url})

    # Add API key if provided
    if api_key:
-         model_args["api_key"] = api_key
+         model_args.update({"api_key": api_key})

    # Add any remaining backend config options
    model_args.update(backend_config)