hail-is · hail-ci-robot · Oct 29, 2025 · Dec 3, 2024
diff --git a/hail/python/benchmark/conftest.py b/hail/python/benchmark/conftest.py
@@ -12,7 +12,7 @@
 def pytest_addoption(parser):
     parser.addoption("--log", type=str, help='Log file path', default=None)
     parser.addoption("--output", type=str, help="Output file path.", default=None)
-    parser.addoption("--data-dir", type=str, help="Data directory.", default=None)
+    parser.addoption("--data-dir", type=str, help="Data directory.", default=os.getenv('HAIL_BENCHMARK_DIR'))
     parser.addoption('--iterations', type=int, help='override number of iterations for all benchmarks', default=None)
     parser.addoption('--cores', type=int, help='Number of cores to use.', default=1)
     parser.addoption(
@@ -23,36 +23,36 @@ def pytest_addoption(parser):
         const='cpu',
         default=None,
     )
-    parser.addoption('--profiler-path', type=str, help='path to aysnc profiler', default=None)
+    parser.addoption(
+        '--max-duration',
+        type=int,
+        help='Maximum permitted duration for any benchmark trial in seconds, not to be confused with pytest-timeout',
+        default=200,
+    )
+    parser.addoption('--max-failures', type=int, help='Stop benchmarking item after this many failures', default=3)
+    parser.addoption(
+        '--profiler-path', type=str, help='path to async profiler', default=os.getenv('ASYNC_PROFILER_HOME')
+    )
     parser.addoption('--profiler-fmt', choices=['html', 'flame', 'jfr'], help='Choose profiler output.', default='html')
 
 
-def run_config_from_pytest_config(pytest_config):
-    return type(
-        'RunConfig',
-        (object,),
-        {
-            **{
-                flag: pytest_config.getoption(flag) or default
-                for flag, default in [
-                    ('log', None),
-                    ('output', None),
-                    ('cores', 1),
-                    ('data_dir', os.getenv('HAIL_BENCHMARK_DIR')),
-                    ('iterations', None),
-                    ('profile', None),
-                    ('profiler_path', os.getenv('ASYNC_PROFILER_HOME')),
-                    ('profiler_fmt', None),
-                ]
-            },
-            'verbose': pytest_config.getoption('verbose') > 0,
-            'quiet': pytest_config.getoption('verbose') < 0,
-            'timeout': int(pytest_config.getoption('timeout') or 1800),
-        },
+@pytest.hookimpl
+def pytest_configure(config):
+    init_logging(file=config.getoption('log'))
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_collection_modifyitems(config, items):
+    max_duration = config.getoption('max_duration')
+
+    xfail = pytest.mark.xfail(
+        raises=TimeoutError,
+        reason=f'Runtime exceeds maximum permitted duration of {max_duration}s',
     )
 
+    for item in items:
+        if (xtimeout := item.get_closest_marker('xtimeout')) is None:
+            continue
 
-@pytest.hookimpl
-def pytest_configure(config):
-    config.run_config = run_config_from_pytest_config(config)
-    init_logging(file=config.run_config.log)
+        if len(xtimeout.args) == 0 or (len(xtimeout.args) == 1 and xtimeout.args[0] >= max_duration):
+            item.add_marker(xfail)
diff --git a/hail/python/benchmark/hail/benchmark_combiner.py b/hail/python/benchmark/hail/benchmark_combiner.py
@@ -2,7 +2,7 @@
 from test.hail.helpers import with_flags
 
 import hail as hl
-from benchmark.tools import benchmark, chunk
+from benchmark.tools import chunk
 from hail.vds.combiner import combine_variant_datasets, new_combiner, transform_gvcf
 
 COMBINE_GVCF_MAX = 100
@@ -13,39 +13,36 @@ def import_vcf(path):
     return hl.import_vcf(str(path), reference_genome='GRCh38', force=True)
 
 
-@benchmark()
+@pytest.mark.xfail(raises=hl.utils.java.FatalError, reason='??')
 @with_flags(no_ir_logging='1')
-def benchmark_compile_2k_merge(empty_gvcf, tmp_path):
+def test_compile_2k_merge(empty_gvcf, tmp_path):
     vcf = import_vcf(empty_gvcf)
     vcfs = [transform_gvcf(vcf, [])] * COMBINE_GVCF_MAX
     combined = [combine_variant_datasets(vcfs)] * 20
     hl.vds.write_variant_datasets(combined, str(tmp_path / 'combiner-multi-write'), overwrite=True)
 
 
-@benchmark()
-def benchmark_python_only_10k_transform(empty_gvcf):
+@pytest.mark.xtimeout(270)
+def test_python_only_10k_transform(empty_gvcf):
     for vcf in [import_vcf(empty_gvcf)] * 10_000:
         transform_gvcf(vcf, [])
 
 
-@benchmark()
-def benchmark_python_only_10k_combine(empty_gvcf):
+def test_python_only_10k_combine(empty_gvcf):
     vcf = import_vcf(empty_gvcf)
     mt = transform_gvcf(vcf, [])
     for mts in chunk(COMBINE_GVCF_MAX, [mt] * 10_000):
         combine_variant_datasets(mts)
 
 
-@benchmark()
-def benchmark_import_and_transform_gvcf(single_gvcf):
+def test_import_and_transform_gvcf(single_gvcf):
     mt = import_vcf(single_gvcf)
     vds = transform_gvcf(mt, [])
     vds.reference_data._force_count_rows()
     vds.variant_data._force_count_rows()
 
 
-@benchmark()
-def benchmark_import_gvcf_force_count(single_gvcf):
+def test_import_gvcf_force_count(single_gvcf):
     mt = import_vcf(single_gvcf)
     mt._force_count_rows()
 
@@ -59,14 +56,14 @@ def tmp_and_output_paths(tmp_path):
     return (tmp, output)
 
 
-@benchmark()
-def benchmark_vds_combiner_chr22(chr22_gvcfs, tmp_and_output_paths):
+@pytest.mark.xtimeout(180)
+def test_vds_combiner_chr22(chr22_gvcfs, tmp_and_output_paths):
     parts = hl.eval([hl.parse_locus_interval('chr22:start-end', reference_genome='GRCh38')])
-
+    tmp, output = tmp_and_output_paths
     combiner = new_combiner(
-        output_path=str(tmp_and_output_paths[0]),
+        output_path=str(output),
         intervals=parts,
-        temp_path=str(tmp_and_output_paths[1]),
+        temp_path=str(tmp),
         gvcf_paths=[str(path) for path in chr22_gvcfs],
         reference_genome='GRCh38',
         branch_factor=16,

diff --git a/hail/python/benchmark/hail/benchmark_linalg.py b/hail/python/benchmark/hail/benchmark_linalg.py
@@ -1,71 +1,67 @@
+import pytest
+
 import hail as hl
-from benchmark.tools import benchmark
 
 
-@benchmark()
-def benchmark_block_matrix_nested_multiply(tmp_path):
+@pytest.mark.xtimeout
+def test_block_matrix_nested_multiply(tmp_path):
     bm = hl.linalg.BlockMatrix.random(8 * 1024, 8 * 1024)
     bm = bm.checkpoint(str(tmp_path / 'checkpoint.mt'))
     bm = (bm @ bm) @ bm @ bm @ (bm @ bm)
     bm.write(str(tmp_path / 'result.mt'), overwrite=True)
 
 
-@benchmark()
-def benchmark_make_ndarray():
+def test_make_ndarray():
     ht = hl.utils.range_table(200_000)
     ht = ht.annotate(x=hl.nd.array(hl.range(ht.idx)))
     ht._force_count()
 
 
-@benchmark()
-def benchmark_ndarray_addition():
+def test_ndarray_addition():
     arr = hl.nd.ones((1024, 1024))
     hl.eval(arr + arr)
 
 
-@benchmark()
-def benchmark_ndarray_matmul_int64():
+def test_ndarray_matmul_int64():
     arr = hl.nd.arange(1024 * 1024).map(hl.int64).reshape((1024, 1024))
     hl.eval(arr @ arr)
 
 
-@benchmark()
-def benchmark_ndarray_matmul_float64():
+def test_ndarray_matmul_float64():
     arr = hl.nd.arange(1024 * 1024).map(hl.float64).reshape((1024, 1024))
     hl.eval(arr @ arr)
 
 
-@benchmark()
-def benchmark_blockmatrix_write_from_entry_expr_range_mt(tmp_path):
+@pytest.mark.xtimeout(200)
+def test_blockmatrix_write_from_entry_expr_range_mt(tmp_path):
     mt = hl.utils.range_matrix_table(40_000, 40_000, n_partitions=4)
     path = str(tmp_path / 'result.bm')
     hl.linalg.BlockMatrix.write_from_entry_expr(mt.row_idx + mt.col_idx, path)
 
 
-@benchmark()
-def benchmark_blockmatrix_write_from_entry_expr_range_mt_standardize(tmp_path):
+@pytest.mark.xtimeout(700)
+def test_blockmatrix_write_from_entry_expr_range_mt_standardize(tmp_path):
     mt = hl.utils.range_matrix_table(40_000, 40_000, n_partitions=4)
     path = str(tmp_path / 'result.bm')
     hl.linalg.BlockMatrix.write_from_entry_expr(
         mt.row_idx + mt.col_idx, path, mean_impute=True, center=True, normalize=True
     )
 
 
-@benchmark()
-def benchmark_sum_table_of_ndarrays():
+def test_sum_table_of_ndarrays():
     ht = hl.utils.range_table(400).annotate(nd=hl.nd.ones((4096, 4096)))
     ht.aggregate(hl.agg.ndarray_sum(ht.nd))
 
 
-@benchmark()
-def benchmark_block_matrix_to_matrix_table_row_major():
+@pytest.mark.xtimeout(250)
+def test_block_matrix_to_matrix_table_row_major():
     mt = hl.utils.range_matrix_table(20_000, 20_000, n_partitions=4)
     bm = hl.linalg.BlockMatrix.from_entry_expr(mt.row_idx + mt.col_idx)
     bm.to_matrix_table_row_major()._force_count_rows()
 
 
-@benchmark()
-def benchmark_king(tmp_path):
+@pytest.mark.xtimeout
+def test_king(tmp_path):
     mt = hl.balding_nichols_model(6, n_variants=10000, n_samples=4096)
     path = str(tmp_path / 'result.mt')
     hl.king(mt.GT).write(path, overwrite=True)