From c173b32c940b6784f5dadba2a4bc594b93e69e8b Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Thu, 26 Jun 2025 14:39:34 -0700 Subject: [PATCH] Fix: genetic_relatedness to allow single sample set with self-comparisons Previously, genetic_relatedness would fail with TSK_ERR_INSUFFICIENT_SAMPLE_SETS when given a single sample set and indexes referring to that set, e.g.: ts.genetic_relatedness([[0]], indexes=[(0,0)]) This was because the C API requires at least k=2 sample sets for k-way statistics, even when indexes only reference a single set for self-comparison. The fix is more general: it removes the restriction that there be at least k sample sets for k-way stats, because this wasn't a useful or consistently applied restriction. Closes #3055 --- c/tests/test_stats.c | 20 +---------------- c/tskit/trees.c | 2 +- python/CHANGELOG.rst | 6 +++++ python/tests/test_lowlevel.py | 8 ++----- python/tests/test_tree_stats.py | 40 +++++++++++++++++++++++++++++++++ 5 files changed, 50 insertions(+), 26 deletions(-) diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index 74aa961aa4..b722d5e951 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -881,9 +881,6 @@ verify_two_way_stat_func_errors( ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL, options | TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); - ret = method(ts, 1, sample_set_sizes, samples, 1, set_indexes, 0, NULL, - options | TSK_STAT_SITE, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); ret = method(ts, 2, sample_set_sizes, samples, 0, set_indexes, 0, NULL, options | TSK_STAT_SITE, &result); @@ -938,12 +935,6 @@ verify_three_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); - ret = method(ts, 1, sample_set_sizes, samples, 1, set_indexes, 0, NULL, - TSK_STAT_SITE, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); - ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, - TSK_STAT_SITE, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); ret = method(ts, 3, sample_set_sizes, samples, 0, set_indexes, 0, NULL, TSK_STAT_SITE, &result); @@ -972,15 +963,6 @@ verify_four_way_stat_func_errors(tsk_treeseq_t *ts, general_sample_stat_method * ret = method(ts, 0, sample_set_sizes, samples, 1, set_indexes, 0, NULL, TSK_STAT_SITE, &result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); - ret = method(ts, 1, sample_set_sizes, samples, 1, set_indexes, 0, NULL, - TSK_STAT_SITE, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); - ret = method(ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, NULL, - TSK_STAT_SITE, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); - ret = method(ts, 3, sample_set_sizes, samples, 1, set_indexes, 0, NULL, - TSK_STAT_SITE, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_SAMPLE_SETS); ret = method(ts, 4, sample_set_sizes, samples, 0, set_indexes, 0, NULL, TSK_STAT_SITE, &result); @@ -3684,7 +3666,7 @@ test_two_locus_stat_input_errors(void) NULL, 0, result); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INSUFFICIENT_INDEX_TUPLES); - num_sample_sets = 1; + num_sample_sets = 0; num_index_tuples = 1; ret = tsk_treeseq_D2_ij(&ts, num_sample_sets, sample_set_sizes, sample_sets, num_index_tuples, index_tuples, num_sites, row_sites, NULL, num_sites, col_sites, diff --git a/c/tskit/trees.c b/c/tskit/trees.c index a536b08910..443a881f4c 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4515,7 +4515,7 @@ check_sample_stat_inputs(tsk_size_t num_sample_sets, tsk_size_t tuple_size, { int ret = 0; - if (num_sample_sets < tuple_size) { + if (num_sample_sets < 1) { ret = tsk_trace_error(TSK_ERR_INSUFFICIENT_SAMPLE_SETS); goto out; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index dfc6807aa9..8817be1891 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -47,6 +47,12 @@ when using ``TableCollection.assert_equals`` and ``Table.assert_equals``. (:user:`benjeffery`, :pr:`3246`, :issue:`3244`) +- k-way statistics no longer require k sample sets, allowing in particular + "self" comparisons for `TreeSequence.genetic_relatedness`. This changes the + error code returned in some situations. + (:user:`andrewkern`, :user:`petrelharp`, :pr:`3235`, :issue:`3055`) + + **Breaking changes** - ``ltrim``, ``rtrim``, ``trim`` and ``shift`` raise an error if used on a tree sequence diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 8df0b49724..042c15cd7a 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1871,17 +1871,13 @@ def test_ld_matrix_multipop(self, stat_method_name): stat_method( bad_ss_sizes, bad_ss, indexes, None, None, row_pos, col_pos, "branch" ) - with pytest.raises( - _tskit.LibraryError, match="TSK_ERR_INSUFFICIENT_SAMPLE_SETS" - ): + with pytest.raises(_tskit.LibraryError, match="TSK_ERR_BAD_SAMPLE_SET_INDEX"): bad_ss = np.array([], dtype=np.int32) bad_ss_sizes = np.array([0], dtype=np.uint32) stat_method( bad_ss_sizes, bad_ss, indexes, row_sites, col_sites, None, None, "site" ) - with pytest.raises( - _tskit.LibraryError, match="TSK_ERR_INSUFFICIENT_SAMPLE_SETS" - ): + with pytest.raises(_tskit.LibraryError, match="TSK_ERR_BAD_SAMPLE_SET_INDEX"): bad_ss = np.array([], dtype=np.int32) bad_ss_sizes = np.array([0], dtype=np.uint32) stat_method( diff --git a/python/tests/test_tree_stats.py b/python/tests/test_tree_stats.py index d7b2d280a0..417f0230ae 100644 --- a/python/tests/test_tree_stats.py +++ b/python/tests/test_tree_stats.py @@ -2307,6 +2307,46 @@ def test_shapes(self, proportion): else: assert x.shape == (2,) + def test_single_sample_set_self_comparison(self, ts_12_highrecomb_fixture): + if self.mode is None: + return + # Test for issue #3055 - self-comparisons with single sample set + ts = ts_12_highrecomb_fixture + # Single sample set with self-comparison + result = ts.genetic_relatedness([[0]], indexes=[(0, 0)], mode=self.mode) + result_shape = (ts.num_nodes, 1) if self.mode == "node" else (1,) + assert result.shape == result_shape + # Should work for multiple samples in single set too + result = ts.genetic_relatedness([[0, 1, 2]], indexes=[(0, 0)], mode=self.mode) + assert result.shape == result_shape + # Test with multiple self-comparisons + result = ts.genetic_relatedness( + [[0, 1], [2, 3]], indexes=[(0, 0), (1, 1)], mode=self.mode + ) + result_shape = (ts.num_nodes, 2) if self.mode == "node" else (2,) + assert result.shape == result_shape + + def test_single_sample_set_invalid_indexes(self, ts_12_highrecomb_fixture): + if self.mode is None: + return + # Test that invalid indexes raise ValueError with single sample set + ts = ts_12_highrecomb_fixture + # Index out of bounds (only have 1 sample set, but trying to access index 1) + with pytest.raises( + exceptions.LibraryError, match="TSK_ERR_BAD_SAMPLE_SET_INDEX" + ): + ts.genetic_relatedness([[0]], indexes=[(0, 1)], mode=self.mode) + # Negative index + with pytest.raises( + exceptions.LibraryError, match="TSK_ERR_BAD_SAMPLE_SET_INDEX" + ): + ts.genetic_relatedness([[0]], indexes=[(-1, 0)], mode=self.mode) + # Both indexes out of bounds + with pytest.raises( + exceptions.LibraryError, match="TSK_ERR_BAD_SAMPLE_SET_INDEX" + ): + ts.genetic_relatedness([[0, 1]], indexes=[(2, 2)], mode=self.mode) + class TestBranchGeneticRelatedness(TestGeneticRelatedness, TopologyExamplesMixin): mode = "branch"