@@ -1,16 +1,18 @@
-from io import StringIO
+"""
+Tests for genotype imputation (forward and Baum-Welch algorithms).
+"""
+import io
 
 import numpy as np
 import pandas as pd
 
 import tskit
 
 
-"""
-A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations
-(one per site). The first 2 individuals are used as reference panel,
-the last one is the target individual.
-"""
+# A tree sequence containing 3 diploid individuals with 5 sites and 5 mutations
+# (one per site). The first 2 individuals are used as reference panel,
+# the last one is the target individual.
+
 _toy_ts_nodes_text = """\
 id is_sample time population individual metadata
 0 1 0.000000 0 0
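
As a quick orientation (an illustrative sketch, not part of the diff): given the layout
described in the comment above, the toy data could be split into reference-panel and
target haplotypes roughly as follows, assuming get_toy_ts() defined later in this file
and that the six sample nodes are ordered by individual.

# Illustrative only: split the toy tree sequence into ref. panel and target haplotypes.
ts = get_toy_ts()
G = ts.genotype_matrix()   # shape (num_sites, num_sample_nodes) = (5, 6)
H_ref = G[:, :4]           # haplotypes of the first two (diploid) individuals
H_target = G[:, 4:]        # haplotypes of the third (target) individual
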
@@ -81,41 +83,40 @@ def get_toy_ts():
     Returns the toy tree sequence in text format above.
     """
     ts = tskit.load_text(
-        nodes=StringIO(_toy_ts_nodes_text),
-        edges=StringIO(_toy_ts_edges_text),
-        sites=StringIO(_toy_ts_sites_text),
-        mutations=StringIO(_toy_ts_mutations_text),
-        individuals=StringIO(_toy_ts_individuals_text),
+        nodes=io.StringIO(_toy_ts_nodes_text),
+        edges=io.StringIO(_toy_ts_edges_text),
+        sites=io.StringIO(_toy_ts_sites_text),
+        mutations=io.StringIO(_toy_ts_mutations_text),
+        individuals=io.StringIO(_toy_ts_individuals_text),
         strict=False,
     )
     return ts
 
 
-"""
-BEAGLE 4.1 was run on the toy data set above using default parameters.
-The following are the forward probability matrices and backward probability
-matrices calculated when imputing into the third individual above. There are
-two sets of matrices, one for each haplotype.
-
-Notes about calculations:
-n = number of haplotypes in ref. panel
-M = number of markers
-m = index of marker (site)
-h = index of haplotype in ref. panel
-
-In forward probability matrix,
-  fwd[m][h] = emission prob., if m = 0 (first marker)
-  fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise
-  where scale = (1 - switch prob.)/sum of fwd[m - 1],
-  and shift = switch prob./n.
-
-In backward probability matrix,
-  bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE
-  unadj. bwd[m][h] = emission prob. / n
-  bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise
-  where scale = (1 - switch prob.)/sum of unadj. bwd[m],
-  and shift = switch prob./n.
-"""
+# BEAGLE 4.1 was run on the toy data set above using default parameters.
+# The following are the forward probability matrices and backward probability
+# matrices calculated when imputing into the third individual above. There are
+# two sets of matrices, one for each haplotype.
+#
+# Notes about calculations:
+# n = number of haplotypes in ref. panel
+# M = number of markers
+# m = index of marker (site)
+# h = index of haplotype in ref. panel
+#
+# In forward probability matrix,
+#   fwd[m][h] = emission prob., if m = 0 (first marker)
+#   fwd[m][h] = emission prob. * (scale * fwd[m - 1][h] + shift), otherwise
+#   where scale = (1 - switch prob.)/sum of fwd[m - 1],
+#   and shift = switch prob./n.
+#
+# In backward probability matrix,
+#   bwd[m][h] = 1, if m = M - 1 (last marker) // DON'T SEE THIS IN BEAGLE
+#   unadj. bwd[m][h] = emission prob. / n
+#   bwd[m][h] = (unadj. bwd[m][h] + shift) * scale, otherwise
+#   where scale = (1 - switch prob.)/sum of unadj. bwd[m],
+#   and shift = switch prob./n.
+
 _fwd_matrix_text_1 = """
 m,h,probRec,probNoRec,noErrProb,errProb,refAl,queryAl,shift,scale,sum,val
 0,0,0.000000,1.000000,0.999900,0.000100,1,0,0.000000,1.000000,0.000100,0.000100
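
The forward recursion spelled out in the comments above maps almost directly onto
numpy. The sketch below is illustrative only (it is not BEAGLE's code and not part of
this patch); emission and switch_prob are assumed inputs holding the per-marker
emission and switch probabilities referred to in the notes.

import numpy as np

def forward_probabilities(emission, switch_prob):
    # emission[m, h]: emission prob. at marker m for ref. haplotype h
    # switch_prob[m]: switch probability at marker m
    M, n = emission.shape
    fwd = np.zeros((M, n))
    fwd[0] = emission[0]  # fwd[0][h] = emission prob.
    for m in range(1, M):
        shift = switch_prob[m] / n                        # shift = switch prob. / n
        scale = (1 - switch_prob[m]) / fwd[m - 1].sum()   # (1 - switch prob.) / sum of fwd[m - 1]
        fwd[m] = emission[m] * (scale * fwd[m - 1] + shift)
    return fwd

The backward recursion is left out of the sketch because the notes themselves flag an
open question about how BEAGLE handles the last marker.
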
@@ -201,7 +202,7 @@ def convert_to_pd_df(matrix_text):
     """
     Converts a matrix in text to a Pandas dataframe and returns it.
     """
-    df = pd.read_csv(StringIO(matrix_text))
+    df = pd.read_csv(io.StringIO(matrix_text))
     for i in np.arange(df.shape[0]):
         # Check that switch and non-switch probabilities sum to 1
         assert df.probRec[i] + df.probNoRec[i] == 1 or np.isnan(df.probRec[i])
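
For orientation, a hypothetical usage sketch of the helper above (not part of the
patch): the m and h columns index the marker and reference haplotype, probRec/probNoRec
are the switch and no-switch probabilities checked in the loop, and val appears to hold
the probability value itself.

# Illustrative only: parse one of the BEAGLE matrices defined above and
# rearrange the per-row values into an (M markers) x (n haplotypes) array.
df = convert_to_pd_df(_fwd_matrix_text_1)
fwd = df.pivot(index="m", columns="h", values="val").to_numpy()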