Skip to content
This repository was archived by the owner on Aug 26, 2023. It is now read-only.

Commit 485691f

Browse files
Ben J. WardBen Ward
authored andcommitted
Sliding window distance computation (#338)
* first draft of windowed sitance computation * Corrected typo * Remembered to initialize integer matrices to 0 * distance now also outputs ranges * Changed ends construction * Trying linear indexing to achieve simd * Tried simplifying inner loop * Eliminated some repeated operations * added window distances for Proportion{T} distances too * Corrected error in proportion windowed distance * Added sliding distance method for JukesCantor69 distances * Added draft flagmutations for both transition and transversion mutations. * Added draft of windowed distance for Count{T} <: TsTv * Added draft of windowed k80 distance * Added corrections to windowed distance measure function signatures * Added tests for windowed counts of Any, Transition, and Transversion Mutations * Edited tests * Corrected tests * Edited tests * Edited tests again * Added tests for windowed proportion distance * Tests edit * Made some tests use isapprox instead of isequal * Corrected missing end statements * Added docstrings * Corrected type in docstrings * changed approx tests to different tolerance * Corrected a typo in a function * Added tests for windowed JukesCantor69 distance * Added simd to proportion distance function * Implemented @BICYCLE's suggestions * Corrected stupid typo mistake
1 parent aa0ee59 commit 485691f

File tree

4 files changed

+269
-5
lines changed

4 files changed

+269
-5
lines changed

src/util/windows.jl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@ immutable EachWindowIterator{T <: ArrayOrStringOrSeq}
4040
end
4141

4242

43+
immutable EachWindow
44+
to::Int
45+
width::Int
46+
step::Int
47+
end
48+
49+
function Base.size(winitr::EachWindow)
50+
return length(StepRange(winitr.width+1, winitr.step, length(winitr.data)))
51+
end
52+
53+
4354
"""
4455
Calculate the number of windows that will result from iterating across the container.
4556

src/var/distances.jl

Lines changed: 182 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ immutable Kimura80 <: TsTv end
5959
# Distance computation internals
6060
# ------------------------------
6161

62+
@inline function expected_distance{T}(::Type{Proportion{T}}, n::Int64, l::Int64)
63+
return n / l
64+
end
65+
6266
## Jukes and Cantor 1969 distance computation.
6367

6468
@inline function expected_distance(::Type{JukesCantor69}, p::Float64)
@@ -129,6 +133,104 @@ function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs:
129133
return count_mutations(T, seqs)
130134
end
131135

136+
"""
137+
distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
138+
139+
Compute pairwise distances using a sliding window.
140+
141+
As the window of `width` base pairs in size moves across a pair of sequences it
142+
computes the distance between the two sequences in that window.
143+
144+
This method computes mutation counts for every window, and returns a tuple of the
145+
matrix of p-distances for every window, a matrix of the number of valid sites
146+
counted by the function for each window.
147+
"""
148+
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
149+
mutation_flags, ambiguous_flags = flagmutations(T, seqs)
150+
nbases, npairs = size(mutation_flags)
151+
if width < 1
152+
throw(ArgumentError("`window` width must be ≥ 1."))
153+
end
154+
if step < 1
155+
throw(ArgumentError("`step` must be ≥ 1."))
156+
end
157+
if width > nbases
158+
throw(ArgumentError("The `window` size cannot be greater than number of data elements."))
159+
end
160+
starts = 1:step:nbases
161+
ends = width:step:nbases
162+
nwindows = length(ends)
163+
mcounts = Matrix{Int}(nwindows, npairs)
164+
wsizes = Matrix{Int}(nwindows, npairs)
165+
ranges = Vector{UnitRange{Int}}(nwindows)
166+
167+
@inbounds for pair in 1:npairs
168+
pairoffset = pair - 1
169+
windowoffset = pairoffset * nwindows
170+
flagsoffset = pairoffset * nbases
171+
for i in 1:nwindows
172+
from = starts[i]
173+
to = ends[i]
174+
mcount = 0
175+
nsites = width
176+
@simd for j in from:to
177+
mcount += mutation_flags[flagsoffset + j]
178+
nsites -= ambiguous_flags[flagsoffset + j]
179+
end
180+
ranges[i] = UnitRange(starts[i],ends[i])
181+
mcounts[windowoffset + i] = mcount
182+
wsizes[windowoffset + i] = nsites
183+
end
184+
end
185+
return mcounts, wsizes, ranges
186+
end
187+
188+
189+
function distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
190+
transitionFlags, transversionFlags, ambiguous_flags = flagmutations(TransitionMutation, TransversionMutation, seqs)
191+
nbases, npairs = size(transitionFlags)
192+
if width < 1
193+
throw(ArgumentError("`window` width must be ≥ 1."))
194+
end
195+
if step < 1
196+
throw(ArgumentError("`step` must be ≥ 1."))
197+
end
198+
if width > nbases
199+
throw(ArgumentError("The `window` size cannot be greater than number of data elements."))
200+
end
201+
starts = 1:step:nbases
202+
ends = width:step:nbases
203+
nwindows = length(ends)
204+
tscounts = Matrix{Int}(nwindows, npairs)
205+
tvcounts = Matrix{Int}(nwindows, npairs)
206+
wsizes = Matrix{Int}(nwindows, npairs)
207+
ranges = Vector{UnitRange{Int}}(nwindows)
208+
209+
@inbounds for pair in 1:npairs
210+
pairoffset = pair - 1
211+
windowoffset = pairoffset * nwindows
212+
flagsoffset = pairoffset * nbases
213+
for i in 1:nwindows
214+
from = starts[i]
215+
to = ends[i]
216+
tscount = 0
217+
tvcount = 0
218+
nsites = width
219+
@simd for j in from:to
220+
tscount += transitionFlags[flagsoffset + j]
221+
tvcount += transversionFlags[flagsoffset + j]
222+
nsites -= ambiguous_flags[flagsoffset + j]
223+
end
224+
ranges[i] = UnitRange(starts[i],ends[i])
225+
tscounts[windowoffset + i] = tscount
226+
tvcounts[windowoffset + i] = tvcount
227+
wsizes[windowoffset + i] = nsites
228+
end
229+
end
230+
return tscounts, tvcounts, wsizes, ranges
231+
end
232+
233+
132234
"""
133235
distance{T<:MutationType,N<:Nucleotide}(::Type{Count{T}}, seqs::Matrix{N})
134236
@@ -164,7 +266,7 @@ vector of the number of valid (i.e. non-ambiguous sites) counted by the function
164266
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}})
165267
d, l = distance(Count{T}, seqs)
166268
D = Vector{Float64}(length(d))
167-
@inbounds for i in 1:length(D)
269+
@inbounds @simd for i in 1:length(D)
168270
D[i] = d[i] / l[i]
169271
end
170272
return D, l
@@ -174,7 +276,7 @@ end
174276
distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Matrix{N})
175277
176278
This method of distance returns a tuple of a vector of the p-distances, and a
177-
vector of the number of valid (i.e. non-ambiguous sites) counted by the function.
279+
vector of the number of valid (i.e. non-ambiguous) sites counted by the function.
178280
179281
**Note: This method assumes that the sequences are stored in the `Matrix{N}`
180282
provided as `seqs` in sequence major order i.e. each column of the matrix is one
@@ -189,6 +291,27 @@ function distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Ma
189291
return D, l
190292
end
191293

294+
"""
295+
distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
296+
297+
A distance method which computes pairwise distances using a sliding window.
298+
299+
As the window of `width` base pairs in size moves across a pair of sequences it
300+
computes the distance between the two sequences in that window.
301+
302+
This method computes p-distances for every window, and returns a tuple of the
303+
matrix of p-distances for every window, a matrix of the number of valid sites
304+
counted by the function for each window.
305+
"""
306+
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
307+
counts, wsizes, ranges = distance(Count{T}, seqs, width, step)
308+
res = Matrix{Float64}(size(counts))
309+
@inbounds for i in 1:endof(counts)
310+
res[i] = expected_distance(Proportion{T}, counts[i], wsizes[i])
311+
end
312+
return res, wsizes, ranges
313+
end
314+
192315
"""
193316
distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}})
194317
@@ -206,6 +329,32 @@ function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{Bio
206329
return D, V
207330
end
208331

332+
"""
333+
distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
334+
335+
A distance method which computes pairwise distances using a sliding window.
336+
337+
As the window of `width` base pairs in size moves across a pair of sequences it
338+
computes the distance between the two sequences in that window.
339+
340+
This method computes the JukesCantor69 distance for every window, and returns a tuple of the
341+
matrix of p-distances for every window, a matrix of the number of valid sites
342+
counted by the function for each window.
343+
"""
344+
function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
345+
ps, wsizes, ranges = distance(Proportion{AnyMutation}, seqs, width, step)
346+
a, b = size(ps)
347+
est = Matrix{Float64}(a, b)
348+
var = Matrix{Float64}(a, b)
349+
@inbounds for i in 1:endof(ps)
350+
p = ps[i]
351+
l = wsizes[i]
352+
est[i] = expected_distance(JukesCantor69, p)
353+
var[i] = variance(JukesCantor69, p, l)
354+
end
355+
return est, var, ranges
356+
end
357+
209358
"""
210359
distance{N<:Nucleotide}(::Type{JukesCantor69}, seqs::Matrix{N})
211360
@@ -250,6 +399,37 @@ function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSeque
250399
return D, V
251400
end
252401

402+
"""
403+
distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
404+
405+
A distance method which computes pairwise distances using a sliding window.
406+
407+
As the window of `width` base pairs in size moves across a pair of sequences it
408+
computes the distance between the two sequences in that window.
409+
410+
This method computes the Kimura80 distance for every window, and returns a tuple of the
411+
matrix of p-distances for every window, a matrix of the number of valid sites
412+
counted by the function for each window.
413+
"""
414+
function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
415+
tss, tvs, wsizes, ranges = distance(Count{Kimura80}, seqs, width, step)
416+
a, b = size(tss)
417+
est = Matrix{Float64}(a, b)
418+
var = Matrix{Float64}(a, b)
419+
@inbounds for i in 1:endof(counts)
420+
L = l[i]
421+
P = tss[i] / L
422+
Q = tvs[i] / L
423+
a1 = 1 - 2 * P - Q
424+
a2 = 1 - 2 * Q
425+
tv = tvs[i]
426+
l = wsizes[i]
427+
est[i] = expected_distance(Kimura80, a1, a2)
428+
var[i] = variance(Kimura80, P, Q, L, a1, a2)
429+
end
430+
return est, var, ranges
431+
end
432+
253433
"""
254434
distance{N<:Nucleotide}(::Type{Kimura80}, seqs::Matrix{N})
255435

src/var/mutation_counting.jl

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -196,8 +196,9 @@ function flagmutations{M<:MutationType,N<:Nucleotide}(::Type{M}, seqs::Matrix{N}
196196
return ismutant, isambiguous
197197
end
198198

199-
200-
199+
function flagmutations{M<:MutationType,A<:NucleotideAlphabet}(::Type{M}, seqs::Vector{BioSequence{A}})
200+
return flagmutations(M, seqmatrix(seqs, :seq))
201+
end
201202

202203
"""
203204
count_mutations{T<:MutationType,N<:Nucleotide}(::Type{T}, seqs::Matrix{N})
@@ -263,6 +264,40 @@ function count_mutations{T<:MutationType,A<:NucleotideAlphabet}(::Type{T}, seque
263264
return count_mutations(T, seqs)
264265
end
265266

267+
268+
function flagmutations{N<:Nucleotide}(::Type{TransitionMutation}, ::Type{TransversionMutation}, seqs::Matrix{N})
269+
seqsize, nseqs = size(seqs)
270+
istransition = Matrix{Bool}(seqsize, binomial(nseqs, 2))
271+
istransversion = Matrix{Bool}(seqsize, binomial(nseqs, 2))
272+
isambiguous = Matrix{Bool}(seqsize, binomial(nseqs, 2))
273+
col = 1
274+
@inbounds for i1 in 1:nseqs
275+
s1offset = (i1 - 1) * seqsize
276+
for i2 in i1+1:nseqs
277+
s2offset = (i2 - 1) * seqsize
278+
resoffset = (col - 1) * seqsize
279+
for s in 1:seqsize
280+
s1 = seqs[s1offset + s]
281+
s2 = seqs[s2offset + s]
282+
isamb = is_ambiguous_strict(s1, s2)
283+
isdiff = s1 != s2
284+
ists = is_mutation(TransitionMutation, s1, s2)
285+
isambiguous[resoffset + s] = isamb
286+
istransition[resoffset + s] = !isamb & isdiff & ists
287+
istransversion[resoffset + s] = !isamb & isdiff & !ists
288+
end
289+
col += 1
290+
end
291+
end
292+
return istransition, istransversion, isambiguous
293+
end
294+
295+
function flagmutations{A<:NucleotideAlphabet}(::Type{TransitionMutation}, ::Type{TransversionMutation}, seqs::Vector{BioSequence{A}})
296+
return flagmutations(TransitionMutation, TransversionMutation, seqmatrix(seqs, :seq))
297+
end
298+
299+
300+
266301
"""
267302
count_mutations{N<:Nucleotide}(::Type{TransitionMutation}, ::Type{TransversionMutation}, sequences::Matrix{N})
268303

test/var/runtests.jl

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ end
3939
dnas1 = [dna"ATTG-ACCTGGNTTTCCGAA", dna"A-ACAGAGTATACRGTCGTC"]
4040
m1 = seqmatrix(dnas1, :seq)
4141

42-
dnas2 = [dna"attgaacctggntttccgaa", dna"atacagagtatacrgtcgtc"]
42+
dnas2 = [dna"attgaacctggntttccgaa",
43+
dna"atacagagtatacrgtcgtc"]
44+
dnas3 = [dna"attgaacctgtntttccgaa",
45+
dna"atagaacgtatatrgccgtc"]
4346
m2 = seqmatrix(dnas2, :seq)
4447

4548
@test distance(Count{AnyMutation}, dnas1) == ([12], [16])
@@ -51,6 +54,14 @@ end
5154
@test distance(Count{TransversionMutation}, m1) == ([8], [16])
5255
@test distance(Count{Kimura80}, m1) == ([4], [8], [16])
5356

57+
@test distance(Count{AnyMutation}, dnas2, 5, 5)[1][:] == [2, 4, 3, 3]
58+
@test distance(Count{AnyMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
59+
@test distance(Count{TransitionMutation}, dnas2, 5, 5)[1][:] == [0, 2, 1, 1]
60+
@test distance(Count{TransitionMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
61+
@test distance(Count{TransversionMutation}, dnas2, 5, 5)[1][:] == [2, 2, 2, 2]
62+
@test distance(Count{TransversionMutation}, dnas2, 5, 5)[2][:] == [5, 5, 3, 5]
63+
@test distance(Count{Kimura80}, dnas1) == ([4], [8], [16])
64+
5465
@test distance(Count{AnyMutation}, dnas2) == ([12], [18])
5566
@test distance(Count{TransitionMutation}, dnas2) == ([4], [18])
5667
@test distance(Count{TransversionMutation}, dnas2) == ([8], [18])
@@ -60,6 +71,25 @@ end
6071
@test distance(Count{TransversionMutation}, m2) == ([8], [18])
6172
@test distance(Count{Kimura80}, m2) == ([4], [8], [18])
6273

74+
d = distance(Proportion{AnyMutation}, dnas2, 5, 5)
75+
a = [0.4, 0.8, 1.0, 0.6]
76+
for i in 1:length(d[1])
77+
@test_approx_eq_eps d[1][i] a[i] 1e-4
78+
end
79+
@test d[2][:] == [5, 5, 3, 5]
80+
d = distance(Proportion{TransitionMutation}, dnas2, 5, 5)
81+
a = [0.0, 0.4, 0.333333, 0.2]
82+
for i in 1:length(d[1])
83+
@test_approx_eq_eps d[1][i] a[i] 1e-4
84+
end
85+
@test d[2][:] == [5, 5, 3, 5]
86+
d = distance(Proportion{TransversionMutation}, dnas2, 5, 5)
87+
a = [0.4, 0.4, 0.666667, 0.4]
88+
for i in 1:length(d[1])
89+
@test_approx_eq_eps d[1][i] a[i] 1e-4
90+
end
91+
@test d[2][:] == [5, 5, 3, 5]
92+
6393
@test distance(Proportion{AnyMutation}, dnas1) == ([(12 / 16)], [16])
6494
@test distance(Proportion{TransitionMutation}, dnas1) == ([(4 / 16)], [16])
6595
@test distance(Proportion{TransversionMutation}, dnas1) == ([(8 / 16)], [16])
@@ -81,6 +111,14 @@ end
81111
@test round(distance(JukesCantor69, dnas2)[2][1], 3) == 1
82112
@test round(distance(JukesCantor69, m2)[1][1], 3) == 1.648
83113
@test round(distance(JukesCantor69, m2)[2][1], 3) == 1
114+
@test_throws DomainError distance(JukesCantor69, dnas2, 5, 5)
115+
d = distance(JukesCantor69, dnas3, 5, 5)
116+
a = [0.232616, 0.571605, 0.44084, 0.571605]
117+
v = [0.0595041, 0.220408, 0.24, 0.220408]
118+
for i in 1:length(d[1])
119+
@test_approx_eq_eps d[1][i] a[i] 1e-5
120+
@test_approx_eq_eps d[2][i] v[i] 1e-5
121+
end
84122

85123
@test round(distance(Kimura80, dnas2)[1][1], 3) == 1.648
86124
@test round(distance(Kimura80, dnas2)[2][1], 3) == 1

0 commit comments

Comments
 (0)