@@ -59,6 +59,10 @@ immutable Kimura80 <: TsTv end
5959# Distance computation internals
6060# ------------------------------
6161
# Proportion (p-distance) computation: `n` divided by `l`, where the sliding
# window `distance` method in this file passes a mutation count as `n` and the
# matching window size as `l`.
#
# Both arguments accept any `Integer` rather than only `Int64`: the callers
# read them out of `Matrix{Int}` containers, and `Int` is `Int32` on 32-bit
# platforms, where an `Int64`-only signature would raise a `MethodError`.
# Integer `/` always produces a floating point value, so `l == 0` yields
# `NaN` or `Inf` rather than throwing.
@inline function expected_distance{T}(::Type{Proportion{T}}, n::Integer, l::Integer)
    return n / l
end
65+
6266# # Jukes and Cantor 1969 distance computation.
6367
6468@inline function expected_distance (:: Type{JukesCantor69} , p:: Float64 )
@@ -129,6 +133,104 @@ function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs:
129133 return count_mutations (T, seqs)
130134end
131135
"""
    distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Count the mutations of type `T` between pairs of sequences using a sliding window.

A window of `width` positions is moved along the flagged sites `step` positions
at a time. Only windows that fit completely are evaluated, so trailing positions
that cannot fill a whole window are not counted.

Returns a tuple of three values:

1. a matrix of mutation counts, with one row per window and one column per
   pair of sequences,
2. a matrix of the same shape holding, for every window, `width` minus the
   number of sites flagged as ambiguous in that window,
3. a vector with the `UnitRange` of positions covered by each window.

Throws an `ArgumentError` if `width < 1`, if `step < 1`, or if `width` is
greater than the number of flagged positions.
"""
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    mutation_flags, ambiguous_flags = flagmutations(T, seqs)
    nbases, npairs = size(mutation_flags)
    if width < 1
        throw(ArgumentError(" `window` width must be ≥ 1."))
    end
    if step < 1
        throw(ArgumentError(" `step` must be ≥ 1."))
    end
    if width > nbases
        throw(ArgumentError(" The `window` size cannot be greater than number of data elements."))
    end

    # The windows are identical for every pair, so they are built once here
    # instead of being rewritten inside the pair loop. This also guarantees
    # that `ranges` is fully initialised when there are no pairs at all.
    ranges = UnitRange{Int}[s:(s + width - 1) for s in 1:step:(nbases - width + 1)]
    nwindows = length(ranges)
    mcounts = Matrix{Int}(nwindows, npairs)
    wsizes = Matrix{Int}(nwindows, npairs)

    # NOTE(review): `ambiguous_flags` is indexed with the same (site, pair)
    # coordinates as `mutation_flags`, i.e. both are assumed to share one
    # shape; `flagmutations` is defined elsewhere, so confirm against it.
    @inbounds for pair in 1:npairs
        for i in 1:nwindows
            window = ranges[i]
            mcount = 0
            nsites = width
            @simd for j in first(window):last(window)
                mcount += mutation_flags[j, pair]
                nsites -= ambiguous_flags[j, pair]
            end
            mcounts[i, pair] = mcount
            wsizes[i, pair] = nsites
        end
    end
    return mcounts, wsizes, ranges
end
187+
188+
"""
    distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Count transitions and transversions between pairs of sequences using a sliding
window.

A window of `width` positions is moved along the flagged sites `step` positions
at a time. Only windows that fit completely are evaluated, so trailing positions
that cannot fill a whole window are not counted.

Returns a tuple of four values:

1. a matrix of transition counts, with one row per window and one column per
   pair of sequences,
2. a matrix of transversion counts with the same shape,
3. a matrix of the same shape holding, for every window, `width` minus the
   number of sites flagged as ambiguous in that window,
4. a vector with the `UnitRange` of positions covered by each window.

Throws an `ArgumentError` if `width < 1`, if `step < 1`, or if `width` is
greater than the number of flagged positions.
"""
function distance{T<:TsTv,A<:NucleotideAlphabet}(::Type{Count{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    transition_flags, transversion_flags, ambiguous_flags = flagmutations(TransitionMutation, TransversionMutation, seqs)
    nbases, npairs = size(transition_flags)
    if width < 1
        throw(ArgumentError(" `window` width must be ≥ 1."))
    end
    if step < 1
        throw(ArgumentError(" `step` must be ≥ 1."))
    end
    if width > nbases
        throw(ArgumentError(" The `window` size cannot be greater than number of data elements."))
    end

    # The windows are identical for every pair, so they are built once here
    # instead of being rewritten inside the pair loop. This also guarantees
    # that `ranges` is fully initialised when there are no pairs at all.
    ranges = UnitRange{Int}[s:(s + width - 1) for s in 1:step:(nbases - width + 1)]
    nwindows = length(ranges)
    tscounts = Matrix{Int}(nwindows, npairs)
    tvcounts = Matrix{Int}(nwindows, npairs)
    wsizes = Matrix{Int}(nwindows, npairs)

    # NOTE(review): all three flag arrays are indexed with the same
    # (site, pair) coordinates, i.e. they are assumed to share one shape;
    # `flagmutations` is defined elsewhere, so confirm against it.
    @inbounds for pair in 1:npairs
        for i in 1:nwindows
            window = ranges[i]
            tscount = 0
            tvcount = 0
            nsites = width
            @simd for j in first(window):last(window)
                tscount += transition_flags[j, pair]
                tvcount += transversion_flags[j, pair]
                nsites -= ambiguous_flags[j, pair]
            end
            tscounts[i, pair] = tscount
            tvcounts[i, pair] = tvcount
            wsizes[i, pair] = nsites
        end
    end
    return tscounts, tvcounts, wsizes, ranges
end
232+
233+
132234"""
133235 distance{T<:MutationType,N<:Nucleotide}(::Type{Count{T}}, seqs::Matrix{N})
134236
@@ -164,7 +266,7 @@ vector of the number of valid (i.e. non-ambiguous sites) counted by the function
164266function distance {T<:MutationType,A<:NucleotideAlphabet} (:: Type{Proportion{T}} , seqs:: Vector{BioSequence{A}} )
165267 d, l = distance (Count{T}, seqs)
166268 D = Vector {Float64} (length (d))
167- @inbounds for i in 1 : length (D)
269+ @inbounds @simd for i in 1 : length (D)
168270 D[i] = d[i] / l[i]
169271 end
170272 return D, l
174276 distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Matrix{N})
175277
176278This method of distance returns a tuple of a vector of the p-distances, and a
177- vector of the number of valid (i.e. non-ambiguous sites) counted by the function.
279+ vector of the number of valid (i.e. non-ambiguous) sites counted by the function.
178280
179281**Note: This method assumes that the sequences are stored in the `Matrix{N}`
180282provided as `seqs` in sequence major order i.e. each column of the matrix is one
@@ -189,6 +291,27 @@ function distance{T<:MutationType,N<:Nucleotide}(::Type{Proportion{T}}, seqs::Ma
189291 return D, l
190292end
191293
"""
    distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Compute p-distances between pairs of sequences using a sliding window.

The mutation counts and window sizes are obtained from the sliding window
`Count{T}` method of `distance`, called with the same `width` and `step`; each
p-distance is the count of a window divided by the size of that window.

Returns a tuple of three values: the matrix of p-distances, the matrix of
window sizes, and the window ranges, the latter two exactly as returned by
the `Count{T}` method.
"""
function distance{T<:MutationType,A<:NucleotideAlphabet}(::Type{Proportion{T}}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    nmutations, nsites, windows = distance(Count{T}, seqs, width, step)
    pdists = Matrix{Float64}(size(nmutations))
    @inbounds for k in eachindex(nmutations)
        pdists[k] = expected_distance(Proportion{T}, nmutations[k], nsites[k])
    end
    return pdists, nsites, windows
end
314+
192315"""
193316 distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}})
194317
@@ -206,6 +329,32 @@ function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{Bio
206329 return D, V
207330end
208331
"""
    distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Compute JukesCantor69 distances between pairs of sequences using a sliding
window.

The p-distances and window sizes are obtained from the sliding window
`Proportion{AnyMutation}` method of `distance`, called with the same `width`
and `step`. For every window the p-distance is turned into a distance
estimate, and the p-distance together with the window size into a variance.

Returns a tuple of three values: the matrix of distance estimates, the matrix
of their variances, and the window ranges as returned by the
`Proportion{AnyMutation}` method.
"""
function distance{A<:NucleotideAlphabet}(::Type{JukesCantor69}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    pdists, nsites, windows = distance(Proportion{AnyMutation}, seqs, width, step)
    estimates = Matrix{Float64}(size(pdists))
    variances = Matrix{Float64}(size(pdists))
    @inbounds for k in eachindex(pdists)
        estimates[k] = expected_distance(JukesCantor69, pdists[k])
        variances[k] = variance(JukesCantor69, pdists[k], nsites[k])
    end
    return estimates, variances, windows
end
357+
209358"""
210359 distance{N<:Nucleotide}(::Type{JukesCantor69}, seqs::Matrix{N})
211360
@@ -250,6 +399,37 @@ function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSeque
250399 return D, V
251400end
252401
"""
    distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)

Compute Kimura80 distances between pairs of sequences using a sliding window.

The transition counts, transversion counts and window sizes are obtained from
the sliding window `Count{Kimura80}` method of `distance`, called with the same
`width` and `step`. For every window the counts are divided by the window size
to give the proportions `P` (transitions) and `Q` (transversions), from which
the distance estimate and its variance are computed.

Returns a tuple of three values: the matrix of distance estimates, the matrix
of their variances, and the window ranges as returned by the `Count{Kimura80}`
method.
"""
function distance{A<:NucleotideAlphabet}(::Type{Kimura80}, seqs::Vector{BioSequence{A}}, width::Int, step::Int)
    tss, tvs, wsizes, ranges = distance(Count{Kimura80}, seqs, width, step)
    nwindows, npairs = size(tss)
    est = Matrix{Float64}(nwindows, npairs)
    variances = Matrix{Float64}(nwindows, npairs)
    # Iterate over the count matrix itself: the loop previously referenced
    # `counts`, which is not defined in this method.
    @inbounds for i in 1:endof(tss)
        # The window size comes from `wsizes`; it was previously read from
        # `l`, a loop local that was only assigned further down.
        L = wsizes[i]
        P = tss[i] / L
        Q = tvs[i] / L
        a1 = 1 - 2 * P - Q
        a2 = 1 - 2 * Q
        est[i] = expected_distance(Kimura80, a1, a2)
        variances[i] = variance(Kimura80, P, Q, L, a1, a2)
    end
    return est, variances, ranges
end
432+
253433"""
254434 distance{N<:Nucleotide}(::Type{Kimura80}, seqs::Matrix{N})
255435
0 commit comments