Skip to content

Commit 17172e6

Browse files
authored
Merge pull request #116 from J35P312/master
TIDDIT 3.7.0
2 parents 0afb2d4 + ddc3b9e commit 17172e6

File tree

6 files changed

+92
-50
lines changed

6 files changed

+92
-50
lines changed

setup.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from setuptools import setup
22
import numpy
3+
import pyximport
4+
import pysam
5+
pyximport.install()
6+
37

48
try:
59
from Cython.Build import cythonize
@@ -20,14 +24,16 @@
2024

2125
setup(
2226
name = 'tiddit',
23-
version = '3.6.1',
27+
version = '3.7.0',
2428

2529

2630
url = "https://github.com/SciLifeLab/TIDDIT",
2731
author = "Jesper Eisfeldt",
2832
author_email= "[email protected]",
2933
ext_modules = ext_modules,
30-
include_dirs=[numpy.get_include()],
34+
extra_link_args=pysam.get_libraries(),
35+
define_macros=pysam.get_defines(),
36+
include_dirs=[numpy.get_include()]+pysam.get_include(),
3137
packages = ['tiddit'],
3238
install_requires = ['numpy','pysam'],
3339
entry_points = {'console_scripts': ['tiddit = tiddit.__main__:main']},

tiddit/__main__.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
1818

1919
def main():
20-
version="3.6.1"
20+
version="3.7.0"
2121
parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
2222
parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
2323
parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
@@ -27,6 +27,7 @@ def main():
2727

2828
parser = argparse.ArgumentParser("""tiddit --sv --bam inputfile [-o prefix] --ref ref.fasta""")
2929
parser.add_argument('--sv' , help="call structural variation", required=False, action="store_true")
30+
parser.add_argument('--force_overwrite' , help="force the analysis and overwrite any data in the output folder", required=False, action="store_true")
3031
parser.add_argument('--bam', type=str,required=True, help="coordinate sorted bam file(required)")
3132
parser.add_argument('-o', type=str,default="output", help="output prefix(default=output)")
3233
parser.add_argument('-i', type=int, help="paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)")
@@ -48,6 +49,7 @@ def main():
4849
parser.add_argument('--fermi2', type=str,default="fermi2", help="path to fermi2 executable file (default=fermi2)")
4950
parser.add_argument('--ropebwt2', type=str , default="ropebwt2", help="path to ropebwt2 executable file (default=ropebwt2)")
5051
parser.add_argument('--skip_assembly', action="store_true", help="Skip running local assembly, tiddit will perform worse, but wont require fermi2, bwa, ropebwt and bwa indexed ref")
52+
#parser.add_argument('--skip_index', action="store_true", help="Do not generate the csi index")
5153
parser.add_argument('--p_ratio', type=float,default=0.1, help="minimum discordant pair/normal pair ratio at the breakpoint junction(default=0.1)")
5254
parser.add_argument('--r_ratio', type=float,default=0.1, help="minimum split read/coverage ratio at the breakpoint junction(default=0.1)")
5355
parser.add_argument('--max_coverage', type=float,default=4, help="filter call if X times higher than chromosome average coverage (default=4)")
@@ -115,10 +117,21 @@ def main():
115117
i+=1
116118

117119
prefix=args.o
118-
os.mkdir(f"{prefix}_tiddit")
119-
os.mkdir(f"{prefix}_tiddit/clips")
120+
try:
121+
os.mkdir(f"{prefix}_tiddit")
122+
os.mkdir(f"{prefix}_tiddit/clips")
123+
except:
124+
if args.force_overwrite:
125+
pass
126+
else:
127+
print("Error: output folder exists")
128+
quit()
120129

121-
pysam.index("-c","-m","6","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
130+
#if not args.skip_index:
131+
t=time.time()
132+
print("Creating index")
133+
pysam.index("-c","-m","4","-@",str(args.threads),bam_file_name,"{}_tiddit/{}.csi".format(args.o,sample_id))
134+
print("Created index in: " + str(time.time()-t) )
122135

123136
min_mapq=args.q
124137
max_ins_len=100000
@@ -132,7 +145,7 @@ def main():
132145

133146

134147
t=time.time()
135-
coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig)
148+
coverage_data=tiddit_signal.main(bam_file_name,args.ref,prefix,min_mapq,max_ins_len,sample_id,args.threads,args.min_contig,False)
136149
print("extracted signals in:")
137150
print(t-time.time())
138151

@@ -153,6 +166,8 @@ def main():
153166

154167
if not args.e:
155168
args.e=int(library["avg_insert_size"]/2.0)
169+
if not args.e:
170+
args.e=50
156171

157172
t=time.time()
158173
sv_clusters=tiddit_cluster.main(prefix,contigs,contig_length,samples,library["mp"],args.e,args.l,max_ins_len,args.min_contig,args.skip_assembly,args.r)

tiddit/tiddit_cluster.pyx

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@ import sys
22
import os
33
import tiddit.DBSCAN as DBSCAN
44
import numpy
5-
import statistics
6-
from statistics import mode
5+
from collections import Counter
76

87
def find_discordant_pos(fragment,is_mp):
98
if is_mp:
@@ -267,16 +266,16 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
267266

268267

269268
if candidates[chrA][chrB][candidate]["N_splits"] and min_reads <= candidates[chrA][chrB][candidate]["N_splits"]:
270-
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["splits"])
271-
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["splits"])
269+
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["splits"]).most_common(1)[0][0]
270+
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["splits"]).most_common(1)[0][0]
272271

273272
elif candidates[chrA][chrB][candidate]["N_contigs"]:
274-
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["contigs"])
275-
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["contigs"])
273+
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]).most_common(1)[0][0]
274+
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]).most_common(1)[0][0]
276275

277276
elif candidates[chrA][chrB][candidate]["N_splits"]:
278-
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["splits"])
279-
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["splits"])
277+
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["splits"]).most_common(1)[0][0]
278+
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["splits"]).most_common(1)[0][0]
280279

281280
else:
282281
reverse_A = candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"].count("True")
@@ -329,9 +328,8 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
329328
candidates[chrA][chrB][candidate]["posB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
330329

331330
else:
332-
candidates[chrA][chrB][candidate]["posA"]=mode(candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
333-
candidates[chrA][chrB][candidate]["posB"]=mode(candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
334-
331+
candidates[chrA][chrB][candidate]["posA"]=Counter(candidates[chrA][chrB][candidate]["positions_A"]["discordants"]).most_common(1)[0][0]
332+
candidates[chrA][chrB][candidate]["posB"]=Counter(candidates[chrA][chrB][candidate]["positions_B"]["discordants"]).most_common(1)[0][0]
335333

336334
candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["start"])
337335
candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["end"])

tiddit/tiddit_signal.pyx

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@ import os
44
import itertools
55
import time
66
from joblib import Parallel, delayed
7+
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
78

89
import tiddit.tiddit_coverage as tiddit_coverage
910

1011
def find_SA_query_range(SA):
11-
a =pysam.AlignedSegment()
12+
cdef a =pysam.AlignedSegment()
1213
a.reference_start=int( SA[1] )
1314

1415
if SA[2] == "+":
@@ -143,21 +144,27 @@ def SA_analysis(read,min_q,tag,reference_name):
143144

144145
return(split)
145146

146-
def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size):
147+
def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int bin_size,skip_index):
147148
print("Collecting signals on contig: {}".format(chromosome))
148-
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
149+
150+
bam_index="{}_tiddit/{}.csi".format(prefix,sample_id)
151+
if skip_index:
152+
bam_index=False
153+
154+
cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename=bam_index)
149155
bam_header=samfile.header
150156
coverage_data,end_bin_size=tiddit_coverage.create_coverage(bam_header,bin_size,chromosome)
151157

152-
clips=[]
153-
data=[]
154-
splits=[]
158+
cdef list clips=[]
159+
cdef list data=[]
160+
cdef list splits=[]
155161

156-
clip_dist=100
162+
cdef int clip_dist=100
157163

158164
cdef long read_position
159165
cdef long read_end
160166
cdef int mapq
167+
cdef AlignedSegment read
161168

162169
for read in samfile.fetch(chromosome,until_eof=True):
163170

@@ -219,16 +226,16 @@ def worker(str chromosome, str bam_file_name,str ref,str prefix,int min_q,int ma
219226

220227
return(chromosome,data,splits,coverage_data, "{}_tiddit/clips/{}.fa".format(prefix,chromosome) )
221228

222-
def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig):
229+
def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_id, int threads, int min_contig,skip_index):
223230

224-
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref,index_filename="{}_tiddit/{}.csi".format(prefix,sample_id))
231+
cdef AlignmentFile samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
225232
bam_header=samfile.header
226233
samfile.close()
227234
cdef int bin_size=50
228-
cdef str file_type="wig"
235+
cdef str file_type=u"wig"
229236
cdef str outfile=prefix+".tiddit_coverage.wig"
230237

231-
t_tot=0
238+
cdef long t_tot=0
232239

233240
cdef dict data={}
234241
cdef dict splits={}
@@ -248,7 +255,7 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
248255
splits[chrA["SN"]][chrB["SN"]]={}
249256

250257
t=time.time()
251-
res=Parallel(n_jobs=threads)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size) for chromosome in chromosomes )
258+
res=Parallel(n_jobs=threads,timeout=99999)( delayed(worker)(chromosome,bam_file_name,ref,prefix,min_q,max_ins,sample_id,bin_size,skip_index) for chromosome in chromosomes )
252259

253260
chromosomes=set(chromosomes)
254261
for i in range(0,len(res)):

tiddit/tiddit_stats.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import pysam
22
import numpy
3+
import time
4+
35
def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
46
library={}
57
samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=ref)
@@ -10,8 +12,16 @@ def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
1012
is_outtie=0
1113

1214
n_sampled=0
15+
t=time.time()
16+
1317
for read in samfile.fetch():
14-
18+
19+
read_length.append( read.query_length )
20+
n_sampled+=1
21+
22+
if n_sampled > n_reads:
23+
break
24+
1525
if read.mate_is_unmapped:
1626
continue
1727

@@ -27,25 +37,27 @@ def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
2737
if read.is_supplementary or read.is_secondary or read.is_duplicate or read.mapq < min_mapq:
2838
continue
2939

30-
n_sampled+=1
31-
3240
insert_size.append( read.template_length )
33-
read_length.append( read.query_length )
3441

3542
if read.is_reverse and not read.mate_is_reverse:
3643
is_outtie+=1
3744
else:
3845
is_innie+=1
3946

40-
if n_sampled > n_reads:
41-
break
4247

4348
samfile.close()
4449

4550
library["avg_read_length"]=numpy.average(read_length)
46-
library["avg_insert_size"]=numpy.average(insert_size)
47-
library["std_insert_size"]=numpy.std(insert_size)
48-
library["percentile_insert_size"]=numpy.percentile(insert_size, 99.9)
51+
if len(insert_size):
52+
library["avg_insert_size"]=numpy.average(insert_size)
53+
library["std_insert_size"]=numpy.std(insert_size)
54+
library["percentile_insert_size"]=numpy.percentile(insert_size, 99.9)
55+
else:
56+
library["avg_insert_size"]=0
57+
library["std_insert_size"]=0
58+
library["percentile_insert_size"]=0
59+
60+
4961

5062
print("LIBRARY STATISTICS")
5163
if is_innie > is_outtie:
@@ -60,6 +72,7 @@ def statistics(bam_file_name,ref,min_mapq,max_ins_len,n_reads):
6072
print("\tAverage insert size = {}".format(library["avg_insert_size"]) )
6173
print("\tStdev insert size = {}".format(library["std_insert_size"] ) )
6274
print("\t99.9 percentile insert size = {}".format( library["percentile_insert_size"]) )
75+
print("Calculated statistics in: " + str( t-time.time() ))
6376
print("")
6477

6578
return(library)

tiddit/tiddit_variant.pyx

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,8 @@ import math
33
import numpy
44
from joblib import Parallel, delayed
55

6-
#from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
7-
from pysam import AlignmentFile, AlignedSegment
8-
9-
6+
import pysam
7+
from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
108

119
def percentile(a, q):
1210
size = len(a)
@@ -53,15 +51,14 @@ def scoring(scoring_dict,percentiles):
5351

5452
return(max(score))
5553

56-
def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number):
54+
def get_region(AlignmentFile samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number):
5755

5856
cdef int low_q=0
5957
cdef int n_reads=0
6058
cdef long bases=0
6159
cdef int n_discs=0
6260
cdef int n_splits=0
6361

64-
6562
cdef int crossing_r=0
6663
cdef int crossing_f=0
6764

@@ -83,6 +80,8 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
8380
cdef long r_start
8481
cdef long r_end
8582

83+
cdef AlignedSegment read
84+
8685
for read in samfile.fetch(chr, q_start, q_end):
8786
if read.is_unmapped:
8887
continue
@@ -233,13 +232,14 @@ def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_s
233232
return(filt)
234233

235234
def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs):
236-
237-
samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
235+
cdef AlignmentFile samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
238236
variants=[]
239237

240238
var_n=0
241239
for chrB in sv_clusters[chrA]:
240+
242241
for cluster in sv_clusters[chrA][chrB]:
242+
243243
n_discordants=sv_clusters[chrA][chrB][cluster]["N_discordants"]
244244
n_splits=sv_clusters[chrA][chrB][cluster]["N_splits"]
245245
n_contigs=sv_clusters[chrA][chrB][cluster]["N_contigs"]
@@ -261,21 +261,24 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
261261
s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startA"]/50.0))
262262
e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endA"]/50.0))+1
263263
avg_a=numpy.average(coverage_data[chrA][s:e])
264+
#print(f"{chrA}-{posA}-{chrB}")
264265

265266
if avg_a > args.max_coverage*library[ "avg_coverage_{}".format(chrA) ]:
266267
continue
267268
elif (args.max_coverage*n_discordants/avg_a < args.p_ratio/2 and args.max_coverage*n_splits/avg_a < args.r_ratio/2) and not n_contigs:
268269
continue
269270

270-
avg_b=numpy.average(coverage_data[chrA][s:e])
271+
s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startB"]/50.0))
272+
e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endB"]/50.0))+1
273+
avg_b=numpy.average(coverage_data[chrB][s:e])
274+
271275
if avg_b == 0:
272276
continue
273277
elif avg_b > args.max_coverage*library[ "avg_coverage_{}".format(chrB) ]:
274278
continue
275279
elif (args.max_coverage*n_discordants/avg_b < args.p_ratio/2 and args.max_coverage*n_splits/avg_b < args.r_ratio/2) and not n_contigs:
276280
continue
277281

278-
279282
var_n+=1
280283
sample_data={}
281284
for sample in samples:
@@ -544,7 +547,7 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
544547
for chrB in sv_clusters[chrA]:
545548
variants[chrB]=[]
546549

547-
variants_list=Parallel(n_jobs=args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
550+
variants_list=Parallel(n_jobs=args.threads,prefer="threads",timeout=99999)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
548551

549552
ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]}
550553
for v in variants_list:

0 commit comments

Comments
 (0)