Skip to content

Commit 936c3cd

Browse files
committed
modified: setup.py
modified: tiddit/__main__.py modified: tiddit/tiddit_cluster.pyx modified: tiddit/tiddit_contig_analysis.pyx modified: tiddit/tiddit_signal.pyx modified: tiddit/tiddit_variant.pyx
1 parent dc37e29 commit 936c3cd

File tree

6 files changed

+121
-27
lines changed

6 files changed

+121
-27
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
setup(
2222
name = 'tiddit',
23-
version = '3.3.2',
23+
version = '3.4.0',
2424

2525
url = "https://github.com/SciLifeLab/TIDDIT",
2626
author = "Jesper Eisfeldt",

tiddit/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
1818

1919
def main():
20-
version="3.3.2"
20+
version="3.4.0"
2121
parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
2222
parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
2323
parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")

tiddit/tiddit_cluster.pyx

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
7070
if int(posB) > contig_length[chrB]:
7171
posA=contig_length[chrB]
7272

73-
discordants[chrA][chrB].append([content[0],sample,"D",posA,content[5],posB,content[8],i])
73+
discordants[chrA][chrB].append([content[0],sample,"D",posA,content[5],posB,content[8],i,int(content[3]),int(content[4]),int(content[6]),int(content[7])])
7474
positions[chrA][chrB].append([int(posA),int(posB),i])
7575
i+=1
7676

@@ -99,7 +99,7 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
9999
if int(posB) > contig_length[chrB]:
100100
posB=contig_length[chrB]
101101

102-
discordants[chrA][chrB].append([content[0],sample,"S",posA,content[4],posB,content[6],i])
102+
discordants[chrA][chrB].append([content[0],sample,"S",posA,content[4],posB,content[6],i,int(content[7]),int(content[8]),int(content[9]),int(content[10])])
103103
positions[chrA][chrB].append([int(posA),int(posB),i])
104104
i+=1
105105

@@ -132,7 +132,7 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
132132
if int(posB) > contig_length[chrB]:
133133
posB=contig_length[chrB]
134134

135-
discordants[chrA][chrB].append([content[0],sample,"A",posA,content[4],posB,content[6],i])
135+
discordants[chrA][chrB].append([content[0],sample,"A",posA,content[4],posB,content[6],i,int(content[7]),int(content[8]),int(content[9]),int(content[10])])
136136
positions[chrA][chrB].append([int(posA),int(posB),i])
137137
contigs.add(i)
138138
i+=1
@@ -197,6 +197,9 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
197197
candidates[chrA][chrB][candidate]["positions_A"]["orientation_contigs"]=[]
198198
candidates[chrA][chrB][candidate]["positions_A"]["orientation_splits"]=[]
199199
candidates[chrA][chrB][candidate]["positions_A"]["orientation_discordants"]=[]
200+
candidates[chrA][chrB][candidate]["positions_A"]["start"]=[]
201+
candidates[chrA][chrB][candidate]["positions_A"]["end"]=[]
202+
200203
candidates[chrA][chrB][candidate]["start_A"]=0
201204
candidates[chrA][chrB][candidate]["end_A"]=0
202205

@@ -208,6 +211,8 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
208211
candidates[chrA][chrB][candidate]["positions_B"]["orientation_contigs"]=[]
209212
candidates[chrA][chrB][candidate]["positions_B"]["orientation_splits"]=[]
210213
candidates[chrA][chrB][candidate]["positions_B"]["orientation_discordants"]=[]
214+
candidates[chrA][chrB][candidate]["positions_B"]["start"]=[]
215+
candidates[chrA][chrB][candidate]["positions_B"]["end"]=[]
211216

212217
candidates[chrA][chrB][candidate]["start_B"]=0
213218
candidates[chrA][chrB][candidate]["end_B"]=0
@@ -218,7 +223,13 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
218223
candidates[chrA][chrB][candidate]["sample_contigs"][discordants[chrA][chrB][i][1]]=set([])
219224

220225
candidates[chrA][chrB][candidate]["samples"].add(discordants[chrA][chrB][i][1])
221-
226+
227+
candidates[chrA][chrB][candidate]["positions_A"]["start"].append(discordants[chrA][chrB][i][8])
228+
candidates[chrA][chrB][candidate]["positions_A"]["end"].append(discordants[chrA][chrB][i][9])
229+
230+
candidates[chrA][chrB][candidate]["positions_B"]["start"].append(discordants[chrA][chrB][i][10])
231+
candidates[chrA][chrB][candidate]["positions_B"]["end"].append(discordants[chrA][chrB][i][11])
232+
222233
if discordants[chrA][chrB][i][2] == "D":
223234
candidates[chrA][chrB][candidate]["discordants"].add(discordants[chrA][chrB][i][0])
224235
candidates[chrA][chrB][candidate]["positions_A"]["discordants"].append(int(discordants[chrA][chrB][i][3]))
@@ -321,11 +332,11 @@ def main(prefix,chromosomes,contig_length,samples,is_mp,epsilon,m,max_ins_len,mi
321332
candidates[chrA][chrB][candidate]["posA"]
322333
candidates[chrA][chrB][candidate]["posB"]
323334

324-
candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]+candidates[chrA][chrB][candidate]["positions_B"]["splits"]+candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
325-
candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["contigs"]+candidates[chrA][chrB][candidate]["positions_B"]["splits"]+candidates[chrA][chrB][candidate]["positions_B"]["discordants"])
335+
candidates[chrA][chrB][candidate]["startB"]=min(candidates[chrA][chrB][candidate]["positions_B"]["start"])
336+
candidates[chrA][chrB][candidate]["endB"]=max(candidates[chrA][chrB][candidate]["positions_B"]["end"])
326337

327-
candidates[chrA][chrB][candidate]["startA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]+candidates[chrA][chrB][candidate]["positions_A"]["splits"]+candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
328-
candidates[chrA][chrB][candidate]["endA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["contigs"]+candidates[chrA][chrB][candidate]["positions_A"]["splits"]+candidates[chrA][chrB][candidate]["positions_A"]["discordants"])
338+
candidates[chrA][chrB][candidate]["startA"]=min(candidates[chrA][chrB][candidate]["positions_A"]["start"])
339+
candidates[chrA][chrB][candidate]["endA"]=max(candidates[chrA][chrB][candidate]["positions_A"]["end"])
329340

330341
return(candidates)
331342

tiddit/tiddit_contig_analysis.pyx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ def read_contigs(aligned_contigs,prefix,sample_id,min_size):
6363
current_bp=read.reference_start
6464
for i in range(0,len(read.cigartuples)-1):
6565
if read.cigartuples[i][0] == 2 and read.cigartuples[i][1] > min_size:
66-
split_contigs[read.reference_name][read.reference_name]["{}_d_{}".format(read.query_name,i)]=[current_bp,read.is_reverse,current_bp+read.cigartuples[i][1],read.is_reverse]
66+
67+
split_contigs[read.reference_name][read.reference_name]["{}_d_{}".format(read.query_name,i)]=[current_bp,read.is_reverse,current_bp+read.cigartuples[i][1],read.is_reverse,read.reference_start,current_bp,current_bp+read.cigartuples[i][1],read.reference_end]
6768
current_bp+=read.cigartuples[i][1]
6869

6970
f=open("{}_tiddit/contigs_{}.tab".format(prefix,sample_id),"w")

tiddit/tiddit_signal.pyx

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -66,18 +66,40 @@ def SA_analysis(read,min_q,tag,reference_name):
6666
cdef long read_query_alignment_end=read.query_alignment_end
6767

6868
if (read.query_alignment_start ) < (read.query_length - read_query_alignment_end):
69-
split_pos=read.reference_end+1
69+
if read.is_reverse:
70+
71+
72+
split_pos=read.reference_start+1
73+
else:
74+
split_pos=read.reference_end+1
7075
else:
71-
split_pos=read.reference_start+1
76+
if read.is_reverse:
77+
split_pos=read.reference_end+1
78+
else:
79+
split_pos=read.reference_start+1
7280

7381
supplementry_alignment=find_SA_query_range(SA_data)
7482
SA_chr=SA_data[0]
7583

84+
startA=read.reference_start+1
85+
endA=read.reference_end+1
86+
87+
startB=supplementry_alignment.reference_start
88+
endB=supplementry_alignment.reference_end
89+
7690

7791
if (supplementry_alignment.query_alignment_start ) < (supplementry_alignment.query_length - read_query_alignment_end):
78-
SA_split_pos=supplementry_alignment.reference_end
92+
if SA_data[2] == "-":
93+
94+
SA_split_pos=supplementry_alignment.reference_start
95+
else:
96+
SA_split_pos=supplementry_alignment.reference_end
7997
else:
80-
SA_split_pos=supplementry_alignment.reference_start
98+
if SA_data[2] == "-":
99+
SA_split_pos=supplementry_alignment.reference_end
100+
101+
else:
102+
SA_split_pos=supplementry_alignment.reference_start
81103

82104

83105
if SA_chr < reference_name:
@@ -87,6 +109,12 @@ def SA_analysis(read,min_q,tag,reference_name):
87109
split_pos=SA_split_pos
88110
SA_split_pos=tmp
89111

112+
startB=read.reference_start+1
113+
endB=read.reference_end+1
114+
startA=supplementry_alignment.reference_start
115+
endA=supplementry_alignment.reference_end
116+
117+
90118
else:
91119
chrA=reference_name
92120
chrB=SA_chr
@@ -97,11 +125,16 @@ def SA_analysis(read,min_q,tag,reference_name):
97125
split_pos=SA_split_pos
98126
SA_split_pos=tmp
99127

128+
startB=read.reference_start+1
129+
endB=read.reference_end+1
130+
startA=supplementry_alignment.reference_start
131+
endA=supplementry_alignment.reference_end
132+
100133
split=[]
101134
if "-" == SA_data[2]:
102-
split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True]
135+
split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos, True,startA,endA,startB,endB]
103136
else:
104-
split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False]
137+
split=[chrA,chrB,read.query_name,split_pos,read.is_reverse,SA_split_pos,False,startA,endA,startB,endB]
105138
#splits[chrA][chrB][read.query_name]+=[split_pos,read.is_reverse,SA_split_pos,False]
106139

107140
return(split)

tiddit/tiddit_variant.pyx

Lines changed: 59 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,37 @@ from joblib import Parallel, delayed
66
#from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
77
from pysam import AlignmentFile, AlignedSegment
88

9+
10+
def scoring(scoring_dict,percentiles):
    """Return a support score for one variant call.

    The score is the maximum over the per-evidence scores:

    * 50 if the call is supported by at least one contig;
    * for discordant pairs (when ``n_discordants`` is non-zero), one score
      for side A and one for side B: 5 points for every cutoff in
      ``percentiles["FA"]`` / ``percentiles["FB"]`` that the fraction
      ``n_discordants/(refF? + n_discordants)`` reaches;
    * for split reads (when ``n_splits`` is non-zero), the same using
      ``refRA``/``refRB`` and ``percentiles["RA"]`` / ``percentiles["RB"]``.

    Parameters
    ----------
    scoring_dict : dict
        Must provide ``n_contigs``, ``n_discordants`` and ``n_splits``.
        ``refFA``/``refFB`` are read only when ``n_discordants`` is non-zero,
        ``refRA``/``refRB`` only when ``n_splits`` is non-zero.
    percentiles : dict
        Maps ``"FA"``, ``"FB"``, ``"RA"``, ``"RB"`` to an iterable of cutoff
        values; a key is read only when the matching evidence is present.

    Returns
    -------
    int
        The highest per-evidence score, 0 when there is no evidence at all.
    """
    score=[0]
    if scoring_dict["n_contigs"]:
        score.append(50)

    # evidence count key -> ((reference count key, percentile key) per side)
    evidence=(
        ("n_discordants",(("refFA","FA"),("refFB","FB"))),
        ("n_splits",(("refRA","RA"),("refRB","RB"))),
    )

    for count_key,sides in evidence:
        n_support=scoring_dict[count_key]
        if not n_support:
            continue

        for ref_key,percentile_key in sides:
            # the supporting fraction does not depend on the cutoff, so it is
            # computed once per side instead of once per percentile
            fraction=n_support/(scoring_dict[ref_key]+n_support)
            score.append(5*sum(1 for cutoff in percentiles[percentile_key] if fraction >= cutoff))

    return(max(score))
39+
940
def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, contig_number):
1041

1142
cdef int low_q=0
@@ -67,10 +98,10 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
6798
r_start=read_reference_start
6899
r_end=read_reference_end
69100

70-
if read_reference_start < bp-10 and r_end > bp:
101+
if read_reference_start < bp-20 and r_end > bp+20:
71102
crossing_r+=1
72103

73-
mate_bp_read= (read.next_reference_start < bp and r_end > bp)
104+
mate_bp_read= (read.next_reference_start < bp-50 and r_end > bp+50)
74105
discordant= ( abs(read.isize) > max_ins or read_next_reference_name != read_reference_name )
75106

76107
if mate_bp_read and not discordant:
@@ -281,14 +312,16 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
281312

282313
#configure filters for CNV based on Read depth
283314
for sample in samples:
315+
316+
covA=sample_data[sample]["covA"]
317+
covM=sample_data[sample]["covM"]
318+
covB=sample_data[sample]["covB"]
319+
284320
if "DEL" in svtype:
285321
#homozygous del based on coverage
286322
if cn == 0:
287323
filt="PASS"
288324

289-
covA=sample_data[sample]["covA"]
290-
covM=sample_data[sample]["covM"]
291-
covB=sample_data[sample]["covB"]
292325

293326
#normal coverage on the flanking regions, abnormal inbetween
294327
if covA > covM*(cn+0.9) and covB > covM*(cn+0.9):
@@ -297,8 +330,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
297330
#too few reads, but clear DR signal
298331
elif "DUP" in svtype and filt == "BelowExpectedLinks":
299332
filt="PASS"
300-
301-
333+
scoring_dict={"n_contigs":n_contigs, "n_discordants":n_discordants,"n_splits":n_splits,"covA":covA,"covM":covM,"covB":covB,"refRA":sample_data[sample]["refRA"],"refRB":sample_data[sample]["refRB"],"refFA":sample_data[sample]["refFA"],"refFB":sample_data[sample]["refFB"]}
302334

303335
if svtype != "BND":
304336
info=["SVTYPE={}".format(svtype),"SVLEN={}".format(posB-posA),"END={}".format(posB)]
@@ -363,7 +395,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
363395
GT="0/1"
364396

365397
variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
366-
variants.append([chrA,posA,variant])
398+
variants.append([chrA,posA,variant,scoring_dict])
367399
else:
368400
info=["SVTYPE=BND".format(svtype)]
369401
inverted=False
@@ -439,7 +471,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
439471

440472

441473
variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
442-
variants.append([chrA,posA,variant])
474+
variants.append([chrA,posA,variant,scoring_dict])
443475

444476

445477
variant=[chrB,str(posB),"SV_{}_2".format(var_n),"N",alt_str_b,".",filt,info,format_col]
@@ -472,7 +504,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
472504

473505

474506
variant.append( "{}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}".format(GT,cn,sample_data[sample]["covA"],sample_data[sample]["covM"],sample_data[sample]["covB"],n_discordants,n_splits,sample_data[sample]["QA"],sample_data[sample]["QB"],sample_data[sample]["refRA"],sample_data[sample]["refRB"],sample_data[sample]["refFA"],sample_data[sample]["refFB"]) )
475-
variants.append([chrB,posB,variant])
507+
variants.append([chrB,posB,variant, scoring_dict ])
476508

477509
samfile.close()
478510
return(variants)
@@ -498,8 +530,25 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
498530

499531
variants_list=Parallel(n_jobs=args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
500532

533+
ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]}
501534
for v in variants_list:
502535
for variant in v:
536+
if variant[3]["n_discordants"]:
537+
ratios["fragments_A"].append(variant[3]["n_discordants"]/(variant[3]["refFA"]+variant[3]["n_discordants"]) )
538+
ratios["fragments_B"].append(variant[3]["n_discordants"]/(variant[3]["refFB"]+variant[3]["n_discordants"]) )
539+
540+
if variant[3]["n_splits"]:
541+
ratios["reads_A"].append(variant[3]["n_splits"]/(variant[3]["refRA"]+variant[3]["n_splits"]) )
542+
ratios["reads_B"].append(variant[3]["n_splits"]/(variant[3]["refRB"]+variant[3]["n_splits"]) )
543+
544+
545+
p=[1,5,10,20,30,40,50,60,70,75,80,85,90,95,97.5,99]
546+
percentiles={"FA":numpy.percentile(ratios["fragments_A"],p),"FB":numpy.percentile(ratios["fragments_B"],p),"RA":numpy.percentile(ratios["reads_A"],p),"RB":numpy.percentile(ratios["reads_B"],p)}
547+
548+
for v in variants_list:
549+
for variant in v:
550+
score=scoring(variant[3],percentiles)
551+
variant[2][5]=str(score)
503552
variants[ variant[0] ].append( [ variant[1],variant[2] ] )
504553

505554
return(variants)

0 commit comments

Comments
 (0)