@@ -6,6 +6,37 @@ from joblib import Parallel, delayed
6
6
# from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
7
7
from pysam import AlignmentFile, AlignedSegment
8
8
9
+
10
def scoring(scoring_dict, percentiles):
    """Compute a quality score for a structural-variant call.

    Candidate sub-scores are collected and the best one wins:
      * 50 if any assembled contig supports the call (``n_contigs``),
      * 5 points per percentile cutoff cleared by the discordant-pair
        ratio against the FA / FB reference fragment counts,
      * 5 points per percentile cutoff cleared by the split-read
        ratio against the RA / RB reference read counts.

    Parameters
    ----------
    scoring_dict : dict
        Per-variant evidence counts: ``n_contigs``, ``n_discordants``,
        ``n_splits`` and the reference counts ``refFA``, ``refFB``,
        ``refRA``, ``refRB`` (built by ``define_variant``).
    percentiles : dict
        Cohort-wide percentile cutoffs for the four ratios, keyed
        ``FA``, ``FB``, ``RA``, ``RB`` (built in ``main`` with
        ``numpy.percentile``).

    Returns
    -------
    int
        The maximum of all sub-scores (0 when there is no evidence).
    """

    def _percentile_score(n_support, n_ref, cutoffs):
        # Fraction of evidence supporting the variant at this breakpoint.
        # Hoisted out of the loop: the ratio does not depend on the cutoff.
        # Safe to divide: callers only reach here when n_support > 0,
        # so the denominator is strictly positive.
        ratio = n_support / (n_ref + n_support)
        # 5 points for every percentile cutoff the ratio reaches.
        return 5 * sum(1 for p in cutoffs if ratio >= p)

    score = [0]

    # Assembled-contig support is the strongest evidence: flat 50.
    if scoring_dict["n_contigs"]:
        score.append(50)

    # Discordant read pairs, scored against both breakpoint flanks.
    if scoring_dict["n_discordants"]:
        n_disc = scoring_dict["n_discordants"]
        score.append(_percentile_score(n_disc, scoring_dict["refFA"], percentiles["FA"]))
        score.append(_percentile_score(n_disc, scoring_dict["refFB"], percentiles["FB"]))

    # Split reads, scored against both breakpoint flanks.
    if scoring_dict["n_splits"]:
        n_split = scoring_dict["n_splits"]
        score.append(_percentile_score(n_split, scoring_dict["refRA"], percentiles["RA"]))
        score.append(_percentile_score(n_split, scoring_dict["refRB"], percentiles["RB"]))

    return max(score)
39
+
9
40
def get_region (samfile ,str chr ,int start ,int end ,int bp ,int min_q ,int max_ins , contig_number ):
10
41
11
42
cdef int low_q= 0
@@ -67,10 +98,10 @@ def get_region(samfile,str chr,int start,int end,int bp,int min_q,int max_ins, c
67
98
r_start= read_reference_start
68
99
r_end= read_reference_end
69
100
70
- if read_reference_start < bp- 10 and r_end > bp:
101
+ if read_reference_start < bp- 20 and r_end > bp+ 20 :
71
102
crossing_r+= 1
72
103
73
- mate_bp_read= (read.next_reference_start < bp and r_end > bp)
104
+ mate_bp_read= (read.next_reference_start < bp- 50 and r_end > bp+ 50 )
74
105
discordant= ( abs (read.isize) > max_ins or read_next_reference_name != read_reference_name )
75
106
76
107
if mate_bp_read and not discordant:
@@ -281,14 +312,16 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
281
312
282
313
# configure filters for CNV based on Read depth
283
314
for sample in samples:
315
+
316
+ covA= sample_data[sample][" covA" ]
317
+ covM= sample_data[sample][" covM" ]
318
+ covB= sample_data[sample][" covB" ]
319
+
284
320
if " DEL" in svtype:
285
321
# homozygout del based on coverage
286
322
if cn == 0 :
287
323
filt= " PASS"
288
324
289
- covA= sample_data[sample][" covA" ]
290
- covM= sample_data[sample][" covM" ]
291
- covB= sample_data[sample][" covB" ]
292
325
293
326
# normal coverage on the flanking regions, abnormal inbetween
294
327
if covA > covM* (cn+ 0.9 ) and covB > covM* (cn+ 0.9 ):
@@ -297,8 +330,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
297
330
# too few reads, but clear DR signal
298
331
elif " DUP" in svtype and filt == " BelowExpectedLinks" :
299
332
filt= " PASS"
300
-
301
-
333
+ scoring_dict= {" n_contigs" :n_contigs, " n_discordants" :n_discordants," n_splits" :n_splits," covA" :covA," covM" :covM," covB" :covB," refRA" :sample_data[sample][" refRA" ]," refRB" :sample_data[sample][" refRB" ]," refFA" :sample_data[sample][" refFA" ]," refFB" :sample_data[sample][" refFB" ]}
302
334
303
335
if svtype != " BND" :
304
336
info= [" SVTYPE={}" .format(svtype)," SVLEN={}" .format(posB- posA)," END={}" .format(posB)]
@@ -363,7 +395,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
363
395
GT= " 0/1"
364
396
365
397
variant.append( " {}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}" .format(GT,cn,sample_data[sample][" covA" ],sample_data[sample][" covM" ],sample_data[sample][" covB" ],n_discordants,n_splits,sample_data[sample][" QA" ],sample_data[sample][" QB" ],sample_data[sample][" refRA" ],sample_data[sample][" refRB" ],sample_data[sample][" refFA" ],sample_data[sample][" refFB" ]) )
366
- variants.append([chrA,posA,variant])
398
+ variants.append([chrA,posA,variant,scoring_dict ])
367
399
else :
368
400
info= [" SVTYPE=BND" .format(svtype)]
369
401
inverted= False
@@ -439,7 +471,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
439
471
440
472
441
473
variant.append( " {}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}" .format(GT,cn,sample_data[sample][" covA" ],sample_data[sample][" covM" ],sample_data[sample][" covB" ],n_discordants,n_splits,sample_data[sample][" QA" ],sample_data[sample][" QB" ],sample_data[sample][" refRA" ],sample_data[sample][" refRB" ],sample_data[sample][" refFA" ],sample_data[sample][" refFB" ]) )
442
- variants.append([chrA,posA,variant])
474
+ variants.append([chrA,posA,variant,scoring_dict ])
443
475
444
476
445
477
variant= [chrB,str (posB)," SV_{}_2" .format(var_n)," N" ,alt_str_b," ." ,filt,info,format_col]
@@ -472,7 +504,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
472
504
473
505
474
506
variant.append( " {}:{}:{},{},{}:{}:{}:{},{}:{},{}:{},{}" .format(GT,cn,sample_data[sample][" covA" ],sample_data[sample][" covM" ],sample_data[sample][" covB" ],n_discordants,n_splits,sample_data[sample][" QA" ],sample_data[sample][" QB" ],sample_data[sample][" refRA" ],sample_data[sample][" refRB" ],sample_data[sample][" refFA" ],sample_data[sample][" refFB" ]) )
475
- variants.append([chrB,posB,variant])
507
+ variants.append([chrB,posB,variant, scoring_dict ])
476
508
477
509
samfile.close()
478
510
return (variants)
@@ -498,8 +530,25 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
498
530
499
531
variants_list= Parallel(n_jobs = args.threads)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
500
532
533
+ ratios= {" fragments_A" :[]," fragments_B" :[]," reads_A" :[]," reads_B" :[]}
501
534
for v in variants_list:
502
535
for variant in v:
536
+ if variant[3 ][" n_discordants" ]:
537
+ ratios[" fragments_A" ].append(variant[3 ][" n_discordants" ]/ (variant[3 ][" refFA" ]+ variant[3 ][" n_discordants" ]) )
538
+ ratios[" fragments_B" ].append(variant[3 ][" n_discordants" ]/ (variant[3 ][" refFB" ]+ variant[3 ][" n_discordants" ]) )
539
+
540
+ if variant[3 ][" n_splits" ]:
541
+ ratios[" reads_A" ].append(variant[3 ][" n_splits" ]/ (variant[3 ][" refRA" ]+ variant[3 ][" n_splits" ]) )
542
+ ratios[" reads_B" ].append(variant[3 ][" n_splits" ]/ (variant[3 ][" refRB" ]+ variant[3 ][" n_splits" ]) )
543
+
544
+
545
+ p= [1 ,5 ,10 ,20 ,30 ,40 ,50 ,60 ,70 ,75 ,80 ,85 ,90 ,95 ,97.5 ,99 ]
546
+ percentiles= {" FA" :numpy.percentile(ratios[" fragments_A" ],p)," FB" :numpy.percentile(ratios[" fragments_B" ],p)," RA" :numpy.percentile(ratios[" reads_A" ],p)," RB" :numpy.percentile(ratios[" reads_B" ],p)}
547
+
548
+ for v in variants_list:
549
+ for variant in v:
550
+ score= scoring(variant[3 ],percentiles)
551
+ variant[2 ][5 ]= str (score)
503
552
variants[ variant[0 ] ].append( [ variant[1 ],variant[2 ] ] )
504
553
505
554
return (variants)
0 commit comments