From bc69865efd2e104cfa0f125ecc562a8cb68d2c97 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Thu, 30 Oct 2025 00:08:42 +0000 Subject: [PATCH] hts wide maxdepth limit check --- hts.c | 27 +++++++++++++ htslib/hts.h | 4 ++ sam.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++-- sam_internal.h | 29 ++++++++++++++ 4 files changed, 158 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index cb032a3ce..4c58fdf47 100644 --- a/hts.c +++ b/hts.c @@ -1216,6 +1216,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "FASTQ_UMI_REGEX") == 0) o->opt = FASTQ_OPT_UMI_REGEX, o->val.s = val; + else if (strcmp(o->arg, "seq_max_depth") == 0 || + strcmp(o->arg, "SEQ_MAX_DEPTH") == 0) + o->opt = SEQ_OPT_MAXDEPTH, o->val.i = atoi(val); + else { hts_log_error("Unknown option '%s'", o->arg); free(o->arg); @@ -1260,6 +1264,7 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { case FASTQ_OPT_BARCODE: case FASTQ_OPT_UMI: case FASTQ_OPT_UMI_REGEX: + case SEQ_OPT_MAXDEPTH: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -1704,6 +1709,7 @@ int hts_close(htsFile *fp) free(fp->fn); free(fp->fn_aux); free(fp->line.s); + sam_cond_destroy((sam_cond_t*)fp->cond_data); free(fp); errno = save; return ret; @@ -1903,6 +1909,27 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { break; } + case SEQ_OPT_MAXDEPTH: { + /* once max depth is reached for all pos covered by the read, it will + be discarded */ + va_start(args, opt); + int depth = va_arg(args, int); + va_end(args); + sam_cond_t *c = NULL; + if (depth > 0) { + if (!fp->cond_data) { + fp->cond_data = sam_cond_init(); + } + if (!fp->cond_data) { + hts_log_warning("Cannot set max depth option"); + return -1; + } + c = (sam_cond_t*)fp->cond_data; + c->maxdepth = depth; + } + } + break; + default: break; } diff --git a/htslib/hts.h b/htslib/hts.h index abbabdaac..c36f762ed 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -260,6 +260,7 @@ typedef struct htsFile { const char *fnidx; struct sam_hdr_t *bam_header; struct hts_filter_t *filter; + void *cond_data; //for extra checks and filters } htsFile; // A combined thread pool and queue allocation size. @@ -332,6 +333,9 @@ enum hts_fmt_option { HTS_OPT_FILTER, HTS_OPT_PROFILE, + // Seq data + SEQ_OPT_MAXDEPTH = 500, // sets the max depth above which read is discarded + // Fastq // Boolean. diff --git a/sam.c b/sam.c index b78fe8972..cc9061b7c 100644 --- a/sam.c +++ b/sam.c @@ -4254,6 +4254,95 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { return ret; } +/// extra checks on reads and decide whether to pass it to user or not +/* fp - file pointer which holds the buffer + * h - header + * b - bam data + * returns 1 to send the read to user and 0 to discard and check next read + * returns -1 on errors + */ +int check_conditions(samFile *fp, sam_hdr_t *h, bam1_t *b) +{ + /* for maxdepth, depth per position is kept on a buffer in sam_cond_t data + in the file. It is safe as long as same file is not shared across threads. + Expects sorted data and if not so, fails. The depth data is discarded as + new position is read, to hold only relevant data. + */ + int ret = 0, i, j, updlen = 0; + sam_cond_t *c = NULL; + uint32_t *depbuf = NULL, *cigar = NULL; + size_t seqoffset = 0, reqbuflen = 0, bkplen = 0; + kstring_t *ksbuf = NULL; + + //assumes fp and condition data to be valid + c = (sam_cond_t*)fp->cond_data; + if (!(ksbuf = &c->ksbuf)) { + return -1; //invalid, earlier alloc failed + } + bkplen = c->ksbuf.m; + if (b->core.flag & BAM_FUNMAP || !b->core.n_cigar) + return 1; //unmapped or no cigar, pass + + if (c->tid != b->core.tid) { //tid changed, reset and reuse + c->bufstart = c->bufend = seqoffset = 0; + c->tid = b->core.tid; + if (ksbuf->s) + memset(ksbuf->s, 0, ksbuf->m); + } else { //same tid, ensure sorted data + if (c->bufstart > b->core.pos) { + hts_log_warning("Unsorted data, max depth checks won't work"); + sam_cond_destroy(c); + fp->cond_data = NULL; + return -1; //return failure + } + } + + seqoffset = c->bufend > b->core.pos ? b->core.pos - c->bufstart : 0; + if (ksbuf->m) { + //sorted and got a new pos, we can discard all upto this point + if ( c->bufstart < b->core.pos) { + if (seqoffset) { //part discard + size_t sz = seqoffset * sizeof(uint32_t); + size_t len = ksbuf->m - sz; + memmove(ksbuf->s, ksbuf->s + sz, len); + memset(ksbuf->s + len, 0, sz); + } else //full discard + memset(ksbuf->s, 0, ksbuf->m); + + c->bufstart = b->core.pos; + seqoffset = 0; + } + } + if (c->bufstart < b->core.pos) + c->bufstart = b->core.pos; + //depends on l_qseq and length given by query consuming cigars matching + reqbuflen = seqoffset + b->core.l_qseq; + cigar = bam_get_cigar(b); + if (reqbuflen * sizeof(uint32_t) > ksbuf->m) { + if (ks_resize(ksbuf, reqbuflen * sizeof(uint32_t))) //failed to realloc + return -1; + memset(ksbuf->s+bkplen, 0, ksbuf->m - bkplen); //reset + } + + depbuf = (uint32_t*)ksbuf->s; + + for (i = 0; i < b->core.n_cigar; ++i) { + if (!(bam_cigar_type(bam_cigar_op(cigar[i]))&1)) { + continue; //irrelevant cigar + } + //cigar consuming query, update depth for each pos + for(j = 0; j < bam_cigar_oplen(cigar[i]); ++j) { + ret |= ++depbuf[seqoffset + updlen + j] <= c->maxdepth; + } + updlen += bam_cigar_oplen(cigar[i]); + } + if (c->bufend < b->core.pos + b->core.l_qseq) + c->bufend = b->core.pos + b->core.l_qseq; //update bufend pos + ksbuf->l = c->bufend - c->bufstart; + + return ret; +} + // Returns 0 on success, // -1 on EOF, // <-1 on error @@ -4296,9 +4385,15 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) return -3; } - pass_filter = (ret >= 0 && fp->filter) - ? sam_passes_filter(h, b, fp->filter) - : 1; + if (ret >= 0) { + pass_filter = fp->filter + ? sam_passes_filter(h, b, fp->filter) + : 1; + + if (pass_filter && fp->cond_data) //extra checks if set so + pass_filter = check_conditions(fp, h, b); + } else + pass_filter = 1; } while (pass_filter == 0); return pass_filter < 0 ? -2 : ret; diff --git a/sam_internal.h b/sam_internal.h index 161782aa3..af7529aa7 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -117,6 +117,35 @@ static inline void nibble2base(uint8_t *nib, char *seq, int len) { #endif } +/// holds data for extra condition checks +typedef struct sam_cond_t { + int tid, maxdepth; + //buffer to hold depth data + hts_pos_t bufstart, bufend; + kstring_t ksbuf; +} sam_cond_t; + +/// allocates condition data +/// returns the allocated data on success and NULL on failure +static inline sam_cond_t* sam_cond_init(void) +{ + sam_cond_t *c = calloc(1, sizeof(sam_cond_t)); + if (!c) { + hts_log_warning("Couldn't create condition data"); + return NULL; + } + return c; +} + +/// deallocates condition data +static inline void sam_cond_destroy(sam_cond_t *c) +{ + if (!c) + return; + ks_free(&c->ksbuf); + free(c); +} + #ifdef __cplusplus } #endif