From 7dde686dc84d5c36c6f8ee77d0fed8830c163749 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 1 Mar 2023 10:37:37 +0000 Subject: [PATCH 1/7] Add VCF/BCF support for POS=0 coordinate Review of `bcftools merge` code and possibly others is necessary because it is relying on uninitialized POS==-1 in synced_bcf_reader This should resolve #1571 --- hts.c | 10 ++++++---- htslib/synced_bcf_reader.h | 2 ++ synced_bcf_reader.c | 31 ++++++++++++++++--------------- tbx.c | 8 ++++---- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/hts.c b/hts.c index cead9d537..0f07fd072 100644 --- a/hts.c +++ b/hts.c @@ -2205,6 +2205,8 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of { int i; hts_pos_t beg, end; + if ( _beg<0 ) _beg = 0; + if ( _end<=_beg ) _end = _beg+1; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -2905,6 +2907,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi { int l, t, s = min_shift + (n_lvls<<1) + n_lvls; if (beg >= end) return 0; + if (beg < 0 ) beg = 0; + if (beg==end) end = 1; if (end >= 1LL<= idx->n || (bidx = idx->bidx[tid]) == NULL) { iter->finished = 1; } else { - if (beg < 0) beg = 0; + if (beg < -1) beg = -1; if (end < beg) { free(iter); return NULL; @@ -3103,7 +3107,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } - rel_off = beg>>idx->min_shift; + rel_off = (beg>=0 ? beg : 0) >> idx->min_shift; // compute min_off bin = hts_bin_first(idx->n_lvls) + rel_off; do { @@ -3138,8 +3142,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t min_off = kh_val(bidx, k).loff; } } else if (unmapped) { //CSI index - if (k != kh_end(bidx)) - min_off = kh_val(bidx, k).loff; } // compute max_off: a virtual offset from a bin to the right of end diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 78e9a0b4a..c8c31ac13 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -64,6 +64,8 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +#define CSI_COOR_EMPTY -2 + /* When reading multiple files in parallel, duplicate records within each file will be reordered and offered in intuitive order. For example, diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 23e0ecaef..1eaf31324 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -371,7 +371,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname) if ( !files->regions ) files->regions = _regions_init_string(names[i]); else - _regions_add(files->regions, names[i], -1, -1); + _regions_add(files->regions, names[i], CSI_COOR_EMPTY, CSI_COOR_EMPTY); } free(names); _regions_sort_and_merge(files->regions); @@ -555,7 +555,7 @@ static int _readers_next_region(bcf_srs_t *files) int prev_iseq = files->regions->iseq; hts_pos_t prev_end = files->regions->end; if ( bcf_sr_regions_next(files->regions)<0 ) return -1; - files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1; + files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : CSI_COOR_EMPTY; for (i=0; inreaders; i++) _reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end); @@ -616,7 +616,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { reader->buffer[reader->mbuffer-i] = bcf_init1(); reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack; - reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1 + reader->buffer[reader->mbuffer-i]->pos = CSI_COOR_EMPTY; // for rare cases when VCF starts from 1 or 0 } } if ( files->streaming ) @@ -656,7 +656,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) if ( ret < 0 ) break; // no more lines or an error bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } - // Prevent creation of duplicates from records overlapping multiple regions // and recognize true variant overlaps vs record overlaps (e.g. TA>T vs A>-) if ( files->regions ) @@ -676,7 +675,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); exit(1); } - if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; } @@ -954,9 +952,9 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) // inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions(). static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { - if ( start==-1 && end==-1 ) + if ( start==CSI_COOR_EMPTY && end==CSI_COOR_EMPTY ) { - start = 0; end = MAX_CSI_COOR-1; + start = -1; end = MAX_CSI_COOR-1; } else { @@ -1029,8 +1027,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); - reg->start = reg->end = -1; - reg->prev_start = reg->prev_end = reg->prev_seq = -1; + reg->start = reg->end = CSI_COOR_EMPTY; + reg->prev_start = reg->prev_end = CSI_COOR_EMPTY; + reg->prev_seq = -1; kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; @@ -1075,7 +1074,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } else { - if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); + if ( tmp.l ) _regions_add(reg, tmp.s, CSI_COOR_EMPTY, CSI_COOR_EMPTY); if ( !*ep ) break; sp = ++ep; } @@ -1157,8 +1156,9 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr } reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); - reg->start = reg->end = -1; - reg->prev_start = reg->prev_end = reg->prev_seq = -1; + reg->start = reg->end = CSI_COOR_EMPTY; + reg->prev_start = reg->prev_end = CSI_COOR_EMPTY; + reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) @@ -1253,7 +1253,8 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *reg) int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq) { - reg->iseq = reg->start = reg->end = -1; + reg->iseq = -1; + reg->start = reg->end = CSI_COOR_EMPTY; if ( khash_str2int_get(reg->seq_hash, seq, ®->iseq) < 0 ) return -1; // sequence seq not in regions // using in-memory regions @@ -1284,7 +1285,7 @@ static int advance_creg(region_t *reg) int bcf_sr_regions_next(bcf_sr_regions_t *reg) { if ( reg->iseq<0 ) return -1; - reg->start = reg->end = -1; + reg->start = reg->end = CSI_COOR_EMPTY; reg->nals = 0; // using in-memory regions @@ -1442,7 +1443,7 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p bcf_sr_regions_flush(reg); bcf_sr_regions_seek(reg, seq); - reg->start = reg->end = -1; + reg->start = reg->end = CSI_COOR_EMPTY; } if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome reg->prev_seq = reg->iseq; diff --git a/tbx.c b/tbx.c index d897a21f1..4fd6ace65 100644 --- a/tbx.c +++ b/tbx.c @@ -107,12 +107,12 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; - if (intv->beg < 0) { + if (intv->beg < -1) { hts_log_warning("Coordinate <= 0 detected. " "Did you forget to use the -0 option?"); intv->beg = 0; } - if (intv->end < 1) intv->end = 1; + if (intv->end < intv->beg ) intv->end = intv->beg; } else { if ((conf->preset&0xffff) == TBX_GENERIC) { if (id == conf->ec) @@ -171,7 +171,7 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) ++id; } } - if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1; + if (intv->ss == 0 || intv->se == 0 || intv->beg < -1 || intv->end < -1) return -1; return 0; } @@ -181,7 +181,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ int c = *intv->se; *intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c; if (intv->tid < 0) return -2; // get_tid out of memory - return (intv->beg >= 0 && intv->end >= 0)? 0 : -1; + return (intv->beg >= -1 && intv->end >= -1)? 0 : -1; } else { char *type = NULL; switch (tbx->conf.preset&0xffff) From 2bb2980eb119453690315a3a73b06b58ba52bd95 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 14:47:47 +0000 Subject: [PATCH 2/7] Remove restriction on negative `beg` in insert_to_l() The restriction is already enforced by hts_idx_push() before this function is called, so no need to do it again. --- hts.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/hts.c b/hts.c index 0f07fd072..25a3348f5 100644 --- a/hts.c +++ b/hts.c @@ -2205,8 +2205,6 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of { int i; hts_pos_t beg, end; - if ( _beg<0 ) _beg = 0; - if ( _end<=_beg ) _end = _beg+1; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { From 057e51a3834a26f22b7b1b5992099160f9585ed3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 14:36:29 +0000 Subject: [PATCH 3/7] Fix bug where bin number would overflow in hts_itr_query When searching for `max_off`, hts_itr_query() looks for a bin to the right of the end of the region. For whole chromosomes, this would be HTS_POS_MAX, which is far beyond the maximum bin position supported. The `bin` calculation overflowed leading to much implementation-defined behaviour as it explored various negative bin numbers. Fix by limiting `end` to the maximum value supported by the index. --- hts.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hts.c b/hts.c index 25a3348f5..49ede7e1b 100644 --- a/hts.c +++ b/hts.c @@ -3089,6 +3089,8 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t iter->finished = 1; } else { if (beg < -1) beg = -1; + if (end >= 1LL << (idx->min_shift + 3 * idx->n_lvls)) + end = (1LL << (idx->min_shift + 3 * idx->n_lvls)) - 1; if (end < beg) { free(iter); return NULL; From ef4ce2307f1f968127de475e5e2a86faed592515 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 14:54:37 +0000 Subject: [PATCH 4/7] Handle differences about which index bin holds position -1 Since 71db68750 htslib has put ranges starting -1 into the most appropriate bin starting from 0. However, the SAM specification actually says [-1, 0) should go in bin 4680 (for bai / tbx indexes) and htsjdk does this. To be able to handle indexes made both ways, add bin 4680 (or the equivalent number for csi) to the search list in `reg2bins()` as a special case, and make hts_itr_query() check for it when finding `min_off`, when ranges start at -1. Also take the opportunity to make reg2bins() notice when its memory allocations fail, and ensure the error gets propagated if necessary. --- hts.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/hts.c b/hts.c index 49ede7e1b..4b5e51d50 100644 --- a/hts.c +++ b/hts.c @@ -2900,23 +2900,55 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) *** Iterator *** ****************/ +static inline int grow_itr_bins(hts_itr_t *itr, int num) +{ + if (itr->bins.m - itr->bins.n < num) { + size_t new_n = (size_t) itr->bins.n + num; + size_t new_m = (size_t) itr->bins.m + (itr->bins.m >> 1); + if (new_m < new_n) + new_m = new_n; + if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) { + errno = ENOMEM; + return -1; + } + int *new_a = realloc(itr->bins.a, sizeof(int) * new_m); + if (!new_a) + return -1; + itr->bins.m = (int) new_m; + itr->bins.a = new_a; + } + return 0; +} + // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; - if (beg < 0 ) beg = 0; - if (beg==end) end = 1; + if (beg < 0 ) { + // Deal with historic difference of opinion about which bin the + // range [-1, 0) should end up in. HTSlib since 2016 says the + // smallest position 0 bin; before that it would crash when trying + // to make the index. htsjdk and the SAM specification say + // they should go into the bin before this instead, which is the + // largest one on the previous level - and all other ranges + // starting -1 will go into bin 0 (the SAM spec. lists other + // possibilities, but they can't actually occur). To ensure + // compatibility, we need to add this extra bin to the search list + // as a special case. + beg = 0; + if (end == 0) end = 1; + if (grow_itr_bins(itr, 1) < 0) + return -1; + itr->bins.a[itr->bins.n++] = hts_bin_first(n_lvls) - 1; + } if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; - if (itr->bins.n + n > itr->bins.m) { - itr->bins.m = itr->bins.n + n; - kroundup32(itr->bins.m); - itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); - } + if (grow_itr_bins(itr, n) < 0) + return -1; for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; } return itr->bins.n; @@ -3120,6 +3152,15 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } while (bin); if (bin == 0) k = kh_get(bin, bidx, bin); min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0; + + if (beg < 0 && min_off > 0) { + // Regions [-1, 0) may be in hts_bin_first(idx->n_lvls) - 1 + // which could give a lower min_off + k = kh_get(bin, bidx, hts_bin_first(idx->n_lvls) - 1); + if (k != kh_end(bidx) && kh_val(bidx, k).loff < min_off) + min_off = kh_val(bidx, k).loff; + } + // min_off can be calculated more accurately if the // linear index is available if (idx->lidx[tid].offset @@ -3159,7 +3200,10 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } // retrieve bins - reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); + if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls) < 0) { + hts_itr_destroy(iter); + return NULL; + } for (i = n_off = 0; i < iter->bins.n; ++i) if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) From 41f1934021168f6b84f7ec3acac9de7427b0a7f6 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 15:13:38 +0000 Subject: [PATCH 5/7] Allow range queries for VCF/BCF to start before the first ref base Required to support VCF files with variants at position 0 Add region parse flag HTS_PARSE_REF_START_MINUS_1 to make whole-chromosome ranges start at (zero-based position) -1 instead of 0. Rework hts_parse_region() so it can correctly interpret region specifications that start at zero. If the new HTS_PARSE_REF_START_MINUS_1 flag is set, it will also return region [-1, HTS_POS_MAX) for whole-chromosome requests. Make hts_itr_querys() work out if it's making an iterator for a vcf or bcf file and pass the HTS_PARSE_REF_START_MINUS_1 flag to hts_parse_region() if it is. This detection works by inspecting the `readrec` parameter to see what sort of file is being read. This should work as long as hts_itr_querys() is being called via the bcf_itr_querys() or tbx_itr_querys() wrappers, however it may be better to make a new interface that can accept an explicit flag parameter. --- hts.c | 72 ++++++++++++++++++++++++++++++++-------------------- htslib/hts.h | 1 + 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/hts.c b/hts.c index 4b5e51d50..db2bc7020 100644 --- a/hts.c +++ b/hts.c @@ -3661,6 +3661,7 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, const char *colon = NULL, *comma = NULL; int quoted = 0; + hts_pos_t first_base = flags & HTS_PARSE_REF_START_MINUS_1 ? -1 : 0; if (flags & HTS_PARSE_LIST) flags &= ~HTS_PARSE_THOUSANDS_SEP; @@ -3706,7 +3707,7 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, // No colon is simplest case; just check and return. if (colon == NULL) { - *beg = 0; *end = HTS_POS_MAX; + *beg = first_base; *end = HTS_POS_MAX; kputsn(s, s_len-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -3721,7 +3722,7 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, // Has a colon, but check whole name first. if (!quoted) { - *beg = 0; *end = HTS_POS_MAX; + *beg = first_base; *end = HTS_POS_MAX; kputsn(s, s_len, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -3768,39 +3769,46 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, // Finally parse the post-colon coordinates char *hyphen; - *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; - if (*beg < 0) { - if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') { - // User specified zero, but we're 1-based. - hts_log_error("Coordinates must be > 0"); - return NULL; - } - if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') { - // interpret chr:-100 as chr:1-100 - *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1); - *beg = 0; - return s_end; - } else if (*beg < -1) { - hts_log_error("Unexpected string \"%s\" after region", hyphen); - return NULL; + const char *from_pos = colon + 1; + int have_start_pos = 0; + while (isspace_c(*from_pos)) from_pos++; + if (*from_pos == '-') { + // interpret chr:-100 as chr:1-100 + *beg = first_base; + hyphen = (char *) from_pos; + } else { + *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; + if (hyphen != colon+1) { + have_start_pos = 1; + } else { + *beg = first_base; } } if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX; - } else if (*hyphen == '-') { - *end = hts_parse_decimal(hyphen+1, &hyphen, flags); - if (*hyphen != '\0' && *hyphen != ',') { - hts_log_error("Unexpected string \"%s\" after region", hyphen); - return NULL; - } - } else { + return s_end; + } else if (*hyphen != '-') { hts_log_error("Unexpected string \"%s\" after region", hyphen); return NULL; } - if (*end == 0) - *end = HTS_POS_MAX; // interpret chr:100- as chr:100- + char *last; + *end = hts_parse_decimal(hyphen+1, &last, flags); + if (!have_start_pos && + (*last == '-' || *end < 0 || (flags & HTS_PARSE_ONE_COORD))) { + // Attempt to use a negative position, e.g. chr:-100-101 + // or chr:--100 + hts_log_error("Coordinates must be > 0"); + return NULL; + } else if (last == hyphen + 1) { + if (*last != '\0' && !((flags & HTS_PARSE_LIST) && *last == ',')) { + hts_log_error("Unexpected string \"%s\" after region", last); + return NULL; + } else { + *end = HTS_POS_MAX; // interpret chr:100- as chr:100- + } + } if (*beg >= *end) return NULL; @@ -3852,15 +3860,25 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { + extern int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, hts_pos_t *beg, hts_pos_t *end); int tid; hts_pos_t beg, end; + int flags = HTS_PARSE_THOUSANDS_SEP; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); else if (strcmp(reg, "*") == 0) return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec); - if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) + if (readrec == bcf_readrec) { + flags |= HTS_PARSE_REF_START_MINUS_1; + } else if (readrec == tbx_readrec) { + tbx_t *tbx = (tbx_t *) hdr; + if ((tbx->conf.preset&0xffff) == TBX_VCF) + flags |= HTS_PARSE_REF_START_MINUS_1; + } + + if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, flags)) return NULL; return itr_query(idx, tid, beg, end, readrec); diff --git a/htslib/hts.h b/htslib/hts.h index 9696bcd2c..6292178b8 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1132,6 +1132,7 @@ int hts_idx_nseq(const hts_idx_t *idx); #define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers #define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end #define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP) +#define HTS_PARSE_REF_START_MINUS_1 8 ///< Return -1 for the start of a complete reference (mainly for VCF) /// Parse a numeric string /** The number may be expressed in scientific notation, and optionally may From b354c8f06c1916ccfaa5a127d7d34ff473aef712 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 15:33:14 +0000 Subject: [PATCH 6/7] Add tests for vcf files with records at position 0 --- test/tabix/tabix.tst | 33 +++++++++++++++++++++++++- test/tabix/telomere1.bcf | Bin 0 -> 442 bytes test/tabix/telomere1.bcf.csi | Bin 0 -> 103 bytes test/tabix/telomere1.vcf.gz | Bin 0 -> 430 bytes test/tabix/telomere1.vcf.gz.csi | Bin 0 -> 116 bytes test/tabix/telomere2.vcf.gz | Bin 0 -> 430 bytes test/tabix/telomere2.vcf.gz.tbi | Bin 0 -> 111 bytes test/tabix/telomere_all.expected.vcf | 2 ++ test/tabix/telomere_htsjdk.vcf.gz | Bin 0 -> 387 bytes test/tabix/telomere_htsjdk.vcf.gz.tbi | Bin 0 -> 99 bytes test/tabix/telomere_pos0.expected.vcf | 1 + test/tabix/telomere_pos1.expected.vcf | 1 + 12 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 test/tabix/telomere1.bcf create mode 100644 test/tabix/telomere1.bcf.csi create mode 100644 test/tabix/telomere1.vcf.gz create mode 100644 test/tabix/telomere1.vcf.gz.csi create mode 100644 test/tabix/telomere2.vcf.gz create mode 100644 test/tabix/telomere2.vcf.gz.tbi create mode 100644 test/tabix/telomere_all.expected.vcf create mode 100644 test/tabix/telomere_htsjdk.vcf.gz create mode 100644 test/tabix/telomere_htsjdk.vcf.gz.tbi create mode 100644 test/tabix/telomere_pos0.expected.vcf create mode 100644 test/tabix/telomere_pos1.expected.vcf diff --git a/test/tabix/tabix.tst b/test/tabix/tabix.tst index f5fd71a08..1d21078f1 100644 --- a/test/tabix/tabix.tst +++ b/test/tabix/tabix.tst @@ -1,4 +1,4 @@ -# Copyright (C) 2017 Genome Research Ltd. +# Copyright (C) 2017,2023 Genome Research Ltd. # # Author: Robert Davies # @@ -66,3 +66,34 @@ P gff_file.X.2934832.2935190.out $tabix gff_file.tbi.tmp.gff.gz X:2934832-293519 # tabix with --separate-regions P bed_file.separate.out $tabix --separate-regions bed_file.tbi.tmp.bed.gz X:1100-1400 Y:100000-100550 Z:100000-100005 + +# VCF with data at position 0 (indicates telomere) +# telomere1.vcf.gz - vcf with csi index made by htslib +# telomere2.vcf.gz - vcf with tbi index made by htslib +# telomere_htsjdk.vcf.gz - vcf with tbi index made by htsjdk +# telomere1.bcf - bcf with csi index made by htslib + +P telomere_all.expected.vcf $tabix telomere1.vcf.gz 1 +P telomere_all.expected.vcf $tabix telomere2.vcf.gz 1 +P telomere_all.expected.vcf $tabix telomere_htsjdk.vcf.gz 1 +P telomere_all.expected.vcf $tabix telomere1.bcf 1:0- +P telomere_all.expected.vcf $tabix telomere1.vcf.gz 1:0- +P telomere_all.expected.vcf $tabix telomere2.vcf.gz 1:0- +P telomere_all.expected.vcf $tabix telomere_htsjdk.vcf.gz 1:0- +P telomere_all.expected.vcf $tabix telomere1.bcf 1:0- +P telomere_all.expected.vcf $tabix telomere1.vcf.gz 1:-1 +P telomere_all.expected.vcf $tabix telomere2.vcf.gz 1:-1 +P telomere_all.expected.vcf $tabix telomere_htsjdk.vcf.gz 1:-1 +P telomere_all.expected.vcf $tabix telomere1.bcf 1:-1 +P telomere_pos0.expected.vcf $tabix telomere1.vcf.gz 1:0-0 +P telomere_pos0.expected.vcf $tabix telomere2.vcf.gz 1:0-0 +P telomere_pos0.expected.vcf $tabix telomere_htsjdk.vcf.gz 1:0-0 +P telomere_pos0.expected.vcf $tabix telomere1.bcf 1:0-0 +P telomere_pos1.expected.vcf $tabix telomere1.vcf.gz 1:1-1 +P telomere_pos1.expected.vcf $tabix telomere2.vcf.gz 1:1-1 +P telomere_pos1.expected.vcf $tabix telomere_htsjdk.vcf.gz 1:1-1 +P telomere_pos1.expected.vcf $tabix telomere1.bcf 1:1-1 +P telomere_pos1.expected.vcf $tabix telomere1.vcf.gz 1:1- +P telomere_pos1.expected.vcf $tabix telomere2.vcf.gz 1:1- +P telomere_pos1.expected.vcf $tabix telomere_htsjdk.vcf.gz 1:1- +P telomere_pos1.expected.vcf $tabix telomere1.bcf 1:1- diff --git a/test/tabix/telomere1.bcf b/test/tabix/telomere1.bcf new file mode 100644 index 0000000000000000000000000000000000000000..c8c5ddb8e97129180bf3929bcadf6d8878c548c9 GIT binary patch literal 442 zcmV;r0Y&~FiwFb&00000{{{d;LjnMu0c}yuj+-zL9-{VOZhOjg8?Fd&ve{HsU5SQ} zA|)FdXw_Rp*nwDefMRE*^wPdupDUHa{z^M1%k%xrH}iQK_Im#e0Lv=4z@l#VnPT6k z;oX%(P0I?SMLJuvk5Ry4KS}HWW!~~e@w#FIUx>?s3x%z`Y%(eFc3?-r59WO`JGmfJ z$c7#rdqeu(w5%2jY_ZCb6{`8*x`k>S>Umy?RlP^);FddC?i_x)?p-aY?z;-}UB&ZE zxQ}eN*<})ko5zohHs5c>aU=29;SAZy(d2gQ6q!^0HZ4omhc-v1&E}n1tTje4Z&2_G zb=1{*>8IzY^VGgM?6=r5YNy8rS)y8Al`qq49d&H5%S?6Vysi{4&l1#rn2Bvi6I)`dvN8?7|`vR*Aoyh=zEko7Di{vst`zxfN&9fU+T;fiz< zlLwv&0|0mnU7b#+lK}t*{QU#a0|UUAL;>wj`|}I9Fy6iCJ%{Uh1wQQ0`{_OWKPI3j kt}Fro03VA81ONa4009360763o02=@U000000000002lGhG5`Po literal 0 HcmV?d00001 diff --git a/test/tabix/telomere1.bcf.csi b/test/tabix/telomere1.bcf.csi new file mode 100644 index 0000000000000000000000000000000000000000..aa01bf21df006aab3d387b5489ebcb563414872f GIT binary patch literal 103 zcmb2|=3rp}f&Xj_PR>jWUJSV>7YZIw5OBFzZhlcPQ?u*F7kAGqEeQ;VgROtx-+!d{ nw1CHyB}e}~TgFz?ugY&bx3p&E&6Nc}6Xem1l4f8An+zfVL1rK- literal 0 HcmV?d00001 diff --git a/test/tabix/telomere1.vcf.gz b/test/tabix/telomere1.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..d120b96793f34cf944ecbef1997137fc91aea377 GIT binary patch literal 430 zcmb2|=3rp}f&Xj_PR>jWK8&FOXY-mI1nl(v6YppZOPf(9;quev zZ<3Pc?_*r#nLl~v`*H(4iS|WrUx!>;-&}kA5F zL2Lhn8i>8U@Z7vj7#UEZMobJAOeEHLbKmVTCYhgDlcoDn*Pv-S?pENcxGBC)a1*Qvw@e`8=UrG$0 zlsqeWe&sPw0T17^E00U~ADUb-e_-^)jW;S9N_H*z*O2)JC_-YyZ<)F~R#e38?wY@6upHA;_E>K~lX zN$=-WPBU>4oR)O>*Nk}uEm8O9^4utX@b%BHXZGT(J8T*HfX2zAnJdk}47LVD0016) BB$5CC literal 0 HcmV?d00001 diff --git a/test/tabix/telomere2.vcf.gz b/test/tabix/telomere2.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..d120b96793f34cf944ecbef1997137fc91aea377 GIT binary patch literal 430 zcmb2|=3rp}f&Xj_PR>jWK8&FOXY-mI1nl(v6YppZOPf(9;quev zZ<3Pc?_*r#nLl~v`*H(4iS|WrUx!>;-&}kA5F zL2Lhn8i>8U@Z7vj7#UEZMobJAOeEHLbKmVTCYhgDlcoDn*Pv-S?pENcxGBC)a1*Qvw@e`8=UrG$0 zlsqeWe&sPw0T17^E00U~ADUb-e_-^)jWK@6d1Hu5$Y@VHz|??1rhbAYSFaZ&OnAD@RiPMF49-QE9Q vPSC^TVT@0V(Bl=X`3HF_(xR*H`_H|*_D}4UjWX^g3X(b>%g619Hyt}E9k2~|rcOM2+9OIW=+Z~dZgm)jys z)x>ti?{~>ee51Nl&+Zzd+@l*etc+%)~}ELJ~>S=j=S zr=?H6?(&_pNk)5mOIXir7Y+;GxUI+S-_2d|eaFPg7kZjodS+c*%P4kgTKLiXhI%1e zLsIhj=GH#Qt@>Qhv$AdTu1R}dBxb!@;T?4^!YX!_YDlZiX3M#gZq{tPko0#_sZ!(3 zmUErH^F^*X@#%(0Ziq2EFR}XSqLv9xt?>>K6N|NB|lZ?)wjWZVcRopHfm%5)u-ak|cPUP6bFEF=v@9=OiFHTSC$3`GPI# iAu`9(6Fk^APq@m&FquhsP8v|RJeo1m49s9tK?DHPSQYUA literal 0 HcmV?d00001 diff --git a/test/tabix/telomere_pos0.expected.vcf b/test/tabix/telomere_pos0.expected.vcf new file mode 100644 index 000000000..f20ec9a9a --- /dev/null +++ b/test/tabix/telomere_pos0.expected.vcf @@ -0,0 +1 @@ +1 0 ID1 C G . PASS . GT 1/0 diff --git a/test/tabix/telomere_pos1.expected.vcf b/test/tabix/telomere_pos1.expected.vcf new file mode 100644 index 000000000..8e6df5deb --- /dev/null +++ b/test/tabix/telomere_pos1.expected.vcf @@ -0,0 +1 @@ +1 1 ID2 G T . PASS . GT 1/0 From 4d780c0c42ac5cf547ad0987a0aea954bb93cf5b Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 21 Mar 2023 13:42:02 +0000 Subject: [PATCH 7/7] Make synced bcf reader handle regions starting with '-' correctly. These should be interpreted as start of chr. to the given position. --- synced_bcf_reader.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 1eaf31324..aed455e1b 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1042,11 +1042,20 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep==':' ) { sp = ep+1; - from = hts_parse_decimal(sp,(char**)&ep,0); - if ( sp==ep ) + while (isspace((uint8_t) *sp)) sp++; + if (*sp == '-') { - hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + ep = sp; + from = -1; + } + else + { + from = hts_parse_decimal(sp,(char**)&ep,0); + if ( sp==ep ) + { + hts_log_error("Could not parse the region(s): %s", str); + free(reg); free(tmp.s); return NULL; + } } if ( !*ep || *ep==',' ) {