Skip to content

Commit 24a4808

Browse files
committed
r718: retrieve sequence from the index
1 parent 29ed675 commit 24a4808

File tree

9 files changed

+115
-20
lines changed

9 files changed

+115
-20
lines changed

index.c

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
KHASH_INIT(idx, uint64_t, uint64_t, 1, idx_hash, idx_eq)
2121
typedef khash_t(idx) idxhash_t;
2222

23+
KHASH_MAP_INIT_STR(str, uint32_t)
24+
2325
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
2426

2527
typedef struct mm_idx_bucket_s {
@@ -29,15 +31,6 @@ typedef struct mm_idx_bucket_s {
2931
void *h; // hash table indexing _p_ and minimizers appearing once
3032
} mm_idx_bucket_t;
3133

32-
void mm_idxopt_init(mm_idxopt_t *opt)
33-
{
34-
memset(opt, 0, sizeof(mm_idxopt_t));
35-
opt->k = 15, opt->w = 10, opt->flag = 0;
36-
opt->bucket_bits = 14;
37-
opt->mini_batch_size = 50000000;
38-
opt->batch_size = 4000000000ULL;
39-
}
40-
4134
mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
4235
{
4336
mm_idx_t *mi;
@@ -54,6 +47,7 @@ void mm_idx_destroy(mm_idx_t *mi)
5447
{
5548
int i;
5649
if (mi == 0) return;
50+
if (mi->h) kh_destroy(str, (khash_t(str)*)mi->h);
5751
for (i = 0; i < 1<<mi->b; ++i) {
5852
free(mi->B[i].p);
5953
free(mi->B[i].a.a);
@@ -109,6 +103,34 @@ void mm_idx_stat(const mm_idx_t *mi)
109103
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), n, 100.0*n1/n, (double)sum / n, (double)len / sum);
110104
}
111105

106+
int mm_idx_index_name(mm_idx_t *mi)
107+
{
108+
khash_t(str) *h;
109+
uint32_t i;
110+
int has_dup = 0, absent;
111+
if (mi->h) return 0;
112+
h = kh_init(str);
113+
for (i = 0; i < mi->n_seq; ++i) {
114+
khint_t k;
115+
k = kh_put(str, h, mi->seq[i].name, &absent);
116+
if (absent) kh_val(h, k) = i;
117+
else has_dup = 1;
118+
}
119+
mi->h = h;
120+
if (has_dup && mm_verbose >= 2)
121+
fprintf(stderr, "[WARNING] some database sequences have identical sequence names\n");
122+
return has_dup;
123+
}
124+
125+
int mm_idx_name2id(const mm_idx_t *mi, const char *name)
126+
{
127+
khash_t(str) *h = (khash_t(str)*)mi->h;
128+
khint_t k;
129+
if (h == 0) return -2;
130+
k = kh_get(str, h, name);
131+
return k == kh_end(h)? -1 : kh_val(h, k);
132+
}
133+
112134
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
113135
{
114136
uint64_t i, st1, en1;

main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#include "mmpriv.h"
77
#include "getopt.h"
88

9-
#define MM_VERSION "2.8-r711-dirty"
9+
#define MM_VERSION "2.8-r718-dirty"
1010

1111
#ifdef __linux__
1212
#include <sys/resource.h>

minimap.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ typedef struct {
5959
mm_idx_seq_t *seq; // sequence name, length and offset
6060
uint32_t *S; // 4-bit packed sequence
6161
struct mm_idx_bucket_s *B; // index (hidden)
62-
void *km;
62+
void *km, *h;
6363
} mm_idx_t;
6464

6565
// minimap2 alignment
@@ -297,6 +297,11 @@ int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int
297297

298298
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
299299

300+
// query sequence name and sequence in the minimap2 index
301+
int mm_idx_index_name(mm_idx_t *mi);
302+
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
303+
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
304+
300305
// deprecated APIs for backward compatibility
301306
void mm_mapopt_init(mm_mapopt_t *opt);
302307
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);

mmpriv.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
6363

6464
void mm_idxopt_init(mm_idxopt_t *opt);
6565
const uint64_t *mm_idx_get(const mm_idx_t *mi, uint64_t minier, int *n);
66-
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
6766
int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f);
6867
mm128_t *mm_chain_dp(int max_dist_x, int max_dist_y, int bw, int max_skip, int min_cnt, int min_sc, int is_cdna, int n_segs, int64_t n, mm128_t *a, int *n_u_, uint64_t **_u, void *km);
6968
mm_reg1_t *mm_align_skeleton(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int qlen, const char *qstr, int *n_regs_, mm_reg1_t *regs, mm128_t *a);

options.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
#include <stdio.h>
22
#include "mmpriv.h"
33

4+
void mm_idxopt_init(mm_idxopt_t *opt)
5+
{
6+
memset(opt, 0, sizeof(mm_idxopt_t));
7+
opt->k = 15, opt->w = 10, opt->flag = 0;
8+
opt->bucket_bits = 14;
9+
opt->mini_batch_size = 50000000;
10+
opt->batch_size = 4000000000ULL;
11+
}
12+
413
void mm_mapopt_init(mm_mapopt_t *opt)
514
{
615
memset(opt, 0, sizeof(mm_mapopt_t));

python/README.rst

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ The following Python script demonstrates the key functionality of mappy:
3434
import mappy as mp
3535
a = mp.Aligner("test/MT-human.fa") # load or build index
3636
if not a: raise Exception("ERROR: failed to load/build index")
37+
s = a.seq("MT_human", 100, 200) # retrieve a subsequence from the index
38+
print(mp.revcomp(s)) # reverse complement
3739
for name, seq, qual in mp.fastx_read("test/MT-orang.fa"): # read a fasta/q sequence
3840
for hit in a.map(seq): # traverse alignments
3941
print("{}\t{}\t{}\t{}".format(hit.ctg, hit.r_st, hit.r_en, hit.cigar_str))
@@ -87,7 +89,15 @@ This method aligns :code:`seq` against the index. It is a generator, *yielding*
8789
a series of :code:`mappy.Alignment` objects. If :code:`seq2` is present, mappy
8890
performs paired-end alignment, assuming the two ends are in the FR orientation.
8991
Alignments of the two ends can be distinguished by the :code:`read_num` field
90-
(see below).
92+
(see Class mappy.Alignment below).
93+
94+
.. code:: python
95+
96+
mappy.Aligner.seq(name, start=0, end=0x7fffffff)
97+
98+
This method retrieves a (sub)sequence from the index and returns it as a Python
99+
string. :code:`None` is returned if :code:`name` is not present in the index or
100+
the start/end coordinates are invalid.
91101

92102
Class mappy.Alignment
93103
~~~~~~~~~~~~~~~~~~~~~
@@ -144,11 +154,12 @@ Miscellaneous Functions
144154

145155
.. code:: python
146156
147-
mappy.fastx_read(fn)
157+
mappy.fastx_read(fn, read_comment=False)
148158
149159
This generator function opens a FASTA/FASTQ file and *yields* a
150160
:code:`(name,seq,qual)` tuple for each sequence entry. The input file may be
151-
optionally gzip'd.
161+
optionally gzip'd. If :code:`read_comment` is True, this generator yields
162+
a :code:`(name,seq,qual,comment)` tuple instead.
152163

153164
.. code:: python
154165

python/cmappy.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,4 +112,22 @@ static inline char *mappy_revcomp(int len, const uint8_t *seq)
112112
return rev;
113113
}
114114

115+
static char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *len)
116+
{
117+
int i, rid;
118+
char *s;
119+
*len = 0;
120+
rid = mm_idx_name2id(mi, name);
121+
if (rid < 0) return 0;
122+
if (st >= mi->seq[i].len || st >= en) return 0;
123+
if (en < 0 || en > mi->seq[i].len)
124+
en = mi->seq[i].len;
125+
s = (char*)malloc(en - st + 1);
126+
*len = mm_idx_getseq(mi, rid, st, en, s);
127+
for (i = 0; i < *len; ++i)
128+
s[i] = "ACGTN"[(uint8_t)s[i]];
129+
s[*len] = 0;
130+
return s;
131+
}
132+
115133
#endif

python/cmappy.pxd

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ cdef extern from "minimap.h":
5858
mm_idx_seq_t *seq
5959
uint32_t *S
6060
mm_idx_bucket_t *B
61-
void *km
61+
void *km, *h
6262

6363
ctypedef struct mm_idx_reader_t:
6464
pass
@@ -69,6 +69,8 @@ cdef extern from "minimap.h":
6969
void mm_idx_destroy(mm_idx_t *mi)
7070
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
7171

72+
int mm_idx_index_name(mm_idx_t *mi)
73+
7274
#
7375
# Mapping (key struct defined in cmappy.h below)
7476
#
@@ -99,6 +101,7 @@ cdef extern from "cmappy.h":
99101
void mm_reg2hitpy(const mm_idx_t *mi, mm_reg1_t *r, mm_hitpy_t *h)
100102
void mm_free_reg1(mm_reg1_t *r)
101103
mm_reg1_t *mm_map_aux(const mm_idx_t *mi, const char *seq1, const char *seq2, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt)
104+
char *mappy_fetch_seq(const mm_idx_t *mi, const char *name, int st, int en, int *l)
102105

103106
ctypedef struct kstring_t:
104107
unsigned l, m

python/mappy.pyx

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ cdef class Aligner:
123123
self._idx = cmappy.mm_idx_reader_read(r, n_threads) # NB: ONLY read the first part
124124
cmappy.mm_idx_reader_close(r)
125125
cmappy.mm_mapopt_update(&self.map_opt, self._idx)
126+
cmappy.mm_idx_index_name(self._idx)
126127

127128
def __dealloc__(self):
128129
if self._idx is not NULL:
@@ -140,8 +141,13 @@ cdef class Aligner:
140141
if self._idx is NULL: return None
141142
if buf is None: b = ThreadBuffer()
142143
else: b = buf
143-
if seq2 is None: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), NULL, &n_regs, b._b, &self.map_opt)
144-
else: regs = cmappy.mm_map_aux(self._idx, str.encode(seq), str.encode(seq2), &n_regs, b._b, &self.map_opt)
144+
145+
_seq = seq if isinstance(seq, bytes) else seq.encode()
146+
if seq2 is None:
147+
regs = cmappy.mm_map_aux(self._idx, _seq, NULL, &n_regs, b._b, &self.map_opt)
148+
else:
149+
_seq2 = seq2 if isinstance(seq2, bytes) else seq2.encode()
150+
regs = cmappy.mm_map_aux(self._idx, _seq, _seq2, &n_regs, b._b, &self.map_opt)
145151

146152
for i in range(n_regs):
147153
cmappy.mm_reg2hitpy(self._idx, &regs[i], &h)
@@ -153,7 +159,24 @@ cdef class Aligner:
153159
cmappy.mm_free_reg1(&regs[i])
154160
free(regs)
155161

156-
def fastx_read(fn):
162+
def seq(self, str name, int start=0, int end=0x7fffffff):
163+
cdef int l
164+
cdef char *s = cmappy.mappy_fetch_seq(self._idx, name.encode(), start, end, &l)
165+
if l == 0: return None
166+
r = s[:l] if isinstance(s, str) else s[:l].decode()
167+
free(s)
168+
return r
169+
170+
@property
171+
def k(self): return self._idx.k
172+
173+
@property
174+
def w(self): return self._idx.w
175+
176+
@property
177+
def n_seq(self): return self._idx.n_seq
178+
179+
def fastx_read(fn, read_comment=False):
157180
cdef cmappy.kseq_t *ks
158181
ks = cmappy.mm_fastx_open(str.encode(fn))
159182
if ks is NULL: return None
@@ -162,7 +185,12 @@ def fastx_read(fn):
162185
else: qual = None
163186
name = ks.name.s if isinstance(ks.name.s, str) else ks.name.s.decode()
164187
seq = ks.seq.s if isinstance(ks.seq.s, str) else ks.seq.s.decode()
165-
yield name, seq, qual
188+
if read_comment:
189+
if ks.comment.l > 0: comment = ks.comment.s if isinstance(ks.comment.s, str) else ks.comment.s.decode()
190+
else: comment = None
191+
yield name, seq, qual, comment
192+
else:
193+
yield name, seq, qual
166194
cmappy.mm_fastx_close(ks)
167195

168196
def revcomp(seq):

0 commit comments

Comments
 (0)