Skip to content

Commit ace990c

Browse files
authored
Illumina Complete Long Read presets (#1069)
* Implements a transition-aware alignment scoring scheme and configuration presets for ICLR
* Fix to enable use of general scoring matrix in ksw as suggested by lh3

---------

Co-authored-by: koadman <>
1 parent e28a55b commit ace990c

File tree

5 files changed

+32
-6
lines changed

5 files changed

+32
-6
lines changed

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,15 @@ parameters at the same time. The default setting is the same as `map-ont`.
139139
```sh
140140
minimap2 -ax map-pb ref.fa pacbio-reads.fq > aln.sam # for PacBio CLR reads
141141
minimap2 -ax map-ont ref.fa ont-reads.fq > aln.sam # for Oxford Nanopore reads
142+
minimap2 -ax map-iclr ref.fa iclr-reads.fq > aln.sam # for Illumina Complete Long Reads
142143
```
143144
The difference between `map-pb` and `map-ont` is that `map-pb` uses
144145
homopolymer-compressed (HPC) minimizers as seeds, while `map-ont` uses ordinary
145-
minimizers as seeds. Emperical evaluation suggests HPC minimizers improve
146+
minimizers as seeds. Empirical evaluation suggests HPC minimizers improve
146147
performance and sensitivity when aligning PacBio CLR reads, but hurt when aligning
147-
Nanopore reads.
148+
Nanopore reads. `map-iclr` uses an adjusted alignment scoring matrix that
149+
accounts for the low overall error rate in the reads, with transversion errors
150+
being less frequent than transitions.
148151

149152
#### <a name="map-long-splice"></a>Map long mRNA/cDNA reads
150153

align.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,17 @@ static void ksw_gen_simple_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t sc
2121
mat[(m - 1) * m + j] = sc_ambi;
2222
}
2323

24+
/*
 * Build a scoring matrix that scores transitions separately from other
 * mismatches.
 *
 * The matrix is first filled by ksw_gen_simple_mat() using a, b and sc_ambi
 * (that helper is defined outside this view; presumably it writes the
 * match/mismatch/ambiguous-base scores). The four transition cells A<->G and
 * C<->T are then overwritten with the transition score.
 *
 * m          matrix dimension; must be 5
 * mat        row-major m*m output matrix, indexed as mat[row*m + col]
 * a, b       passed through unchanged to ksw_gen_simple_mat()
 * transition transition score; either sign is accepted, a positive value is
 *            negated before being stored
 * sc_ambi    passed through unchanged to ksw_gen_simple_mat()
 */
static void ksw_gen_ts_mat(int m, int8_t *mat, int8_t a, int8_t b, int8_t transition, int8_t sc_ambi)
25+
{
26+
assert(m==5); // the cell indices written below are hard-coded for this size
27+
ksw_gen_simple_mat(m,mat,a,b,sc_ambi);
28+
transition = transition > 0? -transition : transition; // store as a non-positive score
29+
mat[0*m+2]=transition; // A->G
30+
mat[1*m+3]=transition; // C->T
31+
mat[2*m+0]=transition; // G->A
32+
mat[3*m+1]=transition; // T->C
33+
}
34+
2435
static inline void mm_seq_rev(uint32_t len, uint8_t *seq)
2536
{
2637
uint32_t i;
@@ -323,6 +334,7 @@ static void mm_align_pair(void *km, const mm_mapopt_t *opt, int qlen, const uint
323334
for (i = 0; i < qlen; ++i) fputc("ACGTN"[qseq[i]], stderr);
324335
fputc('\n', stderr);
325336
}
337+
if (opt->b != opt->transition) flag |= KSW_EZ_GENERIC_SC;
326338
if (opt->max_sw_mat > 0 && (int64_t)tlen * qlen > opt->max_sw_mat) {
327339
ksw_reset_extz(ez);
328340
ez->zdropped = 1;
@@ -586,7 +598,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
586598

587599
r2->cnt = 0;
588600
if (r->cnt == 0) return;
589-
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
601+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
590602
bw = (int)(opt->bw * 1.5 + 1.);
591603
bw_long = (int)(opt->bw_long * 1.5 + 1.);
592604
if (bw_long < bw) bw_long = bw;
@@ -844,7 +856,7 @@ static int mm_align1_inv(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, i
844856
if (ql < opt->min_chain_score || ql > opt->max_gap) return 0;
845857
if (tl < opt->min_chain_score || tl > opt->max_gap) return 0;
846858

847-
ksw_gen_simple_mat(5, mat, opt->a, opt->b, opt->sc_ambi);
859+
ksw_gen_ts_mat(5, mat, opt->a, opt->b, opt->transition, opt->sc_ambi);
848860
tseq = (uint8_t*)kmalloc(km, tl);
849861
mm_idx_getseq(mi, r1->rid, r1->re, r2->rs, tseq);
850862
qseq = r1->rev? &qseq0[0][r2->qe] : &qseq0[1][qlen - r2->qs];

main.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int64_t flag, int long_idx, const
120120

121121
int main(int argc, char *argv[])
122122
{
123-
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
123+
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:b:O:E:m:N:Qu:R:hF:LC:yYPo:e:U:J:";
124124
ketopt_t o = KETOPT_INIT;
125125
mm_mapopt_t opt;
126126
mm_idxopt_t ipt;
@@ -178,6 +178,7 @@ int main(int argc, char *argv[])
178178
else if (c == 'm') opt.min_chain_score = atoi(o.arg);
179179
else if (c == 'A') opt.a = atoi(o.arg);
180180
else if (c == 'B') opt.b = atoi(o.arg);
181+
else if (c == 'b') opt.transition = atoi(o.arg);
181182
else if (c == 's') opt.min_dp_max = atoi(o.arg);
182183
else if (c == 'C') opt.noncan = atoi(o.arg);
183184
else if (c == 'I') ipt.batch_size = mm_parse_num(o.arg);
@@ -367,7 +368,7 @@ int main(int argc, char *argv[])
367368
fprintf(fp_help, " --version show version number\n");
368369
fprintf(fp_help, " Preset:\n");
369370
fprintf(fp_help, " -x STR preset (always applied before other options; see minimap2.1 for details) []\n");
370-
fprintf(fp_help, " - map-pb/map-ont - PacBio CLR/Nanopore vs reference mapping\n");
371+
fprintf(fp_help, " - map-pb/map-ont/map-iclr-prerender/map-iclr - PacBio/Nanopore/ICLR vs reference mapping\n");
371372
fprintf(fp_help, " - map-hifi - PacBio HiFi reads vs reference mapping\n");
372373
fprintf(fp_help, " - ava-pb/ava-ont - PacBio/Nanopore read overlap\n");
373374
fprintf(fp_help, " - asm5/asm10/asm20 - asm-to-ref mapping, for ~0.1/1/5%% sequence divergence\n");

minimap.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ typedef struct {
153153
float alt_drop;
154154

155155
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
156+
int transition; // transition mismatch score (A:G, C:T)
156157
int sc_ambi; // score when one or both bases are "N"
157158
int noncan; // cost of non-canonical splicing sites
158159
int junc_bonus;

options.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)
4545
opt->alt_drop = 0.15f;
4646

4747
opt->a = 2, opt->b = 4, opt->q = 4, opt->e = 2, opt->q2 = 24, opt->e2 = 1;
48+
opt->transition = opt->b;
4849
opt->sc_ambi = 1;
4950
opt->zdrop = 400, opt->zdrop_inv = 200;
5051
opt->end_bonus = -1;
@@ -112,6 +113,14 @@ int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo)
112113
mo->occ_dist = 500;
113114
mo->min_mid_occ = 50, mo->max_mid_occ = 500;
114115
mo->min_dp_max = 200;
116+
} else if (strcmp(preset, "map-iclr-prerender") == 0) {
117+
io->flag = 0, io->k = 15;
118+
mo->b = 6, mo->transition = 1;
119+
mo->q = 10, mo->q2 = 50;
120+
} else if (strcmp(preset, "map-iclr") == 0) {
121+
io->flag = 0, io->k = 19;
122+
mo->b = 6, mo->transition = 4;
123+
mo->q = 10, mo->q2 = 50;
115124
} else if (strncmp(preset, "asm", 3) == 0) {
116125
io->flag = 0, io->k = 19, io->w = 19;
117126
mo->bw = 1000, mo->bw_long = 100000;

0 commit comments

Comments
 (0)