Skip to content

Commit

Permalink
Add VCF/BCF support for POS=0 coordinate
Browse files Browse the repository at this point in the history
Review of `bcftools merge` code and possibly others is necessary because it is relying
on uninitialized POS==-1 in synced_bcf_reader

This should resolve #1571
  • Loading branch information
pd3 committed Mar 1, 2023
1 parent 70df047 commit 7dde686
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 23 deletions.
10 changes: 6 additions & 4 deletions hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -2205,6 +2205,8 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of
{
int i;
hts_pos_t beg, end;
if ( _beg<0 ) _beg = 0;
if ( _end<=_beg ) _end = _beg+1;
beg = _beg >> min_shift;
end = (_end - 1) >> min_shift;
if (l->m < end + 1) {
Expand Down Expand Up @@ -2905,6 +2907,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi
{
int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
if (beg >= end) return 0;
if (beg < 0 ) beg = 0;
if (beg==end) end = 1;
if (end >= 1LL<<s) end = 1LL<<s;
for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
hts_pos_t b, e;
Expand Down Expand Up @@ -3086,7 +3090,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
} else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
iter->finished = 1;
} else {
if (beg < 0) beg = 0;
if (beg < -1) beg = -1;
if (end < beg) {
free(iter);
return NULL;
Expand All @@ -3103,7 +3107,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t

if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }

rel_off = beg>>idx->min_shift;
rel_off = (beg>=0 ? beg : 0) >> idx->min_shift;
// compute min_off
bin = hts_bin_first(idx->n_lvls) + rel_off;
do {
Expand Down Expand Up @@ -3138,8 +3142,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
min_off = kh_val(bidx, k).loff;
}
} else if (unmapped) { //CSI index
if (k != kh_end(bidx))
min_off = kh_val(bidx, k).loff;
}

// compute max_off: a virtual offset from a bin to the right of end
Expand Down
2 changes: 2 additions & 0 deletions htslib/synced_bcf_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ DEALINGS IN THE SOFTWARE. */
extern "C" {
#endif

#define CSI_COOR_EMPTY -2

/*
When reading multiple files in parallel, duplicate records within each
file will be reordered and offered in intuitive order. For example,
Expand Down
31 changes: 16 additions & 15 deletions synced_bcf_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
if ( !files->regions )
files->regions = _regions_init_string(names[i]);
else
_regions_add(files->regions, names[i], -1, -1);
_regions_add(files->regions, names[i], CSI_COOR_EMPTY, CSI_COOR_EMPTY);
}
free(names);
_regions_sort_and_merge(files->regions);
Expand Down Expand Up @@ -555,7 +555,7 @@ static int _readers_next_region(bcf_srs_t *files)
int prev_iseq = files->regions->iseq;
hts_pos_t prev_end = files->regions->end;
if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1;
files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : CSI_COOR_EMPTY;

for (i=0; i<files->nreaders; i++)
_reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
Expand Down Expand Up @@ -616,7 +616,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
reader->buffer[reader->mbuffer-i] = bcf_init1();
reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
reader->buffer[reader->mbuffer-i]->pos = CSI_COOR_EMPTY; // for rare cases when VCF starts from 1 or 0
}
}
if ( files->streaming )
Expand Down Expand Up @@ -656,7 +656,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
if ( ret < 0 ) break; // no more lines or an error
bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
}

// Prevent creation of duplicates from records overlapping multiple regions
// and recognize true variant overlaps vs record overlaps (e.g. TA>T vs A>-)
if ( files->regions )
Expand All @@ -676,7 +675,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap);
exit(1);
}

if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue;
}

Expand Down Expand Up @@ -954,9 +952,9 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
// inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions().
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
{
if ( start==-1 && end==-1 )
if ( start==CSI_COOR_EMPTY && end==CSI_COOR_EMPTY )
{
start = 0; end = MAX_CSI_COOR-1;
start = -1; end = MAX_CSI_COOR-1;
}
else
{
Expand Down Expand Up @@ -1029,8 +1027,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
static bcf_sr_regions_t *_regions_init_string(const char *str)
{
bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
reg->start = reg->end = -1;
reg->prev_start = reg->prev_end = reg->prev_seq = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
reg->prev_start = reg->prev_end = CSI_COOR_EMPTY;
reg->prev_seq = -1;

kstring_t tmp = {0,0,0};
const char *sp = str, *ep = str;
Expand Down Expand Up @@ -1075,7 +1074,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
}
else
{
if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
if ( tmp.l ) _regions_add(reg, tmp.s, CSI_COOR_EMPTY, CSI_COOR_EMPTY);
if ( !*ep ) break;
sp = ++ep;
}
Expand Down Expand Up @@ -1157,8 +1156,9 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
}

reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
reg->start = reg->end = -1;
reg->prev_start = reg->prev_end = reg->prev_seq = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
reg->prev_start = reg->prev_end = CSI_COOR_EMPTY;
reg->prev_seq = -1;

reg->file = hts_open(regions, "rb");
if ( !reg->file )
Expand Down Expand Up @@ -1253,7 +1253,8 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)

int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
{
reg->iseq = reg->start = reg->end = -1;
reg->iseq = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions

// using in-memory regions
Expand Down Expand Up @@ -1284,7 +1285,7 @@ static int advance_creg(region_t *reg)
int bcf_sr_regions_next(bcf_sr_regions_t *reg)
{
if ( reg->iseq<0 ) return -1;
reg->start = reg->end = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
reg->nals = 0;

// using in-memory regions
Expand Down Expand Up @@ -1442,7 +1443,7 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p
bcf_sr_regions_flush(reg);

bcf_sr_regions_seek(reg, seq);
reg->start = reg->end = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
}
if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
reg->prev_seq = reg->iseq;
Expand Down
8 changes: 4 additions & 4 deletions tbx.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,12 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
if ( s==line+b ) return -1; // expected int
if (!(conf->preset&TBX_UCSC)) --intv->beg;
else ++intv->end;
if (intv->beg < 0) {
if (intv->beg < -1) {
hts_log_warning("Coordinate <= 0 detected. "
"Did you forget to use the -0 option?");
intv->beg = 0;
}
if (intv->end < 1) intv->end = 1;
if (intv->end < intv->beg ) intv->end = intv->beg;
} else {
if ((conf->preset&0xffff) == TBX_GENERIC) {
if (id == conf->ec)
Expand Down Expand Up @@ -171,7 +171,7 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
++id;
}
}
if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
if (intv->ss == 0 || intv->se == 0 || intv->beg < -1 || intv->end < -1) return -1;
return 0;
}

Expand All @@ -181,7 +181,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
int c = *intv->se;
*intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
if (intv->tid < 0) return -2; // get_tid out of memory
return (intv->beg >= 0 && intv->end >= 0)? 0 : -1;
return (intv->beg >= -1 && intv->end >= -1)? 0 : -1;
} else {
char *type = NULL;
switch (tbx->conf.preset&0xffff)
Expand Down

0 comments on commit 7dde686

Please sign in to comment.