Skip to content

Commit

Permalink
Support for telomere (POS=0) coordinate
Browse files Browse the repository at this point in the history
This follows changes in
samtools/htslib#1573

See also
samtools/htslib#1571
  • Loading branch information
pd3 committed Mar 1, 2023
1 parent fe98a6b commit 4da6806
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 51 deletions.
6 changes: 3 additions & 3 deletions csq.c
Original file line number Diff line number Diff line change
Expand Up @@ -3374,7 +3374,7 @@ void vbuf_flush(args_t *args, uint32_t pos)
i = rbuf_shift(&args->vcf_rbuf);
assert( i>=0 );
vbuf = args->vcf_buf[i];
int pos = vbuf->n ? vbuf->vrec[0]->line->pos : -1;
int pos = vbuf->n ? vbuf->vrec[0]->line->pos : CSI_COOR_EMPTY;
for (i=0; i<vbuf->n; i++)
{
vrec_t *vrec = vbuf->vrec[i];
Expand Down Expand Up @@ -3413,7 +3413,7 @@ void vbuf_flush(args_t *args, uint32_t pos)
bcf_empty(vrec->line);
vrec->line->pos = save_pos;
}
if ( pos!=-1 )
if ( pos!=CSI_COOR_EMPTY )
{
khint_t k = kh_get(pos2vbuf, args->pos2vbuf, pos);
if ( k != kh_end(args->pos2vbuf) ) kh_del(pos2vbuf, args->pos2vbuf, k);
Expand Down Expand Up @@ -4169,7 +4169,7 @@ static void process(args_t *args, bcf1_t **rec_ptr)
}

bcf1_t *rec = *rec_ptr;
static int32_t prev_rid = -1, prev_pos = -1;
static int32_t prev_rid = -1, prev_pos = CSI_COOR_EMPTY;
if ( prev_rid!=rec->rid )
{
prev_rid = rec->rid;
Expand Down
1 change: 1 addition & 0 deletions test/telomere.0.out
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
22 0 id0 C G . . .
1 change: 1 addition & 0 deletions test/telomere.1.out
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
22 1 id1 C G . . .
5 changes: 5 additions & 0 deletions test/telomere.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##fileformat=VCFv4.2
##contig=<ID=22>
#CHROM POS ID REF ALT QUAL FILTER INFO
22 0 id0 C G . . .
22 1 id1 C G . . .
2 changes: 2 additions & 0 deletions test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
run_test(\&test_tabix,$opts,in=>'merge.a',reg=>'1:3000151-3000151',out=>'tabix.1.3000151.out');
run_test(\&test_index,$opts,in=>'large_chrom_csi_limit',reg=>'chr20:1-2147483647',out=>'large_chrom_csi_limit.20.1.2147483647.out'); # 2147483647 (1<<31-1) is the current chrom limit for csi. bcf conversion and indexing fail above this
run_test(\&test_index,$opts,in=>'large_chrom_csi_limit',reg=>'chr20',out=>'large_chrom.20.1.2147483647.out'); # this fails until bug resolved
run_test(\&test_index,$opts,in=>'telomere',reg=>'22:0',out=>'telomere.0.out');
run_test(\&test_index,$opts,in=>'telomere',reg=>'22:1',out=>'telomere.1.out');
run_test(\&test_vcf_idxstats,$opts,in=>'idx',args=>'-s',out=>'idx.out');
run_test(\&test_vcf_idxstats,$opts,in=>'idx',args=>'-n',out=>'idx_count.out');
run_test(\&test_vcf_idxstats,$opts,in=>'empty',args=>'-s',out=>'empty.idx.out');
Expand Down
37 changes: 20 additions & 17 deletions vcfconcat.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* vcfconcat.c -- Concatenate or combine VCF/BCF files.
Copyright (C) 2013-2021 Genome Research Ltd.
Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -39,6 +39,8 @@ THE SOFTWARE. */
#include <sys/time.h>
#include "bcftools.h"

#define EMPTY_FILE -3

typedef struct _args_t
{
bcf_srs_t *files;
Expand Down Expand Up @@ -95,11 +97,11 @@ static void init_data(args_t *args)
if ( args->phased_concat )
{
int ret = bcf_read(fp, hdr, line);
if ( ret!=0 ) args->start_pos[i] = -2; // empty file
if ( ret!=0 ) args->start_pos[i] = EMPTY_FILE;
else
{
int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
args->start_pos[i] = chrid==prev_chrid ? line->pos : CSI_COOR_EMPTY;
prev_chrid = chrid;
}
}
Expand Down Expand Up @@ -171,11 +173,11 @@ static void init_data(args_t *args)
int nok = 0;
while (1)
{
while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
while ( nok<args->nfnames && args->start_pos[nok]!=EMPTY_FILE ) nok++;
if ( nok==args->nfnames ) break;

i = nok;
while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
while ( i<args->nfnames && args->start_pos[i]==EMPTY_FILE ) i++;
if ( i==args->nfnames ) break;

int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
Expand All @@ -185,7 +187,7 @@ static void init_data(args_t *args)
args->nfnames = nok;

for (i=1; i<args->nfnames; i++)
if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
if ( args->start_pos[i-1]!=CSI_COOR_EMPTY && args->start_pos[i]!=CSI_COOR_EMPTY && args->start_pos[i]<args->start_pos[i-1] )
error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

args->prev_chr = -1;
Expand Down Expand Up @@ -264,7 +266,7 @@ static void phased_flush(args_t *args)
bcf1_t *brec = args->buf[i+1];

int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
if ( nGTs < 0 )
if ( nGTs < 0 )
{
if ( !gt_absent_warned )
{
Expand Down Expand Up @@ -359,7 +361,7 @@ static void phased_flush(args_t *args)
bcf_update_format_int32(args->out_hdr,rec,"PQ",args->phase_qual,nsmpl);
PQ_printed = 1;
for (j=0; j<nsmpl; j++)
if ( args->phase_qual[j] < args->min_PQ )
if ( args->phase_qual[j] < args->min_PQ )
{
args->phase_set[j] = rec->pos+1;
args->phase_set_changed = 1;
Expand Down Expand Up @@ -404,7 +406,7 @@ static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec, int is_overlap
if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", arec ? bcf_seqname(ahdr,arec) : bcf_seqname(bhdr,brec));
args->seen_seq[chr_id] = 1;
args->prev_chr = chr_id;
args->prev_pos_check = -1;
args->prev_pos_check = CSI_COOR_EMPTY;
}

if ( !is_overlap )
Expand Down Expand Up @@ -463,12 +465,12 @@ static void concat(args_t *args)
new_file = 1;

args->ifname++;
if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open
if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
if ( args->start_pos[args->ifname-1]==CSI_COOR_EMPTY ) break; // new chromosome, start with only one file open
if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==CSI_COOR_EMPTY ) break; // next file starts on a different chromosome
}

// is there a line from the previous run? Seek the newly opened reader to that position
int seek_pos = -1;
int seek_pos = CSI_COOR_EMPTY;
int seek_chr = -1;
if ( bcf_sr_has_line(args->files,0) )
{
Expand Down Expand Up @@ -521,11 +523,12 @@ static void concat(args_t *args)

// This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[ir].header,line)) ) continue;
seek_pos = seek_chr = -1;
seek_pos = CSI_COOR_EMPTY;
seek_chr = -1;

// Check if the position overlaps with the next, yet unopened, reader
int must_seek = 0;
while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=CSI_COOR_EMPTY && line->pos >= args->start_pos[args->ifname] )
{
must_seek = 1;
if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
Expand Down Expand Up @@ -618,7 +621,7 @@ static void concat(args_t *args)
if ( chr_id<0 ) error("\nThe sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
if ( prev_chr_id!=chr_id )
{
prev_pos = -1;
prev_pos = CSI_COOR_EMPTY;
if ( args->seen_seq[chr_id] )
error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
}
Expand All @@ -643,7 +646,7 @@ static void concat(args_t *args)

if ( prev_chr_id!=line->rid )
{
prev_pos = -1;
prev_pos = CSI_COOR_EMPTY;
if ( args->seen_seq[line->rid] )
error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
}
Expand Down Expand Up @@ -980,7 +983,7 @@ int main_vcfconcat(int argc, char *argv[])
case 'R': args->regions_list = optarg; args->regions_is_file = 1; break;
case 'd': args->remove_dups = optarg; break;
case 'D': args->remove_dups = "exact"; break;
case 'q':
case 'q':
args->min_PQ = strtol(optarg,&tmp,10);
if ( *tmp ) error("Could not parse argument: --min-PQ %s\n", optarg);
break;
Expand Down
48 changes: 24 additions & 24 deletions vcfconvert.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* vcfconvert.c -- convert between VCF/BCF and related formats.
Copyright (C) 2013-2021 Genome Research Ltd.
Copyright (C) 2013-2023 Genome Research Ltd.
Author: Petr Danecek <[email protected]>
Expand Down Expand Up @@ -59,7 +59,7 @@ struct _args_t
bcf_hdr_t *header;
void (*convert_func)(struct _args_t *);
struct {
int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
int total, skipped, hom_rr, het_ra, hom_aa, het_aa, missing;
} n;
kstring_t str;
int32_t *gts;
Expand Down Expand Up @@ -160,7 +160,7 @@ static int _set_chrom_pos_ref_alt(tsv_t *tsv, bcf1_t *rec, void *usr)
// REF,ALT
args->str.l = 0;
se = ++ss;
while ( se < tsv->se && *se!='_' ) se++;
while ( se < tsv->se && *se!='_' ) se++;
if ( *se!='_' ) return -1;
kputsn(ss,se-ss,&args->str);
ss = ++se;
Expand Down Expand Up @@ -202,14 +202,14 @@ static int tsv_setter_chrom_pos_ref_alt_or_id(tsv_t *tsv, bcf1_t *rec, void *usr
{
args_t *args = (args_t*)usr;
if ( _set_chrom_pos_ref_alt(tsv,rec,usr)==0 ) return 0;
rec->pos = -1; // mark the record as unset
rec->pos = CSI_COOR_EMPTY; // mark the record as unset
if ( !args->output_vcf_ids) return 0;
return tsv_setter_id(tsv,rec,usr);
}
static int tsv_setter_chrom_pos_ref_alt_id_or_die(tsv_t *tsv, bcf1_t *rec, void *usr)
{
args_t *args = (args_t*)usr;
if ( rec->pos!=-1 )
if ( rec->pos!=CSI_COOR_EMPTY )
{
if ( !args->output_vcf_ids ) return 0;
return tsv_setter_id(tsv,rec,usr);
Expand Down Expand Up @@ -269,12 +269,12 @@ static int tsv_setter_gt_gp(tsv_t *tsv, bcf1_t *rec, void *usr)
if ( aa >= ab )
{
if ( aa >= bb ) args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(0);
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
}
else if ( ab >= bb )
else if ( ab >= bb )
{
args->gts[2*i+0] = bcf_gt_unphased(0);
args->gts[2*i+1] = bcf_gt_unphased(1);
args->gts[2*i+1] = bcf_gt_unphased(1);
}
else args->gts[2*i+0] = args->gts[2*i+1] = bcf_gt_unphased(1);
}
Expand All @@ -293,7 +293,7 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
else { a0 = bcf_gt_phased(0); a1 = bcf_gt_phased(1); }

// up is short for "unphased"
int nup = 0;
int nup = 0;
for (i=0; i<nsamples; i++)
{
char *ss = tsv->ss + 4*i + nup;
Expand Down Expand Up @@ -324,11 +324,11 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
break;
default :
fprintf(stderr,"Could not parse: [%c][%s]\n", ss[all*2+up],tsv->ss);
return -1;
return -1;
}
if( ss[all*2+up+1]=='*' ) up = up + 1;
}

if(up && up != 2)
{
fprintf(stderr,"Missing unphased marker '*': [%c][%s]", ss[2+up], tsv->ss);
Expand Down Expand Up @@ -356,13 +356,13 @@ static int tsv_setter_haps(tsv_t *tsv, bcf1_t *rec, void *usr)
static void gensample_to_vcf(args_t *args)
{
/*
* Inpute: IMPUTE2 output (indentation changed here for clarity):
* Inpute: IMPUTE2 output (indentation changed here for clarity):
*
* 20:62116619_C_T 20:62116619 62116619 C T 0.969 0.031 0 ...
* --- 20:62116698_C_A 62116698 C A 1 0 0 ...
*
* Second column is expected in the form of CHROM:POS_REF_ALT. We use second
* column because the first can be empty ("--") when filling sites from reference
* column because the first can be empty ("--") when filling sites from reference
* panel. When the option --vcf-ids is given, the first column is used to set the
* VCF ID.
*
Expand Down Expand Up @@ -784,7 +784,7 @@ char *init_sample2sex(bcf_hdr_t *hdr, char *sex_fname)
}
for (i=0; i<nlines; i++) free(lines[i]);
free(lines);
for (i=0; i<bcf_hdr_nsamples(hdr); i++)
for (i=0; i<bcf_hdr_nsamples(hdr); i++)
if ( !sample2sex[i] ) error("Missing sex for sample %s in %s\n", bcf_hdr_int2id(hdr, BCF_DT_SAMPLE, i),sex_fname);
return sample2sex;
}
Expand Down Expand Up @@ -847,7 +847,7 @@ static void vcf_to_gensample(args_t *args)
if (sample_fname) fprintf(stderr, "Sample file: %s\n", sample_fname);

// write samples file
if (sample_fname)
if (sample_fname)
{
char *sample2sex = NULL;
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);
Expand Down Expand Up @@ -877,7 +877,7 @@ static void vcf_to_gensample(args_t *args)
return;
}

int prev_rid = -1, prev_pos = -1;
int prev_rid = -1, prev_pos = CSI_COOR_EMPTY;
int no_alt = 0, non_biallelic = 0, filtered = 0, ndup = 0, nok = 0;
BGZF *gout = bgzf_open(gen_fname, gen_compressed ? "wg" : "wu");
while ( bcf_sr_next_line(args->files) )
Expand Down Expand Up @@ -915,7 +915,7 @@ static void vcf_to_gensample(args_t *args)
nok++;
}
}
fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
fprintf(stderr, "%d records written, %d skipped: %d/%d/%d/%d no-ALT/non-biallelic/filtered/duplicated\n",
nok, no_alt+non_biallelic+filtered+ndup, no_alt, non_biallelic, filtered, ndup);

if ( str.m ) free(str.s);
Expand Down Expand Up @@ -976,7 +976,7 @@ static void vcf_to_haplegendsample(args_t *args)
{
char *sample2sex = NULL;
if ( args->sex_fname ) sample2sex = init_sample2sex(args->header,args->sex_fname);

int i;
BGZF *sout = bgzf_open(sample_fname, sample_compressed ? "wg" : "wu");
str.l = 0;
Expand Down Expand Up @@ -1078,7 +1078,7 @@ static void vcf_to_hapsample(args_t *args)
kputs("%CHROM:%POS\\_%REF\\_%FIRST_ALT %ID %POS %REF %FIRST_ALT ", &str);
else
kputs("%CHROM %CHROM:%POS\\_%REF\\_%FIRST_ALT %POS %REF %FIRST_ALT ", &str);

if ( args->hap2dip )
kputs("%_GT_TO_HAP2\n", &str);
else
Expand Down Expand Up @@ -1229,7 +1229,7 @@ static inline int tsv_setter_aa1(args_t *args, char *ss, char *se, int alleles[]
if ( alleles[a0]<0 ) alleles[a0] = (*nals)++;
if ( alleles[a1]<0 ) alleles[a1] = (*nals)++;

gts[0] = bcf_gt_unphased(alleles[a0]);
gts[0] = bcf_gt_unphased(alleles[a0]);
gts[1] = ss[1] ? bcf_gt_unphased(alleles[a1]) : bcf_int32_vector_end;

if ( ref==a0 && ref==a1 ) args->n.hom_rr++; // hom ref: RR
Expand Down Expand Up @@ -1265,7 +1265,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)
}
ret = tsv_setter_aa1(args, tsv->ss, tsv->se, alleles, &nals, iref, args->gts+i*2);
if ( ret==-1 ) error("Error parsing the site %s:%"PRId64", expected two characters\n", bcf_hdr_id2name(args->header,rec->rid),(int64_t) rec->pos+1);
if ( ret==-2 )
if ( ret==-2 )
{
// something else than a SNP
free(ref);
Expand All @@ -1275,7 +1275,7 @@ static int tsv_setter_aa(tsv_t *tsv, bcf1_t *rec, void *usr)

args->str.l = 0;
kputc(ref[0], &args->str);
for (i=0; i<5; i++)
for (i=0; i<5; i++)
{
if ( alleles[i]>0 )
{
Expand Down Expand Up @@ -1419,7 +1419,7 @@ static void gvcf_to_vcf(args_t *args)
{
int pass = filter_test(args->filter, line, NULL);
if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
if ( !pass )
if ( !pass )
{
if ( bcf_write(out_fh,hdr,line)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->outfname);
continue;
Expand Down Expand Up @@ -1667,7 +1667,7 @@ int main_vcfconvert(int argc, char *argv[])
else args->infname = argv[optind];
}
if ( !args->infname ) usage();

if ( args->convert_func ) args->convert_func(args);
else vcf_to_vcf(args);

Expand Down
Loading

0 comments on commit 4da6806

Please sign in to comment.