Skip to content

Commit 1719c60

Browse files
committed
New option to remove or annotate clusters of sites within a window
1 parent 78099f2 commit 1719c60

File tree

12 files changed

+297
-51
lines changed

12 files changed

+297
-51
lines changed

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ Changes affecting specific commands:
7676

7777
- Make the option `-s, --sample-names` functional again (#2353)
7878

79+
* bcftools +prune
80+
81+
- New option to remove or annotate clusters of sites within a window
82+
7983
* bcftools query
8084

8185
- The functions used in -i/-e filtering expressions (such as SUM, MEDIAN, etc) can be

doc/bcftools.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2952,8 +2952,9 @@ By default, appropriate system directories are searched for installed plugins.
29522952
determine parental origin of a CNV region
29532953

29542954
*prune*::
2955-
prune sites by missingness, allele frequency or linkage disequilibrium.
2956-
Alternatively, annotate sites with r2, Lewontin's D' (PMID:19433632), Ragsdale's D (PMID:31697386).
2955+
annotate sites with or prune sites by the number of variants within a window, Lewontin's D
2956+
(doi:10.1534/genetics.108.093153), Ragsdale's D (doi:10.1093/molbev/msz265), or correlation
2957+
coefficient r-squared.
29572958

29582959
*remove-overlaps*::
29592960
remove overlapping variants and duplicate sites

plugins/prune.c

Lines changed: 66 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (C) 2017-2024 Genome Research Ltd.
2+
Copyright (C) 2017-2025 Genome Research Ltd.
33
44
Author: Petr Danecek <[email protected]>
55
@@ -60,9 +60,10 @@ typedef struct
6060
vcfbuf_t *vcfbuf;
6161
double ld_max[VCFBUF_LD_N];
6262
int ld_max_set[VCFBUF_LD_N];
63-
char *ld_annot[VCFBUF_LD_N], *ld_annot_pos[VCFBUF_LD_N];
63+
char *ld_annot[VCFBUF_LD_N], *ld_annot_pos[VCFBUF_LD_N], *cluster_annot;
6464
int ld_mask;
6565
int argc, region_is_file, target_is_file, output_type, ld_filter_id, rand_missing, nsites, ld_win, rseed, clevel;
66+
int max_cluster;
6667
char *nsites_mode;
6768
int keep_sites;
6869
char **argv, *region, *target, *fname, *output_fname, *ld_filter;
@@ -76,35 +77,38 @@ args_t;
7677

7778
const char *about(void)
7879
{
79-
return "Prune sites by missingness, linkage disequilibrium\n";
80+
return "Annotate sites with or prune sites by linkage disequilibrium or number of sites within a window\n";
8081
}
8182

8283
static const char *usage_text(void)
8384
{
8485
return
8586
"\n"
86-
"About: Prune sites by missingness or linkage disequilibrium.\n"
87-
"\n"
87+
"About: Annotate sites with or prune sites by the number of variants within a window (\"count\"), Lewontin's D\n"
88+
" (\"LD\"; doi:10.1093/molbev/msz265), Ragsdale's D (\"RD\"; doi:10.1534/genetics.108.093153), or correlation\n"
89+
" coefficient r-squared.\n"
8890
"Usage: bcftools +prune [Options]\n"
8991
"Plugin options:\n"
9092
" --AF-tag STR Use this tag with -n to determine allele frequency\n"
91-
" -a, --annotate r2,LD Add position of an upstream record with the biggest r2/LD value\n"
92-
" -e, --exclude EXPR Exclude sites for which the expression is true\n"
93+
" -a, --annotate count|LD|RD|r2 Annotate with the number of variants within the -w window (\"count\"),\n"
94+
" or with the biggest LD, RD, or r2 value and the position of the record\n"
9395
" -f, --set-filter STR Apply soft filter STR instead of discarding the site (only with -m)\n"
94-
" -i, --include EXPR Include only sites for which the expression is true\n"
95-
" -k, --keep-sites Leave sites filtered by -i/-e unchanged instead of discarding them\n"
96-
" -m, --max [r2|LD=]FLOAT Remove sites with r2 or Lewontin's D bigger than FLOAT within the -w window\n"
96+
" -m, --max count|LD|RD|r2=NUM Remove clusters of more than NUM sites (\"count\") or sites with LD, RD, or r2 bigger than NUM\n"
9797
" -n, --nsites-per-win N Keep at most N sites in the -w window. See also -N, --nsites-per-win-mode\n"
9898
" -N, --nsites-per-win-mode STR Keep sites with biggest AF (\"maxAF\"); sites that come first (\"1st\"); pick randomly (\"rand\") [maxAF]\n"
99-
" -o, --output FILE Write output to the FILE [standard output]\n"
100-
" -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
10199
" --random-seed INT Use the provided random seed for reproducibility\n"
102100
" --randomize-missing Replace missing data with randomly assigned genotype based on site's allele frequency\n"
101+
" -w, --window INT[bp|kb|Mb] The window size of INT sites or INT bp/kb/Mb for the -m/-n options [100kb]\n"
102+
"Common options:\n"
103+
" -e, --exclude EXPR Exclude sites for which the expression is true\n"
104+
" -i, --include EXPR Include only sites for which the expression is true\n"
105+
" -k, --keep-sites Leave sites filtered by -i/-e unchanged instead of discarding them\n"
106+
" -o, --output FILE Write output to the FILE [standard output]\n"
107+
" -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
103108
" -r, --regions REGION Restrict to comma-separated list of regions\n"
104109
" -R, --regions-file FILE Restrict to regions listed in a file\n"
105110
" -t, --targets REGION Similar to -r but streams rather than index-jumps\n"
106111
" -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
107-
" -w, --window INT[bp|kb|Mb] The window size of INT sites or INT bp/kb/Mb for the -n/-l options [100kb]\n"
108112
" -W, --write-index[=FMT] Automatically index the output files [off]\n"
109113
"Examples:\n"
110114
" # Discard records with r2 bigger than 0.6 in a window of 1000 sites\n"
@@ -121,6 +125,9 @@ static const char *usage_text(void)
121125
"\n"
122126
" # Discard records with r2 bigger than 0.6, first removing records with more than 2% of genotypes missing\n"
123127
" bcftools +prune -m 0.6 -e'F_MISSING>=0.02' input.bcf -Ob -o output.bcf\n"
128+
"\n"
129+
" # Mark clusters of more than 3 sites within a 10bp window, do not mark ref-only sites\n"
130+
" bcftools +prune -m count=3 -w 10bp -k -i 'type!=\"ref\"' input.bcf -Ob -o output.bcf\n"
124131
"\n";
125132
}
126133

@@ -155,11 +162,11 @@ static void init_data(args_t *args)
155162
kputs("LD bigger than ",&str);
156163
kputd(args->ld_max[VCFBUF_LD_IDX_LD],&str);
157164
}
158-
if ( args->ld_max_set[VCFBUF_LD_IDX_HD] )
165+
if ( args->ld_max_set[VCFBUF_LD_IDX_RD] )
159166
{
160167
if ( str.l ) kputs(" or ",&str);
161-
kputs("HD bigger than ",&str);
162-
kputd(args->ld_max[VCFBUF_LD_IDX_HD],&str);
168+
kputs("RD bigger than ",&str);
169+
kputd(args->ld_max[VCFBUF_LD_IDX_RD],&str);
163170
}
164171
bcf_hdr_printf(args->hdr,"##FILTER=<ID=%s,Description=\"An upstream site within %d%s with %s\">",args->ld_filter,
165172
args->ld_win < 0 ? -args->ld_win/1000 : args->ld_win,
@@ -179,12 +186,14 @@ static void init_data(args_t *args)
179186
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Float,Description=\"Pairwise Lewontin's D' (PMID:19433632) with the %s site\">",args->ld_annot[VCFBUF_LD_IDX_LD],args->ld_annot_pos[VCFBUF_LD_IDX_LD]);
180187
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The position of the site for which %s was calculated\">",args->ld_annot_pos[VCFBUF_LD_IDX_LD],args->ld_annot[VCFBUF_LD_IDX_LD]);
181188
}
182-
if ( args->ld_annot[VCFBUF_LD_IDX_HD] )
189+
if ( args->ld_annot[VCFBUF_LD_IDX_RD] )
183190
{
184-
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Float,Description=\"Pairwise Ragsdale's \\hat{D} (PMID:31697386) with the %s site\">",args->ld_annot[VCFBUF_LD_IDX_HD],args->ld_annot_pos[VCFBUF_LD_IDX_HD]);
185-
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The position of the site for which %s was calculated\">",args->ld_annot_pos[VCFBUF_LD_IDX_HD],args->ld_annot[VCFBUF_LD_IDX_HD]);
191+
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Float,Description=\"Pairwise Ragsdale's \\hat{D} (PMID:31697386) with the %s site\">",args->ld_annot[VCFBUF_LD_IDX_RD],args->ld_annot_pos[VCFBUF_LD_IDX_RD]);
192+
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The position of the site for which %s was calculated\">",args->ld_annot_pos[VCFBUF_LD_IDX_RD],args->ld_annot[VCFBUF_LD_IDX_RD]);
186193
}
187194
}
195+
if ( args->cluster_annot )
196+
bcf_hdr_printf(args->hdr,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The number of variants within %d bp of the site\">",args->cluster_annot,args->ld_win);
188197
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
189198
if ( init_index2(args->out_fh,args->hdr,args->output_fname,
190199
&args->index_fn, args->write_index)<0 )
@@ -193,7 +202,16 @@ static void init_data(args_t *args)
193202
if ( args->ld_filter && strcmp(".",args->ld_filter) )
194203
args->ld_filter_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, args->ld_filter);
195204

205+
if ( args->ld_win>0 && (args->max_cluster || args->cluster_annot) )
206+
{
207+
fprintf(stderr,"Warning: assuming `-w %dbp` was intended instead of `-w %d`\n",args->ld_win,args->ld_win);
208+
args->ld_win *= -1;
209+
if ( args->max_cluster && -args->ld_win <= args->max_cluster ) error("Error: -w must be bigger than -m\n");
210+
}
211+
196212
args->vcfbuf = vcfbuf_init(args->hdr, args->ld_win);
213+
if ( args->max_cluster ) vcfbuf_set(args->vcfbuf,CLUSTER_PRUNE,args->max_cluster);
214+
if ( args->cluster_annot ) vcfbuf_set(args->vcfbuf,CLUSTER_SIZE,1);
197215
if ( args->ld_max_set[VCFBUF_LD_IDX_R2] ) vcfbuf_set(args->vcfbuf,LD_MAX_R2,args->ld_max[VCFBUF_LD_IDX_R2]);
198216
if ( args->ld_max_set[VCFBUF_LD_IDX_LD] ) vcfbuf_set(args->vcfbuf,LD_MAX_LD,args->ld_max[VCFBUF_LD_IDX_LD]);
199217
if ( args->ld_max_set[VCFBUF_LD_IDX_HD] ) vcfbuf_set(args->vcfbuf,LD_MAX_HD,args->ld_max[VCFBUF_LD_IDX_HD]);
@@ -203,6 +221,10 @@ static void init_data(args_t *args)
203221
hts_srand48(args->rseed);
204222
}
205223
if ( args->rand_missing ) vcfbuf_set(args->vcfbuf,LD_RAND_MISSING,1);
224+
if ( args->max_cluster )
225+
{
226+
vcfbuf_set(args->vcfbuf,CLUSTER_PRUNE,args->max_cluster);
227+
}
206228
if ( args->nsites )
207229
{
208230
vcfbuf_set(args->vcfbuf,PRUNE_NSITES,args->nsites);
@@ -235,7 +257,18 @@ static void flush(args_t *args, int flush_all)
235257
{
236258
bcf1_t *rec;
237259
while ( (rec = vcfbuf_flush(args->vcfbuf, flush_all)) )
260+
{
261+
if ( args->cluster_annot )
262+
{
263+
int is_marked = vcfbuf_get_val(args->vcfbuf,int,CLUSTER_SIZE);
264+
if ( is_marked > 0 )
265+
{
266+
int32_t val = is_marked;
267+
bcf_update_info_int32(args->hdr, rec, args->cluster_annot, &val, 1);
268+
}
269+
}
238270
if ( bcf_write1(args->out_fh, args->hdr, rec)!=0 ) error("[%s] Error: cannot write to %s\n", __func__,args->output_fname);
271+
}
239272
}
240273
static void process(args_t *args)
241274
{
@@ -361,10 +394,14 @@ int run(int argc, char **argv)
361394
args->ld_annot_pos[VCFBUF_LD_IDX_LD] = "POS_LD";
362395
args->ld_annot[VCFBUF_LD_IDX_LD] = "LD";
363396
}
364-
else if ( !strcasecmp("HD",tag[i]) )
397+
else if ( !strcasecmp("RD",tag[i]) || !strcasecmp("HD",tag[i]) )
398+
{
399+
args->ld_annot_pos[VCFBUF_LD_IDX_RD] = "POS_RD";
400+
args->ld_annot[VCFBUF_LD_IDX_RD] = "RD";
401+
}
402+
else if ( !strcasecmp("COUNT",tag[i]) )
365403
{
366-
args->ld_annot_pos[VCFBUF_LD_IDX_HD] = "POS_HD";
367-
args->ld_annot[VCFBUF_LD_IDX_HD] = "HD";
404+
args->cluster_annot = "CLUSTER_SIZE";
368405
}
369406
else error("The tag \"%s\" is not supported\n",tag[i]);
370407
free(tag[i]);
@@ -395,10 +432,14 @@ int run(int argc, char **argv)
395432
args->ld_max_set[VCFBUF_LD_IDX_LD] = 1;
396433
args->ld_max[VCFBUF_LD_IDX_LD] = strtod(optarg+3,&tmp);
397434
}
398-
else if ( !strncasecmp("HD=",optarg,3) )
435+
else if ( !strncasecmp("RD=",optarg,3) || !strncasecmp("HD=",optarg,3) )
436+
{
437+
args->ld_max_set[VCFBUF_LD_IDX_RD] = 1;
438+
args->ld_max[VCFBUF_LD_IDX_RD] = strtod(optarg+3,&tmp);
439+
}
440+
else if ( !strncasecmp("count=",optarg,6) )
399441
{
400-
args->ld_max_set[VCFBUF_LD_IDX_HD] = 1;
401-
args->ld_max[VCFBUF_LD_IDX_HD] = strtod(optarg+3,&tmp);
442+
args->max_cluster = strtod(optarg+6,&tmp);
402443
}
403444
else
404445
{

test/prune.1.1.out

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@
88
##INFO=<ID=POS_R2,Number=1,Type=Integer,Description="The position of the site for which R2 was calculated">
99
##INFO=<ID=LD,Number=1,Type=Float,Description="Pairwise Lewontin's D' (PMID:19433632) with the POS_LD site">
1010
##INFO=<ID=POS_LD,Number=1,Type=Integer,Description="The position of the site for which LD was calculated">
11-
##INFO=<ID=HD,Number=1,Type=Float,Description="Pairwise Ragsdale's \hat{D} (PMID:31697386) with the POS_HD site">
12-
##INFO=<ID=POS_HD,Number=1,Type=Integer,Description="The position of the site for which HD was calculated">
11+
##INFO=<ID=RD,Number=1,Type=Float,Description="Pairwise Ragsdale's \hat{D} (PMID:31697386) with the POS_RD site">
12+
##INFO=<ID=POS_RD,Number=1,Type=Integer,Description="The position of the site for which RD was calculated">
1313
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3
1414
1 101 . T A . . AF=0.3 GT 0/1 0/1 0/1
15-
1 102 . T A . . AF=0.2;POS_R2=101;POS_LD=101;POS_HD=101;R2=1;LD=1;HD=0 GT 0/1 0/1 0/1
16-
1 103 . T A . . AF=0.1;POS_R2=102;POS_LD=102;POS_HD=102;R2=0.105235;LD=0.145076;HD=0 GT 0/1 0/0 0/0
17-
1 104 . T A . . AF=0.3;POS_R2=103;POS_LD=103;POS_HD=103;R2=0.105235;LD=0;HD=0 GT 0/0 0/0 0/0
18-
1 105 . T A . . AF=0.2;POS_R2=104;POS_LD=104;POS_HD=104;R2=1;LD=0;HD=0 GT 0/0 0/0 0/0
19-
1 106 . T A . . AF=0.1;POS_R2=105;POS_LD=105;POS_HD=105;R2=0.755358;LD=0;HD=0 GT 0/1 1/1 1/1
20-
1 107 . T A . . AF=0.3;POS_R2=106;POS_LD=106;POS_HD=106;R2=0.25;LD=0.790569;HD=-0.0208333 GT 0/1 0/0 0/1
21-
1 108 . T A . . AF=0.2;POS_R2=107;POS_LD=107;POS_HD=107;R2=1;LD=1;HD=-0.0416667 GT 0/1 1/1 0/1
15+
1 102 . T A . . AF=0.2;POS_R2=101;POS_LD=101;POS_RD=101;R2=1;LD=1;RD=0 GT 0/1 0/1 0/1
16+
1 103 . T A . . AF=0.1;POS_R2=102;POS_LD=102;POS_RD=102;R2=0.105235;LD=0.145076;RD=0 GT 0/1 0/0 0/0
17+
1 104 . T A . . AF=0.3;POS_R2=103;POS_LD=103;POS_RD=103;R2=0.105235;LD=0;RD=0 GT 0/0 0/0 0/0
18+
1 105 . T A . . AF=0.2;POS_R2=104;POS_LD=104;POS_RD=104;R2=1;LD=0;RD=0 GT 0/0 0/0 0/0
19+
1 106 . T A . . AF=0.1;POS_R2=105;POS_LD=105;POS_RD=105;R2=0.755358;LD=0;RD=0 GT 0/1 1/1 1/1
20+
1 107 . T A . . AF=0.3;POS_R2=106;POS_LD=106;POS_RD=106;R2=0.25;LD=0.790569;RD=-0.0208333 GT 0/1 0/0 0/1
21+
1 108 . T A . . AF=0.2;POS_R2=107;POS_LD=107;POS_RD=107;R2=1;LD=1;RD=-0.0416667 GT 0/1 1/1 0/1

test/prune.2.1.out

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
##INFO=<ID=POS_R2,Number=1,Type=Integer,Description="The position of the site for which R2 was calculated">
88
##INFO=<ID=LD,Number=1,Type=Float,Description="Pairwise Lewontin's D' (PMID:19433632) with the POS_LD site">
99
##INFO=<ID=POS_LD,Number=1,Type=Integer,Description="The position of the site for which LD was calculated">
10-
##INFO=<ID=HD,Number=1,Type=Float,Description="Pairwise Ragsdale's \hat{D} (PMID:31697386) with the POS_HD site">
11-
##INFO=<ID=POS_HD,Number=1,Type=Integer,Description="The position of the site for which HD was calculated">
10+
##INFO=<ID=RD,Number=1,Type=Float,Description="Pairwise Ragsdale's \hat{D} (PMID:31697386) with the POS_RD site">
11+
##INFO=<ID=POS_RD,Number=1,Type=Integer,Description="The position of the site for which RD was calculated">
1212
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3 s4 s5 s6 s7 s8 s9 s10 s11 s12 s13 s14 s15 s16 s17 s18 s19 s20
1313
1 101 . T A . . . GT 0/1 ./. 0/0 ./. 0/1 ./. 0/0 0/1 0/0 0/0 0/1 1/1 0/0 0/1 0/1 0/0 ./. 0/1 0/1 ./.
14-
1 102 . T A . . POS_R2=101;POS_LD=101;POS_HD=101;R2=0.363372;LD=0.487431;HD=0.0520833 GT 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/1 0/1 0/0 0/1 1/1 0/1 1/1 0/1 0/0 0/1 0/1 0/1 1/1
15-
1 103 . T A . . POS_R2=102;POS_LD=102;POS_HD=102;R2=0.0990065;LD=0.24274;HD=0.0202381 GT 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/1 0/0 0/1 0/1 0/1 0/0
14+
1 102 . T A . . POS_R2=101;POS_LD=101;POS_RD=101;R2=0.363372;LD=0.487431;RD=0.0520833 GT 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/1 0/1 0/0 0/1 1/1 0/1 1/1 0/1 0/0 0/1 0/1 0/1 1/1
15+
1 103 . T A . . POS_R2=102;POS_LD=102;POS_RD=102;R2=0.0990065;LD=0.24274;RD=0.0202381 GT 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/1 0/0 0/1 0/1 0/1 0/0

test/prune.3.1.out

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##reference=file://some/path/human_g1k_v37.fasta
4+
##contig=<ID=1,length=2147483647>
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##INFO=<ID=XX,Number=1,Type=Integer,Description="Number">
7+
##INFO=<ID=CLUSTER_SIZE,Number=1,Type=Integer,Description="The number of variants within -3 bp of the site">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3
9+
1 101 . T A . . XX=3;CLUSTER_SIZE=3 GT 0/1 0/1 0/1
10+
1 102 . T A . . XX=3;CLUSTER_SIZE=3 GT 0/1 0/1 0/1
11+
1 103 . T A . . XX=3;CLUSTER_SIZE=3 GT 0/1 0/0 0/0
12+
1 104 . T A . . XX=0 GT 0/0 0/0 0/0
13+
1 105 . T A . . XX=0 GT 0/0 0/0 0/0
14+
1 106 . T A . . XX=3;CLUSTER_SIZE=2 GT 0/1 1/1 1/1
15+
1 107 . T A . . XX=3;CLUSTER_SIZE=2 GT 0/1 0/0 0/1
16+
1 108 . T A . . XX=0 GT 0/1 1/1 0/1

test/prune.3.2.out

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##reference=file://some/path/human_g1k_v37.fasta
4+
##contig=<ID=1,length=2147483647>
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##INFO=<ID=XX,Number=1,Type=Integer,Description="Number">
7+
##INFO=<ID=CLUSTER_SIZE,Number=1,Type=Integer,Description="The number of variants within -3 bp of the site">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3
9+
1 101 . T A . . XX=3;CLUSTER_SIZE=3 GT 0/1 0/1 0/1
10+
1 102 . T A . . XX=3;CLUSTER_SIZE=3 GT 0/1 0/1 0/1
11+
1 103 . T A . . XX=3;CLUSTER_SIZE=3 GT 0/1 0/0 0/0
12+
1 106 . T A . . XX=3;CLUSTER_SIZE=2 GT 0/1 1/1 1/1
13+
1 107 . T A . . XX=3;CLUSTER_SIZE=2 GT 0/1 0/0 0/1

test/prune.3.3.out

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##reference=file://some/path/human_g1k_v37.fasta
4+
##contig=<ID=1,length=2147483647>
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##INFO=<ID=XX,Number=1,Type=Integer,Description="Number">
7+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3
8+
1 106 . T A . . XX=3 GT 0/1 1/1 1/1
9+
1 107 . T A . . XX=3 GT 0/1 0/0 0/1

test/prune.3.vcf

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
##fileformat=VCFv4.2
2+
##reference=file://some/path/human_g1k_v37.fasta
3+
##contig=<ID=1,length=2147483647>
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##INFO=<ID=XX,Number=1,Type=Integer,Description="Number">
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 s2 s3
7+
1 101 . T A . . XX=3 GT 0/1 0/1 0/1
8+
1 102 . T A . . XX=3 GT 0/1 0/1 0/1
9+
1 103 . T A . . XX=3 GT 0/1 0/0 0/0
10+
1 104 . T A . . XX=0 GT 0/0 0/0 0/0
11+
1 105 . T A . . XX=0 GT 0/0 0/0 0/0
12+
1 106 . T A . . XX=3 GT 0/1 1/1 1/1
13+
1 107 . T A . . XX=3 GT 0/1 0/0 0/1
14+
1 108 . T A . . XX=0 GT 0/1 1/1 0/1

0 commit comments

Comments
 (0)