1
1
/*
2
- Copyright (C) 2017-2024 Genome Research Ltd.
2
+ Copyright (C) 2017-2025 Genome Research Ltd.
3
3
4
4
Author: Petr Danecek <[email protected] >
5
5
@@ -60,9 +60,10 @@ typedef struct
60
60
vcfbuf_t * vcfbuf ;
61
61
double ld_max [VCFBUF_LD_N ];
62
62
int ld_max_set [VCFBUF_LD_N ];
63
- char * ld_annot [VCFBUF_LD_N ], * ld_annot_pos [VCFBUF_LD_N ];
63
+ char * ld_annot [VCFBUF_LD_N ], * ld_annot_pos [VCFBUF_LD_N ], * cluster_annot ;
64
64
int ld_mask ;
65
65
int argc , region_is_file , target_is_file , output_type , ld_filter_id , rand_missing , nsites , ld_win , rseed , clevel ;
66
+ int max_cluster ;
66
67
char * nsites_mode ;
67
68
int keep_sites ;
68
69
char * * argv , * region , * target , * fname , * output_fname , * ld_filter ;
@@ -76,35 +77,38 @@ args_t;
76
77
77
78
const char * about (void )
78
79
{
79
- return "Prune sites by missingness, linkage disequilibrium\n" ;
80
+ return "Annotate sites with or prune sites by linkage disequilibrium or number of sites within a window \n" ;
80
81
}
81
82
82
83
static const char * usage_text (void )
83
84
{
84
85
return
85
86
"\n"
86
- "About: Prune sites by missingness or linkage disequilibrium.\n"
87
- "\n"
87
+ "About: Annotate sites with or prune sites by the number of variants within a window (\"count\"), Lewontin's D\n"
88
+ " (\"LD\"; doi:10.1093/molbev/msz265), Ragsdale's D (\"RD\"; doi:10.1534/genetics.108.093153), or correlation\n"
89
+ " coefficient r-squared.\n"
88
90
"Usage: bcftools +prune [Options]\n"
89
91
"Plugin options:\n"
90
92
" --AF-tag STR Use this tag with -n to determine allele frequency\n"
91
- " -a, --annotate r2,LD Add position of an upstream record with the biggest r2/LD value \n"
92
- " -e, --exclude EXPR Exclude sites for which the expression is true \n"
93
+ " -a, --annotate count|LD|RD|r2 Annotate with the number of variants within the -w window (\"count\"), \n"
94
+ " or with the biggest LD, RD, or r2 value and the position of the record \n"
93
95
" -f, --set-filter STR Apply soft filter STR instead of discarding the site (only with -m)\n"
94
- " -i, --include EXPR Include only sites for which the expression is true\n"
95
- " -k, --keep-sites Leave sites filtered by -i/-e unchanged instead of discarding them\n"
96
- " -m, --max [r2|LD=]FLOAT Remove sites with r2 or Lewontin's D bigger than FLOAT within the -w window\n"
96
+ " -m, --max count|LD|RD|r2=NUM Remove clusters of more than NUM sites (\"count\") or sites with LD, RD, or r2 bigger than NUM\n"
97
97
" -n, --nsites-per-win N Keep at most N sites in the -w window. See also -N, --nsites-per-win-mode\n"
98
98
" -N, --nsites-per-win-mode STR Keep sites with biggest AF (\"maxAF\"); sites that come first (\"1st\"); pick randomly (\"rand\") [maxAF]\n"
99
- " -o, --output FILE Write output to the FILE [standard output]\n"
100
- " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
101
99
" --random-seed INT Use the provided random seed for reproducibility\n"
102
100
" --randomize-missing Replace missing data with randomly assigned genotype based on site's allele frequency\n"
101
+ " -w, --window INT[bp|kb|Mb] The window size of INT sites or INT bp/kb/Mb for the -m/-n options [100kb]\n"
102
+ "Common options:\n"
103
+ " -e, --exclude EXPR Exclude sites for which the expression is true\n"
104
+ " -i, --include EXPR Include only sites for which the expression is true\n"
105
+ " -k, --keep-sites Leave sites filtered by -i/-e unchanged instead of discarding them\n"
106
+ " -o, --output FILE Write output to the FILE [standard output]\n"
107
+ " -O, --output-type u|b|v|z[0-9] u/b: un/compressed BCF, v/z: un/compressed VCF, 0-9: compression level [v]\n"
103
108
" -r, --regions REGION Restrict to comma-separated list of regions\n"
104
109
" -R, --regions-file FILE Restrict to regions listed in a file\n"
105
110
" -t, --targets REGION Similar to -r but streams rather than index-jumps\n"
106
111
" -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n"
107
- " -w, --window INT[bp|kb|Mb] The window size of INT sites or INT bp/kb/Mb for the -n/-l options [100kb]\n"
108
112
" -W, --write-index[=FMT] Automatically index the output files [off]\n"
109
113
"Examples:\n"
110
114
" # Discard records with r2 bigger than 0.6 in a window of 1000 sites\n"
@@ -121,6 +125,9 @@ static const char *usage_text(void)
121
125
"\n"
122
126
" # Discard records with r2 bigger than 0.6, first removing records with more than 2% of genotypes missing\n"
123
127
" bcftools +prune -m 0.6 -e'F_MISSING>=0.02' input.bcf -Ob -o output.bcf\n"
128
+ "\n"
129
+ " # Mark clusters of more than 3 sites within a 10bp window, do not mark ref-only sites\n"
130
+ " bcftools +prune -m count=3 -w 10bp -k -i 'type!=\"ref\"' input.bcf -Ob -o output.bcf\n"
124
131
"\n" ;
125
132
}
126
133
@@ -155,11 +162,11 @@ static void init_data(args_t *args)
155
162
kputs ("LD bigger than " ,& str );
156
163
kputd (args -> ld_max [VCFBUF_LD_IDX_LD ],& str );
157
164
}
158
- if ( args -> ld_max_set [VCFBUF_LD_IDX_HD ] )
165
+ if ( args -> ld_max_set [VCFBUF_LD_IDX_RD ] )
159
166
{
160
167
if ( str .l ) kputs (" or " ,& str );
161
- kputs ("HD bigger than " ,& str );
162
- kputd (args -> ld_max [VCFBUF_LD_IDX_HD ],& str );
168
+ kputs ("RD bigger than " ,& str );
169
+ kputd (args -> ld_max [VCFBUF_LD_IDX_RD ],& str );
163
170
}
164
171
bcf_hdr_printf (args -> hdr ,"##FILTER=<ID=%s,Description=\"An upstream site within %d%s with %s\">" ,args -> ld_filter ,
165
172
args -> ld_win < 0 ? - args -> ld_win /1000 : args -> ld_win ,
@@ -179,12 +186,14 @@ static void init_data(args_t *args)
179
186
bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Float,Description=\"Pairwise Lewontin's D' (PMID:19433632) with the %s site\">" ,args -> ld_annot [VCFBUF_LD_IDX_LD ],args -> ld_annot_pos [VCFBUF_LD_IDX_LD ]);
180
187
bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The position of the site for which %s was calculated\">" ,args -> ld_annot_pos [VCFBUF_LD_IDX_LD ],args -> ld_annot [VCFBUF_LD_IDX_LD ]);
181
188
}
182
- if ( args -> ld_annot [VCFBUF_LD_IDX_HD ] )
189
+ if ( args -> ld_annot [VCFBUF_LD_IDX_RD ] )
183
190
{
184
- bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Float,Description=\"Pairwise Ragsdale's \\hat{D} (PMID:31697386) with the %s site\">" ,args -> ld_annot [VCFBUF_LD_IDX_HD ],args -> ld_annot_pos [VCFBUF_LD_IDX_HD ]);
185
- bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The position of the site for which %s was calculated\">" ,args -> ld_annot_pos [VCFBUF_LD_IDX_HD ],args -> ld_annot [VCFBUF_LD_IDX_HD ]);
191
+ bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Float,Description=\"Pairwise Ragsdale's \\hat{D} (PMID:31697386) with the %s site\">" ,args -> ld_annot [VCFBUF_LD_IDX_RD ],args -> ld_annot_pos [VCFBUF_LD_IDX_RD ]);
192
+ bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The position of the site for which %s was calculated\">" ,args -> ld_annot_pos [VCFBUF_LD_IDX_RD ],args -> ld_annot [VCFBUF_LD_IDX_RD ]);
186
193
}
187
194
}
195
+ if ( args -> cluster_annot )
196
+ bcf_hdr_printf (args -> hdr ,"##INFO=<ID=%s,Number=1,Type=Integer,Description=\"The number of variants within %d bp of the site\">" ,args -> cluster_annot ,args -> ld_win );
188
197
if ( bcf_hdr_write (args -> out_fh , args -> hdr )!= 0 ) error ("[%s] Error: cannot write to %s\n" , __func__ ,args -> output_fname );
189
198
if ( init_index2 (args -> out_fh ,args -> hdr ,args -> output_fname ,
190
199
& args -> index_fn , args -> write_index )< 0 )
@@ -193,7 +202,16 @@ static void init_data(args_t *args)
193
202
if ( args -> ld_filter && strcmp ("." ,args -> ld_filter ) )
194
203
args -> ld_filter_id = bcf_hdr_id2int (args -> hdr , BCF_DT_ID , args -> ld_filter );
195
204
205
+ if ( args -> ld_win > 0 && (args -> max_cluster || args -> cluster_annot ) )
206
+ {
207
+ fprintf (stderr ,"Warning: assuming `-w %dbp` was intended instead of `-w %d`\n" ,args -> ld_win ,args -> ld_win );
208
+ args -> ld_win *= -1 ;
209
+ if ( args -> max_cluster && - args -> ld_win <= args -> max_cluster ) error ("Error: -w must be bigger than -m\n" );
210
+ }
211
+
196
212
args -> vcfbuf = vcfbuf_init (args -> hdr , args -> ld_win );
213
+ if ( args -> max_cluster ) vcfbuf_set (args -> vcfbuf ,CLUSTER_PRUNE ,args -> max_cluster );
214
+ if ( args -> cluster_annot ) vcfbuf_set (args -> vcfbuf ,CLUSTER_SIZE ,1 );
197
215
if ( args -> ld_max_set [VCFBUF_LD_IDX_R2 ] ) vcfbuf_set (args -> vcfbuf ,LD_MAX_R2 ,args -> ld_max [VCFBUF_LD_IDX_R2 ]);
198
216
if ( args -> ld_max_set [VCFBUF_LD_IDX_LD ] ) vcfbuf_set (args -> vcfbuf ,LD_MAX_LD ,args -> ld_max [VCFBUF_LD_IDX_LD ]);
199
217
if ( args -> ld_max_set [VCFBUF_LD_IDX_HD ] ) vcfbuf_set (args -> vcfbuf ,LD_MAX_HD ,args -> ld_max [VCFBUF_LD_IDX_HD ]);
@@ -203,6 +221,10 @@ static void init_data(args_t *args)
203
221
hts_srand48 (args -> rseed );
204
222
}
205
223
if ( args -> rand_missing ) vcfbuf_set (args -> vcfbuf ,LD_RAND_MISSING ,1 );
224
+ if ( args -> max_cluster )
225
+ {
226
+ vcfbuf_set (args -> vcfbuf ,CLUSTER_PRUNE ,args -> max_cluster );
227
+ }
206
228
if ( args -> nsites )
207
229
{
208
230
vcfbuf_set (args -> vcfbuf ,PRUNE_NSITES ,args -> nsites );
@@ -235,7 +257,18 @@ static void flush(args_t *args, int flush_all)
235
257
{
236
258
bcf1_t * rec ;
237
259
while ( (rec = vcfbuf_flush (args -> vcfbuf , flush_all )) )
260
+ {
261
+ if ( args -> cluster_annot )
262
+ {
263
+ int is_marked = vcfbuf_get_val (args -> vcfbuf ,int ,CLUSTER_SIZE );
264
+ if ( is_marked > 0 )
265
+ {
266
+ int32_t val = is_marked ;
267
+ bcf_update_info_int32 (args -> hdr , rec , args -> cluster_annot , & val , 1 );
268
+ }
269
+ }
238
270
if ( bcf_write1 (args -> out_fh , args -> hdr , rec )!= 0 ) error ("[%s] Error: cannot write to %s\n" , __func__ ,args -> output_fname );
271
+ }
239
272
}
240
273
static void process (args_t * args )
241
274
{
@@ -361,10 +394,14 @@ int run(int argc, char **argv)
361
394
args -> ld_annot_pos [VCFBUF_LD_IDX_LD ] = "POS_LD" ;
362
395
args -> ld_annot [VCFBUF_LD_IDX_LD ] = "LD" ;
363
396
}
364
- else if ( !strcasecmp ("HD" ,tag [i ]) )
397
+ else if ( !strcasecmp ("RD" ,tag [i ]) || !strcasecmp ("HD" ,tag [i ]) )
398
+ {
399
+ args -> ld_annot_pos [VCFBUF_LD_IDX_RD ] = "POS_RD" ;
400
+ args -> ld_annot [VCFBUF_LD_IDX_RD ] = "RD" ;
401
+ }
402
+ else if ( !strcasecmp ("COUNT" ,tag [i ]) )
365
403
{
366
- args -> ld_annot_pos [VCFBUF_LD_IDX_HD ] = "POS_HD" ;
367
- args -> ld_annot [VCFBUF_LD_IDX_HD ] = "HD" ;
404
+ args -> cluster_annot = "CLUSTER_SIZE" ;
368
405
}
369
406
else error ("The tag \"%s\" is not supported\n" ,tag [i ]);
370
407
free (tag [i ]);
@@ -395,10 +432,14 @@ int run(int argc, char **argv)
395
432
args -> ld_max_set [VCFBUF_LD_IDX_LD ] = 1 ;
396
433
args -> ld_max [VCFBUF_LD_IDX_LD ] = strtod (optarg + 3 ,& tmp );
397
434
}
398
- else if ( !strncasecmp ("HD=" ,optarg ,3 ) )
435
+ else if ( !strncasecmp ("RD=" ,optarg ,3 ) || !strncasecmp ("HD=" ,optarg ,3 ) )
436
+ {
437
+ args -> ld_max_set [VCFBUF_LD_IDX_RD ] = 1 ;
438
+ args -> ld_max [VCFBUF_LD_IDX_RD ] = strtod (optarg + 3 ,& tmp );
439
+ }
440
+ else if ( !strncasecmp ("count=" ,optarg ,6 ) )
399
441
{
400
- args -> ld_max_set [VCFBUF_LD_IDX_HD ] = 1 ;
401
- args -> ld_max [VCFBUF_LD_IDX_HD ] = strtod (optarg + 3 ,& tmp );
442
+ args -> max_cluster = strtod (optarg + 6 ,& tmp );
402
443
}
403
444
else
404
445
{
0 commit comments