Skip to content

Commit 1574694

Browse files
committed
Use a different metric for selecting good sites
Originally, the sort function would sort by VAF counts, considering big-VAF bins as more significant. In this commit we switch to the overall number of alternate reads, which leads to more stable var2 profiles
1 parent 5cff2de commit 1574694

File tree

1 file changed

+60
-24
lines changed

1 file changed

+60
-24
lines changed

misc/vrfs-variances

Lines changed: 60 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -41,39 +41,61 @@ sub error
4141
my (@msg) = @_;
4242
if ( scalar @msg ) { confess @msg; }
4343
print
44-
"About: Parse bcftools/vrfs output and from a subset of sites calculate variances.\n",
45-
"Usage: vrfs-variances [OPTIONS]\n",
44+
"About: Parse bcftools/vrfs output and calculate variances from a subset of automatically selected\n",
45+
" reference sites\n",
46+
"Usage: zcat scores.txt.gz | vrfs-variances [OPTIONS]\n",
4647
"Options:\n",
47-
" -n, --ndat NUM Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n",
48-
" -r, --rand-noise INT Add random noise, INT is a seed for reproducibility, or 0 for no seed [0]\n",
49-
" -s, --list-sites List sites passing the -n setting\n",
50-
" -v, --list-var2 Output in a format suitable for `bcftools +vrfs -r file`\n",
51-
" -h, -?, --help This help message\n",
48+
" -n, --ndat NUM Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n",
49+
" -r, --rand-noise SEED[,RATE] Add random noise, 0 for random seed [0,1e-3]\n",
50+
" -s, --list-sites List sites passing the -n setting\n",
51+
" -S, --sort-func FUNC Reference site selection is based on the ordering defined by FUNC [nalt]\n",
52+
" nalt .. sort by the overall number of alternate reads\n",
53+
" vaf .. sort by the big-VAF bins being most significant first\n",
54+
" -v, --list-var2 Output in a format suitable for `bcftools +vrfs -r file`\n",
55+
" -h, -?, --help This help message\n",
5256
"\n";
5357
exit -1;
5458
}
5559
sub parse_params
5660
{
57-
my $opts = { ndat=>0.2 };
61+
my $opts =
62+
{
63+
ndat => 0.2,
64+
sort_func => \&cmp_dist_nalt,
65+
};
5866
if ( -t STDIN && !@ARGV ) { error(); }
5967
while (defined(my $arg=shift(@ARGV)))
6068
{
61-
if ( $arg eq '-r' or $arg eq '--rand-noise' ) { $$opts{rand_noise}=shift(@ARGV); next }
69+
if ( $arg eq '-r' or $arg eq '--rand-noise' )
70+
{
71+
my ($seed,$rate) = split(/,/,shift(@ARGV));
72+
$$opts{rand_seed} = $seed;
73+
$$opts{rand_rate} = defined $rate ? $rate : 1e-3;
74+
next;
75+
}
6276
if ( $arg eq '-s' or $arg eq '--list-sites' ) { $$opts{list_sites}=1; next }
77+
if ( $arg eq '-S' or $arg eq '--sort-func' )
78+
{
79+
my $func = shift(@ARGV);
80+
if ( $func eq 'nalt' ) { $$opts{sort_func} = \&cmp_dist_nalt; }
81+
elsif ( $func eq 'vaf' ) { $$opts{sort_func} = \&cmp_dist_max_vaf; }
82+
else { error("Error: the sort function \"$func\" is not supported\n"); }
83+
next;
84+
}
6385
if ( $arg eq '-v' or $arg eq '--list-var2' ) { $$opts{list_var2}=1; next }
6486
if ( $arg eq '-n' or $arg eq '--ndat' ) { $$opts{ndat}=shift(@ARGV); next }
6587
if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); }
6688
error("Unknown parameter \"$arg\". Run -h for help.\n");
6789
}
68-
if ( exists($$opts{rand_noise}) )
90+
if ( exists($$opts{rand_seed}) )
6991
{
70-
if ( $$opts{rand_noise} ) { srand($$opts{rand_noise}); }
92+
if ( $$opts{rand_seed} ) { srand($$opts{rand_seed}); }
7193
else { srand(); }
7294
}
7395
return $opts;
7496
}
7597

76-
sub cmp_dist
98+
sub cmp_dist_max_vaf
7799
{
78100
for (my $i=@{$$a{dist}}-1; $i>=0; $i--)
79101
{
@@ -83,9 +105,28 @@ sub cmp_dist
83105
return 0;
84106
}
85107

108+
sub cmp_dist_nalt
109+
{
110+
my ($sa,$sb,$na,$nb);
111+
for (my $i=0; $i<@{$$a{dist}}; $i++)
112+
{
113+
# sa,sb .. normalize to the same number of samples the site had data for
114+
$sa += $$a{dist}[$i];
115+
$sb += $$b{dist}[$i];
116+
117+
# na,nb .. the number of alternate reads across all samples
118+
$na += $$a{dist}[$i]*$i;
119+
$nb += $$b{dist}[$i]*$i;
120+
}
121+
$na /= $sa;
122+
$nb /= $sb;
123+
return $na<=>$nb;
124+
}
125+
86126
sub parse_and_calc
87127
{
88128
my ($opts) = @_;
129+
my $sort_func = $$opts{sort_func};
89130
my @dat = ();
90131
while (my $line=<STDIN>)
91132
{
@@ -96,20 +137,19 @@ sub parse_and_calc
96137
my @dist = split(/-/,$col[-1]);
97138
push @dat, { line=>$line, dist=>\@dist };
98139
}
99-
my @sdat = sort cmp_dist @dat;
100-
my $nmax = $$opts{ndat};
101-
if ( $nmax <= 1 ) { $nmax = int($nmax * scalar @sdat); }
140+
my @sdat = sort $sort_func @dat;
141+
my $ndat = $$opts{ndat};
142+
if ( $ndat <= 1 ) { $ndat = int($ndat * scalar @sdat); }
102143
my $n = 0;
103144
my @avg = ();
104145
my @avg2 = ();
105146
for my $x (@sdat)
106147
{
107-
my $rand = -1;
108-
if ( exists($$opts{rand_noise}) && rand(1000)<10 ) { $rand = int(rand(@{$$x{dist}})); }
109148
my $max = 0;
110149
for (my $i=0; $i<@{$$x{dist}}; $i++)
111150
{
112-
if ( $rand==$i ) { $$x{dist}[$i]++; }
151+
# Add random noise in a very simplistic way: optionally increment one or more VAF bins
152+
if ( $$opts{rand_seed} && rand(1./$$opts{rand_rate})<=1 ) { $$x{dist}[$i]++; }
113153
if ( $max < $$x{dist}[$i] ) { $max = $$x{dist}[$i]; }
114154
}
115155
for (my $i=0; $i<@{$$x{dist}}; $i++)
@@ -119,11 +159,7 @@ sub parse_and_calc
119159
$avg2[$i] += $val * $val;
120160
}
121161
if ( $$opts{list_sites} ) { print $$x{line}."\n"; }
122-
if ( ++$n >= $nmax )
123-
{
124-
if ( !$$opts{list_var2} ) { print $$x{line}."\n"; }
125-
last;
126-
}
162+
if ( ++$n >= $ndat ) { last; }
127163
}
128164
if ( $$opts{list_sites} ) { return; }
129165
$avg2[0] = 1;
@@ -133,7 +169,7 @@ sub parse_and_calc
133169
$avg2[$i] = $avg2[$i]/$n - $avg[$i]*$avg[$i];
134170
if ( $avg2[$i]<=0 )
135171
{
136-
# yes, it be smaller than zero, machine precision in play when the values are close to zero
172+
# yes, it can be smaller than zero as well, machine precision is in play when the values are close to zero
137173
$avg2[$i] = $i>0 ? $avg2[$i-1]/2 : 1;
138174
}
139175
if ( !exists($$opts{rand_noise}) && $avg2[$i] < 1e-9 )

0 commit comments

Comments
 (0)