@@ -41,39 +41,61 @@ sub error
41
41
my (@msg ) = @_ ;
42
42
if ( scalar @msg ) { confess @msg ; }
43
43
print
44
- " About: Parse bcftools/vrfs output and from a subset of sites calculate variances.\n " ,
45
- " Usage: vrfs-variances [OPTIONS]\n " ,
44
+ " About: Parse bcftools/vrfs output and calculate variances from a subset of automatically selected\n " ,
45
+ " reference sites\n " ,
46
+ " Usage: zcat scores.txt.gz | vrfs-variances [OPTIONS]\n " ,
46
47
" Options:\n " ,
47
- " -n, --ndat NUM Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n " ,
48
- " -r, --rand-noise INT Add random noise, INT is a seed for reproducibility, or 0 for no seed [0]\n " ,
49
- " -s, --list-sites List sites passing the -n setting\n " ,
50
- " -v, --list-var2 Output in a format suitable for `bcftools +vrfs -r file`\n " ,
51
- " -h, -?, --help This help message\n " ,
48
+ " -n, --ndat NUM Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n " ,
49
+ " -r, --rand-noise SEED[,RATE] Add random noise, 0 for random seed [0,1e-3]\n " ,
50
+ " -s, --list-sites List sites passing the -n setting\n " ,
51
+ " -S, --sort-func FUNC Reference site selection is based on the ordering defined by FUNC [nalt]\n " ,
52
+ " nalt .. sort by the overall number of alternate reads\n " ,
53
+ " vaf .. sort by the big-VAF bins being most significant first\n " ,
54
+ " -v, --list-var2 Output in a format suitable for `bcftools +vrfs -r file`\n " ,
55
+ " -h, -?, --help This help message\n " ,
52
56
" \n " ;
53
57
exit -1;
54
58
}
55
59
# Parse the command line (@ARGV) into an options hash and, when -r was given,
# seed the random number generator.
#
# Returns a hash reference with the keys:
#   ndat        .. value of -n/--ndat, 0.2 unless given
#   sort_func   .. code reference used to order the sites; \&cmp_dist_nalt
#                  unless "-S vaf" selects \&cmp_dist_max_vaf
#   rand_seed   .. SEED part of -r/--rand-noise, present only when -r was given
#   rand_rate   .. RATE part of -r/--rand-noise, 1e-3 unless given
#   list_sites  .. set to 1 by -s/--list-sites
#   list_var2   .. set to 1 by -v/--list-var2
#
# Calls error() on -h, on an unknown option, on an option that is missing its
# value, on an unusable RATE, and when started on a terminal with no arguments.
#
# NOTE(review): parse_and_calc tests $$opts{rand_seed} for truth before adding
# noise, so "-r 0" seeds the generator here but appears to add no noise there;
# it also still checks $$opts{rand_noise}, a key this routine does not set.
# Confirm both against the intended behaviour.
sub parse_params
{
    my $opts =
    {
        ndat      => 0.2,
        sort_func => \&cmp_dist_nalt,
    };
    if ( -t STDIN && !@ARGV ) { error(); }

    # Fetch the value that must follow an option; a bare trailing option is
    # reported instead of silently being treated as undef.
    my $next_value = sub
    {
        my ($opt) = @_;
        my $val = shift(@ARGV);
        if ( !defined $val ) { error("Error: the option \"$opt\" requires a value\n"); }
        return $val;
    };

    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-r' or $arg eq '--rand-noise' )
        {
            my ($seed,$rate) = split(/,/,$next_value->($arg));
            if ( !defined $seed or $seed eq '' ) { $seed = 0; }
            if ( !defined $rate or $rate eq '' ) { $rate = 1e-3; }

            # The rate is later used as a divisor, it must be a positive number
            if ( $rate !~ /^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/ or $rate <= 0 )
            {
                error("Error: the noise rate \"$rate\" is not a positive number\n");
            }
            $$opts{rand_seed} = $seed;
            $$opts{rand_rate} = $rate;
            next;
        }
        if ( $arg eq '-s' or $arg eq '--list-sites' ) { $$opts{list_sites}=1; next }
        if ( $arg eq '-S' or $arg eq '--sort-func' )
        {
            my $func = $next_value->($arg);
            if ( $func eq 'nalt' ) { $$opts{sort_func} = \&cmp_dist_nalt; }
            elsif ( $func eq 'vaf' ) { $$opts{sort_func} = \&cmp_dist_max_vaf; }
            else { error("Error: the sort function \"$func\" is not supported\n"); }
            next;
        }
        if ( $arg eq '-v' or $arg eq '--list-var2' ) { $$opts{list_var2}=1; next }
        if ( $arg eq '-n' or $arg eq '--ndat' ) { $$opts{ndat}=$next_value->($arg); next }
        if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); }
        error("Unknown parameter \"$arg\". Run -h for help.\n");
    }

    # A non-zero seed makes the run reproducible, zero asks for a random seed
    if ( exists($$opts{rand_seed}) )
    {
        if ( $$opts{rand_seed} ) { srand($$opts{rand_seed}); }
        else { srand(); }
    }
    return $opts;
}
75
97
76
- sub cmp_dist
98
+ sub cmp_dist_max_vaf
77
99
{
78
100
for (my $i =@{$$a {dist }}-1; $i >=0; $i --)
79
101
{
@@ -83,9 +105,28 @@ sub cmp_dist
83
105
return 0;
84
106
}
85
107
108
# Sort comparator, to be used as `sort cmp_dist_nalt @sites` or through a code
# reference: orders the two sites held in the sort globals $a and $b by their
# normalized number of alternate reads, smallest first.
#
# Each site is a hash reference whose {dist} entry is an array reference of
# per-bin counts. Returns -1, 0 or 1, as the <=> operator does.
sub cmp_dist_nalt
{
    return _mean_nalt($$a{dist}) <=> _mean_nalt($$b{dist});
}

# Score of a single site: sum(dist[i]*i) / sum(dist[i]).
#   sum(dist[i])   .. normalizes to the number of samples the site had data for
#   sum(dist[i]*i) .. the number of alternate reads across all samples
# A site with no counts at all scores 0 rather than dividing by zero. Every
# site is summed over its own bins, so the result does not depend on which of
# the two compared sites happens to be $a.
sub _mean_nalt
{
    my ($dist) = @_;
    my $nsmpl = 0;
    my $nalt  = 0;
    for (my $i=0; $i<@$dist; $i++)
    {
        $nsmpl += $$dist[$i];
        $nalt  += $$dist[$i]*$i;
    }
    return $nsmpl ? $nalt/$nsmpl : 0;
}
125
+
86
126
sub parse_and_calc
87
127
{
88
128
my ($opts ) = @_ ;
129
+ my $sort_func = $$opts {sort_func };
89
130
my @dat = ();
90
131
while (my $line =<STDIN >)
91
132
{
@@ -96,20 +137,19 @@ sub parse_and_calc
96
137
my @dist = split (/ -/ ,$col [-1]);
97
138
push @dat , { line => $line , dist => \@dist };
98
139
}
99
- my @sdat = sort cmp_dist @dat ;
100
- my $nmax = $$opts {ndat };
101
- if ( $nmax <= 1 ) { $nmax = int ($nmax * scalar @sdat ); }
140
+ my @sdat = sort $sort_func @dat ;
141
+ my $ndat = $$opts {ndat };
142
+ if ( $ndat <= 1 ) { $ndat = int ($ndat * scalar @sdat ); }
102
143
my $n = 0;
103
144
my @avg = ();
104
145
my @avg2 = ();
105
146
for my $x (@sdat )
106
147
{
107
- my $rand = -1;
108
- if ( exists ($$opts {rand_noise }) && rand (1000)<10 ) { $rand = int (rand (@{$$x {dist }})); }
109
148
my $max = 0;
110
149
for (my $i =0; $i <@{$$x {dist }}; $i ++)
111
150
{
112
- if ( $rand ==$i ) { $$x {dist }[$i ]++; }
151
+ # Add random noise in a very simplistic way: optionally increment one or more VAF bins
152
+ if ( $$opts {rand_seed } && rand (1./$$opts {rand_rate })<=1 ) { $$x {dist }[$i ]++; }
113
153
if ( $max < $$x {dist }[$i ] ) { $max = $$x {dist }[$i ]; }
114
154
}
115
155
for (my $i =0; $i <@{$$x {dist }}; $i ++)
@@ -119,11 +159,7 @@ sub parse_and_calc
119
159
$avg2 [$i ] += $val * $val ;
120
160
}
121
161
if ( $$opts {list_sites } ) { print $$x {line }." \n " ; }
122
- if ( ++$n >= $nmax )
123
- {
124
- if ( !$$opts {list_var2 } ) { print $$x {line }." \n " ; }
125
- last ;
126
- }
162
+ if ( ++$n >= $ndat ) { last ; }
127
163
}
128
164
if ( $$opts {list_sites } ) { return ; }
129
165
$avg2 [0] = 1;
@@ -133,7 +169,7 @@ sub parse_and_calc
133
169
$avg2 [$i ] = $avg2 [$i ]/$n - $avg [$i ]*$avg [$i ];
134
170
if ( $avg2 [$i ]<=0 )
135
171
{
136
- # yes, it be smaller than zero, machine precision in play when the values are close to zero
172
+ # yes, it can be smaller than zero as well , machine precision is in play when the values are close to zero
137
173
$avg2 [$i ] = $i >0 ? $avg2 [$i -1]/2 : 1;
138
174
}
139
175
if ( !exists ($$opts {rand_noise }) && $avg2 [$i ] < 1e-9 )
0 commit comments