@@ -31,17 +31,28 @@ class Field:
31
31
vcf_key : str
32
32
vcf_type : str
33
33
vcf_number : str
34
+ description : str = "Generated field"
34
35
35
36
def get_header(self):
    """Return the VCF meta-information header line declaring this field.

    The line has the form
    ``##<category>=<ID=<key>,Type=<type>,Number=<number>,Description="<text>">``.

    Backslashes and double quotes in ``self.description`` are
    backslash-escaped so the quoted Description value stays well-formed
    even for descriptions containing those characters.
    """
    # Escape backslashes first so the ones added for quotes are not doubled.
    description = self.description.replace("\\", "\\\\").replace('"', '\\"')
    return (
        f"##{self.category}=<"
        f"ID={self.vcf_key},"
        f"Type={self.vcf_type},"
        f"Number={self.vcf_number},"
        f'Description="{description}">'
    )
43
44
44
45
46
# Genotype (GT) is kept apart from the generated fields: it has a special
# syntax, and it must be listed as the first format field (if present).
GT = Field(
    vcf_key="GT",
    category="FORMAT",
    vcf_number="1",
    vcf_type="String",
    description="Genotype",
)
55
+
45
56
# references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf
46
57
47
58
# [Table 1: Reserved INFO keys]
@@ -133,7 +144,7 @@ def vcf_numbers(category, max_number):
133
144
def vcf_fields (category , max_number ):
134
145
# info flag fields must have number 0
135
146
# non-flag fields can't have number 0
136
- return builds (
147
+ general_fields = builds (
137
148
Field ,
138
149
category = just (category ),
139
150
vcf_key = vcf_field_keys (category ),
@@ -143,6 +154,11 @@ def vcf_fields(category, max_number):
143
154
lambda field : (field .vcf_type == "Flag" and field .vcf_number == "0" )
144
155
or (field .vcf_type != "Flag" and field .vcf_number != "0" )
145
156
)
157
+ if category == "INFO" :
158
+ return general_fields
159
+ else :
160
+ # FORMAT: GT special case
161
+ return one_of (just (GT ), general_fields )
146
162
147
163
148
164
# [1.6.1 Fixed fields]
@@ -183,8 +199,31 @@ def qualities():
183
199
)
184
200
185
201
202
+ # [1.6.2 Genotype fields]
203
+
204
+
205
def genotypes(alleles, ploidy):
    """Build a strategy that generates GT strings.

    Each generated value joins exactly ``ploidy`` entries, where every entry
    is either an allele index drawn from ``integers(0, alleles - 1)`` or a
    missing call rendered as ``.``. The entries are joined with ``|`` when
    the drawn boolean marks the genotype as phased, and with ``/`` otherwise.
    """

    def gt_str(allele_indexes, phased):
        # Choose the separator from the phasing flag.
        separator = "/"
        if phased:
            separator = "|"
        # A missing allele (None) is written as "."; others as their index.
        rendered = ["." if idx is None else str(idx) for idx in allele_indexes]
        return separator.join(rendered)

    allele_index = one_of(integers(0, alleles - 1), none())
    allele_calls = lists(allele_index, min_size=ploidy, max_size=ploidy)
    return builds(gt_str, allele_calls, booleans())
219
+
220
+
186
221
@composite
187
222
def vcf_values (draw , field , * , max_number , alt_alleles , ploidy ):
223
+ # GT special case
224
+ if field is GT :
225
+ return [draw (genotypes (alleles = alt_alleles + 1 , ploidy = ploidy ))]
226
+
188
227
# [1.3 Data types]
189
228
if field .vcf_type == "Integer" :
190
229
# some integer values at lower end of range are not allowed
@@ -231,6 +270,15 @@ def vcf_number_to_ints(vcf_number, *, max_number, alt_alleles, ploidy):
231
270
raise ValueError (f"Number '{ vcf_number } ' is not supported." )
232
271
233
272
273
def ensure_gt_first(format_fields):
    """Move GT to the front of ``format_fields`` in place, if it is present.

    GT must be the first field if present [1.6.2 Genotype fields]. A list
    that does not contain GT is left untouched.
    """
    try:
        gt_position = format_fields.index(GT)
    except ValueError:
        # GT is not among the fields, so there is nothing to reorder.
        return
    gt_field = format_fields.pop(gt_position)
    format_fields.insert(0, gt_field)
280
+
281
+
234
282
@composite
235
283
def vcf (
236
284
draw ,
@@ -290,6 +338,7 @@ def vcf(
290
338
unique_by = lambda f : f .vcf_key .lower (),
291
339
)
292
340
)
341
+ ensure_gt_first (format_fields )
293
342
sample_ids = draw (
294
343
lists (
295
344
text (alphabet = ALPHANUMERIC , min_size = 1 ), max_size = max_samples , unique = True
0 commit comments