Skip to content

Commit 5b3b0c5

Browse files
committed
Generate genotypes
1 parent 5003c65 commit 5b3b0c5

File tree

2 files changed

+65
-2
lines changed

2 files changed

+65
-2
lines changed

hypothesis_vcf/strategies.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,28 @@ class Field:
3131
vcf_key: str
3232
vcf_type: str
3333
vcf_number: str
34+
description: str = "Generated field"
3435

3536
def get_header(self):
3637
return (
3738
f"##{self.category}=<"
3839
f"ID={self.vcf_key},"
3940
f"Type={self.vcf_type},"
4041
f"Number={self.vcf_number},"
41-
f'Description="{self.category},Type={self.vcf_type},Number={self.vcf_number}">'
42+
f'Description="{self.description}">'
4243
)
4344

4445

46+
# GT is a special case, since it has a special syntax, and must be listed as the first
47+
# format field (if present)
48+
GT = Field(
49+
category="FORMAT",
50+
vcf_key="GT",
51+
vcf_type="String",
52+
vcf_number="1",
53+
description="Genotype",
54+
)
55+
4556
# references to the VCF spec are for https://samtools.github.io/hts-specs/VCFv4.3.pdf
4657

4758
# [Table 1: Reserved INFO keys]
@@ -133,7 +144,7 @@ def vcf_numbers(category, max_number):
133144
def vcf_fields(category, max_number):
134145
# info flag fields must have number 0
135146
# non-flag fields can't have number 0
136-
return builds(
147+
general_fields = builds(
137148
Field,
138149
category=just(category),
139150
vcf_key=vcf_field_keys(category),
@@ -143,6 +154,11 @@ def vcf_fields(category, max_number):
143154
lambda field: (field.vcf_type == "Flag" and field.vcf_number == "0")
144155
or (field.vcf_type != "Flag" and field.vcf_number != "0")
145156
)
157+
if category == "INFO":
158+
return general_fields
159+
else:
160+
# FORMAT: GT special case
161+
return one_of(just(GT), general_fields)
146162

147163

148164
# [1.6.1 Fixed fields]
@@ -183,8 +199,31 @@ def qualities():
183199
)
184200

185201

202+
# [1.6.2 Genotype fields]
203+
204+
205+
def genotypes(alleles, ploidy):
206+
def gt_str(allele_indexes, phased):
207+
sep = "|" if phased else "/"
208+
return sep.join(
209+
[str(idx) if idx is not None else "." for idx in allele_indexes]
210+
)
211+
212+
return builds(
213+
gt_str,
214+
lists(
215+
one_of(integers(0, alleles - 1), none()), min_size=ploidy, max_size=ploidy
216+
),
217+
booleans(),
218+
)
219+
220+
186221
@composite
187222
def vcf_values(draw, field, *, max_number, alt_alleles, ploidy):
223+
# GT special case
224+
if field is GT:
225+
return [draw(genotypes(alleles=alt_alleles + 1, ploidy=ploidy))]
226+
188227
# [1.3 Data types]
189228
if field.vcf_type == "Integer":
190229
# some integer values at lower end of range are not allowed
@@ -231,6 +270,15 @@ def vcf_number_to_ints(vcf_number, *, max_number, alt_alleles, ploidy):
231270
raise ValueError(f"Number '{vcf_number}' is not supported.")
232271

233272

273+
def ensure_gt_first(format_fields):
274+
# GT must be the first field if present [1.6.2 Genotype fields]
275+
try:
276+
i = format_fields.index(GT)
277+
format_fields.insert(0, format_fields.pop(i))
278+
except ValueError:
279+
pass
280+
281+
234282
@composite
235283
def vcf(
236284
draw,
@@ -290,6 +338,7 @@ def vcf(
290338
unique_by=lambda f: f.vcf_key.lower(),
291339
)
292340
)
341+
ensure_gt_first(format_fields)
293342
sample_ids = draw(
294343
lists(
295344
text(alphabet=ALPHANUMERIC, min_size=1), max_size=max_samples, unique=True

tests/test_strategies.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import re
2+
13
import pysam
24
from hypothesis import HealthCheck, given, note, settings
35
from hypothesis.strategies import data
@@ -7,6 +9,7 @@
79
RESERVED_FORMAT_KEYS,
810
RESERVED_INFO_KEYS,
911
Field,
12+
genotypes,
1013
vcf_field_keys,
1114
vcf_fields,
1215
vcf_values,
@@ -40,6 +43,17 @@ def test_format_fields(data):
4043
assert field.vcf_number != "0"
4144

4245

46+
@given(data=data())
47+
def test_genotypes(data):
48+
alleles = 3
49+
ploidy = 2
50+
gt = data.draw(genotypes(alleles=alleles, ploidy=ploidy))
51+
allele_index_strs = re.split("[/|]", gt)
52+
assert len(allele_index_strs) == ploidy
53+
allele_indexes = [int(idx) if idx != "." else None for idx in allele_index_strs]
54+
assert all(0 <= idx < alleles for idx in allele_indexes if idx is not None)
55+
56+
4357
@given(data=data())
4458
def test_vcf_values(data):
4559
field = Field("INFO", "I1", "Integer", "1")

0 commit comments

Comments
 (0)