Skip to content

Commit 1252c15

Browse files
Move and rename VcfParserConstants to vcf_header_io. (googlegenomics#176)
1 parent ef31462 commit 1252c15

File tree

5 files changed: +82 additions, -65 deletions

gcp_variant_transforms/beam_io/vcf_header_io.py

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,27 @@
2828

2929
from gcp_variant_transforms.beam_io import vcfio
3030

31-
_all__ = ['VcfHeader', 'VcfHeaderSource', 'ReadAllVcfHeaders',
32-
'ReadVcfHeaders', 'WriteVcfHeaders']
31+
32+
class VcfHeaderFieldTypeConstants(object):
  """Constants for types from VCF header.

  Each attribute holds the literal string used for a field's ``Type`` in a
  VCF header definition. Every type is listed exactly once.
  """
  FLOAT = 'Float'
  INTEGER = 'Integer'
  STRING = 'String'
  FLAG = 'Flag'
  CHARACTER = 'Character'
40+
41+
42+
class VcfParserHeaderKeyConstants(object):
43+
"""Constants for header fields from the parser (currently PyVCF)."""
44+
ID = 'id'
45+
NUM = 'num'
46+
TYPE = 'type'
47+
DESC = 'desc'
48+
SOURCE = 'source'
49+
VERSION = 'version'
50+
LENGTH = 'length'
51+
3352

3453
class VcfHeader(object):
3554
"""Container for header data."""
@@ -310,19 +329,19 @@ def _format_header_key_value(self, key, value):
310329
return '{}={}'.format(key, value)
311330

312331
def _format_header_key(self, key):
313-
if key == 'id':
332+
if key == VcfParserHeaderKeyConstants.ID:
314333
return _HeaderFieldKeyConstants.ID
315-
elif key == 'num':
334+
elif key == VcfParserHeaderKeyConstants.NUM:
316335
return _HeaderFieldKeyConstants.NUMBER
317-
elif key == 'desc':
336+
elif key == VcfParserHeaderKeyConstants.DESC:
318337
return _HeaderFieldKeyConstants.DESCRIPTION
319-
elif key == 'type':
338+
elif key == VcfParserHeaderKeyConstants.TYPE:
320339
return _HeaderFieldKeyConstants.TYPE
321-
elif key == 'source':
340+
elif key == VcfParserHeaderKeyConstants.SOURCE:
322341
return _HeaderFieldKeyConstants.SOURCE
323-
elif key == 'version':
342+
elif key == VcfParserHeaderKeyConstants.VERSION:
324343
return _HeaderFieldKeyConstants.VERSION
325-
elif key == 'length':
344+
elif key == VcfParserHeaderKeyConstants.LENGTH:
326345
return _HeaderFieldKeyConstants.LENGTH
327346
else:
328347
raise ValueError('Invalid VCF header key {}.'.format(key))

gcp_variant_transforms/libs/vcf_field_conflict_resolver.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,11 @@
1616

1717
import vcf
1818

19+
from gcp_variant_transforms.beam_io import vcf_header_io
1920
from gcp_variant_transforms.libs import bigquery_schema_descriptor # pylint: disable=unused-import
2021
from gcp_variant_transforms.libs import bigquery_util
2122

2223

23-
class VcfParserConstants(object):
24-
"""Constants for type and number from VCF parser."""
25-
FLOAT = 'Float'
26-
INTEGER = 'Integer'
27-
STRING = 'String'
28-
FLAG = 'Flag'
29-
CHARACTER = 'Character'
30-
NUM = 'num'
31-
STRING = 'String'
32-
TYPE = 'type'
33-
34-
3524
class FieldConflictResolver(object):
3625
"""A class for resolving all VCF field related mismatches.
3726
@@ -114,9 +103,9 @@ def resolve_attribute_conflict(self, attribute_name, first_attribute_value,
114103
Raises:
115104
ValueError: if the conflict cannot be resolved.
116105
"""
117-
if attribute_name == VcfParserConstants.TYPE:
106+
if attribute_name == vcf_header_io.VcfParserHeaderKeyConstants.TYPE:
118107
return self._resolve_type(first_attribute_value, second_attribute_value)
119-
elif attribute_name == VcfParserConstants.NUM:
108+
elif attribute_name == vcf_header_io.VcfParserHeaderKeyConstants.NUM:
120109
return self._resolve_number(first_attribute_value, second_attribute_value)
121110
else:
122111
# We only care about conflicts in 'num' and 'type' attributes.
@@ -125,13 +114,14 @@ def resolve_attribute_conflict(self, attribute_name, first_attribute_value,
125114
return first_attribute_value
126115

127116
def _resolve_type(self, first, second):
117+
type_constants = vcf_header_io.VcfHeaderFieldTypeConstants
128118
if first == second:
129119
return first
130-
elif (first in (VcfParserConstants.INTEGER, VcfParserConstants.FLOAT) and
131-
second in (VcfParserConstants.INTEGER, VcfParserConstants.FLOAT)):
132-
return VcfParserConstants.FLOAT
120+
elif (first in (type_constants.INTEGER, type_constants.FLOAT) and
121+
second in (type_constants.INTEGER, type_constants.FLOAT)):
122+
return type_constants.FLOAT
133123
elif self._resolve_always:
134-
return VcfParserConstants.STRING
124+
return type_constants.STRING
135125
else:
136126
raise ValueError('Incompatible values cannot be resolved: '
137127
'{}, {}'.format(first, second))

gcp_variant_transforms/libs/vcf_field_conflict_resolver_test.py

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,18 @@
2020

2121
import vcf
2222

23+
from gcp_variant_transforms.beam_io.vcf_header_io import VcfHeaderFieldTypeConstants
24+
from gcp_variant_transforms.beam_io.vcf_header_io import VcfParserHeaderKeyConstants
2325
from gcp_variant_transforms.libs import bigquery_schema_descriptor
2426
from gcp_variant_transforms.libs import vcf_field_conflict_resolver
2527
from gcp_variant_transforms.libs.bigquery_util import TableFieldConstants
26-
from gcp_variant_transforms.libs.vcf_field_conflict_resolver import VcfParserConstants
2728

2829

2930
SchemaTestConfig = namedtuple('SchemaTestConfig',
3031
['schema_type', 'schema_mode', 'field_data',
3132
'expected_resolved_field_data'])
3233

34+
3335
class ConflictResolverTest(unittest.TestCase):
3436
"""Test case for :class:`FieldConflictResolver`."""
3537

@@ -169,79 +171,83 @@ def test_resolving_schema_conflict_type_and_number(self):
169171
def test_resolving_attribute_conflict_type(self):
170172
self.assertEqual(
171173
self._resolver.resolve_attribute_conflict(
172-
VcfParserConstants.TYPE,
173-
VcfParserConstants.INTEGER,
174-
VcfParserConstants.FLOAT),
175-
VcfParserConstants.FLOAT)
174+
VcfParserHeaderKeyConstants.TYPE,
175+
VcfHeaderFieldTypeConstants.INTEGER,
176+
VcfHeaderFieldTypeConstants.FLOAT),
177+
VcfHeaderFieldTypeConstants.FLOAT)
176178
with self.assertRaises(ValueError):
177179
self._resolver.resolve_attribute_conflict(
178-
VcfParserConstants.TYPE,
179-
VcfParserConstants.INTEGER,
180-
VcfParserConstants.STRING)
180+
VcfParserHeaderKeyConstants.TYPE,
181+
VcfHeaderFieldTypeConstants.INTEGER,
182+
VcfHeaderFieldTypeConstants.STRING)
181183
self.fail('Should raise exception for unresolvable types')
182184

183185
def test_resolving_attribute_conflict_number(self):
184186
self.assertEqual(
185187
self._resolver.resolve_attribute_conflict(
186-
VcfParserConstants.NUM, 2, 3),
188+
VcfParserHeaderKeyConstants.NUM, 2, 3),
187189
None)
188190
self.assertEqual(
189191
self._resolver.resolve_attribute_conflict(
190-
VcfParserConstants.NUM, 2, None),
192+
VcfParserHeaderKeyConstants.NUM, 2, None),
191193
None)
192194
# Unresolvable cases.
193195
for i in [0, 1]:
194196
for j in [self._field_count('R'), self._field_count('G'),
195197
self._field_count('A'), 2, None]:
196198
with self.assertRaises(ValueError):
197199
self._resolver.resolve_attribute_conflict(
198-
VcfParserConstants.NUM, i, j)
200+
VcfParserHeaderKeyConstants.NUM, i, j)
199201
self.fail(
200202
'Should raise exception for unresolvable number: %d vs %d'%(i, j))
201203

202204
def test_resolving_attribute_conflict_in_number_allele(self):
203205
self.assertEqual(
204206
self._resolver_allele.resolve_attribute_conflict(
205-
VcfParserConstants.NUM, 2, 3),
207+
VcfParserHeaderKeyConstants.NUM, 2, 3),
206208
None)
207209
self.assertEqual(
208210
self._resolver_allele.resolve_attribute_conflict(
209-
VcfParserConstants.NUM, 2, None),
211+
VcfParserHeaderKeyConstants.NUM, 2, None),
210212
None)
211213
# Unresolvable cases.
212214
for i in [self._field_count('A')]:
213215
for j in [self._field_count('R'), self._field_count('G'), 0, 1, 2, None]:
214216
with self.assertRaises(ValueError):
215217
self._resolver_allele.resolve_attribute_conflict(
216-
VcfParserConstants.NUM, i, j)
218+
VcfParserHeaderKeyConstants.NUM, i, j)
217219
self.fail(
218220
'Should raise exception for unresolvable number: %d vs %d'%(i, j))
219221

220222
def test_resolving_all_field_definition_conflict_in_type(self):
221223
self.assertEqual(
222224
self._resolver_always.resolve_attribute_conflict(
223-
VcfParserConstants.TYPE, VcfParserConstants.INTEGER,
224-
VcfParserConstants.FLOAT),
225-
VcfParserConstants.FLOAT)
226-
for i in [VcfParserConstants.FLOAT, VcfParserConstants.INTEGER,
227-
VcfParserConstants.STRING, VcfParserConstants.CHARACTER]:
228-
for j in [VcfParserConstants.FLAG, VcfParserConstants.STRING]:
225+
VcfParserHeaderKeyConstants.TYPE,
226+
VcfHeaderFieldTypeConstants.INTEGER,
227+
VcfHeaderFieldTypeConstants.FLOAT),
228+
VcfHeaderFieldTypeConstants.FLOAT)
229+
for i in [VcfHeaderFieldTypeConstants.FLOAT,
230+
VcfHeaderFieldTypeConstants.INTEGER,
231+
VcfHeaderFieldTypeConstants.STRING,
232+
VcfHeaderFieldTypeConstants.CHARACTER]:
233+
for j in [VcfHeaderFieldTypeConstants.FLAG,
234+
VcfHeaderFieldTypeConstants.STRING]:
229235
self.assertEqual(
230236
self._resolver_always.resolve_attribute_conflict(
231-
VcfParserConstants.TYPE, i, j),
232-
VcfParserConstants.STRING)
237+
VcfParserHeaderKeyConstants.TYPE, i, j),
238+
VcfHeaderFieldTypeConstants.STRING)
233239

234240
def test_resolving_all_field_definition_conflict_in_number(self):
235241
self.assertEqual(
236242
self._resolver_always.resolve_attribute_conflict(
237-
VcfParserConstants.NUM, 2, 3), None)
243+
VcfParserHeaderKeyConstants.NUM, 2, 3), None)
238244
self.assertEqual(
239245
self._resolver_always.resolve_attribute_conflict(
240-
VcfParserConstants.NUM, 2, None), None)
246+
VcfParserHeaderKeyConstants.NUM, 2, None), None)
241247

242248
for i in [0, 1]:
243249
for j in [self._field_count('R'), self._field_count('G'),
244250
self._field_count('A'), 2, None]:
245251
self.assertEqual(
246252
self._resolver_always.resolve_attribute_conflict(
247-
VcfParserConstants.NUM, i, j), None)
253+
VcfParserHeaderKeyConstants.NUM, i, j), None)

gcp_variant_transforms/transforms/infer_undefined_headers.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from vcf.parser import field_counts
2424

2525
from gcp_variant_transforms.beam_io import vcf_header_io
26-
from gcp_variant_transforms.libs import vcf_field_conflict_resolver
2726
from gcp_variant_transforms.transforms import merge_headers
2827

2928

@@ -56,16 +55,16 @@ def _get_field_type(self, field_value):
5655
"""
5756
if isinstance(field_value, list):
5857
return (self._get_field_type(field_value[0]) if field_value else
59-
vcf_field_conflict_resolver.VcfParserConstants.STRING)
58+
vcf_header_io.VcfHeaderFieldTypeConstants.STRING)
6059

6160
if isinstance(field_value, bool):
62-
return vcf_field_conflict_resolver.VcfParserConstants.FLAG
61+
return vcf_header_io.VcfHeaderFieldTypeConstants.FLAG
6362
elif isinstance(field_value, int):
64-
return vcf_field_conflict_resolver.VcfParserConstants.INTEGER
63+
return vcf_header_io.VcfHeaderFieldTypeConstants.INTEGER
6564
elif isinstance(field_value, float):
66-
return vcf_field_conflict_resolver.VcfParserConstants.FLOAT
65+
return vcf_header_io.VcfHeaderFieldTypeConstants.FLOAT
6766
else:
68-
return vcf_field_conflict_resolver.VcfParserConstants.STRING
67+
return vcf_header_io.VcfHeaderFieldTypeConstants.STRING
6968

7069
def _infer_undefined_info_fields(self, variant, defined_headers):
7170
"""Returns info fields not defined in the headers.

gcp_variant_transforms/transforms/merge_header_definitions.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@
1919
from typing import Dict, List # pylint: disable=unused-import
2020

2121
import apache_beam as beam
22+
from gcp_variant_transforms.beam_io import vcf_header_io
2223
from gcp_variant_transforms.beam_io.vcf_header_io import VcfHeader # pylint: disable=unused-import
23-
from gcp_variant_transforms.libs.vcf_field_conflict_resolver import VcfParserConstants
2424

2525
# ``Definition`` cherry-picks the attributes from vcf header definitions that
2626
# are critical for checking field compatibilities across VCF files.
27-
Definition = namedtuple('Definition', [VcfParserConstants.NUM,
28-
VcfParserConstants.TYPE])
27+
Definition = namedtuple('Definition',
28+
[vcf_header_io.VcfParserHeaderKeyConstants.NUM,
29+
vcf_header_io.VcfParserHeaderKeyConstants.TYPE])
2930

3031

3132
class VcfHeaderDefinitions(object):
@@ -43,12 +44,14 @@ def __init__(self, vcf_header=None):
4344
if not vcf_header:
4445
return
4546
for key, val in vcf_header.infos.iteritems():
46-
definition = Definition(val[VcfParserConstants.NUM],
47-
val[VcfParserConstants.TYPE])
47+
definition = Definition(
48+
val[vcf_header_io.VcfParserHeaderKeyConstants.NUM],
49+
val[vcf_header_io.VcfParserHeaderKeyConstants.TYPE])
4850
self._infos[key][definition] = [vcf_header.file_name]
4951
for key, val in vcf_header.formats.iteritems():
50-
definition = Definition(val[VcfParserConstants.NUM],
51-
val[VcfParserConstants.TYPE])
52+
definition = Definition(
53+
val[vcf_header_io.VcfParserHeaderKeyConstants.NUM],
54+
val[vcf_header_io.VcfParserHeaderKeyConstants.TYPE])
5255
self._formats[key][definition] = [vcf_header.file_name]
5356

5457
def __eq__(self, other):

0 commit comments

Comments
 (0)