diff --git a/allel/io/vcf_read.py b/allel/io/vcf_read.py index 9eae0261..9b8d5216 100644 --- a/allel/io/vcf_read.py +++ b/allel/io/vcf_read.py @@ -12,9 +12,9 @@ from collections import namedtuple, defaultdict import warnings import time -import subprocess import textwrap from collections import OrderedDict +from allel.io.vcfreader import VCFReader import numpy as np @@ -205,11 +205,12 @@ def _chunk_iter_rename(it, rename_fields): overlaps the region - such a variant will *not* be included in the data returned by this function.""" -_doc_param_tabix = \ - """Name or path to tabix executable. Only required if `region` is given. Setting - `tabix` to `None` will cause a fall-back to scanning through the VCF file from - the beginning, which may be much slower than tabix but the only option if tabix - is not available on your system and/or the VCF file has not been tabix-indexed.""" +_doc_param_index = \ + """Path to the index file of the VCF. Only required if `region` is given. Setting to `None` (the + default) will automatically check weather there is a index file with the same name but + suffix '.tbi', which is the most common case. A warning will be issued if the index file + is older than the VCF file. + """ _doc_param_samples = \ """Selection of samples to extract calldata for. If provided, should be a list of @@ -244,7 +245,7 @@ def read_vcf(input, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, samples=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, @@ -276,8 +277,8 @@ def read_vcf(input, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} samples : list of strings {samples} transformers : list of transformer objects, optional @@ -304,7 +305,7 @@ def read_vcf(input, fields, samples, headers, it = iter_vcf_chunks( input=input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, + chunk_length=chunk_length, fills=fills, region=region, index=index, samples=samples, transformers=transformers ) @@ -353,7 +354,7 @@ def read_vcf(input, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, samples=_doc_param_samples, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, @@ -381,7 +382,7 @@ def vcf_to_npz(input, output, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix=True, + index=None, samples=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, @@ -419,8 +420,8 @@ def vcf_to_npz(input, output, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} samples : list of strings {samples} transformers : list of transformer objects, optional @@ -443,7 +444,7 @@ def vcf_to_npz(input, output, input=input, fields=fields, exclude_fields=exclude_fields, rename_fields=rename_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, chunk_length=chunk_length, - log=log, fills=fills, region=region, tabix=tabix, samples=samples, + log=log, fills=fills, region=region, index=index, samples=samples, transformers=transformers ) @@ -473,7 +474,7 @@ def vcf_to_npz(input, output, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, samples=_doc_param_samples, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, @@ -610,7 +611,7 @@ def vcf_to_hdf5(input, output, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, samples=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, @@ -666,8 +667,8 @@ def vcf_to_hdf5(input, output, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} samples : list of strings {samples} transformers : list of transformer objects, optional @@ -693,7 +694,7 @@ def vcf_to_hdf5(input, output, fields, samples, headers, it = iter_vcf_chunks( input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, + chunk_length=chunk_length, fills=fills, region=region, index=index, samples=samples, transformers=transformers ) @@ -767,7 +768,7 @@ def vcf_to_hdf5(input, output, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, samples=_doc_param_samples, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, @@ -850,7 +851,7 @@ def vcf_to_zarr(input, output, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, samples=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, @@ -891,8 +892,8 @@ def vcf_to_zarr(input, output, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} samples : list of strings {samples} transformers : list of transformer objects, optional @@ -918,7 +919,7 @@ def vcf_to_zarr(input, output, fields, samples, headers, it = iter_vcf_chunks( input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, + chunk_length=chunk_length, fills=fills, region=region, index=index, samples=samples, transformers=transformers ) @@ -997,7 +998,7 @@ def vcf_to_zarr(input, output, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, samples=_doc_param_samples, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, @@ -1008,46 +1009,25 @@ def vcf_to_zarr(input, output, # noinspection PyShadowingBuiltins -def _setup_input_stream(input, region=None, tabix=None, buffer_size=DEFAULT_BUFFER_SIZE): +def _setup_input_stream(input, index=None, region=None, buffer_size=DEFAULT_BUFFER_SIZE): # obtain a file-like object close = False if isinstance(input, str) and input.endswith('gz'): + if index is None: + index_file = '{}.tbi'.format(input) + if os.path.exists(index_file): + index = index_file - if region and tabix and os.name != 'nt': - - try: - # try tabix - p = subprocess.Popen([tabix, '-h', input, region], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - bufsize=0) - - # check if tabix exited early, look for tabix error - time.sleep(.5) - poll = p.poll() - if poll is not None and poll > 0: - err = p.stdout.read() - err = str(err, 'ascii') - p.stdout.close() - raise RuntimeError(err.strip()) - fileobj = p.stdout - close = True - # N.B., still pass the region parameter through so we get strictly only - # variants that start within the requested region. See also - # https://github.com/alimanfoo/vcfnp/issues/54 - - except FileNotFoundError: - # no tabix, fall back to scanning - warnings.warn('tabix not found, falling back to scanning to region') - fileobj = gzip.open(input, mode='rb') - close = True - - except Exception as e: - warnings.warn('error occurred attempting tabix (%s); falling back to ' - 'scanning to region' % e) - fileobj = gzip.open(input, mode='rb') - close = True + if index is not None and os.path.getmtime(index) < os.path.getmtime(input): + warnings.warn('the index file is older than the input VCF: {}'.format(index)) + + if region and index is not None: + fileobj = VCFReader(input, index, region) # random access + close = True + # N.B., still pass the region parameter through so we get strictly only + # variants that start within the requested region. See also + # https://github.com/alimanfoo/vcfnp/issues/54 else: fileobj = gzip.open(input, mode='rb') @@ -1076,7 +1056,7 @@ def iter_vcf_chunks(input, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, samples=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, @@ -1101,8 +1081,8 @@ def iter_vcf_chunks(input, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} samples : list of strings {samples} transformers : list of transformer objects, optional @@ -1131,7 +1111,7 @@ def iter_vcf_chunks(input, fills=fills, samples=samples, region=region) # setup input stream - stream = _setup_input_stream(input=input, region=region, tabix=tabix, + stream = _setup_input_stream(input=input, index=index, region=region, # tabix=tabix, buffer_size=buffer_size) # setup iterator @@ -1158,7 +1138,7 @@ def iter_vcf_chunks(input, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, samples=_doc_param_samples, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, @@ -1797,7 +1777,7 @@ def vcf_to_dataframe(input, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, @@ -1822,8 +1802,8 @@ def vcf_to_dataframe(input, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} transformers : list of transformer objects, optional {transformers} buffer_size : int, optional @@ -1849,7 +1829,7 @@ def vcf_to_dataframe(input, fields, _, _, it = iter_vcf_chunks( input=input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=[], + chunk_length=chunk_length, fills=fills, region=region, index=index, samples=[], transformers=transformers ) @@ -1881,7 +1861,7 @@ def vcf_to_dataframe(input, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, chunk_length=_doc_param_chunk_length, @@ -1898,7 +1878,7 @@ def vcf_to_csv(input, output, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, @@ -1926,8 +1906,8 @@ def vcf_to_csv(input, output, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} transformers : list of transformer objects, optional {transformers} buffer_size : int, optional @@ -1950,7 +1930,7 @@ def vcf_to_csv(input, output, fields, _, _, it = iter_vcf_chunks( input=input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=[], + chunk_length=chunk_length, fills=fills, region=region, index=index, samples=[], transformers=transformers ) @@ -1980,7 +1960,7 @@ def vcf_to_csv(input, output, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, chunk_length=_doc_param_chunk_length, @@ -2016,7 +1996,7 @@ def vcf_to_recarray(input, alt_number=DEFAULT_ALT_NUMBER, fills=None, region=None, - tabix='tabix', + index=None, transformers=None, buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, @@ -2041,8 +2021,8 @@ def vcf_to_recarray(input, {fills} region : string, optional {region} - tabix : string, optional - {tabix} + index : string, optional + {index} transformers : list of transformer objects, optional {transformers} buffer_size : int, optional @@ -2067,7 +2047,7 @@ def vcf_to_recarray(input, fields, _, _, it = iter_vcf_chunks( input=input, fields=fields, exclude_fields=exclude_fields, types=types, numbers=numbers, alt_number=alt_number, buffer_size=buffer_size, - chunk_length=chunk_length, fills=fills, region=region, tabix=tabix, samples=[], + chunk_length=chunk_length, fills=fills, region=region, index=index, samples=[], transformers=transformers ) @@ -2098,7 +2078,7 @@ def vcf_to_recarray(input, alt_number=_doc_param_alt_number, fills=_doc_param_fills, region=_doc_param_region, - tabix=_doc_param_tabix, + index=_doc_param_index, transformers=_doc_param_transformers, buffer_size=_doc_param_buffer_size, chunk_length=_doc_param_chunk_length, diff --git a/allel/io/vcfreader.py b/allel/io/vcfreader.py new file mode 100644 index 00000000..e7b2c9f9 --- /dev/null +++ b/allel/io/vcfreader.py @@ -0,0 +1,255 @@ +import os +import struct +import gzip +import zlib +import re +import numpy as np + +# max value of int 32 +MAX_INT32 = 0xffffffff >> 1 + +# max value of uint64 +MAX_UINT64 = 0xffffffffffffffff + + +class VCFReader: + def __init__(self, filename, index, region): + self._chrom, self._begin, self._end = self.parse_region(region) + self._chunk_begin = 0 + self._chunk_end = MAX_UINT64 + self._meta = b'#' + self._header = True # track whether we should read VCF header + self._block = None + self._parse_index(index) + self._fileobj = open(filename, 'rb') + self._load_block(0) # load header + + @staticmethod + def parse_region(region): + """ + parse a region to chromosome, begin, end + + Parameters + ---------- + region: string + tabix style region string + + Returns + ------- + chromosome: bytes + contig name + begin: int + position begin + end: int + position end + + """ + try: + chrom, begin, end = re.findall(r'^(\w+):*(\d*)-*(\d*)$', region).pop(0) + except IndexError: + raise ValueError('bad region string: {}'.format(region)) + begin = int(begin) - 1 if begin else 0 + end = int(end) if end else MAX_INT32 + if begin > 0 and end == MAX_INT32: + end = begin + 1 + return chrom.encode('ascii'), begin, end + + @staticmethod + def reg2bins(begin, end, n_lvls=5, min_shift=14): + """ + generate key of bins which may overlap the given region, + check out https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3042176/ + and https://samtools.github.io/hts-specs/tabix.pdf + for more information. + + Parameters + ---------- + begin: int + chromosome position begin + end: int + chromosome position end + n_lvls: int, optional + cluster level, for tabix, set to 5 + min_shift: int, optional + minimum shift, for tabix, set to 14 + + Returns + ------- + generator + + """ + begin, end = begin, end + t, s = 0, min_shift + (n_lvls << 1) + n_lvls + for l in range(n_lvls + 1): + b, e = t + (begin >> s), t + (end >> s) + n = e - b + 1 + for k in range(b, e + 1): + yield k + n += 1 + t += 1 << ((l << 1) + l) + s -= 3 + + def _parse_index(self, index): + """ + read and parse the index file, set the virtual offset for the target region, + check out https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3042176/ + and https://samtools.github.io/hts-specs/tabix.pdf + for more information. + + Parameters + ---------- + index: str + path for the index file + + """ + with gzip.open(index, 'rb') as f: + magic = f.read(4) + if magic != b'TBI\x01': # check magic + raise RuntimeError('invalid tabix index file: {}'.format(index)) + header = np.frombuffer(f.read(4 * 8), dtype=np.int32) + if header[1] != 2: # check format + raise RuntimeError('invalid VCF index file: {}'.format(index)) + # this is the comment marker use to determine VCF header, usually set to b'#' + self._meta = chr(header[5]).encode('ascii') + try: + rid = f.read(header[7]).split(b'\x00').index(self._chrom) + except ValueError: + # contig name is not exist, read header only + self._chunk_begin = -1 + return + # seek to our target index + for _ in range(rid): + for _ in range(struct.unpack('> 14] + # binning index: record cluster in large interval + overlap = np.concatenate([chunks for bin_key, chunks in bidx.items() if chunks is not None]) + # coupled binning and linear indices, filter out low level bins + chunk_begin, *_, chunk_end = np.sort(np.ravel(overlap[overlap[:, 0] >= min_ioff])) + # convert to native int + self._chunk_begin, self._chunk_end = chunk_begin.item(), chunk_end.item() + + def _load_block(self, offset): + """ + load a BGZF block into buffer + + Parameters + ---------- + offset: uint64 + the offset of the BGZF block, the higher 48 bits keep the real file offset + of the start of the gzip block the byte falls in, the lower 16 bits store + the offset of the byte inside the gzip block. + check out https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3042176/ + and http://samtools.sourceforge.net/SAM1.pdf + for more information. + + """ + if offset < 0: # contig name is not exist + return + # the higher 48 bits keep the real file offset of the gzip block that byte falls in + block_offset = offset >> 16 + if block_offset > (self._chunk_end >> 16): # out of range, stop reading + return + self._fileobj.seek(block_offset) + # read a BGZF block + magic = self._fileobj.read(4) + if not magic: # end of file + return + if magic != b"\x1f\x8b\x08\x04": # check whether this is a BGZF block + raise RuntimeError("invalid BGZF block offset: {}!".format(block_offset)) + self._fileobj.seek(4 + 1 + 1, os.SEEK_CUR) # seek useless MTIME, XFL, OS + x_len = extra_len = struct.unpack(" 0: + subfield_identifier = self._fileobj.read(2) + subfield_length = struct.unpack(" 0: # skip left extra length + self._fileobj.seek(self._fileobj.tell() + extra_len) + # read and decompress the block data + cdata = self._fileobj.read(bsize - x_len - 19) + data = zlib.decompress(cdata, wbits=-15) + # check out https://docs.python.org/3/library/zlib.html#zlib.crc32 + crc = zlib.crc32(data) & 0xffffffff + if crc != struct.unpack("> 16 + block_end = self._chunk_end & 0xffff if self._fileobj.tell() > end_offset else None + self._block = data[block_begin:block_end] + + def close(self): + self._fileobj.close() + + def readinto(self, b): + """ + imitate the readinto function in BufferedStream class. + This is th function used by FileInputStream. + + Parameters + ---------- + b: bytes or bytearray + pre-allocated, writeable bytes-like object. + + Returns + ------- + num : int + the number of bytes read + + """ + num = 0 + # read header + while self._header and self._block and num < len(b): + line_end = self._block.find(b'\n') # no matter '\n' or '\r\n', this is the end + read_len = min(len(b) - num, len(self._block) if line_end < 0 else line_end + 1) + b[num:num + read_len] = self._block[:read_len] + num += read_len + self._block = self._block[read_len:] + if not self._block: # more to read, do not reach the end of header yet + self._load_block(self._fileobj.tell() << 16) + if not self._block.startswith(self._meta): # end of header, load target region block + self._header = False + self._load_block(self._chunk_begin) + break + # read record + while self._block and num < len(b): + read_len = min(len(b) - num, len(self._block)) + b[num:num+read_len] = self._block[:read_len] + num += read_len + self._block = self._block[read_len:] + if not self._block: + self._load_block(self._fileobj.tell() << 16) + return num diff --git a/allel/test/io/test_vcf_read.py b/allel/test/io/test_vcf_read.py index e0f2aeb3..6efdba7f 100644 --- a/allel/test/io/test_vcf_read.py +++ b/allel/test/io/test_vcf_read.py @@ -1268,71 +1268,69 @@ def test_read_region(): for vcf_path in (fixture_path('sample.vcf.gz'), fixture_path('sample.vcf')): - for tabix in 'tabix', None, 'foobar': - - region = '19' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - chrom = callset['variants/CHROM'] - pos = callset['variants/POS'] - assert 2 == len(chrom) - assert isinstance(chrom, np.ndarray) - assert np.all(chrom == '19') - assert 2 == len(pos) - assert_array_equal([111, 112], pos) - - region = '20' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - chrom = callset['variants/CHROM'] - pos = callset['variants/POS'] - assert 6 == len(chrom) - assert isinstance(chrom, np.ndarray) - assert np.all(chrom == '20') - assert 6 == len(pos) - assert_array_equal([14370, 17330, 1110696, 1230237, 1234567, 1235237], pos) - - region = 'X' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - chrom = callset['variants/CHROM'] - pos = callset['variants/POS'] - assert 1 == len(chrom) - assert isinstance(chrom, np.ndarray) - assert np.all(chrom == 'X') - assert 1 == len(pos) - assert_array_equal([10], pos) - - region = 'Y' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - assert callset is None - - region = '20:1-100000' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - chrom = callset['variants/CHROM'] - pos = callset['variants/POS'] - assert 2 == len(chrom) - assert isinstance(chrom, np.ndarray) - assert np.all(chrom == '20') - assert 2 == len(pos) - assert_array_equal([14370, 17330], pos) - - region = '20:1000000-1233000' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - chrom = callset['variants/CHROM'] - pos = callset['variants/POS'] - assert 2 == len(chrom) - assert isinstance(chrom, np.ndarray) - assert np.all(chrom == '20') - assert 2 == len(pos) - assert_array_equal([1110696, 1230237], pos) - - region = '20:1233000-2000000' - callset = read_vcf(vcf_path, region=region, tabix=tabix) - chrom = callset['variants/CHROM'] - pos = callset['variants/POS'] - assert 2 == len(chrom) - assert isinstance(chrom, np.ndarray) - assert np.all(chrom == '20') - assert 2 == len(pos) - assert_array_equal([1234567, 1235237], pos) + region = '19' + callset = read_vcf(vcf_path, region=region) + chrom = callset['variants/CHROM'] + pos = callset['variants/POS'] + assert 2 == len(chrom) + assert isinstance(chrom, np.ndarray) + assert np.all(chrom == '19') + assert 2 == len(pos) + assert_array_equal([111, 112], pos) + + region = '20' + callset = read_vcf(vcf_path, region=region) + chrom = callset['variants/CHROM'] + pos = callset['variants/POS'] + assert 6 == len(chrom) + assert isinstance(chrom, np.ndarray) + assert np.all(chrom == '20') + assert 6 == len(pos) + assert_array_equal([14370, 17330, 1110696, 1230237, 1234567, 1235237], pos) + + region = 'X' + callset = read_vcf(vcf_path, region=region) + chrom = callset['variants/CHROM'] + pos = callset['variants/POS'] + assert 1 == len(chrom) + assert isinstance(chrom, np.ndarray) + assert np.all(chrom == 'X') + assert 1 == len(pos) + assert_array_equal([10], pos) + + region = 'Y' + callset = read_vcf(vcf_path, region=region) + assert callset is None + + region = '20:1-100000' + callset = read_vcf(vcf_path, region=region) + chrom = callset['variants/CHROM'] + pos = callset['variants/POS'] + assert 2 == len(chrom) + assert isinstance(chrom, np.ndarray) + assert np.all(chrom == '20') + assert 2 == len(pos) + assert_array_equal([14370, 17330], pos) + + region = '20:1000000-1233000' + callset = read_vcf(vcf_path, region=region) + chrom = callset['variants/CHROM'] + pos = callset['variants/POS'] + assert 2 == len(chrom) + assert isinstance(chrom, np.ndarray) + assert np.all(chrom == '20') + assert 2 == len(pos) + assert_array_equal([1110696, 1230237], pos) + + region = '20:1233000-2000000' + callset = read_vcf(vcf_path, region=region) + chrom = callset['variants/CHROM'] + pos = callset['variants/POS'] + assert 2 == len(chrom) + assert isinstance(chrom, np.ndarray) + assert np.all(chrom == '20') + assert 2 == len(pos) + assert_array_equal([1234567, 1235237], pos) def test_read_region_unsorted(): @@ -1340,10 +1338,9 @@ def test_read_region_unsorted(): # not available. fn = fixture_path('unsorted.vcf') - tabix = None region = '19' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) chrom = callset['variants/CHROM'] pos = callset['variants/POS'] assert 2 == len(chrom) @@ -1353,7 +1350,7 @@ def test_read_region_unsorted(): assert_array_equal([111, 112], pos) region = '20' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) chrom = callset['variants/CHROM'] pos = callset['variants/POS'] assert 6 == len(chrom) @@ -1363,7 +1360,7 @@ def test_read_region_unsorted(): assert_array_equal([14370, 1230237, 1234567, 1235237, 17330, 1110696], pos) region = 'X' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) chrom = callset['variants/CHROM'] pos = callset['variants/POS'] assert 1 == len(chrom) @@ -1373,11 +1370,11 @@ def test_read_region_unsorted(): assert_array_equal([10], pos) region = 'Y' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) assert callset is None region = '20:1-100000' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) chrom = callset['variants/CHROM'] pos = callset['variants/POS'] assert 2 == len(chrom) @@ -1387,7 +1384,7 @@ def test_read_region_unsorted(): assert_array_equal([14370, 17330], pos) region = '20:1000000-1233000' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) chrom = callset['variants/CHROM'] pos = callset['variants/POS'] assert 2 == len(chrom) @@ -1397,7 +1394,7 @@ def test_read_region_unsorted(): assert_array_equal([1230237, 1110696], pos) region = '20:1233000-2000000' - callset = read_vcf(fn, region=region, tabix=tabix) + callset = read_vcf(fn, region=region) chrom = callset['variants/CHROM'] pos = callset['variants/POS'] assert 2 == len(chrom) @@ -1957,7 +1954,7 @@ def test_genotype_ac(): def test_region_truncate(): vcf_path = fixture_path('test54.vcf.gz') for tabix in 'tabix', None: - callset = read_vcf(vcf_path, region='chr1:10-100', tabix=tabix) + callset = read_vcf(vcf_path, region='chr1:10-100') pos = callset['variants/POS'] assert 2 == pos.shape[0] assert_array_equal([20, 30], pos) @@ -2116,11 +2113,11 @@ def test_vcf_to_npz(): for vcf_path, region, tabix, samples, string_type in param_matrix: types = {'CHROM': string_type, 'ALT': string_type, 'samples': string_type} expected = read_vcf(vcf_path, fields='*', alt_number=2, region=region, - tabix=tabix, samples=samples, types=types) + samples=samples, types=types) if os.path.exists(npz_path): os.remove(npz_path) vcf_to_npz(vcf_path, npz_path, fields='*', chunk_length=2, alt_number=2, - region=region, tabix=tabix, samples=samples, types=types) + region=region, samples=samples, types=types) if expected is None: assert not os.path.exists(npz_path) else: @@ -2187,11 +2184,11 @@ def test_vcf_to_zarr(): for vcf_path, region, tabix, samples, string_type in param_matrix: types = {'CHROM': string_type, 'ALT': string_type, 'samples': string_type} expected = read_vcf(vcf_path, fields='*', alt_number=2, region=region, - tabix=tabix, samples=samples, types=types) + samples=samples, types=types) if os.path.exists(zarr_path): shutil.rmtree(zarr_path) vcf_to_zarr(vcf_path, zarr_path, fields='*', alt_number=2, chunk_length=2, - region=region, tabix=tabix, samples=samples, types=types) + region=region, samples=samples, types=types) if expected is None: assert not os.path.exists(zarr_path) else: @@ -2372,11 +2369,11 @@ def test_vcf_to_hdf5(): for vcf_path, region, tabix, samples, string_type in param_matrix: types = {'CHROM': string_type, 'ALT': string_type, 'samples': string_type} expected = read_vcf(vcf_path, fields='*', alt_number=2, region=region, - tabix=tabix, samples=samples, types=types) + samples=samples, types=types) if os.path.exists(h5_path): os.remove(h5_path) vcf_to_hdf5(vcf_path, h5_path, fields='*', alt_number=2, chunk_length=2, - region=region, tabix=tabix, samples=samples, types=types) + region=region, samples=samples, types=types) if expected is None: assert not os.path.exists(h5_path) else: diff --git a/allel/test/io/test_vcfreader.py b/allel/test/io/test_vcfreader.py new file mode 100644 index 00000000..f9a23ddb --- /dev/null +++ b/allel/test/io/test_vcfreader.py @@ -0,0 +1,40 @@ +from allel.io.vcfreader import VCFReader +import gzip +import os + + +def fixture_path(fn): + return os.path.join(os.path.dirname(__file__), os.pardir, 'data', fn) + + +def test_parse_region(): + chrom, begin, end = VCFReader.parse_region('chr1:10-100') + assert chrom == b'chr1' + assert begin == 9 + assert end == 100 + + chrom, begin, end = VCFReader.parse_region('chr1:10') + assert chrom == b'chr1' + assert begin == 9 + assert end == 10 + + chrom, begin, end = VCFReader.parse_region('chr1') + assert chrom == b'chr1' + assert begin == 0 + assert end == 2147483647 + + +def test_decompress(): + vcf_reader = VCFReader( + fixture_path('test54.vcf.gz'), fixture_path('test54.vcf.gz.tbi'), 'chr1:1-100' + ) + gzip_reader = gzip.open(fixture_path('test54.vcf.gz'), 'rb') + buffer_vcf, buffer_gzip = bytearray(2 ** 14), bytearray(2 ** 14) + while True: + num_vcf = vcf_reader.readinto(buffer_vcf) + num_gzip = gzip_reader.readinto(buffer_gzip) + assert num_vcf == num_gzip + if num_gzip > 0: + assert buffer_vcf == buffer_gzip + else: + break