Skip to content

Commit 79b7101

Browse files
committed
Garud H
1 parent dda474e commit 79b7101

File tree

8 files changed

+347
-4
lines changed

8 files changed

+347
-4
lines changed

docs/api.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ Methods
4848
divergence
4949
diversity
5050
Fst
51+
Garud_h
5152
gwas_linear_regression
5253
hardy_weinberg_test
5354
regenie
@@ -90,6 +91,10 @@ Variables
9091
variables.pc_relate_phi_spec
9192
variables.sample_id_spec
9293
variables.sample_pcs_spec
94+
variables.stat_Garud_h1_spec
95+
variables.stat_Garud_h12_spec
96+
variables.stat_Garud_h123_spec
97+
variables.stat_Garud_h2_h1_spec
9398
variables.traits_spec
9499
variables.variant_allele_spec
95100
variables.variant_allele_count_spec

sgkit/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from .stats.hwe import hardy_weinberg_test
2020
from .stats.pc_relate import pc_relate
2121
from .stats.pca import pca
22-
from .stats.popgen import Fst, Tajimas_D, divergence, diversity, pbs
22+
from .stats.popgen import Fst, Garud_h, Tajimas_D, divergence, diversity, pbs
2323
from .stats.preprocessing import filter_partial_calls
2424
from .stats.regenie import regenie
2525
from .testing import simulate_genotype_call_dataset
@@ -45,6 +45,7 @@
4545
"diversity",
4646
"divergence",
4747
"Fst",
48+
"Garud_h",
4849
"Tajimas_D",
4950
"pbs",
5051
"pc_relate",

sgkit/stats/popgen.py

Lines changed: 194 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,19 @@
1+
import collections
12
from typing import Hashable, Optional
23

34
import dask.array as da
45
import numpy as np
56
from numba import guvectorize
67
from xarray import Dataset
78

9+
from sgkit import to_haplotype_calls
810
from sgkit.stats.utils import assert_array_shape
911
from sgkit.typing import ArrayLike
10-
from sgkit.utils import conditional_merge_datasets, define_variable_if_absent
12+
from sgkit.utils import (
13+
conditional_merge_datasets,
14+
define_variable_if_absent,
15+
hash_columns,
16+
)
1117
from sgkit.window import has_windows, window_statistic
1218

1319
from .. import variables
@@ -682,3 +688,190 @@ def pbs(
682688
{variables.stat_pbs: (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)}
683689
)
684690
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
691+
692+
693+
N_GARUD_H_STATS = 4 # H1, H12, H123, H2/H1
694+
695+
696+
def _Garud_h(k: ArrayLike) -> ArrayLike:
697+
# find haplotype counts (sorted in descending order)
698+
counts = sorted(collections.Counter(k.tolist()).values(), reverse=True)
699+
counts = np.array(counts)
700+
701+
# find haplotype frequencies
702+
n = k.shape[0]
703+
f = counts / n
704+
705+
# compute H1
706+
h1 = np.sum(f ** 2)
707+
708+
# compute H12
709+
h12 = np.sum(f[:2]) ** 2 + np.sum(f[2:] ** 2)
710+
711+
# compute H123
712+
h123 = np.sum(f[:3]) ** 2 + np.sum(f[3:] ** 2)
713+
714+
# compute H2/H1
715+
h2 = h1 - f[0] ** 2
716+
h2_h1 = h2 / h1
717+
718+
return np.array([h1, h12, h123, h2_h1])
719+
720+
721+
def _Garud_h_cohorts(
722+
ht: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int
723+
) -> ArrayLike:
724+
k = hash_columns(ht) # hash haplotypes
725+
arr = np.empty((n_cohorts, N_GARUD_H_STATS))
726+
for c in range(n_cohorts):
727+
arr[c, :] = _Garud_h(k[sample_cohort == c])
728+
return arr
729+
730+
731+
def Garud_h(
732+
ds: Dataset,
733+
*,
734+
call_haplotype: Hashable = variables.call_haplotype,
735+
merge: bool = True,
736+
) -> Dataset:
737+
"""Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
738+
of soft sweeps, as defined in Garud et al. (2015).
739+
740+
By default, values of this statistic are calculated across all variants.
741+
To compute values in windows, call :func:`window` before calling
742+
this function.
743+
744+
Parameters
745+
----------
746+
ds
747+
Genotype call dataset.
748+
call_haplotype
749+
Call haplotype variable to use or calculate. Defined by
750+
:data:`sgkit.variables.call_haplotype_spec`.
751+
If the variable is not present in ``ds``, it will be computed
752+
using :func:`to_haplotype_calls`.
753+
merge
754+
If True (the default), merge the input dataset and the computed
755+
output variables into a single dataset, otherwise return only
756+
the computed output variables.
757+
See :ref:`dataset_merge` for more details.
758+
759+
Returns
760+
-------
761+
A dataset containing the following variables:
762+
763+
- `stat_Garud_h1` (windows, cohorts): Garud H1 statistic.
764+
Defined by :data:`sgkit.variables.stat_Garud_h1_spec`.
765+
766+
- `stat_Garud_h12` (windows, cohorts): Garud H12 statistic.
767+
Defined by :data:`sgkit.variables.stat_Garud_h12_spec`.
768+
769+
- `stat_Garud_h123` (windows, cohorts): Garud H123 statistic.
770+
Defined by :data:`sgkit.variables.stat_Garud_h123_spec`.
771+
772+
- `stat_Garud_h2_h1` (windows, cohorts): Garud H2/H1 statistic.
773+
Defined by :data:`sgkit.variables.stat_Garud_h2_h1_spec`.
774+
775+
Raises
776+
------
777+
NotImplementedError
778+
If the dataset is not diploid.
779+
780+
Warnings
781+
--------
782+
This function is currently only implemented for diploid datasets.
783+
784+
Examples
785+
--------
786+
787+
>>> import numpy as np
788+
>>> import sgkit as sg
789+
>>> import xarray as xr
790+
>>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)
791+
792+
>>> # Divide samples into two cohorts
793+
>>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
794+
>>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
795+
796+
>>> # Divide into windows of size three (variants)
797+
>>> ds = sg.window(ds, size=3, step=3)
798+
799+
>>> gh = sg.Garud_h(ds)
800+
>>> gh["stat_Garud_h1"].values # doctest: +NORMALIZE_WHITESPACE
801+
array([[0.25 , 0.375],
802+
[0.375, 0.375]])
803+
>>> gh["stat_Garud_h12"].values # doctest: +NORMALIZE_WHITESPACE
804+
array([[0.375, 0.625],
805+
[0.625, 0.625]])
806+
>>> gh["stat_Garud_h123"].values # doctest: +NORMALIZE_WHITESPACE
807+
array([[0.625, 1. ],
808+
[1. , 1. ]])
809+
>>> gh["stat_Garud_h2_h1"].values # doctest: +NORMALIZE_WHITESPACE
810+
array([[0.75 , 0.33333333],
811+
[0.33333333, 0.33333333]])
812+
"""
813+
814+
if ds.dims["ploidy"] != 2:
815+
raise NotImplementedError("Garud H only implemented for diploid genotypes")
816+
817+
ds = define_variable_if_absent(
818+
ds, variables.call_haplotype, call_haplotype, to_haplotype_calls
819+
)
820+
variables.validate(ds, {call_haplotype: variables.call_haplotype_spec})
821+
822+
ht = ds[call_haplotype]
823+
824+
# convert sample cohorts to haplotype layout
825+
sc = ds.sample_cohort.values
826+
hsc = np.stack((sc, sc), axis=1).ravel() # TODO: assumes diploid
827+
n_cohorts = sc.max() + 1 # 0-based indexing
828+
829+
if has_windows(ds):
830+
gh = window_statistic(
831+
ht,
832+
lambda ht: _Garud_h_cohorts(ht, hsc, n_cohorts),
833+
ds.window_start.values,
834+
ds.window_stop.values,
835+
dtype=np.float64,
836+
# first chunks dimension is windows, computed in window_statistic
837+
chunks=(-1, n_cohorts, N_GARUD_H_STATS),
838+
new_axis=2, # 2d -> 3d
839+
)
840+
n_windows = ds.window_start.shape[0]
841+
assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
842+
new_ds = Dataset(
843+
{
844+
variables.stat_Garud_h1: (
845+
("windows", "cohorts"),
846+
gh[:, :, 0],
847+
),
848+
variables.stat_Garud_h12: (
849+
("windows", "cohorts"),
850+
gh[:, :, 1],
851+
),
852+
variables.stat_Garud_h123: (
853+
("windows", "cohorts"),
854+
gh[:, :, 2],
855+
),
856+
variables.stat_Garud_h2_h1: (
857+
("windows", "cohorts"),
858+
gh[:, :, 3],
859+
),
860+
}
861+
)
862+
else:
863+
# TODO: note this materializes all the data, so windowless should be discouraged/not supported
864+
ht = ht.values
865+
866+
gh = _Garud_h_cohorts(ht, sample_cohort=hsc, n_cohorts=n_cohorts)
867+
assert_array_shape(gh, n_cohorts, N_GARUD_H_STATS)
868+
869+
new_ds = Dataset(
870+
{
871+
variables.stat_Garud_h1: gh[:, 0],
872+
variables.stat_Garud_h12: gh[:, 1],
873+
variables.stat_Garud_h123: gh[:, 2],
874+
variables.stat_Garud_h2_h1: gh[:, 3],
875+
}
876+
)
877+
return conditional_merge_datasets(ds, variables.validate(new_ds), merge)

sgkit/tests/test_popgen.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99

1010
from sgkit import (
1111
Fst,
12+
Garud_h,
1213
Tajimas_D,
1314
count_cohort_alleles,
1415
count_variant_alleles,
1516
create_genotype_call_dataset,
1617
divergence,
1718
diversity,
1819
pbs,
20+
simulate_genotype_call_dataset,
1921
variables,
2022
)
2123
from sgkit.window import window
@@ -399,3 +401,75 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
399401
)
400402

401403
np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)
404+
405+
406+
@pytest.mark.parametrize(
407+
"n_variants, n_samples, n_contigs, n_cohorts",
408+
[(3, 5, 1, 1), (3, 5, 1, 2)],
409+
)
410+
@pytest.mark.parametrize("chunks", [(-1, -1), (2, -1)])
411+
def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
412+
# We can't use msprime since it doesn't generate diploid data, and Garud uses phased data
413+
ds = simulate_genotype_call_dataset(
414+
n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
415+
)
416+
ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))
417+
subsets = np.array_split(ds.samples.values, n_cohorts)
418+
sample_cohorts = np.concatenate(
419+
[np.full_like(subset, i) for i, subset in enumerate(subsets)]
420+
)
421+
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
422+
423+
gh = Garud_h(ds)
424+
h1 = gh.stat_Garud_h1.values
425+
h12 = gh.stat_Garud_h12.values
426+
h123 = gh.stat_Garud_h123.values
427+
h2_h1 = gh.stat_Garud_h2_h1.values
428+
429+
# scikit-allel
430+
for c in range(n_cohorts):
431+
gt = ds.call_genotype.values[:, sample_cohorts == c, :]
432+
ska_gt = allel.GenotypeArray(gt)
433+
ska_ha = ska_gt.to_haplotypes()
434+
ska_h = allel.garud_h(ska_ha)
435+
436+
np.testing.assert_allclose(h1[c], ska_h[0])
437+
np.testing.assert_allclose(h12[c], ska_h[1])
438+
np.testing.assert_allclose(h123[c], ska_h[2])
439+
np.testing.assert_allclose(h2_h1[c], ska_h[3])
440+
441+
442+
@pytest.mark.parametrize(
443+
"n_variants, n_samples, n_contigs, n_cohorts",
444+
[(9, 5, 1, 1), (9, 5, 1, 2)],
445+
)
446+
@pytest.mark.parametrize("chunks", [(-1, -1), (5, -1)])
447+
def test_Garud_h__windowed(n_variants, n_samples, n_contigs, n_cohorts, chunks):
448+
ds = simulate_genotype_call_dataset(
449+
n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
450+
)
451+
ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))
452+
subsets = np.array_split(ds.samples.values, n_cohorts)
453+
sample_cohorts = np.concatenate(
454+
[np.full_like(subset, i) for i, subset in enumerate(subsets)]
455+
)
456+
ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
457+
ds = window(ds, size=3, step=3)
458+
459+
gh = Garud_h(ds)
460+
h1 = gh.stat_Garud_h1.values
461+
h12 = gh.stat_Garud_h12.values
462+
h123 = gh.stat_Garud_h123.values
463+
h2_h1 = gh.stat_Garud_h2_h1.values
464+
465+
# scikit-allel
466+
for c in range(n_cohorts):
467+
gt = ds.call_genotype.values[:, sample_cohorts == c, :]
468+
ska_gt = allel.GenotypeArray(gt)
469+
ska_ha = ska_gt.to_haplotypes()
470+
ska_h = allel.moving_garud_h(ska_ha, size=3, step=3)
471+
472+
np.testing.assert_allclose(h1[:, c], ska_h[0])
473+
np.testing.assert_allclose(h12[:, c], ska_h[1])
474+
np.testing.assert_allclose(h123[:, c], ska_h[2])
475+
np.testing.assert_allclose(h2_h1[:, c], ska_h[3])

sgkit/tests/test_utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
check_array_like,
1414
define_variable_if_absent,
1515
encode_array,
16+
hash_columns,
1617
max_str_len,
1718
merge_datasets,
1819
split_array_chunks,
@@ -208,3 +209,25 @@ def test_split_array_chunks__raise_on_blocks_lte_0():
208209
def test_split_array_chunks__raise_on_n_lte_0():
209210
with pytest.raises(ValueError, match=r"Number of elements .* must be >= 0"):
210211
split_array_chunks(0, 0)
212+
213+
214+
@given(st.integers(1, 50), st.integers(2, 50))
215+
@settings(deadline=None) # avoid problem with numba jit compilation
216+
def test_hash_columns(n_rows, n_cols):
217+
# construct an array with random repeated columns
218+
x = np.random.randint(-2, 10, size=(n_rows, n_cols // 2))
219+
cols = np.random.choice(x.shape[1], n_cols, replace=True)
220+
x = x[:, cols]
221+
222+
# find unique column counts (exact method)
223+
_, expected_inverse, expected_counts = np.unique(
224+
x, axis=1, return_inverse=True, return_counts=True
225+
)
226+
227+
# hash columns, then find unique column counts using the hash values
228+
h = hash_columns(x)
229+
_, inverse, counts = np.unique(h, return_inverse=True, return_counts=True)
230+
231+
# counts[inverse] gives the count for each column in x
232+
# these should be the same for both ways of counting
233+
np.testing.assert_equal(counts[inverse], expected_counts[expected_inverse])

0 commit comments

Comments
 (0)