sgkit-dev
diff --git a/‎docs/api.rst
Lines changed: 5 additions & 0 deletions b/‎docs/api.rst
Lines changed: 5 additions & 0 deletions
diff --git a/‎sgkit/__init__.py
Lines changed: 2 additions & 1 deletion b/‎sgkit/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎sgkit/stats/popgen.py
Lines changed: 194 additions & 1 deletion b/‎sgkit/stats/popgen.py
Lines changed: 194 additions & 1 deletion
diff --git a/‎sgkit/tests/test_popgen.py
Lines changed: 74 additions & 0 deletions b/‎sgkit/tests/test_popgen.py
Lines changed: 74 additions & 0 deletions
diff --git a/‎sgkit/tests/test_utils.py
Lines changed: 23 additions & 0 deletions b/‎sgkit/tests/test_utils.py
Lines changed: 23 additions & 0 deletions
@@ -48,6 +48,7 @@ Methods
    divergence
    diversity
    Fst
+   Garud_h
    gwas_linear_regression
    hardy_weinberg_test
    regenie
@@ -90,6 +91,10 @@ Variables
     variables.pc_relate_phi_spec
     variables.sample_id_spec
     variables.sample_pcs_spec
+    variables.stat_Garud_h1_spec
+    variables.stat_Garud_h12_spec
+    variables.stat_Garud_h123_spec
+    variables.stat_Garud_h2_h1_spec
     variables.traits_spec
     variables.variant_allele_spec
     variables.variant_allele_count_spec
 
@@ -19,7 +19,7 @@
 from .stats.hwe import hardy_weinberg_test
 from .stats.pc_relate import pc_relate
 from .stats.pca import pca
-from .stats.popgen import Fst, Tajimas_D, divergence, diversity, pbs
+from .stats.popgen import Fst, Garud_h, Tajimas_D, divergence, diversity, pbs
 from .stats.preprocessing import filter_partial_calls
 from .stats.regenie import regenie
 from .testing import simulate_genotype_call_dataset
@@ -45,6 +45,7 @@
     "diversity",
     "divergence",
     "Fst",
+    "Garud_h",
     "Tajimas_D",
     "pbs",
     "pc_relate",
 
@@ -1,13 +1,19 @@
+import collections
 from typing import Hashable, Optional
 
 import dask.array as da
 import numpy as np
 from numba import guvectorize
 from xarray import Dataset
 
+from sgkit import to_haplotype_calls
 from sgkit.stats.utils import assert_array_shape
 from sgkit.typing import ArrayLike
-from sgkit.utils import conditional_merge_datasets, define_variable_if_absent
+from sgkit.utils import (
+    conditional_merge_datasets,
+    define_variable_if_absent,
+    hash_columns,
+)
 from sgkit.window import has_windows, window_statistic
 
 from .. import variables
@@ -682,3 +688,190 @@ def pbs(
         {variables.stat_pbs: (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)}
     )
     return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
+
+
+# Length of the statistics axis produced by the Garud H helpers below;
+# the entries are ordered H1, H12, H123, H2/H1.
+N_GARUD_H_STATS = 4  # H1, H12, H123, H2/H1
+
+
+def _Garud_h(k: ArrayLike) -> ArrayLike:
+    """Compute the Garud H statistics from an array of haplotype identifiers.
+
+    Parameters
+    ----------
+    k
+        1D array holding one identifier (e.g. a hash value) per haplotype.
+        Equal values are counted as copies of the same haplotype.
+
+    Returns
+    -------
+    Array of four floats ``[H1, H12, H123, H2/H1]``. All four values are NaN
+    when ``k`` is empty, because the statistics are undefined without any
+    haplotypes.
+    """
+    n = k.shape[0]
+    if n == 0:
+        # Without haplotypes there is no most-frequent haplotype, so the
+        # ``f[0]`` lookup below would raise IndexError. Report "undefined".
+        return np.array([np.nan, np.nan, np.nan, np.nan])
+
+    # find haplotype counts (sorted in descending order)
+    counts = sorted(collections.Counter(k.tolist()).values(), reverse=True)
+    counts = np.array(counts)
+
+    # find haplotype frequencies
+    f = counts / n
+
+    # compute H1: sum of squared haplotype frequencies
+    h1 = np.sum(f ** 2)
+
+    # compute H12: as H1, but the two most frequent haplotypes are pooled
+    h12 = np.sum(f[:2]) ** 2 + np.sum(f[2:] ** 2)
+
+    # compute H123: as H1, but the three most frequent haplotypes are pooled
+    h123 = np.sum(f[:3]) ** 2 + np.sum(f[3:] ** 2)
+
+    # compute H2/H1, where H2 is H1 without the most frequent haplotype
+    h2 = h1 - f[0] ** 2
+    h2_h1 = h2 / h1
+
+    return np.array([h1, h12, h123, h2_h1])
+
+
+def _Garud_h_cohorts(
+    ht: ArrayLike, sample_cohort: ArrayLike, n_cohorts: int
+) -> ArrayLike:
+    """Compute the Garud H statistics separately for every cohort.
+
+    The haplotypes in ``ht`` are reduced to hash values with ``hash_columns``;
+    ``sample_cohort`` is then used as a mask over those hash values to pick
+    the haplotypes belonging to each cohort index in ``range(n_cohorts)``.
+
+    Returns an array of shape ``(n_cohorts, N_GARUD_H_STATS)`` holding one
+    row of statistics per cohort.
+    """
+    haplotype_hashes = hash_columns(ht)
+    stats = np.empty((n_cohorts, N_GARUD_H_STATS))
+    # each ``row`` is a view into ``stats``, so assigning fills it in place
+    for cohort, row in enumerate(stats):
+        row[:] = _Garud_h(haplotype_hashes[sample_cohort == cohort])
+    return stats
+
+
+def Garud_h(
+    ds: Dataset,
+    *,
+    call_haplotype: Hashable = variables.call_haplotype,
+    merge: bool = True,
+) -> Dataset:
+    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
+    of soft sweeps, as defined in Garud et al. (2015).
+
+    By default, values of this statistic are calculated across all variants.
+    To compute values in windows, call :func:`window` before calling
+    this function.
+
+    The dataset must contain a ``sample_cohort`` variable giving the 0-based
+    cohort index of every sample; statistics are computed per cohort.
+
+    Parameters
+    ----------
+    ds
+        Genotype call dataset.
+    call_haplotype
+        Call haplotype variable to use or calculate. Defined by
+        :data:`sgkit.variables.call_haplotype_spec`.
+        If the variable is not present in ``ds``, it will be computed
+        using :func:`to_haplotype_calls`.
+    merge
+        If True (the default), merge the input dataset and the computed
+        output variables into a single dataset, otherwise return only
+        the computed output variables.
+        See :ref:`dataset_merge` for more details.
+
+    Returns
+    -------
+    A dataset containing the following variables:
+
+    - `stat_Garud_h1` (windows, cohorts): Garud H1 statistic.
+        Defined by :data:`sgkit.variables.stat_Garud_h1_spec`.
+
+    - `stat_Garud_h12` (windows, cohorts): Garud H12 statistic.
+        Defined by :data:`sgkit.variables.stat_Garud_h12_spec`.
+
+    - `stat_Garud_h123` (windows, cohorts): Garud H123 statistic.
+        Defined by :data:`sgkit.variables.stat_Garud_h123_spec`.
+
+    - `stat_Garud_h2_h1` (windows, cohorts): Garud H2/H1 statistic.
+        Defined by :data:`sgkit.variables.stat_Garud_h2_h1_spec`.
+
+    If the dataset has not been windowed, the variables only have the
+    ``cohorts`` dimension.
+
+    Raises
+    ------
+    NotImplementedError
+        If the dataset is not diploid.
+
+    Warnings
+    --------
+    This function is currently only implemented for diploid datasets.
+
+    Examples
+    --------
+
+    >>> import numpy as np
+    >>> import sgkit as sg
+    >>> import xarray as xr
+    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)
+
+    >>> # Divide samples into two cohorts
+    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
+    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
+
+    >>> # Divide into windows of size three (variants)
+    >>> ds = sg.window(ds, size=3, step=3)
+
+    >>> gh = sg.Garud_h(ds)
+    >>> gh["stat_Garud_h1"].values # doctest: +NORMALIZE_WHITESPACE
+    array([[0.25 , 0.375],
+        [0.375, 0.375]])
+    >>> gh["stat_Garud_h12"].values # doctest: +NORMALIZE_WHITESPACE
+    array([[0.375, 0.625],
+        [0.625, 0.625]])
+    >>> gh["stat_Garud_h123"].values # doctest: +NORMALIZE_WHITESPACE
+    array([[0.625, 1.   ],
+        [1.   , 1.   ]])
+    >>> gh["stat_Garud_h2_h1"].values # doctest: +NORMALIZE_WHITESPACE
+    array([[0.75      , 0.33333333],
+        [0.33333333, 0.33333333]])
+    """
+
+    if ds.dims["ploidy"] != 2:
+        raise NotImplementedError("Garud H only implemented for diploid genotypes")
+
+    ds = define_variable_if_absent(
+        ds, variables.call_haplotype, call_haplotype, to_haplotype_calls
+    )
+    variables.validate(ds, {call_haplotype: variables.call_haplotype_spec})
+
+    ht = ds[call_haplotype]
+
+    # convert sample cohorts to haplotype layout: every sample contributes
+    # two consecutive haplotypes, so its cohort index is repeated twice
+    sc = ds.sample_cohort.values
+    hsc = np.stack((sc, sc), axis=1).ravel()  # TODO: assumes diploid
+    n_cohorts = sc.max() + 1  # 0-based indexing
+
+    if has_windows(ds):
+        gh = window_statistic(
+            ht,
+            lambda block: _Garud_h_cohorts(block, hsc, n_cohorts),
+            ds.window_start.values,
+            ds.window_stop.values,
+            dtype=np.float64,
+            # first chunks dimension is windows, computed in window_statistic
+            chunks=(-1, n_cohorts, N_GARUD_H_STATS),
+            new_axis=2,  # 2d -> 3d
+        )
+        n_windows = ds.window_start.shape[0]
+        assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
+        new_ds = Dataset(
+            {
+                variables.stat_Garud_h1: (
+                    ("windows", "cohorts"),
+                    gh[:, :, 0],
+                ),
+                variables.stat_Garud_h12: (
+                    ("windows", "cohorts"),
+                    gh[:, :, 1],
+                ),
+                variables.stat_Garud_h123: (
+                    ("windows", "cohorts"),
+                    gh[:, :, 2],
+                ),
+                variables.stat_Garud_h2_h1: (
+                    ("windows", "cohorts"),
+                    gh[:, :, 3],
+                ),
+            }
+        )
+    else:
+        # TODO: note this materializes all the data, so windowless should be discouraged/not supported
+        ht = ht.values
+
+        gh = _Garud_h_cohorts(ht, sample_cohort=hsc, n_cohorts=n_cohorts)
+        assert_array_shape(gh, n_cohorts, N_GARUD_H_STATS)
+
+        # Name the dimension explicitly: a bare 1D array would be given a
+        # dimension named after the variable itself and be moved to coords.
+        new_ds = Dataset(
+            {
+                variables.stat_Garud_h1: ("cohorts", gh[:, 0]),
+                variables.stat_Garud_h12: ("cohorts", gh[:, 1]),
+                variables.stat_Garud_h123: ("cohorts", gh[:, 2]),
+                variables.stat_Garud_h2_h1: ("cohorts", gh[:, 3]),
+            }
+        )
+    return conditional_merge_datasets(ds, variables.validate(new_ds), merge)
@@ -9,13 +9,15 @@
 
 from sgkit import (
     Fst,
+    Garud_h,
     Tajimas_D,
     count_cohort_alleles,
     count_variant_alleles,
     create_genotype_call_dataset,
     divergence,
     diversity,
     pbs,
+    simulate_genotype_call_dataset,
     variables,
 )
 from sgkit.window import window
@@ -399,3 +401,75 @@ def test_pbs__windowed(sample_size, n_cohorts, chunks):
         )
 
     np.testing.assert_allclose(stat_pbs[:-1], ska_pbs_value)
+
+
+@pytest.mark.parametrize(
+    "n_variants, n_samples, n_contigs, n_cohorts",
+    [(3, 5, 1, 1), (3, 5, 1, 2)],
+)
+@pytest.mark.parametrize("chunks", [(-1, -1), (2, -1)])
+def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, chunks):
+    """Compare non-windowed Garud H statistics against scikit-allel."""
+    # We can't use msprime since it doesn't generate diploid data, and Garud uses phased data
+    ds = simulate_genotype_call_dataset(
+        n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
+    )
+    ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))
+
+    # assign samples to cohorts in contiguous, roughly equal-sized groups
+    cohort_labels = []
+    for cohort, members in enumerate(np.array_split(ds.samples.values, n_cohorts)):
+        cohort_labels.append(np.full_like(members, cohort))
+    sample_cohorts = np.concatenate(cohort_labels)
+    ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
+
+    gh = Garud_h(ds)
+    # same order as the tuple returned by allel.garud_h
+    observed_stats = (
+        gh.stat_Garud_h1.values,
+        gh.stat_Garud_h12.values,
+        gh.stat_Garud_h123.values,
+        gh.stat_Garud_h2_h1.values,
+    )
+
+    # scikit-allel
+    for c in range(n_cohorts):
+        cohort_gt = ds.call_genotype.values[:, sample_cohorts == c, :]
+        ska_ha = allel.GenotypeArray(cohort_gt).to_haplotypes()
+        ska_h = allel.garud_h(ska_ha)
+
+        for observed, expected in zip(observed_stats, ska_h):
+            np.testing.assert_allclose(observed[c], expected)
+
+
+@pytest.mark.parametrize(
+    "n_variants, n_samples, n_contigs, n_cohorts",
+    [(9, 5, 1, 1), (9, 5, 1, 2)],
+)
+@pytest.mark.parametrize("chunks", [(-1, -1), (5, -1)])
+def test_Garud_h__windowed(n_variants, n_samples, n_contigs, n_cohorts, chunks):
+    """Compare windowed Garud H statistics against scikit-allel."""
+    ds = simulate_genotype_call_dataset(
+        n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
+    )
+    ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))
+
+    # assign samples to cohorts in contiguous, roughly equal-sized groups
+    cohort_labels = []
+    for cohort, members in enumerate(np.array_split(ds.samples.values, n_cohorts)):
+        cohort_labels.append(np.full_like(members, cohort))
+    sample_cohorts = np.concatenate(cohort_labels)
+    ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")
+
+    # non-overlapping windows; the same geometry is passed to scikit-allel
+    window_size = 3
+    ds = window(ds, size=window_size, step=window_size)
+
+    gh = Garud_h(ds)
+    # same order as the tuple returned by allel.moving_garud_h
+    observed_stats = (
+        gh.stat_Garud_h1.values,
+        gh.stat_Garud_h12.values,
+        gh.stat_Garud_h123.values,
+        gh.stat_Garud_h2_h1.values,
+    )
+
+    # scikit-allel
+    for c in range(n_cohorts):
+        cohort_gt = ds.call_genotype.values[:, sample_cohorts == c, :]
+        ska_ha = allel.GenotypeArray(cohort_gt).to_haplotypes()
+        ska_h = allel.moving_garud_h(ska_ha, size=window_size, step=window_size)
+
+        for observed, expected in zip(observed_stats, ska_h):
+            np.testing.assert_allclose(observed[:, c], expected)
@@ -13,6 +13,7 @@
     check_array_like,
     define_variable_if_absent,
     encode_array,
+    hash_columns,
     max_str_len,
     merge_datasets,
     split_array_chunks,
@@ -208,3 +209,25 @@ def test_split_array_chunks__raise_on_blocks_lte_0():
 def test_split_array_chunks__raise_on_n_lte_0():
     with pytest.raises(ValueError, match=r"Number of elements .* must be >= 0"):
         split_array_chunks(0, 0)
+
+
+@given(st.integers(1, 50), st.integers(2, 50))
+@settings(deadline=None)  # avoid problem with numba jit compilation
+def test_hash_columns(n_rows, n_cols):
+    """Check that hashing columns preserves how often each column occurs."""
+    # build a small pool of columns, then sample from it with replacement
+    # so that the resulting array contains repeated columns
+    pool = np.random.randint(-2, 10, size=(n_rows, n_cols // 2))
+    picked = np.random.choice(pool.shape[1], n_cols, replace=True)
+    x = pool[:, picked]
+
+    # exact method: multiplicity of every column of x
+    _, exact_inverse, exact_counts = np.unique(
+        x, axis=1, return_inverse=True, return_counts=True
+    )
+    expected = exact_counts[exact_inverse]
+
+    # hashed method: multiplicity of every column's hash value
+    hashes = hash_columns(x)
+    _, hash_inverse, hash_counts = np.unique(
+        hashes, return_inverse=True, return_counts=True
+    )
+    actual = hash_counts[hash_inverse]
+
+    # both ways of counting must agree column by column
+    np.testing.assert_equal(actual, expected)