JamesYang007 · davidbonet · Mar 10, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/adelie/data.py b/adelie/data.py
@@ -501,3 +501,148 @@ def snp_phased_ancestry(
         "group_sizes": group_sizes,
         "penalty": penalty,
     }
+
+
+def snp_combine_r(
+    n: int, 
+    s: int, 
+    A: int,
+    *,
+    K: int =1,
+    glm: str ="gaussian",
+    sparsity: float =0.95,
+    one_ratio: float =0.25,
+    two_ratio: float =0.05,
+    zero_penalty: float =0,
+    snr: float =1,
+    seed: int =0,
+):
+    """Creates a SNP unphased, ancestry dataset.
+
+    - The groups and group sizes are generated randomly
+      such that ``G`` groups are created and the sum of the group sizes is ``p``.
+    - The calldata matrix ``X`` has sparsity ratio 
+      ``1 - one_ratio - two_ratio``
+      where ``one_ratio`` of the entries are randomly set to ``1``
+      and ``two_ratio`` are randomly set to ``2``.
+    - The ancestry matrix randomly generates integers in the range ``[0, A)``.
+    - The true coefficients :math:`\\beta` is such that ``sparsity`` proportion of the entries are set to :math:`0`.
+    - The response ``y`` is generated from the GLM specified by ``glm``.
+    - The penalty factors are by default set to ``np.sqrt(group_sizes)``,
+      however if ``zero_penalty > 0``, a random set of penalties will be set to zero,
+      in which case, ``penalty`` is rescaled such that the :math:`\\ell_2` norm squared is ``p``.
+
+    Parameters
+    ----------
+    n : int
+        Number of data points.
+    s : int
+        Number of SNPs.
+    A : int
+        Number of ancestries.
+    K : int, optional
+        Number of classes for multi-response GLMs.
+        Default is ``1``.
+    glm : str, optional
+        GLM name.
+        It must be one of the following:
+
+            - ``"binomial"``
+            - ``"cox"``
+            - ``"gaussian"``
+            - ``"multigaussian"``
+            - ``"multinomial"``
+            - ``"poisson"``
+
+        Default is ``"gaussian"``.
+    sparsity : float, optional
+        Proportion of :math:`\\beta` entries to be zeroed out.
+        Default is ``0.95``.
+    one_ratio : float, optional
+        Proportion of the entries of ``X`` that is set to ``1``.
+        Default is ``0.25``.
+    two_ratio : float, optional
+        Proportion of the entries of ``X`` that is set to ``2``.
+        Default is ``0.05``.
+    zero_penalty : float, optional
+        Proportion of ``penalty`` entries to be zeroed out.
+        Default is ``0``.
+    snr : float, optional
+        Signal-to-noise ratio.
+        Default is ``1``.
+    seed : int, optional
+        Random seed.
+        Default is ``0``.
+
+    Returns
+    -------
+    data : dict
+        A dictionary containing the generated data:
+
+            - ``"X"``: feature matrix.
+            - ``"ancestries"``: ancestry label of the same shape as ``X``.
+            - ``"y"``: response vector.
+            - ``"groups"``: mapping of group index to the starting column index of ``X``.
+            - ``"group_sizes"``: mapping of group index to the group size.
+            - ``"penalty"``: penalty factor for each group index.
+    """
+    assert n >= 1
+    assert s >= 1
+    assert A >= 1
+    assert sparsity >= 0 and sparsity <= 1
+    assert one_ratio >= 0 and one_ratio <= 1
+    assert two_ratio >= 0 and two_ratio <= 1
+    assert zero_penalty >= 0 and zero_penalty <= 1
+    assert snr > 0
+    assert seed >= 0
+
+    np.random.seed(seed)
+
+    nnz_ratio = one_ratio + two_ratio
+    one_ratio = one_ratio / nnz_ratio
+    two_ratio = two_ratio / nnz_ratio
+    nnz = int(nnz_ratio * n * s)
+    nnz_indices = np.random.choice(n * s, nnz, replace=False)
+    nnz_indices = np.random.permutation(nnz_indices)
+    one_indices = nnz_indices[:int(one_ratio * nnz)]
+    two_indices = nnz_indices[int(one_ratio * nnz):]
+    X = np.zeros((n, s), dtype=np.int8)
+    X.ravel()[one_indices] = 1
+    X.ravel()[two_indices] = 2
+
+    # Generate ancestries for each haplotype at each SNP position
+    # This is completely independent of the mutations
+    ancestries = np.zeros((n, 2 * s), dtype=np.int8)
+    ancestries.ravel()[:] = np.random.choice(A, n * 2 * s, replace=True)
+
+    # Each SNP has (1 + A) columns in the dense matrix:
+    # - 1 column for calldata
+    # - A columns for ancestry dosages
+    groups = (1 + A) * np.arange(s)
+    group_sizes = np.full(s, 1 + A, dtype=int)
+
+    penalty = np.sqrt(group_sizes)
+    penalty[np.random.choice(s, int(zero_penalty * s), replace=False)] = 0
+    penalty /= np.linalg.norm(penalty) / np.sqrt(s * (1 + A))
+
+    beta = np.random.normal(0, 1, (s, K))
+    beta_nnz_indices = np.random.choice(s, int((1-sparsity) * s), replace=False)
+    X_sub = X[:, beta_nnz_indices]
+    beta_sub = beta[beta_nnz_indices]
+
+    eta = X_sub @ beta_sub 
+    glm = _sample_y(
+        glm=glm,
+        eta=eta,
+        beta=beta_sub,
+        snr=snr,
+    )
+
+    return {
+        "X": np.asfortranarray(X), 
+        "ancestries": np.asfortranarray(ancestries),
+        "glm": glm,
+        "groups": groups,
+        "group_sizes": group_sizes,
+        "penalty": penalty,
+    }
diff --git a/adelie/io.py b/adelie/io.py
@@ -192,3 +192,154 @@ def write(
         if error != "":
             raise RuntimeError(error)
         return total_bytes, benchmark
+
+
+class snp_combine_r(core_io.IOSNPCombineR):
+    """IO handler for SNP unphased, ancestry matrix.
+
+    A SNP unphased, ancestry matrix is a matrix 
+    that combines unphased calldata and phased local ancestry information.
+
+    Let :math:`X \\in \\mathbb{R}^{n \\times s(1+A)}` denote such a matrix
+    where
+    :math:`n` is the number of samples,
+    :math:`s` is the number of SNPs,
+    and :math:`A` is the number of ancestries.
+    Every :math:`1+A` (contiguous) columns is called a
+    *SNP block* corresponding to a single SNP
+    with the following structure:
+    - The first column contains the SNP data (0, 1, or 2)
+    - The next :math:`A` columns contain the unphased ancestry dosages for each ancestry (0, 1, or 2)
+
+    For example, for SNP :math:`j`, the columns are:
+    - Column :math:`j(1+A)`: SNP data
+    - Columns :math:`j(1+A)+1` to :math:`j(1+A)+A`: Ancestry dosages
+
+    Parameters
+    ----------
+    filename : str
+        File name containing the SNP data in ``.snpdat`` format.
+    read_mode : str, optional
+        Reading mode of the SNP data. 
+        It must be one of the following:
+
+            - ``"file"``: reads the file using standard file IO.
+              This method is the most general and portable,
+              however, with large files, it is the slowest option.
+            - ``"mmap"``: reads the file using mmap.
+              This method is only supported on Linux and MacOS.
+              It is the most efficient way to read large files.
+
+        Default is ``"file"``.
+    """
+    def __init__(
+        self,
+        filename: str,
+        read_mode: str ="file",
+    ):
+        core_io.IOSNPCombineR.__init__(self, filename, read_mode)
+
+    def write(
+        self, 
+        calldata: np.ndarray,
+        ancestries: np.ndarray,
+        A: int,
+        n_threads: int =1,
+        *,
+        selected_ancestries: np.ndarray = None,
+    ):
+        """Writes a dense SNP unphased, ancestry matrix to the file in ``.snpdat`` format.
+
+        .. note::
+            The calldata and ancestries matrices must not contain
+            any missing values.
+
+        Parameters
+        ----------
+        calldata : (n, s) ndarray
+            SNP unphased calldata in dense format.
+            ``calldata[i, j]`` is the data for individual ``i`` and SNP ``j``.
+            It must only contain values in :math:`\\{0,1,2\\}`.
+        ancestries : (n, 2*s) ndarray
+            Local ancestry information in dense format.
+            ``ancestries[i, 2*j]`` and ``ancestries[i, 2*j+1]`` are the ancestries 
+            for individual ``i`` and SNP ``j`` for each of the two haplotypes.
+            It must only contain values in :math:`\\{0,\\ldots, A-1\\}`.
+        A : int
+            Total number of ancestries present in ``ancestries`` (upper bound of labels).
+        n_threads : int, optional
+            Number of threads.
+            Default is ``1``.
+        selected_ancestries : ndarray of uint32, optional
+            If provided, restrict the ancestry dosage columns per SNP to these
+            ancestry labels (in the given order). The output block width per SNP
+            becomes ``1 + len(selected_ancestries)`` instead of ``1 + A``.
+
+        Returns
+        -------
+        total_bytes : int
+            Number of bytes written.
+        benchmark : dict
+            Dictionary of benchmark timings for each step of the serializer.
+        """
+        if selected_ancestries is None:
+            (
+                total_bytes, 
+                benchmark, 
+                error,
+            ) = core_io.IOSNPCombineR.write(self, calldata, ancestries, A, n_threads)
+        else:
+            selected_ancestries = np.asarray(selected_ancestries, dtype=np.uint32)
+            (
+                total_bytes,
+                benchmark,
+                error,
+            ) = core_io.IOSNPCombineR.write(self, calldata, ancestries, A, selected_ancestries, n_threads)
+        if error != "":
+            raise RuntimeError(error)
+        return total_bytes, benchmark
+
+
+class snp_combine_s(core_io.IOSNPCombineS):
+    """IO handler for SNP both-ancestry matrix.
+
+    Combines:
+    - Mutated haplotype counts per ancestry (like phased ancestry summed across haps)
+    - Ancestry dosage counts per ancestry (like unphased ancestry dosages)
+
+    Shape: (n, s * (2*A)). For each SNP j:
+    - Columns j*(2*A) .. j*(2*A)+(A-1): number of mutated haplotypes labeled with ancestry a
+    - Columns j*(2*A)+A .. j*(2*A)+(2*A-1): ancestry dosages per ancestry a
+    """
+    def __init__(
+        self,
+        filename: str,
+        read_mode: str ="file",
+    ):
+        core_io.IOSNPCombineS.__init__(self, filename, read_mode)
+
+    def write(
+        self,
+        calldata: np.ndarray,
+        ancestries: np.ndarray,
+        A: int,
+        n_threads: int =1,
+        *,
+        selected_ancestries: np.ndarray = None,
+    ):
+        if selected_ancestries is None:
+            (
+                total_bytes,
+                benchmark,
+                error,
+            ) = core_io.IOSNPCombineS.write(self, calldata, ancestries, A, n_threads)
+        else:
+            selected_ancestries = np.asarray(selected_ancestries, dtype=np.uint32)
+            (
+                total_bytes,
+                benchmark,
+                error,
+            ) = core_io.IOSNPCombineS.write(self, calldata, ancestries, A, selected_ancestries, n_threads)
+        if error != "":
+            raise RuntimeError(error)
+        return total_bytes, benchmark
diff --git a/adelie/matrix.py b/adelie/matrix.py
@@ -1298,6 +1298,93 @@ def __init__(self):
     return _snp_unphased()
 
 
+def snp_combine_r(
+    io: io.snp_combine_r,
+    *,
+    n_threads: int =1,
+    dtype: Union[np.float32, np.float64] =np.float64,
+):
+    """Creates a SNP unphased, ancestry matrix.
+
+    The SNP unphased, ancestry matrix is a wrapper around
+    the corresponding IO handler :class:`adelie.io.snp_combine_r`
+    exposing some matrix operations.
+
+    .. note::
+        This matrix only works for naive method!
+
+    Parameters
+    ----------
+    io : snp_phased_ancestry
+        IO handler for SNP unphased, ancestry data.
+    n_threads : int, optional
+        Number of threads.
+        Default is ``1``.
+    dtype : Union[float32, float64], optional
+        Underlying value type.
+        Default is ``np.float64``.
+
+    Returns
+    -------
+    wrap
+        Wrapper matrix object.
+
+    See Also
+    --------
+    adelie.io.snp_combine_r
+    adelie.adelie_core.matrix.MatrixNaiveSNPCombineR32
+    adelie.adelie_core.matrix.MatrixNaiveSNPCombineR64
+    """
+    dispatcher = {
+        np.float64: core.matrix.MatrixNaiveSNPCombineR64,
+        np.float32: core.matrix.MatrixNaiveSNPCombineR32,
+    }
+    core_base = dispatcher[dtype]
+    py_base = PyMatrixNaiveBase
+
+    if not io.is_read:
+        io.read()
+
+    class _snp_combine_r(core_base, py_base):
+        def __init__(self):
+            self._io = io
+            core_base.__init__(self, self._io, n_threads)
+            py_base.__init__(self, n_threads=n_threads)
+
+    return _snp_combine_r()
+
+
+def snp_combine_s(
+    io: io.snp_combine_s,
+    *,
+    n_threads: int =1,
+    dtype: Union[np.float32, np.float64] =np.float64,
+):
+    """Creates a SNP both-ancestry matrix.
+
+    For each SNP j, columns are divided into two blocks of A each:
+    - First A: mutated haplotype counts per ancestry
+    - Next  A: ancestry dosage counts per ancestry
+    """
+    dispatcher = {
+        np.float64: core.matrix.MatrixNaiveSNPCombineS64,
+        np.float32: core.matrix.MatrixNaiveSNPCombineS32,
+    }
+    core_base = dispatcher[dtype]
+    py_base = PyMatrixNaiveBase
+
+    if not io.is_read:
+        io.read()
+
+    class _snp_combine_s(core_base, py_base):
+        def __init__(self):
+            self._io = io
+            core_base.__init__(self, self._io, n_threads)
+            py_base.__init__(self, n_threads=n_threads)
+
+    return _snp_combine_s()
+
+
 def sparse(
     mat: Union[csc_matrix, csr_matrix],
     *,