Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions adelie/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,3 +501,148 @@ def snp_phased_ancestry(
"group_sizes": group_sizes,
"penalty": penalty,
}


def snp_combine_r(
n: int,
s: int,
A: int,
*,
K: int =1,
glm: str ="gaussian",
sparsity: float =0.95,
one_ratio: float =0.25,
two_ratio: float =0.05,
zero_penalty: float =0,
snr: float =1,
seed: int =0,
):
"""Creates a SNP unphased, ancestry dataset.

- The groups and group sizes are generated randomly
such that ``G`` groups are created and the sum of the group sizes is ``p``.
- The calldata matrix ``X`` has sparsity ratio
``1 - one_ratio - two_ratio``
where ``one_ratio`` of the entries are randomly set to ``1``
and ``two_ratio`` are randomly set to ``2``.
- The ancestry matrix randomly generates integers in the range ``[0, A)``.
- The true coefficients :math:`\\beta` is such that ``sparsity`` proportion of the entries are set to :math:`0`.
- The response ``y`` is generated from the GLM specified by ``glm``.
- The penalty factors are by default set to ``np.sqrt(group_sizes)``,
however if ``zero_penalty > 0``, a random set of penalties will be set to zero,
in which case, ``penalty`` is rescaled such that the :math:`\\ell_2` norm squared is ``p``.

Parameters
----------
n : int
Number of data points.
s : int
Number of SNPs.
A : int
Number of ancestries.
K : int, optional
Number of classes for multi-response GLMs.
Default is ``1``.
glm : str, optional
GLM name.
It must be one of the following:

- ``"binomial"``
- ``"cox"``
- ``"gaussian"``
- ``"multigaussian"``
- ``"multinomial"``
- ``"poisson"``

Default is ``"gaussian"``.
sparsity : float, optional
Proportion of :math:`\\beta` entries to be zeroed out.
Default is ``0.95``.
one_ratio : float, optional
Proportion of the entries of ``X`` that is set to ``1``.
Default is ``0.25``.
two_ratio : float, optional
Proportion of the entries of ``X`` that is set to ``2``.
Default is ``0.05``.
zero_penalty : float, optional
Proportion of ``penalty`` entries to be zeroed out.
Default is ``0``.
snr : float, optional
Signal-to-noise ratio.
Default is ``1``.
seed : int, optional
Random seed.
Default is ``0``.

Returns
-------
data : dict
A dictionary containing the generated data:

- ``"X"``: feature matrix.
- ``"ancestries"``: ancestry label of the same shape as ``X``.
- ``"y"``: response vector.
- ``"groups"``: mapping of group index to the starting column index of ``X``.
- ``"group_sizes"``: mapping of group index to the group size.
- ``"penalty"``: penalty factor for each group index.
"""
assert n >= 1
assert s >= 1
assert A >= 1
assert sparsity >= 0 and sparsity <= 1
assert one_ratio >= 0 and one_ratio <= 1
assert two_ratio >= 0 and two_ratio <= 1
assert zero_penalty >= 0 and zero_penalty <= 1
assert snr > 0
assert seed >= 0

np.random.seed(seed)

nnz_ratio = one_ratio + two_ratio
one_ratio = one_ratio / nnz_ratio
two_ratio = two_ratio / nnz_ratio
nnz = int(nnz_ratio * n * s)
nnz_indices = np.random.choice(n * s, nnz, replace=False)
nnz_indices = np.random.permutation(nnz_indices)
one_indices = nnz_indices[:int(one_ratio * nnz)]
two_indices = nnz_indices[int(one_ratio * nnz):]
X = np.zeros((n, s), dtype=np.int8)
X.ravel()[one_indices] = 1
X.ravel()[two_indices] = 2

# Generate ancestries for each haplotype at each SNP position
# This is completely independent of the mutations
ancestries = np.zeros((n, 2 * s), dtype=np.int8)
ancestries.ravel()[:] = np.random.choice(A, n * 2 * s, replace=True)

# Each SNP has (1 + A) columns in the dense matrix:
# - 1 column for calldata
# - A columns for ancestry dosages
groups = (1 + A) * np.arange(s)
group_sizes = np.full(s, 1 + A, dtype=int)

penalty = np.sqrt(group_sizes)
penalty[np.random.choice(s, int(zero_penalty * s), replace=False)] = 0
penalty /= np.linalg.norm(penalty) / np.sqrt(s * (1 + A))

beta = np.random.normal(0, 1, (s, K))
beta_nnz_indices = np.random.choice(s, int((1-sparsity) * s), replace=False)
X_sub = X[:, beta_nnz_indices]
beta_sub = beta[beta_nnz_indices]

eta = X_sub @ beta_sub
glm = _sample_y(
glm=glm,
eta=eta,
beta=beta_sub,
snr=snr,
)

return {
"X": np.asfortranarray(X),
"ancestries": np.asfortranarray(ancestries),
"glm": glm,
"groups": groups,
"group_sizes": group_sizes,
"penalty": penalty,
}
151 changes: 151 additions & 0 deletions adelie/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,154 @@ def write(
if error != "":
raise RuntimeError(error)
return total_bytes, benchmark


class snp_combine_r(core_io.IOSNPCombineR):
"""IO handler for SNP unphased, ancestry matrix.

A SNP unphased, ancestry matrix is a matrix
that combines unphased calldata and phased local ancestry information.

Let :math:`X \\in \\mathbb{R}^{n \\times s(1+A)}` denote such a matrix
where
:math:`n` is the number of samples,
:math:`s` is the number of SNPs,
and :math:`A` is the number of ancestries.
Every :math:`1+A` (contiguous) columns is called a
*SNP block* corresponding to a single SNP
with the following structure:
- The first column contains the SNP data (0, 1, or 2)
- The next :math:`A` columns contain the unphased ancestry dosages for each ancestry (0, 1, or 2)

For example, for SNP :math:`j`, the columns are:
- Column :math:`j(1+A)`: SNP data
- Columns :math:`j(1+A)+1` to :math:`j(1+A)+A`: Ancestry dosages

Parameters
----------
filename : str
File name containing the SNP data in ``.snpdat`` format.
read_mode : str, optional
Reading mode of the SNP data.
It must be one of the following:

- ``"file"``: reads the file using standard file IO.
This method is the most general and portable,
however, with large files, it is the slowest option.
- ``"mmap"``: reads the file using mmap.
This method is only supported on Linux and MacOS.
It is the most efficient way to read large files.

Default is ``"file"``.
"""
def __init__(
self,
filename: str,
read_mode: str ="file",
):
core_io.IOSNPCombineR.__init__(self, filename, read_mode)

def write(
self,
calldata: np.ndarray,
ancestries: np.ndarray,
A: int,
n_threads: int =1,
*,
selected_ancestries: np.ndarray = None,
):
"""Writes a dense SNP unphased, ancestry matrix to the file in ``.snpdat`` format.

.. note::
The calldata and ancestries matrices must not contain
any missing values.

Parameters
----------
calldata : (n, s) ndarray
SNP unphased calldata in dense format.
``calldata[i, j]`` is the data for individual ``i`` and SNP ``j``.
It must only contain values in :math:`\\{0,1,2\\}`.
ancestries : (n, 2*s) ndarray
Local ancestry information in dense format.
``ancestries[i, 2*j]`` and ``ancestries[i, 2*j+1]`` are the ancestries
for individual ``i`` and SNP ``j`` for each of the two haplotypes.
It must only contain values in :math:`\\{0,\\ldots, A-1\\}`.
A : int
Total number of ancestries present in ``ancestries`` (upper bound of labels).
n_threads : int, optional
Number of threads.
Default is ``1``.
selected_ancestries : ndarray of uint32, optional
If provided, restrict the ancestry dosage columns per SNP to these
ancestry labels (in the given order). The output block width per SNP
becomes ``1 + len(selected_ancestries)`` instead of ``1 + A``.

Returns
-------
total_bytes : int
Number of bytes written.
benchmark : dict
Dictionary of benchmark timings for each step of the serializer.
"""
if selected_ancestries is None:
(
total_bytes,
benchmark,
error,
) = core_io.IOSNPCombineR.write(self, calldata, ancestries, A, n_threads)
else:
selected_ancestries = np.asarray(selected_ancestries, dtype=np.uint32)
(
total_bytes,
benchmark,
error,
) = core_io.IOSNPCombineR.write(self, calldata, ancestries, A, selected_ancestries, n_threads)
if error != "":
raise RuntimeError(error)
return total_bytes, benchmark


class snp_combine_s(core_io.IOSNPCombineS):
"""IO handler for SNP both-ancestry matrix.

Combines:
- Mutated haplotype counts per ancestry (like phased ancestry summed across haps)
- Ancestry dosage counts per ancestry (like unphased ancestry dosages)

Shape: (n, s * (2*A)). For each SNP j:
- Columns j*(2*A) .. j*(2*A)+(A-1): number of mutated haplotypes labeled with ancestry a
- Columns j*(2*A)+A .. j*(2*A)+(2*A-1): ancestry dosages per ancestry a
"""
def __init__(
self,
filename: str,
read_mode: str ="file",
):
core_io.IOSNPCombineS.__init__(self, filename, read_mode)

def write(
self,
calldata: np.ndarray,
ancestries: np.ndarray,
A: int,
n_threads: int =1,
*,
selected_ancestries: np.ndarray = None,
):
if selected_ancestries is None:
(
total_bytes,
benchmark,
error,
) = core_io.IOSNPCombineS.write(self, calldata, ancestries, A, n_threads)
else:
selected_ancestries = np.asarray(selected_ancestries, dtype=np.uint32)
(
total_bytes,
benchmark,
error,
) = core_io.IOSNPCombineS.write(self, calldata, ancestries, A, selected_ancestries, n_threads)
if error != "":
raise RuntimeError(error)
return total_bytes, benchmark
87 changes: 87 additions & 0 deletions adelie/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,6 +1298,93 @@ def __init__(self):
return _snp_unphased()


def snp_combine_r(
io: io.snp_combine_r,
*,
n_threads: int =1,
dtype: Union[np.float32, np.float64] =np.float64,
):
"""Creates a SNP unphased, ancestry matrix.

The SNP unphased, ancestry matrix is a wrapper around
the corresponding IO handler :class:`adelie.io.snp_combine_r`
exposing some matrix operations.

.. note::
This matrix only works for naive method!

Parameters
----------
io : snp_phased_ancestry
IO handler for SNP unphased, ancestry data.
n_threads : int, optional
Number of threads.
Default is ``1``.
dtype : Union[float32, float64], optional
Underlying value type.
Default is ``np.float64``.

Returns
-------
wrap
Wrapper matrix object.

See Also
--------
adelie.io.snp_combine_r
adelie.adelie_core.matrix.MatrixNaiveSNPCombineR32
adelie.adelie_core.matrix.MatrixNaiveSNPCombineR64
"""
dispatcher = {
np.float64: core.matrix.MatrixNaiveSNPCombineR64,
np.float32: core.matrix.MatrixNaiveSNPCombineR32,
}
core_base = dispatcher[dtype]
py_base = PyMatrixNaiveBase

if not io.is_read:
io.read()

class _snp_combine_r(core_base, py_base):
def __init__(self):
self._io = io
core_base.__init__(self, self._io, n_threads)
py_base.__init__(self, n_threads=n_threads)

return _snp_combine_r()


def snp_combine_s(
io: io.snp_combine_s,
*,
n_threads: int =1,
dtype: Union[np.float32, np.float64] =np.float64,
):
"""Creates a SNP both-ancestry matrix.

For each SNP j, columns are divided into two blocks of A each:
- First A: mutated haplotype counts per ancestry
- Next A: ancestry dosage counts per ancestry
"""
dispatcher = {
np.float64: core.matrix.MatrixNaiveSNPCombineS64,
np.float32: core.matrix.MatrixNaiveSNPCombineS32,
}
core_base = dispatcher[dtype]
py_base = PyMatrixNaiveBase

if not io.is_read:
io.read()

class _snp_combine_s(core_base, py_base):
def __init__(self):
self._io = io
core_base.__init__(self, self._io, n_threads)
py_base.__init__(self, n_threads=n_threads)

return _snp_combine_s()


def sparse(
mat: Union[csc_matrix, csr_matrix],
*,
Expand Down
Loading