From d84f307f556b7e37dc17ed4306f3818ec486e21c Mon Sep 17 00:00:00 2001 From: Gilbert Han Date: Wed, 3 Dec 2025 16:50:36 +0800 Subject: [PATCH 1/4] add score_per_million to merge_peak --- snapatac2-core/src/utils/mod.rs | 18 ++++++++++++++++++ snapatac2/tools/_call_peaks.py | 3 ++- src/call_peaks.rs | 7 ++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/snapatac2-core/src/utils/mod.rs b/snapatac2-core/src/utils/mod.rs index bbd5ff452..e3ca947d0 100644 --- a/snapatac2-core/src/utils/mod.rs +++ b/snapatac2-core/src/utils/mod.rs @@ -54,6 +54,24 @@ pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes) peak } +fn score_per_million(mut peaks: Vec) -> Result> { + + let total_signal: f64 = peaks.iter().filter_map(|p| p.p_value).sum(); + + if total_signal == 0.0 { + // Prevent division by zero; just return peaks as-is. + return Ok(peaks); + } + let factor = 1_000_000.0 / total_signal; + for peak in &mut peaks { + if let Some(score) = peak.p_value.as_mut() { + *score *= factor; + } + } + + Ok(peaks) +} + #[derive(Debug, Clone, Copy)] pub enum Compression { Gzip, diff --git a/snapatac2/tools/_call_peaks.py b/snapatac2/tools/_call_peaks.py index 600ae13f9..e1741be48 100644 --- a/snapatac2/tools/_call_peaks.py +++ b/snapatac2/tools/_call_peaks.py @@ -194,6 +194,7 @@ def merge_peaks( peaks: dict[str, 'polars.DataFrame'], chrom_sizes: dict[str, int] | Genome, half_width: int = 250, + normalize: bool = False, ) -> 'polars.DataFrame': """Merge peaks from different groups. @@ -232,7 +233,7 @@ def merge_peaks( import polars as pl chrom_sizes = chrom_sizes.chrom_sizes if isinstance(chrom_sizes, Genome) else chrom_sizes peaks = { k: pl.from_pandas(v) if isinstance(v, pd.DataFrame) else v for k, v in peaks.items()} - return _snapatac2.py_merge_peaks(peaks, chrom_sizes, half_width) + return _snapatac2.py_merge_peaks(peaks, chrom_sizes, half_width, normalize) def _par_map(mapper, args, nprocs): import time diff --git a/src/call_peaks.rs b/src/call_peaks.rs index ff12c6cd1..c3c204285 100644 --- a/src/call_peaks.rs +++ b/src/call_peaks.rs @@ -35,6 +35,7 @@ pub fn py_merge_peaks<'py>( peaks: HashMap, chrom_sizes: HashMap, half_width: u64, + normalize: bool, ) -> Result { let peak_list: Vec<_> = peaks .into_iter() @@ -43,7 +44,11 @@ pub fn py_merge_peaks<'py>( Ok((key, ps)) }) .collect::>()?; - + if normalize{ + for (_, peaks) in peak_list.iter_mut() { + *peaks = score_per_million(peaks.clone())?; + } + } let chrom_sizes = chrom_sizes.into_iter().collect(); let peaks: Vec<_> = merge_peaks(peak_list.iter().flat_map(|x| x.1.clone()), half_width) .flatten() From 830140aafcd0ba5814ce4ceb6df03fee5e4e87cb Mon Sep 17 00:00:00 2001 From: Gilbert Han Date: Wed, 3 Dec 2025 17:17:49 +0800 Subject: [PATCH 2/4] add score per million in merge peaks --- snapatac2/tools/_call_peaks.py | 2 ++ src/call_peaks.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/snapatac2/tools/_call_peaks.py b/snapatac2/tools/_call_peaks.py index e1741be48..345e65e92 100644 --- a/snapatac2/tools/_call_peaks.py +++ b/snapatac2/tools/_call_peaks.py @@ -219,6 +219,8 @@ def merge_peaks( chromosome sizes will be obtained from the genome. half_width Half width of the merged peaks. + normalize + Score per million normalization. Maybe useful when sequencing depth varies, DOI: 10.1126/science.aav1898. Returns ------- diff --git a/src/call_peaks.rs b/src/call_peaks.rs index c3c204285..f97636fc4 100644 --- a/src/call_peaks.rs +++ b/src/call_peaks.rs @@ -9,7 +9,7 @@ use pyo3::ffi::c_str; use snapatac2_core::utils::{self, Compression}; use snapatac2_core::{ preprocessing::Fragment, - utils::{clip_peak, merge_peaks, open_file_for_write}, + utils::{clip_peak, merge_peaks, open_file_for_write,score_per_million}, SnapData, }; From a54aea289467626b8d1e1d85dbb9e8aff508c952 Mon Sep 17 00:00:00 2001 From: Gilbert Han Date: Wed, 3 Dec 2025 18:02:51 +0800 Subject: [PATCH 3/4] minor --- snapatac2-core/src/utils/mod.rs | 2 +- src/call_peaks.rs | 13 +++++--- test_script/test.py | 55 +++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 test_script/test.py diff --git a/snapatac2-core/src/utils/mod.rs b/snapatac2-core/src/utils/mod.rs index e3ca947d0..066a7828a 100644 --- a/snapatac2-core/src/utils/mod.rs +++ b/snapatac2-core/src/utils/mod.rs @@ -54,7 +54,7 @@ pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes) peak } -fn score_per_million(mut peaks: Vec) -> Result> { +pub fn score_per_million(mut peaks: Vec) -> Result> { let total_signal: f64 = peaks.iter().filter_map(|p| p.p_value).sum(); diff --git a/src/call_peaks.rs b/src/call_peaks.rs index f97636fc4..2ef9a3a9e 100644 --- a/src/call_peaks.rs +++ b/src/call_peaks.rs @@ -44,11 +44,14 @@ pub fn py_merge_peaks<'py>( Ok((key, ps)) }) .collect::>()?; - if normalize{ - for (_, peaks) in peak_list.iter_mut() { - *peaks = score_per_million(peaks.clone())?; - } - } + let peak_list = if normalize { + peak_list + .into_iter() + .map(|(key, peaks)| Ok((key, score_per_million(peaks)?))) + .collect::>>()? + } else { + peak_list + }; let chrom_sizes = chrom_sizes.into_iter().collect(); let peaks: Vec<_> = merge_peaks(peak_list.iter().flat_map(|x| x.1.clone()), half_width) .flatten() diff --git a/test_script/test.py b/test_script/test.py new file mode 100644 index 000000000..b99e046fc --- /dev/null +++ b/test_script/test.py @@ -0,0 +1,55 @@ +import snapatac2 as snap +import polars as pl +narrowpeak_cols = [ + "chrom", "start", "end", "name", "score", + "strand", "signal_value", "p_value", "q_value", "peak" +] +schema_types = { + "chrom": pl.String, + "start": pl.UInt64, + "end": pl.UInt64, + "score": pl.UInt16, # <--- THIS IS THE FIX + "peak": pl.UInt64 # Good practice to define this too +} +peak1 = pl.read_csv( + "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-1_ATAC3_peaks.narrowPeak", + separator="\t", + has_header=False, + comment_prefix="#", + ignore_errors=True, # Skip malformed lines like track lines + new_columns=narrowpeak_cols, + schema_overrides=schema_types, + ) +peak2 = pl.read_csv( + "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-2_ATAC3_peaks.narrowPeak", + separator="\t", + has_header=False, + comment_prefix="#", + ignore_errors=True, # Skip malformed lines like track lines + new_columns=narrowpeak_cols, + schema_overrides=schema_types, + ) +def clean_narrowpeak(df: pl.DataFrame) -> pl.DataFrame: + return df.with_columns([ + # 1. Clip score to 0-1000 (standard narrowPeak limit) to prevent overflow + pl.col("score").clip(0, 1000).cast(pl.UInt16), + + # 2. Ensure coordinates are UInt64 + pl.col("start").cast(pl.UInt64), + pl.col("end").cast(pl.UInt64), + + # 3. Ensure chrom is String + pl.col("chrom").cast(pl.String) + ]) + +# Apply the cleaning function to your existing loaded dataframes +peak1 = clean_narrowpeak(peak1) +peak2 = clean_narrowpeak(peak2) + +# Re-create the dictionary +peaks_dict = { + "B1-1": peak1, + "B1-2": peak2, +} +peak_merge = snap.tl.merge_peaks(peaks_dict, snap.genome.hg38) +peak_merge \ No newline at end of file From e58c63dbda36d08697cd7f1e593c17d20d61fb2b Mon Sep 17 00:00:00 2001 From: Gilbert Han Date: Wed, 3 Dec 2025 18:03:32 +0800 Subject: [PATCH 4/4] minor --- test_script/test.py | 55 --------------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 test_script/test.py diff --git a/test_script/test.py b/test_script/test.py deleted file mode 100644 index b99e046fc..000000000 --- a/test_script/test.py +++ /dev/null @@ -1,55 +0,0 @@ -import snapatac2 as snap -import polars as pl -narrowpeak_cols = [ - "chrom", "start", "end", "name", "score", - "strand", "signal_value", "p_value", "q_value", "peak" -] -schema_types = { - "chrom": pl.String, - "start": pl.UInt64, - "end": pl.UInt64, - "score": pl.UInt16, # <--- THIS IS THE FIX - "peak": pl.UInt64 # Good practice to define this too -} -peak1 = pl.read_csv( - "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-1_ATAC3_peaks.narrowPeak", - separator="\t", - has_header=False, - comment_prefix="#", - ignore_errors=True, # Skip malformed lines like track lines - new_columns=narrowpeak_cols, - schema_overrides=schema_types, - ) -peak2 = pl.read_csv( - "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-2_ATAC3_peaks.narrowPeak", - separator="\t", - has_header=False, - comment_prefix="#", - ignore_errors=True, # Skip malformed lines like track lines - new_columns=narrowpeak_cols, - schema_overrides=schema_types, - ) -def clean_narrowpeak(df: pl.DataFrame) -> pl.DataFrame: - return df.with_columns([ - # 1. Clip score to 0-1000 (standard narrowPeak limit) to prevent overflow - pl.col("score").clip(0, 1000).cast(pl.UInt16), - - # 2. Ensure coordinates are UInt64 - pl.col("start").cast(pl.UInt64), - pl.col("end").cast(pl.UInt64), - - # 3. Ensure chrom is String - pl.col("chrom").cast(pl.String) - ]) - -# Apply the cleaning function to your existing loaded dataframes -peak1 = clean_narrowpeak(peak1) -peak2 = clean_narrowpeak(peak2) - -# Re-create the dictionary -peaks_dict = { - "B1-1": peak1, - "B1-2": peak2, -} -peak_merge = snap.tl.merge_peaks(peaks_dict, snap.genome.hg38) -peak_merge \ No newline at end of file