From d84f307f556b7e37dc17ed4306f3818ec486e21c Mon Sep 17 00:00:00 2001
From: Gilbert Han <GilbertHan1011@gmail.com>
Date: Wed, 3 Dec 2025 16:50:36 +0800
Subject: [PATCH 1/4] add score_per_million to merge_peak

---
 snapatac2-core/src/utils/mod.rs | 18 ++++++++++++++++++
 snapatac2/tools/_call_peaks.py  |  3 ++-
 src/call_peaks.rs               |  7 ++++++-
 3 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/snapatac2-core/src/utils/mod.rs b/snapatac2-core/src/utils/mod.rs
index bbd5ff452..e3ca947d0 100644
--- a/snapatac2-core/src/utils/mod.rs
+++ b/snapatac2-core/src/utils/mod.rs
@@ -54,6 +54,24 @@ pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes)
     peak
 }
 
+fn score_per_million(mut peaks: Vec<NarrowPeak>) -> Result<Vec<NarrowPeak>> {
+
+    let total_signal: f64 = peaks.iter().filter_map(|p| p.p_value).sum();
+
+    if total_signal == 0.0 {
+        // Prevent division by zero; just return peaks as-is.
+        return Ok(peaks);
+    }
+    let factor = 1_000_000.0 / total_signal;
+    for peak in &mut peaks {
+        if let Some(score) = peak.p_value.as_mut() {
+            *score *= factor;
+        }
+    }
+
+    Ok(peaks)
+}
+
 #[derive(Debug, Clone, Copy)]
 pub enum Compression {
     Gzip,
diff --git a/snapatac2/tools/_call_peaks.py b/snapatac2/tools/_call_peaks.py
index 600ae13f9..e1741be48 100644
--- a/snapatac2/tools/_call_peaks.py
+++ b/snapatac2/tools/_call_peaks.py
@@ -194,6 +194,7 @@ def merge_peaks(
     peaks: dict[str, 'polars.DataFrame'],
     chrom_sizes: dict[str, int] | Genome,
     half_width: int = 250,
+    normalize: bool = False,
 ) -> 'polars.DataFrame':
     """Merge peaks from different groups.
 
@@ -232,7 +233,7 @@ def merge_peaks(
     import polars as pl
     chrom_sizes = chrom_sizes.chrom_sizes if isinstance(chrom_sizes, Genome) else chrom_sizes
     peaks = { k: pl.from_pandas(v) if isinstance(v, pd.DataFrame) else v for k, v in peaks.items()}
-    return _snapatac2.py_merge_peaks(peaks, chrom_sizes, half_width)
+    return _snapatac2.py_merge_peaks(peaks, chrom_sizes, half_width, normalize)
 
 def _par_map(mapper, args, nprocs):
     import time
diff --git a/src/call_peaks.rs b/src/call_peaks.rs
index ff12c6cd1..c3c204285 100644
--- a/src/call_peaks.rs
+++ b/src/call_peaks.rs
@@ -35,6 +35,7 @@ pub fn py_merge_peaks<'py>(
     peaks: HashMap<String, PyDataFrame>,
     chrom_sizes: HashMap<String, u64>,
     half_width: u64,
+    normalize: bool,
 ) -> Result<PyDataFrame> {
     let peak_list: Vec<_> = peaks
         .into_iter()
@@ -43,7 +44,11 @@ pub fn py_merge_peaks<'py>(
             Ok((key, ps))
         })
         .collect::<Result<_>>()?;
-
+    if normalize{
+        for (_, peaks) in peak_list.iter_mut() {
+            *peaks = score_per_million(peaks.clone())?;
+        }
+    }
     let chrom_sizes = chrom_sizes.into_iter().collect();
     let peaks: Vec<_> = merge_peaks(peak_list.iter().flat_map(|x| x.1.clone()), half_width)
         .flatten()

From 830140aafcd0ba5814ce4ceb6df03fee5e4e87cb Mon Sep 17 00:00:00 2001
From: Gilbert Han <GilbertHan1011@gmail.com>
Date: Wed, 3 Dec 2025 17:17:49 +0800
Subject: [PATCH 2/4] add score per million in merge peaks

---
 snapatac2/tools/_call_peaks.py | 2 ++
 src/call_peaks.rs              | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/snapatac2/tools/_call_peaks.py b/snapatac2/tools/_call_peaks.py
index e1741be48..345e65e92 100644
--- a/snapatac2/tools/_call_peaks.py
+++ b/snapatac2/tools/_call_peaks.py
@@ -219,6 +219,8 @@ def merge_peaks(
         chromosome sizes will be obtained from the genome.
     half_width
         Half width of the merged peaks.
+    normalize
+        Score per million normalization. Maybe useful when sequencing depth varies, DOI: 10.1126/science.aav1898.
 
     Returns
     -------
diff --git a/src/call_peaks.rs b/src/call_peaks.rs
index c3c204285..f97636fc4 100644
--- a/src/call_peaks.rs
+++ b/src/call_peaks.rs
@@ -9,7 +9,7 @@ use pyo3::ffi::c_str;
 use snapatac2_core::utils::{self, Compression};
 use snapatac2_core::{
     preprocessing::Fragment,
-    utils::{clip_peak, merge_peaks, open_file_for_write},
+    utils::{clip_peak, merge_peaks, open_file_for_write,score_per_million},
     SnapData,
 };
 

From a54aea289467626b8d1e1d85dbb9e8aff508c952 Mon Sep 17 00:00:00 2001
From: Gilbert Han <GilbertHan1011@gmail.com>
Date: Wed, 3 Dec 2025 18:02:51 +0800
Subject: [PATCH 3/4] minor

---
 snapatac2-core/src/utils/mod.rs |  2 +-
 src/call_peaks.rs               | 13 +++++---
 test_script/test.py             | 55 +++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 test_script/test.py

diff --git a/snapatac2-core/src/utils/mod.rs b/snapatac2-core/src/utils/mod.rs
index e3ca947d0..066a7828a 100644
--- a/snapatac2-core/src/utils/mod.rs
+++ b/snapatac2-core/src/utils/mod.rs
@@ -54,7 +54,7 @@ pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes)
     peak
 }
 
-fn score_per_million(mut peaks: Vec<NarrowPeak>) -> Result<Vec<NarrowPeak>> {
+pub fn score_per_million(mut peaks: Vec<NarrowPeak>) -> Result<Vec<NarrowPeak>> {
 
     let total_signal: f64 = peaks.iter().filter_map(|p| p.p_value).sum();
 
diff --git a/src/call_peaks.rs b/src/call_peaks.rs
index f97636fc4..2ef9a3a9e 100644
--- a/src/call_peaks.rs
+++ b/src/call_peaks.rs
@@ -44,11 +44,14 @@ pub fn py_merge_peaks<'py>(
             Ok((key, ps))
         })
         .collect::<Result<_>>()?;
-    if normalize{
-        for (_, peaks) in peak_list.iter_mut() {
-            *peaks = score_per_million(peaks.clone())?;
-        }
-    }
+    let peak_list = if normalize {
+        peak_list
+            .into_iter()
+            .map(|(key, peaks)| Ok((key, score_per_million(peaks)?)))
+            .collect::<Result<Vec<_>>>()?
+    } else {
+        peak_list
+    };
     let chrom_sizes = chrom_sizes.into_iter().collect();
     let peaks: Vec<_> = merge_peaks(peak_list.iter().flat_map(|x| x.1.clone()), half_width)
         .flatten()
diff --git a/test_script/test.py b/test_script/test.py
new file mode 100644
index 000000000..b99e046fc
--- /dev/null
+++ b/test_script/test.py
@@ -0,0 +1,55 @@
+import snapatac2 as snap
+import polars as pl
+narrowpeak_cols = [
+    "chrom", "start", "end", "name", "score", 
+    "strand", "signal_value", "p_value", "q_value", "peak"
+]
+schema_types = {
+    "chrom": pl.String,
+    "start": pl.UInt64, 
+    "end": pl.UInt64,
+    "score": pl.UInt16,  # <--- THIS IS THE FIX
+    "peak": pl.UInt64    # Good practice to define this too
+}
+peak1 = pl.read_csv(
+            "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-1_ATAC3_peaks.narrowPeak",
+            separator="\t",
+            has_header=False,
+            comment_prefix="#",
+            ignore_errors=True, # Skip malformed lines like track lines
+            new_columns=narrowpeak_cols,
+            schema_overrides=schema_types,
+        )
+peak2 = pl.read_csv(
+            "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-2_ATAC3_peaks.narrowPeak",
+            separator="\t",
+            has_header=False,
+            comment_prefix="#",
+            ignore_errors=True, # Skip malformed lines like track lines
+            new_columns=narrowpeak_cols,
+            schema_overrides=schema_types,
+        )
+def clean_narrowpeak(df: pl.DataFrame) -> pl.DataFrame:
+    return df.with_columns([
+        # 1. Clip score to 0-1000 (standard narrowPeak limit) to prevent overflow
+        pl.col("score").clip(0, 1000).cast(pl.UInt16),
+        
+        # 2. Ensure coordinates are UInt64
+        pl.col("start").cast(pl.UInt64),
+        pl.col("end").cast(pl.UInt64),
+        
+        # 3. Ensure chrom is String
+        pl.col("chrom").cast(pl.String)
+    ])
+
+# Apply the cleaning function to your existing loaded dataframes
+peak1 = clean_narrowpeak(peak1)
+peak2 = clean_narrowpeak(peak2)
+
+# Re-create the dictionary
+peaks_dict = {
+    "B1-1": peak1,
+    "B1-2": peak2,
+}
+peak_merge = snap.tl.merge_peaks(peaks_dict, snap.genome.hg38)
+peak_merge
\ No newline at end of file

From e58c63dbda36d08697cd7f1e593c17d20d61fb2b Mon Sep 17 00:00:00 2001
From: Gilbert Han <GilbertHan1011@gmail.com>
Date: Wed, 3 Dec 2025 18:03:32 +0800
Subject: [PATCH 4/4] minor

---
 test_script/test.py | 55 ---------------------------------------------
 1 file changed, 55 deletions(-)
 delete mode 100644 test_script/test.py

diff --git a/test_script/test.py b/test_script/test.py
deleted file mode 100644
index b99e046fc..000000000
--- a/test_script/test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import snapatac2 as snap
-import polars as pl
-narrowpeak_cols = [
-    "chrom", "start", "end", "name", "score", 
-    "strand", "signal_value", "p_value", "q_value", "peak"
-]
-schema_types = {
-    "chrom": pl.String,
-    "start": pl.UInt64, 
-    "end": pl.UInt64,
-    "score": pl.UInt16,  # <--- THIS IS THE FIX
-    "peak": pl.UInt64    # Good practice to define this too
-}
-peak1 = pl.read_csv(
-            "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-1_ATAC3_peaks.narrowPeak",
-            separator="\t",
-            has_header=False,
-            comment_prefix="#",
-            ignore_errors=True, # Skip malformed lines like track lines
-            new_columns=narrowpeak_cols,
-            schema_overrides=schema_types,
-        )
-peak2 = pl.read_csv(
-            "/storage/zhangkaiLab/hanlitian/macrophage/script/utils/merge_peaks/test_data/B1-2_ATAC3_peaks.narrowPeak",
-            separator="\t",
-            has_header=False,
-            comment_prefix="#",
-            ignore_errors=True, # Skip malformed lines like track lines
-            new_columns=narrowpeak_cols,
-            schema_overrides=schema_types,
-        )
-def clean_narrowpeak(df: pl.DataFrame) -> pl.DataFrame:
-    return df.with_columns([
-        # 1. Clip score to 0-1000 (standard narrowPeak limit) to prevent overflow
-        pl.col("score").clip(0, 1000).cast(pl.UInt16),
-        
-        # 2. Ensure coordinates are UInt64
-        pl.col("start").cast(pl.UInt64),
-        pl.col("end").cast(pl.UInt64),
-        
-        # 3. Ensure chrom is String
-        pl.col("chrom").cast(pl.String)
-    ])
-
-# Apply the cleaning function to your existing loaded dataframes
-peak1 = clean_narrowpeak(peak1)
-peak2 = clean_narrowpeak(peak2)
-
-# Re-create the dictionary
-peaks_dict = {
-    "B1-1": peak1,
-    "B1-2": peak2,
-}
-peak_merge = snap.tl.merge_peaks(peaks_dict, snap.genome.hg38)
-peak_merge
\ No newline at end of file