From cda2e85a21150f6af0bf82ecbce1335b20263431 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 24 Jun 2026 08:25:10 -0700
Subject: [PATCH] BUG: DataFrame.combine_first loses precision for wide
 integers (GH#60128)

Align rows positionally and fill column-by-column from the original arrays
instead of reindexing self to the row union, which promoted integer columns
through float64 and lost precision for values outside its exactly-representable
range. Fully-covered columns now keep their dtype and exact values, including
with duplicate row or column labels.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 doc/source/whatsnew/v3.1.0.rst                |   1 +
 pandas/core/frame.py                          | 102 +++++++++++++++++-
 .../tests/frame/methods/test_combine_first.py |  50 +++++++--
 3 files changed, 142 insertions(+), 11 deletions(-)

diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst
index 10f24f01e7688..dc0ee9a146fa0 100644
--- a/doc/source/whatsnew/v3.1.0.rst
+++ b/doc/source/whatsnew/v3.1.0.rst
@@ -291,6 +291,7 @@ Conversion
 - Bug in :class:`DataFrame` constructor where ``NaT`` in a :class:`TimedeltaIndex` row was incorrectly inferred as ``datetime64`` instead of ``timedelta64`` (:issue:`23985`)
 - Bug in :class:`DataFrame` constructor where constructing from a list of uniform-dtype arrays (e.g. pyarrow, :class:`CategoricalDtype`, nullable dtypes) lost the dtype (:issue:`49593`)
 - Bug in :func:`pd.array` silently converting NaN to a nonsensical integer when given float data containing NaN and a NumPy integer dtype (:issue:`41724`)
+- Bug in :meth:`DataFrame.combine_first` where NumPy integers with absolute value greater than ``2**53`` would lose precision after the operation (:issue:`60128`)
 - Fixed :func:`pandas.array` to preserve mask information when converting NumPy masked arrays, converting masked values to missing values (:issue:`63879`)
 - Fixed bug in :meth:`DataFrame.from_records` where ``exclude`` was ignored when ``data`` was an iterator and ``nrows=0`` (:issue:`63774`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index a2f870367b6aa..a687012635e42 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -124,7 +124,10 @@
 )
 from pandas.core.accessor import Accessor
 from pandas.core.apply import reconstruct_and_relabel_result
-from pandas.core.array_algos.take import take_2d_multi
+from pandas.core.array_algos.take import (
+    take_2d_multi,
+    take_nd,
+)
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays import (
     BaseMaskedArray,
@@ -12364,17 +12367,106 @@ def combine_first(self, other: DataFrame) -> DataFrame:
         2  NaN  3.0  1.0
         """
 
-        def combiner(x: Series, y: Series):
-            # GH#60128 The combiner is supposed to preserve EA Dtypes.
-            return y if y.name not in self.columns else y.where(x.isna(), x)
+        from pandas.core.reshape.concat import concat
 
         if len(other) == 0:
             combined = self.reindex(
                 self.columns.append(other.columns.difference(self.columns)), axis=1
             )
             combined = combined.astype(other.dtypes)
+            return combined.__finalize__(self, method="combine_first")
+
+        # GH#60128 Align the rows positionally and fill column-by-column without
+        #  ever reindexing a column through a NA-incompatible cast. Taking values
+        #  by integer position from the original arrays (rather than reindexing
+        #  self to the union, which promotes int -> float) preserves integer
+        #  values that float64 cannot represent exactly.
+        new_columns = self.columns.union(other.columns, sort=False)
+        left: npt.NDArray[np.intp]
+        right: npt.NDArray[np.intp]
+        if self.index.equals(other.index):
+            new_index = self.index
+            left = right = np.arange(len(new_index), dtype=np.intp)
         else:
-            combined = self.combine(other, combiner, overwrite=False)
+            # join(how="outer") matches the row order produced by align; the
+            #  indexers map each result row to a position in self / other (-1
+            #  when absent) and handle duplicate labels positionally.
+            new_index, lidx, ridx = self.index.join(
+                other.index, how="outer", return_indexers=True
+            )
+            left = np.arange(len(new_index), dtype=np.intp) if lidx is None else lidx
+            right = np.arange(len(new_index), dtype=np.intp) if ridx is None else ridx
+
+        if len(new_columns) == 0 and len(new_index) == len(self.index):
+            # No data columns and no new rows (e.g. set_index left nothing to
+            #  combine): match align's order-preserving short-circuit.
+            return self.copy().__finalize__(self, method="combine_first")
+
+        def col_positions(columns) -> npt.NDArray[np.intp]:
+            # Position in `columns` of each result column, broadcasting the last
+            #  duplicate when `columns` has fewer copies than `new_columns`.
+            groups: dict[Hashable, list[int]] = {}
+            for pos, label in enumerate(columns):
+                groups.setdefault(label, []).append(pos)
+            seen: dict[Hashable, int] = {}
+            out = np.empty(len(new_columns), dtype=np.intp)
+            for i, label in enumerate(new_columns):
+                positions = groups.get(label)
+                if positions is None:
+                    out[i] = -1
+                else:
+                    nth = seen.get(label, 0)
+                    out[i] = positions[min(nth, len(positions) - 1)]
+                    seen[label] = nth + 1
+            return out
+
+        self_cols = col_positions(self.columns)
+        other_cols = col_positions(other.columns)
+
+        def combine_column(self_col: Series, other_col: Series) -> Series:
+            # Take `other` only where `self` is NA; keep every other `self` row
+            #  (including its own NA values, preserving their dtype). `concat`
+            #  of the two disjoint pieces yields the common dtype without
+            #  forcing a NA fill into either piece.
+            self_isna = self_col.isna().to_numpy()
+            self_na = left == -1
+            present = ~self_na
+            self_na[present] = self_isna[left[present]]
+            take_other = self_na & (right != -1)
+            if not take_other.any():
+                merged = self_col.take(left)
+            elif take_other.all():
+                merged = other_col.take(right)
+            else:
+                take_self = ~take_other
+                merged = concat(
+                    [self_col.take(left[take_self]), other_col.take(right[take_other])],
+                    ignore_index=True,
+                )
+                order = np.concatenate(
+                    [np.flatnonzero(take_self), np.flatnonzero(take_other)]
+                )
+                merged = merged.iloc[np.argsort(order, kind="stable")]
+            merged.index = new_index
+            return merged
+
+        result = {}
+        for i in range(len(new_columns)):
+            self_pos = self_cols[i]
+            other_pos = other_cols[i]
+            if self_pos != -1 and other_pos != -1:
+                result[i] = combine_column(
+                    self.iloc[:, self_pos], other.iloc[:, other_pos]
+                )
+            elif self_pos != -1:
+                values = take_nd(self.iloc[:, self_pos]._values, left)
+                result[i] = Series(values, index=new_index)
+            else:
+                values = take_nd(other.iloc[:, other_pos]._values, right)
+                result[i] = Series(values, index=new_index)
+
+        combined = self._constructor(result, index=new_index)
+        combined.columns = new_columns
 
         dtypes = {
             # Check for isinstance(..., (np.dtype, ExtensionDtype))
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index da4a240ed8691..ac8170478cc01 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -214,11 +214,14 @@ def test_combine_first_align_nan(self):
         assert res["b"].dtype == "int64"
 
         res = dfa.iloc[:0].combine_first(dfb)
-        exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
+        # GH#60128 "a" keeps self's datetime64 dtype and "b" stays int64
+        #  instead of being promoted through float
+        exp = DataFrame(
+            {"a": Series([pd.NaT, pd.NaT], dtype="datetime64[s]"), "b": [4, 5]},
+            columns=["a", "b"],
+        )
         tm.assert_frame_equal(res, exp)
-        # TODO: this must be datetime64
-        assert res["a"].dtype == "float64"
-        # TODO: this must be int64
+        assert res["a"].dtype == "datetime64[s]"
         assert res["b"].dtype == "int64"
 
     def test_combine_first_timezone(self, unit):
@@ -404,18 +407,30 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
     @pytest.mark.parametrize(
         "wide_val, dtype",
         (
+            (1666880195890293744, "uint64"),
             (1666880195890293744, "UInt64"),
+            (-1666880195890293744, "int64"),
             (-1666880195890293744, "Int64"),
         ),
     )
-    def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
-        # GH#60128
+    def test_combine_first_preserve_precision(self, wide_val, dtype):
+        # GH#60128 values outside float64's exactly-representable range must
+        #  not be promoted through float when filling unaligned rows
         df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype)
         df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype)
         result = df1.combine_first(df2)
         expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
+    def test_combine_first_preserve_precision_non_unique_index(self):
+        # GH#60128 precision is preserved even with duplicate row labels
+        wide_val = 1666880195890293744
+        df1 = DataFrame({"A": [wide_val, 5]}, index=[0, 0])
+        df2 = DataFrame({"A": [6, 7]}, index=[1, 1])
+        result = df1.combine_first(df2)
+        expected = DataFrame({"A": [wide_val, 5, 6, 7]}, index=[0, 0, 1, 1])
+        tm.assert_frame_equal(result, expected)
+
     def test_combine_first_non_unique_columns(self):
         # GH#29135
         df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
@@ -428,6 +443,14 @@ def test_combine_first_non_unique_columns(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_combine_first_duplicate_columns_new_column(self):
+        # GH#60128 self has duplicate columns and other introduces a new one
+        df1 = DataFrame([[1, 2, 3]], columns=["P", "Q", "Q"])
+        df2 = DataFrame([[4, 5]], columns=["P", "R"])
+        result = df1.combine_first(df2)
+        expected = DataFrame([[1, 2, 3, 5]], columns=["P", "Q", "Q", "R"])
+        tm.assert_frame_equal(result, expected)
+
 
 @pytest.mark.parametrize(
     "scalar1, scalar2",
@@ -595,3 +618,18 @@ def test_combine_first_preserve_column_order():
     result = df1.combine_first(df2)
     expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_combine_first_preserve_precision_mixed_columns():
+    # GH#60128 a wide-int column keeps its precision even alongside a column
+    #  that does pass through float
+    wide_val = 1666880195890293744
+    df1 = DataFrame({"a": [wide_val, 5], "b": [1.0, 2.0]})
+    df2 = DataFrame({"a": [6, 7, 8]})
+
+    result = df1.combine_first(df2)
+    expected = DataFrame(
+        {"a": [wide_val, 5, 8], "b": [1.0, 2.0, np.nan]},
+    )
+    tm.assert_frame_equal(result, expected)
+    assert result["a"].dtype == "int64"