From cda2e85a21150f6af0bf82ecbce1335b20263431 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 24 Jun 2026 08:25:10 -0700 Subject: [PATCH] BUG: DataFrame.combine_first loses precision for wide integers (GH#60128) Align rows positionally and fill column-by-column from the original arrays instead of reindexing self to the row union, which promoted integer columns through float64 and lost precision for values outside its exactly-representable range. Fully-covered columns now keep their dtype and exact values, including with duplicate row or column labels. Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/source/whatsnew/v3.1.0.rst | 1 + pandas/core/frame.py | 102 +++++++++++++++++- .../tests/frame/methods/test_combine_first.py | 50 +++++++-- 3 files changed, 142 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 10f24f01e7688..dc0ee9a146fa0 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -291,6 +291,7 @@ Conversion - Bug in :class:`DataFrame` constructor where ``NaT`` in a :class:`TimedeltaIndex` row was incorrectly inferred as ``datetime64`` instead of ``timedelta64`` (:issue:`23985`) - Bug in :class:`DataFrame` constructor where constructing from a list of uniform-dtype arrays (e.g. pyarrow, :class:`CategoricalDtype`, nullable dtypes) lost the dtype (:issue:`49593`) - Bug in :func:`pd.array` silently converting NaN to a nonsensical integer when given float data containing NaN and a NumPy integer dtype (:issue:`41724`) +- Bug in :meth:`DataFrame.combine_first` where NumPy integers with absolute value greater than ``2**53`` would lose precision after the operation (:issue:`60128`) - Fixed :func:`pandas.array` to preserve mask information when converting NumPy masked arrays, converting masked values to missing values (:issue:`63879`) - Fixed bug in :meth:`DataFrame.from_records` where ``exclude`` was ignored when ``data`` was an iterator and ``nrows=0`` (:issue:`63774`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a2f870367b6aa..a687012635e42 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -124,7 +124,10 @@ ) from pandas.core.accessor import Accessor from pandas.core.apply import reconstruct_and_relabel_result -from pandas.core.array_algos.take import take_2d_multi +from pandas.core.array_algos.take import ( + take_2d_multi, + take_nd, +) from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ( BaseMaskedArray, @@ -12364,17 +12367,106 @@ def combine_first(self, other: DataFrame) -> DataFrame: 2 NaN 3.0 1.0 """ - def combiner(x: Series, y: Series): - # GH#60128 The combiner is supposed to preserve EA Dtypes. - return y if y.name not in self.columns else y.where(x.isna(), x) + from pandas.core.reshape.concat import concat if len(other) == 0: combined = self.reindex( self.columns.append(other.columns.difference(self.columns)), axis=1 ) combined = combined.astype(other.dtypes) + return combined.__finalize__(self, method="combine_first") + + # GH#60128 Align the rows positionally and fill column-by-column without + # ever reindexing a column through a NA-incompatible cast. Taking values + # by integer position from the original arrays (rather than reindexing + # self to the union, which promotes int -> float) preserves integer + # values that float64 cannot represent exactly. + new_columns = self.columns.union(other.columns, sort=False) + left: npt.NDArray[np.intp] + right: npt.NDArray[np.intp] + if self.index.equals(other.index): + new_index = self.index + left = right = np.arange(len(new_index), dtype=np.intp) else: - combined = self.combine(other, combiner, overwrite=False) + # join(how="outer") matches the row order produced by align; the + # indexers map each result row to a position in self / other (-1 + # when absent) and handle duplicate labels positionally. + new_index, lidx, ridx = self.index.join( + other.index, how="outer", return_indexers=True + ) + left = np.arange(len(new_index), dtype=np.intp) if lidx is None else lidx + right = np.arange(len(new_index), dtype=np.intp) if ridx is None else ridx + + if len(new_columns) == 0 and len(new_index) == len(self.index): + # No data columns and no new rows (e.g. set_index left nothing to + # combine): match align's order-preserving short-circuit. + return self.copy().__finalize__(self, method="combine_first") + + def col_positions(columns) -> npt.NDArray[np.intp]: + # Position in `columns` of each result column, broadcasting the last + # duplicate when `columns` has fewer copies than `new_columns`. + groups: dict[Hashable, list[int]] = {} + for pos, label in enumerate(columns): + groups.setdefault(label, []).append(pos) + seen: dict[Hashable, int] = {} + out = np.empty(len(new_columns), dtype=np.intp) + for i, label in enumerate(new_columns): + positions = groups.get(label) + if positions is None: + out[i] = -1 + else: + nth = seen.get(label, 0) + out[i] = positions[min(nth, len(positions) - 1)] + seen[label] = nth + 1 + return out + + self_cols = col_positions(self.columns) + other_cols = col_positions(other.columns) + + def combine_column(self_col: Series, other_col: Series) -> Series: + # Take `other` only where `self` is NA; keep every other `self` row + # (including its own NA values, preserving their dtype). `concat` + # of the two disjoint pieces yields the common dtype without + # forcing a NA fill into either piece. + self_isna = self_col.isna().to_numpy() + self_na = left == -1 + present = ~self_na + self_na[present] = self_isna[left[present]] + take_other = self_na & (right != -1) + if not take_other.any(): + merged = self_col.take(left) + elif take_other.all(): + merged = other_col.take(right) + else: + take_self = ~take_other + merged = concat( + [self_col.take(left[take_self]), other_col.take(right[take_other])], + ignore_index=True, + ) + order = np.concatenate( + [np.flatnonzero(take_self), np.flatnonzero(take_other)] + ) + merged = merged.iloc[np.argsort(order, kind="stable")] + merged.index = new_index + return merged + + result = {} + for i in range(len(new_columns)): + self_pos = self_cols[i] + other_pos = other_cols[i] + if self_pos != -1 and other_pos != -1: + result[i] = combine_column( + self.iloc[:, self_pos], other.iloc[:, other_pos] + ) + elif self_pos != -1: + values = take_nd(self.iloc[:, self_pos]._values, left) + result[i] = Series(values, index=new_index) + else: + values = take_nd(other.iloc[:, other_pos]._values, right) + result[i] = Series(values, index=new_index) + + combined = self._constructor(result, index=new_index) + combined.columns = new_columns dtypes = { # Check for isinstance(..., (np.dtype, ExtensionDtype)) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index da4a240ed8691..ac8170478cc01 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -214,11 +214,14 @@ def test_combine_first_align_nan(self): assert res["b"].dtype == "int64" res = dfa.iloc[:0].combine_first(dfb) - exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + # GH#60128 "a" keeps self's datetime64 dtype and "b" stays int64 + # instead of being promoted through float + exp = DataFrame( + {"a": Series([pd.NaT, pd.NaT], dtype="datetime64[s]"), "b": [4, 5]}, + columns=["a", "b"], + ) tm.assert_frame_equal(res, exp) - # TODO: this must be datetime64 - assert res["a"].dtype == "float64" - # TODO: this must be int64 + assert res["a"].dtype == "datetime64[s]" assert res["b"].dtype == "int64" def test_combine_first_timezone(self, unit): @@ -404,18 +407,30 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype): @pytest.mark.parametrize( "wide_val, dtype", ( + (1666880195890293744, "uint64"), (1666880195890293744, "UInt64"), + (-1666880195890293744, "int64"), (-1666880195890293744, "Int64"), ), ) - def test_combine_first_preserve_EA_precision(self, wide_val, dtype): - # GH#60128 + def test_combine_first_preserve_precision(self, wide_val, dtype): + # GH#60128 values outside float64's exactly-representable range must + # not be promoted through float when filling unaligned rows df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype) df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype) result = df1.combine_first(df2) expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype) tm.assert_frame_equal(result, expected) + def test_combine_first_preserve_precision_non_unique_index(self): + # GH#60128 precision is preserved even with duplicate row labels + wide_val = 1666880195890293744 + df1 = DataFrame({"A": [wide_val, 5]}, index=[0, 0]) + df2 = DataFrame({"A": [6, 7]}, index=[1, 1]) + result = df1.combine_first(df2) + expected = DataFrame({"A": [wide_val, 5, 6, 7]}, index=[0, 0, 1, 1]) + tm.assert_frame_equal(result, expected) + def test_combine_first_non_unique_columns(self): # GH#29135 df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"]) @@ -428,6 +443,14 @@ def test_combine_first_non_unique_columns(self): ) tm.assert_frame_equal(result, expected) + def test_combine_first_duplicate_columns_new_column(self): + # GH#60128 self has duplicate columns and other introduces a new one + df1 = DataFrame([[1, 2, 3]], columns=["P", "Q", "Q"]) + df2 = DataFrame([[4, 5]], columns=["P", "R"]) + result = df1.combine_first(df2) + expected = DataFrame([[1, 2, 3, 5]], columns=["P", "Q", "Q", "R"]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "scalar1, scalar2", @@ -595,3 +618,18 @@ def test_combine_first_preserve_column_order(): result = df1.combine_first(df2) expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]}) tm.assert_frame_equal(result, expected) + + +def test_combine_first_preserve_precision_mixed_columns(): + # GH#60128 a wide-int column keeps its precision even alongside a column + # that does pass through float + wide_val = 1666880195890293744 + df1 = DataFrame({"a": [wide_val, 5], "b": [1.0, 2.0]}) + df2 = DataFrame({"a": [6, 7, 8]}) + + result = df1.combine_first(df2) + expected = DataFrame( + {"a": [wide_val, 5, 8], "b": [1.0, 2.0, np.nan]}, + ) + tm.assert_frame_equal(result, expected) + assert result["a"].dtype == "int64"