Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ Conversion
- Bug in :class:`DataFrame` constructor where ``NaT`` in a :class:`TimedeltaIndex` row was incorrectly inferred as ``datetime64`` instead of ``timedelta64`` (:issue:`23985`)
- Bug in :class:`DataFrame` constructor where constructing from a list of uniform-dtype arrays (e.g. pyarrow, :class:`CategoricalDtype`, nullable dtypes) lost the dtype (:issue:`49593`)
- Bug in :func:`pd.array` silently converting NaN to a nonsensical integer when given float data containing NaN and a NumPy integer dtype (:issue:`41724`)
- Bug in :meth:`DataFrame.combine_first` where NumPy integers with absolute value greater than ``2**53`` would lose precision after the operation (:issue:`60128`)
- Fixed :func:`pandas.array` to preserve mask information when converting NumPy masked arrays, converting masked values to missing values (:issue:`63879`)
- Fixed bug in :meth:`DataFrame.from_records` where ``exclude`` was ignored when ``data`` was an iterator and ``nrows=0`` (:issue:`63774`)

Expand Down
102 changes: 97 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,10 @@
)
from pandas.core.accessor import Accessor
from pandas.core.apply import reconstruct_and_relabel_result
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.array_algos.take import (
take_2d_multi,
take_nd,
)
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
BaseMaskedArray,
Expand Down Expand Up @@ -12364,17 +12367,106 @@ def combine_first(self, other: DataFrame) -> DataFrame:
2 NaN 3.0 1.0
"""

def combiner(x: Series, y: Series):
# GH#60128 The combiner is supposed to preserve EA Dtypes.
return y if y.name not in self.columns else y.where(x.isna(), x)
from pandas.core.reshape.concat import concat

if len(other) == 0:
combined = self.reindex(
self.columns.append(other.columns.difference(self.columns)), axis=1
)
combined = combined.astype(other.dtypes)
return combined.__finalize__(self, method="combine_first")

# GH#60128 Align the rows positionally and fill column-by-column without
# ever reindexing a column through a NA-incompatible cast. Taking values
# by integer position from the original arrays (rather than reindexing
# self to the union, which promotes int -> float) preserves integer
# values that float64 cannot represent exactly.
new_columns = self.columns.union(other.columns, sort=False)
left: npt.NDArray[np.intp]
right: npt.NDArray[np.intp]
if self.index.equals(other.index):
new_index = self.index
left = right = np.arange(len(new_index), dtype=np.intp)
else:
combined = self.combine(other, combiner, overwrite=False)
# join(how="outer") matches the row order produced by align; the
# indexers map each result row to a position in self / other (-1
# when absent) and handle duplicate labels positionally.
new_index, lidx, ridx = self.index.join(
other.index, how="outer", return_indexers=True
)
left = np.arange(len(new_index), dtype=np.intp) if lidx is None else lidx
right = np.arange(len(new_index), dtype=np.intp) if ridx is None else ridx

if len(new_columns) == 0 and len(new_index) == len(self.index):
# No data columns and no new rows (e.g. set_index left nothing to
# combine): match align's order-preserving short-circuit.
return self.copy().__finalize__(self, method="combine_first")

def col_positions(columns) -> npt.NDArray[np.intp]:
# Position in `columns` of each result column, broadcasting the last
# duplicate when `columns` has fewer copies than `new_columns`.
groups: dict[Hashable, list[int]] = {}
for pos, label in enumerate(columns):
groups.setdefault(label, []).append(pos)
seen: dict[Hashable, int] = {}
out = np.empty(len(new_columns), dtype=np.intp)
for i, label in enumerate(new_columns):
positions = groups.get(label)
if positions is None:
out[i] = -1
else:
nth = seen.get(label, 0)
out[i] = positions[min(nth, len(positions) - 1)]
seen[label] = nth + 1
return out

self_cols = col_positions(self.columns)
other_cols = col_positions(other.columns)

def combine_column(self_col: Series, other_col: Series) -> Series:
# Take `other` only where `self` is NA; keep every other `self` row
# (including its own NA values, preserving their dtype). `concat`
# of the two disjoint pieces yields the common dtype without
# forcing a NA fill into either piece.
self_isna = self_col.isna().to_numpy()
self_na = left == -1
present = ~self_na
self_na[present] = self_isna[left[present]]
take_other = self_na & (right != -1)
if not take_other.any():
merged = self_col.take(left)
elif take_other.all():
merged = other_col.take(right)
else:
take_self = ~take_other
merged = concat(
[self_col.take(left[take_self]), other_col.take(right[take_other])],
ignore_index=True,
)
order = np.concatenate(
[np.flatnonzero(take_self), np.flatnonzero(take_other)]
)
merged = merged.iloc[np.argsort(order, kind="stable")]
merged.index = new_index
return merged

result = {}
for i in range(len(new_columns)):
self_pos = self_cols[i]
other_pos = other_cols[i]
if self_pos != -1 and other_pos != -1:
result[i] = combine_column(
self.iloc[:, self_pos], other.iloc[:, other_pos]
)
elif self_pos != -1:
values = take_nd(self.iloc[:, self_pos]._values, left)
result[i] = Series(values, index=new_index)
else:
values = take_nd(other.iloc[:, other_pos]._values, right)
result[i] = Series(values, index=new_index)

combined = self._constructor(result, index=new_index)
combined.columns = new_columns

dtypes = {
# Check for isinstance(..., (np.dtype, ExtensionDtype))
Expand Down
50 changes: 44 additions & 6 deletions pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,14 @@ def test_combine_first_align_nan(self):
assert res["b"].dtype == "int64"

res = dfa.iloc[:0].combine_first(dfb)
exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"])
# GH#60128 "a" keeps self's datetime64 dtype and "b" stays int64
# instead of being promoted through float
exp = DataFrame(
{"a": Series([pd.NaT, pd.NaT], dtype="datetime64[s]"), "b": [4, 5]},
columns=["a", "b"],
)
tm.assert_frame_equal(res, exp)
# TODO: this must be datetime64
assert res["a"].dtype == "float64"
# TODO: this must be int64
assert res["a"].dtype == "datetime64[s]"
assert res["b"].dtype == "int64"

def test_combine_first_timezone(self, unit):
Expand Down Expand Up @@ -404,18 +407,30 @@ def test_combine_first_string_dtype_only_na(self, nullable_string_dtype):
@pytest.mark.parametrize(
"wide_val, dtype",
(
(1666880195890293744, "uint64"),
(1666880195890293744, "UInt64"),
(-1666880195890293744, "int64"),
(-1666880195890293744, "Int64"),
),
)
def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
# GH#60128
def test_combine_first_preserve_precision(self, wide_val, dtype):
# GH#60128 values outside float64's exactly-representable range must
# not be promoted through float when filling unaligned rows
df1 = DataFrame({"A": [wide_val, 5]}, dtype=dtype)
df2 = DataFrame({"A": [6, 7, wide_val]}, dtype=dtype)
result = df1.combine_first(df2)
expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_combine_first_preserve_precision_non_unique_index(self):
# GH#60128 precision is preserved even with duplicate row labels
wide_val = 1666880195890293744
df1 = DataFrame({"A": [wide_val, 5]}, index=[0, 0])
df2 = DataFrame({"A": [6, 7]}, index=[1, 1])
result = df1.combine_first(df2)
expected = DataFrame({"A": [wide_val, 5, 6, 7]}, index=[0, 0, 1, 1])
tm.assert_frame_equal(result, expected)

def test_combine_first_non_unique_columns(self):
# GH#29135
df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
Expand All @@ -428,6 +443,14 @@ def test_combine_first_non_unique_columns(self):
)
tm.assert_frame_equal(result, expected)

def test_combine_first_duplicate_columns_new_column(self):
# GH#60128 self has duplicate columns and other introduces a new one
df1 = DataFrame([[1, 2, 3]], columns=["P", "Q", "Q"])
df2 = DataFrame([[4, 5]], columns=["P", "R"])
result = df1.combine_first(df2)
expected = DataFrame([[1, 2, 3, 5]], columns=["P", "Q", "Q", "R"])
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"scalar1, scalar2",
Expand Down Expand Up @@ -595,3 +618,18 @@ def test_combine_first_preserve_column_order():
result = df1.combine_first(df2)
expected = DataFrame({"B": [1, 2, 3], "A": [4.0, 5.0, 6.0]})
tm.assert_frame_equal(result, expected)


def test_combine_first_preserve_precision_mixed_columns():
# GH#60128 a wide-int column keeps its precision even alongside a column
# that does pass through float
wide_val = 1666880195890293744
df1 = DataFrame({"a": [wide_val, 5], "b": [1.0, 2.0]})
df2 = DataFrame({"a": [6, 7, 8]})

result = df1.combine_first(df2)
expected = DataFrame(
{"a": [wide_val, 5, 8], "b": [1.0, 2.0, np.nan]},
)
tm.assert_frame_equal(result, expected)
assert result["a"].dtype == "int64"