From 46dfaea35c5e0e75ae06979d5dcefec7d18c0a42 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Jun 2026 17:47:55 -0700 Subject: [PATCH 1/3] PERF: serialize tz-aware datetime to_json from datetime64, fix Series ISO "Z" dt64tz values are now written to JSON directly from the underlying UTC-localized datetime64 ndarray, threading a "this is UTC" flag through the encoder so the ISO output keeps its trailing "Z", instead of materializing an object array of Timestamps. This makes timezone-aware Index / index / column-label serialization much faster (~30-50x) and fixes Series.to_json(date_format="iso") dropping the "Z" marker (it had fallen through to a tz-naive path because a Series exposes tz only via .dt.tz, not a top-level .tz attribute). Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/source/whatsnew/v3.1.0.rst | 2 + .../pandas/datetime/date_conversions.h | 3 +- .../include/pandas/datetime/pd_datetime.h | 7 +- pandas/_libs/src/datetime/date_conversions.c | 11 ++- .../src/vendored/ujson/python/objToJSON.c | 91 ++++++++++++++----- pandas/tests/io/json/test_pandas.py | 24 +++++ 6 files changed, 105 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 4e7d7f11cac08..fe86b040ef4fd 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -175,6 +175,7 @@ Performance improvements - Performance improvement in :func:`read_sas` when decoding strings (:issue:`47339`) - Performance improvement in :func:`read_sql` with ADBC connections by requesting only table metadata when checking whether an input string names a table (:issue:`65652`) - Performance improvement in :func:`to_datetime` with the default ``cache=True`` for inputs that are already datetime-typed or use a ``unit`` (:issue:`65380`) +- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` when the index or columns is a timezone-aware :class:`DatetimeIndex` (:issue:`XXXXX`) - Performance improvement in :func:`tseries.frequencies.to_offset` parsing of frequency strings, especially for tick-resolution offsets (e.g. ``"h"``, ``"5min"``, ``"3s"``) and compound expressions (e.g. ``"1D1h"``) (:issue:`65395`) - Performance improvement in :func:`util.hash_pandas_object` for PyArrow-backed string and binary types by using PyArrow's ``dictionary_encode`` instead of converting to NumPy for factorization (:issue:`48964`) - Performance improvement in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` with user-defined functions (:issue:`46505`) @@ -357,6 +358,7 @@ I/O - Fixed segfault when instantiating the internal ``pandas._libs.parsers.TextReader`` with no arguments; it now raises ``TypeError`` (:issue:`53131`) - Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows`` when the requested row count is not a multiple of the chunk size (:issue:`64025`) +- Bug in :meth:`Series.to_json` with ``date_format="iso"`` where a timezone-aware datetime :class:`Series` was serialized without the trailing ``Z`` marker, losing the timezone information that is retained for an equivalent :class:`DatetimeIndex` or :class:`DataFrame` column (:issue:`XXXXX`) - :meth:`HDFStore.put` and :meth:`HDFStore.append` now support storing :class:`Series` and :class:`DataFrame` columns with :class:`PeriodDtype` in both ``"fixed"`` and ``"table"`` formats (:issue:`41978`) - Bug in :meth:`DataFrame.__repr__` raising ``TypeError`` for a column with a NumPy structured dtype (e.g. produced by :meth:`DataFrame.from_records` from a structured ``ndarray``) (:issue:`55011`) - Bug in :meth:`DataFrame.__repr__` where horizontally truncated output could exceed the terminal width by up to 4 characters (:issue:`32461`) diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h index 3a7f3f5090536..52d8c2741db23 100644 --- a/pandas/_libs/include/pandas/datetime/date_conversions.h +++ b/pandas/_libs/include/pandas/datetime/date_conversions.h @@ -18,9 +18,10 @@ int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit); // Converts an int64 object representing a date to ISO format // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" +// `utc` appends a trailing "Z" when nonzero (for UTC-localized values). // len is mutated to save the length of the returned string char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, size_t *len); + NPY_DATETIMEUNIT base, int utc, size_t *len); char *int64ToIsoDuration(int64_t value, NPY_DATETIMEUNIT valueUnit, size_t *len); diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index 89857c6a5b435..736f87a37c431 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -34,7 +34,8 @@ typedef struct { npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT, const npy_datetimestruct *); int (*scaleNanosecToUnit)(int64_t *, NPY_DATETIMEUNIT); - char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *); + char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, int, + size_t *); char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *); npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT); char *(*int64ToIsoDuration)(int64_t, NPY_DATETIMEUNIT, size_t *); @@ -73,8 +74,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL; (npy_datetimestruct)) # define scaleNanosecToUnit(value, unit) \ PandasDateTimeAPI->scaleNanosecToUnit((value), (unit)) -# define int64ToIso(value, valueUnit, base, len) \ - PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len)) +# define int64ToIso(value, valueUnit, base, utc, len) \ + PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (utc), (len)) # define NpyDateTimeToEpoch(dt, base) \ PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base)) # define PyDateTimeToIso(obj, base, len) \ diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c index 9f02277e69255..1737233b5b7fe 100644 --- a/pandas/_libs/src/datetime/date_conversions.c +++ b/pandas/_libs/src/datetime/date_conversions.c @@ -40,14 +40,18 @@ int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit) { return 0; } -/* Converts the int64_t representation of a datetime to ISO; mutates len */ +/* Converts the int64_t representation of a datetime to ISO; mutates len. + * ``utc`` controls whether the trailing "Z" is appended: pass 1 for values + * that are UTC-localized (e.g. the underlying ndarray of a dt64tz array), + * 0 for tz-naive datetime64. */ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, size_t *len) { + NPY_DATETIMEUNIT base, int utc, size_t *len) { npy_datetimestruct dts; int ret_code; pandas_datetime_to_datetimestruct(value, valueUnit, &dts); + // NB get_datetime_iso_8601_strlen(0, ...) already reserves room for "Z" *len = (size_t)get_datetime_iso_8601_strlen(0, base); char *result = PyObject_Malloc(*len); @@ -55,8 +59,7 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, PyErr_NoMemory(); return NULL; } - // datetime64 is always naive - ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); + ret_code = make_iso_8601_datetime(&dts, result, *len, utc, base); if (ret_code != 0) { PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index b089e3477e053..a52be81a6a247 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -76,6 +76,9 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; + // whether the values are UTC-localized datetime64 (dt64tz), so that the + // ISO output should carry a trailing "Z" + int is_utc; char **rowLabels; char **columnLabels; @@ -112,6 +115,9 @@ typedef struct __TypeContext { NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; + // whether newObj is a UTC-localized datetime64 ndarray (dt64tz values), + // propagated to the NpyArrContext so the ISO output keeps its "Z" + int ndarrayIsUTC; char **rowLabels; char **columnLabels; npy_intp rowLabelsLen; @@ -139,6 +145,13 @@ typedef struct __PyObjectEncoder { // (has to be set when calling NpyDateTimeToIsoCallback or // NpyTimeDeltaToIsoCallback) NPY_DATETIMEUNIT valueUnit; + // pass-through: whether the datetime64 value being encoded is UTC-localized + // (dt64tz), so NpyDateTimeToIsoCallback appends a trailing "Z" + int datetimeIsUTC; + // one-shot signal that the next bare ndarray encoded via the generic + // PyArray_Check path holds UTC-localized datetime64 values (used by the + // Series "split" data path, which routes get_values() through that path) + int pendingArrayIsUTC; // output format style for pandas data types int outputFormat; @@ -174,6 +187,7 @@ static TypeContext *createTypeContext(void) { pc->rowLabels = NULL; pc->columnLabels = NULL; pc->transpose = 0; + pc->ndarrayIsUTC = 0; pc->rowLabelsLen = 0; pc->columnLabelsLen = 0; @@ -184,20 +198,11 @@ static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; if (object_is_index_type(obj) || object_is_series_type(obj)) { - // The special cases to worry about are dt64tz and category[dt64tz]. - // In both cases we want the UTC-localized datetime64 ndarray, - // without going through and object array of Timestamps. - if (PyObject_HasAttrString(obj, "tz")) { - PyObject *tz = PyObject_GetAttrString(obj, "tz"); - if (tz != Py_None) { - // Go through object array if we have dt64tz, since tz info will - // be lost if values is used directly. - Py_DECREF(tz); - values = PyObject_CallMethod(obj, "__array__", NULL); - return values; - } - Py_DECREF(tz); - } + // For dt64tz, ``.values`` is the underlying UTC-localized datetime64 + // ndarray. We serialize that directly rather than an object array of + // Timestamps; callers flag dt64tz objects as UTC (see + // ``object_is_dt64tz``) so the ISO output keeps its trailing "Z". + // category[dt64tz] is handled below via ``_values_for_json``. values = PyObject_GetAttrString(obj, "values"); if (values == NULL) { // Clear so we can subsequently try another method @@ -237,6 +242,23 @@ static PyObject *get_values(PyObject *obj) { return values; } +// Returns 1 if obj has a DatetimeTZDtype (dt64tz), else 0. Works for Series, +// Index, and DatetimeArray, all of which expose the tz via ``obj.dtype.tz``. +// The underlying datetime64 values of such objects are UTC-localized, so the +// ISO serialization must append a trailing "Z". +static int object_is_dt64tz(PyObject *obj) { + PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); + if (dtype == NULL) { + PyErr_Clear(); + return 0; + } + // Only DatetimeTZDtype exposes a `tz` attribute; numpy datetime64 dtypes + // and CategoricalDtype do not. + const int is_dt64tz = PyObject_HasAttrString(dtype, "tz"); + Py_DECREF(dtype); + return is_dt64tz; +} + static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { PyObject *tmp = PyObject_GetAttrString(obj, attr); if (tmp == 0) { @@ -334,7 +356,9 @@ static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; - GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); + const int utc = ((PyObjectEncoder *)tc->encoder)->datetimeIsUTC; + GET_TC(tc)->cStr = + int64ToIso(GET_TC(tc)->longValue, valueUnit, base, utc, len); return GET_TC(tc)->cStr; } @@ -441,6 +465,7 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; npyarr->type_num = PyArray_DESCR(obj)->type_num; + npyarr->is_utc = GET_TC(tc)->ndarrayIsUTC; if (GET_TC(tc)->transpose) { npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); @@ -520,6 +545,8 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; + // and whether these UTC-localized values should serialize with a "Z" + ((PyObjectEncoder *)tc->encoder)->datetimeIsUTC = npyarr->is_utc; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { @@ -1092,6 +1119,9 @@ static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->itemValue) { return 0; } + // get_values returns the bare UTC datetime64 ndarray for dt64tz; flag it + // so the generic ndarray path keeps the trailing "Z". + ((PyObjectEncoder *)tc->encoder)->pendingArrayIsUTC = object_is_dt64tz(obj); } else { return 0; } @@ -1246,8 +1276,10 @@ static void NpyArr_freeLabels(char **labels, npy_intp len) { * which may need to be represented in various formats. */ static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, - npy_intp num) { + npy_intp num, int is_utc) { // NOTE this function steals a reference to labels. + // is_utc: whether numpy datetime64 labels are UTC-localized (dt64tz), so + // the ISO output should carry a trailing "Z". PyObject *item = NULL; const NPY_DATETIMEUNIT targetUnit = enc->datetimeUnit; @@ -1329,7 +1361,7 @@ static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, cLabel = int64ToIsoDuration(i8date, valueUnit, &len); } else { if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(i8date, valueUnit, targetUnit, &len); + cLabel = int64ToIso(i8date, valueUnit, targetUnit, is_utc, &len); } else { cLabel = PyDateTimeToIso(item, targetUnit, &len); } @@ -1639,6 +1671,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->newObj = get_values(obj); if (pc->newObj) { + pc->ndarrayIsUTC = object_is_dt64tz(obj); tc->type = JT_ARRAY; pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; @@ -1665,6 +1698,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!pc->newObj) { goto INVALID; } + pc->ndarrayIsUTC = object_is_dt64tz(obj); if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { tc->type = JT_OBJECT; @@ -1673,6 +1707,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } PyObject *values = get_values(tmpObj); + const int values_is_utc = object_is_dt64tz(tmpObj); Py_DECREF(tmpObj); if (!values) { goto INVALID; @@ -1685,8 +1720,8 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels( + (PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc); if (!pc->columnLabels) { goto INVALID; } @@ -1715,6 +1750,9 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } tc->type = JT_ARRAY; + // consume a one-shot UTC flag set by a caller (e.g. Series "split" data) + pc->ndarrayIsUTC = enc->pendingArrayIsUTC; + enc->pendingArrayIsUTC = 0; pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; pc->iterNext = NpyArr_iterNext; @@ -1766,9 +1804,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { Py_DECREF(tmpObj); goto INVALID; } + const int values_is_utc = object_is_dt64tz(tmpObj); pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels( + (PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc); Py_DECREF(tmpObj); if (!pc->columnLabels) { goto INVALID; @@ -1786,9 +1825,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { Py_DECREF(tmpObj); goto INVALID; } + int values_is_utc = object_is_dt64tz(tmpObj); pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->rowLabelsLen, values_is_utc); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -1805,9 +1845,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->rowLabels = NULL; goto INVALID; } + values_is_utc = object_is_dt64tz(tmpObj); pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels( + (PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc); Py_DECREF(tmpObj); if (!pc->columnLabels) { NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index fefd08fcad923..776b8ccec446e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1437,6 +1437,7 @@ def test_tz_is_naive(self): ) def test_tz_range_is_utc(self, tz_range): exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' + serexp = '{"0":"2013-01-01T05:00:00.000Z","1":"2013-01-02T05:00:00.000Z"}' dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000Z","1":"2013-01-02T05:00:00.000Z"}}' assert ujson_dumps(tz_range, iso_dates=True) == exp @@ -1445,11 +1446,34 @@ def test_tz_range_is_utc(self, tz_range): # in addition to the normal DTI case assert ujson_dumps(dti, iso_dates=True) == exp assert ujson_dumps(dti.astype(object), iso_dates=True) == exp + # Series[dt64tz] must preserve the tz like the DTI case; it + # previously fell through to a tz-naive path (no "Z" suffix) + assert ujson_dumps(Series(dti), iso_dates=True) == serexp df = DataFrame({"DT": dti}) result = ujson_dumps(df, iso_dates=True) assert result == dfexp assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) + @pytest.mark.parametrize( + "orient", ["split", "records", "index", "columns", "values"] + ) + def test_tz_aware_to_json_matches_object(self, orient): + # tz-aware datetime values and labels serialize identically to the + # equivalent object-dtype Timestamps for every orient, keeping the + # trailing "Z" -- the "split" data path in particular regressed when + # values were taken straight from the datetime64 ndarray + dti = date_range("2013-01-01 05:00:00", periods=2, tz="US/Eastern") + ser = Series(dti, index=dti) + expected = Series(dti.astype(object), index=dti.astype(object)) + assert ser.to_json(orient=orient, date_format="iso") == expected.to_json( + orient=orient, date_format="iso" + ) + df = DataFrame({"A": dti}, index=dti) + df_expected = DataFrame({"A": dti.astype(object)}, index=dti.astype(object)) + assert df.to_json(orient=orient, date_format="iso") == df_expected.to_json( + orient=orient, date_format="iso" + ) + def test_tz_range_is_naive(self): dti = date_range("2013-01-01 05:00:00", periods=2, unit="ns") From 52f52b555780e8026be73c25a667d96763d64876 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Jun 2026 17:49:29 -0700 Subject: [PATCH 2/3] DOC: fill in PR number for to_json tz whatsnew entries Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/source/whatsnew/v3.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index fe86b040ef4fd..ad6bcc3c23ab1 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -175,7 +175,6 @@ Performance improvements - Performance improvement in :func:`read_sas` when decoding strings (:issue:`47339`) - Performance improvement in :func:`read_sql` with ADBC connections by requesting only table metadata when checking whether an input string names a table (:issue:`65652`) - Performance improvement in :func:`to_datetime` with the default ``cache=True`` for inputs that are already datetime-typed or use a ``unit`` (:issue:`65380`) -- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` when the index or columns is a timezone-aware :class:`DatetimeIndex` (:issue:`XXXXX`) - Performance improvement in :func:`tseries.frequencies.to_offset` parsing of frequency strings, especially for tick-resolution offsets (e.g. ``"h"``, ``"5min"``, ``"3s"``) and compound expressions (e.g. ``"1D1h"``) (:issue:`65395`) - Performance improvement in :func:`util.hash_pandas_object` for PyArrow-backed string and binary types by using PyArrow's ``dictionary_encode`` instead of converting to NumPy for factorization (:issue:`48964`) - Performance improvement in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` with user-defined functions (:issue:`46505`) @@ -207,6 +206,7 @@ Performance improvements - Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`) - Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`) - Performance improvement in :meth:`NDFrame.__finalize__`, :meth:`Series.to_numpy`, :attr:`DataFrame.dtypes`, and :meth:`DataFrame.__getitem__` (:issue:`57431`) +- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` when the index or columns is a timezone-aware :class:`DatetimeIndex` (:issue:`66007`) - Performance improvement in :meth:`Timedelta.total_seconds` (:issue:`65388`) - Performance improvement in :meth:`arrays.SparseArray.isna` by avoiding a dense-then-resparsify round-trip (:issue:`41023`) - Performance improvement in datetime/timedelta unit conversion (e.g. ``datetime64[s]`` to ``datetime64[ns]``) (:issue:`35025`) @@ -358,12 +358,12 @@ I/O - Fixed segfault when instantiating the internal ``pandas._libs.parsers.TextReader`` with no arguments; it now raises ``TypeError`` (:issue:`53131`) - Fixed :func:`read_json` with ``lines=True`` and ``chunksize`` to respect ``nrows`` when the requested row count is not a multiple of the chunk size (:issue:`64025`) -- Bug in :meth:`Series.to_json` with ``date_format="iso"`` where a timezone-aware datetime :class:`Series` was serialized without the trailing ``Z`` marker, losing the timezone information that is retained for an equivalent :class:`DatetimeIndex` or :class:`DataFrame` column (:issue:`XXXXX`) - :meth:`HDFStore.put` and :meth:`HDFStore.append` now support storing :class:`Series` and :class:`DataFrame` columns with :class:`PeriodDtype` in both ``"fixed"`` and ``"table"`` formats (:issue:`41978`) - Bug in :meth:`DataFrame.__repr__` raising ``TypeError`` for a column with a NumPy structured dtype (e.g. produced by :meth:`DataFrame.from_records` from a structured ``ndarray``) (:issue:`55011`) - Bug in :meth:`DataFrame.__repr__` where horizontally truncated output could exceed the terminal width by up to 4 characters (:issue:`32461`) - Bug in :meth:`DataFrame.to_stata` raising ``KeyError`` when column names require renaming and ``convert_dates`` is specified for a different column (:issue:`60536`) - Bug in :meth:`DataFrame.to_string` where ``formatters`` dict was applied to wrong columns when output was horizontally truncated via ``max_cols`` (:issue:`35410`) +- Bug in :meth:`Series.to_json` with ``date_format="iso"`` where a timezone-aware datetime :class:`Series` was serialized without the trailing ``Z`` marker, losing the timezone information that is retained for an equivalent :class:`DatetimeIndex` or :class:`DataFrame` column (:issue:`66007`) - Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`) - :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`) - :func:`read_hdf` can again read fixed-format files written by very old pandas versions (``<=0.15.x``) that stored a ``freq`` attribute on non-datetimelike indexes, which previously failed with a ``TypeError`` or ``ValueError`` (:issue:`33186`) From 665f304d19f963777b376d6d692c199bd4c9783a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 26 Jun 2026 18:43:49 -0700 Subject: [PATCH 3/3] PERF: serialize dt64tz to_json from datetime64 ndarray Re-derive the tz-aware to_json fast path on top of main's get_values (GH#65744): for a dt64tz Series/Index, serialize directly from the underlying UTC datetime64 ndarray (DatetimeArray._ndarray) instead of boxing into an object array of Timestamps via _values_for_json. The existing UTC-flag plumbing keeps the trailing "Z". DataFrame data columns still go through _values_for_json (follow-up). Co-Authored-By: Claude Opus 4.8 (1M context) --- doc/source/whatsnew/v3.1.0.rst | 3 +- .../src/vendored/ujson/python/objToJSON.c | 55 +++++++++++++------ 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 6d378598de183..3f0cba9da7623 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -209,7 +209,7 @@ Performance improvements - Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`) - Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`) - Performance improvement in :meth:`NDFrame.__finalize__`, :meth:`Series.to_numpy`, :attr:`DataFrame.dtypes`, and :meth:`DataFrame.__getitem__` (:issue:`57431`) -- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` when the index or columns is a timezone-aware :class:`DatetimeIndex` (:issue:`66007`) +- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` with ``date_format="iso"`` for a timezone-aware datetime :class:`Series` and for a timezone-aware :class:`DatetimeIndex` (:issue:`66007`) - Performance improvement in :meth:`Timedelta.total_seconds` (:issue:`65388`) - Performance improvement in :meth:`arrays.SparseArray.isna` by avoiding a dense-then-resparsify round-trip (:issue:`41023`) - Performance improvement in datetime/timedelta unit conversion (e.g. ``datetime64[s]`` to ``datetime64[ns]``) (:issue:`35025`) @@ -374,7 +374,6 @@ I/O - Bug in :meth:`DataFrame.__repr__` where horizontally truncated output could exceed the terminal width by up to 4 characters (:issue:`32461`) - Bug in :meth:`DataFrame.to_stata` raising ``KeyError`` when column names require renaming and ``convert_dates`` is specified for a different column (:issue:`60536`) - Bug in :meth:`DataFrame.to_string` where ``formatters`` dict was applied to wrong columns when output was horizontally truncated via ``max_cols`` (:issue:`35410`) -- Bug in :meth:`Series.to_json` with ``date_format="iso"`` where a timezone-aware datetime :class:`Series` was serialized without the trailing ``Z`` marker, losing the timezone information that is retained for an equivalent :class:`DatetimeIndex` or :class:`DataFrame` column (:issue:`66007`) - Fixed :func:`read_json` with ``lines=True`` and ``nrows=0`` to return an empty DataFrame (:issue:`64025`) - :meth:`DataFrame.to_hdf` now raises a clear :class:`NotImplementedError` when writing a column or :class:`Index` of an unsupported extension dtype (such as :class:`IntervalDtype`, :class:`SparseDtype`, or the nullable integer/float/boolean dtypes), instead of a low-level ``AttributeError`` or PyTables ``TypeError`` (:issue:`26144`, :issue:`38305`, :issue:`42070`) - :func:`read_hdf` can again read fixed-format files written by very old pandas versions (``<=0.15.x``) that stored a ``freq`` attribute on non-datetimelike indexes, which previously failed with a ``TypeError`` or ``ValueError`` (:issue:`33186`) diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index df943c0e28981..7feff134e9ebc 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -194,6 +194,23 @@ static TypeContext *createTypeContext(void) { return pc; } +// Returns 1 if obj has a DatetimeTZDtype (dt64tz), else 0. Works for Series, +// Index, and DatetimeArray, all of which expose the tz via ``obj.dtype.tz``. +// The underlying datetime64 values of such objects are UTC-localized, so the +// ISO serialization must append a trailing "Z". +static int object_is_dt64tz(PyObject *obj) { + PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); + if (dtype == NULL) { + PyErr_Clear(); + return 0; + } + // Only DatetimeTZDtype exposes a `tz` attribute; numpy datetime64 dtypes + // and CategoricalDtype do not. + const int is_dt64tz = PyObject_HasAttrString(dtype, "tz"); + Py_DECREF(dtype); + return is_dt64tz; +} + static PyObject *get_values(PyObject *obj) { PyObject *typ = NULL; PyObject *arr = NULL; @@ -235,6 +252,27 @@ static PyObject *get_values(PyObject *obj) { return NULL; } + if (object_is_dt64tz(obj)) { + // Serialize tz-aware datetimes from the underlying UTC datetime64 ndarray + // rather than boxing into an object array of Timestamps (what + // _values_for_json does for dt64tz). Callers flag dt64tz objects as UTC + // (see object_is_dt64tz) so the ISO output keeps its trailing "Z". + // category[dt64tz] has no dtype.tz and falls through to _values_for_json. + values = PyObject_GetAttrString(arr, "_ndarray"); + Py_DECREF(arr); + if (values == NULL) { + PyErr_SetString(PyExc_ValueError, + "Error retrieving ._ndarray from DatetimeArray"); + return NULL; + } + if (!PyArray_CheckExact(values)) { + PyErr_Format(PyExc_ValueError, "._ndarray should be a numpy array"); + Py_DECREF(values); + return NULL; + } + return values; + } + values = PyObject_CallMethod(arr, "_values_for_json", NULL); if (values == NULL) { @@ -253,23 +291,6 @@ static PyObject *get_values(PyObject *obj) { return values; } -// Returns 1 if obj has a DatetimeTZDtype (dt64tz), else 0. Works for Series, -// Index, and DatetimeArray, all of which expose the tz via ``obj.dtype.tz``. -// The underlying datetime64 values of such objects are UTC-localized, so the -// ISO serialization must append a trailing "Z". -static int object_is_dt64tz(PyObject *obj) { - PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); - if (dtype == NULL) { - PyErr_Clear(); - return 0; - } - // Only DatetimeTZDtype exposes a `tz` attribute; numpy datetime64 dtypes - // and CategoricalDtype do not. - const int is_dt64tz = PyObject_HasAttrString(dtype, "tz"); - Py_DECREF(dtype); - return is_dt64tz; -} - static PyObject *get_sub_attr(PyObject *obj, char *attr, char *subAttr) { PyObject *tmp = PyObject_GetAttrString(obj, attr); if (tmp == 0) {