diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 1f04f52c2fb4e..3f0cba9da7623 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -209,6 +209,7 @@ Performance improvements - Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`) - Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`) - Performance improvement in :meth:`NDFrame.__finalize__`, :meth:`Series.to_numpy`, :attr:`DataFrame.dtypes`, and :meth:`DataFrame.__getitem__` (:issue:`57431`) +- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` with ``date_format="iso"`` for a timezone-aware datetime :class:`Series` and for a timezone-aware :class:`DatetimeIndex` (:issue:`66007`) - Performance improvement in :meth:`Timedelta.total_seconds` (:issue:`65388`) - Performance improvement in :meth:`arrays.SparseArray.isna` by avoiding a dense-then-resparsify round-trip (:issue:`41023`) - Performance improvement in datetime/timedelta unit conversion (e.g. ``datetime64[s]`` to ``datetime64[ns]``) (:issue:`35025`) diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h index 3a7f3f5090536..52d8c2741db23 100644 --- a/pandas/_libs/include/pandas/datetime/date_conversions.h +++ b/pandas/_libs/include/pandas/datetime/date_conversions.h @@ -18,9 +18,10 @@ int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit); // Converts an int64 object representing a date to ISO format // up to precision `base` e.g. base="s" yields 2020-01-03T00:00:00Z // while base="ns" yields "2020-01-01T00:00:00.000000000Z" +// `utc` appends a trailing "Z" when nonzero (for UTC-localized values). // len is mutated to save the length of the returned string char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, size_t *len); + NPY_DATETIMEUNIT base, int utc, size_t *len); char *int64ToIsoDuration(int64_t value, NPY_DATETIMEUNIT valueUnit, size_t *len); diff --git a/pandas/_libs/include/pandas/datetime/pd_datetime.h b/pandas/_libs/include/pandas/datetime/pd_datetime.h index 89857c6a5b435..736f87a37c431 100644 --- a/pandas/_libs/include/pandas/datetime/pd_datetime.h +++ b/pandas/_libs/include/pandas/datetime/pd_datetime.h @@ -34,7 +34,8 @@ typedef struct { npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT, const npy_datetimestruct *); int (*scaleNanosecToUnit)(int64_t *, NPY_DATETIMEUNIT); - char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *); + char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, int, + size_t *); char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *); npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT); char *(*int64ToIsoDuration)(int64_t, NPY_DATETIMEUNIT, size_t *); @@ -73,8 +74,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL; (npy_datetimestruct)) # define scaleNanosecToUnit(value, unit) \ PandasDateTimeAPI->scaleNanosecToUnit((value), (unit)) -# define int64ToIso(value, valueUnit, base, len) \ - PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len)) +# define int64ToIso(value, valueUnit, base, utc, len) \ + PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (utc), (len)) # define NpyDateTimeToEpoch(dt, base) \ PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base)) # define PyDateTimeToIso(obj, base, len) \ diff --git a/pandas/_libs/src/datetime/date_conversions.c b/pandas/_libs/src/datetime/date_conversions.c index 9f02277e69255..1737233b5b7fe 100644 --- a/pandas/_libs/src/datetime/date_conversions.c +++ b/pandas/_libs/src/datetime/date_conversions.c @@ -40,14 +40,18 @@ int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit) { return 0; } -/* Converts the int64_t representation of a datetime to ISO; mutates len */ +/* Converts the int64_t representation of a datetime to ISO; mutates len. + * ``utc`` controls whether the trailing "Z" is appended: pass 1 for values + * that are UTC-localized (e.g. the underlying ndarray of a dt64tz array), + * 0 for tz-naive datetime64. */ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, - NPY_DATETIMEUNIT base, size_t *len) { + NPY_DATETIMEUNIT base, int utc, size_t *len) { npy_datetimestruct dts; int ret_code; pandas_datetime_to_datetimestruct(value, valueUnit, &dts); + // NB get_datetime_iso_8601_strlen(0, ...) already reserves room for "Z" *len = (size_t)get_datetime_iso_8601_strlen(0, base); char *result = PyObject_Malloc(*len); @@ -55,8 +59,7 @@ char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit, PyErr_NoMemory(); return NULL; } - // datetime64 is always naive - ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base); + ret_code = make_iso_8601_datetime(&dts, result, *len, utc, base); if (ret_code != 0) { PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); diff --git a/pandas/_libs/src/vendored/ujson/python/objToJSON.c b/pandas/_libs/src/vendored/ujson/python/objToJSON.c index d72464bf5fff5..7feff134e9ebc 100644 --- a/pandas/_libs/src/vendored/ujson/python/objToJSON.c +++ b/pandas/_libs/src/vendored/ujson/python/objToJSON.c @@ -76,6 +76,9 @@ typedef struct __NpyArrContext { npy_intp ndim; npy_intp index[NPY_MAXDIMS]; int type_num; + // whether the values are UTC-localized datetime64 (dt64tz), so that the + // ISO output should carry a trailing "Z" + int is_utc; char **rowLabels; char **columnLabels; @@ -112,6 +115,9 @@ typedef struct __TypeContext { NpyArrContext *npyarr; PdBlockContext *pdblock; int transpose; + // whether newObj is a UTC-localized datetime64 ndarray (dt64tz values), + // propagated to the NpyArrContext so the ISO output keeps its "Z" + int ndarrayIsUTC; char **rowLabels; char **columnLabels; npy_intp rowLabelsLen; @@ -139,6 +145,13 @@ typedef struct __PyObjectEncoder { // (has to be set when calling NpyDateTimeToIsoCallback or // NpyTimeDeltaToIsoCallback) NPY_DATETIMEUNIT valueUnit; + // pass-through: whether the datetime64 value being encoded is UTC-localized + // (dt64tz), so NpyDateTimeToIsoCallback appends a trailing "Z" + int datetimeIsUTC; + // one-shot signal that the next bare ndarray encoded via the generic + // PyArray_Check path holds UTC-localized datetime64 values (used by the + // Series "split" data path, which routes get_values() through that path) + int pendingArrayIsUTC; // output format style for pandas data types int outputFormat; @@ -174,12 +187,30 @@ static TypeContext *createTypeContext(void) { pc->rowLabels = NULL; pc->columnLabels = NULL; pc->transpose = 0; + pc->ndarrayIsUTC = 0; pc->rowLabelsLen = 0; pc->columnLabelsLen = 0; return pc; } +// Returns 1 if obj has a DatetimeTZDtype (dt64tz), else 0. Works for Series, +// Index, and DatetimeArray, all of which expose the tz via ``obj.dtype.tz``. +// The underlying datetime64 values of such objects are UTC-localized, so the +// ISO serialization must append a trailing "Z". +static int object_is_dt64tz(PyObject *obj) { + PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); + if (dtype == NULL) { + PyErr_Clear(); + return 0; + } + // Only DatetimeTZDtype exposes a `tz` attribute; numpy datetime64 dtypes + // and CategoricalDtype do not. + const int is_dt64tz = PyObject_HasAttrString(dtype, "tz"); + Py_DECREF(dtype); + return is_dt64tz; +} + static PyObject *get_values(PyObject *obj) { PyObject *typ = NULL; PyObject *arr = NULL; @@ -221,6 +252,27 @@ static PyObject *get_values(PyObject *obj) { return NULL; } + if (object_is_dt64tz(obj)) { + // Serialize tz-aware datetimes from the underlying UTC datetime64 ndarray + // rather than boxing into an object array of Timestamps (what + // _values_for_json does for dt64tz). Callers flag dt64tz objects as UTC + // (see object_is_dt64tz) so the ISO output keeps its trailing "Z". + // category[dt64tz] has no dtype.tz and falls through to _values_for_json. + values = PyObject_GetAttrString(arr, "_ndarray"); + Py_DECREF(arr); + if (values == NULL) { + PyErr_SetString(PyExc_ValueError, + "Error retrieving ._ndarray from DatetimeArray"); + return NULL; + } + if (!PyArray_CheckExact(values)) { + PyErr_Format(PyExc_ValueError, "._ndarray should be a numpy array"); + Py_DECREF(values); + return NULL; + } + return values; + } + values = PyObject_CallMethod(arr, "_values_for_json", NULL); if (values == NULL) { @@ -336,7 +388,9 @@ static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit; - GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len); + const int utc = ((PyObjectEncoder *)tc->encoder)->datetimeIsUTC; + GET_TC(tc)->cStr = + int64ToIso(GET_TC(tc)->longValue, valueUnit, base, utc, len); return GET_TC(tc)->cStr; } @@ -443,6 +497,7 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; npyarr->type_num = PyArray_DESCR(obj)->type_num; + npyarr->is_utc = GET_TC(tc)->ndarrayIsUTC; if (GET_TC(tc)->transpose) { npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim); @@ -522,6 +577,8 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { PyArray_Descr *dtype = PyArray_DESCR(arrayobj); ((PyObjectEncoder *)tc->encoder)->valueUnit = get_datetime_metadata_from_dtype(dtype).base; + // and whether these UTC-localized values should serialize with a "Z" + ((PyObjectEncoder *)tc->encoder)->datetimeIsUTC = npyarr->is_utc; ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { @@ -1094,6 +1151,9 @@ static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { if (!GET_TC(tc)->itemValue) { return 0; } + // get_values returns the bare UTC datetime64 ndarray for dt64tz; flag it + // so the generic ndarray path keeps the trailing "Z". + ((PyObjectEncoder *)tc->encoder)->pendingArrayIsUTC = object_is_dt64tz(obj); } else { return 0; } @@ -1248,8 +1308,10 @@ static void NpyArr_freeLabels(char **labels, npy_intp len) { * which may need to be represented in various formats. */ static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, - npy_intp num) { + npy_intp num, int is_utc) { // NOTE this function steals a reference to labels. + // is_utc: whether numpy datetime64 labels are UTC-localized (dt64tz), so + // the ISO output should carry a trailing "Z". PyObject *item = NULL; const NPY_DATETIMEUNIT targetUnit = enc->datetimeUnit; @@ -1331,7 +1393,7 @@ static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, cLabel = int64ToIsoDuration(i8date, valueUnit, &len); } else { if (type_num == NPY_DATETIME) { - cLabel = int64ToIso(i8date, valueUnit, targetUnit, &len); + cLabel = int64ToIso(i8date, valueUnit, targetUnit, is_utc, &len); } else { cLabel = PyDateTimeToIso(item, targetUnit, &len); } @@ -1641,6 +1703,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->newObj = get_values(obj); if (pc->newObj) { + pc->ndarrayIsUTC = object_is_dt64tz(obj); tc->type = JT_ARRAY; pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; @@ -1667,6 +1730,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (!pc->newObj) { goto INVALID; } + pc->ndarrayIsUTC = object_is_dt64tz(obj); if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { tc->type = JT_OBJECT; @@ -1675,6 +1739,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } PyObject *values = get_values(tmpObj); + const int values_is_utc = object_is_dt64tz(tmpObj); Py_DECREF(tmpObj); if (!values) { goto INVALID; @@ -1687,8 +1752,8 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj; pc->columnLabelsLen = PyArray_DIM(arrayobj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels( + (PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc); if (!pc->columnLabels) { goto INVALID; } @@ -1717,6 +1782,9 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } tc->type = JT_ARRAY; + // consume a one-shot UTC flag set by a caller (e.g. Series "split" data) + pc->ndarrayIsUTC = enc->pendingArrayIsUTC; + enc->pendingArrayIsUTC = 0; pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; pc->iterNext = NpyArr_iterNext; @@ -1768,9 +1836,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { Py_DECREF(tmpObj); goto INVALID; } + const int values_is_utc = object_is_dt64tz(tmpObj); pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels( + (PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc); Py_DECREF(tmpObj); if (!pc->columnLabels) { goto INVALID; @@ -1788,9 +1857,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { Py_DECREF(tmpObj); goto INVALID; } + int values_is_utc = object_is_dt64tz(tmpObj); pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->rowLabelsLen, values_is_utc); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") @@ -1807,9 +1877,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->rowLabels = NULL; goto INVALID; } + values_is_utc = object_is_dt64tz(tmpObj); pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, - pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels( + (PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc); Py_DECREF(tmpObj); if (!pc->columnLabels) { NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e7f73bd40ff91..dfeb5401bf4db 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1477,23 +1477,6 @@ def test_tz_range_is_utc(self, tz_range): assert result == dfexp assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp - def test_tz_range_is_naive(self): - dti = date_range("2013-01-01 05:00:00", periods=2, unit="ns") - - exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' - serexp = '{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}' - dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}' - - # Ensure datetimes in object array are serialized correctly - # in addition to the normal DTI case - assert ujson_dumps(dti, iso_dates=True) == exp - assert ujson_dumps(dti.astype(object), iso_dates=True) == exp - assert ujson_dumps(Series(dti), iso_dates=True) == serexp - df = DataFrame({"DT": dti}) - result = ujson_dumps(df, iso_dates=True) - assert result == dfexp - assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp - @pytest.mark.parametrize( "orient", ["split", "records", "index", "columns", "values"] ) @@ -1514,6 +1497,23 @@ def test_tz_aware_to_json_matches_object(self, orient): orient=orient, date_format="iso" ) + def test_tz_range_is_naive(self): + dti = date_range("2013-01-01 05:00:00", periods=2, unit="ns") + + exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]' + serexp = '{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}' + dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}' + + # Ensure datetimes in object array are serialized correctly + # in addition to the normal DTI case + assert ujson_dumps(dti, iso_dates=True) == exp + assert ujson_dumps(dti.astype(object), iso_dates=True) == exp + assert ujson_dumps(Series(dti), iso_dates=True) == serexp + df = DataFrame({"DT": dti}) + result = ujson_dumps(df, iso_dates=True) + assert result == dfexp + assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp + def test_read_inline_jsonl(self): # GH9180