Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ Performance improvements
- Performance improvement in :meth:`Index.join` and :meth:`Index.union` for :class:`RangeIndex` by avoiding unnecessary memory allocation in the libjoin fastpath (:issue:`54646`)
- Performance improvement in :meth:`IntervalIndex.get_indexer` for monotonic non-overlapping indexes, which now uses binary search instead of the interval tree (:issue:`47614`)
- Performance improvement in :meth:`NDFrame.__finalize__`, :meth:`Series.to_numpy`, :attr:`DataFrame.dtypes`, and :meth:`DataFrame.__getitem__` (:issue:`57431`)
- Performance improvement in :meth:`Series.to_json` and :meth:`DataFrame.to_json` with ``date_format="iso"`` for a timezone-aware datetime :class:`Series` and for a timezone-aware :class:`DatetimeIndex` (:issue:`66007`)
- Performance improvement in :meth:`Timedelta.total_seconds` (:issue:`65388`)
- Performance improvement in :meth:`arrays.SparseArray.isna` by avoiding a dense-then-resparsify round-trip (:issue:`41023`)
- Performance improvement in datetime/timedelta unit conversion (e.g. ``datetime64[s]`` to ``datetime64[ns]``) (:issue:`35025`)
Expand Down
3 changes: 2 additions & 1 deletion pandas/_libs/include/pandas/datetime/date_conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@ int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit);
// Converts an int64 object representing a date to ISO format
// up to precision `base`, e.g. base="s" yields "2020-01-01T00:00:00Z"
// while base="ns" yields "2020-01-01T00:00:00.000000000Z"
// `utc` appends a trailing "Z" when nonzero (for UTC-localized values).
// len is mutated to save the length of the returned string
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit,
NPY_DATETIMEUNIT base, size_t *len);
NPY_DATETIMEUNIT base, int utc, size_t *len);

char *int64ToIsoDuration(int64_t value, NPY_DATETIMEUNIT valueUnit,
size_t *len);
7 changes: 4 additions & 3 deletions pandas/_libs/include/pandas/datetime/pd_datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ typedef struct {
npy_datetime (*npy_datetimestruct_to_datetime)(NPY_DATETIMEUNIT,
const npy_datetimestruct *);
int (*scaleNanosecToUnit)(int64_t *, NPY_DATETIMEUNIT);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, size_t *);
char *(*int64ToIso)(int64_t, NPY_DATETIMEUNIT, NPY_DATETIMEUNIT, int,
size_t *);
char *(*PyDateTimeToIso)(PyObject *, NPY_DATETIMEUNIT, size_t *);
npy_datetime (*PyDateTimeToEpoch)(PyObject *, NPY_DATETIMEUNIT);
char *(*int64ToIsoDuration)(int64_t, NPY_DATETIMEUNIT, size_t *);
Expand Down Expand Up @@ -73,8 +74,8 @@ static PandasDateTime_CAPI *PandasDateTimeAPI = NULL;
(npy_datetimestruct))
# define scaleNanosecToUnit(value, unit) \
PandasDateTimeAPI->scaleNanosecToUnit((value), (unit))
# define int64ToIso(value, valueUnit, base, len) \
PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (len))
# define int64ToIso(value, valueUnit, base, utc, len) \
PandasDateTimeAPI->int64ToIso((value), (valueUnit), (base), (utc), (len))
# define NpyDateTimeToEpoch(dt, base) \
PandasDateTimeAPI->NpyDateTimeToEpoch((dt), (base))
# define PyDateTimeToIso(obj, base, len) \
Expand Down
11 changes: 7 additions & 4 deletions pandas/_libs/src/datetime/date_conversions.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,23 +40,26 @@ int scaleNanosecToUnit(int64_t *value, NPY_DATETIMEUNIT unit) {
return 0;
}

/* Converts the int64_t representation of a datetime to ISO; mutates len */
/* Converts the int64_t representation of a datetime to ISO; mutates len.
* ``utc`` controls whether the trailing "Z" is appended: pass 1 for values
* that are UTC-localized (e.g. the underlying ndarray of a dt64tz array),
* 0 for tz-naive datetime64. */
char *int64ToIso(int64_t value, NPY_DATETIMEUNIT valueUnit,
NPY_DATETIMEUNIT base, size_t *len) {
NPY_DATETIMEUNIT base, int utc, size_t *len) {
npy_datetimestruct dts;
int ret_code;

pandas_datetime_to_datetimestruct(value, valueUnit, &dts);

// NB get_datetime_iso_8601_strlen(0, ...) already reserves room for "Z"
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
char *result = PyObject_Malloc(*len);

if (result == NULL) {
PyErr_NoMemory();
return NULL;
}
// datetime64 is always naive
ret_code = make_iso_8601_datetime(&dts, result, *len, 0, base);
ret_code = make_iso_8601_datetime(&dts, result, *len, utc, base);
if (ret_code != 0) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
Expand Down
93 changes: 82 additions & 11 deletions pandas/_libs/src/vendored/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ typedef struct __NpyArrContext {
npy_intp ndim;
npy_intp index[NPY_MAXDIMS];
int type_num;
// whether the values are UTC-localized datetime64 (dt64tz), so that the
// ISO output should carry a trailing "Z"
int is_utc;

char **rowLabels;
char **columnLabels;
Expand Down Expand Up @@ -112,6 +115,9 @@ typedef struct __TypeContext {
NpyArrContext *npyarr;
PdBlockContext *pdblock;
int transpose;
// whether newObj is a UTC-localized datetime64 ndarray (dt64tz values),
// propagated to the NpyArrContext so the ISO output keeps its "Z"
int ndarrayIsUTC;
char **rowLabels;
char **columnLabels;
npy_intp rowLabelsLen;
Expand Down Expand Up @@ -139,6 +145,13 @@ typedef struct __PyObjectEncoder {
// (has to be set when calling NpyDateTimeToIsoCallback or
// NpyTimeDeltaToIsoCallback)
NPY_DATETIMEUNIT valueUnit;
// pass-through: whether the datetime64 value being encoded is UTC-localized
// (dt64tz), so NpyDateTimeToIsoCallback appends a trailing "Z"
int datetimeIsUTC;
// one-shot signal that the next bare ndarray encoded via the generic
// PyArray_Check path holds UTC-localized datetime64 values (used by the
// Series "split" data path, which routes get_values() through that path)
int pendingArrayIsUTC;

// output format style for pandas data types
int outputFormat;
Expand Down Expand Up @@ -174,12 +187,30 @@ static TypeContext *createTypeContext(void) {
pc->rowLabels = NULL;
pc->columnLabels = NULL;
pc->transpose = 0;
pc->ndarrayIsUTC = 0;
pc->rowLabelsLen = 0;
pc->columnLabelsLen = 0;

return pc;
}

// Duck-typed test for a DatetimeTZDtype (dt64tz) object.
//
// Looks up ``obj.dtype`` and reports whether that dtype has a ``tz``
// attribute: returns 1 if it does, 0 otherwise. If ``obj`` has no usable
// ``dtype`` attribute, the error raised by the lookup is cleared and 0 is
// returned, so this function never leaves a Python exception set for that
// case.
//
// The check relies on DatetimeTZDtype being the only dtype that exposes
// ``tz`` (numpy datetime64 dtypes and CategoricalDtype do not). It is meant
// for Series, Index, and DatetimeArray, whose underlying datetime64 values
// are UTC-localized when the dtype is dt64tz, which is why ISO serialization
// of those values must append a trailing "Z".
static int object_is_dt64tz(PyObject *obj) {
  PyObject *dtype = PyObject_GetAttrString(obj, "dtype");
  if (dtype != NULL) {
    // New reference from the attribute lookup; release it once inspected.
    const int has_tz = PyObject_HasAttrString(dtype, "tz");
    Py_DECREF(dtype);
    return has_tz;
  }

  // No ``dtype`` attribute: swallow the pending error and report "not tz".
  PyErr_Clear();
  return 0;
}

static PyObject *get_values(PyObject *obj) {
PyObject *typ = NULL;
PyObject *arr = NULL;
Expand Down Expand Up @@ -221,6 +252,27 @@ static PyObject *get_values(PyObject *obj) {
return NULL;
}

if (object_is_dt64tz(obj)) {
// Serialize tz-aware datetimes from the underlying UTC datetime64 ndarray
// rather than boxing into an object array of Timestamps (what
// _values_for_json does for dt64tz). Callers flag dt64tz objects as UTC
// (see object_is_dt64tz) so the ISO output keeps its trailing "Z".
// category[dt64tz] has no dtype.tz and falls through to _values_for_json.
values = PyObject_GetAttrString(arr, "_ndarray");
Py_DECREF(arr);
if (values == NULL) {
PyErr_SetString(PyExc_ValueError,
"Error retrieving ._ndarray from DatetimeArray");
return NULL;
}
if (!PyArray_CheckExact(values)) {
PyErr_Format(PyExc_ValueError, "._ndarray should be a numpy array");
Py_DECREF(values);
return NULL;
}
return values;
}

values = PyObject_CallMethod(arr, "_values_for_json", NULL);

if (values == NULL) {
Expand Down Expand Up @@ -336,7 +388,9 @@ static const char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
NPY_DATETIMEUNIT valueUnit = ((PyObjectEncoder *)tc->encoder)->valueUnit;
GET_TC(tc)->cStr = int64ToIso(GET_TC(tc)->longValue, valueUnit, base, len);
const int utc = ((PyObjectEncoder *)tc->encoder)->datetimeIsUTC;
GET_TC(tc)->cStr =
int64ToIso(GET_TC(tc)->longValue, valueUnit, base, utc, len);
return GET_TC(tc)->cStr;
}

Expand Down Expand Up @@ -443,6 +497,7 @@ static void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) {
npyarr->ndim = PyArray_NDIM(obj) - 1;
npyarr->curdim = 0;
npyarr->type_num = PyArray_DESCR(obj)->type_num;
npyarr->is_utc = GET_TC(tc)->ndarrayIsUTC;

if (GET_TC(tc)->transpose) {
npyarr->dim = PyArray_DIM(obj, (int)npyarr->ndim);
Expand Down Expand Up @@ -522,6 +577,8 @@ static int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) {
PyArray_Descr *dtype = PyArray_DESCR(arrayobj);
((PyObjectEncoder *)tc->encoder)->valueUnit =
get_datetime_metadata_from_dtype(dtype).base;
// and whether these UTC-localized values should serialize with a "Z"
((PyObjectEncoder *)tc->encoder)->datetimeIsUTC = npyarr->is_utc;
((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr;
((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr;
} else {
Expand Down Expand Up @@ -1094,6 +1151,9 @@ static int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) {
if (!GET_TC(tc)->itemValue) {
return 0;
}
// get_values returns the bare UTC datetime64 ndarray for dt64tz; flag it
// so the generic ndarray path keeps the trailing "Z".
((PyObjectEncoder *)tc->encoder)->pendingArrayIsUTC = object_is_dt64tz(obj);
} else {
return 0;
}
Expand Down Expand Up @@ -1248,8 +1308,10 @@ static void NpyArr_freeLabels(char **labels, npy_intp len) {
* which may need to be represented in various formats.
*/
static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
npy_intp num) {
npy_intp num, int is_utc) {
// NOTE this function steals a reference to labels.
// is_utc: whether numpy datetime64 labels are UTC-localized (dt64tz), so
// the ISO output should carry a trailing "Z".
PyObject *item = NULL;
const NPY_DATETIMEUNIT targetUnit = enc->datetimeUnit;

Expand Down Expand Up @@ -1331,7 +1393,7 @@ static char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
cLabel = int64ToIsoDuration(i8date, valueUnit, &len);
} else {
if (type_num == NPY_DATETIME) {
cLabel = int64ToIso(i8date, valueUnit, targetUnit, &len);
cLabel = int64ToIso(i8date, valueUnit, targetUnit, is_utc, &len);
} else {
cLabel = PyDateTimeToIso(item, targetUnit, &len);
}
Expand Down Expand Up @@ -1641,6 +1703,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {

pc->newObj = get_values(obj);
if (pc->newObj) {
pc->ndarrayIsUTC = object_is_dt64tz(obj);
tc->type = JT_ARRAY;
pc->iterBegin = NpyArr_iterBegin;
pc->iterEnd = NpyArr_iterEnd;
Expand All @@ -1667,6 +1730,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
if (!pc->newObj) {
goto INVALID;
}
pc->ndarrayIsUTC = object_is_dt64tz(obj);

if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) {
tc->type = JT_OBJECT;
Expand All @@ -1675,6 +1739,7 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
goto INVALID;
}
PyObject *values = get_values(tmpObj);
const int values_is_utc = object_is_dt64tz(tmpObj);
Py_DECREF(tmpObj);
if (!values) {
goto INVALID;
Expand All @@ -1687,8 +1752,8 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
}
const PyArrayObject *arrayobj = (const PyArrayObject *)pc->newObj;
pc->columnLabelsLen = PyArray_DIM(arrayobj, 0);
pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
pc->columnLabelsLen);
pc->columnLabels = NpyArr_encodeLabels(
(PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc);
if (!pc->columnLabels) {
goto INVALID;
}
Expand Down Expand Up @@ -1717,6 +1782,9 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
}

tc->type = JT_ARRAY;
// consume a one-shot UTC flag set by a caller (e.g. Series "split" data)
pc->ndarrayIsUTC = enc->pendingArrayIsUTC;
enc->pendingArrayIsUTC = 0;
pc->iterBegin = NpyArr_iterBegin;
pc->iterEnd = NpyArr_iterEnd;
pc->iterNext = NpyArr_iterNext;
Expand Down Expand Up @@ -1768,9 +1836,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
Py_DECREF(tmpObj);
goto INVALID;
}
const int values_is_utc = object_is_dt64tz(tmpObj);
pc->columnLabelsLen = PyObject_Size(tmpObj);
pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
pc->columnLabelsLen);
pc->columnLabels = NpyArr_encodeLabels(
(PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc);
Py_DECREF(tmpObj);
if (!pc->columnLabels) {
goto INVALID;
Expand All @@ -1788,9 +1857,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
Py_DECREF(tmpObj);
goto INVALID;
}
int values_is_utc = object_is_dt64tz(tmpObj);
pc->rowLabelsLen = PyObject_Size(tmpObj);
pc->rowLabels =
NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->rowLabelsLen);
pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
pc->rowLabelsLen, values_is_utc);
Py_DECREF(tmpObj);
tmpObj =
(enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns")
Expand All @@ -1807,9 +1877,10 @@ static void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
pc->rowLabels = NULL;
goto INVALID;
}
values_is_utc = object_is_dt64tz(tmpObj);
pc->columnLabelsLen = PyObject_Size(tmpObj);
pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc,
pc->columnLabelsLen);
pc->columnLabels = NpyArr_encodeLabels(
(PyArrayObject *)values, enc, pc->columnLabelsLen, values_is_utc);
Py_DECREF(tmpObj);
if (!pc->columnLabels) {
NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen);
Expand Down
34 changes: 17 additions & 17 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1477,23 +1477,6 @@ def test_tz_range_is_utc(self, tz_range):
assert result == dfexp
assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp

def test_tz_range_is_naive(self):
# Checks that tz-naive datetimes are dumped as ISO strings with
# millisecond precision and no trailing "Z" when iso_dates=True.
# Two-element DatetimeIndex built without a tz, stored with unit "ns".
dti = date_range("2013-01-01 05:00:00", periods=2, unit="ns")

# Expected JSON for the bare index, for a Series, and for a one-column
# DataFrame whose column is named "DT".
exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
serexp = '{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}'
dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'

# Ensure datetimes in object array are serialized correctly
# in addition to the normal DTI case
assert ujson_dumps(dti, iso_dates=True) == exp
assert ujson_dumps(dti.astype(object), iso_dates=True) == exp
assert ujson_dumps(Series(dti), iso_dates=True) == serexp
df = DataFrame({"DT": dti})
result = ujson_dumps(df, iso_dates=True)
assert result == dfexp
# The object-dtype column must produce the same output as the
# datetime64 column.
assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp

@pytest.mark.parametrize(
"orient", ["split", "records", "index", "columns", "values"]
)
Expand All @@ -1514,6 +1497,23 @@ def test_tz_aware_to_json_matches_object(self, orient):
orient=orient, date_format="iso"
)

def test_tz_range_is_naive(self):
# Checks that tz-naive datetimes are dumped as ISO strings with
# millisecond precision and no trailing "Z" when iso_dates=True.
# Two-element DatetimeIndex built without a tz, stored with unit "ns".
dti = date_range("2013-01-01 05:00:00", periods=2, unit="ns")

# Expected JSON for the bare index, for a Series, and for a one-column
# DataFrame whose column is named "DT".
exp = '["2013-01-01T05:00:00.000","2013-01-02T05:00:00.000"]'
serexp = '{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}'
dfexp = '{"DT":{"0":"2013-01-01T05:00:00.000","1":"2013-01-02T05:00:00.000"}}'

# Ensure datetimes in object array are serialized correctly
# in addition to the normal DTI case
assert ujson_dumps(dti, iso_dates=True) == exp
assert ujson_dumps(dti.astype(object), iso_dates=True) == exp
assert ujson_dumps(Series(dti), iso_dates=True) == serexp
df = DataFrame({"DT": dti})
result = ujson_dumps(df, iso_dates=True)
assert result == dfexp
# The object-dtype column must produce the same output as the
# datetime64 column.
assert ujson_dumps(df.astype({"DT": object}), iso_dates=True) == dfexp

def test_read_inline_jsonl(self):
# GH9180

Expand Down