Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
BUG: Series creation with datetime64 with non-ns unit as object dtype
closes #11275 closes #11745
  • Loading branch information
sumitbinnani authored and jreback committed Apr 2, 2017
commit ad7356e16191daa650a63728d15659ad6a76cd63
14 changes: 13 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations
New features
~~~~~~~~~~~~


.. _whatsnew_0200.enhancements.dataio_dtype:

``dtype`` keyword for data IO
Expand Down Expand Up @@ -55,6 +54,19 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files.
pd.read_fwf(StringIO(data)).dtypes
pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes

.. _whatsnew_0200.enhancements.datetime_origin:

``to_datetime`` has gained an ``origin`` parameter
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
``pd.to_datetime`` has gained a new parameter, ``origin``, to define a reference date
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's use 'reference date' instead of 'offset' here as well (as you did in the docstring)

from where to compute the resulting ``DatetimeIndex``. (:issue:`11276`, :issue:`11745`)

.. ipython:: python

pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))

The above code returns the dates that lie 1, 2 and 3 days after the reference date given by ``origin``.

.. _whatsnew_0200.enhancements.groupby_access:

Groupby Enhancements
Expand Down
44 changes: 44 additions & 0 deletions pandas/tests/indexes/datetimes/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1515,3 +1515,47 @@ def test_normalize_date():

result = normalize_date(value)
assert (result == datetime(2012, 9, 7))


def test_to_datetime_origin():
    """Exercise the ``origin`` keyword of ``pd.to_datetime``.

    Covers gh-11276 and gh-11745:

    * ``origin='julian'`` with ``unit='D'`` must give the same result as
      subtracting the Julian date of ``Timestamp(0)`` by hand.
    * ``origin='julian'`` combined with any unit other than ``'D'`` must
      raise ``ValueError``.
    * An origin given as a Timestamp, a ``datetime``, a ``datetime64`` or
      a string must shift the parsed values by that reference date, for
      every supported unit.
    * Origins that cannot be turned into a Timestamp must raise
      ``ValueError``.
    """
    units = ['D', 's', 'ms', 'us', 'ns']
    # gh-11276, gh-11745
    # for origin as julian

    julian_dates = pd.date_range(
        '2014-1-1', periods=10).to_julian_date().values
    result = Series(pd.to_datetime(
        julian_dates, unit='D', origin='julian'))
    expected = Series(pd.to_datetime(
        julian_dates - pd.Timestamp(0).to_julian_date(), unit='D'))
    assert_series_equal(result, expected)

    # checking for invalid combination of origin='julian' and unit != D
    for unit in units:
        if unit == 'D':
            continue
        with pytest.raises(ValueError):
            pd.to_datetime(julian_dates, unit=unit, origin='julian')

    # for origin as 1960-01-01
    epoch_1960 = pd.Timestamp('1960-01-01')
    # The same reference date in every representation that should be
    # accepted. ``to_pydatetime`` replaces the deprecated
    # ``Timestamp.to_datetime`` so the test itself emits no warning.
    epoch_timestamp_convertible = [epoch_1960, epoch_1960.to_pydatetime(),
                                   epoch_1960.to_datetime64(),
                                   str(epoch_1960)]
    invalid_origins = ['random_string', '13-24-1990', '0001-01-01']
    units_from_epoch = [0, 1, 2, 3, 4]

    for unit in units:
        for epoch in epoch_timestamp_convertible:
            expected = Series(
                [pd.Timedelta(x, unit=unit) +
                 epoch_1960 for x in units_from_epoch])
            result = Series(pd.to_datetime(
                units_from_epoch, unit=unit, origin=epoch))
            assert_series_equal(result, expected)

        # check for invalid origins
        for origin in invalid_origins:
            with pytest.raises(ValueError):
                pd.to_datetime(units_from_epoch, unit=unit,
                               origin=origin)
76 changes: 58 additions & 18 deletions pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ def _guess_datetime_format_for_array(arr, **kwargs):


def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
utc=None, box=True, format=None, exact=True,
unit=None, infer_datetime_format=False):
utc=None, box=True, format=None, exact=True, coerce=None,
unit=None, infer_datetime_format=False, origin='epoch'):
"""
Convert argument to datetime.

Expand Down Expand Up @@ -236,6 +236,19 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
datetime strings, and if it can be inferred, switch to a faster
method of parsing them. In some cases this can increase the parsing
speed by ~5-10x.
origin : scalar convertible to Timestamp / string ('julian', 'epoch'),
default 'epoch'.
Define reference date. The numeric values would be parsed as number
of units (defined by `unit`) since this reference date.

- If 'epoch', origin is set to 1970-01-01.
- If 'julian', unit must be 'D', and origin is set to beginning of
Julian Calendar. Julian day number 0 is assigned to the day starting
at noon on January 1, 4713 BC.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not critical, but could expand to other semi-common origins @bashtage mentions here.
#11470 (comment)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, maybe, can always add if people really want this.

- If Timestamp convertible, origin is set to Timestamp identified by
origin.

.. versionadded:: 0.20.0

Returns
-------
Expand Down Expand Up @@ -297,8 +310,14 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
>>> %timeit pd.to_datetime(s,infer_datetime_format=False)
1 loop, best of 3: 471 ms per loop

"""
Using non-epoch origins to parse date

>>> pd.to_datetime([1,2,3], unit='D', origin=pd.Timestamp('1960-01-01'))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

blank line between the two lines above

0 1960-01-02
1 1960-01-03
2 1960-01-04

"""
from pandas.tseries.index import DatetimeIndex

tz = 'utc' if utc else None
Expand Down Expand Up @@ -409,22 +428,43 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
except (ValueError, TypeError):
raise e

if arg is None:
return arg
elif isinstance(arg, tslib.Timestamp):
return arg
elif isinstance(arg, ABCSeries):
from pandas import Series
values = _convert_listlike(arg._values, False, format)
return Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, (ABCDataFrame, MutableMapping)):
return _assemble_from_unit_mappings(arg, errors=errors)
elif isinstance(arg, ABCIndexClass):
return _convert_listlike(arg, box, format, name=arg.name)
elif is_list_like(arg):
return _convert_listlike(arg, box, format)
def intermediate_result(arg):
    """Convert ``arg`` to datetimes, before any custom-origin shift.

    Reads ``origin``, ``unit``, ``box``, ``format`` and ``errors`` from the
    enclosing ``to_datetime`` call.

    When ``origin`` is ``'julian'`` the Julian date of ``Timestamp(0)`` is
    subtracted from ``arg`` first, so the conversion below sees day counts
    relative to that timestamp.  The value is then dispatched on its type
    to ``_convert_listlike`` or ``_assemble_from_unit_mappings``.

    Parameters
    ----------
    arg : None, Timestamp, Series, DataFrame, mapping, Index, list-like
        or scalar

    Returns
    -------
    ``None`` or a Timestamp unchanged, a Series for Series input, and
    otherwise whatever the dispatched converter returns.

    Raises
    ------
    ValueError
        If ``origin`` is ``'julian'`` and ``unit`` is not ``'D'``, or if
        ``arg`` does not support subtracting a float.
    """
    if origin == 'julian' and unit != 'D':
        raise ValueError("unit must be 'D' for origin='julian'")

    # None passes through for every origin; checking it before the Julian
    # shift avoids reporting it as an "incompatible" argument.
    if arg is None:
        return arg

    if origin == 'julian':
        try:
            arg = arg - tslib.Timestamp(0).to_julian_date()
        except TypeError:
            # Only the failed subtraction is translated; a bare except
            # would also swallow KeyboardInterrupt and SystemExit.
            raise ValueError("incompatible 'arg' type for given "
                             "'origin'='julian'")

    if isinstance(arg, tslib.Timestamp):
        return arg
    elif isinstance(arg, ABCSeries):
        from pandas import Series
        values = _convert_listlike(arg._values, False, format)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (ABCDataFrame, MutableMapping)):
        return _assemble_from_unit_mappings(arg, errors=errors)
    elif isinstance(arg, ABCIndexClass):
        return _convert_listlike(arg, box, format, name=arg.name)
    elif is_list_like(arg):
        return _convert_listlike(arg, box, format)
    # Scalar input: convert as a one-element array and unwrap the result.
    return _convert_listlike(np.array([arg]), box, format)[0]

result = intermediate_result(arg)

offset = None
if origin not in ['epoch', 'julian']:
try:
offset = tslib.Timestamp(origin) - tslib.Timestamp(0)
except ValueError:
raise ValueError("Invalid 'origin' or 'origin' Out of Bound")

return _convert_listlike(np.array([arg]), box, format)[0]
if offset is not None:
result = result + offset
return result


# mappings for assembling units
Expand Down