Skip to content
294 changes: 294 additions & 0 deletions pandas/tests/io/parser/dtypes/test_categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
"""
Tests dtype specification during parsing
for all of the parsers defined in parsers.py
"""
from io import StringIO
import os

import numpy as np
import pytest

from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import Categorical, DataFrame, Timestamp
import pandas._testing as tm


@pytest.mark.parametrize(
"dtype",
[
"category",
CategoricalDtype(),
{"a": "category", "b": "category", "c": CategoricalDtype()},
],
)
def test_categorical_dtype(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["a", "a", "b"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("dtype", [{"b": "category"}, {1: "category"}])
def test_categorical_dtype_single(all_parsers, dtype):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,a,3.4
1,a,3.4
2,b,4.5"""
expected = DataFrame(
{"a": [1, 1, 2], "b": Categorical(["a", "a", "b"]), "c": [3.4, 3.4, 4.5]}
)
actual = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_unsorted(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,b,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", "b", "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_missing(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b,c
1,b,3.4
1,nan,3.4
2,a,4.5"""
expected = DataFrame(
{
"a": Categorical(["1", "1", "2"]),
"b": Categorical(["b", np.nan, "a"]),
"c": Categorical(["3.4", "3.4", "4.5"]),
}
)
actual = parser.read_csv(StringIO(data), dtype="category")
tm.assert_frame_equal(actual, expected)


@pytest.mark.slow
def test_categorical_dtype_high_cardinality_numeric(all_parsers):
# see gh-18186
parser = all_parsers
data = np.sort([str(i) for i in range(524289)])
expected = DataFrame({"a": Categorical(data, ordered=True)})

actual = parser.read_csv(StringIO("a\n" + "\n".join(data)), dtype="category")
actual["a"] = actual["a"].cat.reorder_categories(
np.sort(actual.a.cat.categories), ordered=True
)
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_utf16(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "utf16_ex.txt")
parser = all_parsers
encoding = "utf-16"
sep = "\t"

expected = parser.read_csv(pth, sep=sep, encoding=encoding)
expected = expected.apply(Categorical)

actual = parser.read_csv(pth, sep=sep, encoding=encoding, dtype="category")
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_infer_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"])}),
DataFrame({"a": [1, 2], "b": Categorical(["b", "c"])}, index=[2, 3]),
]
with parser.read_csv(
StringIO(data), dtype={"b": "category"}, chunksize=2
) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_chunksize_explicit_categories(all_parsers):
# see gh-10153
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
cats = ["a", "b", "c"]
expecteds = [
DataFrame({"a": [1, 1], "b": Categorical(["a", "b"], categories=cats)}),
DataFrame(
{"a": [1, 2], "b": Categorical(["b", "c"], categories=cats)},
index=[2, 3],
),
]
dtype = CategoricalDtype(cats)
with parser.read_csv(StringIO(data), dtype={"b": dtype}, chunksize=2) as actuals:
for actual, expected in zip(actuals, expecteds):
tm.assert_frame_equal(actual, expected)


def test_categorical_dtype_latin1(all_parsers, csv_dir_path):
# see gh-10153
pth = os.path.join(csv_dir_path, "unicode_series.csv")
parser = all_parsers
encoding = "latin-1"

expected = parser.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])

actual = parser.read_csv(pth, header=None, encoding=encoding, dtype={1: "category"})
tm.assert_frame_equal(actual, expected)


@pytest.mark.parametrize("ordered", [False, True])
@pytest.mark.parametrize(
"categories",
[["a", "b", "c"], ["a", "c", "b"], ["a", "b", "c", "d"], ["c", "b", "a"]],
)
def test_categorical_category_dtype(all_parsers, categories, ordered):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(
["a", "b", "b", "c"], categories=categories, ordered=ordered
),
}
)

dtype = {"b": CategoricalDtype(categories=categories, ordered=ordered)}
result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_category_dtype_unsorted(all_parsers):
parser = all_parsers
data = """a,b
1,a
1,b
1,b
2,c"""
dtype = CategoricalDtype(["c", "b", "a"])
expected = DataFrame(
{
"a": [1, 1, 1, 2],
"b": Categorical(["a", "b", "b", "c"], categories=["c", "b", "a"]),
}
)

result = parser.read_csv(StringIO(data), dtype={"b": dtype})
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_numeric(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([1, 2, 3])}

data = "b\n1\n1\n2\n3"
expected = DataFrame({"b": Categorical([1, 1, 2, 3])})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_datetime(all_parsers):
parser = all_parsers
dti = pd.DatetimeIndex(["2017-01-01", "2018-01-01", "2019-01-01"], freq=None)
dtype = {"b": CategoricalDtype(dti)}

data = "b\n2017-01-01\n2018-01-01\n2019-01-01"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timestamp(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype([Timestamp("2014")])}

data = "b\n2014-01-01\n2014-01-01T00:00:00"
expected = DataFrame({"b": Categorical([Timestamp("2014")] * 2)})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_coerces_timedelta(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(pd.to_timedelta(["1H", "2H", "3H"]))}

data = "b\n1H\n2H\n3H"
expected = DataFrame({"b": Categorical(dtype["b"].categories)})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"data",
[
"b\nTrue\nFalse\nNA\nFalse",
"b\ntrue\nfalse\nNA\nfalse",
"b\nTRUE\nFALSE\nNA\nFALSE",
"b\nTrue\nFalse\nNA\nFALSE",
],
)
def test_categorical_dtype_coerces_boolean(all_parsers, data):
# see gh-20498
parser = all_parsers
dtype = {"b": CategoricalDtype([False, True])}
expected = DataFrame({"b": Categorical([True, False, None, False])})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)


def test_categorical_unexpected_categories(all_parsers):
parser = all_parsers
dtype = {"b": CategoricalDtype(["a", "b", "d", "e"])}

data = "b\nd\na\nc\nd" # Unexpected c
expected = DataFrame({"b": Categorical(list("dacd"), dtype=dtype["b"])})

result = parser.read_csv(StringIO(data), dtype=dtype)
tm.assert_frame_equal(result, expected)
Loading