79 changes: 39 additions & 40 deletions pandas/tests/io/pytables/test_store.py
@@ -41,22 +41,22 @@
 tables = pytest.importorskip("tables")
 
 
-def test_context(setup_path):
-    with tm.ensure_clean(setup_path) as path:
-        try:
-            with HDFStore(path) as tbl:
-                raise ValueError("blah")
-        except ValueError:
-            pass
-    with tm.ensure_clean(setup_path) as path:
+def test_context(setup_path, tmp_path):
+    path = tmp_path / setup_path
+    try:
         with HDFStore(path) as tbl:
-            tbl["a"] = DataFrame(
-                1.1 * np.arange(120).reshape((30, 4)),
-                columns=Index(list("ABCD"), dtype=object),
-                index=Index([f"i-{i}" for i in range(30)], dtype=object),
-            )
-            assert len(tbl) == 1
-            assert type(tbl["a"]) == DataFrame
+            raise ValueError("blah")
+    except ValueError:
+        pass
+    path = tmp_path / setup_path
+    with HDFStore(path) as tbl:
+        tbl["a"] = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+        assert len(tbl) == 1
+        assert type(tbl["a"]) == DataFrame
 
 
 def test_no_track_times(tmp_path, setup_path):
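
For readers skimming the diff: tm.ensure_clean is a pandas test helper, a context manager that yields a temporary file path and deletes the file on exit, whereas pytest's built-in tmp_path fixture gives each test its own pathlib.Path directory that pytest cleans up across runs. A minimal side-by-side sketch of the two styles (standalone illustration, not part of the diff; assumes pandas and PyTables are installed):

    import numpy as np
    import pandas as pd
    import pandas._testing as tm
    from pandas import HDFStore

    def test_old_style():
        # helper-managed temp file, removed when the block exits
        with tm.ensure_clean("store.h5") as path:
            with HDFStore(path) as store:
                store["a"] = pd.DataFrame(np.arange(12.0).reshape(3, 4))

    def test_new_style(tmp_path):
        # per-test directory managed by pytest; no manual cleanup needed
        path = tmp_path / "store.h5"
        with HDFStore(path) as store:
            store["a"] = pd.DataFrame(np.arange(12.0).reshape(3, 4))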
@@ -971,37 +971,36 @@ def test_pickle_path_localpath():
 
 
 @pytest.mark.parametrize("propindexes", [True, False])
-def test_copy(propindexes):
+def test_copy(propindexes, temp_file):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
         columns=Index(list("ABCD")),
         index=Index([f"i-{i}" for i in range(30)]),
     )
 
-    with tm.ensure_clean() as path:
-        with HDFStore(path) as st:
-            st.append("df", df, data_columns=["A"])
-        with tempfile.NamedTemporaryFile() as new_f:
-            with HDFStore(path) as store:
-                with contextlib.closing(
-                    store.copy(new_f.name, keys=None, propindexes=propindexes)
-                ) as tstore:
-                    # check keys
-                    keys = store.keys()
-                    assert set(keys) == set(tstore.keys())
-                    # check indices & nrows
-                    for k in tstore.keys():
-                        if tstore.get_storer(k).is_table:
-                            new_t = tstore.get_storer(k)
-                            orig_t = store.get_storer(k)
-
-                            assert orig_t.nrows == new_t.nrows
-
-                            # check propindixes
-                            if propindexes:
-                                for a in orig_t.axes:
-                                    if a.is_indexed:
-                                        assert new_t[a.name].is_indexed
+    with HDFStore(temp_file) as st:
+        st.append("df", df, data_columns=["A"])
+    with tempfile.NamedTemporaryFile() as new_f:
+        with HDFStore(temp_file) as store:
+            with contextlib.closing(
+                store.copy(new_f.name, keys=None, propindexes=propindexes)
+            ) as tstore:
+                # check keys
+                keys = store.keys()
+                assert set(keys) == set(tstore.keys())
+                # check indices & nrows
+                for k in tstore.keys():
+                    if tstore.get_storer(k).is_table:
+                        new_t = tstore.get_storer(k)
+                        orig_t = store.get_storer(k)
+
+                        assert orig_t.nrows == new_t.nrows
+
+                        # check propindixes
+                        if propindexes:
+                            for a in orig_t.axes:
+                                if a.is_indexed:
+                                    assert new_t[a.name].is_indexed
 
 
 def test_duplicate_column_name(tmp_path, setup_path):
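
Several tests in both files now take a temp_file fixture that this diff does not define. Assuming it lives in a conftest.py and builds on tmp_path, a plausible definition looks like the sketch below (hypothetical; the actual pandas fixture may differ):

    import uuid
    import pytest

    @pytest.fixture
    def temp_file(tmp_path):
        # hypothetical: a unique, already-created file inside pytest's
        # per-test temporary directory
        file_path = tmp_path / str(uuid.uuid4())
        file_path.touch()
        return file_path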
190 changes: 90 additions & 100 deletions pandas/tests/io/test_common.py
@@ -86,12 +86,11 @@ def test_stringify_path_fspath(self):
         result = icom.stringify_path(p)
         assert result == "foo/bar.csv"
 
-    def test_stringify_file_and_path_like(self):
+    def test_stringify_file_and_path_like(self, temp_file):
         # GH 38125: do not stringify file objects that are also path-like
         fsspec = pytest.importorskip("fsspec")
-        with tm.ensure_clean() as path:
-            with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj:
-                assert fsspec_obj == icom.stringify_path(fsspec_obj)
+        with fsspec.open(f"file://{temp_file}", mode="wb") as fsspec_obj:
+            assert fsspec_obj == icom.stringify_path(fsspec_obj)
 
     @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path])
     def test_infer_compression_from_path(self, compression_format, path_type):
@@ -338,49 +337,47 @@ def test_read_fspath_all(self, reader, module, path, datapath):
             ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
         ],
     )
-    def test_write_fspath_all(self, writer_name, writer_kwargs, module):
+    def test_write_fspath_all(self, writer_name, writer_kwargs, module, tmp_path):
         if writer_name in ["to_latex"]:  # uses Styler implementation
             pytest.importorskip("jinja2")
-        p1 = tm.ensure_clean("string")
-        p2 = tm.ensure_clean("fspath")
+        string = str(tmp_path / "string")
+        fspath = str(tmp_path / "fspath")
         df = pd.DataFrame({"A": [1, 2]})
 
-        with p1 as string, p2 as fspath:
-            pytest.importorskip(module)
-            mypath = CustomFSPath(fspath)
-            writer = getattr(df, writer_name)
-
-            writer(string, **writer_kwargs)
-            writer(mypath, **writer_kwargs)
-            with open(string, "rb") as f_str, open(fspath, "rb") as f_path:
-                if writer_name == "to_excel":
-                    # binary representation of excel contains time creation
-                    # data that causes flaky CI failures
-                    result = pd.read_excel(f_str, **writer_kwargs)
-                    expected = pd.read_excel(f_path, **writer_kwargs)
-                    tm.assert_frame_equal(result, expected)
-                else:
-                    result = f_str.read()
-                    expected = f_path.read()
-                    assert result == expected
-
-    def test_write_fspath_hdf5(self):
+        pytest.importorskip(module)
+        mypath = CustomFSPath(fspath)
+        writer = getattr(df, writer_name)
+
+        writer(string, **writer_kwargs)
+        writer(mypath, **writer_kwargs)
+        with open(string, "rb") as f_str, open(fspath, "rb") as f_path:
+            if writer_name == "to_excel":
+                # binary representation of excel contains time creation
+                # data that causes flaky CI failures
+                result = pd.read_excel(f_str, **writer_kwargs)
+                expected = pd.read_excel(f_path, **writer_kwargs)
+                tm.assert_frame_equal(result, expected)
+            else:
+                result = f_str.read()
+                expected = f_path.read()
+                assert result == expected
+
+    def test_write_fspath_hdf5(self, tmp_path):
         # Same test as write_fspath_all, except HDF5 files aren't
         # necessarily byte-for-byte identical for a given dataframe, so we'll
         # have to read and compare equality
         pytest.importorskip("tables")
 
         df = pd.DataFrame({"A": [1, 2]})
-        p1 = tm.ensure_clean("string")
-        p2 = tm.ensure_clean("fspath")
+        string = str(tmp_path / "string")
+        fspath = str(tmp_path / "fspath")
 
-        with p1 as string, p2 as fspath:
-            mypath = CustomFSPath(fspath)
-            df.to_hdf(mypath, key="bar")
-            df.to_hdf(string, key="bar")
+        mypath = CustomFSPath(fspath)
+        df.to_hdf(mypath, key="bar")
+        df.to_hdf(string, key="bar")
 
-            result = pd.read_hdf(fspath, key="bar")
-            expected = pd.read_hdf(string, key="bar")
+        result = pd.read_hdf(fspath, key="bar")
+        expected = pd.read_hdf(string, key="bar")
 
         tm.assert_frame_equal(result, expected)
 
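CustomFSPath in the tests above exercises the os.PathLike protocol: os.fspath() — and therefore open(), to_hdf(), and friends — accepts any object implementing __fspath__. A minimal sketch of such a class (pandas' actual test helper may differ in detail):

    import os

    class CustomFSPath:
        # minimal os.PathLike implementation
        def __init__(self, path):
            self.path = path

        def __fspath__(self):
            return self.path

    # os.fspath() unwraps the object, so path-accepting APIs take it directly
    assert os.fspath(CustomFSPath("data.csv")) == "data.csv"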
@@ -432,35 +429,33 @@ def test_next(self, mmap_file):
         with pytest.raises(StopIteration, match=r"^$"):
             next(wrapper)
 
-    def test_unknown_engine(self):
-        with tm.ensure_clean() as path:
-            df = pd.DataFrame(
-                1.1 * np.arange(120).reshape((30, 4)),
-                columns=pd.Index(list("ABCD")),
-                index=pd.Index([f"i-{i}" for i in range(30)]),
-            )
-            df.to_csv(path)
-            with pytest.raises(ValueError, match="Unknown engine"):
-                pd.read_csv(path, engine="pyt")
-
-    def test_binary_mode(self):
+    def test_unknown_engine(self, temp_file):
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.to_csv(temp_file)
+        with pytest.raises(ValueError, match="Unknown engine"):
+            pd.read_csv(temp_file, engine="pyt")
+
+    def test_binary_mode(self, temp_file):
         """
         'encoding' shouldn't be passed to 'open' in binary mode.
 
         GH 35058
         """
-        with tm.ensure_clean() as path:
-            df = pd.DataFrame(
-                1.1 * np.arange(120).reshape((30, 4)),
-                columns=pd.Index(list("ABCD")),
-                index=pd.Index([f"i-{i}" for i in range(30)]),
-            )
-            df.to_csv(path, mode="w+b")
-            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.to_csv(temp_file, mode="w+b")
+        tm.assert_frame_equal(df, pd.read_csv(temp_file, index_col=0))
 
     @pytest.mark.parametrize("encoding", ["utf-16", "utf-32"])
     @pytest.mark.parametrize("compression_", ["bz2", "xz"])
-    def test_warning_missing_utf_bom(self, encoding, compression_):
+    def test_warning_missing_utf_bom(self, encoding, compression_, temp_file):
         """
         bz2 and xz do not write the byte order mark (BOM) for utf-16/32.
 
@@ -473,17 +468,16 @@ def test_warning_missing_utf_bom(self, encoding, compression_):
             columns=pd.Index(list("ABCD")),
             index=pd.Index([f"i-{i}" for i in range(30)]),
         )
-        with tm.ensure_clean() as path:
-            with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"):
-                df.to_csv(path, compression=compression_, encoding=encoding)
+        with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"):
+            df.to_csv(temp_file, compression=compression_, encoding=encoding)
 
-            # reading should fail (otherwise we wouldn't need the warning)
-            msg = (
-                r"UTF-\d+ stream does not start with BOM|"
-                r"'utf-\d+' codec can't decode byte"
-            )
-            with pytest.raises(UnicodeError, match=msg):
-                pd.read_csv(path, compression=compression_, encoding=encoding)
+        # reading should fail (otherwise we wouldn't need the warning)
+        msg = (
+            r"UTF-\d+ stream does not start with BOM|"
+            r"'utf-\d+' codec can't decode byte"
+        )
+        with pytest.raises(UnicodeError, match=msg):
+            pd.read_csv(temp_file, compression=compression_, encoding=encoding)
 
 
 def test_is_fsspec_url():
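
The msg regex in the hunk above matches behaviour that comes straight from the stdlib codec: a whole-string utf-16 encode prepends a BOM, and the incremental decoder (the path read_csv exercises) refuses a stream that lacks one. A quick stdlib-only check (illustrative, not taken from the PR):

    import io

    data = "A,B\n1,2\n".encode("utf-16")
    assert data[:2] in (b"\xff\xfe", b"\xfe\xff")  # BOM, little- or big-endian

    reader = io.TextIOWrapper(io.BytesIO(data[2:]), encoding="utf-16")
    try:
        reader.read()  # incremental utf-16 decode with the BOM stripped
    except UnicodeError as exc:
        assert "does not start with BOM" in str(exc)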
@@ -514,38 +508,36 @@ def test_is_fsspec_url_chained():
 
 
 @pytest.mark.parametrize("format", ["csv", "json"])
-def test_codecs_encoding(format):
+def test_codecs_encoding(format, temp_file):
     # GH39247
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
         columns=pd.Index(list("ABCD")),
         index=pd.Index([f"i-{i}" for i in range(30)]),
     )
-    with tm.ensure_clean() as path:
-        with open(path, mode="w", encoding="utf-8") as handle:
-            getattr(expected, f"to_{format}")(handle)
-        with open(path, encoding="utf-8") as handle:
-            if format == "csv":
-                df = pd.read_csv(handle, index_col=0)
-            else:
-                df = pd.read_json(handle)
+    with open(temp_file, mode="w", encoding="utf-8") as handle:
+        getattr(expected, f"to_{format}")(handle)
+    with open(temp_file, encoding="utf-8") as handle:
+        if format == "csv":
+            df = pd.read_csv(handle, index_col=0)
+        else:
+            df = pd.read_json(handle)
     tm.assert_frame_equal(expected, df)
 
 
-def test_codecs_get_writer_reader():
+def test_codecs_get_writer_reader(temp_file):
     # GH39247
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
         columns=pd.Index(list("ABCD")),
         index=pd.Index([f"i-{i}" for i in range(30)]),
     )
-    with tm.ensure_clean() as path:
-        with open(path, "wb") as handle:
-            with codecs.getwriter("utf-8")(handle) as encoded:
-                expected.to_csv(encoded)
-        with open(path, "rb") as handle:
-            with codecs.getreader("utf-8")(handle) as encoded:
-                df = pd.read_csv(encoded, index_col=0)
+    with open(temp_file, "wb") as handle:
+        with codecs.getwriter("utf-8")(handle) as encoded:
+            expected.to_csv(encoded)
+    with open(temp_file, "rb") as handle:
+        with codecs.getreader("utf-8")(handle) as encoded:
+            df = pd.read_csv(encoded, index_col=0)
     tm.assert_frame_equal(expected, df)
 
 
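codecs.getwriter and codecs.getreader, used in test_codecs_get_writer_reader above, return stream classes that wrap a binary handle with on-the-fly encoding and decoding, which is why the wrapped objects can be handed straight to to_csv and read_csv. The same round trip with only the stdlib:

    import codecs
    import io

    buf = io.BytesIO()
    writer = codecs.getwriter("utf-8")(buf)  # wraps a bytes stream, accepts str
    writer.write("A,B\n1,2\n")
    raw = buf.getvalue()
    assert raw == b"A,B\n1,2\n"

    reader = codecs.getreader("utf-8")(io.BytesIO(raw))  # yields str from bytes
    assert reader.read() == "A,B\n1,2\n"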
@@ -572,7 +564,7 @@ def test_explicit_encoding(io_class, mode, msg):
 
 @pytest.mark.parametrize("encoding_errors", ["strict", "replace"])
 @pytest.mark.parametrize("format", ["csv", "json"])
-def test_encoding_errors(encoding_errors, format):
+def test_encoding_errors(encoding_errors, format, temp_file):
     # GH39450
     msg = "'utf-8' codec can't decode byte"
     bad_encoding = b"\xe4"
@@ -591,18 +583,17 @@ def test_encoding_errors(encoding_errors, format):
         + b'"}}'
     )
     reader = partial(pd.read_json, orient="index")
-    with tm.ensure_clean() as path:
-        file = Path(path)
-        file.write_bytes(content)
+    file = Path(temp_file)
+    file.write_bytes(content)
 
-        if encoding_errors != "replace":
-            with pytest.raises(UnicodeDecodeError, match=msg):
-                reader(path, encoding_errors=encoding_errors)
-        else:
-            df = reader(path, encoding_errors=encoding_errors)
-            decoded = bad_encoding.decode(errors=encoding_errors)
-            expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2])
-            tm.assert_frame_equal(df, expected)
+    if encoding_errors != "replace":
+        with pytest.raises(UnicodeDecodeError, match=msg):
+            reader(temp_file, encoding_errors=encoding_errors)
+    else:
+        df = reader(temp_file, encoding_errors=encoding_errors)
+        decoded = bad_encoding.decode(errors=encoding_errors)
+        expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2])
+        tm.assert_frame_equal(df, expected)
 
 
 @pytest.mark.parametrize("encoding_errors", [0, None])
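
encoding_errors is forwarded to the codec error handler, the same errors argument bytes.decode takes: 'replace' substitutes U+FFFD where 'strict' (the default) raises. Both branches of test_encoding_errors mirror this stdlib behaviour, as a quick illustration shows:

    bad = b"\xe4"  # not valid as a standalone UTF-8 byte
    assert bad.decode("utf-8", errors="replace") == "\ufffd"

    try:
        bad.decode("utf-8")  # default errors="strict"
    except UnicodeDecodeError as exc:
        assert "codec can't decode byte" in str(exc)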
@@ -616,11 +607,10 @@ def test_encoding_errors_badtype(encoding_errors):
         reader(content)
 
 
-def test_bad_encdoing_errors():
+def test_bad_encdoing_errors(temp_file):
     # GH 39777
-    with tm.ensure_clean() as path:
-        with pytest.raises(LookupError, match="unknown error handler name"):
-            icom.get_handle(path, "w", errors="bad")
+    with pytest.raises(LookupError, match="unknown error handler name"):
+        icom.get_handle(temp_file, "w", errors="bad")
 
 
 @pytest.mark.skipif(WASM, reason="limited file system access on WASM")
@@ -653,7 +643,7 @@ def close(self):
 @pytest.mark.parametrize("compression", [None, "infer"])
 def test_read_csv_chained_url_no_error(compression):
     # GH 60100
-    tar_file_path = "pandas/tests/io/data/tar/test-csv.tar"
+    tar_file_path = "data/tar/test-csv.tar"
     chained_file_url = f"tar://test.csv::file://{tar_file_path}"
 
     result = pd.read_csv(chained_file_url, compression=compression, sep=";")