initial commit
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
25
venv/Lib/site-packages/pandas/tests/io/pytables/conftest.py
Normal file
25
venv/Lib/site-packages/pandas/tests/io/pytables/conftest.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.io.pytables import HDFStore
|
||||
|
||||
tables = pytest.importorskip("tables")

# Pin each PyTables thread setting to 1 so that we don't have file sharing.
for _thread_setting in ("MAX_NUMEXPR_THREADS", "MAX_BLOSC_THREADS", "MAX_THREADS"):
    setattr(tables.parameters, _thread_setting, 1)
del _thread_setting
|
||||
|
||||
|
||||
@pytest.fixture
def temp_h5_path(tmp_path):
    """Create an empty, uniquely named ``.h5`` file under ``tmp_path``.

    Returns the path of the newly touched file.
    """
    unique_name = f"{uuid.uuid4()}.h5"
    h5_file = tmp_path / unique_name
    h5_file.touch()
    return h5_file
|
||||
|
||||
|
||||
@pytest.fixture
def temp_hdfstore(temp_h5_path):
    """Yield an ``HDFStore`` opened with ``mode="a"`` on ``temp_h5_path``.

    The store is used as a context manager, so the ``with`` block is left
    when the fixture is torn down.
    """
    hdf_store = HDFStore(temp_h5_path, mode="a")
    with hdf_store:
        yield hdf_store
|
||||
989
venv/Lib/site-packages/pandas/tests/io/pytables/test_append.py
Normal file
989
venv/Lib/site-packages/pandas/tests/io/pytables/test_append.py
Normal file
@@ -0,0 +1,989 @@
|
||||
import datetime
|
||||
from datetime import timedelta
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import Timestamp
|
||||
from pandas.compat import PY312
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Index,
|
||||
Series,
|
||||
_testing as tm,
|
||||
concat,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
# Module-wide marker list: every test in this file carries ``single_cpu``.
pytestmark = [pytest.mark.single_cpu]

# Import PyTables through pytest so the tests are skipped when it is missing.
tables = pytest.importorskip("tables")
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::tables.NaturalNameWarning")
def test_append(temp_hdfstore):
    """Append frames under several keys and dtypes and read them back unchanged."""
    # tables.NaturalNameWarning is filtered for this test: one key below
    # contains a space, which is allowed but almost never what you want.
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((20, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=20, freq="B"),
    )
    first_half, second_half = frame[:10], frame[10:]

    # Two appends to a new key.
    temp_hdfstore.append("df1", first_half)
    temp_hdfstore.append("df1", second_half)
    tm.assert_frame_equal(temp_hdfstore["df1"], frame)

    # A table-format ``put`` followed by an append.
    temp_hdfstore.put("df2", first_half, format="table")
    temp_hdfstore.append("df2", second_half)
    tm.assert_frame_equal(temp_hdfstore["df2"], frame)

    # Keys written with a leading slash are read back without it; the second
    # key also contains a space (the NaturalNameWarning case).
    for write_key, read_key in (("/df3", "df3"), ("/df3 foo", "df3 foo")):
        temp_hdfstore.append(write_key, first_half)
        temp_hdfstore.append(write_key, second_half)
        tm.assert_frame_equal(temp_hdfstore[read_key], frame)

    # dtype issues - mixed type in a single object column
    mixed = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
    mixed["mixed_column"] = "testing"
    mixed.loc[2, "mixed_column"] = np.nan
    temp_hdfstore.append("df", mixed)
    tm.assert_frame_equal(temp_hdfstore["df"], mixed)

    # uints - test storage of uints
    def random_ints(upper):
        return np.random.default_rng(2).integers(0, high=upper, size=5)

    uint_frame = DataFrame(
        {
            "u08": Series(random_ints(255), dtype=np.uint8),
            "u16": Series(random_ints(65535), dtype=np.uint16),
            "u32": Series(random_ints(2**30), dtype=np.uint32),
            "u64": Series([2**power for power in range(58, 63)], dtype=np.uint64),
        },
        index=np.arange(5),
    )
    temp_hdfstore.append("uints", uint_frame)
    tm.assert_frame_equal(temp_hdfstore["uints"], uint_frame, check_index_type=True)

    # uints - test storage of uints in indexable columns
    temp_hdfstore.remove("uints")
    # 64-bit indices not yet supported, so "u64" is left out of data_columns
    temp_hdfstore.append("uints", uint_frame, data_columns=["u08", "u16", "u32"])
    tm.assert_frame_equal(temp_hdfstore["uints"], uint_frame, check_index_type=True)
|
||||
|
||||
|
||||
def test_append_series(temp_hdfstore):
    """Round-trip Series through ``append`` and query a named one via ``select``."""
    labelled = Series(
        range(20), dtype=np.float64, index=[f"i_{i}" for i in range(20)]
    )
    dated = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    named = Series(np.arange(100))

    # basic: unnamed Series come back equal and still without a name
    for key, original in (("ss", labelled), ("ts", dated)):
        temp_hdfstore.append(key, original)
        stored = temp_hdfstore[key]
        tm.assert_series_equal(stored, original)
        assert stored.name is None

    named.name = "foo"
    temp_hdfstore.append("ns", named)
    stored = temp_hdfstore["ns"]
    tm.assert_series_equal(stored, named)
    assert stored.name == named.name

    # select on the values
    wanted = named[named > 60]
    stored = temp_hdfstore.select("ns", "foo>60")
    tm.assert_series_equal(stored, wanted)

    # select on the index and values
    wanted = named[(named > 70) & (named.index < 90)]
    # Reading/writing RangeIndex info is not supported yet
    wanted.index = Index(wanted.index._data)
    stored = temp_hdfstore.select("ns", "foo>70 and index<90")
    tm.assert_series_equal(stored, wanted, check_index_type=True)

    # multi-index
    source = DataFrame(np.random.default_rng(2).standard_normal((5, 1)), columns=["A"])
    source["B"] = np.arange(len(source))
    source["C"] = "foo"
    source.loc[3:5, "C"] = "bar"
    source.set_index(["C", "B"], inplace=True)
    stacked = source.stack()
    stacked.index = stacked.index.droplevel(2)
    temp_hdfstore.append("mi", stacked)
    tm.assert_series_equal(temp_hdfstore["mi"], stacked, check_index_type=True)
|
||||
|
||||
|
||||
def test_append_some_nans(temp_hdfstore):
    """Round-trip frames in which some cells, or a whole column, are NaN."""

    def random_values():
        return np.random.default_rng(2).standard_normal(20)

    def assert_round_trip(key, frame):
        # Write the frame in two halves, then compare what is read back.
        temp_hdfstore.append(key, frame[:10])
        temp_hdfstore.append(key, frame[10:])
        tm.assert_frame_equal(temp_hdfstore[key], frame, check_index_type=True)

    base = DataFrame(
        {
            "A": Series(random_values()).astype("int32"),
            "A1": random_values(),
            "A2": random_values(),
            "B": "foo",
            "C": "bar",
            "D": Timestamp("2001-01-01").as_unit("ns"),
            "E": Timestamp("2001-01-02").as_unit("ns"),
        },
        index=np.arange(20),
    )
    # some nans
    base.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
    assert_round_trip("df1", base)

    # first column
    first_blank = base.copy()
    first_blank["A1"] = np.nan
    temp_hdfstore.remove("df1")
    assert_round_trip("df1", first_blank)

    # 2nd column
    second_blank = base.copy()
    second_blank["A2"] = np.nan
    assert_round_trip("df2", second_blank)

    # datetimes
    dates_blank = base.copy()
    dates_blank["E"] = np.nan
    assert_round_trip("df3", dates_blank)
|
||||
|
||||
|
||||
def test_append_all_nans(temp_hdfstore, using_infer_string):
    """Check ``dropna`` (argument and option) for rows that are entirely NaN."""

    def random_values():
        return np.random.default_rng(2).standard_normal(20)

    def append_in_halves(key, frame, **append_kwargs):
        temp_hdfstore.append(key, frame[:10], **append_kwargs)
        temp_hdfstore.append(key, frame[10:], **append_kwargs)

    floats = DataFrame(
        {"A1": random_values(), "A2": random_values()},
        index=np.arange(20),
    )
    floats.loc[0:15, :] = np.nan

    # nan some entire rows (dropna=True)
    append_in_halves("df", floats, dropna=True)
    tm.assert_frame_equal(temp_hdfstore["df"], floats[-4:], check_index_type=True)

    # nan some entire rows (dropna=False)
    append_in_halves("df2", floats, dropna=False)
    tm.assert_frame_equal(temp_hdfstore["df2"], floats, check_index_type=True)

    # tests the option io.hdf.dropna_table
    option_cases = (("df3", False, floats), ("df4", True, floats[-4:]))
    for key, drop_rows, wanted in option_cases:
        with pd.option_context("io.hdf.dropna_table", drop_rows):
            append_in_halves(key, floats)
            tm.assert_frame_equal(temp_hdfstore[key], wanted)

    # nan some entire rows (string are still written!)
    with_strings = DataFrame(
        {"A1": random_values(), "A2": random_values(), "B": "foo", "C": "bar"},
        index=np.arange(20),
    )
    with_strings.loc[0:15, :] = np.nan

    temp_hdfstore.remove("df")
    append_in_halves("df", with_strings, dropna=True)
    stored = temp_hdfstore["df"]
    # TODO: Test is incorrect when not using_infer_string.
    # Should take the last 4 rows unconditionally.
    wanted = with_strings[-4:] if using_infer_string else with_strings
    tm.assert_frame_equal(stored, wanted, check_index_type=True)

    temp_hdfstore.remove("df2")
    append_in_halves("df2", with_strings, dropna=False)
    tm.assert_frame_equal(temp_hdfstore["df2"], with_strings, check_index_type=True)

    # nan some entire rows (but since we have dates they are still
    # written!)
    with_dates = DataFrame(
        {
            "A1": random_values(),
            "A2": random_values(),
            "B": "foo",
            "C": "bar",
            "D": Timestamp("2001-01-01").as_unit("ns"),
            "E": Timestamp("2001-01-02").as_unit("ns"),
        },
        index=np.arange(20),
    )
    with_dates.loc[0:15, :] = np.nan

    temp_hdfstore.remove("df")
    append_in_halves("df", with_dates, dropna=True)
    tm.assert_frame_equal(temp_hdfstore["df"], with_dates, check_index_type=True)

    temp_hdfstore.remove("df2")
    append_in_halves("df2", with_dates, dropna=False)
    tm.assert_frame_equal(temp_hdfstore["df2"], with_dates, check_index_type=True)
|
||||
|
||||
|
||||
def test_append_frame_column_oriented(temp_hdfstore, request):
    """Append a frame column-wise (``axes=["columns"]``) and select from it."""
    # NOTE: the frame must stay bound to the name ``df``; the ``select``
    # expressions further down refer to ``df.index``.
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    df.index = df.index._with_freq(None)  # freq doesn't round-trip

    # Write the first two columns, then the remaining ones.
    left_block = df.iloc[:, :2]
    right_block = df.iloc[:, 2:]
    temp_hdfstore.append("df1", left_block, axes=["columns"])
    temp_hdfstore.append("df1", right_block)
    tm.assert_frame_equal(temp_hdfstore["df1"], df)

    selected = temp_hdfstore.select("df1", "columns=A")
    tm.assert_frame_equal(df.reindex(columns=["A"]), selected)

    # selection on the non-indexable
    py312_xfail = pytest.mark.xfail(
        PY312,
        reason="AST change in PY312",
        raises=ValueError,
    )
    request.applymarker(py312_xfail)
    selected = temp_hdfstore.select("df1", ("columns=A", "index=df.index[0:4]"))
    wanted = df.reindex(columns=["A"], index=df.index[0:4])
    tm.assert_frame_equal(wanted, selected)

    # this isn't supported
    unsupported = re.escape(
        "passing a filterable condition to a non-table indexer "
        "[Filter: Not Initialized]"
    )
    with pytest.raises(TypeError, match=unsupported):
        temp_hdfstore.select("df1", "columns=A and index>df.index[4]")
|
||||
|
||||
|
||||
def test_append_with_different_block_ordering(temp_hdfstore):
    """Append frames whose columns were inserted in varying orders (GH 4096)."""

    def ones(length, dtype):
        return Series([1] * length, dtype=dtype)

    # Same frames each time, but the columns are (re)inserted in a different
    # order depending on the iteration.
    for step in range(10):
        chunk = DataFrame(
            np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
        )
        chunk["index"] = range(10)
        chunk["index"] += step * 10
        chunk["int64"] = ones(len(chunk), "int64")
        chunk["int16"] = ones(len(chunk), "int16")

        if step % 2 == 0:
            # Remove "int64" and add it again.
            del chunk["int64"]
            chunk["int64"] = ones(len(chunk), "int64")
        if step % 3 == 0:
            # Pop "A" and add it again.
            chunk["A"] = chunk.pop("A")

        chunk.set_index("index", inplace=True)

        temp_hdfstore.append("df", chunk)

    # test a different ordering but with more fields (like invalid
    # combinations)
    wider = DataFrame(
        np.random.default_rng(2).standard_normal((10, 2)),
        columns=list("AB"),
        dtype="float64",
    )
    wider["int64"] = ones(len(wider), "int64")
    wider["int16"] = ones(len(wider), "int16")
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", wider)

    # store additional fields in different blocks
    wider["int16_2"] = ones(len(wider), "int16")
    int16_mismatch = re.escape(
        "cannot match existing table structure for [int16] on appending data"
    )
    with pytest.raises(ValueError, match=int16_mismatch):
        temp_hdfstore.append("df", wider)

    # store multiple additional fields in different blocks
    wider["float_3"] = Series([1.0] * len(wider), dtype="float64")
    float_mismatch = re.escape(
        "cannot match existing table structure for [A,B] on appending data"
    )
    with pytest.raises(ValueError, match=float_mismatch):
        temp_hdfstore.append("df", wider)
|
||||
|
||||
|
||||
def test_append_with_strings(temp_hdfstore):
    """Check string column item sizes and ``min_itemsize`` when appending."""

    def assert_itemsize(key, column, expected_size):
        description = temp_hdfstore.get_storer(key).table.description
        assert getattr(description, column).itemsize == expected_size

    # avoid truncation on elements
    long_strings = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
    temp_hdfstore.append("df_big", long_strings)
    tm.assert_frame_equal(temp_hdfstore.select("df_big"), long_strings)
    assert_itemsize("df_big", "values_block_1", 15)

    # appending smaller string ok
    short_strings = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
    temp_hdfstore.append("df_big", short_strings)
    combined = concat([long_strings, short_strings])
    tm.assert_frame_equal(temp_hdfstore.select("df_big"), combined)
    assert_itemsize("df_big", "values_block_1", 15)

    # avoid truncation on elements
    long_strings = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
    temp_hdfstore.append("df_big2", long_strings, min_itemsize={"values": 50})
    tm.assert_frame_equal(temp_hdfstore.select("df_big2"), long_strings)
    assert_itemsize("df_big2", "values_block_1", 50)

    # bigger string on next append
    temp_hdfstore.append("df_new", long_strings)
    too_long = DataFrame([[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]])
    too_long_msg = (
        r"Trying to store a string with len \[26\] in "
        r"\[values_block_1\] column but\n"
        r"this column has a limit of \[15\]!\n"
        "Consider using min_itemsize to preset the sizes on these "
        "columns"
    )
    with pytest.raises(ValueError, match=too_long_msg):
        temp_hdfstore.append("df_new", too_long)

    # min_itemsize on Series index (GH 11412)
    unindexed = DataFrame(
        {
            "A": [0.0, 1.0, 2.0, 3.0, 4.0],
            "B": [0.0, 1.0, 0.0, 1.0, 0.0],
            "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"]),
            "D": date_range("20130101", periods=5),
        }
    )
    indexed = unindexed.set_index("C")
    temp_hdfstore.append("ss", indexed["B"], min_itemsize={"index": 4})
    tm.assert_series_equal(temp_hdfstore.select("ss"), indexed["B"])

    # same as above, with data_columns=True
    temp_hdfstore.append(
        "ss2", indexed["B"], data_columns=True, min_itemsize={"index": 4}
    )
    tm.assert_series_equal(temp_hdfstore.select("ss2"), indexed["B"])

    # min_itemsize in index without appending (GH 10381)
    temp_hdfstore.put("ss3", indexed, format="table", min_itemsize={"index": 6})
    # just make sure there is a longer string:
    relabelled = indexed.copy().reset_index().assign(C="longer").set_index("C")
    temp_hdfstore.append("ss3", relabelled)
    tm.assert_frame_equal(temp_hdfstore.select("ss3"), concat([indexed, relabelled]))

    # same as above, with a Series
    temp_hdfstore.put(
        "ss4", indexed["B"], format="table", min_itemsize={"index": 6}
    )
    temp_hdfstore.append("ss4", relabelled["B"])
    tm.assert_series_equal(
        temp_hdfstore.select("ss4"), concat([indexed["B"], relabelled["B"]])
    )

    # with nans
    with_nans = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    with_nans["string"] = "foo"
    with_nans.loc[with_nans.index[1:4], "string"] = np.nan
    with_nans["string2"] = "bar"
    with_nans.loc[with_nans.index[4:8], "string2"] = np.nan
    with_nans["string3"] = "bah"
    with_nans.loc[with_nans.index[1:], "string3"] = np.nan
    temp_hdfstore.append("df", with_nans)
    stored = temp_hdfstore.select("df")
    tm.assert_frame_equal(stored, with_nans)
|
||||
|
||||
|
||||
def test_append_with_strings2(temp_hdfstore):
    """Check which data columns and item sizes ``min_itemsize`` produces."""

    def assert_itemsize(key, column, expected_size):
        description = temp_hdfstore.get_storer(key).table.description
        assert getattr(description, column).itemsize == expected_size

    def stored_data_columns():
        return temp_hdfstore.get_storer("df").data_columns

    frame = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

    # a min_itemsize that creates a data_column
    temp_hdfstore.append("df", frame, min_itemsize={"A": 200})
    assert_itemsize("df", "A", 200)
    assert stored_data_columns() == ["A"]

    # a min_itemsize that creates a data_column2
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", frame, data_columns=["B"], min_itemsize={"A": 200})
    assert_itemsize("df", "A", 200)
    assert stored_data_columns() == ["B", "A"]

    # a min_itemsize that creates a data_column2
    temp_hdfstore.remove("df")
    temp_hdfstore.append(
        "df", frame, data_columns=["B"], min_itemsize={"values": 200}
    )
    assert_itemsize("df", "B", 200)
    assert_itemsize("df", "values_block_0", 200)
    assert stored_data_columns() == ["B"]

    # infer the .typ on subsequent appends
    temp_hdfstore.remove("df")
    for part in (frame[:5], frame[5:]):
        temp_hdfstore.append("df", part, min_itemsize=200)
    tm.assert_frame_equal(temp_hdfstore["df"], frame)

    # invalid min_itemsize keys
    frame = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
    temp_hdfstore.remove("df")
    bad_key_msg = re.escape(
        "min_itemsize has the key [foo] which is not an axis or data_column"
    )
    with pytest.raises(ValueError, match=bad_key_msg):
        temp_hdfstore.append("df", frame, min_itemsize={"foo": 20, "foobar": 20})
|
||||
|
||||
|
||||
def test_append_with_empty_string(temp_hdfstore):
    """Append a chunk holding only an empty string with ``min_itemsize`` 1."""
    # with all empty strings (GH 12242)
    letters = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
    all_but_last = letters[:-1]
    last_only = letters[-1:]
    for part in (all_but_last, last_only):
        temp_hdfstore.append("df", part, min_itemsize={"x": 1})
    tm.assert_frame_equal(temp_hdfstore.select("df"), letters)
|
||||
|
||||
|
||||
def test_append_with_data_columns(temp_hdfstore):
    """Create data columns on append and filter on them through ``select``."""

    def assert_itemsize(key, column, expected_size):
        description = temp_hdfstore.get_storer(key).table.description
        assert getattr(description, column).itemsize == expected_size

    # NOTE: the base frame must stay bound to the name ``df``; one ``select``
    # expression below refers to ``df.index``.
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B", unit="ns"),
    )
    df.iloc[0, df.columns.get_loc("B")] = 1.0
    temp_hdfstore.append("df", df[:2], data_columns=["B"])
    temp_hdfstore.append("df", df[2:])
    tm.assert_frame_equal(temp_hdfstore["df"], df)

    # check that we have indices created
    assert temp_hdfstore._handle.root.df.table.cols.index.is_indexed is True
    assert temp_hdfstore._handle.root.df.table.cols.B.is_indexed is True

    # data column searching
    selected = temp_hdfstore.select("df", "B>0")
    tm.assert_frame_equal(selected, df[df["B"] > 0])

    # data column searching (with an indexable and a data_columns)
    selected = temp_hdfstore.select("df", "B>0 and index>df.index[3]")
    later_rows = df.reindex(index=df.index[4:])
    tm.assert_frame_equal(selected, later_rows[later_rows["B"] > 0])

    # data column selection with a string data_column
    with_string = df.copy()
    with_string["string"] = "foo"
    with_string.loc[with_string.index[1:4], "string"] = np.nan
    with_string.loc[with_string.index[5:6], "string"] = "bar"
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", with_string, data_columns=["string"])
    selected = temp_hdfstore.select("df", "string='foo'")
    tm.assert_frame_equal(selected, with_string[with_string["string"] == "foo"])

    # using min_itemsize and a data column: three spellings of the size
    for itemsize in ({"string": 30}, 30, {"values": 30}):
        temp_hdfstore.remove("df")
        temp_hdfstore.append(
            "df", with_string, data_columns=["string"], min_itemsize=itemsize
        )
        assert_itemsize("df", "string", 30)

    with_string["string2"] = "foobarbah"
    with_string["string_block1"] = "foobarbah1"
    with_string["string_block2"] = "foobarbah2"
    temp_hdfstore.remove("df")
    temp_hdfstore.append(
        "df",
        with_string,
        data_columns=["string", "string2"],
        min_itemsize={"string": 30, "string2": 40, "values": 50},
    )
    assert_itemsize("df", "string", 30)
    assert_itemsize("df", "string2", 40)
    assert_itemsize("df", "values_block_1", 50)

    # multiple data columns
    multi = df.copy()
    multi.iloc[0, multi.columns.get_loc("A")] = 1.0
    multi.iloc[0, multi.columns.get_loc("B")] = -1.0
    multi["string"] = "foo"

    position = multi.columns.get_loc("string")
    multi.iloc[1:4, position] = np.nan
    multi.iloc[5:6, position] = "bar"

    multi["string2"] = "foo"
    position = multi.columns.get_loc("string2")
    multi.iloc[2:5, position] = np.nan
    multi.iloc[7:8, position] = "bar"
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", multi, data_columns=["A", "B", "string", "string2"])
    selected = temp_hdfstore.select(
        "df", "string='foo' and string2='foo' and A>0 and B<0"
    )
    row_mask = (
        (multi["string"] == "foo")
        & (multi["string2"] == "foo")
        & (multi["A"] > 0)
        & (multi["B"] < 0)
    )
    tm.assert_frame_equal(selected, multi[row_mask], check_freq=False)
    # FIXME: 2020-05-07 freq check randomly fails in the CI

    # yield an empty frame
    selected = temp_hdfstore.select("df", "string='foo' and string2='cool'")
    row_mask = (multi["string"] == "foo") & (multi["string2"] == "cool")
    tm.assert_frame_equal(selected, multi[row_mask])

    # doc example
    doc_frame = df.copy()
    doc_frame["string"] = "foo"
    doc_frame.loc[doc_frame.index[4:6], "string"] = np.nan
    doc_frame.loc[doc_frame.index[7:9], "string"] = "bar"
    doc_frame["string2"] = "cool"
    doc_frame["datetime"] = Timestamp("20010102").as_unit("ns")
    doc_frame.loc[doc_frame.index[3:5], ["A", "B", "datetime"]] = np.nan

    temp_hdfstore.append(
        "df_dc", doc_frame, data_columns=["B", "C", "string", "string2", "datetime"]
    )
    selected = temp_hdfstore.select("df_dc", "B>0")
    tm.assert_frame_equal(selected, doc_frame[doc_frame["B"] > 0])

    selected = temp_hdfstore.select("df_dc", ["B > 0", "C > 0", "string == foo"])
    row_mask = (
        (doc_frame["B"] > 0) & (doc_frame["C"] > 0) & (doc_frame["string"] == "foo")
    )
    tm.assert_frame_equal(selected, doc_frame[row_mask], check_freq=False)
    # FIXME: 2020-12-07 intermittent build failures here with freq of
    #  None instead of BDay(4)

    # doc example part 2
    doc_frame = DataFrame(
        np.random.default_rng(2).standard_normal((8, 3)),
        index=date_range("1/1/2000", periods=8),
        columns=["A", "B", "C"],
    )
    doc_frame["string"] = "foo"
    doc_frame.loc[doc_frame.index[4:6], "string"] = np.nan
    doc_frame.loc[doc_frame.index[7:9], "string"] = "bar"
    doc_frame[["B", "C"]] = doc_frame[["B", "C"]].abs()
    doc_frame["string2"] = "cool"

    # on-disk operations
    temp_hdfstore.remove("df_dc")
    temp_hdfstore.append(
        "df_dc", doc_frame, data_columns=["B", "C", "string", "string2"]
    )

    selected = temp_hdfstore.select("df_dc", "B>0")
    tm.assert_frame_equal(selected, doc_frame[doc_frame["B"] > 0])

    selected = temp_hdfstore.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
    row_mask = (
        (doc_frame["B"] > 0) & (doc_frame["C"] > 0) & (doc_frame["string"] == "foo")
    )
    tm.assert_frame_equal(selected, doc_frame[row_mask])
|
||||
|
||||
|
||||
def test_append_hierarchical(temp_hdfstore, multiindex_dataframe_random_data):
    """Round-trip the multi-index fixture frame, in full and by column subset."""
    frame = multiindex_dataframe_random_data
    frame.columns.name = None

    temp_hdfstore.append("mi", frame)
    tm.assert_frame_equal(temp_hdfstore.select("mi"), frame)

    # GH 3748
    subset = temp_hdfstore.select("mi", columns=["A", "B"])
    tm.assert_frame_equal(subset, frame.reindex(columns=["A", "B"]))

    # Same column subset, this time through ``to_hdf`` / ``read_hdf``.
    frame.to_hdf(temp_hdfstore, key="df", format="table")
    subset = read_hdf(temp_hdfstore, "df", columns=["A", "B"])
    tm.assert_frame_equal(subset, frame.reindex(columns=["A", "B"]))
|
||||
|
||||
|
||||
def test_append_misc(temp_hdfstore):
    """Round-trip a frame appended with ``chunksize`` and with ``expectedrows``."""
    row_labels = Index([f"i-{i}" for i in range(30)])
    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=row_labels,
    )

    cases = (("df", {"chunksize": 1}), ("df1", {"expectedrows": 10}))
    for key, append_kwargs in cases:
        temp_hdfstore.append(key, frame, **append_kwargs)
        stored = temp_hdfstore.select(key)
        tm.assert_frame_equal(stored, frame)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("chunksize", [10, 200, 1000])
def test_append_misc_chunksize(temp_hdfstore, chunksize):
    """Round-trip a mixed-dtype frame appended with the given ``chunksize``."""
    # more chunksize in append tests
    mixed = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    mixed["string"] = "foo"
    mixed["float322"] = 1.0
    mixed["float322"] = mixed["float322"].astype("float32")
    mixed["bool"] = mixed["float322"] > 0
    for column, stamp in (("time1", "20130101"), ("time2", "20130102")):
        mixed[column] = Timestamp(stamp).as_unit("ns")

    temp_hdfstore.append("obj", mixed, chunksize=chunksize)
    tm.assert_frame_equal(temp_hdfstore.select("obj"), mixed)
|
||||
|
||||
|
||||
def test_append_misc_empty_frame(temp_hdfstore):
    """Append and put zero-length frames (GH4273)."""
    # 0 len: after appending only an empty frame, selecting the key raises
    no_rows = DataFrame(columns=list("ABC"))
    temp_hdfstore.append("df", no_rows)
    with pytest.raises(KeyError, match="'No object named df in the file'"):
        temp_hdfstore.select("df")

    # repeated append of 0/non-zero frames
    populated = DataFrame(
        np.random.default_rng(2).random((10, 3)), columns=list("ABC")
    )
    for chunk in (populated, no_rows):
        temp_hdfstore.append("df", chunk)
        tm.assert_frame_equal(temp_hdfstore.select("df"), populated)

    # store
    put_frame = DataFrame(columns=list("ABC"))
    temp_hdfstore.put("df2", put_frame)
    tm.assert_frame_equal(temp_hdfstore.select("df2"), put_frame)
|
||||
|
||||
|
||||
def test_append_raise(temp_hdfstore, using_infer_string):
    """Check the error raised by ``append`` for several kinds of invalid input."""

    def make_frame():
        return DataFrame(
            1.1 * np.arange(120).reshape((30, 4)),
            columns=Index(list("ABCD")),
            index=Index([f"i-{i}" for i in range(30)]),
        )

    # list in column
    frame = make_frame()
    frame["invalid"] = [["a"]] * len(frame)
    assert frame.dtypes["invalid"] == np.object_
    not_string_msg = re.escape(
        "Cannot serialize the column [invalid]\n"
        "because its data contents are not [string] but [mixed] object dtype"
    )
    with pytest.raises(TypeError, match=not_string_msg):
        temp_hdfstore.append("df", frame)

    # multiple invalid columns
    frame["invalid2"] = [["a"]] * len(frame)
    frame["invalid3"] = [["a"]] * len(frame)
    with pytest.raises(TypeError, match=not_string_msg):
        temp_hdfstore.append("df", frame)

    # datetime with embedded nans as object
    frame = make_frame()
    stamps = Series(datetime.datetime(2001, 1, 2), index=frame.index)
    stamps = stamps.astype(object)
    stamps[0:5] = np.nan
    frame["invalid"] = stamps
    assert frame.dtypes["invalid"] == np.object_
    timezone_msg = "too many timezones in this block, create separate data columns"
    with pytest.raises(TypeError, match=timezone_msg):
        temp_hdfstore.append("df", frame)

    # directly ndarray
    value_msg = "value must be None, Series, or DataFrame"
    with pytest.raises(TypeError, match=value_msg):
        temp_hdfstore.append("df", np.arange(10))

    # series directly
    storer_msg = re.escape(
        "cannot properly create the storer for: "
        "[group->df,value-><class 'pandas.Series'>]"
    )
    with pytest.raises(TypeError, match=storer_msg):
        temp_hdfstore.append("df", Series(np.arange(10)))

    # appending an incompatible table
    frame = make_frame()
    temp_hdfstore.append("df", frame)

    frame["foo"] = "foo"
    axes_msg = re.escape(
        "invalid combination of [non_index_axes] on appending data "
        "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
        "[(1, ['A', 'B', 'C', 'D'])]"
    )
    with pytest.raises(ValueError, match=axes_msg):
        temp_hdfstore.append("df", frame)

    # incompatible type (GH 41897)
    frame["foo"] = Timestamp("20130101")
    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", frame)
    frame["foo"] = "bar"
    dtype_msg = re.escape(
        "Cannot serialize the column [foo] "
        "because its data contents are not [string] "
        "but [datetime64[us]] object dtype"
    )
    with pytest.raises(ValueError, match=dtype_msg):
        temp_hdfstore.append("df", frame)
|
||||
|
||||
|
||||
def test_append_with_timedelta(temp_hdfstore, unit):
    """Store a frame with a timedelta column and query it back (GH 3577).

    Column ``C`` is ``A - B`` cast to ``m8[unit]`` with rows 3..5 set to
    missing; it is written in table format (queried with several ``where``
    spellings) and in fixed format (read back whole).
    """
    start = Timestamp("20130101").as_unit("ns")
    later = [start + timedelta(days=i, seconds=10) for i in range(10)]
    frame = DataFrame({"A": start, "B": later})
    frame["C"] = (frame["A"] - frame["B"]).astype(f"m8[{unit}]")
    frame.loc[3:5, "C"] = np.nan

    # table format: plain round trip first
    temp_hdfstore.append("df", frame, data_columns=True)
    tm.assert_frame_equal(temp_hdfstore.select("df"), frame)

    selected = temp_hdfstore.select("df", where="C<100000")
    tm.assert_frame_equal(selected, frame)

    # the same cut-off expressed as a Timedelta object and as a bare string
    selected = temp_hdfstore.select("df", where="C<pd.Timedelta('-3D')")
    tm.assert_frame_equal(selected, frame.iloc[3:])

    selected = temp_hdfstore.select("df", where="C<'-3D'")
    tm.assert_frame_equal(selected, frame.iloc[3:])

    # a bit hacky here as we don't really deal with the NaT properly
    selected = temp_hdfstore.select("df", where="C<'-500000s'")
    tm.assert_frame_equal(selected.dropna(subset=["C"]), frame.iloc[6:])

    selected = temp_hdfstore.select("df", where="C<'-3.5D'")
    tm.assert_frame_equal(selected.iloc[1:], frame.iloc[4:])

    # fixed format
    temp_hdfstore.put("df2", frame)
    tm.assert_frame_equal(temp_hdfstore.select("df2"), frame)
|
||||
|
||||
|
||||
def test_append_to_multiple(temp_hdfstore):
    """Check ``append_to_multiple`` argument validation and a basic round trip."""
    base = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        index=date_range("2000-01-01", periods=10, freq="B"),
        columns=Index(list("ABCD")),
    )
    suffixed = base.copy().rename(columns="{}_2".format)
    suffixed["foo"] = "bar"
    df = concat([base, suffixed], axis=1)

    # exceptions: the selector has to be one of the keys of the passed dict
    bad_selector = "append_to_multiple requires a selector that is in passed dict"
    for layout in ({"df1": ["A", "B"], "df2": None}, {"df1": None, "df2": None}):
        with pytest.raises(ValueError, match=bad_selector):
            temp_hdfstore.append_to_multiple(layout, df, selector="df3")

    # exceptions: the layout itself has to be a dict
    not_a_dict = (
        "append_to_multiple must have a dictionary specified as the way to "
        "split the value"
    )
    with pytest.raises(ValueError, match=not_a_dict):
        temp_hdfstore.append_to_multiple("df1", df, "df1")

    # regular operation
    temp_hdfstore.append_to_multiple(
        {"df1": ["A", "B"], "df2": None}, df, selector="df1"
    )
    selected = temp_hdfstore.select_as_multiple(
        ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
    )
    both_positive = (df["A"] > 0) & (df["B"] > 0)
    tm.assert_frame_equal(selected, df[both_positive])
|
||||
|
||||
|
||||
def test_append_to_multiple_dropna(temp_hdfstore):
    """With ``dropna=True`` the split tables must keep identical row indexes."""

    def make_frame():
        # Each call re-seeds the generator, so both frames hold the same values.
        return DataFrame(
            np.random.default_rng(2).standard_normal((10, 4)),
            columns=Index(list("ABCD")),
            index=date_range("2000-01-01", periods=10, freq="B"),
        )

    df1 = make_frame()
    df2 = make_frame().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    # dropna=True should guarantee rows are synchronized
    temp_hdfstore.append_to_multiple(
        {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
    )
    combined = temp_hdfstore.select_as_multiple(["df1", "df2"])
    tm.assert_frame_equal(combined, df.dropna(), check_index_type=True)

    first_index = temp_hdfstore.select("df1").index
    second_index = temp_hdfstore.select("df2").index
    tm.assert_index_equal(first_index, second_index)
|
||||
|
||||
|
||||
def test_append_to_multiple_dropna_false(temp_hdfstore):
    """With ``dropna=False`` the split tables end up with different rows."""
    df1 = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        index=date_range("2000-01-01", periods=10, freq="B"),
        columns=Index(list("ABCD")),
    )
    df2 = df1.copy().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    with pd.option_context("io.hdf.dropna_table", True):
        # dropna=False shouldn't synchronize row indexes
        temp_hdfstore.append_to_multiple(
            {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
        )

        with pytest.raises(
            ValueError, match="all tables must have exactly the same nrows!"
        ):
            temp_hdfstore.select_as_multiple(["df1a", "df2a"])

        first_index = temp_hdfstore.select("df1a").index
        second_index = temp_hdfstore.select("df2a").index
        assert not first_index.equals(second_index)
|
||||
|
||||
|
||||
def test_append_to_multiple_min_itemsize(temp_hdfstore):
    """``min_itemsize`` may name columns living in different sub-tables (GH 11238)."""
    counts = np.arange(1, 21)
    df = DataFrame(
        {
            "IX": counts,
            "Num": np.arange(1, 21),
            "BigNum": np.arange(1, 21) * 88,
            "Str": ["a"] * 20,
            "LongStr": ["abcde"] * 20,
        }
    )
    expected = df.iloc[[0]]
    # Reading/writing RangeIndex info is not supported yet
    expected.index = Index(list(range(len(expected))))

    layout = {
        "index": ["IX"],
        "nums": ["Num", "BigNum"],
        "strs": ["Str", "LongStr"],
    }
    temp_hdfstore.append_to_multiple(
        layout,
        df.iloc[[0]],
        "index",
        min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
    )
    combined = temp_hdfstore.select_as_multiple(["index", "nums", "strs"])
    tm.assert_frame_equal(combined, expected, check_index_type=True)
|
||||
|
||||
|
||||
def test_append_string_nan_rep(temp_hdfstore):
    """Appending NaNs to string columns respects the stored width (GH 16300)."""
    df = DataFrame({"A": "a", "B": "foo"}, index=np.arange(10))
    df_nan = df.copy()
    df_nan.loc[0:4, :] = np.nan
    too_large = "NaN representation is too large for existing column size"

    # string column too small
    temp_hdfstore.append("sa", df["A"])
    with pytest.raises(ValueError, match=too_large):
        temp_hdfstore.append("sa", df_nan["A"])

    # nan_rep too big
    temp_hdfstore.append("sb", df["B"], nan_rep="bars")
    with pytest.raises(ValueError, match=too_large):
        temp_hdfstore.append("sb", df_nan["B"])

    # smaller modified nan_rep
    temp_hdfstore.append("sc", df["A"], nan_rep="n")
    temp_hdfstore.append("sc", df_nan["A"])
    stored = temp_hdfstore["sc"]
    tm.assert_series_equal(stored, concat([df["A"], df_nan["A"]]))
|
||||
@@ -0,0 +1,189 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
Categorical,
|
||||
DataFrame,
|
||||
Series,
|
||||
_testing as tm,
|
||||
concat,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_categorical(temp_hdfstore):
    """Exercise storing, querying, appending and removing categorical data."""
    # Basic: unordered first, then ordered; ``s`` ends up as the ordered series
    for key, ordered in (("s", False), ("s_ordered", True)):
        s = Series(
            Categorical(
                ["a", "b", "b", "a", "a", "c"],
                categories=["a", "b", "c", "d"],
                ordered=ordered,
            )
        )
        temp_hdfstore.append(key, s, format="table")
        stored = temp_hdfstore.select(key)
        tm.assert_series_equal(s, stored)

    df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
    temp_hdfstore.append("df", df, format="table")
    stored = temp_hdfstore.select("df")
    tm.assert_frame_equal(stored, df)

    # Dtypes: integer categories, without and with a missing value
    for key, values in (
        ("si", [1, 1, 2, 2, 3, 4, 5]),
        ("si2", [1, 1, np.nan, 2, 3, 4, 5]),
    ):
        numeric = Series(values).astype("category")
        temp_hdfstore.append(key, numeric)
        stored = temp_hdfstore.select(key)
        tm.assert_series_equal(stored, numeric)

    # Multiple
    df2 = df.copy()
    df2["s2"] = Series(list("abcdefg")).astype("category")
    temp_hdfstore.append("df2", df2)
    stored = temp_hdfstore.select("df2")
    tm.assert_frame_equal(stored, df2)

    # Make sure the metadata is OK
    info = temp_hdfstore.info()
    assert "/df2 " in info
    # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
    for block_meta in (
        "/df2/meta/values_block_0/meta",
        "/df2/meta/values_block_2/meta",
    ):
        assert block_meta in info

    # unordered
    s = Series(
        Categorical(
            ["a", "b", "b", "a", "a", "c"],
            categories=["a", "b", "c", "d"],
            ordered=False,
        )
    )
    temp_hdfstore.append("s2", s, format="table")
    stored = temp_hdfstore.select("s2")
    tm.assert_series_equal(stored, s)

    # Query
    temp_hdfstore.append("df3", df, data_columns=["s"])
    for wanted, query in (
        (["b", "c"], 's in ["b","c"]'),
        (["b", "c"], 's = ["b","c"]'),
        (["d"], 's in ["d"]'),
        (["f"], 's in ["f"]'),
    ):
        expected = df[df.s.isin(wanted)]
        stored = temp_hdfstore.select("df3", where=[query])
        tm.assert_frame_equal(stored, expected)

    # Appending with same categories is ok
    temp_hdfstore.append("df3", df)

    df = concat([df, df])
    expected = df[df.s.isin(["b", "c"])]
    stored = temp_hdfstore.select("df3", where=['s in ["b","c"]'])
    tm.assert_frame_equal(stored, expected)

    # Appending must have the same categories
    df3 = df.copy()
    df3["s"] = df3["s"].cat.remove_unused_categories()

    with pytest.raises(
        ValueError,
        match="cannot append a categorical with different categories to the existing",
    ):
        temp_hdfstore.append("df3", df3)

    # Remove, and make sure meta data is removed (its a recursive
    # removal so should be).
    assert temp_hdfstore.select("df3/meta/s/meta") is not None
    temp_hdfstore.remove("df3")

    with pytest.raises(KeyError, match="'No object named df3/meta/s/meta in the file'"):
        temp_hdfstore.select("df3/meta/s/meta")
|
||||
|
||||
|
||||
def test_categorical_conversion(temp_h5_path):
    """``read_hdf`` returns no rows when the ``where`` filter matches nothing.

    GH13322: checked once with plain string columns and once after the
    string columns were converted to categoricals.
    """
    df = DataFrame(
        {
            "obsids": ["ESP_012345_6789", "ESP_987654_3210"],
            "imgids": ["APF00006np", "APF0001imm"],
            "data": [4.3, 9.8],
        }
    )

    # Test without categories
    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    df.to_hdf(temp_h5_path, key="df", format="table", data_columns=True)
    filtered = read_hdf(temp_h5_path, "df", where="obsids=B")
    tm.assert_frame_equal(filtered, expected)

    # Test with categories
    df["obsids"] = df["obsids"].astype("category")
    df["imgids"] = df["imgids"].astype("category")

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    df.to_hdf(temp_h5_path, key="df", format="table", data_columns=True)
    filtered = read_hdf(temp_h5_path, "df", where="obsids=B")
    tm.assert_frame_equal(filtered, expected)
|
||||
|
||||
|
||||
def test_categorical_nan_only_columns(temp_h5_path):
    """Categorical columns holding only NaN can be read back (GH18413)."""
    df = DataFrame(
        {
            "a": ["a", "b", "c", np.nan],
            "b": [np.nan] * 4,
            "c": [1, 2, 3, 4],
            "d": Series([None] * 4, dtype=object),
        }
    )
    for column in ("a", "b"):
        df[column] = df[column].astype("category")
    # NOTE(review): "d" is overwritten with the categorical version of "b",
    # not of "d" itself; kept exactly as found -- confirm this is intended.
    df["d"] = df["b"].astype("category")

    df.to_hdf(temp_h5_path, key="df", format="table", data_columns=True)
    reread = read_hdf(temp_h5_path, "df")
    tm.assert_frame_equal(reread, df)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("where, expected", [["q", []], ["a", ["a"]]])
def test_convert_value(temp_h5_path, where: str, expected):
    """``read_hdf`` can filter a categorical column with ``where`` (GH39420)."""
    df = DataFrame({"col": ["a", "b", "s"]})
    df["col"] = df["col"].astype("category")
    categorical_values = sorted(df["col"].unique())

    # The expected frame carries the full category set even when it is empty.
    expected = DataFrame({"col": expected})
    expected["col"] = (
        expected["col"].astype("category").cat.set_categories(categorical_values)
    )

    df.to_hdf(temp_h5_path, key="df", format="table", min_itemsize={"col": 1})
    filtered = read_hdf(temp_h5_path, where=f'col=="{where}"')
    tm.assert_frame_equal(filtered, expected)
|
||||
156
venv/Lib/site-packages/pandas/tests/io/pytables/test_compat.py
Normal file
156
venv/Lib/site-packages/pandas/tests/io/pytables/test_compat.py
Normal file
@@ -0,0 +1,156 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from pandas.compat.numpy import np_version_gt2
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
from pandas.tests.io.generate_legacy_storage_files import create_dataframe_all_types
|
||||
from pandas.util.version import Version
|
||||
|
||||
tables = pytest.importorskip("tables")
|
||||
|
||||
|
||||
@pytest.fixture
def pytables_hdf5_file(temp_h5_path):
    """
    Use PyTables to create a simple HDF5 file.

    Returns a ``(path, table name, DataFrame)`` tuple where the DataFrame is
    built from the same sample rows that were written to the table.
    """
    objname = "pandas_test_timeseries"
    t0 = 1_561_105_000.0

    table_schema = {
        "c0": tables.Time64Col(pos=0),
        "c1": tables.StringCol(5, pos=1),
        "c2": tables.Int64Col(pos=2),
    }
    testsamples = [
        {"c0": t0, "c1": "aaaaa", "c2": 1},
        {"c0": t0 + 1, "c1": "bbbbb", "c2": 2},
        {"c0": t0 + 2, "c1": "ccccc", "c2": 10**5},
        {"c0": t0 + 3, "c1": "ddddd", "c2": 4_294_967_295},
    ]

    with tables.open_file(temp_h5_path, mode="w") as h5file:
        table = h5file.create_table("/", name=objname, description=table_schema)
        for sample in testsamples:
            # Fill every field of the current row, then commit it.
            for field, value in sample.items():
                table.row[field] = value
            table.row.append()

    return temp_h5_path, objname, pd.DataFrame(testsamples)
|
||||
|
||||
|
||||
class TestReadPyTablesHDF5:
    """
    A group of tests which covers reading HDF5 files written by plain PyTables
    (not written by pandas).

    Was introduced for regression-testing issue 11188.
    """

    def test_read_complete(self, pytables_hdf5_file):
        """Reading without bounds returns every row."""
        path, objname, df = pytables_hdf5_file
        loaded = pd.read_hdf(path, key=objname)
        tm.assert_frame_equal(loaded, df, check_index_type=True)

    def test_read_with_start(self, pytables_hdf5_file):
        """Only ``start`` given: the first row is skipped."""
        path, objname, df = pytables_hdf5_file
        # This is a regression test for pandas-dev/pandas/issues/11188
        loaded = pd.read_hdf(path, key=objname, start=1)
        wanted = df.iloc[1:].reset_index(drop=True)
        tm.assert_frame_equal(loaded, wanted, check_index_type=True)

    def test_read_with_stop(self, pytables_hdf5_file):
        """Only ``stop`` given: just the first row is returned."""
        path, objname, df = pytables_hdf5_file
        # This is a regression test for pandas-dev/pandas/issues/11188
        loaded = pd.read_hdf(path, key=objname, stop=1)
        wanted = df.iloc[:1].reset_index(drop=True)
        tm.assert_frame_equal(loaded, wanted, check_index_type=True)

    def test_read_with_startstop(self, pytables_hdf5_file):
        """Both bounds given: exactly the second row is returned."""
        path, objname, df = pytables_hdf5_file
        # This is a regression test for pandas-dev/pandas/issues/11188
        loaded = pd.read_hdf(path, key=objname, start=1, stop=2)
        wanted = df.iloc[1:2].reset_index(drop=True)
        tm.assert_frame_equal(loaded, wanted, check_index_type=True)
|
||||
|
||||
|
||||
# Every legacy HDF5 file shipped with the tests, one pytest case per file.
_legacy_files = list(Path(__file__).parents[1].glob("data/legacy_hdf/*/*.h5"))


@pytest.mark.parametrize("legacy_file", _legacy_files, ids=lambda x: x.name)
def test_legacy_files(datapath, legacy_file, using_infer_string, request):
    """Read a legacy HDF5 file and compare it with the reference frame.

    The name of the directory holding the file is parsed as the version that
    wrote it; columns that those versions are known to read back incorrectly
    are asserted to differ and then excluded from the final comparison.
    """
    legacy_version = Version(legacy_file.parent.name)
    legacy_file = datapath(legacy_file)
    is_fixed = legacy_file.endswith("fixed.h5")

    if not np_version_gt2 and is_fixed:
        # Files created for versions 2.0-3.0 used a numpy version >= 2.0, and
        # unpickling the object dtype column fails with older numpy
        pytest.skip("Fixed format pickle objects don't deserialize with numpy < 2.0")

    result = pd.read_hdf(legacy_file)

    expected = create_dataframe_all_types()

    # the fixed format doesn't include categorical columns (not supported)
    if is_fixed:
        expected = expected.drop(
            # columns=["categorical", "categorical_object", "categorical_int"]
            columns=["categorical_int"]
        )

    # # object dtype columns with strings get read as `str`
    # if using_infer_string:
    #     expected["object"] = expected["object"].astype("str")
    #     expected["object_nan"] = expected["object_nan"].astype("str")
    #     if legacy_file.endswith("table.h5"):
    #         expected["categorical_object"] = expected["categorical_object"].astype(
    #             pd.CategoricalDtype(
    #                 expected["categorical_object"].cat.categories.astype("str")
    #             )
    #         )
    # else:
    #     expected["string"] = expected["string"].astype("object")
    #     if legacy_file.endswith("table.h5"):
    #         expected["object"] = expected["object"].fillna(np.nan)
    #         expected["categorical"] = expected["categorical"].astype(
    #             pd.CategoricalDtype(
    #                 expected["categorical"].cat.categories.astype(object)
    #             )
    #         )
    #     else:
    #         expected["string"] = expected["string"].fillna("nan")

    older_than_2_2 = legacy_version < Version("2.2.0")

    if older_than_2_2 or (legacy_version < Version("3.0.0") and is_fixed):
        # timedelta columns gets read as nanoseconds, resulting in buggy values
        # (this also happened for direct roundtrips with those versions)
        timedelta_cols = ["timedelta_us", "timedelta_ms", "timedelta_s"]
        for name in timedelta_cols:
            assert not result[name].equals(expected[name])
        result = result.drop(columns=timedelta_cols)
        expected = expected.drop(columns=timedelta_cols)

    if older_than_2_2:
        # datetime columns gets read as nanoseconds, resulting in buggy values
        # (this also happened for direct roundtrips with those versions)
        datetime_cols = ["datetime_us", "datetime_ms", "datetime_s", "datetimetz_us"]
        for name in datetime_cols:
            assert not result[name].equals(expected[name])
        result = result.drop(columns=datetime_cols)
        expected = expected.drop(columns=datetime_cols)

    tm.assert_frame_equal(result, expected)
|
||||
206
venv/Lib/site-packages/pandas/tests/io/pytables/test_complex.py
Normal file
206
venv/Lib/site-packages/pandas/tests/io/pytables/test_complex.py
Normal file
@@ -0,0 +1,206 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.pytables import read_hdf
|
||||
|
||||
|
||||
def test_complex_fixed(temp_h5_path):
    """Complex64 and complex128 frames round-trip through the default format."""
    for dtype in (np.complex64, np.complex128):
        df = DataFrame(
            np.random.default_rng(2).random((4, 5)).astype(dtype),
            columns=list("ABCDE"),
            index=list("abcd"),
        )
        df.to_hdf(temp_h5_path, key="df")
        reread = read_hdf(temp_h5_path, "df")
        tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_table(temp_h5_path):
    """Complex64 and complex128 frames round-trip through the table format."""
    # The first write uses the default append mode; the second overwrites.
    for dtype, mode in ((np.complex64, "a"), (np.complex128, "w")):
        df = DataFrame(
            np.random.default_rng(2).random((4, 5)).astype(dtype),
            columns=list("ABCDE"),
            index=list("abcd"),
        )
        df.to_hdf(temp_h5_path, key="df", format="table", mode=mode)
        reread = read_hdf(temp_h5_path, key="df")
        tm.assert_frame_equal(df, reread)
|
||||
|
||||
|
||||
def test_complex_mixed_fixed(temp_h5_path):
    """A frame mixing int, str, float and complex columns round-trips (fixed)."""
    ones = [1.0 + 1.0j] * 4
    columns = {
        "A": [1, 2, 3, 4],
        "B": ["a", "b", "c", "d"],
        "C": np.array(ones, dtype=np.complex64),
        "D": np.array(ones, dtype=np.complex128),
        "E": [1.0, 2.0, 3.0, 4.0],
    }
    df = DataFrame(columns, index=list("abcd"))

    df.to_hdf(temp_h5_path, key="df")
    tm.assert_frame_equal(df, read_hdf(temp_h5_path, "df"))
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning"
)
def test_complex_mixed_table_store_select(temp_hdfstore):
    """A mixed frame with complex columns can be appended and filtered on ``A``."""
    ones = [1.0 + 1.0j] * 4
    columns = {
        "A": [1, 2, 3, 4],
        "B": ["a", "b", "c", "d"],
        "C": np.array(ones, dtype=np.complex64),
        "D": np.array(ones, dtype=np.complex128),
        "E": [1.0, 2.0, 3.0, 4.0],
    }
    df = DataFrame(columns, index=list("abcd"))

    # Only the non-complex columns are declared as data columns.
    temp_hdfstore.append("df", df, data_columns=["A", "B"])
    selected = temp_hdfstore.select("df", where="A>2")
    tm.assert_frame_equal(df.loc[df.A > 2], selected)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning"
)
def test_complex_mixed_table_store_to_path(temp_h5_path):
    """A mixed frame with complex columns round-trips via ``to_hdf`` (table)."""
    ones = [1.0 + 1.0j] * 4
    columns = {
        "A": [1, 2, 3, 4],
        "B": ["a", "b", "c", "d"],
        "C": np.array(ones, dtype=np.complex64),
        "D": np.array(ones, dtype=np.complex128),
        "E": [1.0, 2.0, 3.0, 4.0],
    }
    df = DataFrame(columns, index=list("abcd"))

    df.to_hdf(temp_h5_path, key="df", format="table")
    tm.assert_frame_equal(df, read_hdf(temp_h5_path, "df"))
|
||||
|
||||
|
||||
def test_complex_across_dimensions_fixed(temp_h5_path):
    """A complex Series and a complex DataFrame both round-trip (fixed format)."""
    s = Series(np.array([1.0 + 1.0j] * 4), index=list("abcd"))
    df = DataFrame({"A": s, "B": s})

    # Pair each object with the assertion helper matching its type.
    for obj, comp in ((s, tm.assert_series_equal), (df, tm.assert_frame_equal)):
        obj.to_hdf(temp_h5_path, key="obj", format="fixed")
        reread = read_hdf(temp_h5_path, "obj")
        comp(obj, reread)
|
||||
|
||||
|
||||
def test_complex_across_dimensions(temp_h5_path):
    """A DataFrame built from a complex Series round-trips (table format)."""
    values = np.array([1.0 + 1.0j] * 4)
    s = Series(values, index=list("abcd"))
    df = DataFrame({"A": s, "B": s})

    df.to_hdf(temp_h5_path, key="obj", format="table")
    tm.assert_frame_equal(df, read_hdf(temp_h5_path, "obj"))
|
||||
|
||||
|
||||
def test_complex_indexing_error(temp_hdfstore):
    """Declaring a complex column as a data column raises ``TypeError``."""
    df = DataFrame(
        {
            "A": [1, 2, 3, 4],
            "B": ["a", "b", "c", "d"],
            "C": np.array([1.0 + 1.0j] * 4, dtype=np.complex128),
        },
        index=list("abcd"),
    )

    expected_message = (
        "Columns containing complex values can be stored "
        "but cannot be indexed when using table format. "
        "Either use fixed format, set index=False, "
        "or do not include the columns containing complex "
        "values to data_columns when initializing the table."
    )
    with pytest.raises(TypeError, match=expected_message):
        temp_hdfstore.append("df", df, data_columns=["C"])
|
||||
|
||||
|
||||
def test_complex_series_error(temp_h5_path):
    """A complex Series in table format needs ``index=False`` to be written."""
    s = Series(np.array([1.0 + 1.0j] * 4), index=list("abcd"))

    expected_message = (
        "Columns containing complex values can be stored "
        "but cannot be indexed when using table format. "
        "Either use fixed format, set index=False, "
        "or do not include the columns containing complex "
        "values to data_columns when initializing the table."
    )
    with pytest.raises(TypeError, match=expected_message):
        s.to_hdf(temp_h5_path, key="obj", format="t")

    # Without indexing the very same write succeeds and round-trips.
    s.to_hdf(temp_h5_path, key="obj", format="t", index=False)
    tm.assert_series_equal(s, read_hdf(temp_h5_path, "obj"))
|
||||
|
||||
|
||||
def test_complex_append(temp_hdfstore):
    """Appending a frame with a complex column twice yields both copies."""
    complex_values = np.random.default_rng(2).standard_normal(100).astype(np.complex128)
    real_values = np.random.default_rng(2).standard_normal(100)
    df = DataFrame({"a": complex_values, "b": real_values})

    # Only the real-valued column is declared as a data column.
    temp_hdfstore.append("df", df, data_columns=["b"])
    temp_hdfstore.append("df", df)
    stored = temp_hdfstore.select("df")
    tm.assert_frame_equal(pd.concat([df, df], axis=0), stored)
|
||||
254
venv/Lib/site-packages/pandas/tests/io/pytables/test_errors.py
Normal file
254
venv/Lib/site-packages/pandas/tests/io/pytables/test_errors.py
Normal file
@@ -0,0 +1,254 @@
|
||||
import datetime
|
||||
from io import BytesIO
|
||||
import re
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
CategoricalIndex,
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
MultiIndex,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
from pandas.io.pytables import (
|
||||
Term,
|
||||
_maybe_adjust_name,
|
||||
)
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_pass_spec_to_storer(temp_hdfstore):
    """Selecting from a store written with ``put`` rejects columns/where."""
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
        columns=Index(list("ABCD"), dtype=object),
    )
    temp_hdfstore.put("df", df)

    no_columns = (
        "cannot pass a column specification when reading a Fixed format "
        "store. this store must be selected in its entirety"
    )
    with pytest.raises(TypeError, match=no_columns):
        temp_hdfstore.select("df", columns=["A"])

    no_where = (
        "cannot pass a where specification when reading from a Fixed "
        "format store. this store must be selected in its entirety"
    )
    with pytest.raises(TypeError, match=no_where):
        temp_hdfstore.select("df", where=["columns=A"])
|
||||
|
||||
|
||||
def test_table_index_incompatible_dtypes(temp_hdfstore):
    """Appending a datetime-indexed frame to an integer-indexed table fails."""
    int_indexed = DataFrame({"a": [1, 2, 3]})
    datetime_indexed = DataFrame(
        {"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3, unit="ns")
    )

    temp_hdfstore.put("frame", int_indexed, format="table")
    with pytest.raises(
        TypeError,
        match=re.escape("incompatible kind in col [integer - datetime64[ns]]"),
    ):
        temp_hdfstore.put("frame", datetime_indexed, format="table", append=True)
|
||||
|
||||
|
||||
def test_unimplemented_dtypes_table_columns(temp_hdfstore):
    """Columns of an unsupported dtype cannot be appended to a table."""
    # currently not supported dtypes ####
    unsupported = [("date", datetime.date(2001, 1, 2))]

    for name, value in unsupported:
        df = DataFrame(
            1.1 * np.arange(120).reshape((30, 4)),
            index=Index([f"i-{i}" for i in range(30)], dtype=object),
            columns=Index(list("ABCD"), dtype=object),
        )
        df[name] = value
        with pytest.raises(
            TypeError,
            match=re.escape(f"[{name}] is not implemented as a table column"),
        ):
            temp_hdfstore.append(f"df1_{name}", df)
|
||||
|
||||
|
||||
def test_unimplemented_dtypes_table_columns2(temp_hdfstore):
    """A ``datetime.date`` column next to string columns cannot be appended."""
    # frame
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
        columns=Index(list("ABCD"), dtype=object),
    )
    df["obj1"] = "foo"
    df["obj2"] = "bar"
    df["datetime1"] = datetime.date(2001, 1, 2)
    df = df._consolidate()

    # this fails because we have a date in the object block......
    # Either of the two messages below is accepted.
    alternatives = [
        re.escape(
            "Cannot serialize the column [datetime1]\nbecause its data "
            "contents are not [string] but [date] object dtype"
        ),
        re.escape("[date] is not implemented as a table column"),
    ]
    with pytest.raises(TypeError, match="|".join(alternatives)):
        temp_hdfstore.append("df_unimplemented", df)
|
||||
|
||||
|
||||
def test_invalid_terms(temp_hdfstore):
    """Malformed or meaningless ``where`` terms raise descriptive errors."""
    # NOTE: the local name ``df`` is referenced by the query string
    # "df.index[3]" below, so it must not be renamed.
    df = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        index=date_range("2000-01-01", periods=10, freq="B", unit="ns"),
        columns=Index(list("ABCD"), dtype=object),
    )
    df["string"] = "foo"
    df.loc[df.index[:4], "string"] = "bar"

    temp_hdfstore.put("df", df, format="table")

    # some invalid terms
    missing_where = re.escape(
        "__init__() missing 1 required positional argument: 'where'"
    )
    with pytest.raises(TypeError, match=missing_where):
        Term()

    # more invalid
    not_a_condition = re.escape(
        "cannot process expression [df.index[3]], "
        "[2000-01-06 00:00:00] is not a valid condition"
    )
    with pytest.raises(ValueError, match=not_a_condition):
        temp_hdfstore.select("df", "df.index[3]")

    with pytest.raises(SyntaxError, match="invalid syntax"):
        temp_hdfstore.select("df", "index>")
|
||||
|
||||
|
||||
def test_invalid_terms_from_docs(temp_h5_path):
    """The example queries from the docs run against a data_columns table."""
    # from the docs
    dfq = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        index=date_range("20130101", periods=10, unit="ns"),
        columns=list("ABCD"),
    )
    dfq.to_hdf(temp_h5_path, key="dfq", format="table", data_columns=True)

    # check ok: both queries only need to run without raising
    for query in (
        "index>Timestamp('20130104') & columns=['A', 'B']",
        "A>0 or C>0",
    ):
        read_hdf(temp_h5_path, "dfq", where=query)
|
||||
|
||||
|
||||
def test_invalid_terms_reference(temp_h5_path):
    """Querying columns that were not stored as data columns raises ValueError."""
    # catch the invalid reference
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(
        values,
        columns=list("ABCD"),
        index=date_range("20130101", periods=10, unit="ns"),
    )
    # written without data_columns, so "A" and "C" are not queryable
    frame.to_hdf(temp_h5_path, key="dfq", format="table")

    expected_msg = (
        r"The passed where expression: A>0 or C>0\n\s*"
        r"contains an invalid variable reference\n\s*"
        r"all of the variable references must be a reference to\n\s*"
        r"an axis \(e.g. 'index' or 'columns'\), or a data_column\n\s*"
        r"The currently defined references are: index,columns\n"
    )
    with pytest.raises(ValueError, match=expected_msg):
        read_hdf(temp_h5_path, "dfq", where="A>0 or C>0")
|
||||
|
||||
|
||||
def test_append_with_diff_col_name_types_raises_value_error(temp_hdfstore):
    """Appending frames with differently-typed column names raises ValueError."""
    name = "df_diff_valerror"
    base = DataFrame(np.random.default_rng(2).standard_normal((10, 1)))

    # one single-column frame per column label that differs from the base
    mismatched_labels = ["a", (1, 2), ("1", 2), ("1", 2, object)]
    mismatched = [
        DataFrame({label: np.random.default_rng(2).standard_normal(10)})
        for label in mismatched_labels
    ]

    temp_hdfstore.append(name, base)

    expected_msg = re.escape(
        "cannot match existing table structure for [0] on appending data"
    )
    for frame in mismatched:
        with pytest.raises(ValueError, match=expected_msg):
            temp_hdfstore.append(name, frame)
|
||||
|
||||
|
||||
def test_invalid_complib(temp_h5_path):
    """An unknown ``complib`` value is rejected by ``to_hdf``."""
    values = np.random.default_rng(2).random((4, 5))
    frame = DataFrame(values, index=list("abcd"), columns=list("ABCDE"))

    with pytest.raises(
        ValueError, match=r"complib only supports \[.*\] compression."
    ):
        frame.to_hdf(temp_h5_path, key="df", complib="foolib")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "idx",
    [
        date_range("2019", freq="D", periods=3, tz="UTC", unit="ns"),
        CategoricalIndex(list("abc")),
    ],
)
def test_to_hdf_multiindex_extension_dtype(idx, temp_h5_path):
    """Saving a MultiIndex built from extension-dtype levels is not implemented."""
    # GH 7775
    levels = [idx, idx]
    frame = DataFrame(0, index=MultiIndex.from_arrays(levels), columns=["a"])

    with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
        frame.to_hdf(temp_h5_path, key="df")
|
||||
|
||||
|
||||
def test_unsuppored_hdf_file_error(datapath):
    """Reading the legacy incompatible-dataset file raises ValueError."""
    # GH 9539
    incompatible_file = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5")
    expected_msg = (
        r"Dataset\(s\) incompatible with Pandas data types, "
        "not table, or no datasets found in HDF5 file."
    )

    with pytest.raises(ValueError, match=expected_msg):
        read_hdf(incompatible_file)
|
||||
|
||||
|
||||
def test_read_hdf_errors(temp_h5_path):
    """``read_hdf`` raises OSError for a missing file and for a closed store."""
    values = np.random.default_rng(2).random((4, 5))
    frame = DataFrame(values, index=list("abcd"), columns=list("ABCDE"))

    # a freshly generated file name that was never written
    missing_file = f"{uuid.uuid4()}.h5"
    with pytest.raises(OSError, match=r"File [\S]* does not exist"):
        read_hdf(missing_file, "key")

    frame.to_hdf(temp_h5_path, key="df")
    # open and immediately close so a closed HDFStore can be handed to read_hdf
    closed_store = HDFStore(temp_h5_path, mode="r")
    closed_store.close()

    with pytest.raises(OSError, match="The HDFStore must be open for reading."):
        read_hdf(closed_store, "df")
|
||||
|
||||
|
||||
def test_read_hdf_generic_buffer_errors():
    """Passing an in-memory buffer to ``read_hdf`` raises NotImplementedError."""
    empty_buffer = BytesIO(b"")
    with pytest.raises(
        NotImplementedError,
        match="Support for generic buffers has not been implemented.",
    ):
        read_hdf(empty_buffer, "df")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bad_version", [(1, 2), (1,), [], "12", "123"])
def test_maybe_adjust_name_bad_version_raises(bad_version):
    """Each parametrized ``version`` value is rejected with ValueError."""
    with pytest.raises(
        ValueError, match="Version is incorrect, expected sequence of 3 integers"
    ):
        _maybe_adjust_name("values_block_0", version=bad_version)
|
||||
@@ -0,0 +1,489 @@
|
||||
import os
|
||||
import uuid
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import (
|
||||
is_platform_linux,
|
||||
is_platform_little_endian,
|
||||
is_platform_mac,
|
||||
)
|
||||
from pandas.errors import (
|
||||
ClosedFileError,
|
||||
PossibleDataLossError,
|
||||
)
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
Series,
|
||||
_testing as tm,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
from pandas.io import pytables
|
||||
from pandas.io.pytables import Term
|
||||
|
||||
tables = pytest.importorskip("tables")
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
def test_mode(temp_h5_path, mode, using_infer_string):
    """Exercise one open mode through HDFStore, ``to_hdf`` and ``read_hdf``."""
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    missing_msg = r"[\S]* does not exist"
    missing_file = f"{uuid.uuid4()}.h5"
    # "r" and "r+" are expected to fail with OSError on a missing file
    needs_existing_file = mode in ["r", "r+"]

    # constructor
    if needs_existing_file:
        with pytest.raises(OSError, match=missing_msg):
            HDFStore(missing_file, mode=mode)
    else:
        with HDFStore(temp_h5_path, mode=mode) as store:
            assert store._handle.mode == mode

    # context
    if needs_existing_file:
        with pytest.raises(OSError, match=missing_msg):
            with HDFStore(missing_file, mode=mode) as store:
                pass
    else:
        with HDFStore(temp_h5_path, mode=mode) as store:
            assert store._handle.mode == mode

    # conv write
    if needs_existing_file:
        with pytest.raises(OSError, match=missing_msg):
            frame.to_hdf(missing_file, key="df", mode=mode)
        # write with "w" instead so the read step below has data
        frame.to_hdf(temp_h5_path, key="df", mode="w")
    else:
        frame.to_hdf(temp_h5_path, key="df", mode=mode)

    # conv read
    if mode == "w":
        write_mode_msg = (
            "mode w is not allowed while performing a read. "
            r"Allowed modes are r, r\+ and a."
        )
        with pytest.raises(ValueError, match=write_mode_msg):
            read_hdf(temp_h5_path, "df", mode=mode)
        return

    result = read_hdf(temp_h5_path, "df", mode=mode)
    if using_infer_string:
        frame.columns = frame.columns.astype("str")
    tm.assert_frame_equal(result, frame)
|
||||
|
||||
|
||||
def test_default_mode(temp_h5_path, using_infer_string):
    """``read_hdf`` called without ``mode`` round-trips a written frame."""
    # read_hdf uses default mode
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(
        values,
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    frame.to_hdf(temp_h5_path, key="df", mode="w")
    roundtripped = read_hdf(temp_h5_path, "df")

    expected = frame.copy()
    if using_infer_string:
        expected.columns = expected.columns.astype("str")
    tm.assert_frame_equal(roundtripped, expected)
|
||||
|
||||
|
||||
def test_reopen_handle(temp_h5_path):
    """Re-open one HDFStore in several modes and check its state each time."""

    def _float_series():
        # ten float64 values on a daily DatetimeIndex
        return Series(
            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
        )

    store = HDFStore(temp_h5_path, mode="a")
    store["a"] = _float_series()

    data_loss_msg = (
        r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the "
        "current file!"
    )
    # invalid mode change
    with pytest.raises(PossibleDataLossError, match=data_loss_msg):
        store.open("w")

    store.close()
    assert not store.is_open

    # truncation ok here
    store.open("w")
    assert store.is_open
    assert len(store) == 0
    store.close()
    assert not store.is_open

    store = HDFStore(temp_h5_path, mode="a")
    store["a"] = _float_series()

    # reopen as read, then as append, then as append again
    for reopen_mode in ("r", "a", "a"):
        store.open(reopen_mode)
        assert store.is_open
        assert len(store) == 1
        assert store._mode == reopen_mode
        store.close()
        assert not store.is_open
|
||||
|
||||
|
||||
def test_open_args(using_infer_string):
    """A store opened with the H5FD_CORE driver and no backing store writes no file."""
    target = f"{uuid.uuid4()}.h5"
    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
    )

    # create an in memory store
    store = HDFStore(
        target, mode="a", driver="H5FD_CORE", driver_core_backing_store=0
    )
    store["df"] = frame
    store.append("df2", frame)

    expected = frame.copy()
    if using_infer_string:
        expected.index = expected.index.astype("str")
        expected.columns = expected.columns.astype("str")

    for key in ("df", "df2"):
        tm.assert_frame_equal(store[key], expected)

    store.close()

    # the file should not have actually been written
    assert not os.path.exists(target)
|
||||
|
||||
|
||||
def test_flush(temp_h5_path):
    """``flush`` can be called with and without ``fsync=True`` on an open store."""
    with HDFStore(temp_h5_path, mode="w") as store:
        store["a"] = Series(range(5))
        for flush_kwargs in ({}, {"fsync": True}):
            store.flush(**flush_kwargs)
|
||||
|
||||
|
||||
def test_complibs_default_settings(temp_h5_path, using_infer_string):
    """Check the stored filters for complevel-only, complib-only and neither."""
    # GH15943
    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
    )

    # (to_hdf keyword arguments, expected complevel, expected complib)
    scenarios = [
        # Set complevel and check if complib is automatically set to
        # default value
        ({"complevel": 9}, 9, "zlib"),
        # Set complib and check to see if compression is disabled
        ({"complib": "zlib"}, 0, None),
        # Check if not setting complib or complevel results in no compression
        ({}, 0, None),
    ]

    for to_hdf_kwargs, want_level, want_lib in scenarios:
        frame.to_hdf(temp_h5_path, key="df", **to_hdf_kwargs)
        result = read_hdf(temp_h5_path, "df")

        expected = frame.copy()
        if using_infer_string:
            expected.index = expected.index.astype("str")
            expected.columns = expected.columns.astype("str")
        tm.assert_frame_equal(result, expected)

        with tables.open_file(temp_h5_path, mode="r") as h5file:
            for node in h5file.walk_nodes(where="/df", classname="Leaf"):
                assert node.filters.complevel == want_level
                if want_lib is None:
                    assert node.filters.complib is None
                else:
                    assert node.filters.complib == want_lib
|
||||
|
||||
|
||||
def test_complibs_default_settings_override(temp_h5_path):
    """Compression passed to ``append`` applies to that table only."""
    # Check if file-defaults can be overridden on a per table basis
    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
    )
    store = HDFStore(temp_h5_path)
    store.append("dfc", frame, complevel=9, complib="blosc")
    store.append("df", frame)
    store.close()

    with tables.open_file(temp_h5_path, mode="r") as h5file:
        # "/df" was appended without compression arguments
        for leaf in h5file.walk_nodes(where="/df", classname="Leaf"):
            filters = leaf.filters
            assert filters.complevel == 0
            assert filters.complib is None
        # "/dfc" was appended with explicit blosc level 9
        for leaf in h5file.walk_nodes(where="/dfc", classname="Leaf"):
            filters = leaf.filters
            assert filters.complevel == 9
            assert filters.complib == "blosc"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("lvl", range(10))
@pytest.mark.parametrize("lib", tables.filters.all_complibs)
@pytest.mark.filterwarnings("ignore:object name is not a valid")
def test_complibs(tmp_path, lvl, lib, request):
    """Round-trip a frame for every (complib, complevel) pair and check the filters.

    The frame is written with ``to_hdf`` using the parametrized library and
    level, read back and compared, and then the file is opened with PyTables
    to assert that each leaf node records the requested compression.
    """
    # GH14478
    if is_platform_linux() and lib == "blosc2" and lvl != 0:
        request.applymarker(pytest.mark.xfail(reason=f"Fails for {lib} on Linux"))

    # Skip only the parametrizations whose own library is missing on this
    # platform; previously a missing lzo or bzip2 skipped every library.
    if lib in ("lzo", "bzip2") and not tables.which_lib_version(lib):
        pytest.skip(f"{lib} not available")

    df = DataFrame(
        np.ones((30, 4)), columns=list("ABCD"), index=np.arange(30).astype(np.str_)
    )

    tmpfile = tmp_path / f"{lvl}_{lib}.h5"
    gname = f"{lvl}_{lib}"

    # Write and read file to see if data is consistent
    df.to_hdf(tmpfile, key=gname, complib=lib, complevel=lvl)
    result = read_hdf(tmpfile, gname)
    tm.assert_frame_equal(result, df)

    is_mac = is_platform_mac()

    # Open file and check metadata for correct amount of compression
    with tables.open_file(tmpfile, mode="r") as h5table:
        for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
            assert node.filters.complevel == lvl
            if lvl == 0:
                # level 0 is expected to record no compression library
                assert node.filters.complib is None
            elif is_mac and lib == "blosc2":
                # on macOS either name is accepted for blosc2
                res = node.filters.complib
                assert res in [lib, "blosc2:blosclz"], res
            else:
                assert node.filters.complib == lib
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not is_platform_little_endian(), reason="reason platform is not little endian"
)
def test_encoding(temp_hdfstore):
    """A frame appended with ``encoding="ascii"`` round-trips and can be selected."""
    frame = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
    # put one missing value in each column
    frame.loc[2, "A"] = np.nan
    frame.loc[3, "B"] = np.nan

    temp_hdfstore.append("df", frame, encoding="ascii")
    tm.assert_frame_equal(temp_hdfstore["df"], frame)

    only_a = frame.reindex(columns=["A"])
    selected = temp_hdfstore.select("df", Term("columns=A", encoding="ascii"))
    tm.assert_frame_equal(selected, only_a)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "val",
    [
        [b"E\xc9, 17", b"", b"a", b"b", b"c"],
        [b"E\xc9, 17", b"a", b"b", b"c"],
        [b"EE, 17", b"", b"a", b"b", b"c"],
        [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
        [b"", b"a", b"b", b"c"],
        [b"\xf8\xfc", b"a", b"b", b"c"],
        [b"A\xf8\xfc", b"", b"a", b"b", b"c"],
        [np.nan, b"", b"b", b"c"],
        [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
    ],
)
@pytest.mark.parametrize("dtype", ["category", None])
def test_latin_encoding(temp_h5_path, dtype, val):
    """Latin-1 strings survive a table round trip, with "" read back as missing."""
    encoding = "latin-1"
    missing_marker = ""
    key = "data"

    # decode the byte strings; non-bytes entries (NaN) pass through unchanged
    decoded = [
        item.decode(encoding) if isinstance(item, bytes) else item for item in val
    ]
    ser = Series(decoded, dtype=dtype)

    ser.to_hdf(
        temp_h5_path,
        key=key,
        format="table",
        encoding=encoding,
        nan_rep=missing_marker,
    )
    retrieved = read_hdf(temp_h5_path, key)

    # TODO:(3.0): once Categorical replace deprecation is enforced,
    # we may be able to re-simplify the construction of s_nan
    if dtype != "category":
        expected = ser.replace(missing_marker, np.nan)
    elif missing_marker in ser.cat.categories:
        expected = ser.cat.remove_categories([missing_marker])
    else:
        expected = ser

    tm.assert_series_equal(expected, retrieved)
|
||||
|
||||
|
||||
def test_multiple_open_close(temp_h5_path):
    """Open and close stores on one file repeatedly, then use a closed store."""
    # gh-4409: open & close multiple times

    def _make_frame():
        # 30x4 float frame with object-dtype string labels on both axes
        return DataFrame(
            1.1 * np.arange(120).reshape((30, 4)),
            columns=Index(list("ABCD"), dtype=object),
            index=Index([f"i-{i}" for i in range(30)], dtype=object),
        )

    df = _make_frame()
    df.to_hdf(temp_h5_path, key="df", mode="w", format="table")

    # single
    store = HDFStore(temp_h5_path)
    assert "CLOSED" not in store.info()
    assert store.is_open

    store.close()
    assert "CLOSED" in store.info()
    assert not store.is_open

    if pytables._table_file_open_policy_is_strict:
        # multiples
        first = HDFStore(temp_h5_path)
        already_open_msg = (
            r"The file [\S]* is already opened\. Please close it before "
            r"reopening in write mode\."
        )
        with pytest.raises(ValueError, match=already_open_msg):
            HDFStore(temp_h5_path)

        first.close()
    else:
        # multiples
        first = HDFStore(temp_h5_path)
        second = HDFStore(temp_h5_path)

        assert "CLOSED" not in first.info()
        assert "CLOSED" not in second.info()
        assert first.is_open
        assert second.is_open

        first.close()
        assert "CLOSED" in first.info()
        assert not first.is_open
        assert "CLOSED" not in second.info()
        assert second.is_open

        second.close()
        assert "CLOSED" in first.info()
        assert "CLOSED" in second.info()
        assert not first.is_open
        assert not second.is_open

        # nested close
        store = HDFStore(temp_h5_path, mode="w")
        store.append("df", df)

        second = HDFStore(temp_h5_path)
        second.append("df2", df)
        second.close()
        assert "CLOSED" in second.info()
        assert not second.is_open

        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

        # double closing
        store = HDFStore(temp_h5_path, mode="w")
        store.append("df", df)

        second = HDFStore(temp_h5_path)
        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

        second.close()
        assert "CLOSED" in second.info()
        assert not second.is_open

    # ops on a closed store
    df = _make_frame()
    df.to_hdf(temp_h5_path, key="df", mode="w", format="table")

    store = HDFStore(temp_h5_path)
    store.close()

    # each operation below must raise ClosedFileError on the closed store
    closed_store_ops = [
        store.keys,
        lambda: "df" in store,
        lambda: len(store),
        lambda: store["df"],
        lambda: store.select("df"),
        lambda: store.get("df"),
        lambda: store.append("df2", df),
        lambda: store.put("df3", df),
        lambda: store.get_storer("df2"),
        lambda: store.remove("df2"),
        lambda: store.select("df"),
    ]
    not_open_msg = r"[\S]* file is not open!"
    for operation in closed_store_ops:
        with pytest.raises(ClosedFileError, match=not_open_msg):
            operation()

    # attribute-style access raises AttributeError instead
    with pytest.raises(
        AttributeError, match="'HDFStore' object has no attribute 'df'"
    ):
        store.df
|
||||
|
||||
|
||||
def test_fspath(temp_h5_path):
    """``os.fspath`` of an open store equals the path it was opened with."""
    expected_path = str(temp_h5_path)
    with HDFStore(temp_h5_path) as store:
        assert os.fspath(store) == expected_path
|
||||
80
venv/Lib/site-packages/pandas/tests/io/pytables/test_keys.py
Normal file
80
venv/Lib/site-packages/pandas/tests/io/pytables/test_keys.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
Series,
|
||||
date_range,
|
||||
)
|
||||
|
||||
tables = pytest.importorskip("tables")
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_keys(temp_hdfstore):
    """``len``, ``keys()`` and iteration all report the three stored objects."""
    store = temp_hdfstore
    store["a"] = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    store["b"] = Series(
        range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]
    )
    store["c"] = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=Index([f"i-{i}" for i in range(30)], dtype=object),
    )

    stored_keys = {"/a", "/b", "/c"}
    assert len(store) == 3
    assert set(store.keys()) == stored_keys
    assert set(store) == stored_keys
|
||||
|
||||
|
||||
def test_non_pandas_keys(temp_h5_path):
    """Tables created directly with PyTables are listed only as native keys."""

    class Table1(tables.IsDescription):
        value1 = tables.Float32Col()

    class Table2(tables.IsDescription):
        value2 = tables.Float32Col()

    class Table3(tables.IsDescription):
        value3 = tables.Float32Col()

    # (node name, description class, title) for each native table
    table_specs = [
        ("table1", Table1, "Table 1"),
        ("table2", Table2, "Table 2"),
        ("table3", Table3, "Table 3"),
    ]
    with tables.open_file(temp_h5_path, mode="w") as h5file:
        group = h5file.create_group("/", "group")
        for node_name, description, title in table_specs:
            h5file.create_table(group, node_name, description, title)

    with HDFStore(temp_h5_path) as store:
        assert len(store.keys(include="native")) == 3
        expected = {"/group/table1", "/group/table2", "/group/table3"}
        assert set(store.keys(include="native")) == expected
        assert set(store.keys(include="pandas")) == set()
        # each native table reads back as a single-column frame
        for name in expected:
            frame = store.get(name)
            assert len(frame.columns) == 1
|
||||
|
||||
|
||||
def test_keys_illegal_include_keyword_value(temp_hdfstore):
    """``keys(include=...)`` rejects a value other than 'pandas' or 'native'."""
    expected_msg = "`include` should be either 'pandas' or 'native' but is 'illegal'"
    with pytest.raises(ValueError, match=expected_msg):
        temp_hdfstore.keys(include="illegal")
|
||||
|
||||
|
||||
def test_keys_ignore_hdf_softlink(temp_hdfstore):
    """A soft link added to the file root does not appear in ``keys()``."""
    # GH 20523
    # Puts a softlink into HDF file and rereads
    store = temp_hdfstore
    store.put("df", DataFrame({"A": range(5), "B": range(5)}))

    assert store.keys() == ["/df"]

    handle = store._handle
    handle.create_soft_link(handle.root, "symlink", "df")

    # Should ignore the softlink
    assert store.keys() == ["/df"]
|
||||
393
venv/Lib/site-packages/pandas/tests/io/pytables/test_put.py
Normal file
393
venv/Lib/site-packages/pandas/tests/io/pytables/test_put.py
Normal file
@@ -0,0 +1,393 @@
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import Timestamp
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
MultiIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
concat,
|
||||
date_range,
|
||||
)
|
||||
from pandas.util import _test_decorators as td
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_format_type(temp_hdfstore):
    """The storer's ``format_type`` matches the ``format`` passed to ``put``."""
    frame = DataFrame({"A": [1, 2]})
    formats_by_key = {"a": "fixed", "b": "table"}

    for key, fmt in formats_by_key.items():
        temp_hdfstore.put(key, frame, format=fmt)

    for key, fmt in formats_by_key.items():
        assert temp_hdfstore.get_storer(key).format_type == fmt
|
||||
|
||||
|
||||
def test_format_kwarg_in_constructor(temp_h5_path):
    """Passing ``format`` to the HDFStore constructor raises ValueError."""
    # GH 13291
    with pytest.raises(
        ValueError, match="format is not a defined argument for HDFStore"
    ):
        HDFStore(temp_h5_path, format="table")
|
||||
|
||||
|
||||
def test_api_default_format(temp_hdfstore):
    """Check that the ``io.hdf.default_format`` option drives ``put``/``append``.

    With the option set to "fixed", ``put`` stores a non-table and ``append``
    raises ValueError. With it set to "table", both ``put`` and ``append``
    store tables.
    """
    # default_format option
    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )

    with pd.option_context("io.hdf.default_format", "fixed"):
        temp_hdfstore.put("df", df)
        assert not temp_hdfstore.get_storer("df").is_table

        msg = "Can only append to Tables"
        with pytest.raises(ValueError, match=msg):
            temp_hdfstore.append("df2", df)

    with pd.option_context("io.hdf.default_format", "table"):
        # drop the fixed-format node before storing it again as a table
        temp_hdfstore.remove("df")
        temp_hdfstore.put("df", df)
        assert temp_hdfstore.get_storer("df").is_table

        temp_hdfstore.append("df2", df)
        # check the node that was just appended ("df2"), not "df" again
        assert temp_hdfstore.get_storer("df2").is_table
|
||||
|
||||
|
||||
def test_api_default_format_path(temp_h5_path):
    """``to_hdf`` on a path follows the ``io.hdf.default_format`` option."""
    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )

    def _is_table(key):
        # reopen the file to inspect how ``key`` was stored
        with HDFStore(temp_h5_path) as store:
            return store.get_storer(key).is_table

    with pd.option_context("io.hdf.default_format", "fixed"):
        frame.to_hdf(temp_h5_path, key="df")
        assert not _is_table("df")
        with pytest.raises(ValueError, match="Can only append to Tables"):
            frame.to_hdf(temp_h5_path, key="df2", append=True)

    with pd.option_context("io.hdf.default_format", "table"):
        frame.to_hdf(temp_h5_path, key="df3")
        assert _is_table("df3")
        frame.to_hdf(temp_h5_path, key="df4", append=True)
        assert _is_table("df4")
|
||||
|
||||
|
||||
def test_put(temp_hdfstore):
    """Cover ``put`` and item assignment, including rejected ``append=True`` calls."""
    store = temp_hdfstore
    series = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((20, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=20, freq="B"),
    )

    store["a"] = series
    # item assignment under several keys, including nested and "/"-prefixed
    for key in ("b", "foo/bar/bah", "foo", "/foo"):
        store[key] = frame[:10]
    store.put("c", frame[:10], format="table")

    append_msg = "Can only append to Tables"
    rejected_keys = [
        # not OK, not a table
        "b",
        # node does not currently exist, test _is_table_type returns False
        # in this case
        "f",
        # can't put to a table (use append instead)
        "c",
    ]
    for key in rejected_keys:
        with pytest.raises(ValueError, match=append_msg):
            store.put(key, frame[10:], append=True)

    # overwrite table
    store.put("c", frame[:10], format="table", append=False)
    tm.assert_frame_equal(frame[:10], store["c"])
|
||||
|
||||
|
||||
def test_put_string_index(temp_hdfstore):
    """Series and frames with long string index labels round-trip unchanged."""
    store = temp_hdfstore
    long_labels = [f"I am a very long string index: {i}" for i in range(20)]

    indexes = [
        Index(long_labels),
        # mixed length
        Index(["abcdefghijklmnopqrstuvwxyz1234567890"] + long_labels),
    ]
    for index in indexes:
        series = Series(np.arange(len(index)), index=index)
        frame = DataFrame({"A": series, "B": series})

        store["a"] = series
        tm.assert_series_equal(store["a"], series)

        store["b"] = frame
        tm.assert_frame_equal(store["b"], frame)
|
||||
|
||||
|
||||
def test_put_compression(temp_hdfstore):
    """zlib compression works for table format and is rejected for fixed format."""
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(
        values,
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    temp_hdfstore.put("c", frame, format="table", complib="zlib")
    tm.assert_frame_equal(temp_hdfstore["c"], frame)

    # can't compress if format='fixed'
    with pytest.raises(
        ValueError, match="Compression not supported on Fixed format stores"
    ):
        temp_hdfstore.put("b", frame, format="fixed", complib="zlib")
|
||||
|
||||
|
||||
@td.skip_if_windows
def test_put_compression_blosc(temp_hdfstore):
    """blosc compression is rejected for fixed format and works for table format."""
    values = np.random.default_rng(2).standard_normal((10, 4))
    frame = DataFrame(
        values,
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    # can't compress if format='fixed'
    with pytest.raises(
        ValueError, match="Compression not supported on Fixed format stores"
    ):
        temp_hdfstore.put("b", frame, format="fixed", complib="blosc")

    temp_hdfstore.put("c", frame, format="table", complib="blosc")
    tm.assert_frame_equal(temp_hdfstore["c"], frame)
|
||||
|
||||
|
||||
def test_put_datetime_ser(temp_hdfstore, performance_warning, using_infer_string):
    """A Series of nanosecond-unit Timestamps round-trips through put/get."""
    # https://github.com/pandas-dev/pandas/pull/60663
    stamp = Timestamp("20010102").as_unit("ns")
    ser = Series(3 * [stamp])

    temp_hdfstore.put("ser", ser)

    expected = ser.copy()
    tm.assert_series_equal(temp_hdfstore.get("ser"), expected)
|
||||
|
||||
|
||||
def test_put_mixed_type(temp_hdfstore, performance_warning, using_infer_string):
    """A frame mixing float, str, bool, int and timestamp columns round-trips."""
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    frame["obj1"] = "foo"
    frame["obj2"] = "bar"
    frame["bool1"] = frame["A"] > 0
    frame["bool2"] = frame["B"] > 0
    frame["bool3"] = True
    frame["int1"] = 1
    frame["int2"] = 2

    second = Timestamp("20010102").as_unit("ns")
    third = Timestamp("20010103").as_unit("ns")
    frame["timestamp1"] = second
    frame["timestamp2"] = third
    frame["datetime1"] = second
    frame["datetime2"] = third

    # blank out a few string values, then consolidate the blocks
    frame.loc[frame.index[3:6], ["obj1"]] = np.nan
    frame = frame._consolidate()

    # no warning is expected when strings are inferred as the str dtype
    expected_warning = None if using_infer_string else performance_warning
    with tm.assert_produces_warning(expected_warning):
        temp_hdfstore.put("df", frame)

    stored = temp_hdfstore.get("df")
    tm.assert_frame_equal(stored, frame)
|
||||
|
||||
|
||||
def test_put_str_frame(temp_hdfstore, performance_warning, string_dtype_arguments):
    """A string-dtype frame reads back as "str" or "string" per its NA value."""
    # https://github.com/pandas-dev/pandas/pull/60663
    dtype = pd.StringDtype(*string_dtype_arguments)
    frame = DataFrame({"a": pd.array(["x", pd.NA, "y"], dtype=dtype)})

    temp_hdfstore.put("df", frame)

    # NaN-backed string dtypes are expected back as "str", others as "string"
    if dtype.na_value is np.nan:
        roundtrip_dtype = "str"
    else:
        roundtrip_dtype = "string"
    expected = frame.astype(roundtrip_dtype)

    tm.assert_frame_equal(temp_hdfstore.get("df"), expected)
|
||||
|
||||
|
||||
def test_put_str_series(temp_hdfstore, performance_warning, string_dtype_arguments):
    """A string-dtype Series reads back as "str" or "string" per its NA value."""
    # https://github.com/pandas-dev/pandas/pull/60663
    dtype = pd.StringDtype(*string_dtype_arguments)
    ser = Series(["x", pd.NA, "y"], dtype=dtype)

    temp_hdfstore.put("ser", ser)

    # NaN-backed string dtypes are expected back as "str", others as "string"
    if dtype.na_value is np.nan:
        roundtrip_dtype = "str"
    else:
        roundtrip_dtype = "string"
    expected = ser.astype(roundtrip_dtype)

    tm.assert_series_equal(temp_hdfstore.get("ser"), expected)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format", ["table", "fixed"])
@pytest.mark.parametrize(
    "index",
    [
        Index([str(i) for i in range(10)]),
        Index(np.arange(10, dtype=float)),
        Index(np.arange(10)),
        date_range("2020-01-01", periods=10),
        pd.period_range("2020-01-01", periods=10),
    ],
)
def test_store_index_types(temp_hdfstore, format, index):
    """A frame with each parametrized index type round-trips in both formats."""
    # GH5386
    # test storing various index types
    values = np.random.default_rng(2).standard_normal((10, 2))
    frame = DataFrame(values, columns=list("AB"), index=index)

    temp_hdfstore.put("df", frame, format=format)
    tm.assert_frame_equal(frame, temp_hdfstore["df"])
|
||||
|
||||
|
||||
def test_column_multiindex(temp_hdfstore, using_infer_string):
    """Round-trip a frame with MultiIndex columns, then reject ``data_columns``.

    The frame is written once with the default format and once as a table;
    both reads are compared against the frame re-indexed with the NumPy
    version of its row index. Passing ``data_columns`` must raise ValueError.
    """
    # GH 4710
    # recreate multi-indexes properly

    column_tuples = [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
    columns = MultiIndex.from_tuples(column_tuples, names=["first", "second"])
    frame = DataFrame(np.arange(12).reshape(3, 4), columns=columns)
    expected = frame.set_axis(frame.index.to_numpy())

    # (key, extra put() keyword arguments) for the writes that must succeed
    for key, put_kwargs in (("df", {}), ("df1", {"format": "table"})):
        temp_hdfstore.put(key, frame, **put_kwargs)
        tm.assert_frame_equal(
            temp_hdfstore[key], expected, check_index_type=True, check_column_type=True
        )

    list_msg = re.escape(
        "cannot use a multi-index on axis [1] with data_columns ['A']"
    )
    with pytest.raises(ValueError, match=list_msg):
        temp_hdfstore.put("df2", frame, format="table", data_columns=["A"])

    true_msg = re.escape(
        "cannot use a multi-index on axis [1] with data_columns True"
    )
    with pytest.raises(ValueError, match=true_msg):
        temp_hdfstore.put("df3", frame, format="table", data_columns=True)
|
||||
|
||||
|
||||
def test_column_multiindex_existing(temp_hdfstore, using_infer_string):
    """Append a MultiIndex-column frame twice and read back the concatenation."""
    # appending multi-column on existing table (see GH 6167)

    column_tuples = [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
    columns = MultiIndex.from_tuples(column_tuples, names=["first", "second"])
    frame = DataFrame(np.arange(12).reshape(3, 4), columns=columns)

    for _ in range(2):
        temp_hdfstore.append("df2", frame)

    stored = temp_hdfstore["df2"]
    tm.assert_frame_equal(stored, concat((frame, frame)))
|
||||
|
||||
|
||||
def test_column_multiindex_non_index_axes(temp_hdfstore, using_infer_string):
    """Round-trip, in table format, a frame whose column Index is named "foo"."""
    named_columns = Index(list("ABCD"), name="foo")
    frame = DataFrame(np.arange(12).reshape(3, 4), columns=named_columns)
    expected = frame.set_axis(frame.index.to_numpy())

    temp_hdfstore.put("df1", frame, format="table")
    stored = temp_hdfstore["df1"]
    tm.assert_frame_equal(
        stored, expected, check_index_type=True, check_column_type=True
    )
|
||||
|
||||
|
||||
def test_store_multiindex(temp_hdfstore):
    """Append objects with a three-level row MultiIndex under various names.

    Covers unnamed, partially named and fully named levels, plus the two
    duplicate-name cases that must raise ValueError.
    """
    # validate multi-index names
    # GH 5527

    def build_index(names=None):
        """Return a 2 x 2 x 3 product MultiIndex with the given level names."""
        days = date_range("2013-12-01", "2013-12-02")
        return MultiIndex.from_product([days, range(2), range(3)], names=names)

    def build_frame(index):
        """Return a 12x2 zero frame with columns "a"/"b" on the given index."""
        return DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=index)

    # no names
    frame = build_frame(build_index())
    temp_hdfstore.append("df", frame)
    tm.assert_frame_equal(temp_hdfstore.select("df"), frame)

    # partial names
    temp_hdfstore.remove("df")
    frame = build_frame(build_index(["date", None, None]))
    temp_hdfstore.append("df", frame)
    tm.assert_frame_equal(temp_hdfstore.select("df"), frame)

    # series
    ser = Series(np.zeros(12), index=build_index(["date", None, None]))
    temp_hdfstore.append("ser", ser)
    # the unnamed levels are expected back as "level_1" / "level_2"
    expected_ser = Series(
        np.zeros(12), index=build_index(["date", "level_1", "level_2"])
    )
    tm.assert_series_equal(temp_hdfstore.select("ser"), expected_ser)

    # dup with column
    temp_hdfstore.remove("df")
    frame = build_frame(build_index(["date", "a", "t"]))
    msg = "duplicate names/columns in the multi-index when storing as a table"
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", frame)

    # dup within level
    temp_hdfstore.remove("df")
    frame = build_frame(build_index(["date", "date", "date"]))
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df", frame)

    # fully names
    temp_hdfstore.remove("df")
    frame = build_frame(build_index(["date", "s", "t"]))
    temp_hdfstore.append("df", frame)
    tm.assert_frame_equal(temp_hdfstore.select("df"), frame)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_store_periodindex(temp_h5_path, format):
    """Write a frame with a monthly PeriodIndex via ``to_hdf`` and read it back."""
    # GH 7796
    # test of PeriodIndex in HDFStore
    months = pd.period_range("20220101", freq="M", periods=5)
    values = np.random.default_rng(2).standard_normal((5, 1))
    frame = DataFrame(values, index=months)

    frame.to_hdf(temp_h5_path, key="df", mode="w", format=format)
    reread = pd.read_hdf(temp_h5_path, "df")
    tm.assert_frame_equal(frame, reread)
|
||||
@@ -0,0 +1,12 @@
|
||||
import pytest
|
||||
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
@td.skip_if_installed("tables")
def test_pytables_raises(temp_h5_path):
    """``to_hdf`` must raise an ImportError mentioning "tables"."""
    frame = pd.DataFrame({"A": [1, 2]})
    with pytest.raises(ImportError, match="tables"):
        frame.to_hdf(temp_h5_path, key="df")
|
||||
305
venv/Lib/site-packages/pandas/tests/io/pytables/test_read.py
Normal file
305
venv/Lib/site-packages/pandas/tests/io/pytables/test_read.py
Normal file
@@ -0,0 +1,305 @@
|
||||
from contextlib import closing
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas.compat import is_platform_windows
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
HDFStore,
|
||||
Index,
|
||||
Series,
|
||||
_testing as tm,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
from pandas.io.pytables import TableIterator
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_read_missing_key_close_store(temp_h5_path):
    """A KeyError from ``read_hdf`` on a path must not block a later write."""
    # GH 25766
    frame = DataFrame({"a": range(2), "b": range(2)})
    frame.to_hdf(temp_h5_path, key="k1")

    expected_msg = "'No object named k2 in the file'"
    with pytest.raises(KeyError, match=expected_msg):
        read_hdf(temp_h5_path, "k2")

    # smoke test to test that file is properly closed after
    # read with KeyError before another write
    frame.to_hdf(temp_h5_path, key="k2")
|
||||
|
||||
|
||||
def test_read_index_error_close_store(temp_h5_path):
    """An IndexError from ``read_hdf`` on a path must not block a later write."""
    # GH 25766
    empty = DataFrame({"A": [], "B": []}, index=[])
    empty.to_hdf(temp_h5_path, key="k1")

    expected_msg = r"list index out of range"
    with pytest.raises(IndexError, match=expected_msg):
        read_hdf(temp_h5_path, "k1", stop=0)

    # smoke test to test that file is properly closed after
    # read with IndexError before another write
    empty.to_hdf(temp_h5_path, key="k1")
|
||||
|
||||
|
||||
def test_read_missing_key_opened_store(temp_h5_path):
    """A KeyError from ``read_hdf`` on an open store leaves the store usable."""
    # GH 28699
    frame = DataFrame({"a": range(2), "b": range(2)})
    frame.to_hdf(temp_h5_path, key="k1")

    expected_msg = "'No object named k2 in the file'"
    with HDFStore(temp_h5_path, "r") as open_store:
        with pytest.raises(KeyError, match=expected_msg):
            read_hdf(open_store, "k2")

        # Test that the file is still open after a KeyError and that we can
        # still read from it.
        read_hdf(open_store, "k1")
|
||||
|
||||
|
||||
def test_read_column(temp_hdfstore):
    """Exercise ``HDFStore.select_column``: errors, data columns, start/stop."""
    base = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    # GH 17912
    # HDFStore.select_column should raise a KeyError
    # exception if the key is not a valid store
    with pytest.raises(KeyError, match="No object named df in the file"):
        temp_hdfstore.select_column("df", "index")

    temp_hdfstore.append("df", base)

    # unknown column name
    missing_msg = re.escape("'column [foo] not found in the table'")
    with pytest.raises(KeyError, match=missing_msg):
        temp_hdfstore.select_column("df", "foo")

    # ``where`` is not an accepted keyword
    where_msg = re.escape(
        "select_column() got an unexpected keyword argument 'where'"
    )
    with pytest.raises(TypeError, match=where_msg):
        temp_hdfstore.select_column("df", "index", where=["index>5"])

    # valid
    index_col = temp_hdfstore.select_column("df", "index")
    tm.assert_almost_equal(index_col.values, Series(base.index).values)
    assert isinstance(index_col, Series)

    # not a data indexable column
    block_msg = re.escape(
        "column [values_block_0] can not be extracted individually; "
        "it is not data indexable"
    )
    with pytest.raises(ValueError, match=block_msg):
        temp_hdfstore.select_column("df", "values_block_0")

    # a data column
    with_string = base.copy()
    with_string["string"] = "foo"
    temp_hdfstore.append("df2", with_string, data_columns=["string"])
    string_col = temp_hdfstore.select_column("df2", "string")
    tm.assert_almost_equal(string_col.values, with_string["string"].values)

    # a data column with NaNs; each selection is compared against the
    # correspondingly sliced values of the written column
    with_nans = base.copy()
    with_nans["string"] = "foo"
    with_nans.loc[with_nans.index[4:6], "string"] = np.nan
    temp_hdfstore.append("df3", with_nans, data_columns=["string"])
    selected = temp_hdfstore.select_column("df3", "string")
    tm.assert_almost_equal(selected.values, with_nans["string"].values)

    # start/stop
    selected = temp_hdfstore.select_column("df3", "string", start=2)
    tm.assert_almost_equal(selected.values, with_nans["string"].values[2:])

    selected = temp_hdfstore.select_column("df3", "string", start=-2)
    tm.assert_almost_equal(selected.values, with_nans["string"].values[-2:])

    selected = temp_hdfstore.select_column("df3", "string", stop=2)
    tm.assert_almost_equal(selected.values, with_nans["string"].values[:2])

    selected = temp_hdfstore.select_column("df3", "string", stop=-2)
    tm.assert_almost_equal(selected.values, with_nans["string"].values[:-2])

    selected = temp_hdfstore.select_column("df3", "string", start=2, stop=-2)
    tm.assert_almost_equal(selected.values, with_nans["string"].values[2:-2])

    selected = temp_hdfstore.select_column("df3", "string", start=-2, stop=2)
    tm.assert_almost_equal(selected.values, with_nans["string"].values[-2:2])

    # GH 10392 - make sure column name is preserved
    named = DataFrame(
        {"A": np.random.default_rng(2).standard_normal(10), "B": "foo"}
    )
    temp_hdfstore.append("df4", named, data_columns=True)
    expected = named["B"]
    selected = temp_hdfstore.select_column("df4", "B")
    tm.assert_series_equal(selected, expected)
|
||||
|
||||
|
||||
def test_pytables_native_read(datapath):
    """Reading "detector/readout" from the legacy native file gives a DataFrame."""
    legacy_file = datapath("io", "data", "legacy_hdf/pytables_native.h5")
    with HDFStore(legacy_file, mode="r") as store:
        readout = store["detector/readout"]
    assert isinstance(readout, DataFrame)
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows")
def test_pytables_native2_read(datapath):
    """Reading "detector" from the second legacy native file gives a DataFrame."""
    legacy_file = datapath("io", "data", "legacy_hdf", "pytables_native2.h5")
    with HDFStore(legacy_file, mode="r") as store:
        # building the string representation of the store must not raise
        str(store)
        detector = store["detector"]
    assert isinstance(detector, DataFrame)
|
||||
|
||||
|
||||
def test_read_hdf_open_store(temp_h5_path, using_infer_string):
    """``read_hdf`` on an open store matches a path read and keeps it open."""
    # GH10330
    # No check for non-string path_or-buf, and no test of open store
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )
    frame.index.name = "letters"
    frame = frame.set_index(keys="E", append=True)

    frame.to_hdf(temp_h5_path, key="df", mode="w")
    from_path = read_hdf(temp_h5_path, "df")
    with HDFStore(temp_h5_path, mode="r") as open_store:
        from_store = read_hdf(open_store, "df")
        tm.assert_frame_equal(from_path, from_store)
        assert open_store.is_open
|
||||
|
||||
|
||||
def test_read_hdf_index_not_view(temp_h5_path):
    """The index of a table-format frame read back must own its data."""
    # GH 37441
    # Ensure that the index of the DataFrame is not a view
    # into the original recarray that pytables reads in
    written = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=[0, 1, 2, 3],
        columns=list("ABCDE"),
    )

    written.to_hdf(temp_h5_path, key="df", mode="w", format="table")

    reread = read_hdf(temp_h5_path, "df")
    # ``base is None`` means the underlying array is not a view of another
    assert reread.index._data.base is None
    tm.assert_frame_equal(written, reread)
|
||||
|
||||
|
||||
def test_read_hdf_iterator(temp_h5_path):
    """``read_hdf(..., iterator=True)`` yields the same frame as a plain read."""
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )
    frame.index.name = "letters"
    frame = frame.set_index(keys="E", append=True)

    frame.to_hdf(temp_h5_path, key="df", mode="w", format="t")
    direct = read_hdf(temp_h5_path, "df")
    table_iter = read_hdf(temp_h5_path, "df", iterator=True)
    # close the iterator's store once the first chunk has been pulled
    with closing(table_iter.store):
        assert isinstance(table_iter, TableIterator)
        first_chunk = next(iter(table_iter))
    tm.assert_frame_equal(direct, first_chunk)
|
||||
|
||||
|
||||
def test_read_nokey(temp_h5_path):
    """``read_hdf`` without a key works for one dataset and raises for two."""
    # GH10443
    frame = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )

    # Categorical dtype not supported for "fixed" format. So no need
    # to test with that dtype in the dataframe here.
    frame.to_hdf(temp_h5_path, key="df", mode="a")
    only_dataset = read_hdf(temp_h5_path)
    tm.assert_frame_equal(frame, only_dataset)

    # a second dataset in the same file makes the key mandatory
    frame.to_hdf(temp_h5_path, key="df2", mode="a")
    expected_msg = "key must be provided when HDF5 file contains multiple datasets."
    with pytest.raises(ValueError, match=expected_msg):
        read_hdf(temp_h5_path)
|
||||
|
||||
|
||||
def test_read_nokey_table(temp_h5_path):
    """Keyless ``read_hdf`` on table-format data: one dataset ok, two raise."""
    # GH13231
    categories = Series(list("abacd"), dtype="category")
    frame = DataFrame({"i": range(5), "c": categories})

    frame.to_hdf(temp_h5_path, key="df", mode="a", format="table")
    only_dataset = read_hdf(temp_h5_path)
    tm.assert_frame_equal(frame, only_dataset)

    # a second dataset in the same file makes the key mandatory
    frame.to_hdf(temp_h5_path, key="df2", mode="a", format="table")
    expected_msg = "key must be provided when HDF5 file contains multiple datasets."
    with pytest.raises(ValueError, match=expected_msg):
        read_hdf(temp_h5_path)
|
||||
|
||||
|
||||
def test_read_nokey_empty(temp_h5_path):
    """Keyless ``read_hdf`` raises ValueError on a file with no datasets."""
    # open and immediately close a store so the file holds no datasets
    with HDFStore(temp_h5_path):
        pass

    expected_msg = re.escape(
        "Dataset(s) incompatible with Pandas data types, not table, or no "
        "datasets found in HDF5 file."
    )
    with pytest.raises(ValueError, match=expected_msg):
        read_hdf(temp_h5_path)
|
||||
|
||||
|
||||
def test_read_from_pathlib_path(temp_h5_path):
    """Write with ``to_hdf`` and read with ``read_hdf`` using the fixture path."""
    # GH11773
    written = DataFrame(
        np.random.default_rng(2).random((4, 5)),
        index=list("abcd"),
        columns=list("ABCDE"),
    )

    written.to_hdf(temp_h5_path, key="df", mode="a")
    reread = read_hdf(temp_h5_path, key="df")

    tm.assert_frame_equal(written, reread)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("format", ["fixed", "table"])
def test_read_hdf_series_mode_r(temp_h5_path, format):
    """A stored float Series can be read back with ``mode="r"``."""
    # GH 16583
    # Tests that reading a Series saved to an HDF file
    # still works if a mode='r' argument is supplied
    written = Series(range(10), dtype=np.float64)
    written.to_hdf(temp_h5_path, key="data", format=format)

    reread = read_hdf(temp_h5_path, key="data", mode="r")
    tm.assert_series_equal(reread, written)
|
||||
|
||||
|
||||
def test_read_infer_string(temp_h5_path):
    """Reading under ``future.infer_string`` yields NaN-backed string dtypes."""
    # GH#54431
    written = DataFrame({"a": ["a", "b", None]})
    written.to_hdf(temp_h5_path, key="data", format="table")

    with pd.option_context("future.infer_string", True):
        result = read_hdf(temp_h5_path, key="data", mode="r")

    # both the values and the column labels are expected as string dtype
    nan_string = pd.StringDtype(na_value=np.nan)
    expected = DataFrame(
        {"a": ["a", "b", None]},
        dtype=nan_string,
        columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
    )
    tm.assert_frame_equal(result, expected)
|
||||
|
||||
|
||||
def test_hdfstore_read_datetime64_unit_s(temp_hdfstore):
    """A ``datetime64[s]`` frame survives a put/get round trip unchanged."""
    # GH 59004
    seconds_frame = DataFrame(["2001-01-01", "2002-02-02"], dtype="datetime64[s]")
    temp_hdfstore.put("df_s", seconds_frame)

    reread = temp_hdfstore.get("df_s")
    tm.assert_frame_equal(seconds_frame, reread)
|
||||
@@ -0,0 +1,95 @@
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
date_range,
|
||||
errors,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.single_cpu
|
||||
|
||||
|
||||
def test_retain_index_attributes(temp_hdfstore, unit):
    """Check freq/tz/name of index and columns after a table round trip.

    Appending data with a different frequency must emit
    AttributeConflictWarning and leave the stored index freq as None.
    """
    # GH 3499, losing frequency info on index recreation
    hourly = date_range("2000-1-1", periods=3, freq="h", unit=unit)
    frame = DataFrame({"A": Series(range(3), index=hourly)})

    temp_hdfstore.put("data", frame, format="table")

    stored = temp_hdfstore.get("data")
    tm.assert_frame_equal(frame, stored)

    for attr in ["freq", "tz", "name"]:
        for axis_name in ["index", "columns"]:
            before = getattr(getattr(frame, axis_name), attr, None)
            after = getattr(getattr(stored, axis_name), attr, None)
            assert before == after

    daily = date_range("2002-1-1", periods=3, freq="D", unit=unit)
    # try to append a table with a different frequency
    with tm.assert_produces_warning(errors.AttributeConflictWarning):
        conflicting = DataFrame({"A": Series(range(3), index=daily)})
        temp_hdfstore.append("data", conflicting)

    assert temp_hdfstore.get_storer("data").info["index"]["freq"] is None

    # this is ok
    irregular = DatetimeIndex(
        ["2001-01-01", "2001-01-02", "2002-01-01"], dtype=f"M8[{unit}]"
    )
    first = DataFrame({"A": Series(range(3), index=irregular)})
    temp_hdfstore.append("df2", first)

    daily_again = date_range("2002-1-1", periods=3, freq="D", unit=unit)
    second = DataFrame({"A": Series(range(3), index=daily_again)})
    temp_hdfstore.append("df2", second)
|
||||
|
||||
|
||||
def test_retain_index_attributes2(temp_h5_path):
    """Check warnings and the stored index name when appending via ``to_hdf``."""
    with tm.assert_produces_warning(errors.AttributeConflictWarning):
        hourly = date_range("2000-1-1", periods=3, freq="h")
        first = DataFrame({"A": Series(range(3), index=hourly)})
        first.to_hdf(temp_h5_path, key="data", mode="w", append=True)

        daily = date_range("2002-1-1", periods=3, freq="D")
        second = DataFrame({"A": Series(range(3), index=daily)})
        second.to_hdf(temp_h5_path, key="data", append=True)

        # rewrite the file from scratch with a named index
        named = date_range("2000-1-1", periods=3, freq="h")
        named.name = "foo"
        first = DataFrame({"A": Series(range(3), index=named)})
        first.to_hdf(temp_h5_path, key="data", mode="w", append=True)

    assert read_hdf(temp_h5_path, key="data").index.name == "foo"

    with tm.assert_produces_warning(errors.AttributeConflictWarning):
        renamed = date_range("2001-1-1", periods=3, freq="h")
        renamed.name = "bar"
        second = DataFrame({"A": Series(range(3), index=renamed)})
        second.to_hdf(temp_h5_path, key="data", append=True)

    # after appending an index named "bar" onto "foo", no name is stored
    assert read_hdf(temp_h5_path, "data").index.name is None
|
||||
|
||||
|
||||
def test_retain_datetime_attribute(temp_h5_path):
    """A ``datetime64[us, UTC]`` column survives a to_hdf/read_hdf round trip."""
    timestamps = Series(
        ["2024-08-26 15:13:14", "2024-08-26 15:14:14"],
        dtype="datetime64[us, UTC]",
    )
    written = DataFrame(timestamps)
    written.to_hdf(temp_h5_path, key="Annotations", mode="w")

    reread = read_hdf(temp_h5_path, key="Annotations")
    tm.assert_frame_equal(written, reread)
|
||||
@@ -0,0 +1,572 @@
|
||||
import datetime
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs import Timestamp
|
||||
from pandas.compat import is_platform_windows
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
HDFStore,
|
||||
Index,
|
||||
Series,
|
||||
_testing as tm,
|
||||
bdate_range,
|
||||
date_range,
|
||||
read_hdf,
|
||||
)
|
||||
from pandas.util import _test_decorators as td
|
||||
|
||||
pytestmark = [pytest.mark.single_cpu]
|
||||
|
||||
|
||||
def test_conv_read_write(temp_h5_path):
    """Round-trip Series and frames via ``to_hdf``/``read_hdf``, then filter."""

    def roundtrip(key, obj, **kwargs):
        """Write ``obj`` under ``key`` and return what is read back."""
        obj.to_hdf(temp_h5_path, key=key, **kwargs)
        return read_hdf(temp_h5_path, key)

    dated = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    tm.assert_series_equal(dated, roundtrip("series", dated))

    labelled = Series(
        range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]
    )
    tm.assert_series_equal(labelled, roundtrip("string_series", labelled))

    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )
    tm.assert_frame_equal(frame, roundtrip("frame", frame))

    # table
    table = DataFrame({"A": range(5), "B": range(5)})
    table.to_hdf(temp_h5_path, key="table", append=True)
    filtered = read_hdf(temp_h5_path, "table", where=["index>2"])
    tm.assert_frame_equal(table[table.index > 2], filtered)
|
||||
|
||||
|
||||
def test_long_strings(temp_hdfstore):
    """Append and select a frame whose values and index are 50-char strings."""
    # GH6166
    long_values = ["a" * 50] * 10
    frame = DataFrame({"a": long_values}, index=long_values)

    temp_hdfstore.append("df", frame, data_columns=["a"])

    selected = temp_hdfstore.select("df")
    tm.assert_frame_equal(frame, selected)
|
||||
|
||||
|
||||
def test_api(temp_h5_path):
    """``to_hdf`` accepts ``append`` together with ``format="table"``."""
    # GH4584
    # API issue when to_hdf doesn't accept append AND format args
    path = temp_h5_path

    frame = DataFrame(range(20))
    head, tail = frame.iloc[:10], frame.iloc[10:]

    # first pass appends the head; second pass (append to False) overwrites it
    for first_append in (True, False):
        head.to_hdf(path, key="df", append=first_append, format="table")
        tail.to_hdf(path, key="df", append=True, format="table")
        tm.assert_frame_equal(read_hdf(path, "df"), frame)
|
||||
|
||||
|
||||
def test_api_append(temp_h5_path):
    """``to_hdf`` appends when ``format`` is given on only one of the writes."""
    path = temp_h5_path

    frame = DataFrame(range(20))
    head, tail = frame.iloc[:10], frame.iloc[10:]

    head.to_hdf(path, key="df", append=True)
    tail.to_hdf(path, key="df", append=True, format="table")
    tm.assert_frame_equal(read_hdf(path, "df"), frame)

    # append to False
    head.to_hdf(path, key="df", append=False, format="table")
    tail.to_hdf(path, key="df", append=True)
    tm.assert_frame_equal(read_hdf(path, "df"), frame)
|
||||
|
||||
|
||||
def test_api_2(temp_h5_path):
    """Round-trip with several ``append``/``format`` keyword combinations."""
    frame = DataFrame(range(20))

    keyword_sets = (
        {"append": False, "format": "fixed"},
        {"append": False, "format": "f"},
        {"append": False},
        {},
    )
    for to_hdf_kwargs in keyword_sets:
        frame.to_hdf(temp_h5_path, key="df", **to_hdf_kwargs)
        tm.assert_frame_equal(read_hdf(temp_h5_path, "df"), frame)
|
||||
|
||||
|
||||
def test_api_3(temp_hdfstore):
    """``HDFStore.append`` in two halves with various append/format arguments."""
    frame = DataFrame(range(20))
    head, tail = frame.iloc[:10], frame.iloc[10:]

    # (``append`` flag of the first write, ``format`` of the second write);
    # the second and third scenarios are intentionally the same sequence
    scenarios = [
        (True, "table"),
        (False, "table"),
        (False, "table"),
        (False, None),
    ]
    for position, (first_append, second_format) in enumerate(scenarios):
        if position:
            # every scenario after the first starts from a removed key
            temp_hdfstore.remove("df")
        temp_hdfstore.append("df", head, append=first_append, format="table")
        temp_hdfstore.append("df", tail, append=True, format=second_format)
        tm.assert_frame_equal(temp_hdfstore.select("df"), frame)
|
||||
|
||||
|
||||
def test_api_invalid(temp_h5_path):
    """Invalid ``to_hdf``/``read_hdf`` arguments raise the expected errors."""
    path = temp_h5_path
    # Invalid.
    frame = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )

    # appending is rejected for both spellings of the fixed format
    msg = "Can only append to Tables"
    for fixed_format in ("f", "fixed"):
        with pytest.raises(ValueError, match=msg):
            frame.to_hdf(path, key="df", append=True, format=fixed_format)

    # an unknown format is rejected regardless of ``append``
    msg = r"invalid HDFStore format specified \[foo\]"
    for append_flag in (True, False):
        with pytest.raises(TypeError, match=msg):
            frame.to_hdf(path, key="df", append=append_flag, format="foo")

    # File path doesn't exist
    path = ""
    msg = f"File {path} does not exist"

    with pytest.raises(FileNotFoundError, match=msg):
        read_hdf(path, "df")
|
||||
|
||||
|
||||
def test_get(temp_hdfstore):
    """``get`` and item access agree, with or without a leading slash."""
    temp_hdfstore["a"] = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )

    for key in ("a", "/a"):
        via_get = temp_hdfstore.get(key)
        via_getitem = temp_hdfstore[key]
        tm.assert_series_equal(via_get, via_getitem)

    with pytest.raises(KeyError, match="'No object named b in the file'"):
        temp_hdfstore.get("b")
|
||||
|
||||
|
||||
def test_put_integer(temp_h5_path):
    """Round-trip a 50x100 frame built without explicit index or columns."""
    # non-date, non-string index
    values = np.random.default_rng(2).standard_normal((50, 100))
    frame = DataFrame(values)
    _check_roundtrip(frame, tm.assert_frame_equal, temp_h5_path)
|
||||
|
||||
|
||||
def test_table_values_dtypes_roundtrip(temp_hdfstore, using_infer_string):
    """Column dtypes survive ``append``/read, alone and in a mixed frame."""
    float_frame = DataFrame({"a": [1, 2, 3]}, dtype="f8")
    temp_hdfstore.append("df_f8", float_frame)
    tm.assert_series_equal(float_frame.dtypes, temp_hdfstore["df_f8"].dtypes)

    int_frame = DataFrame({"a": [1, 2, 3]}, dtype="i8")
    temp_hdfstore.append("df_i8", int_frame)
    tm.assert_series_equal(int_frame.dtypes, temp_hdfstore["df_i8"].dtypes)

    # incompatible dtype
    msg = re.escape(
        "Cannot serialize the column [a] "
        "because its data contents are not [float] "
        "but [integer] object dtype"
    )
    with pytest.raises(ValueError, match=msg):
        temp_hdfstore.append("df_i8", float_frame)

    # check creation/storage/retrieval of float32 (a bit hacky to
    # actually create them though)
    f4_frame = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"])
    temp_hdfstore.append("df_f4", f4_frame)
    tm.assert_series_equal(f4_frame.dtypes, temp_hdfstore["df_f4"].dtypes)
    assert f4_frame.dtypes.iloc[0] == "float32"

    # check with mixed dtypes
    numeric_dtypes = ["float32", "float64", "int32", "int64", "int16", "int8"]
    mixed = DataFrame(
        {
            c: Series(np.random.default_rng(2).integers(5), dtype=c)
            for c in numeric_dtypes
        }
    )
    mixed["string"] = "foo"
    mixed["float322"] = 1.0
    mixed["float322"] = mixed["float322"].astype("float32")
    mixed["bool"] = mixed["float32"] > 0
    mixed["time_s_1"] = Timestamp("20130101").as_unit("s")
    mixed["time_s_2"] = Timestamp("20130101 00:00:00").as_unit("s")
    mixed["time_ms"] = Timestamp("20130101 00:00:00.000").as_unit("ms")
    mixed["time_ns"] = Timestamp("20130102 00:00:00.000000000")

    temp_hdfstore.append("df_mixed_dtypes1", mixed)
    dtype_counts = temp_hdfstore.select("df_mixed_dtypes1").dtypes.value_counts()
    # compare on the string form of each dtype
    dtype_counts.index = [str(i) for i in dtype_counts.index]

    if using_infer_string:
        str_dtype = "str"
    else:
        str_dtype = "object"
    expected = Series(
        {
            "float32": 2,
            "float64": 1,
            "int32": 1,
            "bool": 1,
            "int16": 1,
            "int8": 1,
            "int64": 1,
            str_dtype: 1,
            "datetime64[s]": 2,
            "datetime64[ms]": 1,
            "datetime64[ns]": 1,
        },
        name="count",
    )
    tm.assert_series_equal(dtype_counts.sort_index(), expected.sort_index())
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
def test_series(temp_h5_path):
    """Round-trip Series with string, datetime and plain-array indexes."""
    labelled = Series(
        range(10), dtype="float64", index=[f"i_{i}" for i in range(10)]
    )
    _check_roundtrip(labelled, tm.assert_series_equal, path=temp_h5_path)

    dated = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    _check_roundtrip(dated, tm.assert_series_equal, path=temp_h5_path)

    # datetime values on a datetime index
    dates_as_values = Series(dated.index, Index(dated.index))
    _check_roundtrip(dates_as_values, tm.assert_series_equal, path=temp_h5_path)

    # index rebuilt from the NumPy array of the datetime index
    array_indexed = Series(dated.values, Index(np.asarray(dated.index)))
    _check_roundtrip(
        array_indexed,
        tm.assert_series_equal,
        path=temp_h5_path,
        check_index_type=False,
    )
|
||||
|
||||
|
||||
def test_float_index(temp_h5_path):
    """Round-trip a Series whose index is an array of random floats."""
    # GH #454
    float_labels = np.random.default_rng(2).standard_normal(10)
    values = np.random.default_rng(2).standard_normal(10)
    ser = Series(values, index=float_labels)
    _check_roundtrip(ser, tm.assert_series_equal, path=temp_h5_path)
|
||||
|
||||
|
||||
def test_tuple_index(temp_h5_path, performance_warning):
    """Round-trip a frame indexed by tuples, expecting ``performance_warning``."""
    # GH #492
    columns = np.arange(10)
    tuple_labels = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)]
    values = np.random.default_rng(2).standard_normal(30).reshape((3, 10))
    frame = DataFrame(values, index=tuple_labels, columns=columns)

    with tm.assert_produces_warning(performance_warning):
        _check_roundtrip(frame, tm.assert_frame_equal, path=temp_h5_path)
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
def test_index_types(temp_h5_path):
    """Round-trip two-element Series over a range of index label types.

    Each Series is compared with ``check_index_type=True``. The original
    listed four of the label combinations twice; each is exercised once here.
    """
    values = np.random.default_rng(2).standard_normal(2)

    def compare(lhs, rhs):
        """Compare two Series, including the type of their index."""
        tm.assert_series_equal(lhs, rhs, check_index_type=True)

    label_sets = [
        [0, "y"],
        [datetime.datetime.today(), 0],
        ["y", 0],
        [datetime.date.today(), "a"],
        [1.23, "b"],
        [1, 1.53],
        [1, 5],
    ]
    for labels in label_sets:
        ser = Series(values, labels)
        _check_roundtrip(ser, compare, path=temp_h5_path)

    dti = DatetimeIndex(["2012-01-01", "2012-01-02"], dtype="M8[ns]")
    ser = Series(values, index=dti)
    _check_roundtrip(ser, compare, path=temp_h5_path)

    # same index converted to second resolution
    ser.index = ser.index.as_unit("s")
    _check_roundtrip(ser, compare, path=temp_h5_path)
|
||||
|
||||
|
||||
def test_timeseries_preepoch(temp_h5_path, request):
    """Round-trip a business-day Series whose index lies before 1970.

    An ``OverflowError`` raised by the round-trip is always re-raised; on
    Windows the test is first marked as an expected failure.
    """
    dr = bdate_range("1/1/1940", "1/1/1960")
    ts = Series(np.random.default_rng(2).standard_normal(len(dr)), index=dr)
    try:
        _check_roundtrip(ts, tm.assert_series_equal, path=temp_h5_path)
    except OverflowError:
        if is_platform_windows():
            # The message has to be passed as ``reason``: the first positional
            # argument of ``xfail`` is the *condition*, and a string condition
            # is evaluated as a Python expression.
            request.applymarker(
                pytest.mark.xfail(reason="known failure on some windows platforms")
            )
        raise
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "compression", [False, pytest.param(True, marks=td.skip_if_windows)]
)
def test_frame(compression, temp_h5_path):
    """Round-trip float DataFrames through an HDF5 file.

    Covers a frame containing NaNs (table and default format), a frame with a
    business-day DatetimeIndex, a frame with a freshly added column, and an
    empty frame.
    """
    shared = {"path": temp_h5_path, "compression": compression}

    df = DataFrame(
        1.1 * np.arange(120).reshape((30, 4)),
        columns=Index(list("ABCD")),
        index=Index([f"i-{i}" for i in range(30)]),
    )

    # put in some random NAs
    for row, col in ((0, 0), (5, 3)):
        df.iloc[row, col] = np.nan

    for roundtrip in (_check_roundtrip_table, _check_roundtrip):
        roundtrip(df, tm.assert_frame_equal, **shared)

    tdf = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD")),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )
    _check_roundtrip(tdf, tm.assert_frame_equal, **shared)

    with HDFStore(temp_h5_path) as store:
        # not consolidated
        df["foo"] = np.random.default_rng(2).standard_normal(len(df))
        store["df"] = df
        assert store["df"]._mgr.is_consolidated()

    # empty
    empty = df[:0]
    # Prevent the empty frame from having index with inferred_type as string
    empty.index = Index([])
    _check_roundtrip(empty[:0], tm.assert_frame_equal, path=temp_h5_path)
|
||||
|
||||
|
||||
def test_empty_series_frame(temp_h5_path):
    """Round-trip empty Series and empty / value-less DataFrames."""
    empty_series = (
        Series(dtype=object),
        Series(name="myseries", dtype=object),
    )
    empty_frames = (
        DataFrame(),
        DataFrame(index=["a", "b", "c"]),
        DataFrame(columns=["d", "e", "f"]),
    )

    for ser in empty_series:
        _check_roundtrip(ser, tm.assert_series_equal, path=temp_h5_path)
    for frame in empty_frames:
        _check_roundtrip(frame, tm.assert_frame_equal, path=temp_h5_path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"])
def test_empty_series(dtype, temp_h5_path):
    """Round-trip an empty Series of the parametrized dtype."""
    _check_roundtrip(Series(dtype=dtype), tm.assert_series_equal, path=temp_h5_path)
|
||||
|
||||
|
||||
def test_can_serialize_dates(temp_h5_path):
    """Round-trip a DataFrame indexed by ``datetime.date`` objects."""
    dates = [stamp.date() for stamp in bdate_range("1/1/2000", "1/30/2000")]
    values = np.random.default_rng(2).standard_normal((len(dates), 4))

    _check_roundtrip(
        DataFrame(values, index=dates), tm.assert_frame_equal, path=temp_h5_path
    )
|
||||
|
||||
|
||||
def test_store_hierarchical(
    temp_h5_path, using_infer_string, multiindex_dataframe_random_data
):
    """Round-trip the MultiIndex fixture frame, its transpose and one column."""
    frame = multiindex_dataframe_random_data

    for obj, comparator in (
        (frame, tm.assert_frame_equal),
        (frame.T, tm.assert_frame_equal),
        (frame["A"], tm.assert_series_equal),
    ):
        _check_roundtrip(obj, comparator, path=temp_h5_path)

    # check that the names are stored
    with HDFStore(temp_h5_path) as store:
        store["frame"] = frame
        tm.assert_frame_equal(store["frame"], frame)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "compression", [False, pytest.param(True, marks=td.skip_if_windows)]
)
def test_store_mixed(compression, temp_h5_path):
    """Round-trip mixed-dtype frames and Series taken from their columns."""

    def _build_mixed_frame():
        # Float base frame plus string, boolean and integer columns.
        frame = DataFrame(
            1.1 * np.arange(120).reshape((30, 4)),
            columns=Index(list("ABCD")),
            index=Index([f"i-{i}" for i in range(30)]),
        )
        extra_columns = {
            "obj1": "foo",
            "obj2": "bar",
            "bool1": frame["A"] > 0,
            "bool2": frame["B"] > 0,
            "int1": 1,
            "int2": 2,
        }
        for label, column in extra_columns.items():
            frame[label] = column
        return frame._consolidate()

    df1 = _build_mixed_frame()
    df2 = _build_mixed_frame()

    for frame in (df1, df2):
        _check_roundtrip(frame, tm.assert_frame_equal, path=temp_h5_path)

    # Overwriting the same key must return the most recent frame.
    with HDFStore(temp_h5_path) as store:
        for frame in (df1, df2):
            store["obj"] = frame
            tm.assert_frame_equal(store["obj"], frame)

    # check that can store Series of all of these types
    for label in ("obj1", "bool1", "int1"):
        _check_roundtrip(
            df1[label],
            tm.assert_series_equal,
            path=temp_h5_path,
            compression=compression,
        )
|
||||
|
||||
|
||||
def _check_roundtrip(obj, comparator, path, compression=False, **kwargs):
    """Write ``obj`` to a fresh HDF5 file, read it back and compare.

    Parameters
    ----------
    obj : object
        Object stored under the key ``"obj"``.
    comparator : callable
        Called as ``comparator(retrieved, obj, **kwargs)``.
    path : path-like
        File opened in ``"w"`` mode.
    compression : bool, default False
        When truthy the store is opened with ``complib="blosc"``.
    **kwargs
        Forwarded to ``comparator``.
    """
    store_options = {"complib": "blosc"} if compression else {}

    with HDFStore(path, "w", **store_options) as store:
        store["obj"] = obj
        comparator(store["obj"], obj, **kwargs)
|
||||
|
||||
|
||||
def _check_roundtrip_table(obj, comparator, path, compression=False, **kwargs):
    """Write ``obj`` in table format to a fresh HDF5 file and compare.

    Parameters
    ----------
    obj : object
        Object stored under the key ``"obj"`` with ``format="table"``.
    comparator : callable
        Called as ``comparator(retrieved, obj, **kwargs)``.
    path : path-like
        File opened in ``"w"`` mode.
    compression : bool, default False
        When truthy the store is opened with ``complib="blosc"``.
    **kwargs
        Forwarded to ``comparator``; accepted for parity with
        ``_check_roundtrip``.
    """
    options = {}
    if compression:
        options["complib"] = "blosc"

    with HDFStore(path, "w", **options) as store:
        store.put("obj", obj, format="table")
        retrieved = store["obj"]

        comparator(retrieved, obj, **kwargs)
|
||||
|
||||
|
||||
def test_unicode_index(temp_h5_path):
    """Round-trip a Series whose index labels are non-ASCII strings."""
    unicode_values = ["\u03c3", "\u03c3\u03c3"]
    data = np.random.default_rng(2).standard_normal(len(unicode_values))

    _check_roundtrip(
        Series(data, unicode_values), tm.assert_series_equal, path=temp_h5_path
    )
|
||||
|
||||
|
||||
def test_unicode_longer_encoded(temp_hdfstore):
    """Store table-format frames containing a non-ASCII character as UTF-8."""
    # GH 11234
    char = "\u0394"

    single = DataFrame({"A": [char]})
    temp_hdfstore.put("df", single, format="table", encoding="utf-8")
    tm.assert_frame_equal(temp_hdfstore.get("df"), single)

    # Replace the stored table with one mixing ASCII and non-ASCII values.
    mixed = DataFrame({"A": ["a", char], "B": ["b", "b"]})
    temp_hdfstore.remove("df")
    temp_hdfstore.put("df", mixed, format="table", encoding="utf-8")
    tm.assert_frame_equal(temp_hdfstore.get("df"), mixed)
|
||||
|
||||
|
||||
def test_store_datetime_mixed(temp_h5_path):
    """Round-trip a frame with int, float, string and datetime columns."""
    frame = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
    daily = Series(
        np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
    )
    # Only the first three timestamps of the daily index are used.
    frame["d"] = daily.index[:3]

    _check_roundtrip(frame, tm.assert_frame_equal, path=temp_h5_path)
|
||||
|
||||
|
||||
def test_round_trip_equals(temp_h5_path):
    """A frame read back from table format must ``equals`` the original."""
    # GH 9330
    original = DataFrame({"B": [1, 2], "A": ["x", "y"]})

    original.to_hdf(temp_h5_path, key="df", format="table")
    loaded = read_hdf(temp_h5_path, "df")

    tm.assert_frame_equal(original, loaded)
    # ``equals`` is checked in both directions.
    for left, right in ((original, loaded), (loaded, original)):
        assert left.equals(right)
|
||||
|
||||
|
||||
def test_infer_string_columns(temp_h5_path):
    """Round-trip a table-format frame with ``future.infer_string`` enabled."""
    # GH#
    pytest.importorskip("pyarrow")
    with pd.option_context("future.infer_string", True):
        flat = DataFrame(1, columns=list("ABCD"), index=list(range(10)))
        df = flat.set_index(["A", "B"])
        expected = df.copy()
        df.to_hdf(temp_h5_path, key="df", format="table")

        tm.assert_frame_equal(read_hdf(temp_h5_path, "df"), expected)
|
||||
1035
venv/Lib/site-packages/pandas/tests/io/pytables/test_select.py
Normal file
1035
venv/Lib/site-packages/pandas/tests/io/pytables/test_select.py
Normal file
File diff suppressed because it is too large
Load Diff
1097
venv/Lib/site-packages/pandas/tests/io/pytables/test_store.py
Normal file
1097
venv/Lib/site-packages/pandas/tests/io/pytables/test_store.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,48 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
Series,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.pytables import (
|
||||
HDFStore,
|
||||
read_hdf,
|
||||
)
|
||||
|
||||
pytest.importorskip("tables")
|
||||
|
||||
|
||||
class TestHDFStoreSubclass:
    """HDF5 writing must accept DataFrame / Series subclasses (GH 33748)."""

    # GH 33748
    def test_supported_for_subclass_dataframe(self, temp_h5_path):
        """Write a ``SubclassedDataFrame`` and read back a plain DataFrame."""
        data = {"a": [1, 2], "b": [3, 4]}
        subclassed = tm.SubclassedDataFrame(data, dtype=np.intp)
        expected = DataFrame(data, dtype=np.intp)

        # Path 1: the object's own ``to_hdf`` method.
        subclassed.to_hdf(temp_h5_path, key="df")
        tm.assert_frame_equal(read_hdf(temp_h5_path, "df"), expected)

        # Path 2: ``HDFStore.put`` on an explicitly opened store.
        with HDFStore(temp_h5_path) as store:
            store.put("df", subclassed)
        tm.assert_frame_equal(read_hdf(temp_h5_path, "df"), expected)

    def test_supported_for_subclass_series(self, temp_h5_path):
        """Write a ``SubclassedSeries`` and read back a plain Series."""
        data = [1, 2, 3]
        subclassed = tm.SubclassedSeries(data, dtype=np.intp)
        expected = Series(data, dtype=np.intp)

        # Path 1: the object's own ``to_hdf`` method.
        subclassed.to_hdf(temp_h5_path, key="ser")
        tm.assert_series_equal(read_hdf(temp_h5_path, "ser"), expected)

        # Path 2: ``HDFStore.put`` on an explicitly opened store.
        with HDFStore(temp_h5_path) as store:
            store.put("ser", subclassed)
        tm.assert_series_equal(read_hdf(temp_h5_path, "ser"), expected)
|
||||
@@ -0,0 +1,66 @@
|
||||
import datetime
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Series,
|
||||
_testing as tm,
|
||||
date_range,
|
||||
period_range,
|
||||
)
|
||||
|
||||
pytestmark = pytest.mark.single_cpu
|
||||
|
||||
|
||||
@pytest.mark.parametrize("unit", ["us", "ns"])
def test_store_datetime_fractional_secs(temp_hdfstore, unit):
    """Microsecond precision of an index timestamp survives storage."""
    moment = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
    index = DatetimeIndex([moment], dtype=f"M8[{unit}]")

    temp_hdfstore["a"] = Series([0], index=index)

    assert temp_hdfstore["a"].index[0] == moment
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_tseries_indices_series(temp_hdfstore):
    """Series with datetime and period indexes keep values, freq and class."""
    indexes = (
        date_range("2020-01-01", periods=10),
        period_range("2020-01-01", periods=10, freq="D"),
    )
    for index in indexes:
        expected = Series(np.random.default_rng(2).standard_normal(len(index)), index)
        temp_hdfstore["a"] = expected
        stored = temp_hdfstore["a"]

        tm.assert_series_equal(stored, expected)
        assert stored.index.freq == expected.index.freq
        tm.assert_class_equal(stored.index, expected.index, obj="series index")
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_tseries_indices_frame(temp_hdfstore):
    """Frames with datetime and period indexes keep values, freq and class."""
    indexes = (
        date_range("2020-01-01", periods=10),
        period_range("2020-01-01", periods=10, freq="D"),
    )
    for index in indexes:
        values = np.random.default_rng(2).standard_normal((len(index), 3))
        expected = DataFrame(values, index=index)
        temp_hdfstore["a"] = expected
        stored = temp_hdfstore["a"]

        tm.assert_frame_equal(stored, expected)
        assert stored.index.freq == expected.index.freq
        tm.assert_class_equal(stored.index, expected.index, obj="dataframe index")
|
||||
@@ -0,0 +1,334 @@
|
||||
from datetime import (
|
||||
date,
|
||||
timedelta,
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from pandas._libs.tslibs.timezones import maybe_get_tz
|
||||
import pandas.util._test_decorators as td
|
||||
|
||||
import pandas as pd
|
||||
from pandas import (
|
||||
DataFrame,
|
||||
DatetimeIndex,
|
||||
Series,
|
||||
Timestamp,
|
||||
date_range,
|
||||
)
|
||||
import pandas._testing as tm
|
||||
|
||||
|
||||
def _compare_with_tz(a, b):
|
||||
tm.assert_frame_equal(a, b)
|
||||
|
||||
# compare the zones on each element
|
||||
for c in a.columns:
|
||||
for i in a.index:
|
||||
a_e = a.loc[i, c]
|
||||
b_e = b.loc[i, c]
|
||||
if not (a_e == b_e and a_e.tz == b_e.tz):
|
||||
raise AssertionError(f"invalid tz comparison [{a_e}] [{b_e}]")
|
||||
|
||||
|
||||
# use maybe_get_tz instead of dateutil.tz.gettz to handle the windows
# filename issues.
def gettz_dateutil(x):
    """Resolve zone name ``x`` through ``maybe_get_tz`` with a ``dateutil/`` prefix."""
    return maybe_get_tz("dateutil/" + x)


def gettz_pytz(x):
    """Return the zone name ``x`` unchanged."""
    return x
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning"
)
@pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz])
def test_append_with_timezones(temp_hdfstore, gettz):
    """Append frames with tz-aware columns and check tz conflicts are rejected."""
    # as columns

    def _stamp(text, zone):
        # Nanosecond-resolution timestamp localized through ``gettz``.
        return Timestamp(text, tz=gettz(zone)).as_unit("ns")

    def _two_column_frame(first, second):
        # Five identical rows with scalar columns "A" and "B".
        return DataFrame({"A": first, "B": second}, index=range(5))

    # Single-tzinfo, no DST transition
    # NOTE: the local name ``df_est`` is referenced by the ``where`` string
    # further down, so it must keep this name.
    df_est = DataFrame(
        {
            "A": [
                _stamp("20130102 2:00:00", "US/Eastern") + timedelta(hours=1) * i
                for i in range(5)
            ]
        }
    )

    # frame with all columns having same tzinfo, but different sides
    # of DST transition
    df_crosses_dst = _two_column_frame(
        _stamp("20130102", "US/Eastern"), _stamp("20130603", "US/Eastern")
    )
    df_mixed_tz = _two_column_frame(
        _stamp("20130102", "US/Eastern"), _stamp("20130102", "EET")
    )
    df_different_tz = _two_column_frame(
        _stamp("20130102", "US/Eastern"), _stamp("20130102", "CET")
    )

    temp_hdfstore.append("df_tz", df_est, data_columns=["A"])
    stored = temp_hdfstore["df_tz"]
    _compare_with_tz(stored, df_est)
    tm.assert_frame_equal(stored, df_est)

    # select with tz aware
    expected = df_est[df_est.A >= df_est.A[3]]
    selected = temp_hdfstore.select("df_tz", where="A>=df_est.A[3]")
    _compare_with_tz(selected, expected)

    # ensure we include dates in DST and STD time here.
    temp_hdfstore.remove("df_tz")
    temp_hdfstore.append("df_tz", df_crosses_dst)
    stored = temp_hdfstore["df_tz"]
    _compare_with_tz(stored, df_crosses_dst)
    tm.assert_frame_equal(stored, df_crosses_dst)

    block_conflict = (
        r"invalid info for \[values_block_1\] for \[tz\], "
        r"existing_value \[(dateutil/.*)?(US/Eastern|America/New_York)\] "
        r"conflicts with new value \[(dateutil/.*)?EET\]"
    )
    with pytest.raises(ValueError, match=block_conflict):
        temp_hdfstore.append("df_tz", df_mixed_tz)

    # this is ok
    temp_hdfstore.remove("df_tz")
    temp_hdfstore.append("df_tz", df_mixed_tz, data_columns=["A", "B"])
    stored = temp_hdfstore["df_tz"]
    _compare_with_tz(stored, df_mixed_tz)
    tm.assert_frame_equal(stored, df_mixed_tz)

    # can't append with diff timezone
    column_conflict = (
        r"invalid info for \[B\] for \[tz\], "
        r"existing_value \[(dateutil/.*)?EET\] "
        r"conflicts with new value \[(dateutil/.*)?CET\]"
    )
    with pytest.raises(ValueError, match=column_conflict):
        temp_hdfstore.append("df_tz", df_different_tz)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("gettz", [gettz_dateutil, gettz_pytz])
def test_append_with_timezones_as_index(temp_hdfstore, gettz):
    """A frame with a tz-aware index survives both ``put`` and ``append``."""
    # GH#4098 example

    hourly = date_range("2000-1-1", periods=3, freq="h", tz=gettz("US/Eastern"))
    hourly = hourly._with_freq(None)  # freq doesn't round-trip

    df = DataFrame({"A": Series(range(3), index=hourly)})

    temp_hdfstore.put("df", df)
    tm.assert_frame_equal(temp_hdfstore.select("df"), df)

    temp_hdfstore.remove("df")
    temp_hdfstore.append("df", df)
    tm.assert_frame_equal(temp_hdfstore.select("df"), df)
|
||||
|
||||
|
||||
def test_roundtrip_tz_aware_index(temp_hdfstore, unit):
    """A fixed-format frame keeps its tz-aware index and the underlying value."""
    # GH 17618
    stamp = Timestamp("2000-01-01 01:00:00", tz="US/Eastern")
    df = DataFrame(data=[0], index=DatetimeIndex([stamp]).as_unit(unit))

    temp_hdfstore.put("frame", df, format="fixed")
    recons = temp_hdfstore["frame"]
    tm.assert_frame_equal(recons, df)

    # Divisor that scales the nanosecond value down to ``unit``.
    nanos_per_unit = {"ns": 1, "us": 1000, "ms": 10**6, "s": 10**9}
    assert recons.index[0]._value == 946706400000000000 // nanos_per_unit[unit]
|
||||
|
||||
|
||||
def test_store_index_name_with_tz(temp_hdfstore):
    """A named, UTC-localized index round-trips in table format."""
    # GH 13884
    index = DatetimeIndex([1234567890123456787, 1234567890123456788])
    index = index.tz_localize("UTC")

    df = DataFrame({"A": [1, 2]})
    df.index = index
    df.index.name = "foo"

    temp_hdfstore.put("frame", df, format="table")
    tm.assert_frame_equal(temp_hdfstore["frame"], df)
|
||||
|
||||
|
||||
def test_tseries_select_index_column(temp_hdfstore):
    """``select_column`` on the index preserves the stored timezone."""
    # GH7777
    # selecting a UTC datetimeindex column did
    # not preserve UTC tzinfo set before storing

    def _random_frame(index):
        values = np.random.default_rng(2).standard_normal((len(index), 4))
        return DataFrame(values, index=index)

    # check that no tz still works
    naive = date_range("1/1/2000", "1/30/2000")
    temp_hdfstore.append("frame", _random_frame(naive))
    column = temp_hdfstore.select_column("frame", "index")
    assert naive.tz == DatetimeIndex(column.values).tz

    # check utc, then double check non-utc
    for zone in ("UTC", "US/Eastern"):
        aware = date_range("1/1/2000", "1/30/2000", tz=zone)
        frame = _random_frame(aware)

        temp_hdfstore.remove("frame")
        temp_hdfstore.append("frame", frame)
        column = temp_hdfstore.select_column("frame", "index")
        assert aware.tz == column.dt.tz
|
||||
|
||||
|
||||
def test_timezones_fixed_format_frame_non_empty(temp_hdfstore):
    """A non-empty frame with a tz-aware index round-trips."""
    # index
    index = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
    index = index._with_freq(None)  # freq doesn't round-trip
    values = np.random.default_rng(2).standard_normal((len(index), 4))
    df = DataFrame(values, index=index)

    temp_hdfstore["df"] = df
    tm.assert_frame_equal(temp_hdfstore["df"], df)
|
||||
|
||||
|
||||
def test_timezones_fixed_format_frame_non_empty_as_data(temp_hdfstore):
    """Tz-aware, tz-naive and integer columns round-trip together."""
    # GH11411
    eastern = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
    eastern = eastern._with_freq(None)  # freq doesn't round-trip

    columns = {
        "A": eastern,
        "B": eastern.tz_convert("UTC").tz_localize(None),
        "C": eastern.tz_convert("CET"),
        "D": range(len(eastern)),
    }
    df = DataFrame(columns, index=eastern)

    temp_hdfstore["df"] = df
    tm.assert_frame_equal(temp_hdfstore["df"], df)
|
||||
|
||||
|
||||
def test_timezones_fixed_format_empty(temp_hdfstore, tz_aware_fixture, frame_or_series):
    """An empty tz-aware Series, or its one-column frame, round-trips."""
    # GH 20594

    empty = Series(dtype=pd.DatetimeTZDtype(tz=tz_aware_fixture), name="A")
    obj = empty.to_frame() if frame_or_series is DataFrame else empty

    temp_hdfstore["obj"] = obj
    tm.assert_equal(temp_hdfstore["obj"], obj)
|
||||
|
||||
|
||||
def test_timezones_fixed_format_series_nonempty(temp_hdfstore, tz_aware_fixture):
    """A one-element tz-aware Series round-trips."""
    # GH 20594

    expected = Series([0], dtype=pd.DatetimeTZDtype(tz=tz_aware_fixture))

    temp_hdfstore["s"] = expected
    tm.assert_series_equal(temp_hdfstore["s"], expected)
|
||||
|
||||
|
||||
def test_fixed_offset_tz(temp_hdfstore):
    """An index with a fixed ``-07:00`` offset keeps its values and tz."""
    rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00")
    values = np.random.default_rng(2).standard_normal((len(rng), 4))

    temp_hdfstore["frame"] = DataFrame(values, index=rng)
    stored_index = temp_hdfstore["frame"].index

    tm.assert_index_equal(stored_index, rng)
    assert rng.tz == stored_index.tz
|
||||
|
||||
|
||||
@td.skip_if_windows
def test_store_timezone(temp_hdfstore):
    """``datetime.date`` index values do not shift when the process tz changes."""
    # GH2852
    # issue storing datetime.date with a timezone as it resets when read
    # back in a new timezone

    # original method
    day = date(2013, 9, 10)
    plain = DataFrame([1, 2, 3], index=[day] * 3)
    temp_hdfstore["obj1"] = plain
    tm.assert_frame_equal(temp_hdfstore["obj1"], plain)

    # with tz setting: write under one timezone ...
    with tm.set_timezone("EST5EDT"):
        day = date(2013, 9, 10)
        written = DataFrame([1, 2, 3], index=[day] * 3)
        temp_hdfstore["obj2"] = written

    # ... and read back under another.
    with tm.set_timezone("CST6CDT"):
        read_back = temp_hdfstore["obj2"]

    tm.assert_frame_equal(read_back, written)
|
||||
|
||||
|
||||
def test_dst_transitions(temp_hdfstore):
    """Appending and selecting works for times spanning a DST transition."""
    # make sure we are not failing on transitions
    times = date_range(
        "2013-10-26 23:00",
        "2013-10-27 01:00",
        tz="Europe/London",
        freq="h",
        ambiguous="infer",
    )
    times = times._with_freq(None)  # freq doesn't round-trip
    shifted = times + pd.Timedelta("10min")

    for index in (times, shifted):
        df = DataFrame({"A": range(len(index)), "B": index}, index=index)
        temp_hdfstore.append("df", df)
        tm.assert_frame_equal(temp_hdfstore.select("df"), df)
        temp_hdfstore.remove("df")
|
||||
|
||||
|
||||
@pytest.mark.filterwarnings(
    "ignore:`alltrue` is deprecated as of NumPy 1.25.0:DeprecationWarning"
)
def test_read_with_where_tz_aware_index(temp_hdfstore):
    """``read_hdf`` with a ``where`` on a tz-aware MultiIndex level returns all rows."""
    # GH 11926
    n_rows = 10
    dates = date_range("20151201", periods=n_rows, freq="D", tz="UTC", unit="ns")
    index = pd.MultiIndex.from_arrays([dates, range(n_rows)], names=["DATE", "NO"])
    expected = DataFrame({"MYCOL": 0}, index=index)

    key = "mykey"
    # NOTE(review): ``temp_hdfstore`` is handed to ``pd.HDFStore`` where a path
    # is normally given; presumably this relies on the fixture being
    # path-like. Confirm against the fixture definition.
    with pd.HDFStore(temp_hdfstore) as store:
        store.append(key, expected, format="table", append=True)
    result = pd.read_hdf(temp_hdfstore, key, where="DATE > 20151130")
    tm.assert_frame_equal(result, expected)
|
||||
Reference in New Issue
Block a user