""" parquet compat """

from typing import Any, AnyStr, Dict, List, Optional
from warnings import catch_warnings

from pandas._typing import FilePathOrBuffer
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError

from pandas import DataFrame, get_option

from pandas.io.common import _expand_user, get_filepath_or_buffer, is_fsspec_url


def get_engine(engine: str) -> "BaseImpl":
    """ return our implementation """
    if engine == "auto":
        engine = get_option("io.parquet.engine")

    if engine == "auto":
        # try engines in this order
        engine_classes = [PyArrowImpl, FastParquetImpl]

        error_msgs = ""
        for engine_class in engine_classes:
            try:
                return engine_class()
            except ImportError as err:
                error_msgs += "\n - " + str(err)

        raise ImportError(
            "Unable to find a usable engine; "
            "tried using: 'pyarrow', 'fastparquet'.\n"
            "A suitable version of "
            "pyarrow or fastparquet is required for parquet "
            "support.\n"
            "Trying to import the above resulted in these errors:"
            f"{error_msgs}"
        )

    if engine == "pyarrow":
        return PyArrowImpl()
    elif engine == "fastparquet":
        return FastParquetImpl()

    raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

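# Comment-only usage sketch (not executed): with engine="auto", get_engine()
# consults the "io.parquet.engine" option read above; pinning the option to
# "fastparquet" here is an assumption for illustration.
#
#   import pandas as pd
#   pd.set_option("io.parquet.engine", "fastparquet")
#   impl = get_engine("auto")  # FastParquetImpl, if fastparquet is installed

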
class BaseImpl:
    @staticmethod
    def validate_dataframe(df: DataFrame):

        if not isinstance(df, DataFrame):
            raise ValueError("to_parquet only supports IO with DataFrames")

        # must have value column names (strings only)
        if df.columns.inferred_type not in {"string", "empty"}:
            raise ValueError("parquet must have string column names")

        # index level names must be strings
        valid_names = all(
            isinstance(name, str) for name in df.index.names if name is not None
        )
        if not valid_names:
            raise ValueError("Index level names must be strings")

    def write(self, df: DataFrame, path, compression, **kwargs):
        raise AbstractMethodError(self)

    def read(self, path, columns=None, **kwargs):
        raise AbstractMethodError(self)

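# Comment-only sketch of what validate_dataframe() accepts and rejects; the
# example frames are assumptions for illustration.
#
#   import pandas as pd
#   BaseImpl.validate_dataframe(pd.DataFrame({"a": [1]}))  # passes
#   BaseImpl.validate_dataframe(pd.DataFrame({0: [1]}))    # ValueError: string column names

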
class PyArrowImpl(BaseImpl):
    def __init__(self):
        import_optional_dependency(
            "pyarrow", extra="pyarrow is required for parquet support."
        )
        import pyarrow.parquet

        # import utils to register the pyarrow extension types
        import pandas.core.arrays._arrow_utils  # noqa

        self.api = pyarrow

    def write(
        self,
        df: DataFrame,
        path: FilePathOrBuffer[AnyStr],
        compression: Optional[str] = "snappy",
        index: Optional[bool] = None,
        partition_cols: Optional[List[str]] = None,
        **kwargs,
    ):
        self.validate_dataframe(df)

        from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
        if index is not None:
            from_pandas_kwargs["preserve_index"] = index

        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)

        if is_fsspec_url(path) and "filesystem" not in kwargs:
            # make fsspec instance, which pyarrow will use to open paths
            import_optional_dependency("fsspec")
            import fsspec.core

            fs, path = fsspec.core.url_to_fs(path)
            kwargs["filesystem"] = fs
        else:
            path = _expand_user(path)
        if partition_cols is not None:
            # writes to multiple files under the given path
            self.api.parquet.write_to_dataset(
                table,
                path,
                compression=compression,
                partition_cols=partition_cols,
                **kwargs,
            )
        else:
            # write to single output file
            self.api.parquet.write_table(table, path, compression=compression, **kwargs)

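    # Comment-only sketch: a "schema" keyword passed through **kwargs is routed
    # to pyarrow.Table.from_pandas above, not to the writer. The frame and file
    # name below are assumptions for illustration.
    #
    #   import pyarrow as pa
    #   PyArrowImpl().write(df, "out.parquet", schema=pa.schema([("a", pa.int64())]))
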
    def read(self, path, columns=None, **kwargs):
        if is_fsspec_url(path) and "filesystem" not in kwargs:
            import_optional_dependency("fsspec")
            import fsspec.core

            fs, path = fsspec.core.url_to_fs(path)
            should_close = False
        else:
            fs = kwargs.pop("filesystem", None)
            should_close = False
            path = _expand_user(path)

        if not fs:
            path, _, _, should_close = get_filepath_or_buffer(path)

        kwargs["use_pandas_metadata"] = True
        result = self.api.parquet.read_table(
            path, columns=columns, filesystem=fs, **kwargs
        ).to_pandas()
        if should_close:
            path.close()

        return result

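    # Comment-only sketch: for an fsspec-style URL, read() builds the
    # filesystem itself via fsspec.core.url_to_fs() above, so no explicit
    # "filesystem" keyword is needed; the bucket, key, and column name are
    # assumptions for illustration.
    #
    #   PyArrowImpl().read("s3://bucket/key.parquet", columns=["a"])

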
class FastParquetImpl(BaseImpl):
    def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use
        fastparquet = import_optional_dependency(
            "fastparquet", extra="fastparquet is required for parquet support."
        )
        self.api = fastparquet

    def write(
        self,
        df: DataFrame,
        path,
        compression="snappy",
        index=None,
        partition_cols=None,
        **kwargs,
    ):
        self.validate_dataframe(df)
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.

        if "partition_on" in kwargs and partition_cols is not None:
            raise ValueError(
                "Cannot use both partition_on and "
                "partition_cols. Use partition_cols for partitioning data"
            )
        elif "partition_on" in kwargs:
            partition_cols = kwargs.pop("partition_on")

        if partition_cols is not None:
            kwargs["file_scheme"] = "hive"

        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
            kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
        else:
            path, _, _, _ = get_filepath_or_buffer(path)

        with catch_warnings(record=True):
            self.api.write(
                path,
                df,
                compression=compression,
                write_index=index,
                partition_on=partition_cols,
                **kwargs,
            )

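    # Comment-only sketch of the partition handling above: fastparquet's own
    # "partition_on" keyword is accepted but may not be combined with pandas'
    # "partition_cols" spelling. The frame and directory name are assumptions.
    #
    #   FastParquetImpl().write(df, "out_dir", partition_cols=["year"])  # ok
    #   FastParquetImpl().write(
    #       df, "out_dir", partition_cols=["year"], partition_on=["year"]
    #   )  # ValueError
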
    def read(self, path, columns=None, **kwargs):
        if is_fsspec_url(path):
            fsspec = import_optional_dependency("fsspec")

            open_with = lambda path, _: fsspec.open(path, "rb").open()
            parquet_file = self.api.ParquetFile(path, open_with=open_with)
        else:
            path, _, _, _ = get_filepath_or_buffer(path)
            parquet_file = self.api.ParquetFile(path)

        return parquet_file.to_pandas(columns=columns, **kwargs)


def to_parquet(
    df: DataFrame,
    path: FilePathOrBuffer[AnyStr],
    engine: str = "auto",
    compression: Optional[str] = "snappy",
    index: Optional[bool] = None,
    partition_cols: Optional[List[str]] = None,
    **kwargs,
):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : str or file-like object
        If a string, it will be used as Root Directory path
        when writing a partitioned dataset. By file-like object,
        we refer to objects with a write() method, such as a file handle
        (e.g. via builtin open function) or io.BytesIO. The engine
        fastparquet does not accept file-like objects.

        .. versionchanged:: 0.24.0

    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
        Name of the compression to use. Use ``None`` for no compression.
    index : bool, default None
        If ``True``, include the dataframe's index(es) in the file output. If
        ``False``, they will not be written to the file.
        If ``None``, similar to ``True`` the dataframe's index(es)
        will be saved. However, instead of being saved as values,
        the RangeIndex will be stored as a range in the metadata so it
        doesn't require much space and is faster. Other indexes will
        be included as columns in the file output.

        .. versionadded:: 0.24.0

    partition_cols : str or list, optional, default None
        Column names by which to partition the dataset.
        Columns are partitioned in the order they are given.
        Must be None if path is not a string.

        .. versionadded:: 0.24.0

    kwargs
        Additional keyword arguments passed to the engine.
    """
    if isinstance(partition_cols, str):
        partition_cols = [partition_cols]
    impl = get_engine(engine)
    return impl.write(
        df,
        path,
        compression=compression,
        index=index,
        partition_cols=partition_cols,
        **kwargs,
    )


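# Comment-only sketch of a typical call; the frame, file name, and partition
# column are assumptions for illustration.
#
#   to_parquet(df, "example.parquet", compression="gzip", index=False)
#   to_parquet(df, "example_dataset", partition_cols=["year"])  # writes a directory

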
def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.parquet``.
        A file URL can also be a path to a directory that contains multiple
        partitioned parquet files. Both pyarrow and fastparquet support
        paths to directories as well as file URLs. A directory path could be:
        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
        Parquet library to use. If 'auto', then the option
        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
        behavior is to try 'pyarrow', falling back to 'fastparquet' if
        'pyarrow' is unavailable.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    **kwargs
        Any additional kwargs are passed to the engine.

    Returns
    -------
    DataFrame
    """
    impl = get_engine(engine)
    return impl.read(path, columns=columns, **kwargs)
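

# Comment-only sketch; the file name and column list are assumptions.
#
#   df = read_parquet("example.parquet", columns=["a", "b"])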