2021-01-30 22:29:33 +01:00
|
|
|
from abc import ABCMeta, abstractmethod
|
2021-01-09 15:20:56 +01:00
|
|
|
import sys
|
2021-01-30 22:29:33 +01:00
|
|
|
from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
from pandas._config import get_option
|
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
from pandas._typing import Dtype, FrameOrSeries
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
from pandas.core.indexes.api import Index
|
|
|
|
|
|
|
|
from pandas.io.formats import format as fmt
|
|
|
|
from pandas.io.formats.printing import pprint_thing
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
2021-01-30 22:29:33 +01:00
|
|
|
from pandas.core.series import Series # noqa: F401
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
|
|
|
|
def _put_str(s: Union[str, Dtype], space: int) -> str:
|
|
|
|
"""
|
|
|
|
Make string of specified length, padding to the right if necessary.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
s : Union[str, Dtype]
|
|
|
|
String to be formatted.
|
|
|
|
space : int
|
|
|
|
Length to force string to be of.
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
str
|
|
|
|
String coerced to given length.
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
|
|
|
>>> pd.io.formats.info._put_str("panda", 6)
|
|
|
|
'panda '
|
|
|
|
>>> pd.io.formats.info._put_str("panda", 4)
|
|
|
|
'pand'
|
|
|
|
"""
|
|
|
|
return str(s)[:space].ljust(space)
|
|
|
|
|
|
|
|
|
|
|
|
def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str:
|
|
|
|
"""
|
|
|
|
Return size in human readable format.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
num : int
|
|
|
|
Size in bytes.
|
|
|
|
size_qualifier : str
|
|
|
|
Either empty, or '+' (if lower bound).
|
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
str
|
|
|
|
Size in human readable format.
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
|
|
|
>>> _sizeof_fmt(23028, '')
|
|
|
|
'22.5 KB'
|
|
|
|
|
|
|
|
>>> _sizeof_fmt(23028, '+')
|
|
|
|
'22.5+ KB'
|
|
|
|
"""
|
|
|
|
for x in ["bytes", "KB", "MB", "GB", "TB"]:
|
|
|
|
if num < 1024.0:
|
|
|
|
return f"{num:3.1f}{size_qualifier} {x}"
|
|
|
|
num /= 1024.0
|
|
|
|
return f"{num:3.1f}{size_qualifier} PB"
|
|
|
|
|
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
class BaseInfo(metaclass=ABCMeta):
|
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
data: FrameOrSeries,
|
|
|
|
verbose: Optional[bool] = None,
|
|
|
|
buf: Optional[IO[str]] = None,
|
|
|
|
max_cols: Optional[int] = None,
|
|
|
|
memory_usage: Optional[Union[bool, str]] = None,
|
|
|
|
null_counts: Optional[bool] = None,
|
|
|
|
):
|
|
|
|
if buf is None: # pragma: no cover
|
|
|
|
buf = sys.stdout
|
|
|
|
if memory_usage is None:
|
|
|
|
memory_usage = get_option("display.memory_usage")
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
self.data = data
|
|
|
|
self.verbose = verbose
|
|
|
|
self.buf = buf
|
|
|
|
self.max_cols = max_cols
|
|
|
|
self.memory_usage = memory_usage
|
|
|
|
self.null_counts = null_counts
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
@abstractmethod
|
|
|
|
def _get_mem_usage(self, deep: bool) -> int:
|
|
|
|
"""
|
|
|
|
Get memory usage in bytes.
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
deep : bool
|
|
|
|
If True, introspect the data deeply by interrogating object dtypes
|
|
|
|
for system-level memory consumption, and include it in the returned
|
|
|
|
values.
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
mem_usage : int
|
|
|
|
Object's total memory usage in bytes.
|
|
|
|
"""
|
|
|
|
pass
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
@abstractmethod
|
2021-01-30 22:29:33 +01:00
|
|
|
def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]:
|
2021-01-09 15:20:56 +01:00
|
|
|
"""
|
2021-01-30 22:29:33 +01:00
|
|
|
Get column names and dtypes.
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
2021-01-30 22:29:33 +01:00
|
|
|
ids : Index
|
|
|
|
DataFrame's column names.
|
|
|
|
dtypes : Series
|
|
|
|
Dtype of each of the DataFrame's columns.
|
2021-01-09 15:20:56 +01:00
|
|
|
"""
|
2021-01-30 22:29:33 +01:00
|
|
|
pass
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
@abstractmethod
|
2021-01-30 22:29:33 +01:00
|
|
|
def _verbose_repr(
|
|
|
|
self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool
|
|
|
|
) -> None:
|
|
|
|
"""
|
|
|
|
Append name, non-null count (optional), and dtype for each column to `lines`.
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
lines : List[str]
|
|
|
|
Lines that will contain `info` representation.
|
|
|
|
ids : Index
|
|
|
|
The DataFrame's column names.
|
|
|
|
dtypes : Series
|
|
|
|
The DataFrame's columns' dtypes.
|
|
|
|
show_counts : bool
|
|
|
|
If True, count of non-NA cells for each column will be appended to `lines`.
|
|
|
|
"""
|
|
|
|
pass
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
@abstractmethod
|
2021-01-30 22:29:33 +01:00
|
|
|
def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None:
|
2021-01-09 15:20:56 +01:00
|
|
|
"""
|
2021-01-30 22:29:33 +01:00
|
|
|
Append short summary of columns' names to `lines`.
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
lines : List[str]
|
|
|
|
Lines that will contain `info` representation.
|
|
|
|
ids : Index
|
|
|
|
The DataFrame's column names.
|
2021-01-09 15:20:56 +01:00
|
|
|
"""
|
2021-01-30 22:29:33 +01:00
|
|
|
pass
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
def info(self) -> None:
|
2021-01-09 15:20:56 +01:00
|
|
|
"""
|
|
|
|
Print a concise summary of a %(klass)s.
|
|
|
|
|
|
|
|
This method prints information about a %(klass)s including
|
|
|
|
the index dtype%(type_sub)s, non-null values and memory usage.
|
|
|
|
|
|
|
|
Parameters
|
|
|
|
----------
|
|
|
|
data : %(klass)s
|
|
|
|
%(klass)s to print information about.
|
|
|
|
verbose : bool, optional
|
|
|
|
Whether to print the full summary. By default, the setting in
|
|
|
|
``pandas.options.display.max_info_columns`` is followed.
|
|
|
|
buf : writable buffer, defaults to sys.stdout
|
|
|
|
Where to send the output. By default, the output is printed to
|
|
|
|
sys.stdout. Pass a writable buffer if you need to further process
|
|
|
|
the output.
|
|
|
|
%(max_cols_sub)s
|
|
|
|
memory_usage : bool, str, optional
|
|
|
|
Specifies whether total memory usage of the %(klass)s
|
|
|
|
elements (including the index) should be displayed. By default,
|
|
|
|
this follows the ``pandas.options.display.memory_usage`` setting.
|
|
|
|
|
|
|
|
True always show memory usage. False never shows memory usage.
|
|
|
|
A value of 'deep' is equivalent to "True with deep introspection".
|
|
|
|
Memory usage is shown in human-readable units (base-2
|
|
|
|
representation). Without deep introspection a memory estimation is
|
|
|
|
made based in column dtype and number of rows assuming values
|
|
|
|
consume the same memory amount for corresponding dtypes. With deep
|
|
|
|
memory introspection, a real memory usage calculation is performed
|
|
|
|
at the cost of computational resources.
|
2021-01-30 22:29:33 +01:00
|
|
|
null_counts : bool, optional
|
|
|
|
Whether to show the non-null counts. By default, this is shown
|
|
|
|
only if the %(klass)s is smaller than
|
|
|
|
``pandas.options.display.max_info_rows`` and
|
|
|
|
``pandas.options.display.max_info_columns``. A value of True always
|
|
|
|
shows the counts, and False never shows the counts.
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
Returns
|
|
|
|
-------
|
|
|
|
None
|
|
|
|
This method prints a summary of a %(klass)s and returns None.
|
|
|
|
|
|
|
|
See Also
|
|
|
|
--------
|
|
|
|
%(see_also_sub)s
|
|
|
|
|
|
|
|
Examples
|
|
|
|
--------
|
|
|
|
%(examples_sub)s
|
|
|
|
"""
|
2021-01-30 22:29:33 +01:00
|
|
|
lines = []
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
lines.append(str(type(self.data)))
|
|
|
|
lines.append(self.data.index._summary())
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
ids, dtypes = self._get_ids_and_dtypes()
|
|
|
|
col_count = len(ids)
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
if col_count == 0:
|
|
|
|
lines.append(f"Empty {type(self.data).__name__}")
|
|
|
|
fmt.buffer_put_lines(self.buf, lines)
|
|
|
|
return
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
# hack
|
|
|
|
max_cols = self.max_cols
|
2021-01-09 15:20:56 +01:00
|
|
|
if max_cols is None:
|
2021-01-30 22:29:33 +01:00
|
|
|
max_cols = get_option("display.max_info_columns", col_count + 1)
|
|
|
|
|
|
|
|
max_rows = get_option("display.max_info_rows", len(self.data) + 1)
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
if self.null_counts is None:
|
|
|
|
show_counts = (col_count <= max_cols) and (len(self.data) < max_rows)
|
2021-01-09 15:20:56 +01:00
|
|
|
else:
|
2021-01-30 22:29:33 +01:00
|
|
|
show_counts = self.null_counts
|
|
|
|
exceeds_info_cols = col_count > max_cols
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
if self.verbose:
|
2021-01-30 22:29:33 +01:00
|
|
|
self._verbose_repr(lines, ids, dtypes, show_counts)
|
2021-01-09 15:20:56 +01:00
|
|
|
elif self.verbose is False: # specifically set to False, not necessarily None
|
2021-01-30 22:29:33 +01:00
|
|
|
self._non_verbose_repr(lines, ids)
|
2021-01-09 15:20:56 +01:00
|
|
|
else:
|
2021-01-30 22:29:33 +01:00
|
|
|
if exceeds_info_cols:
|
|
|
|
self._non_verbose_repr(lines, ids)
|
2021-01-09 15:20:56 +01:00
|
|
|
else:
|
2021-01-30 22:29:33 +01:00
|
|
|
self._verbose_repr(lines, ids, dtypes, show_counts)
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
# groupby dtype.name to collect e.g. Categorical columns
|
|
|
|
counts = dtypes.value_counts().groupby(lambda x: x.name).sum()
|
|
|
|
collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
|
|
|
|
lines.append(f"dtypes: {', '.join(collected_dtypes)}")
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
if self.memory_usage:
|
|
|
|
# append memory usage of df to display
|
|
|
|
size_qualifier = ""
|
|
|
|
if self.memory_usage == "deep":
|
|
|
|
deep = True
|
|
|
|
else:
|
|
|
|
# size_qualifier is just a best effort; not guaranteed to catch
|
|
|
|
# all cases (e.g., it misses categorical data even with object
|
|
|
|
# categories)
|
|
|
|
deep = False
|
|
|
|
if "object" in counts or self.data.index._is_memory_usage_qualified():
|
|
|
|
size_qualifier = "+"
|
|
|
|
mem_usage = self._get_mem_usage(deep=deep)
|
|
|
|
lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
|
|
|
|
fmt.buffer_put_lines(self.buf, lines)
|
2021-01-09 15:20:56 +01:00
|
|
|
|
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
class DataFrameInfo(BaseInfo):
|
|
|
|
def _get_mem_usage(self, deep: bool) -> int:
|
|
|
|
return self.data.memory_usage(index=True, deep=deep).sum()
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]:
|
|
|
|
return self.data.columns, self.data.dtypes
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
def _verbose_repr(
|
|
|
|
self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool
|
|
|
|
) -> None:
|
|
|
|
col_count = len(ids)
|
|
|
|
lines.append(f"Data columns (total {col_count} columns):")
|
|
|
|
|
|
|
|
id_head = " # "
|
|
|
|
column_head = "Column"
|
|
|
|
col_space = 2
|
|
|
|
|
|
|
|
max_col = max(len(pprint_thing(k)) for k in ids)
|
|
|
|
len_column = len(pprint_thing(column_head))
|
|
|
|
space = max(max_col, len_column) + col_space
|
|
|
|
|
|
|
|
max_id = len(pprint_thing(col_count))
|
|
|
|
len_id = len(pprint_thing(id_head))
|
|
|
|
space_num = max(max_id, len_id) + col_space
|
|
|
|
|
|
|
|
header = _put_str(id_head, space_num) + _put_str(column_head, space)
|
|
|
|
if show_counts:
|
|
|
|
counts = self.data.count()
|
|
|
|
if col_count != len(counts): # pragma: no cover
|
|
|
|
raise AssertionError(
|
|
|
|
f"Columns must equal counts ({col_count} != {len(counts)})"
|
2021-01-09 15:20:56 +01:00
|
|
|
)
|
2021-01-30 22:29:33 +01:00
|
|
|
count_header = "Non-Null Count"
|
|
|
|
len_count = len(count_header)
|
|
|
|
non_null = " non-null"
|
|
|
|
max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
|
|
|
|
space_count = max(len_count, max_count) + col_space
|
|
|
|
count_temp = "{count}" + non_null
|
|
|
|
else:
|
|
|
|
count_header = ""
|
|
|
|
space_count = len(count_header)
|
|
|
|
len_count = space_count
|
|
|
|
count_temp = "{count}"
|
|
|
|
|
|
|
|
dtype_header = "Dtype"
|
|
|
|
len_dtype = len(dtype_header)
|
|
|
|
max_dtypes = max(len(pprint_thing(k)) for k in dtypes)
|
|
|
|
space_dtype = max(len_dtype, max_dtypes)
|
|
|
|
header += _put_str(count_header, space_count) + _put_str(
|
|
|
|
dtype_header, space_dtype
|
2021-01-09 15:20:56 +01:00
|
|
|
)
|
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
lines.append(header)
|
|
|
|
lines.append(
|
|
|
|
_put_str("-" * len_id, space_num)
|
|
|
|
+ _put_str("-" * len_column, space)
|
|
|
|
+ _put_str("-" * len_count, space_count)
|
|
|
|
+ _put_str("-" * len_dtype, space_dtype)
|
2021-01-09 15:20:56 +01:00
|
|
|
)
|
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
for i, col in enumerate(ids):
|
|
|
|
dtype = dtypes.iloc[i]
|
|
|
|
col = pprint_thing(col)
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
line_no = _put_str(f" {i}", space_num)
|
|
|
|
count = ""
|
|
|
|
if show_counts:
|
|
|
|
count = counts.iloc[i]
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
lines.append(
|
|
|
|
line_no
|
|
|
|
+ _put_str(col, space)
|
|
|
|
+ _put_str(count_temp.format(count=count), space_count)
|
|
|
|
+ _put_str(dtype, space_dtype)
|
|
|
|
)
|
2021-01-09 15:20:56 +01:00
|
|
|
|
2021-01-30 22:29:33 +01:00
|
|
|
def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None:
|
|
|
|
lines.append(ids._summary(name="Columns"))
|