diff --git a/.DS_Store b/.DS_Store index f9ce9b4..ed0c6c6 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/cbpi/__init__.py b/cbpi/__init__.py index f5d10d4..aba157c 100644 --- a/cbpi/__init__.py +++ b/cbpi/__init__.py @@ -1 +1 @@ -__version__ = "4.0.0.12" \ No newline at end of file +__version__ = "4.0.0.13" \ No newline at end of file diff --git a/cbpi/api/kettle_logic.py b/cbpi/api/kettle_logic.py index ef52353..580d4a3 100644 --- a/cbpi/api/kettle_logic.py +++ b/cbpi/api/kettle_logic.py @@ -23,16 +23,16 @@ class CBPiKettleLogic(metaclass=ABCMeta): self.cbpi.log.log_data(self.id, value) async def run(self): + self.state = True while self.running: print("RUNNING KETTLE") await asyncio.sleep(1) + self.state = False def get_state(self): - - return dict(state=self.state) + return dict(running=self.running) async def start(self): - self.running = True async def stop(self): diff --git a/cbpi/cli.py b/cbpi/cli.py index a73f5e1..d11425e 100644 --- a/cbpi/cli.py +++ b/cbpi/cli.py @@ -7,19 +7,15 @@ import re import requests import yaml from cbpi.utils.utils import load_config - +from zipfile import ZipFile from cbpi.craftbeerpi import CraftBeerPi import os import pathlib import shutil +import yaml +import click -def create_plugin_file(): - import os.path - if os.path.exists(os.path.join(".", 'config', "plugin_list.txt")) is False: - srcfile = os.path.join(os.path.dirname(__file__), "config", "plugin_list.txt") - destfile = os.path.join(".", 'config') - shutil.copy(srcfile, destfile) - print("Plugin Folder created") +from jinja2 import Template def create_config_file(): import os.path @@ -74,9 +70,7 @@ def clear_db(): os.remove(os.path.join(".", "craftbeerpi.db")) print("database Cleared") - def check_for_setup(): - if os.path.exists(os.path.join(".", "config", "config.yaml")) is False: print("***************************************************") print("CraftBeerPi Config File not found: %s" % os.path.join(".", "config", "config.yaml")) @@ -87,137 +81,164 @@ def check_for_setup(): return True -def list_plugins(): - print("***************************************************") - print("CraftBeerPi 4.x Plugin List") - print("***************************************************") - print("") - plugins_yaml = "https://raw.githubusercontent.com/Manuel83/craftbeerpi-plugins/master/plugins_v4.yaml" - r = requests.get(plugins_yaml) - data = yaml.load(r.content, Loader=yaml.FullLoader) - for name, value in data.items(): - print(name) - print("") - print("***************************************************") - -def add(package_name): - +def plugins_add(package_name): if package_name is None: - print("Missing Plugin Name: cbpi add --name=") + print("Pleaes provide a plugin Name") + return + try: + with open(os.path.join(".", 'config', "config.yaml"), 'rt') as f: + data = yaml.load(f, Loader=yaml.FullLoader) + if package_name in data["plugins"]: + print("") + print("Plugin {} already active".format(package_name)) + print("") + return + data["plugins"].append(package_name) + with open(os.path.join(".", 'config', "config.yaml"), 'w') as outfile: + yaml.dump(data, outfile, default_flow_style=False) + print("") + print("Plugin {} activated".format(package_name)) + print("") + except Exception as e: + print(e) + pass + + + +def plugin_remove(package_name): + if package_name is None: + print("Pleaes provide a plugin Name") + return + try: + with open(os.path.join(".", 'config', "config.yaml"), 'rt') as f: + data = yaml.load(f, Loader=yaml.FullLoader) + + data["plugins"] = list(filter(lambda k: 
package_name not in k, data["plugins"])) + with open(os.path.join(".", 'config', "config.yaml"), 'w') as outfile: + yaml.dump(data, outfile, default_flow_style=False) + print("") + print("Plugin {} deactivated".format(package_name)) + print("") + except Exception as e: + print(e) + pass + +def plugins_list(): + + print("--------------------------------------") + print("List of active pluigins") + try: + with open(os.path.join(".", 'config', "config.yaml"), 'rt') as f: + data = yaml.load(f, Loader=yaml.FullLoader) + + for p in data["plugins"]: + print("- {}".format(p)) + except Exception as e: + print(e) + pass + print("--------------------------------------") + +def plugin_create(name): + + if os.path.exists(os.path.join(".", name)) is True: + print("Cant create Plugin. Folder {} already exists ".format(name)) return - data = subprocess.check_output([sys.executable, "-m", "pip", "install", package_name]) - data = data.decode('UTF-8') + url = 'https://github.com/Manuel83/craftbeerpi4-plugin-template/archive/main.zip' + r = requests.get(url) + with open('temp.zip', 'wb') as f: + f.write(r.content) - patter_already_installed = "Requirement already satisfied: %s" % package_name - pattern = "Successfully installed %s-([-0-9a-zA-Z._]*)" % package_name - - match_already_installed = re.search(patter_already_installed, data) - match_installed = re.search(pattern, data) - - if match_already_installed is not None: - print("Plugin already installed") - return False - - if match_installed is None: - print(data) - print("Faild to install plugin") - return False - - version = match_installed.groups()[0] - plugins = load_config("./config/plugin_list.txt") - if plugins is None: - plugins = {} - now = datetime.datetime.now() - plugins[package_name] = dict(version=version, installation_date=now.strftime("%Y-%m-%d %H:%M:%S")) - - with open('./config/plugin_list.txt', 'w') as outfile: - yaml.dump(plugins, outfile, default_flow_style=False) - - print("Plugin %s added" % package_name) - return True + with ZipFile('temp.zip', 'r') as repo_zip: + repo_zip.extractall() -def remove(package_name): - if package_name is None: - print("Missing Plugin Name: cbpi add --name=") - return - data = subprocess.check_output([sys.executable, "-m", "pip", "uninstall", "-y", package_name]) - data = data.decode('UTF-8') + os.rename("./craftbeerpi4-plugin-template-main", os.path.join(".", name)) + os.rename(os.path.join(".", name, "src"), os.path.join(".", name, name)) - pattern = "Successfully uninstalled %s-([-0-9a-zA-Z._]*)" % package_name - match_uninstalled = re.search(pattern, data) + import jinja2 - if match_uninstalled is None: - - print("Faild to uninstall plugin") - return False - - plugins = load_config("./config/plugin_list.txt") - if plugins is None: - plugins = {} - - if package_name not in plugins: - return False - - del plugins[package_name] - with open('./config/plugin_list.txt', 'w') as outfile: - yaml.dump(plugins, outfile, default_flow_style=False) - - print("Plugin %s removed" % package_name) - return True - -def main(): + templateLoader = jinja2.FileSystemLoader(searchpath=os.path.join(".", name)) + templateEnv = jinja2.Environment(loader=templateLoader) + TEMPLATE_FILE = "setup.py" + template = templateEnv.get_template(TEMPLATE_FILE) + outputText = template.render(name=name) - parser = argparse.ArgumentParser(description='Welcome to CraftBeerPi 4') - parser.add_argument("action", type=str, help="start,stop,restart,setup,plugins") - parser.add_argument('--debug', dest='debug', action='store_true') - 
parser.add_argument("--name", type=str, help="Plugin name") - args = parser.parse_args() + with open(os.path.join(".", name, "setup.py"), "w") as fh: + fh.write(outputText) - if args.debug is True: - level =logging.DEBUG - else: - level =logging.INFO - #logging.basicConfig(level=logging.INFO, filename='./logs/app.log', filemode='a', format='%(asctime)s - %(levelname)s - %(name)s - %(message)s') - logging.basicConfig(level=level, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s') + TEMPLATE_FILE = "MANIFEST.in" + template = templateEnv.get_template(TEMPLATE_FILE) + outputText = template.render(name=name) + with open(os.path.join(".", name, "MANIFEST.in"), "w") as fh: + fh.write(outputText) + + TEMPLATE_FILE = os.path.join("/", name , "config.yaml") + template = templateEnv.get_template(TEMPLATE_FILE) + outputText = template.render(name=name) + + with open(os.path.join(".", name, name, "config.yaml"), "w") as fh: + fh.write(outputText) + print("") + print("") + print("Plugin {} created! See https://craftbeerpi.gitbook.io/craftbeerpi4/development how to run your plugin ".format(name)) + print("") + print("Happy Development! Cheers") + print("") + print("") - if args.action == "setup": - print("Setting up CBPi") - create_home_folder_structure() - create_plugin_file() - create_config_file() - copy_splash() + + +@click.group() +def main(): + level =logging.INFO + logging.basicConfig(level=level, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s') + pass + + +@click.command() +def setup(): + '''Create Config folder''' + print("Setting up CraftBeerPi") + create_home_folder_structure() + create_config_file() + +@click.command() +def start(): + if check_for_setup() is False: return + print("START") + cbpi = CraftBeerPi() + cbpi.start() - if args.action == "cleardb": - clear_db() - return - - if args.action == "plugins": - list_plugins() - return - - - if args.action == "add": - - add(args.name) - return - - if args.action == "remove": - remove(args.name) - return - - if args.action == "start": - if check_for_setup() is False: - return - - cbpi = CraftBeerPi() - cbpi.start() - return - - parser.print_help() +@click.command() +def plugins(): + '''List active plugins''' + plugins_list() + return +@click.command() +@click.argument('name') +def add(name): + '''Activate Plugin''' + plugins_add(name) +@click.command() +@click.argument('name') +def remove(name): + '''Deactivate Plugin''' + plugin_remove(name) + +@click.command() +@click.argument('name') +def create(name): + '''Deactivate Plugin''' + plugin_create(name) +main.add_command(setup) +main.add_command(start) +main.add_command(plugins) +main.add_command(add) +main.add_command(remove) +main.add_command(create) diff --git a/cbpi/config/config.yaml b/cbpi/config/config.yaml index cc1c19d..e4407dc 100644 --- a/cbpi/config/config.yaml +++ b/cbpi/config/config.yaml @@ -9,3 +9,6 @@ port: 8000 username: cbpi password: 123 +plugins: +- cbpi4-ui + diff --git a/cbpi/controller/actor_controller.py b/cbpi/controller/actor_controller.py index e2ab188..1f2b3eb 100644 --- a/cbpi/controller/actor_controller.py +++ b/cbpi/controller/actor_controller.py @@ -37,6 +37,6 @@ class ActorController(BasicController): instance = data.get("instance") state = state=instance.get_state() except Exception as e: - logging.error("Faild to crate actor dict {} ".format(e)) + logging.error("Faild to create actor dict {} ".format(e)) state = dict() return dict(name=data.get("name"), id=data.get("id"), type=data.get("type"), state=state,props=data.get("props", 
[])) \ No newline at end of file diff --git a/cbpi/controller/basic_controller.py b/cbpi/controller/basic_controller.py index 2f14006..aedd77e 100644 --- a/cbpi/controller/basic_controller.py +++ b/cbpi/controller/basic_controller.py @@ -31,13 +31,12 @@ class BasicController: logging.info("{} Load ".format(self.name)) with open(self.path) as json_file: data = json.load(json_file) - self.data = data["data"] - if self.autostart is True: for d in self.data: logging.info("{} Starting ".format(self.name)) await self.start(d.get("id")) + await self.push_udpate() async def save(self): logging.info("{} Save ".format(self.name)) @@ -76,6 +75,7 @@ class BasicController: instance = item.get("instance") await instance.stop() await instance.task + await self.push_udpate() except Exception as e: logging.error("{} Cant stop {} - {}".format(self.name, id, e)) @@ -84,20 +84,17 @@ class BasicController: try: item = self.find_by_id(id) instance = item.get("instance") - if instance is not None and instance.running is True: logging.warning("{} already running {}".format(self.name, id)) return type = item["type"] - - clazz = self.types[type]["class"] item["instance"] = clazz(self.cbpi, item["id"], {}) - await item["instance"].start() item["instance"].task = self._loop.create_task(item["instance"].run()) - logging.info("Sensor started {}".format(id)) + logging.info("{} started {}".format(self.name, id)) + except Exception as e: logging.error("{} Cant start {} - {}".format(self.name, id, e)) diff --git a/cbpi/controller/kettle_controller.py b/cbpi/controller/kettle_controller.py index 5ddaf27..c30f0f1 100644 --- a/cbpi/controller/kettle_controller.py +++ b/cbpi/controller/kettle_controller.py @@ -13,6 +13,7 @@ class KettleController(BasicController): item = self.find_by_id(id) instance = item.get("instance") await instance.start() + await self.push_udpate() except Exception as e: logging.error("Faild to switch on KettleLogic {} {}".format(id, e)) @@ -21,6 +22,19 @@ class KettleController(BasicController): item = self.find_by_id(id) instance = item.get("instance") await instance.stop() + await self.push_udpate() + except Exception as e: + logging.error("Faild to switch on KettleLogic {} {}".format(id, e)) + + async def toggle(self, id): + try: + item = self.find_by_id(id) + instance = item.get("instance") + if instance is None or instance.running == False: + await self.start(id) + else: + await instance.stop() + await self.push_udpate() except Exception as e: logging.error("Faild to switch on KettleLogic {} {}".format(id, e)) @@ -35,7 +49,7 @@ class KettleController(BasicController): def create_dict(self, data): try: instance = data.get("instance") - state = dict(state=instance.get_state()) + state = instance.get_state() except Exception as e: logging.error("Faild to create KettleLogic dict {} ".format(e)) state = dict() diff --git a/cbpi/controller/plugin_controller.py b/cbpi/controller/plugin_controller.py index 619a995..ce58b3d 100644 --- a/cbpi/controller/plugin_controller.py +++ b/cbpi/controller/plugin_controller.py @@ -18,72 +18,8 @@ class PluginController(): def __init__(self, cbpi): self.cbpi = cbpi - self.plugins = load_config("./config/plugin_list.txt") - if self.plugins is None: - self.plugins = {} - async def load_plugin_list(self): - async with aiohttp.ClientSession() as session: - async with session.get('http://localhost:2202/list') as resp: - if (resp.status == 200): - data = yaml.load(await resp.text()) - self.plugins = data - return data - - def installed_plugins(self): - return self.plugins - - 
async def install(self, package_name): - async def install(cbpi, plugins, package_name): - data = subprocess.check_output( - [sys.executable, "-m", "pip", "install", package_name]) - data = data.decode('UTF-8') - if package_name not in self.plugins: - now = datetime.datetime.now() - self.plugins[package_name] = dict( - version="1.0", installation_date=now.strftime("%Y-%m-%d %H:%M:%S")) - with open('./config/plugin_list.txt', 'w') as outfile: - yaml.dump(self.plugins, outfile, default_flow_style=False) - if data.startswith('Requirement already satisfied'): - self.cbpi.notify( - key="p", message="Plugin already installed ", type="warning") - else: - - self.cbpi.notify( - key="p", message="Plugin installed ", type="success") - - async with aiohttp.ClientSession() as session: - async with session.get('http://localhost:2202/get/%s' % package_name) as resp: - - if (resp.status == 200): - data = await resp.json() - await self.cbpi.job.start_job(install(self.cbpi, self.plugins, data["package_name"]), data["package_name"], "plugins_install") - return True - else: - self.cbpi.notify( - key="p", message="Failed to install Plugin %s " % package_name, type="danger") - return False - - async def uninstall(self, package_name): - async def uninstall(cbpi, plugins, package_name): - print("try to uninstall", package_name) - try: - data = subprocess.check_output( - [sys.executable, "-m", "pip", "uninstall", "-y", package_name]) - data = data.decode('UTF-8') - if data.startswith("Successfully uninstalled"): - cbpi.notify(key="p", message="Plugin %s Uninstalled" % - package_name, type="success") - else: - cbpi.notify(key="p", message=data, type="success") - except Exception as e: - print(e) - - if package_name in self.plugins: - print("Uninstall", self.plugins[package_name]) - await self.cbpi.job.start_job(uninstall(self.cbpi, self.plugins, package_name), package_name, "plugins_uninstall") - def load_plugins(self): this_directory = os.path.dirname(__file__) @@ -109,19 +45,22 @@ class PluginController(): logger.error(e) def load_plugins_from_evn(self): - - for p in self.plugins: - logger.debug("Load Plugin %s" % p) + + + for p in self.cbpi.static_config.get("plugins",[]): + try: logger.info("Try to load plugin: %s " % p) self.modules[p] = import_module(p) self.modules[p].setup(self.cbpi) - #logger.info("Plugin %s loaded successfully" % p) + logger.info("Plugin %s loaded successfully" % p) except Exception as e: logger.error("FAILED to load plugin %s " % p) logger.error(e) + + def register(self, name, clazz) -> None: ''' Register a new actor type @@ -171,9 +110,7 @@ class PluginController(): parameters.append(self._parse_property_object(p)) result["properties"] = parameters for method_name, method in cls.__dict__.items(): - if hasattr(method, "action"): - key = method.__getattribute__("key") parameters = [] for p in method.__getattribute__("parameters"): diff --git a/cbpi/controller/sensor_controller.py b/cbpi/controller/sensor_controller.py index a988976..4afb4a9 100644 --- a/cbpi/controller/sensor_controller.py +++ b/cbpi/controller/sensor_controller.py @@ -11,7 +11,7 @@ class SensorController(BasicController): instance = data.get("instance") state = state=instance.get_state() except Exception as e: - logging.error("Faild to crate actor dict {} ".format(e)) + logging.error("Faild to create sensor dict {} ".format(e)) state = dict() return dict(name=data.get("name"), id=data.get("id"), type=data.get("type"), state=state,props=data.get("props", [])) diff --git a/cbpi/controller/step_controller.py 
b/cbpi/controller/step_controller.py index d10395a..58029a5 100644 --- a/cbpi/controller/step_controller.py +++ b/cbpi/controller/step_controller.py @@ -222,8 +222,8 @@ class StepController: return next((i for i, item in enumerate(self.profile) if item["id"] == id), None) async def push_udpate(self): - await self.cbpi.bus.fire("step/update", data=list(map(lambda x: self.create_dict(x), self.profile))) - + self.cbpi.ws.send(dict(topic="step_update", data=list(map(lambda x: self.create_dict(x), self.profile)))) + async def start_step(self,step): logging.info("Start Step") step.get("instance").start() diff --git a/cbpi/extension/dummyactor/__init__.py b/cbpi/extension/dummyactor/__init__.py index e4557e7..a999f04 100644 --- a/cbpi/extension/dummyactor/__init__.py +++ b/cbpi/extension/dummyactor/__init__.py @@ -20,11 +20,7 @@ except Exception: import RPi.GPIO as GPIO -@parameters([Property.Number(label="Param1", configurable=True), - Property.Text(label="Param2", configurable=True, default_value="HALLO"), - Property.Select(label="Param3", options=[1,2,4]), - Property.Sensor(label="Param4"), - Property.Actor(label="Param5")]) +@parameters([]) class CustomActor(CBPiActor): my_name = "" @@ -37,7 +33,6 @@ class CustomActor(CBPiActor): def init(self): print("INIT") - self.state = False pass diff --git a/cbpi/extension/dummylogic/__init__.py b/cbpi/extension/dummylogic/__init__.py index 5615277..8c092a5 100644 --- a/cbpi/extension/dummylogic/__init__.py +++ b/cbpi/extension/dummylogic/__init__.py @@ -2,11 +2,7 @@ import asyncio from cbpi.api import * -@parameters([Property.Number(label="Param1", configurable=True), - Property.Text(label="Param2", configurable=True, default_value="HALLO"), - Property.Select(label="Param3", options=[1,2,4]), - Property.Sensor(label="Param4"), - Property.Actor(label="Param5")]) +@parameters([]) class CustomLogic(CBPiKettleLogic): pass diff --git a/cbpi/extension/dummysensor/__init__.py b/cbpi/extension/dummysensor/__init__.py index 32df456..9d2b410 100644 --- a/cbpi/extension/dummysensor/__init__.py +++ b/cbpi/extension/dummysensor/__init__.py @@ -7,11 +7,7 @@ from aiohttp import web from cbpi.api import * -@parameters([Property.Number(label="Param1", configurable=True), - Property.Text(label="Param2", configurable=True, default_value="HALLO"), - Property.Select(label="Param3", options=[1,2,4]), - Property.Sensor(label="Param4"), - Property.Actor(label="Param5")]) +@parameters([]) class CustomSensor(CBPiSensor): def __init__(self, cbpi, id, props): @@ -36,7 +32,7 @@ class CustomSensor(CBPiSensor): while self.running is True: self.value = random.randint(0,50) self.push_update(self.value) - await asyncio.sleep(1) + await asyncio.sleep(10) def get_state(self): return dict(value=self.value) diff --git a/cbpi/http_endpoints/http_kettle.py b/cbpi/http_endpoints/http_kettle.py index a31d414..6b4cead 100644 --- a/cbpi/http_endpoints/http_kettle.py +++ b/cbpi/http_endpoints/http_kettle.py @@ -182,6 +182,31 @@ class KettleHttpEndpoints(): await self.controller.off(id) return web.Response(status=204) + @request_mapping(path="/{id}/toggle", method="POST", auth_required=False) + async def http_toggle(self, request) -> web.Response: + """ + + --- + description: Switch actor on + tags: + - Kettle + + parameters: + - name: "id" + in: "path" + description: "Kettle ID" + required: true + type: "string" + + responses: + "204": + description: successful operation + "405": + description: invalid HTTP Met + """ + id = request.match_info['id'] + await self.controller.toggle(id) + return 
web.Response(status=204) @request_mapping(path="/{id}/action", method="POST", auth_required=auth) async def http_action(self, request) -> web.Response: @@ -233,11 +258,20 @@ class KettleHttpEndpoints(): required: true type: "integer" format: "int64" + - in: body + name: body + description: Update Temp + required: true + schema: + type: object + properties: + temp: + type: integer responses: "204": description: successful operation """ id = request.match_info['id'] - #data = await request.json() - await self.controller.set_target_temp(id,999) + data = await request.json() + await self.controller.set_target_temp(id,data.get("temp")) return web.Response(status=204) \ No newline at end of file diff --git a/config/actor.json b/config/actor.json index af1f306..c2aad86 100644 --- a/config/actor.json +++ b/config/actor.json @@ -15,7 +15,7 @@ "id": "Aifjxmw4QdPfU3XbR6iyis", "name": "Pump1", "props": {}, - "state": false, + "state": true, "type": "CustomActor" }, { @@ -24,6 +24,34 @@ "props": {}, "state": false, "type": "CustomActor" + }, + { + "id": "NjammuygecdvMpoGYc3rXt", + "name": "Heater Boil", + "props": {}, + "state": false, + "type": "CustomActor" + }, + { + "id": "j4PnSfuWRhgZDgrQScLN7e", + "name": "Vent1", + "props": {}, + "state": true, + "type": "CustomActor" + }, + { + "id": "ZGJqoybWv3eWrEeGJLopFs", + "name": "Water In", + "props": {}, + "state": false, + "type": "CustomActor" + }, + { + "id": "NfYJEWbTXPUSUQzS83dfAn", + "name": "Vent Out", + "props": {}, + "state": false, + "type": "CustomActor" } ] } \ No newline at end of file diff --git a/config/cbpi_dashboard_1.json b/config/cbpi_dashboard_1.json index 5f3b3f8..39356e9 100644 --- a/config/cbpi_dashboard_1.json +++ b/config/cbpi_dashboard_1.json @@ -1,64 +1,425 @@ { "elements": [ { - "id": "6c670263-7b19-426c-8769-19aac8ebb381", - "name": "CustomSVG", + "id": "1ad5cec3-0f10-4910-b5ba-b4a96207d0ca", + "name": "Kettle", "props": { - "name": "tank", - "width": "200" + "heigth": "150", + "width": "100" }, - "type": "CustomSVG", - "x": 295, - "y": 45 + "type": "Kettle", + "x": 225, + "y": 160 }, { - "id": "cbe859ca-b8e8-433f-952c-938a2f8a309b", + "id": "ba621aee-a733-4238-b892-0f39100a5d21", + "name": "Kettle", + "props": { + "heigth": "150", + "width": "100" + }, + "type": "Kettle", + "x": 530, + "y": 160 + }, + { + "id": "b61f57d9-e9ce-42b5-97df-3b2d7deaf18c", + "name": "Kettle", + "props": { + "heigth": "150", + "width": "100" + }, + "type": "Kettle", + "x": 780, + "y": 160 + }, + { + "id": "f2facefa-5808-4f63-93e7-fd8c3343aa2f", + "name": "Pump1", + "props": { + "actor": "Aifjxmw4QdPfU3XbR6iyis" + }, + "type": "ActorButton", + "x": 410, + "y": 380 + }, + { + "id": "6996220e-b314-4c23-82c5-2d0873bcd1bc", + "name": "KettleControl", + "props": { + "kettle": "oHxKz3z5RjbsxfSz6KUgov", + "orientation": "vertical" + }, + "type": "KettleControl", + "x": 165, + "y": 205 + }, + { + "id": "91547101-86e5-405c-84e4-295d3565adfb", + "name": "Vent", + "props": { + "actor": "j4PnSfuWRhgZDgrQScLN7e" + }, + "type": "ActorButton", + "x": 550, + "y": 380 + }, + { + "id": "a7ec6424-0df5-489e-85a6-5b36d039079b", + "name": "Pump2", + "props": { + "actor": "HX2bKdobuANehPggYcynnj" + }, + "type": "ActorButton", + "x": 680, + "y": 380 + }, + { + "id": "39bb1a5b-294e-47e6-b472-699ef05aa780", + "name": "KettleControl", + "props": { + "kettle": "a7bWex85Z9Td4atwgazpXW", + "orientation": "vertical" + }, + "type": "KettleControl", + "x": 720, + "y": 205 + }, + { + "id": "310054aa-729b-45b2-a3a3-2c73196a2444", + "name": "HLT", + "props": { + "color": "#fff", + 
"size": "15" + }, + "type": "Text", + "x": 235, + "y": 165 + }, + { + "id": "72a66e4f-f7ce-4ac2-9956-c581590bfb3d", + "name": "MashTun", + "props": { + "color": "#fff", + "size": "15" + }, + "type": "Text", + "x": 540, + "y": 165 + }, + { + "id": "62f58450-5ce6-45bf-b178-0dde9225ab52", + "name": "Boil", + "props": { + "color": "#fff", + "size": "15" + }, + "type": "Text", + "x": 820, + "y": 165 + }, + { + "id": "e2b351fa-b66e-416a-a6d6-887ee41b3d7e", + "name": "Water", + "props": { + "actor": "ZGJqoybWv3eWrEeGJLopFs" + }, + "type": "ActorButton", + "x": 45, + "y": 160 + }, + { + "id": "9f3f87d4-3c2a-4dcc-9740-8f7efcc553bf", + "name": "Sensor Data", + "props": { + "color": "#fff", + "sensor": "8ohkXvFA9UrkHLsxQL38wu", + "size": "30", + "unit": "\u00b0" + }, + "type": "Sensor", + "x": 255, + "y": 185 + }, + { + "id": "8df86373-7ed9-4d49-9d29-3b80e67989ab", + "name": "Sensor Data", + "props": { + "color": "#fff", + "sensor": "8ohkXvFA9UrkHLsxQL38wu", + "size": "30", + "unit": "\u00b0" + }, + "type": "Sensor", + "x": 810, + "y": 185 + }, + { + "id": "16a0e88b-09fb-4f32-9d9a-b82d02c48190", + "name": "TargetTemp", + "props": { + "color": "#fff", + "kettle": "oHxKz3z5RjbsxfSz6KUgov", + "size": "12", + "unit": "\u00b0" + }, + "type": "TargetTemp", + "x": 260, + "y": 225 + }, + { + "id": "2204b231-ca45-4773-a110-0e4b19dfab89", + "name": "TargetTemp", + "props": { + "color": "#fff", + "kettle": "a7bWex85Z9Td4atwgazpXW", + "size": "12", + "unit": "\u00b0" + }, + "type": "TargetTemp", + "x": 820, + "y": 225 + }, + { + "id": "8f3c656c-16b7-4f81-9d6d-8219e90e87d0", "name": "CustomSVG", "props": { - "name": "tank", - "width": "100" + "name": "cbpi_svg", + "width": "50" }, "type": "CustomSVG", "x": 555, - "y": 55 + "y": 240 }, { - "id": "1f1d5ee6-1ccc-409b-a240-c81d50b71627", + "id": "2a8b37f8-c0af-4592-9771-2e6500ef4299", "name": "CustomSVG", "props": { - "name": "kettle", - "width": "100" + "name": "cbpi_svg", + "width": "50" }, "type": "CustomSVG", - "x": 795, - "y": 90 + "x": 245, + "y": 240 + }, + { + "id": "16ec8526-7f2c-4973-bf97-4ab3363e6ca1", + "name": "CustomSVG", + "props": { + "name": "cbpi_svg", + "width": "50" + }, + "type": "CustomSVG", + "x": 805, + "y": 240 + }, + { + "id": "4fecbb43-53be-4d4a-b24d-2d980777afbe", + "name": "CraftBeerPi Brewery", + "props": { + "color": "#fff", + "size": "40" + }, + "type": "Text", + "x": 45, + "y": 65 + }, + { + "id": "4996dd17-b047-4d27-8598-0563dfd444ab", + "name": "Steps", + "props": { + "width": "200" + }, + "type": "Steps", + "x": 35, + "y": 315 + }, + { + "id": "44014b52-4bf0-4136-88a7-3cb9f1882962", + "name": "Out", + "props": { + "actor": "NfYJEWbTXPUSUQzS83dfAn" + }, + "type": "ActorButton", + "x": 985, + "y": 265 + }, + { + "id": "d4a56a0e-f410-47c1-879a-ff41c6422a6e", + "name": "Sensor Data", + "props": { + "color": "red", + "sensor": "8ohkXvFA9UrkHLsxQL38wu", + "size": "40", + "unit": "\u00b0" + }, + "type": "Sensor", + "x": 555, + "y": 180 } ], "pathes": [ { + "condition": [ + "ZGJqoybWv3eWrEeGJLopFs" + ], "coordinates": [ [ - 305, - 75 + 225, + 180 ], [ - 160, - 190 - ], - [ - 245, - 460 - ], - [ - 525, - 395 - ], - [ - 560, - 75 + 115, + 180 ] ], - "id": "d22d65d2-c4db-4553-856a-e9239a79e136" + "id": "731806be-b2cb-4706-8dd1-00bfc7daa818" + }, + { + "condition": [ + "Aifjxmw4QdPfU3XbR6iyis", + "j4PnSfuWRhgZDgrQScLN7e" + ], + "coordinates": [ + [ + 480, + 400 + ], + [ + 550, + 400 + ] + ], + "id": "39c646bc-3655-433d-a989-aa25a4a1d3ab" + }, + { + "condition": [ + "Aifjxmw4QdPfU3XbR6iyis", + "j4PnSfuWRhgZDgrQScLN7e" + ], + "coordinates": [ + 
[ + 320, + 285 + ], + [ + 360, + 285 + ], + [ + 360, + 400 + ], + [ + 410, + 400 + ] + ], + "id": "3fd4d742-a9b4-4d6f-ab75-9fcfed4f5104" + }, + { + "condition": [ + "Aifjxmw4QdPfU3XbR6iyis", + "j4PnSfuWRhgZDgrQScLN7e" + ], + "coordinates": [ + [ + 535, + 175 + ], + [ + 390, + 175 + ], + [ + 390, + 215 + ], + [ + 325, + 215 + ] + ], + "id": "91f38257-788c-4255-99cf-f454c69a7d93" + }, + { + "condition": [ + "Aifjxmw4QdPfU3XbR6iyis", + "j4PnSfuWRhgZDgrQScLN7e" + ], + "coordinates": [ + [ + 580, + 380 + ], + [ + 580, + 305 + ] + ], + "id": "0f9ffe1d-0b0c-4a0e-9dbf-3931ded3d050" + }, + { + "coordinates": [ + [ + 615, + 400 + ], + [ + 680, + 400 + ] + ], + "id": "fbbd511d-b51c-43a3-95e7-1608f21fdb33" + }, + { + "coordinates": [ + [ + 780, + 180 + ], + [ + 710, + 180 + ], + [ + 710, + 380 + ] + ], + "id": "e4f7b27e-a0db-48e8-82e2-7a07f1a61dc5" + }, + { + "condition": [ + "NfYJEWbTXPUSUQzS83dfAn" + ], + "coordinates": [ + [ + 985, + 285 + ], + [ + 880, + 285 + ] + ], + "id": "0dc28018-7282-4a43-98e6-c1dd198c93d5" + }, + { + "condition": [ + "NfYJEWbTXPUSUQzS83dfAn" + ], + "coordinates": [ + [ + 1015, + 375 + ], + [ + 1015, + 300 + ] + ], + "id": "6ca9c0f9-d4a6-45cf-bfdd-b7f6740c4bc1" } ] } \ No newline at end of file diff --git a/config/config.yaml b/config/config.yaml index 893c975..b5114a8 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,14 +1,13 @@ - name: CraftBeerPi version: 4.0 index_url: /cbpi_ui/static/index.html +plugins: +- cbpi4-ui port: 8080 - # login data username: cbpi password: 123 - ws_push_all: true diff --git a/config/dashboard/widgets/brewery.svg b/config/dashboard/widgets/brewery.svg new file mode 100644 index 0000000..c874e00 --- /dev/null +++ b/config/dashboard/widgets/brewery.svg @@ -0,0 +1,7 @@ + + + + + + + diff --git a/config/dashboard/widgets/cbpi_svg.svg b/config/dashboard/widgets/cbpi_svg.svg new file mode 100644 index 0000000..6375a00 --- /dev/null +++ b/config/dashboard/widgets/cbpi_svg.svg @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/config/dashboard/widgets/kettle.svg b/config/dashboard/widgets/kettle.svg deleted file mode 100644 index a2000ef..0000000 --- a/config/dashboard/widgets/kettle.svg +++ /dev/null @@ -1,81 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/config/dashboard/widgets/kettle2.svg b/config/dashboard/widgets/kettle2.svg new file mode 100644 index 0000000..8590454 --- /dev/null +++ b/config/dashboard/widgets/kettle2.svg @@ -0,0 +1,7 @@ + + + + + + + diff --git a/config/kettle.json b/config/kettle.json index dcbb3ff..ff907ca 100644 --- a/config/kettle.json +++ b/config/kettle.json @@ -4,18 +4,20 @@ "agitator": "", "heater": "8BLRqagLicCdEBDdc77Sgr", "id": "oHxKz3z5RjbsxfSz6KUgov", - "name": "Test1111111", + "name": "MashTun", "props": {}, - "sensor": "", - "state": {}, - "target_temp": null, + "sensor": "8ohkXvFA9UrkHLsxQL38wu", + "state": { + "running": false + }, + "target_temp": 52, "type": "CustomKettleLogic" }, { "agitator": "", "heater": "", "id": "WxAkesrkqiHH3Gywc4fMci", - "name": "Test", + "name": "HLT", "props": { "Param2": "13", "Param3": 1, @@ -25,43 +27,20 @@ "sensor": "", "state": {}, "target_temp": null, - "type": "CustomKettleLogic" - }, - { - "agitator": "", - "heater": "8BLRqagLicCdEBDdc77Sgr", - "id": "gc9Bwp38jtyxkVWH5oYRNZ", - "name": "Test", - "props": { - "Param3": 1, - "Param5": "8BLRqagLicCdEBDdc77Sgr" - }, - "sensor": "", - "state": {}, - "target_temp": null, - "type": 
"CustomKettleLogic" - }, - { - "agitator": "", - "heater": "", - "id": "ZfF2N2UnEHtgExNgZJyF5i", - "name": "Test", - "props": {}, - "sensor": "", - "state": {}, - "target_temp": null, - "type": "CustomKettleLogic" - }, - { - "agitator": "", - "heater": "8BLRqagLicCdEBDdc77Sgr", - "id": "oTivUB7LueLeUWoZAnLhwp", - "name": "", - "props": {}, - "sensor": "", - "state": {}, - "target_temp": null, "type": "" + }, + { + "agitator": "", + "heater": "NjammuygecdvMpoGYc3rXt", + "id": "a7bWex85Z9Td4atwgazpXW", + "name": "Boil", + "props": {}, + "sensor": "", + "state": { + "running": false + }, + "target_temp": 55, + "type": "CustomKettleLogic" } ] } \ No newline at end of file diff --git a/config/sensor.json b/config/sensor.json index f19b37c..ffd1236 100644 --- a/config/sensor.json +++ b/config/sensor.json @@ -2,10 +2,10 @@ "data": [ { "id": "8ohkXvFA9UrkHLsxQL38wu", - "name": "Test1112222", + "name": "Sensor1", "props": {}, "state": { - "value": 49 + "value": 0 }, "type": "CustomSensor" } diff --git a/config/step_data.json b/config/step_data.json index 8bf517b..4fdb233 100644 --- a/config/step_data.json +++ b/config/step_data.json @@ -1,15 +1,43 @@ { "basic": { - "name": "" + "name": "PALE ALE" }, "profile": [ { - "id": "6mdUtsrBaWeDvKgUXJiLqu", - "name": "Test", + "id": "T2y34Mbex9KjNWXhzfCRby", + "name": "MashIn", "props": { "Param1": 123, "Param2": "HALLO", - "Param3": 1 + "Param3": 1, + "count": 1, + "wohoo": 0 + }, + "status": "P", + "type": "CustomStep2" + }, + { + "id": "RjS8Zb2GGpUtNsqHsES3yF", + "name": "Step2", + "props": { + "Param1": 123, + "Param2": "HALLO", + "Param3": 1, + "count": 0, + "wohoo": 0 + }, + "status": "I", + "type": "CustomStep2" + }, + { + "id": "WkZG4fDNxZdtZ7uoTsSHhR", + "name": "Mash Step 1", + "props": { + "Param1": 123, + "Param2": "HALLO", + "Param3": 1, + "count": 0, + "wohoo": 0 }, "status": "I", "type": "CustomStep2" diff --git a/setup.py b/setup.py index b2261fa..40968aa 100644 --- a/setup.py +++ b/setup.py @@ -28,9 +28,10 @@ setup(name='cbpi', "voluptuous==0.12.1", "pyfiglet==0.8.post1", 'pandas==1.1.5', + 'click==7.1.2', 'shortuuid==1.0.1', 'tabulate==0.8.7', - 'cbpi4-ui==0.0.2', + 'cbpi4-ui==0.0.3', ], dependency_links=[ 'https://testpypi.python.org/pypi' diff --git a/temp.zip b/temp.zip new file mode 100644 index 0000000..5ff852f Binary files /dev/null and b/temp.zip differ diff --git a/venv/lib/python3.8/site-packages/easy-install.pth b/venv/lib/python3.8/site-packages/easy-install.pth index 389022c..0cdb068 100644 --- a/venv/lib/python3.8/site-packages/easy-install.pth +++ b/venv/lib/python3.8/site-packages/easy-install.pth @@ -1,2 +1,3 @@ /Users/manuelfritsch/Documents/git/cbpi4-ui-plugin /Users/manuelfritsch/Documents/git/cbpi4-ui +/Users/manuelfritsch/Documents/git/myplugin/plugin1 diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/AUTHORS.md b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/AUTHORS.md deleted file mode 100644 index 84fcfe0..0000000 --- a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/AUTHORS.md +++ /dev/null @@ -1,56 +0,0 @@ -About the Copyright Holders -=========================== - -* Copyright (c) 2008-2011 AQR Capital Management, LLC - - AQR Capital Management began pandas development in 2008. Development was - led by Wes McKinney. AQR released the source under this license in 2009. -* Copyright (c) 2011-2012, Lambda Foundry, Inc. - - Wes is now an employee of Lambda Foundry, and remains the pandas project - lead. 
-* Copyright (c) 2011-2012, PyData Development Team - - The PyData Development Team is the collection of developers of the PyData - project. This includes all of the PyData sub-projects, including pandas. The - core team that coordinates development on GitHub can be found here: - https://github.com/pydata. - -Full credits for pandas contributors can be found in the documentation. - -Our Copyright Policy -==================== - -PyData uses a shared copyright model. Each contributor maintains copyright -over their contributions to PyData. However, it is important to note that -these contributions are typically only changes to the repositories. Thus, -the PyData source code, in its entirety, is not the copyright of any single -person or institution. Instead, it is the collective copyright of the -entire PyData Development Team. If individual contributors want to maintain -a record of what changes/contributions they have specific copyright on, -they should indicate their copyright in the commit message of the change -when they commit the change to one of the PyData repositories. - -With this in mind, the following banner should be used in any source code -file to indicate the copyright and license terms: - -``` -#----------------------------------------------------------------------------- -# Copyright (c) 2012, PyData Development Team -# All rights reserved. -# -# Distributed under the terms of the BSD Simplified License. -# -# The full license is in the LICENSE file, distributed with this software. -#----------------------------------------------------------------------------- -``` - -Other licenses can be found in the LICENSES directory. - -License -======= - -pandas is distributed under a 3-clause ("Simplified" or "New") BSD -license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have -BSD-compatible licenses, are included. Their licenses follow the pandas -license. diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/INSTALLER b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/INSTALLER deleted file mode 100644 index a1b589e..0000000 --- a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/INSTALLER +++ /dev/null @@ -1 +0,0 @@ -pip diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/LICENSE b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/LICENSE deleted file mode 100644 index 76954a5..0000000 --- a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/LICENSE +++ /dev/null @@ -1,31 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team -All rights reserved. - -Copyright (c) 2011-2020, Open source contributors. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/METADATA b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/METADATA deleted file mode 100644 index 7c4a102..0000000 --- a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/METADATA +++ /dev/null @@ -1,95 +0,0 @@ -Metadata-Version: 2.1 -Name: pandas -Version: 1.2.0 -Summary: Powerful data structures for data analysis, time series, and statistics -Home-page: https://pandas.pydata.org -Maintainer: The PyData Development Team -Maintainer-email: pydata@googlegroups.com -License: BSD -Project-URL: Bug Tracker, https://github.com/pandas-dev/pandas/issues -Project-URL: Documentation, https://pandas.pydata.org/pandas-docs/stable/ -Project-URL: Source Code, https://github.com/pandas-dev/pandas -Platform: any -Classifier: Development Status :: 5 - Production/Stable -Classifier: Environment :: Console -Classifier: Operating System :: OS Independent -Classifier: Intended Audience :: Science/Research -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 3 -Classifier: Programming Language :: Python :: 3.7 -Classifier: Programming Language :: Python :: 3.8 -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Cython -Classifier: Topic :: Scientific/Engineering -Requires-Python: >=3.7.1 -Requires-Dist: python-dateutil (>=2.7.3) -Requires-Dist: pytz (>=2017.3) -Requires-Dist: numpy (>=1.16.5) -Provides-Extra: test -Requires-Dist: pytest (>=5.0.1) ; extra == 'test' -Requires-Dist: pytest-xdist ; extra == 'test' -Requires-Dist: hypothesis (>=3.58) ; extra == 'test' - - -**pandas** is a Python package that provides fast, flexible, and expressive data -structures designed to make working with structured (tabular, multidimensional, -potentially heterogeneous) and time series data both easy and intuitive. It -aims to be the fundamental high-level building block for doing practical, -**real world** data analysis in Python. Additionally, it has the broader goal -of becoming **the most powerful and flexible open source data analysis / -manipulation tool available in any language**. It is already well on its way -toward this goal. - -pandas is well suited for many different kinds of data: - - - Tabular data with heterogeneously-typed columns, as in an SQL table or - Excel spreadsheet - - Ordered and unordered (not necessarily fixed-frequency) time series data. - - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and - column labels - - Any other form of observational / statistical data sets. 
The data actually - need not be labeled at all to be placed into a pandas data structure - -The two primary data structures of pandas, Series (1-dimensional) and DataFrame -(2-dimensional), handle the vast majority of typical use cases in finance, -statistics, social science, and many areas of engineering. For R users, -DataFrame provides everything that R's ``data.frame`` provides and much -more. pandas is built on top of `NumPy `__ and is -intended to integrate well within a scientific computing environment with many -other 3rd party libraries. - -Here are just a few of the things that pandas does well: - - - Easy handling of **missing data** (represented as NaN) in floating point as - well as non-floating point data - - Size mutability: columns can be **inserted and deleted** from DataFrame and - higher dimensional objects - - Automatic and explicit **data alignment**: objects can be explicitly - aligned to a set of labels, or the user can simply ignore the labels and - let `Series`, `DataFrame`, etc. automatically align the data for you in - computations - - Powerful, flexible **group by** functionality to perform - split-apply-combine operations on data sets, for both aggregating and - transforming data - - Make it **easy to convert** ragged, differently-indexed data in other - Python and NumPy data structures into DataFrame objects - - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** - of large data sets - - Intuitive **merging** and **joining** data sets - - Flexible **reshaping** and pivoting of data sets - - **Hierarchical** labeling of axes (possible to have multiple labels per - tick) - - Robust IO tools for loading data from **flat files** (CSV and delimited), - Excel files, databases, and saving / loading data from the ultrafast **HDF5 - format** - - **Time series**-specific functionality: date range generation and frequency - conversion, moving window statistics, date shifting and lagging. - -Many of these principles are here to address the shortcomings frequently -experienced using other languages / scientific research environments. For data -scientists, working with data is typically divided into multiple stages: -munging and cleaning data, analyzing / modeling it, then organizing the results -of the analysis into a form suitable for plotting or tabular display. pandas is -the ideal tool for all of these tasks. 
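
The cli.py rewrite above drops the argparse dispatcher and the pip-based add/remove/list commands in favour of click subcommands that only edit the `plugins:` list in config/config.yaml. A minimal usage sketch of the resulting command surface (assuming the console entry point keeps the name `cbpi` used in the old help text; the plugin name `cbpi4-ui` is simply the example already present in config.yaml):

    cbpi setup               # create the config folder structure and config.yaml
    cbpi start               # run CraftBeerPi (refuses to start before setup)
    cbpi plugins             # list plugins activated in config/config.yaml
    cbpi add cbpi4-ui        # activate a plugin by appending it to the plugins list
    cbpi remove cbpi4-ui     # deactivate a plugin
    cbpi create myplugin     # scaffold a plugin folder from the GitHub template

Note that `add` and `remove` no longer install or uninstall packages; they only toggle the entry in config.yaml, so the plugin package itself still has to be installed (for example with pip) before `load_plugins_from_evn` can import it.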
-pandas/core/indexes/category.py,sha256=vQ67dRQYB0f2yVodAjASWZCamgbGQw6KU5KlJ_vlgAs,22857 -pandas/core/indexes/datetimelike.py,sha256=rA2qdppzJuIjTAF8rXhMXOssK4dc27v82nMUeRNl_Es,30263 -pandas/core/indexes/datetimes.py,sha256=zHouJbbQ2t32G4rdpf60JAf9Qzw5jF_OodpgLzNfbnA,40191 -pandas/core/indexes/extension.py,sha256=CC-3yfwCRqLwQ2_e94SXk6e_3opO4XTArlZko-bb4Uc,12483 -pandas/core/indexes/frozen.py,sha256=fDq5HfmcUAK_OPmXlRqx_rb_MF0HQ2PgSNX6RPSe22w,3078 -pandas/core/indexes/interval.py,sha256=mK3ehLqoGbP_7mSsqIn6dFUVMHhOgzZcZHDTUCD30P0,43674 -pandas/core/indexes/multi.py,sha256=HxDsdT5xT5e9aD3cq6p2cMdsqfCR3WWQrNOsevGQWqc,133109 -pandas/core/indexes/numeric.py,sha256=y-QidjHnZtfUi6gF88fX0o2lQNVrbCbmjV9sXrRKqaQ,13271 -pandas/core/indexes/period.py,sha256=LJeeHbiG5DIaKv0Cgu8kQqQw_Z0F7nxU3IKgZZOokqY,25613 -pandas/core/indexes/range.py,sha256=aUXSKLSe_WE9z8WcQVNNmMwNAQR6KpWPDBfy74L2HOA,29648 -pandas/core/indexes/timedeltas.py,sha256=t1AUVoXbhwZktbcOXjNSVOqb0AIx3R23VkB5T27i2-g,9840 -pandas/core/indexing.py,sha256=Skzg1g2cswgIAOmtkx0Km8R9TVqjmMwoHp2egOfjVXU,80647 -pandas/core/internals/__init__.py,sha256=MhXTDdNOFR4FJTc3u-BYKl_9yeE8bqc7Sf5qNBKjIsk,1035 -pandas/core/internals/__pycache__/__init__.cpython-38.pyc,, -pandas/core/internals/__pycache__/blocks.cpython-38.pyc,, -pandas/core/internals/__pycache__/concat.cpython-38.pyc,, -pandas/core/internals/__pycache__/construction.cpython-38.pyc,, -pandas/core/internals/__pycache__/managers.cpython-38.pyc,, -pandas/core/internals/__pycache__/ops.cpython-38.pyc,, -pandas/core/internals/blocks.py,sha256=GlvDLjQh6LYuCLs-VRN3EvZ3NXRlFdpW5EU_CTpjuoI,93770 -pandas/core/internals/concat.py,sha256=UWw4hYmEdsAQwd38GDGjRF1rMMOhtWOddaSO2oYtq1k,20709 -pandas/core/internals/construction.py,sha256=5yvnj_Wc_m6eoKI-OX-AqfBlfTT2AIe6pEWSVz-nqMk,24036 -pandas/core/internals/managers.py,sha256=KsD6JqF3hFqkihaJbX_L6rr4Pw8S_mjAjeSKelj_qJ8,62958 -pandas/core/internals/ops.py,sha256=0D2sHmCcih802J_YfjIvbFSM1h0DOyMDAYKu53dsvxo,4427 -pandas/core/missing.py,sha256=tHld1cVHSLoCZ-ohNF8O4EbaOO8UU85Ipg9pZ2el9gc,24510 -pandas/core/nanops.py,sha256=VKEEpqTClovHr6LoOTxKmuWCVMyBhvKjZHZZJ8aU2jg,49256 -pandas/core/ops/__init__.py,sha256=VguRwY31VDapp9QIw5cASZhT8ufJO0DAUXPOh7y2i6s,13971 -pandas/core/ops/__pycache__/__init__.cpython-38.pyc,, -pandas/core/ops/__pycache__/array_ops.cpython-38.pyc,, -pandas/core/ops/__pycache__/common.cpython-38.pyc,, -pandas/core/ops/__pycache__/dispatch.cpython-38.pyc,, -pandas/core/ops/__pycache__/docstrings.cpython-38.pyc,, -pandas/core/ops/__pycache__/invalid.cpython-38.pyc,, -pandas/core/ops/__pycache__/mask_ops.cpython-38.pyc,, -pandas/core/ops/__pycache__/methods.cpython-38.pyc,, -pandas/core/ops/__pycache__/missing.cpython-38.pyc,, -pandas/core/ops/__pycache__/roperator.cpython-38.pyc,, -pandas/core/ops/array_ops.py,sha256=xDHMGMuHll47KrJG89cXcR4on12jHHq1a2-U7VNWh7Q,14795 -pandas/core/ops/common.py,sha256=AXrcNVh_ZpkZRzt4oHc9rZndE1H4ikKsloZkgHICjWc,2935 -pandas/core/ops/dispatch.py,sha256=G2SEvrbKI5iIg0sPBCOxk4s3c-kVBZ7jDlAyL-jHEFI,549 -pandas/core/ops/docstrings.py,sha256=sWRq-KuecXx7Jaewr7lXJCWb7aYo711Du40DjwWL_eU,17706 -pandas/core/ops/invalid.py,sha256=sUm_2E_QJhW9kiDWlvcDQjM2U8OdUgPEj2Dv2iQE8aM,1285 -pandas/core/ops/mask_ops.py,sha256=WqezxdvCmKEXHIWihblKYsKyGWjqx1YcCb9vkPTNlWM,4935 -pandas/core/ops/methods.py,sha256=4MweMQ5wsPUcxFqK5fi1WbvITmLmk3ZD2XNEnMDzD5M,3690 -pandas/core/ops/missing.py,sha256=Me34ivzzDtFS96yd1paLXl6AQ1c_1iOfiU5mpHtcsxI,5164 -pandas/core/ops/roperator.py,sha256=F8ULcax62DJD-0JEax2vBsIGf5vBLvuKhDjQR244AUc,1080 
-pandas/core/resample.py,sha256=rToXML26Zbj8GxRw2LZ01hu9owymKxClN6jKTCRt7as,63164 -pandas/core/reshape/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/core/reshape/__pycache__/__init__.cpython-38.pyc,, -pandas/core/reshape/__pycache__/api.cpython-38.pyc,, -pandas/core/reshape/__pycache__/concat.cpython-38.pyc,, -pandas/core/reshape/__pycache__/melt.cpython-38.pyc,, -pandas/core/reshape/__pycache__/merge.cpython-38.pyc,, -pandas/core/reshape/__pycache__/pivot.cpython-38.pyc,, -pandas/core/reshape/__pycache__/reshape.cpython-38.pyc,, -pandas/core/reshape/__pycache__/tile.cpython-38.pyc,, -pandas/core/reshape/__pycache__/util.cpython-38.pyc,, -pandas/core/reshape/api.py,sha256=pRwwOuS6LNFLQOEFVrVKgvJk2ZrjIPf14ZHmnrN_7vA,365 -pandas/core/reshape/concat.py,sha256=Cw_jYQKzcZd_Xbe0vz18QxaNyZV9myx7zrLQH3-KKcs,22724 -pandas/core/reshape/melt.py,sha256=fjqzzZ1pnnyD6Gh9Q_A1huXTn5xOubQ5DWTJhNc9FnQ,17922 -pandas/core/reshape/merge.py,sha256=UliPUObg2_iGtr6wqBwY5dnLEZiPcZzaDiJiZaOUqGo,76890 -pandas/core/reshape/pivot.py,sha256=XyH3MEV_W7r05jJfEymsHf4zh5hyrd6RoObKx2Y8iNo,25283 -pandas/core/reshape/reshape.py,sha256=BVzbn3w-d2MuWzcGXpkzmzpInTAxAORUduo3OIRYA0s,34913 -pandas/core/reshape/tile.py,sha256=4kAV2L4lbmVagbgT5cQfLaAv3M7Y4fV1u1TSYwswt14,20930 -pandas/core/reshape/util.py,sha256=Gik84iHEE2Gz7G0hWcbBFCCu466NpFMO-Brcsx6Ilyk,1662 -pandas/core/series.py,sha256=OS3BxDNJhfSAA7bi56SMiP9v4qLcVJxJChEejCbffP4,152436 -pandas/core/shared_docs.py,sha256=ImsQ9coOmAOT3m0_mk8osRtWR9vMoguBAdKpRD3MaUc,10730 -pandas/core/sorting.py,sha256=-93QANFjQJLp1SI28WHdgVniGoBy7-UJtickXVZAhFk,19545 -pandas/core/sparse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/core/sparse/__pycache__/__init__.cpython-38.pyc,, -pandas/core/sparse/__pycache__/api.cpython-38.pyc,, -pandas/core/sparse/api.py,sha256=g4O6jA1PpcOWOFCAeR8SELO2CGaZVKjIAN6sqvA-KUw,105 -pandas/core/strings/__init__.py,sha256=Z4NJaPl5H21aPinpUUy6oo7MynXg9Yq7QeR5Hdia360,1187 -pandas/core/strings/__pycache__/__init__.cpython-38.pyc,, -pandas/core/strings/__pycache__/accessor.cpython-38.pyc,, -pandas/core/strings/__pycache__/base.cpython-38.pyc,, -pandas/core/strings/__pycache__/object_array.cpython-38.pyc,, -pandas/core/strings/accessor.py,sha256=II8fM6i25kvwaER3UPmmptJauCM3qopuTKSOnF_shDg,100131 -pandas/core/strings/base.py,sha256=PTCix5jU4cFvLDniS49ToGyRc0JuCNG-9LJLKBmIPHg,4702 -pandas/core/strings/object_array.py,sha256=IKYBP_JDCYP8yqomfRXgrsR-GKMmWNAwh9fr2E86fiw,13921 -pandas/core/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/core/tools/__pycache__/__init__.cpython-38.pyc,, -pandas/core/tools/__pycache__/datetimes.cpython-38.pyc,, -pandas/core/tools/__pycache__/numeric.cpython-38.pyc,, -pandas/core/tools/__pycache__/timedeltas.cpython-38.pyc,, -pandas/core/tools/__pycache__/times.cpython-38.pyc,, -pandas/core/tools/datetimes.py,sha256=12Of5Y2QkTwzXw7Dn1ENr1giIIfd1iMvAoMX2GWJKF0,32551 -pandas/core/tools/numeric.py,sha256=9gQTAO02rLYuLKpW3w72H3V_XCfnXlUakLwm4MVi4pU,6786 -pandas/core/tools/timedeltas.py,sha256=yTCGx6BNQyHYPJNnDf2k5uQfMQfXAj1ClinlYxgu7O0,6348 -pandas/core/tools/times.py,sha256=ZVXatwM60JiqYi-0YIykdgGZAY8VczJ543Kj7qvA8mI,4611 -pandas/core/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/core/util/__pycache__/__init__.cpython-38.pyc,, -pandas/core/util/__pycache__/hashing.cpython-38.pyc,, -pandas/core/util/__pycache__/numba_.cpython-38.pyc,, -pandas/core/util/hashing.py,sha256=vCijE6n-8-5UWrON5XR_rPAuowgEGmz5iPnxI5pZiRs,8983 
-pandas/core/util/numba_.py,sha256=pGhSypExTFJfTOuIvYBD0IW3gPeH2QexGupKYJX2WpQ,3088 -pandas/core/window/__init__.py,sha256=wmIH3975rmB-XleS0JqJHR5dTchnk0hnIh1luO_CkjM,283 -pandas/core/window/__pycache__/__init__.cpython-38.pyc,, -pandas/core/window/__pycache__/common.cpython-38.pyc,, -pandas/core/window/__pycache__/ewm.cpython-38.pyc,, -pandas/core/window/__pycache__/expanding.cpython-38.pyc,, -pandas/core/window/__pycache__/indexers.cpython-38.pyc,, -pandas/core/window/__pycache__/numba_.cpython-38.pyc,, -pandas/core/window/__pycache__/rolling.cpython-38.pyc,, -pandas/core/window/common.py,sha256=tyiGmBg0N0W3fKTMp122mFlur8O6MGz6QDEl7IpNIVg,7052 -pandas/core/window/ewm.py,sha256=J3UxZUufIhbxMHaJANoaW3XR_bXSHkdH89wMQfzqbUI,19354 -pandas/core/window/expanding.py,sha256=HHELlTi_yaYH1LzUgmout1GvQ7pcZw7TaiRtXKoHt0Q,8393 -pandas/core/window/indexers.py,sha256=_dGXpApvoUzJRWYQ0KmcDysXOAN_Zc2Pma6447aMMv8,11553 -pandas/core/window/numba_.py,sha256=_HtCdt_iuEBDgQ3K8Jaab79veSmOlRWYbuodwrZ0nnQ,4809 -pandas/core/window/rolling.py,sha256=af_0rNMIdf7olFHHsNNUSpWShYPQqpZtAinixHXYNtY,67366 -pandas/errors/__init__.py,sha256=fAxSPV3tbD-Pf02kf_xO63IY8zKdcB4V8TsmAlOosDo,6567 -pandas/errors/__pycache__/__init__.cpython-38.pyc,, -pandas/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/io/__pycache__/__init__.cpython-38.pyc,, -pandas/io/__pycache__/api.cpython-38.pyc,, -pandas/io/__pycache__/clipboards.cpython-38.pyc,, -pandas/io/__pycache__/common.cpython-38.pyc,, -pandas/io/__pycache__/date_converters.cpython-38.pyc,, -pandas/io/__pycache__/feather_format.cpython-38.pyc,, -pandas/io/__pycache__/gbq.cpython-38.pyc,, -pandas/io/__pycache__/html.cpython-38.pyc,, -pandas/io/__pycache__/orc.cpython-38.pyc,, -pandas/io/__pycache__/parquet.cpython-38.pyc,, -pandas/io/__pycache__/parsers.cpython-38.pyc,, -pandas/io/__pycache__/pickle.cpython-38.pyc,, -pandas/io/__pycache__/pytables.cpython-38.pyc,, -pandas/io/__pycache__/spss.cpython-38.pyc,, -pandas/io/__pycache__/sql.cpython-38.pyc,, -pandas/io/__pycache__/stata.cpython-38.pyc,, -pandas/io/api.py,sha256=pvcFoUIpJ3vb-eyIKxX6YyfxyZGrFT4Jrkyj7TSneDk,726 -pandas/io/clipboard/__init__.py,sha256=9sc7tRNL0LHlnUYYGj8QNxyRMkPXZMjhkR1J2yJn5kQ,21543 -pandas/io/clipboard/__pycache__/__init__.cpython-38.pyc,, -pandas/io/clipboards.py,sha256=Q6QJ-FOiPEks2e7gI6ydMUI0VZsGsW-NPpUrztF0aZo,4337 -pandas/io/common.py,sha256=e3aN4mFPUDnRlJdhiALIwc30AIk08f8I6Q4M6GyBhoI,27076 -pandas/io/date_converters.py,sha256=Qa_A15x0Wb5ENtq6Ylg68-k0oOa3Gqs56DJTN-AS0pY,3560 -pandas/io/excel/__init__.py,sha256=PjVe9ZS4TIIwUK3ZI1vS2x64s2IgrPzkW8OnrMV4foY,561 -pandas/io/excel/__pycache__/__init__.cpython-38.pyc,, -pandas/io/excel/__pycache__/_base.cpython-38.pyc,, -pandas/io/excel/__pycache__/_odfreader.cpython-38.pyc,, -pandas/io/excel/__pycache__/_odswriter.cpython-38.pyc,, -pandas/io/excel/__pycache__/_openpyxl.cpython-38.pyc,, -pandas/io/excel/__pycache__/_pyxlsb.cpython-38.pyc,, -pandas/io/excel/__pycache__/_util.cpython-38.pyc,, -pandas/io/excel/__pycache__/_xlrd.cpython-38.pyc,, -pandas/io/excel/__pycache__/_xlsxwriter.cpython-38.pyc,, -pandas/io/excel/__pycache__/_xlwt.cpython-38.pyc,, -pandas/io/excel/_base.py,sha256=DfP847G88APvbYez10HpeGLHqD-bFwAjvhDGJY2T6yw,42037 -pandas/io/excel/_odfreader.py,sha256=IU_wWQPCMJx1hfuTWoq_G7YUQ3Mj1U0Rf4Ww8Fk3fOU,7377 -pandas/io/excel/_odswriter.py,sha256=x3GqN--SK3ZbOg8v6zWHJmdaKOswGuiTb5LIChR0p2E,9394 -pandas/io/excel/_openpyxl.py,sha256=lDVl-mp0_ZPMmVpa1UOPfQOl8VJVuy0UZpCsgTKO9XI,16191 
-pandas/io/excel/_pyxlsb.py,sha256=2wUyH71booh0pSJemRfNqXlKv6L5AKKKW7RofFkGGZQ,2573 -pandas/io/excel/_util.py,sha256=TF1N10Baw01Xrx0v54FnfDPvzee7C10po9Wum43o23A,5415 -pandas/io/excel/_xlrd.py,sha256=B_dGU9EiACtriDYZzzIinOhj91Nq5yGGkROGNl7ZTBc,3624 -pandas/io/excel/_xlsxwriter.py,sha256=FFpBe-qY0dccE8Tfq8qHUSF01ssp4jaQs2clZ7cu6AU,8066 -pandas/io/excel/_xlwt.py,sha256=YFthUF8PYUFnv-uZEH3UgWkq4tWaSrUP0QystnFfRdk,4791 -pandas/io/feather_format.py,sha256=3V7GoD-crbAJ21b9NCjwCBEC5NYT-R25KeSalPKXKJ4,3807 -pandas/io/formats/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/io/formats/__pycache__/__init__.cpython-38.pyc,, -pandas/io/formats/__pycache__/_color_data.cpython-38.pyc,, -pandas/io/formats/__pycache__/console.cpython-38.pyc,, -pandas/io/formats/__pycache__/css.cpython-38.pyc,, -pandas/io/formats/__pycache__/csvs.cpython-38.pyc,, -pandas/io/formats/__pycache__/excel.cpython-38.pyc,, -pandas/io/formats/__pycache__/format.cpython-38.pyc,, -pandas/io/formats/__pycache__/html.cpython-38.pyc,, -pandas/io/formats/__pycache__/info.cpython-38.pyc,, -pandas/io/formats/__pycache__/latex.cpython-38.pyc,, -pandas/io/formats/__pycache__/printing.cpython-38.pyc,, -pandas/io/formats/__pycache__/string.cpython-38.pyc,, -pandas/io/formats/__pycache__/style.cpython-38.pyc,, -pandas/io/formats/_color_data.py,sha256=nZOC4hv88N33FbfZBUfd_-Chd59Fxxw7xkPujdZFOu0,4296 -pandas/io/formats/console.py,sha256=sZCCnhg1XXCrdesweQN5yvhlk9zkqnsJndDuhZ_w2ps,2784 -pandas/io/formats/css.py,sha256=yTbRGb3vLCK7YfQbnQy275J0Kr_iy8WYCEiVv-9PCbs,8810 -pandas/io/formats/csvs.py,sha256=pjuRqnKLTBKTaS7FjDkmNFi-fjlMBH2NE1pv7pCDhLo,9942 -pandas/io/formats/excel.py,sha256=EMUXC509nTMsdP9ghwvuLMmTb2FqeGbFggIB1AyS18s,29098 -pandas/io/formats/format.py,sha256=lxvWAY0GGZj-nMkZn-Z7TUrSzZKio2PAvCWc58lxsb0,65608 -pandas/io/formats/html.py,sha256=xNEpOP99TdE3Qrjj8vTfXxykUgV7DAbcBUIrFxDeA7s,23192 -pandas/io/formats/info.py,sha256=-lG9osuvJy7A9eNHhAoOQExlsHzbinpYDVX8SVCc2bo,20630 -pandas/io/formats/latex.py,sha256=bQ2es3h_zjMNnTRiud2KYuLtFWGo0IxZdtC70uvfBWw,25201 -pandas/io/formats/printing.py,sha256=pGf33TJmsbSZegctXxuYIGZhKP9WPDtPLx7YnDy6Ty0,17127 -pandas/io/formats/string.py,sha256=nr9To1-7ajRFdd5mDNbJgFV3Gt7plGyTJu6pOJJ7UOg,6627 -pandas/io/formats/style.py,sha256=UgEhekhQ5GIhxK6yQorXBFIo5WkKUtfaqASB9f3bWJQ,57222 -pandas/io/formats/templates/html.tpl,sha256=3RJIj2ZMLa_v6dGDMHWPp_q2SYJhu0JuxFEB66JQ23Q,2126 -pandas/io/gbq.py,sha256=rI3xatdaIuLv9g9ET4wLIgjJA8WqAhE9aT0EE9fX46g,8183 -pandas/io/html.py,sha256=JFrTp2Dvhu-Deyox73KzH5Y2J3klAvWyVPGeWo9Xp7c,34621 -pandas/io/json/__init__.py,sha256=D1g5ROWGdB9OhvKZi6-mYt2QIllqc7Te_xXbesD-2Oc,340 -pandas/io/json/__pycache__/__init__.cpython-38.pyc,, -pandas/io/json/__pycache__/_json.cpython-38.pyc,, -pandas/io/json/__pycache__/_normalize.cpython-38.pyc,, -pandas/io/json/__pycache__/_table_schema.cpython-38.pyc,, -pandas/io/json/_json.py,sha256=jF2w4H5LazaWFPCuZWHRpZys0AJN2nrWphyaiZd7ukk,37314 -pandas/io/json/_normalize.py,sha256=dXiKItfjlWM5uHTaZScJMX8Va7sdiI6Xli2VGCaTxZc,12555 -pandas/io/json/_table_schema.py,sha256=hwg9e5lnGf0SfE-0Oir2ijU4YSl-TpDDaBwVMzfQFyI,10308 -pandas/io/orc.py,sha256=46AD1FGmKwHigbzrLC1wmAh66VVxJ0xsj7xEC6PvMEY,1646 -pandas/io/parquet.py,sha256=sGsVEdDgMCnS5Sw8dG5F90YaqNeiS6IJDe1KMX1rwMg,16101 -pandas/io/parsers.py,sha256=UzTTCjJf_wgDaBusNGtLG2kPDvbHMbyDs-eYNQ6LKps,139466 -pandas/io/pickle.py,sha256=JGWEXym_4lU5268xYRfCN2Ou2xfpIONpJipPSn2lLLs,6467 -pandas/io/pytables.py,sha256=lkgZM82eJKSwUa5ApZ-B8FpNUVkHyeXQ2xXnrhpCJnE,167155 
-pandas/io/sas/__init__.py,sha256=l74xCbedVGG8EzuApVbpGlttu5KFgxVGQFdvevtAkQw,53 -pandas/io/sas/__pycache__/__init__.cpython-38.pyc,, -pandas/io/sas/__pycache__/sas7bdat.cpython-38.pyc,, -pandas/io/sas/__pycache__/sas_constants.cpython-38.pyc,, -pandas/io/sas/__pycache__/sas_xport.cpython-38.pyc,, -pandas/io/sas/__pycache__/sasreader.cpython-38.pyc,, -pandas/io/sas/_sas.cpython-38-darwin.so,sha256=i4J8iNQAhhGGCus1QaW2iuNjXPmAN4ArJrUxmWlWxdI,221588 -pandas/io/sas/sas7bdat.py,sha256=uFgejxv2i_DUxeCTEI9NjlhaRU5o0YhRcsT9cQ13JQU,29205 -pandas/io/sas/sas_constants.py,sha256=1osy4oIK4siNYqILPpHOmPqrDFhpZL8c06ywvGFEtmk,6731 -pandas/io/sas/sas_xport.py,sha256=CQY3ALKfCqoaB0smjWmn4kIDqBB2CcZV5xThK_IqhzQ,14035 -pandas/io/sas/sasreader.py,sha256=pGQJZ7fjtgPwNxiDQTwDbi_tVCRLBGVI-U40IBJ4iKs,4364 -pandas/io/spss.py,sha256=nhjPjrd4ta8v_tcBmUrCaPEacyRIdoGVVD5uQhk8CiA,1255 -pandas/io/sql.py,sha256=eNSKM7q6GpE4sKEaD09S8jLrXFDoSo0CfDF9TuWXk5A,63263 -pandas/io/stata.py,sha256=VYGvUccmIzncG5GVA43JYQ2Wom03tm1iCsNXW137euo,125569 -pandas/plotting/__init__.py,sha256=W_2wP9v02mNCK4lV5ekG1iJHYSF8dD1NbByJiNq3g8I,2826 -pandas/plotting/__pycache__/__init__.cpython-38.pyc,, -pandas/plotting/__pycache__/_core.cpython-38.pyc,, -pandas/plotting/__pycache__/_misc.cpython-38.pyc,, -pandas/plotting/_core.py,sha256=CdH3pDPelCEs-xCu4sV7J5-_W9BA9zsJlyMBwPNBY98,61416 -pandas/plotting/_matplotlib/__init__.py,sha256=zF6Tb0T2Wo4ZwKxDSMXkYhrXG5dipe3F4lRNKx1EYL0,1988 -pandas/plotting/_matplotlib/__pycache__/__init__.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/boxplot.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/compat.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/converter.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/core.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/hist.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/misc.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/style.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/timeseries.cpython-38.pyc,, -pandas/plotting/_matplotlib/__pycache__/tools.cpython-38.pyc,, -pandas/plotting/_matplotlib/boxplot.py,sha256=et0xJn0BZ-oCi23KbKH30TjY8y6ArYdeytsH6-wxpTQ,14600 -pandas/plotting/_matplotlib/compat.py,sha256=YAASClCfooQ1a0mVafkW8XyHMRoJe10t5khUL_Mdi9E,651 -pandas/plotting/_matplotlib/converter.py,sha256=eBf3WE4RsA0ARHyDv7aaqMjy5R8ceajQrBZa8dS2DSA,35530 -pandas/plotting/_matplotlib/core.py,sha256=y3w1ytu4Hlam0O80IrXUfYIZ3Mem4hvX9v62sAs6zD8,54900 -pandas/plotting/_matplotlib/hist.py,sha256=b0QqlInp6ArI8GL2XJRmyd8qqxppopeb2zYA1GYQKfY,11993 -pandas/plotting/_matplotlib/misc.py,sha256=j98gNSKQmT3Qt1wZmlxV1dI8KtkgWKP6Lv6d7EkkaRU,13023 -pandas/plotting/_matplotlib/style.py,sha256=EUTFfgi-WDgVgaKksH7YWOG0iN47nPglyAJVkjD5-4c,8132 -pandas/plotting/_matplotlib/timeseries.py,sha256=HE6QWwpQa4jnqSYyi1OKQ7PNQj_wULbd4WRq9dxbGsc,10037 -pandas/plotting/_matplotlib/tools.py,sha256=luuf7US8ca8IRmb8GG8kat1gSl2-uCzPBoL4QFjFArs,14395 -pandas/plotting/_misc.py,sha256=aXB7QDekPR3Jrt0_TsI-3Nnf6x3jUqu6hjzx9PtLAHw,15959 -pandas/testing.py,sha256=R-qMoJX9R-8-XhM57EirjGYyY3hRnbqHlR8PgcP6Zk0,312 -pandas/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/__pycache__/test_aggregation.cpython-38.pyc,, -pandas/tests/__pycache__/test_algos.cpython-38.pyc,, -pandas/tests/__pycache__/test_common.cpython-38.pyc,, -pandas/tests/__pycache__/test_downstream.cpython-38.pyc,, -pandas/tests/__pycache__/test_errors.cpython-38.pyc,, 
-pandas/tests/__pycache__/test_expressions.cpython-38.pyc,, -pandas/tests/__pycache__/test_flags.cpython-38.pyc,, -pandas/tests/__pycache__/test_multilevel.cpython-38.pyc,, -pandas/tests/__pycache__/test_nanops.cpython-38.pyc,, -pandas/tests/__pycache__/test_optional_dependency.cpython-38.pyc,, -pandas/tests/__pycache__/test_register_accessor.cpython-38.pyc,, -pandas/tests/__pycache__/test_sorting.cpython-38.pyc,, -pandas/tests/__pycache__/test_strings.cpython-38.pyc,, -pandas/tests/__pycache__/test_take.cpython-38.pyc,, -pandas/tests/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/api/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/api/__pycache__/test_api.cpython-38.pyc,, -pandas/tests/api/__pycache__/test_types.cpython-38.pyc,, -pandas/tests/api/test_api.py,sha256=-4IrZ44eVqIBZYW_Kc89k-CJFyZhW5qib8ikEAqB454,7706 -pandas/tests/api/test_types.py,sha256=hCK0T11SGVVGkWcB09oI8a88whWTBjeWmwuMSUJn_IA,1684 -pandas/tests/arithmetic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arithmetic/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/common.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_array_ops.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_categorical.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_datetime64.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_interval.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_numeric.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_object.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_period.cpython-38.pyc,, -pandas/tests/arithmetic/__pycache__/test_timedelta64.cpython-38.pyc,, -pandas/tests/arithmetic/common.py,sha256=rmRbb7-V5Ia8uLrz1wTj1SkGFWtDM1EbdvjHdknLxRI,3243 -pandas/tests/arithmetic/conftest.py,sha256=yPccaciTyxywgDPKzF4RiO5SBzee9ZCuBcuL_3ad19Q,5751 -pandas/tests/arithmetic/test_array_ops.py,sha256=Jph25VTHpSkEitqhI357NOgP1DadLK1SWgLY1VdZbdg,1051 -pandas/tests/arithmetic/test_categorical.py,sha256=Lh01H0x566CsVTWwAwxgH7A4XZ97PLNLuB2BntMANx0,358 -pandas/tests/arithmetic/test_datetime64.py,sha256=BmtjAYsG6omk9gOSZjH8ePpxWe7LjFwebcQgjMu1iZc,91671 -pandas/tests/arithmetic/test_interval.py,sha256=WyCk1YjOpQ8tStm7xODq6J4J61cDsOEcB1LnTmiqA4w,10355 -pandas/tests/arithmetic/test_numeric.py,sha256=YK-NmbTjQLOP-Hub0vB7sSg2DSib4Pi51kUBkK79jgk,48540 -pandas/tests/arithmetic/test_object.py,sha256=1eeKJm6v7Fbf0QJWUkBJAi4UwIpVCYmSLMGypMHI9-U,12144 -pandas/tests/arithmetic/test_period.py,sha256=8oV-6JY5Jk_6IMHSK6404dl1CuVCoLP03amWtU9Llcc,56214 -pandas/tests/arithmetic/test_timedelta64.py,sha256=J_A1hMjM_oyTyVXlOlyOrwxfoWPPtALmMjeTBnptIls,79354 -pandas/tests/arrays/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/__pycache__/test_array.cpython-38.pyc,, -pandas/tests/arrays/__pycache__/test_datetimelike.cpython-38.pyc,, -pandas/tests/arrays/__pycache__/test_datetimes.cpython-38.pyc,, -pandas/tests/arrays/__pycache__/test_numpy.cpython-38.pyc,, -pandas/tests/arrays/__pycache__/test_period.cpython-38.pyc,, -pandas/tests/arrays/__pycache__/test_timedeltas.cpython-38.pyc,, -pandas/tests/arrays/boolean/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/boolean/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_arithmetic.cpython-38.pyc,, 
-pandas/tests/arrays/boolean/__pycache__/test_astype.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_comparison.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_construction.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_function.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_indexing.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_logical.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_ops.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_reduction.cpython-38.pyc,, -pandas/tests/arrays/boolean/__pycache__/test_repr.cpython-38.pyc,, -pandas/tests/arrays/boolean/test_arithmetic.py,sha256=86LO8wQINI69JfjYCPULax3wmnMW-SANckufxw6Ww7s,2819 -pandas/tests/arrays/boolean/test_astype.py,sha256=jeAO8vaP3wl-ztAhmctM90RIfXuP_XJc76dF7Su6aNQ,1603 -pandas/tests/arrays/boolean/test_comparison.py,sha256=rJl3ZfyRMUUHNWW0BmZB3Konjr-TY0Ea9lYa3Uh1zWY,3103 -pandas/tests/arrays/boolean/test_construction.py,sha256=QEB5jW6tyfEFqUfr_sblnzEaQ4VKwYoyuXrwW3-P0VU,12857 -pandas/tests/arrays/boolean/test_function.py,sha256=v1tKuvQSXqmSBz2UUVh3oQN9jcVy9jH7mq4jkBN1IaA,3536 -pandas/tests/arrays/boolean/test_indexing.py,sha256=BorrK8_ZJbN5HWcIX9fCP-BbTCaJsgAGUiza5IwhYr4,361 -pandas/tests/arrays/boolean/test_logical.py,sha256=zG2i5fgP7IQOtD3z_GtV4y802MQ1ppyswbmo0SnLKZI,8486 -pandas/tests/arrays/boolean/test_ops.py,sha256=4bwSXbXRldT2OvuWCVGgnJLXoe2V99-s0_vJIQQ68KQ,745 -pandas/tests/arrays/boolean/test_reduction.py,sha256=KJx4QD5sy94U0TrGuugFzBnp4XtjFahHsSQZwMY4KUo,2017 -pandas/tests/arrays/boolean/test_repr.py,sha256=RRljPIDi6jDNhUdbjKMc75Mst-wm92l-H6b5Y-lCCJA,437 -pandas/tests/arrays/categorical/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/categorical/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/common.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_algos.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_analytics.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_api.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_constructors.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_dtypes.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_indexing.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_missing.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_operators.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_replace.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_repr.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_sorting.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_subclass.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_take.cpython-38.pyc,, -pandas/tests/arrays/categorical/__pycache__/test_warnings.cpython-38.pyc,, -pandas/tests/arrays/categorical/common.py,sha256=lQ8b1Pb6EWPTcKE-PQ9cymIcuDWYzH1JDyxmU6PaOi4,204 -pandas/tests/arrays/categorical/conftest.py,sha256=AmIOXNVnHCRF2-kO0cLEU6YDahNTFvUrHAbYSv3HxrA,166 -pandas/tests/arrays/categorical/test_algos.py,sha256=0bNc6nJ4oaD-RJAa6D4RX4TJpHjd9-PZ9LF8DIK8pG4,2589 -pandas/tests/arrays/categorical/test_analytics.py,sha256=nKpfeR5YvlXosmjsQPZcuQoAMmezdTt2TkMdrCssD5k,14429 -pandas/tests/arrays/categorical/test_api.py,sha256=gvX7lOZDFoAUCtyilUCgjphWqpPFA66lckGp2mXVjRE,20671 
-pandas/tests/arrays/categorical/test_constructors.py,sha256=daG-fXldaCVFHqNcmyYeLFAbyHgeW9iM4xdESvGT1Tc,27512 -pandas/tests/arrays/categorical/test_dtypes.py,sha256=Zp_ssz35J1c6qc7bGMsyySGkhdb0RYbnEy-YdVBI6f4,7343 -pandas/tests/arrays/categorical/test_indexing.py,sha256=vohK1X-6qms4RxKK50I4QKkdHONYzBkZGcvM9fcGA4A,12452 -pandas/tests/arrays/categorical/test_missing.py,sha256=5Knlv05eI4VYZ6-AZhlb4Ut1kUU6fBVakRPEY8-sU7U,6550 -pandas/tests/arrays/categorical/test_operators.py,sha256=WOWYH_ZwlAt6yooJLu1Lg4yaz_80TPP_eeTv20J5A5Q,15558 -pandas/tests/arrays/categorical/test_replace.py,sha256=u8Thfxjvsqb7I2VnnhCmWQp-nJzzF6LP4JcNfvu1O5w,2708 -pandas/tests/arrays/categorical/test_repr.py,sha256=qPV4O57AHAN8Oi0nZEJWDp2kbnWfNNoTHP_ZPYbJ2OA,26265 -pandas/tests/arrays/categorical/test_sorting.py,sha256=EQFXTYRygVTFGxaQviN6oh0N1WyndkhwbtYveGVdGhY,5040 -pandas/tests/arrays/categorical/test_subclass.py,sha256=v-VtvFFLSUt9zXoLOf7YLbda4q0NQVqRWck9x2qtXys,852 -pandas/tests/arrays/categorical/test_take.py,sha256=9By-IRuQqd-2ws9wBx68DaGJKQ2EXSCUpKQB7tg4dy0,3644 -pandas/tests/arrays/categorical/test_warnings.py,sha256=xm4bh8aymFC3fQYNU0pQxgCkDpTUXyBsyJbtcZNhWJw,919 -pandas/tests/arrays/floating/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/floating/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_astype.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_comparison.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_concat.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_construction.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_function.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_repr.cpython-38.pyc,, -pandas/tests/arrays/floating/__pycache__/test_to_numpy.cpython-38.pyc,, -pandas/tests/arrays/floating/conftest.py,sha256=vEouLU4_uVyMOJKzKvSLoO_lh3aeyO6rC1jz4xzRYfo,814 -pandas/tests/arrays/floating/test_arithmetic.py,sha256=-V5INL1phLnC5rsTHbf6UgISEwbcvaKW7YcBafornH8,5867 -pandas/tests/arrays/floating/test_astype.py,sha256=DHc8AjOR51kLkmD4SGtmcQMQB0dxg6Ukrzqting_i8A,3917 -pandas/tests/arrays/floating/test_comparison.py,sha256=NKRxEmYHAI1NrtQV0By5uJ_D9ZLhPFtmgtHkt7Zu0lg,4146 -pandas/tests/arrays/floating/test_concat.py,sha256=jTRivQyq4rJaWPioRDt-UL136Wap5rdpNP73vwY6HQQ,574 -pandas/tests/arrays/floating/test_construction.py,sha256=qAKBp4DXSfKct6SFxZzQN58BpzzvvvZxMc-k5qJQPxQ,5176 -pandas/tests/arrays/floating/test_function.py,sha256=dxWQDeVir13FR1pBbwzVEZtInhjA7I_vhDAfZ_1jdUU,6130 -pandas/tests/arrays/floating/test_repr.py,sha256=9jfXlRFKDCsPL1ZTkTlDlQHTKs9y69RIfYgR3c7APRs,1145 -pandas/tests/arrays/floating/test_to_numpy.py,sha256=lPUL0Z47k3F0HVi58JIaEOOL7SMybuss0KOofD2E1hQ,4976 -pandas/tests/arrays/integer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/integer/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_comparison.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_concat.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_construction.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_dtypes.cpython-38.pyc,, 
-pandas/tests/arrays/integer/__pycache__/test_function.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_indexing.cpython-38.pyc,, -pandas/tests/arrays/integer/__pycache__/test_repr.cpython-38.pyc,, -pandas/tests/arrays/integer/conftest.py,sha256=BxlWeJ_dw76yIiZw9HY32XrLZ3_sOAWHzXPCRpXW5Iw,1043 -pandas/tests/arrays/integer/test_arithmetic.py,sha256=A5EG1TOv8f4Jeh90fxf8hXjNCeBouJ8Nbt169nuN3iE,10068 -pandas/tests/arrays/integer/test_comparison.py,sha256=D6fpm4saQeocjgKcmpKLE4l3wG74b9CTXn-XLuucLec,4005 -pandas/tests/arrays/integer/test_concat.py,sha256=LUlEoVb7pCmZq8weLGrkEq7ez_BvChHQx6zQFc0GpBk,2131 -pandas/tests/arrays/integer/test_construction.py,sha256=PBfvTIOpbiaahYTi4KRkiaZ0efCeaqWMXeGUMjLAS04,6296 -pandas/tests/arrays/integer/test_dtypes.py,sha256=qno5AbrlLZ7rI10rLz9CPam8v8wCgJ4UcXMOfx2_5NQ,8995 -pandas/tests/arrays/integer/test_function.py,sha256=0IbLhLW9WZjYB3OpdvAxbIFxoOIFkfn8Lk0Py192Q1A,6360 -pandas/tests/arrays/integer/test_indexing.py,sha256=rgwcafGbwJztl_N4CalvAnW6FKfKVNzJcE-RjcXMpR8,498 -pandas/tests/arrays/integer/test_repr.py,sha256=mYOiocG6DgylWZ6pRiYDdy5hL4oPqVT2UEc_r8ZzNUY,1708 -pandas/tests/arrays/interval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/interval/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/interval/__pycache__/test_astype.cpython-38.pyc,, -pandas/tests/arrays/interval/__pycache__/test_interval.cpython-38.pyc,, -pandas/tests/arrays/interval/__pycache__/test_ops.cpython-38.pyc,, -pandas/tests/arrays/interval/test_astype.py,sha256=WE18BCsDF2tG3UIoRf_ch-2YbsIqkd-iUXJ_fKCuZRs,755 -pandas/tests/arrays/interval/test_interval.py,sha256=gYWlmEipnjtPLmb7RMXpj_GWtEHEvw0lMCaNBQa-RzU,8595 -pandas/tests/arrays/interval/test_ops.py,sha256=LuD-eLEeEKUtm2agpa30Y9G_5pLRnz4WpfjO6sSFipQ,3258 -pandas/tests/arrays/masked/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/masked/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/masked/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/arrays/masked/__pycache__/test_arrow_compat.cpython-38.pyc,, -pandas/tests/arrays/masked/test_arithmetic.py,sha256=hHkdOsfIHPgywCed70gtVl3QyC6opR4m_COUNYEYaP8,5147 -pandas/tests/arrays/masked/test_arrow_compat.py,sha256=HAxEp_H1b0giJegj4EyQLSvc6pNPPT981i5JizUbf1E,1993 -pandas/tests/arrays/sparse/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/sparse/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/sparse/__pycache__/test_accessor.cpython-38.pyc,, -pandas/tests/arrays/sparse/__pycache__/test_arithmetics.cpython-38.pyc,, -pandas/tests/arrays/sparse/__pycache__/test_array.cpython-38.pyc,, -pandas/tests/arrays/sparse/__pycache__/test_combine_concat.cpython-38.pyc,, -pandas/tests/arrays/sparse/__pycache__/test_dtype.cpython-38.pyc,, -pandas/tests/arrays/sparse/__pycache__/test_libsparse.cpython-38.pyc,, -pandas/tests/arrays/sparse/test_accessor.py,sha256=6t1MRpsUwORIhbRMM3JyxH25CRGlF-QVQo4BA7YrIAo,4881 -pandas/tests/arrays/sparse/test_arithmetics.py,sha256=_JrcF9UKyQOsb6BHNb2D4aCez7xf6eYT_XBvROngknI,20217 -pandas/tests/arrays/sparse/test_array.py,sha256=XOTQWnh1MVdU-FzD2yDcfuq3nAApDKOUPJz81kWlAN8,45953 -pandas/tests/arrays/sparse/test_combine_concat.py,sha256=3NMQXaRQc7Bxn5HhSHffcUE24GZi_VYflnFLnixOgbs,2651 -pandas/tests/arrays/sparse/test_dtype.py,sha256=bIUVJWLfT00Np9TJCsoLYdngmThmk0E0g8K3EROD-Z8,5658 -pandas/tests/arrays/sparse/test_libsparse.py,sha256=LTHVEXdN3P0cVKs6FzM843_V0ofG_2nz2HIhPxMTolI,21090 
-pandas/tests/arrays/string_/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/arrays/string_/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/arrays/string_/__pycache__/test_string.cpython-38.pyc,, -pandas/tests/arrays/string_/__pycache__/test_string_arrow.cpython-38.pyc,, -pandas/tests/arrays/string_/test_string.py,sha256=cOWLJwZna0SPUmIoAQ7Mq8WNTwMitDeDHsTT1_dEQl0,18289 -pandas/tests/arrays/string_/test_string_arrow.py,sha256=BUd3BSV3N4b_QoHnu2taBJTDCTncmzc9DsWH4dSgh6o,798 -pandas/tests/arrays/test_array.py,sha256=Bq_qBjh5BjZDUC7FMr21DJsvIlfPDZbK2IoQJO1WPn8,13565 -pandas/tests/arrays/test_datetimelike.py,sha256=kLSVkk4FxbUakQgJsD3c6vioaz9wA1aJApdVqLpgCVo,42238 -pandas/tests/arrays/test_datetimes.py,sha256=fQC-nSwHK8jyQgQ_vC0DO62gwgA1_hDE3WoUQnM3ERE,21745 -pandas/tests/arrays/test_numpy.py,sha256=QdV0-H62PnKs8Wvmu7axcdJDtB_IjzNqXZjMddcdu2Q,6532 -pandas/tests/arrays/test_period.py,sha256=Bpu_VVOBHFEEAQ8Q91OcgD49ErqMmzQ_ntozfq5GmRg,13182 -pandas/tests/arrays/test_timedeltas.py,sha256=657qMA6Z4ogMlu3xWY4fqGY8jk03ZmOU0CFXEX_6Xn4,12181 -pandas/tests/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/base/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/base/__pycache__/common.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_constructors.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_conversion.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_fillna.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_misc.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_transpose.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_unique.cpython-38.pyc,, -pandas/tests/base/__pycache__/test_value_counts.cpython-38.pyc,, -pandas/tests/base/common.py,sha256=khOj_7jBGUvPKXyNDfTpkfJ56w38fTgtB0Sk9ueJcJw,252 -pandas/tests/base/test_constructors.py,sha256=ANvtyUOGnhJUNoNqyAKoq229DJtBKOEiwzxlOsv_dF8,5119 -pandas/tests/base/test_conversion.py,sha256=uZugBTlUDwEb2ANhEMqgfi0d8b1I98ZqMraxzso9ods,16371 -pandas/tests/base/test_fillna.py,sha256=NZD61ziAvp5gvNcWrASDgRo2A2b5DzmeriTBz6-nBcs,1880 -pandas/tests/base/test_misc.py,sha256=hytiw7M-59UEXHHAO-_NElaEo6IgSpws19rrO9LKXwU,4426 -pandas/tests/base/test_transpose.py,sha256=O3pOdyUY_GietbcHUjyMLut91AfiSkhXj0GqzRbGyeA,722 -pandas/tests/base/test_unique.py,sha256=pdIGYDCOphJj_AZMcCwI6lJ9JT4woceQChlHejDBjRw,4464 -pandas/tests/base/test_value_counts.py,sha256=Xio8MEQEtEzCOI31iPO9oQ5Bf1NPm_1ZnRsLPu_QKY0,9533 -pandas/tests/computation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/computation/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/computation/__pycache__/test_compat.cpython-38.pyc,, -pandas/tests/computation/__pycache__/test_eval.cpython-38.pyc,, -pandas/tests/computation/test_compat.py,sha256=e0I8FZmh8i_FQF0YKo4pYjldEY7TjlFfEaBMHWA226k,1325 -pandas/tests/computation/test_eval.py,sha256=UWsGWb7moIMdgO61m2jKaNiH6LsJzqvi8XDKrFMoLgg,72051 -pandas/tests/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/config/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/config/__pycache__/test_config.cpython-38.pyc,, -pandas/tests/config/__pycache__/test_localization.cpython-38.pyc,, -pandas/tests/config/test_config.py,sha256=5yU3Uvo1mUTiTLCoiw6BnfAq9n3slNaJbgGaDaedlpg,18257 -pandas/tests/config/test_localization.py,sha256=PrSp-149Qdk6Jb1iaDYgLqIPfG5dRhwG6YRJO0Go1cA,2861 -pandas/tests/dtypes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/dtypes/__pycache__/__init__.cpython-38.pyc,, 
-pandas/tests/dtypes/__pycache__/test_common.cpython-38.pyc,, -pandas/tests/dtypes/__pycache__/test_concat.cpython-38.pyc,, -pandas/tests/dtypes/__pycache__/test_dtypes.cpython-38.pyc,, -pandas/tests/dtypes/__pycache__/test_generic.cpython-38.pyc,, -pandas/tests/dtypes/__pycache__/test_inference.cpython-38.pyc,, -pandas/tests/dtypes/__pycache__/test_missing.cpython-38.pyc,, -pandas/tests/dtypes/cast/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/dtypes/cast/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_construct_from_scalar.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_construct_ndarray.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_construct_object_arr.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_dict_compat.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_downcast.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_find_common_type.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_infer_datetimelike.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_infer_dtype.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_promote.cpython-38.pyc,, -pandas/tests/dtypes/cast/__pycache__/test_upcast.cpython-38.pyc,, -pandas/tests/dtypes/cast/test_construct_from_scalar.py,sha256=RnOgxrVmm5QadcOF91P0EzKVYZmdmu6AysomrbkSEq0,650 -pandas/tests/dtypes/cast/test_construct_ndarray.py,sha256=FqWQthn7ViFSpVbLwcMSACr7CKTd0_roxkQ4Ybjcs8Y,708 -pandas/tests/dtypes/cast/test_construct_object_arr.py,sha256=eOmUu4q0ihGTbYpCleoCnYtvwh1TBCEZQQjLeJaUMNA,717 -pandas/tests/dtypes/cast/test_dict_compat.py,sha256=qyn7kP5b14MywtqOUL5C-NOvjf2qK4PsXGpCvqmo-4E,476 -pandas/tests/dtypes/cast/test_downcast.py,sha256=ow1CtCx2RWAv3MPS8r7A3KEsi7FdJ-1AmXyIZ5Y20lk,2790 -pandas/tests/dtypes/cast/test_find_common_type.py,sha256=f5KVZJAdss03GYJ58X1xPn-CeTL-o67aq1lKq-hFCAg,3878 -pandas/tests/dtypes/cast/test_infer_datetimelike.py,sha256=aA7RtfP1NktXICyIIr-M34ukfnhOOtW2vmNJCl-0gCI,582 -pandas/tests/dtypes/cast/test_infer_dtype.py,sha256=N3JzJ3u7gTuDx7rwci5zlS76-YBMUXSspz0LVLhAW9o,5210 -pandas/tests/dtypes/cast/test_promote.py,sha256=BXJTbIv9JdrEpR8HjjkdEe-GCeEzQdKDumVoYcdfWD0,23391 -pandas/tests/dtypes/cast/test_upcast.py,sha256=HjrSC2FjI7TtLheLRMpe_abiHYTLCby-Qi7VJNGOB6c,2261 -pandas/tests/dtypes/test_common.py,sha256=kyIWRrjFhx6PjcmOVINGVMAx84w8qM8kVwTTol8Hcw8,24776 -pandas/tests/dtypes/test_concat.py,sha256=jRYyPXx3HyINqdJrGTaHB48wp17dNSiqRC1S3VTWTD8,3254 -pandas/tests/dtypes/test_dtypes.py,sha256=DRhNxwcmivCUCuwtOn4jtRanzhfvMoaignsmlr3Jp-M,35327 -pandas/tests/dtypes/test_generic.py,sha256=JpI0NbAFExwXu7heJIy8llRgYWbv4ZO6762SclU_7vs,3279 -pandas/tests/dtypes/test_inference.py,sha256=kjSmq7oMsE5CED_PZ2D4UAywAXGPyc1tVVPTUqy5yHs,52074 -pandas/tests/dtypes/test_missing.py,sha256=iRLcWWymJCkaYWKlm3giiiAOI5AT2vOhUhQ9VQVW_Yk,20677 -pandas/tests/extension/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/extension/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/extension/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_boolean.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_categorical.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_common.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_datetime.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_external_block.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_floating.cpython-38.pyc,, 
-pandas/tests/extension/__pycache__/test_integer.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_interval.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_numpy.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_period.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_sparse.cpython-38.pyc,, -pandas/tests/extension/__pycache__/test_string.cpython-38.pyc,, -pandas/tests/extension/arrow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/extension/arrow/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/extension/arrow/__pycache__/arrays.cpython-38.pyc,, -pandas/tests/extension/arrow/__pycache__/test_bool.cpython-38.pyc,, -pandas/tests/extension/arrow/__pycache__/test_string.cpython-38.pyc,, -pandas/tests/extension/arrow/arrays.py,sha256=cjdOOdU1ZLudeq6sUtt1Z_sizMt_b0y2BWOkUGgKrjc,5291 -pandas/tests/extension/arrow/test_bool.py,sha256=PzPJNi6wptKg73ONAMoWIw274ED2gVArtWuVviuBn8A,2902 -pandas/tests/extension/arrow/test_string.py,sha256=0pt4mJ8VyFmkBtZMe35R_bt18EtgY__CNU2dLLfZBxM,301 -pandas/tests/extension/base/__init__.py,sha256=v7CARJg13tbAIZgy1y65YvehR4XKLTAyzG9ZPg_GWWk,2162 -pandas/tests/extension/base/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/base.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/casting.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/constructors.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/dtype.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/getitem.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/groupby.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/interface.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/io.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/methods.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/missing.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/ops.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/printing.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/reduce.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/reshaping.cpython-38.pyc,, -pandas/tests/extension/base/__pycache__/setitem.cpython-38.pyc,, -pandas/tests/extension/base/base.py,sha256=5SPn-G89qste22G5I6n0FARZNpiwJbMLkFbdfxEEFlc,742 -pandas/tests/extension/base/casting.py,sha256=U8xjCVUrXLS9MfgTGcYZcCSFL6OUVBtjScE62ipSVbU,2280 -pandas/tests/extension/base/constructors.py,sha256=FeCdrNsMJ89pI9WvVqLs5v_iFz1K1Wz7ULIr8A9I9TQ,4686 -pandas/tests/extension/base/dtype.py,sha256=xLmLgM3-jJap2Sv20bKQbEjFxrrqe7YHgVwkOyY3pP8,4310 -pandas/tests/extension/base/getitem.py,sha256=N7YO6zVHK8tdcl6ILN7I37N6pPNB3TUD7tOEEU0QPGM,13100 -pandas/tests/extension/base/groupby.py,sha256=OwXGR-HFShZMSSChhclH2TysjYo6iJDsDp3hkwYtJ_g,3433 -pandas/tests/extension/base/interface.py,sha256=ziX66EOmXhqFcZFTGEEEF2OUFj_8KYlLEgfkyPwAkYY,3932 -pandas/tests/extension/base/io.py,sha256=O_d56AfwAE0YEkPNMTXNXdelYej2flUdjCWLSexX5Aw,602 -pandas/tests/extension/base/methods.py,sha256=NHQmzbG9n_qXqBLn6fmWI7JvbHLTig_SFol4xvPDUbs,18343 -pandas/tests/extension/base/missing.py,sha256=3tlEaKfuNYMMw9wF9-p_SqDwIHK9GfQ3uE6lPXBHhWI,4515 -pandas/tests/extension/base/ops.py,sha256=mwoAsUY3eXaGo9Qhi3iAsXrm-qnjjAga-Iw0XcQOqGE,6719 -pandas/tests/extension/base/printing.py,sha256=yaso4LDPXYq1FWTgmnMEd0PJx0JZ9tZ0wvQZvREWZG4,1167 -pandas/tests/extension/base/reduce.py,sha256=MzovNiXU0l2t87_wtd9vfVFDf1T-ls-ns4WLnMTN0Q8,1906 -pandas/tests/extension/base/reshaping.py,sha256=40r2Tk0Pz_FFCvCt3-tK28OPJ-5BwqO14u1j85kRdK0,13249 
-pandas/tests/extension/base/setitem.py,sha256=x39MDLNztg_dLoqcYXZewnmFMKPgg69vZQ-xsXzXPlc,11422 -pandas/tests/extension/conftest.py,sha256=TwP_uCKyUYqe673hefDUxRyDDti5Kh6iXOS51nTErJw,3785 -pandas/tests/extension/decimal/__init__.py,sha256=0FmFcmPOLYaQuKo2y872UA-3tYhtSJ1XQvdu0GJJQ0M,140 -pandas/tests/extension/decimal/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/extension/decimal/__pycache__/array.cpython-38.pyc,, -pandas/tests/extension/decimal/__pycache__/test_decimal.cpython-38.pyc,, -pandas/tests/extension/decimal/array.py,sha256=-khYHQonmNoDMGHeYMFgMYOMLwMYbaXMuAYOnh2ZOTs,7276 -pandas/tests/extension/decimal/test_decimal.py,sha256=pD-GwbC6k3FajZqsxoFQW0Rp8uswXz2W7k_KhEVrXZY,16395 -pandas/tests/extension/json/__init__.py,sha256=xlEw4fnCd-FMBc-rq1Uan8yAHcAGeBz1rAYst06_kQY,102 -pandas/tests/extension/json/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/extension/json/__pycache__/array.cpython-38.pyc,, -pandas/tests/extension/json/__pycache__/test_json.cpython-38.pyc,, -pandas/tests/extension/json/array.py,sha256=4S2Nxmw8R77GlyCd_mhbRKeG-F6uNh21RpQ5nVe4Zyw,7051 -pandas/tests/extension/json/test_json.py,sha256=IzvEze_RyhZqFs_XaBtd7Iou0aE6JASW6j_YbNil0Ms,10657 -pandas/tests/extension/list/__init__.py,sha256=JCuru1OdLD07EwMYEzdgf_VZnhy8TRpbr92WKi3Ll2Y,102 -pandas/tests/extension/list/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/extension/list/__pycache__/array.cpython-38.pyc,, -pandas/tests/extension/list/__pycache__/test_list.cpython-38.pyc,, -pandas/tests/extension/list/array.py,sha256=BsJbC2W28A0xplagTtTCKNey9iMHAoFyWRFZWybdLEE,3744 -pandas/tests/extension/list/test_list.py,sha256=WVIHHFZzOMxmR-9-SHD4VhWtnd1K_A2daplQBEKJorY,625 -pandas/tests/extension/test_boolean.py,sha256=dSTNnw1WSgi2LBUTLnG3-sGUliVZ2F-UgF0SFqfYO_k,13009 -pandas/tests/extension/test_categorical.py,sha256=zH5PEy11txEtgaz5IxOuakYgA6JGNQNvJxxl2eEpg9E,9337 -pandas/tests/extension/test_common.py,sha256=X2dv8Vbo1vmkh8uKpKyAOaLMDZqTOgFLtz1N3EQttZk,2091 -pandas/tests/extension/test_datetime.py,sha256=gHIzI1qp7Q820K4rvIa9p3RRouZIaProgSZmGN3pqwY,6668 -pandas/tests/extension/test_external_block.py,sha256=C_YOzEGMS4Z96-vzQE342Yv3xRQHXaJeFz4JwC6az88,791 -pandas/tests/extension/test_floating.py,sha256=8med523C04sDYHUzDN6sYmDihc8TJnkTzQh_wohPEOw,5996 -pandas/tests/extension/test_integer.py,sha256=GhWD6-cLIsT3Icumzrq4E490ajBFWHr9FfGSbnKzFnI,7289 -pandas/tests/extension/test_interval.py,sha256=R5hv2UapaT8wubiVro3UlWt9cgIn-XmjhnpuGhDZ-N4,4126 -pandas/tests/extension/test_numpy.py,sha256=j88YUpa84sNm6Nc9gpqn-9V_W9e-N7xEmRfd7JQQQ24,16177 -pandas/tests/extension/test_period.py,sha256=8IlBcXXvTKsNfT2ZQUyrRmL0wAv-S65Ushpnb0DJE0A,4619 -pandas/tests/extension/test_sparse.py,sha256=sGI7k5RcIrv6OBiyEKtVl8DnwKHPuPpNYKSqgqLNBBs,14529 -pandas/tests/extension/test_string.py,sha256=VL_4E9qPbk03Lb1EYl0vfPi4iz1Z4YRHXqzCkLNrPIo,3885 -pandas/tests/frame/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/frame/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/frame/__pycache__/common.cpython-38.pyc,, -pandas/tests/frame/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_alter_axes.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_api.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_block_internals.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_constructors.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_cumulative.cpython-38.pyc,, -pandas/tests/frame/__pycache__/test_iteration.cpython-38.pyc,, 
[... remainder of the deleted file listing for a vendored pandas installation: RECORD-style removal entries (path, sha256, size) covering the pandas/tests subpackages frame, generic, groupby, indexes, indexing, internals, io, libs, and plotting, together with their __pycache__/*.cpython-38.pyc entries ...]
-pandas/tests/plotting/frame/test_frame.py,sha256=YaJoiE3Oxbd12ncdzTj4_ccf4CPqN6pwfJhyeroaPyw,86384 -pandas/tests/plotting/frame/test_frame_color.py,sha256=GdeqIy8OhqY8piRl3oDot7CrrFmfhX7vdXL4MplQLWw,24477 -pandas/tests/plotting/frame/test_frame_groupby.py,sha256=bnvev7X63rzyRjRyP2jIBNH-JqrOAMrINENs0N6aZYw,3099 -pandas/tests/plotting/frame/test_frame_subplots.py,sha256=8e_wlVv6HSf4gYrU5bm42wsmsYhWmJa9ngzudr5-os4,26372 -pandas/tests/plotting/test_backend.py,sha256=c-Z8-B0c9fz6-N8gkGLUwQp1x8bCl7PeeQbnJDiiwIo,3520 -pandas/tests/plotting/test_boxplot_method.py,sha256=vxx2GQE1eVoW--zeCqlfgm37r041izfMgQUT5Wl_nUA,20342 -pandas/tests/plotting/test_common.py,sha256=se050hAM6FJaJqVqA-gzLim6zrQb-raYrMONfCm_qXs,919 -pandas/tests/plotting/test_converter.py,sha256=9l_o3hgDft7O_Y8ur-STUiAg2bY5cPJ0t4C7SnZvz6s,12967 -pandas/tests/plotting/test_datetimelike.py,sha256=A5VmwfgKA9L_fc0njiA9Tcat9igSco6rrh6MUKemG48,55646 -pandas/tests/plotting/test_groupby.py,sha256=jVAfSQbJHb7NEfl8KZzGDYggR1k5riJI9yA8zrHX6bQ,4685 -pandas/tests/plotting/test_hist_method.py,sha256=IUjSp5DzGkRiEKELvRM6m6pl59JIWxuB8F-3nqSYoyg,23040 -pandas/tests/plotting/test_misc.py,sha256=KM5uEmXKGN5nPaURC80QPQv2x8HkTbdsl4377kFs0Lg,19558 -pandas/tests/plotting/test_series.py,sha256=WAxIyxy7Wtq5XRINu_443UWsWhaBV_WV390zGNH6gCE,36503 -pandas/tests/plotting/test_style.py,sha256=Wj3UsUKUxwfsg2QdK7LIv8nvoSTxZ_4i-O2Y6hMC_Sw,5203 -pandas/tests/reductions/__init__.py,sha256=vflo8yMcocx2X1Rdw9vt8NpiZ4ZFq9xZRC3PW6Gp-Cs,125 -pandas/tests/reductions/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/reductions/__pycache__/test_reductions.cpython-38.pyc,, -pandas/tests/reductions/__pycache__/test_stat_reductions.cpython-38.pyc,, -pandas/tests/reductions/test_reductions.py,sha256=lCt2P-GlRYfKZdOI0qYkn-dWXAVs6L2l2P1kYbE_PA4,44791 -pandas/tests/reductions/test_stat_reductions.py,sha256=qSxjA65JtabrVudZytyj_adtsPJL6iwT2_cz9g4C6YE,9387 -pandas/tests/resample/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/resample/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/resample/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_base.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_datetime_index.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_deprecated.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_period_index.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_resample_api.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_resampler_grouper.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_time_grouper.cpython-38.pyc,, -pandas/tests/resample/__pycache__/test_timedelta.cpython-38.pyc,, -pandas/tests/resample/conftest.py,sha256=B0V3eJQwVyeZ1q9-IY_ENDoIIMyZlGb1kwfLZ3DnrMs,4149 -pandas/tests/resample/test_base.py,sha256=XtMieSITU3JdbVUKz1YEVacYnCylnsnamhzYUgsj5LE,7259 -pandas/tests/resample/test_datetime_index.py,sha256=LdB2Z99ZiE2F2-snqamnR9h8rAqG6knKSPvswdKsNrU,59647 -pandas/tests/resample/test_deprecated.py,sha256=Q_9-cy8QnFdL-qGghsiGURXddh7SYwEn78uiW3dwiYU,10148 -pandas/tests/resample/test_period_index.py,sha256=cFYY0qdyAVAKeXNkAT1Kr6mQA7dXQpCowXyXPKRALIc,33720 -pandas/tests/resample/test_resample_api.py,sha256=f4uU6dOwc6kdVxVvilJ1lAJOTQ3sN46lLsO1Vy2ugQk,18951 -pandas/tests/resample/test_resampler_grouper.py,sha256=JqV_dSFovwmCC4XeFs1qrgsaLPdhK6kP14WieiwPKUQ,12299 -pandas/tests/resample/test_time_grouper.py,sha256=Gcm0T4H88exXuflo8Y6gTkf7pxXGtueym-ObxO-2cU8,11022 -pandas/tests/resample/test_timedelta.py,sha256=CkYjLrLn2woOTG9PSF1sgmSmgc-DYYaUc1kPWsg6Lpg,6186 
-pandas/tests/reshape/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/reshape/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_crosstab.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_cut.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_get_dummies.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_melt.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_pivot.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_pivot_multilevel.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_qcut.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_union_categoricals.cpython-38.pyc,, -pandas/tests/reshape/__pycache__/test_util.cpython-38.pyc,, -pandas/tests/reshape/concat/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/reshape/concat/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_append.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_append_common.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_categorical.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_concat.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_dataframe.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_datetimes.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_empty.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_index.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_invalid.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_series.cpython-38.pyc,, -pandas/tests/reshape/concat/__pycache__/test_sort.cpython-38.pyc,, -pandas/tests/reshape/concat/conftest.py,sha256=s94n_rOGHsQKdP2KbCAQEfZeQpesYmhH_d-RNNTkvYc,162 -pandas/tests/reshape/concat/test_append.py,sha256=QjMpPZL4VMhPzkB3OJdgqz1Mx_oMfh3Fspx7ejf48Jk,13535 -pandas/tests/reshape/concat/test_append_common.py,sha256=k-cxTQMY8kL3otXi-k0_hc8CLuH0uMJ_tMk_vYV2YFY,28180 -pandas/tests/reshape/concat/test_categorical.py,sha256=WaTuZt1xSDxod8ZM_RriiA1C6vX42_ito954CSmOk4M,6861 -pandas/tests/reshape/concat/test_concat.py,sha256=rAXaWUnjyLi6nw1OTM2Vug4-fY0sSaMxNPl4AhaAYJY,20224 -pandas/tests/reshape/concat/test_dataframe.py,sha256=T8i-XoiALZeE1GpuWcBnP41hRyCytm5-B9ih9C2Iakk,6382 -pandas/tests/reshape/concat/test_datetimes.py,sha256=9mlQEkBkrbTISp5m7CbtbOCEjmrUMfZulXVG8-VnZaE,18465 -pandas/tests/reshape/concat/test_empty.py,sha256=qYVrhJKAoNXxZUUvBmpVX1KSXlU6P0oWWXzH1JUlFDs,8550 -pandas/tests/reshape/concat/test_index.py,sha256=S32pe58jwHWlAZx9nLolmS5FXDrTBoWgKRojd5zJ5nM,9651 -pandas/tests/reshape/concat/test_invalid.py,sha256=5ggdhfY9iAlMT_Nb5_MIyGDWBqRKqXqY0XllIeunFjw,1562 -pandas/tests/reshape/concat/test_series.py,sha256=MuX5AjBM4ruh_e77TTWcrQA2vYso0bx47PluYKcqFi0,4950 -pandas/tests/reshape/concat/test_sort.py,sha256=qkvEpk6g96U8qLbfOPwg8nJ-oB6bL92QCxv-_iSe49U,3115 -pandas/tests/reshape/merge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/reshape/merge/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/reshape/merge/__pycache__/test_join.cpython-38.pyc,, -pandas/tests/reshape/merge/__pycache__/test_merge.cpython-38.pyc,, -pandas/tests/reshape/merge/__pycache__/test_merge_asof.cpython-38.pyc,, -pandas/tests/reshape/merge/__pycache__/test_merge_cross.cpython-38.pyc,, -pandas/tests/reshape/merge/__pycache__/test_merge_index_as_string.cpython-38.pyc,, 
-pandas/tests/reshape/merge/__pycache__/test_merge_ordered.cpython-38.pyc,, -pandas/tests/reshape/merge/__pycache__/test_multi.cpython-38.pyc,, -pandas/tests/reshape/merge/test_join.py,sha256=xKUuH_64FTRueksXs43PzcDJ3rOpQgZf8wzfOPcK_MQ,29297 -pandas/tests/reshape/merge/test_merge.py,sha256=VJibe8zul1KySiPgI5IF-ychQ35csEwMPWVkwaPRkBQ,81997 -pandas/tests/reshape/merge/test_merge_asof.py,sha256=lsX1hYwYxaubZ5NOoQvI1MqwKv7iITlyJSWozQRVX7Y,45632 -pandas/tests/reshape/merge/test_merge_cross.py,sha256=JzjwY5kQpgLHp7JbEs2dS3o_G05f20XENVpxscH_ztA,2794 -pandas/tests/reshape/merge/test_merge_index_as_string.py,sha256=tKZu1pxnELCuqfdzhHJv-_J-KOteajhrLTxXXCrfPtc,5360 -pandas/tests/reshape/merge/test_merge_ordered.py,sha256=19UQft_scc7h3Js2CMGAKWPYbHSq5xMvfZf6aWW5Vsg,6370 -pandas/tests/reshape/merge/test_multi.py,sha256=6uRtUy6x1VQS_jXSWrILr9WHi0VJkfuVVcYpY_Cf8xI,29909 -pandas/tests/reshape/test_crosstab.py,sha256=Xq_l-ET2qxER3b7dyRefCYRIuui4IHoTQ07nG81LkRU,29055 -pandas/tests/reshape/test_cut.py,sha256=MN6AzA2h-ceeTgiDhURBglBz49UXFf5ZX1o2_UEG17c,20459 -pandas/tests/reshape/test_get_dummies.py,sha256=2dqRHmNtOltPE7UJiCTkWi5bzgeTsIfe0AV9sg7elt0,23211 -pandas/tests/reshape/test_melt.py,sha256=yS-YQqz-YAr61X-9-0jDs3rkSztunXBsoWQSVKYxkqU,37075 -pandas/tests/reshape/test_pivot.py,sha256=R8M4CNwBLdKas6sNFnQIyGTpkZ9LNRq0UPc5neIbfRY,74156 -pandas/tests/reshape/test_pivot_multilevel.py,sha256=m4DRTVrVGHy89tYi8PUXc7RbA_yuRYBWcv1iQyuWRN8,6704 -pandas/tests/reshape/test_qcut.py,sha256=3HfXrZcflwm48ytznqbgL00ri5IX8gpOz9zoh1K-fKg,8198 -pandas/tests/reshape/test_union_categoricals.py,sha256=aTFbe0TlHxZH0ET_fWMGtYpsiKkeBhpuKCnAsMrx9CU,14282 -pandas/tests/reshape/test_util.py,sha256=2zI6mgoqJoj9452-hZYdhCp6zXYcb9vCCsObcNXIx9w,2846 -pandas/tests/scalar/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/scalar/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/scalar/__pycache__/test_na_scalar.cpython-38.pyc,, -pandas/tests/scalar/__pycache__/test_nat.cpython-38.pyc,, -pandas/tests/scalar/interval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/scalar/interval/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/scalar/interval/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/scalar/interval/__pycache__/test_interval.cpython-38.pyc,, -pandas/tests/scalar/interval/__pycache__/test_ops.cpython-38.pyc,, -pandas/tests/scalar/interval/test_arithmetic.py,sha256=7uTjmMGHglfoPUmrWdj1m8-SITqswKlHNJ-kFU7gIIE,1819 -pandas/tests/scalar/interval/test_interval.py,sha256=2Arqyn1s0TYRZ3YbFCFI3ai8SeHW9dKFfCKni_CwfHw,8840 -pandas/tests/scalar/interval/test_ops.py,sha256=ExrB4ZM0mLdWtzgpscSsLmErnrUeUiNNgkoDKJ1iwnI,2336 -pandas/tests/scalar/period/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/scalar/period/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/scalar/period/__pycache__/test_asfreq.cpython-38.pyc,, -pandas/tests/scalar/period/__pycache__/test_period.cpython-38.pyc,, -pandas/tests/scalar/period/test_asfreq.py,sha256=lr2TRptofjvfPXJBJXZsAtC7J3viO_OD5AGlON3K5dY,36391 -pandas/tests/scalar/period/test_period.py,sha256=78VVavuR3p-amN3tQjJ5UO71UuQ2mDMaUOXv9nYIFfY,55158 -pandas/tests/scalar/test_na_scalar.py,sha256=A0YhmBc6KLnUXzIHEpWsOMhGf6DcQAcLsT8GhRUyfHs,7335 -pandas/tests/scalar/test_nat.py,sha256=_UIBHNTdnclR9q5dX6fSDbaDsB5frbae-YetoAUja-w,17225 -pandas/tests/scalar/timedelta/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/scalar/timedelta/__pycache__/__init__.cpython-38.pyc,, 
-pandas/tests/scalar/timedelta/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/scalar/timedelta/__pycache__/test_constructors.cpython-38.pyc,, -pandas/tests/scalar/timedelta/__pycache__/test_formats.cpython-38.pyc,, -pandas/tests/scalar/timedelta/__pycache__/test_timedelta.cpython-38.pyc,, -pandas/tests/scalar/timedelta/test_arithmetic.py,sha256=-PyoHZbz2FwPlca0NlHpheeP_xuTkYJdcC7JMG8-U0E,33789 -pandas/tests/scalar/timedelta/test_constructors.py,sha256=s4kvSCJWT4wHbBHTO_d0xdq3hutvsAAhfrnRZE9Yb9I,10256 -pandas/tests/scalar/timedelta/test_formats.py,sha256=afiVjnkmjtnprcbtxg0v70VqMVnolTWyFJBXMlWaIY8,1261 -pandas/tests/scalar/timedelta/test_timedelta.py,sha256=jFuAgqgtGsdGZh4X0_MmEii3q6J020zGIdaOu-p1No4,19427 -pandas/tests/scalar/timestamp/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/scalar/timestamp/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_comparisons.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_constructors.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_rendering.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_timestamp.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_timezones.cpython-38.pyc,, -pandas/tests/scalar/timestamp/__pycache__/test_unary_ops.cpython-38.pyc,, -pandas/tests/scalar/timestamp/test_arithmetic.py,sha256=at8v0mvYbFGp1wdhSCQgsIfLutga6Dgg9AeflB2yL_U,9023 -pandas/tests/scalar/timestamp/test_comparisons.py,sha256=LGSKFzq6LXf7SSf0Vz5fT5tSu1NQ37ZeomuqabSJ-jw,7740 -pandas/tests/scalar/timestamp/test_constructors.py,sha256=jkzSkejKtuPLo8n-X-MnqS9JK21l7OlGJnwT1-dAGiU,21448 -pandas/tests/scalar/timestamp/test_rendering.py,sha256=2fNXt0m3iHEyZ9O-SDxqy1pRYtImjrqyz7zT2AFKfB0,3740 -pandas/tests/scalar/timestamp/test_timestamp.py,sha256=WBVHvTkAuBCCRCxckl-XTKZq4QEEbXaPdIiNGoqKqQ8,20480 -pandas/tests/scalar/timestamp/test_timezones.py,sha256=17tloz7V6rS4MvMbPY9cywkM-N2mHHUTnze9aOauBns,16145 -pandas/tests/scalar/timestamp/test_unary_ops.py,sha256=qpTlVRcOuccyvYwaG_OzGnk38mHJ66dPA7XhKt0h1D8,15498 -pandas/tests/series/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/series/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_api.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_arithmetic.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_constructors.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_cumulative.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_dtypes.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_duplicates.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_iteration.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_logical_ops.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_missing.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_npfuncs.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_reductions.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_repr.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_subclass.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_ufunc.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_unary.cpython-38.pyc,, -pandas/tests/series/__pycache__/test_validate.cpython-38.pyc,, -pandas/tests/series/accessors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/series/accessors/__pycache__/__init__.cpython-38.pyc,, 
-pandas/tests/series/accessors/__pycache__/test_cat_accessor.cpython-38.pyc,, -pandas/tests/series/accessors/__pycache__/test_dt_accessor.cpython-38.pyc,, -pandas/tests/series/accessors/__pycache__/test_sparse_accessor.cpython-38.pyc,, -pandas/tests/series/accessors/__pycache__/test_str_accessor.cpython-38.pyc,, -pandas/tests/series/accessors/test_cat_accessor.py,sha256=a0gKDig0eckGBcKhqTOJZtWqs2HtG7k9JDzUkaySjdU,9145 -pandas/tests/series/accessors/test_dt_accessor.py,sha256=mUCb-9GpTCkwK5S5m2jnWcCbiLFVzm8O9vJQ37x3mug,26384 -pandas/tests/series/accessors/test_sparse_accessor.py,sha256=yPxK1Re7RDPLi5v2r9etrgsUfSL9NN45CAvuR3tYVwA,296 -pandas/tests/series/accessors/test_str_accessor.py,sha256=M29X62c2ekvH1FTv56yye2TLcXyYUCM5AegAQVWLFc8,853 -pandas/tests/series/apply/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/series/apply/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/series/apply/__pycache__/test_apply_relabeling.cpython-38.pyc,, -pandas/tests/series/apply/__pycache__/test_series_apply.cpython-38.pyc,, -pandas/tests/series/apply/__pycache__/test_series_transform.cpython-38.pyc,, -pandas/tests/series/apply/test_apply_relabeling.py,sha256=Ieu325hGjiIpGTqzSVIDhyMzVqSIAU9oYHivkJFCnCs,1325 -pandas/tests/series/apply/test_series_apply.py,sha256=4aSgtObbngFeIKINvVbZCXLl18q-hfcgWg2A_MBCKgU,29580 -pandas/tests/series/apply/test_series_transform.py,sha256=njaQzrVVluIcnF5PFYQ_615qcL22eCU9oXQIYyzg_hU,2740 -pandas/tests/series/indexing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/series/indexing/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_datetime.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_delitem.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_get.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_getitem.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_indexing.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_mask.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_numeric.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_set_value.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_setitem.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_take.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_where.cpython-38.pyc,, -pandas/tests/series/indexing/__pycache__/test_xs.cpython-38.pyc,, -pandas/tests/series/indexing/test_datetime.py,sha256=zGIHRgZFtM4CmtTcSuVoW9rRKK43kmZfHOKaWfJI1S8,18058 -pandas/tests/series/indexing/test_delitem.py,sha256=J7ToYwD9-F7LqmmFMFi8xJb76WfYf40KO4W3xLc9wOM,1404 -pandas/tests/series/indexing/test_get.py,sha256=49Wj_k_M0JUHQj42bkKKsoCY0AR4_OGS1Z4DRaTueQo,4324 -pandas/tests/series/indexing/test_getitem.py,sha256=YiMkwTQIJb68OgA5_4-2o-qUiszv1yjK5JC_Jfg2-fc,15096 -pandas/tests/series/indexing/test_indexing.py,sha256=aXGQs6wJgKhbsmq0mBXDfOMvGnQ9fxm_-lYSZBlScpo,23928 -pandas/tests/series/indexing/test_mask.py,sha256=nej3h-lEEu52GR3Y2ifKyb33j9Pt-rQO6SL6wp2zbXA,1602 -pandas/tests/series/indexing/test_numeric.py,sha256=IlQVxOrqz89fpNR9cIfJ8hRWXNwVWbZ_njcgdnAUvDU,2296 -pandas/tests/series/indexing/test_set_value.py,sha256=s-lbfDRriWnQDvhZTLQy-TgURH5Z6Fy8kQ9F9ica0iM,978 -pandas/tests/series/indexing/test_setitem.py,sha256=Mb8kz-UPX9hHfNiBRpHVgGWcVNGBK-ZErVoRp3Z9OlI,9369 -pandas/tests/series/indexing/test_take.py,sha256=2B79IuWBesI849qvFO4hELdNiVsT2A90yq8wor_aRYk,963 
-pandas/tests/series/indexing/test_where.py,sha256=VF2OH1fgO3HnaBH0elTRLF96ZUbhs6CUwRoX5pURmkc,12696 -pandas/tests/series/indexing/test_xs.py,sha256=HY-zHALNvAneUnIBW2UzP-Oaox4Q8MNnwm-ZJTI1MQA,2293 -pandas/tests/series/methods/__init__.py,sha256=zVXqGxDIQ-ebxxcetI9KcJ9ZEHeIC4086CoDvyc8CNM,225 -pandas/tests/series/methods/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_align.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_append.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_argsort.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_asfreq.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_asof.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_astype.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_autocorr.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_between.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_clip.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_combine.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_combine_first.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_compare.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_convert.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_convert_dtypes.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_copy.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_count.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_cov_corr.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_describe.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_diff.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_drop.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_drop_duplicates.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_dropna.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_duplicated.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_equals.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_explode.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_fillna.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_get_numeric_data.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_head_tail.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_infer_objects.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_interpolate.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_is_monotonic.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_isin.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_isna.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_item.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_matmul.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_nlargest.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_pct_change.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_pop.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_quantile.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_rank.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_reindex.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_reindex_like.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_rename.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_rename_axis.cpython-38.pyc,, 
-pandas/tests/series/methods/__pycache__/test_repeat.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_replace.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_reset_index.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_round.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_searchsorted.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_set_name.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_shift.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_sort_index.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_sort_values.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_to_csv.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_to_dict.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_to_frame.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_truncate.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_tz_convert.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_tz_localize.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_unstack.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_update.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_value_counts.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_values.cpython-38.pyc,, -pandas/tests/series/methods/__pycache__/test_view.cpython-38.pyc,, -pandas/tests/series/methods/test_align.py,sha256=nt1-fCBzrCosQWTb7r_b41Q3KiirhII1VQtJNiu6HwM,5341 -pandas/tests/series/methods/test_append.py,sha256=42ph6opqoihmVGFifAIF-kj9ZxrM3Q26RNDwtfF7Jns,9757 -pandas/tests/series/methods/test_argsort.py,sha256=DMJP35UjJK3H5fgyFTj9dl5vCYlL4TE-4UKJXm5H2JU,2248 -pandas/tests/series/methods/test_asfreq.py,sha256=GoMwnZhRuiqSra1cp3GTaSZ0xhqk-S9DzEQkz7GsrIg,3616 -pandas/tests/series/methods/test_asof.py,sha256=Ye7zI367GP3ut6AcgvwV1tsfBx8-sFO_4yfZcnokWBY,5386 -pandas/tests/series/methods/test_astype.py,sha256=heQxYzV9UuWp29WMpfuO7ZXZuAJmTecYY-8xQm4rmeQ,14894 -pandas/tests/series/methods/test_autocorr.py,sha256=pI9MsjcDr00_4uPYg_Re22IsmVPTDbOjU83P4NOh8ck,999 -pandas/tests/series/methods/test_between.py,sha256=eu-l9-uNwmBJ6yuBL46WqtqL1RwvGH4LEramdtecofo,1197 -pandas/tests/series/methods/test_clip.py,sha256=Bx3UsyiX-iiadZE2HbbaDMrU6DyBHC8KKIOngc5KHCo,3378 -pandas/tests/series/methods/test_combine.py,sha256=ye8pwpjolpG_kUKSFTC8ZoRdj3ze8qtJXvDUZ5gpap4,627 -pandas/tests/series/methods/test_combine_first.py,sha256=OtqhpSLPpgOQiLUzCDVVZAZ488_uJMhnRLhmdOs8XwQ,3402 -pandas/tests/series/methods/test_compare.py,sha256=pb1C1B7bQoswLmvZKpnjJ6ErWktr7yIzR7hsaLNosvg,3734 -pandas/tests/series/methods/test_convert.py,sha256=Ty51C_HPO1BHXReMr8iFqBb6tQSe8q008LSa29mT6i8,4911 -pandas/tests/series/methods/test_convert_dtypes.py,sha256=i35tVe5qpcHm7Ity-BxH1FmOg2_tDn8QP2pcIj7uPGY,5852 -pandas/tests/series/methods/test_copy.py,sha256=ahYJ0ZiCESIwi71QnjXdIrusFXOiQDKsN-4IWFe2YxM,2166 -pandas/tests/series/methods/test_count.py,sha256=JwsQ6wKAEBUFxAWNVXxjFPF1WaNLlcsA4PoNwoSKuL4,2620 -pandas/tests/series/methods/test_cov_corr.py,sha256=1svO9Ktk8rhVA8Cy_W45AVbjVIDEdkcywaTx6esMZTM,5209 -pandas/tests/series/methods/test_describe.py,sha256=gKuAPHJSC2EaeGv2XvRjEhgkMRd8XayFyNnSWdv-t9A,4640 -pandas/tests/series/methods/test_diff.py,sha256=sj2jx-6vYyijU3ueai-OFZZ7dkrDl2Ijie_qd5o1RTQ,2331 -pandas/tests/series/methods/test_drop.py,sha256=ug6VfbsRUlK4CqDfXLUvYQt95Tg9wmNFQx-eUiSkU1A,2957 
-pandas/tests/series/methods/test_drop_duplicates.py,sha256=GwlR75KjD_Jk2RQHmHqACtLTBBn4t4hd4W_a0XxwEto,6849 -pandas/tests/series/methods/test_dropna.py,sha256=kgZDwrOpmsbOoWi6geBTuxgaoPlM-bD5VjruhJ1FX7s,2986 -pandas/tests/series/methods/test_duplicated.py,sha256=EGNeuEFFAc5G-yebtUfiVDxmfVRaD22C2z5nxI2_sGE,996 -pandas/tests/series/methods/test_equals.py,sha256=Jcgg0TjrIBUwnq0JQyo7uyDn27cElhMHArcNHWkWfTA,1565 -pandas/tests/series/methods/test_explode.py,sha256=Ag2I4Nn3LK5Ns7n99RgIubS1eMrbyKTCj5zUrR0MIos,3812 -pandas/tests/series/methods/test_fillna.py,sha256=BEUaZBvKBpOBrHscmuyiXK5GNn1SzkKpqL2A6rH4w3o,27338 -pandas/tests/series/methods/test_get_numeric_data.py,sha256=iAtBaRJtPees1Wey6utYi7nNJfnZkBQmXe_-k8wVOCI,857 -pandas/tests/series/methods/test_head_tail.py,sha256=1EWojjTzcLvYH34VvyvEHxczDy7zL3dMTyayFHsVSzY,343 -pandas/tests/series/methods/test_infer_objects.py,sha256=SioL1jaiK8W07ZbpSROpB2gBuVQHXnN-PjieVShP1J0,787 -pandas/tests/series/methods/test_interpolate.py,sha256=d0ObODNy4aSHw_SsDKvdebRvUVKud4K6_UOp7d8fmdY,31137 -pandas/tests/series/methods/test_is_monotonic.py,sha256=7luMMlLqHs11v6fQY_PMadmVFs1Wc0sL7N5J3cypr_s,767 -pandas/tests/series/methods/test_isin.py,sha256=CAv_RyC0NSnEzVV2QVlaFxN35vVXioZYntAPONXqc4Y,5308 -pandas/tests/series/methods/test_isna.py,sha256=u1RM3z37bbw8Key7vUddAQnNp5rAKUqg8GHquWXnbJg,927 -pandas/tests/series/methods/test_item.py,sha256=RS26_lZgBEgQIbVoyBhAtzM5pi8QBw-yU-jjdND4lyo,1422 -pandas/tests/series/methods/test_matmul.py,sha256=xdwYHboX8q1A2blvMOGfb_JVuXcopq8CQgyBwOlHVSA,2655 -pandas/tests/series/methods/test_nlargest.py,sha256=-mGSiPOK7wFJI23X1W3EpRImjP2q9UQcMPbL_sN6P3o,7121 -pandas/tests/series/methods/test_pct_change.py,sha256=GZ97ZyLCDw_BvBlJ3U6V75e8ECudGYJkee0xzfiyayc,2968 -pandas/tests/series/methods/test_pop.py,sha256=xr9ZuFCI7O2gTW8a3WBr-ooQcOhBzoUK4N1x0K5G380,295 -pandas/tests/series/methods/test_quantile.py,sha256=-vGZSAIqlLLNQTPkcjb61eezeBgAz6hDJKri1QA1vfo,6738 -pandas/tests/series/methods/test_rank.py,sha256=g8-_N6ER0WANPSPonSduGrbQQD3_vZA3LBsgU4xXaLc,20287 -pandas/tests/series/methods/test_reindex.py,sha256=bhAFdfGGFs4OEXMGHDGQcJ9zddRWVImH4v9MMLtv5C4,9723 -pandas/tests/series/methods/test_reindex_like.py,sha256=e_nuGo4QLgsdpnZrC49xDVfcz_prTGAOXGyjEEbkKM4,1245 -pandas/tests/series/methods/test_rename.py,sha256=sPM1450Gt6zFta8_YYSkgKKen3D0t4ccP9AzpL-53hY,3378 -pandas/tests/series/methods/test_rename_axis.py,sha256=JsEXmYTfCjo4WKPKowIPteoqUSuTQ543PD0vgoSX1JI,1503 -pandas/tests/series/methods/test_repeat.py,sha256=kCk4s0XbNx60mNfkwIAom7ADtUCQFeikh6RNo9sEZM4,1236 -pandas/tests/series/methods/test_replace.py,sha256=8iHapj_YeX0ho7lCQ9uUZb3-0tb1rwJBRjElejtvd0M,16501 -pandas/tests/series/methods/test_reset_index.py,sha256=8lcaB62kzpFlYMb034QZka-eIOFkVuV33R1dD6I7Qfs,5771 -pandas/tests/series/methods/test_round.py,sha256=5BQUUK8ZUtuivHK1ys46k3B7viN72jkGLYpkQ_MZ0bg,1970 -pandas/tests/series/methods/test_searchsorted.py,sha256=l-NYx0xly6ENy20IalgdoMJ_lwJz2a1Mj_v26eoFQmk,2121 -pandas/tests/series/methods/test_set_name.py,sha256=rt1BK8BnWMd8D8vrO7yQNN4o-Fnapq5bRmlHyrYpxk4,595 -pandas/tests/series/methods/test_shift.py,sha256=aWBE_cL_Eq8XnQRwqUnJzud7oGpuVsiXbd-6zU47HGo,13266 -pandas/tests/series/methods/test_sort_index.py,sha256=JS6FIP-AVvWVimimutjyZ2x_aVr7gdW1UBgRtB5QA8Y,12277 -pandas/tests/series/methods/test_sort_values.py,sha256=4g6g17TsWVGe-GJm-SetFUyp6UTm7zCOhUqZMueR-TQ,7981 -pandas/tests/series/methods/test_to_csv.py,sha256=Fz3cuR87Qn_-7k7s1jVsQitriQn6MrxH8xfIO7wibIs,6229 
-pandas/tests/series/methods/test_to_dict.py,sha256=S4V-b3dcnAzxbMXhzB8tt9vqbKdG8UEOeLi8EKapwFo,742 -pandas/tests/series/methods/test_to_frame.py,sha256=sxDjBS4jo-CMFFRnya6xixlVPmo33KuFhTNDzqabFKo,1275 -pandas/tests/series/methods/test_truncate.py,sha256=14nkeOr-wQlwc9Vtr8JCkgtXlXNjrJl3oGezyNUzmS8,2026 -pandas/tests/series/methods/test_tz_convert.py,sha256=bNbxPXysoOfJhX1ksPMq2Gd08-umHTbmPnCkplTlOC4,1008 -pandas/tests/series/methods/test_tz_localize.py,sha256=djG4GQoo1hD6OxvawzUr_tG-g7s4LNL5IguozH8IhoE,3108 -pandas/tests/series/methods/test_unstack.py,sha256=LlDGCLGcv4Nhw_0rqqEMDe3tccjFzzPnotAPhLOnEsU,4106 -pandas/tests/series/methods/test_update.py,sha256=a-AJfcP1wNEK3jtzJ4h40giR5j8WFszzbPgXv_e4BMg,4213 -pandas/tests/series/methods/test_value_counts.py,sha256=gxcOPNrkfdGxQOi689f4192u52RM-OwI3maIiEIiIwI,8055 -pandas/tests/series/methods/test_values.py,sha256=C67U2z0ME2K15hXAdnFVkz3b16vSrw1Jdjt7573TQsw,724 -pandas/tests/series/methods/test_view.py,sha256=4aMiaRzg3DjQ50z62aBICFXb9h86xNGpKmsm9649jqQ,491 -pandas/tests/series/test_api.py,sha256=A-sx4sCZw0aqmxyWyOVkYtPh-BbKRTZElkF2ipAzAtc,5889 -pandas/tests/series/test_arithmetic.py,sha256=QqSO3K1abHAzlzN6wzhd5gK0e4eJvX7Vv2_dF-EGO5I,30847 -pandas/tests/series/test_constructors.py,sha256=mwQWh7ZlwBMlZhUB_jOq5jtGKHfoCEvwbhKfff8DgBM,60382 -pandas/tests/series/test_cumulative.py,sha256=xGtYuDLLkLOcDP_FgXSMQnt4f40stCpB4zb_FUck_ME,5549 -pandas/tests/series/test_dtypes.py,sha256=YmDBkYItRpmwP0QhE7mFi1mKPatfUV_ar-uI-Psx4Zg,5113 -pandas/tests/series/test_duplicates.py,sha256=mQJgFIBhmCCUeL1IP3qgz21yLb4Bo8beEEoRpA5FNmY,2473 -pandas/tests/series/test_iteration.py,sha256=3aPgneBFgn-jsKUyUTBuvPMEnhYMZLHWzFdqmY56A-I,1280 -pandas/tests/series/test_logical_ops.py,sha256=kv4186X0t_AYKs8MuwpjLfrfGJHNERi5tU94YQHVwU0,17530 -pandas/tests/series/test_missing.py,sha256=1aVM1qpDmUYCL2gDCmFQpSik3XsdfD66WCMMGjxWVMs,3301 -pandas/tests/series/test_npfuncs.py,sha256=zJgZDsPuaZub277gmntO50tO3yIBUIuvYLM4n6oEwe4,293 -pandas/tests/series/test_reductions.py,sha256=TBAMHDb7fFeUbdH38dJyfDlgc6Az5gBJ-fk0c5VB8Rk,3112 -pandas/tests/series/test_repr.py,sha256=8pzusesSaStZU0ZZGWdiioiEX9_ds9O-e__cmU-5ezg,14974 -pandas/tests/series/test_subclass.py,sha256=mXBNmZS5ZPubHO3xiIJQQJWZjfS9Li8tKxUd4TIrEBU,2084 -pandas/tests/series/test_ufunc.py,sha256=d9XSQsYYdLJzydKd61FxHWJHY9WmUF7DAPjPMEcJoic,9845 -pandas/tests/series/test_unary.py,sha256=6z_jMcvJvuCcrL7QeZgiR2FhDDV8AzzZY-vIMXOUswY,1755 -pandas/tests/series/test_validate.py,sha256=ziCmKi_jYuGyxcnsVaJpVgwSCjBgpHDJ0dbzWLa1-kA,668 -pandas/tests/test_aggregation.py,sha256=1_GdPm2RQ7Lcb4v8CEDBXg_4VS76hc2s8g2uj_cgrOw,2772 -pandas/tests/test_algos.py,sha256=h8G4g-CakjGmxkNeQ2ZrQ8XpqH7nmkVXwQ82ajoaln0,83590 -pandas/tests/test_common.py,sha256=NO55Vc8QTsG4Cv_z-32fniq6I0qQPy-HkjLMc3E_gfQ,4495 -pandas/tests/test_downstream.py,sha256=7FeOwlfj_6j6zOO8jQan3RqZ8DHP0deeTaRrpsgQG2w,5070 -pandas/tests/test_errors.py,sha256=0AJX0jzXB-KP8NrSA-EluL6O8INGVNJy6_koOQCEweg,1670 -pandas/tests/test_expressions.py,sha256=jX3Hk5qvn65vVQ6eYfhrKt3-jLDOcRt-Tlvt46lN9rc,13414 -pandas/tests/test_flags.py,sha256=Dsu6pvQ5A6Manyt1VlQLK8pRpZtr-S2T3ubJvRQaRlA,1550 -pandas/tests/test_multilevel.py,sha256=GWR3IQsePTZdQiNWRrBRRJQtrnM-APZNTCcdObyTTEY,14320 -pandas/tests/test_nanops.py,sha256=acg5JRR15h_FfS-THAp0IvndMim2qFO-xjYC9z0lNZU,38436 -pandas/tests/test_optional_dependency.py,sha256=5E8DOUBXInOLBEt_cRM9AuqW2Zv48OEjfmWf0v0F9GE,1518 -pandas/tests/test_register_accessor.py,sha256=wf9JnlWOm2gNlHFLVa8m3jo7uSwPrV6bfbee6X5NHsQ,2663 
-pandas/tests/test_sorting.py,sha256=Aic_RZRn4Pph0RSd52HAWxU5I3n5QpgMLehpO5tXo6o,18315 -pandas/tests/test_strings.py,sha256=HMTruFalO4R1TCX413JDfQR93rKqaZeAkXKAe6CaqAA,134551 -pandas/tests/test_take.py,sha256=-BQ9IT_m8KhwotGfRL6p2QbaoUBxcRpsM6YQD1vXMOA,16875 -pandas/tests/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/tools/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/tools/__pycache__/test_to_datetime.cpython-38.pyc,, -pandas/tests/tools/__pycache__/test_to_numeric.cpython-38.pyc,, -pandas/tests/tools/__pycache__/test_to_time.cpython-38.pyc,, -pandas/tests/tools/__pycache__/test_to_timedelta.cpython-38.pyc,, -pandas/tests/tools/test_to_datetime.py,sha256=IfV6BxN-DD2YyLTS7BDc3S1dbmXtOFQ8BrTbvZaZn3Q,93371 -pandas/tests/tools/test_to_numeric.py,sha256=-UUjUwKi7bgpy91T44VDvQeYpnEm_pjdEjP47VbZU4w,20578 -pandas/tests/tools/test_to_time.py,sha256=-wkkraMac9JpiMJkoN8mSbCryUzp_xY9blKl9WQCp0I,2019 -pandas/tests/tools/test_to_timedelta.py,sha256=iUO39DofFaj-KkBrQK7ZPwwZMwxCxoehHDEQMGZInf8,8388 -pandas/tests/tseries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/tseries/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/tseries/frequencies/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/tseries/frequencies/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/tseries/frequencies/__pycache__/test_freq_code.cpython-38.pyc,, -pandas/tests/tseries/frequencies/__pycache__/test_frequencies.cpython-38.pyc,, -pandas/tests/tseries/frequencies/__pycache__/test_inference.cpython-38.pyc,, -pandas/tests/tseries/frequencies/test_freq_code.py,sha256=oGae3OrBChbbbfOs7YuuOHY5BZ5ExGWiie3cqjAFjV8,1991 -pandas/tests/tseries/frequencies/test_frequencies.py,sha256=5eU-nFa3pzpwd-PkO2uV4h-qSQbWdGjarUoSBSt2x7Y,808 -pandas/tests/tseries/frequencies/test_inference.py,sha256=bhbMbHXzupuNomk1RppTGDfjHZ59sRDRmXz2jUcERvs,14301 -pandas/tests/tseries/holiday/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/tseries/holiday/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/tseries/holiday/__pycache__/test_calendar.cpython-38.pyc,, -pandas/tests/tseries/holiday/__pycache__/test_federal.cpython-38.pyc,, -pandas/tests/tseries/holiday/__pycache__/test_holiday.cpython-38.pyc,, -pandas/tests/tseries/holiday/__pycache__/test_observance.cpython-38.pyc,, -pandas/tests/tseries/holiday/test_calendar.py,sha256=67ajDLqNmu5KsojIQQnBrWKHG_TRbaLRwWczqKKBxM8,3514 -pandas/tests/tseries/holiday/test_federal.py,sha256=TPMPlc2skaMCNbeJ8gDYS7JwsAZFQ16EjdJpN4pQysY,1157 -pandas/tests/tseries/holiday/test_holiday.py,sha256=A3OEN0TifKGr9DQlpGMGH0aI05bVWtFf1ukYFtJVn-8,8636 -pandas/tests/tseries/holiday/test_observance.py,sha256=GJBqIF4W6QG4k3Yzz6_13WMOR4nHSVzPbixHxO8Tukw,2723 -pandas/tests/tseries/offsets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/tseries/offsets/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/common.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/test_fiscal.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/test_offsets.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/test_offsets_properties.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/test_ticks.cpython-38.pyc,, -pandas/tests/tseries/offsets/__pycache__/test_yqm_offsets.cpython-38.pyc,, -pandas/tests/tseries/offsets/common.py,sha256=P1rsSZLByDzsn-a2A8z24sfVUCV2117ouMyXXyK-1eo,748 
-pandas/tests/tseries/offsets/conftest.py,sha256=EToa7cYjNUnSlQY2dMJbI6SrFJkB5hQNOhgUaq13BjE,643 -pandas/tests/tseries/offsets/test_fiscal.py,sha256=R72tbmbrK-YBZnnuAV1Iq2fVa0a5DSFZqXUxbNLIx0U,28005 -pandas/tests/tseries/offsets/test_offsets.py,sha256=KiJXSaaA1OIcXoPiqQcikKxHze2cQ8LdORHaq3cc2UE,163495 -pandas/tests/tseries/offsets/test_offsets_properties.py,sha256=mAQMlSNOnG7O46QuvSfYLXz9Spl22gtQMEV90W5-AU4,3474 -pandas/tests/tseries/offsets/test_ticks.py,sha256=745kjPR92Hmqreuy2pkSIlD3Uc6ejA0OnJzkAXHBoTI,10672 -pandas/tests/tseries/offsets/test_yqm_offsets.py,sha256=i6QebgTSJKLmg4upBpfDh2ij5IwGKQgImreFq9POpFs,50930 -pandas/tests/tslibs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/tslibs/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_api.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_array_to_datetime.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_ccalendar.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_conversion.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_fields.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_libfrequencies.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_liboffsets.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_parse_iso8601.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_parsing.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_period_asfreq.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_timedeltas.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_timezones.cpython-38.pyc,, -pandas/tests/tslibs/__pycache__/test_to_offset.cpython-38.pyc,, -pandas/tests/tslibs/test_api.py,sha256=4oRMvbX4LqH6uAdEMtdCq27EetY7DzPdp1U-yjoQ7co,1241 -pandas/tests/tslibs/test_array_to_datetime.py,sha256=v2l22eFeN6AX7VV7Hj4wDINB9ElGu92d1cF99UYSnys,6091 -pandas/tests/tslibs/test_ccalendar.py,sha256=T3-v4p6y4V3w9Heea-gW5CGhlnfF7kqaeuew8kmcCag,1948 -pandas/tests/tslibs/test_conversion.py,sha256=-N3j4yh1OspTkhrIjou812ZGzFiogl9AGO6C4UuRG74,3973 -pandas/tests/tslibs/test_fields.py,sha256=nX-fUHm4_d_XWPT3L8_QRac4QP2BqNwgOyUgqfbEFBM,1123 -pandas/tests/tslibs/test_libfrequencies.py,sha256=1aQnyjAA2F2-xfTlTa081uVE3dTBb2CdkYv8Cry5Gn0,769 -pandas/tests/tslibs/test_liboffsets.py,sha256=NKIfmAvmwijrImQJ6ukG-klGnTX5jBCdU0naYf3h5m8,5095 -pandas/tests/tslibs/test_parse_iso8601.py,sha256=XIidGrTdVtTyauCww9brdCCIcnbXxXuWYqREVorC66E,2069 -pandas/tests/tslibs/test_parsing.py,sha256=UzZ_P_Wx0uL8HKxsOZNwIZjIcm4cNBGVN9kv3rpAdpo,6592 -pandas/tests/tslibs/test_period_asfreq.py,sha256=xePH_inLJA_oRcyZE76QIkW6XEPmiT_v03okhH1snec,2313 -pandas/tests/tslibs/test_timedeltas.py,sha256=2_GQmwHkliJCfNl4ziMgtINWyM74VysYBhLNApMBUQU,960 -pandas/tests/tslibs/test_timezones.py,sha256=KYon3u_vJivo-ULCDk_G6m6gyFT-ABYhkURpVdSHbiM,4089 -pandas/tests/tslibs/test_to_offset.py,sha256=GapjHr9FLpxVk9NprtQ5o3l_H-kcPq1hMlzmenUIN6I,4769 -pandas/tests/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/util/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/util/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_almost_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_categorical_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_extension_array_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_frame_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_index_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_interval_array_equal.cpython-38.pyc,, 
-pandas/tests/util/__pycache__/test_assert_numpy_array_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_produces_warning.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_assert_series_equal.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_deprecate.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_deprecate_kwarg.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_deprecate_nonkeyword_arguments.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_doc.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_hashing.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_numba.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_safe_import.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_show_versions.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_util.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_validate_args.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_validate_args_and_kwargs.cpython-38.pyc,, -pandas/tests/util/__pycache__/test_validate_kwargs.cpython-38.pyc,, -pandas/tests/util/conftest.py,sha256=loEbQsEtHtv-T4Umeq_UeV6R7s8SO01GHbW6gn8lvlo,476 -pandas/tests/util/test_assert_almost_equal.py,sha256=13YLIY2yLTU39Cstt4Eyu_JJkqc0fo7UtShN0knzebM,12536 -pandas/tests/util/test_assert_categorical_equal.py,sha256=l0eBVe0b0Vs0-Av22MqkSqHklSwFKnlNNezsQPZWvOE,2748 -pandas/tests/util/test_assert_extension_array_equal.py,sha256=6HhnzmEM3Z3JzIsmnatR-C8Z4eaPV3hngfRZd_k1ATU,3464 -pandas/tests/util/test_assert_frame_equal.py,sha256=-JUDb3x_L44T7z1NvxczDe6Mugi8Hylv2Gr4SXSo4gA,9861 -pandas/tests/util/test_assert_index_equal.py,sha256=yubFWMb2vLuLRqf4qOXA1ld-iwav5h2IOVv3FA75clY,6392 -pandas/tests/util/test_assert_interval_array_equal.py,sha256=ITqL0Z8AAy5D1knACPOHodI64AHxmNzxiG-i9FeU0b8,2158 -pandas/tests/util/test_assert_numpy_array_equal.py,sha256=Fw7IEFDA70TBmaLb_YXbETSG6JdeEwRZZYH54T_b2JQ,6361 -pandas/tests/util/test_assert_produces_warning.py,sha256=i0_Q3RIyfekTPzqantI_FSgugyYEs4LpxxOGts05ga4,5040 -pandas/tests/util/test_assert_series_equal.py,sha256=lhgGoW2b07jL6cXBu3g5Yw8VzAIQ0XgHEVP6OemMiKM,9605 -pandas/tests/util/test_deprecate.py,sha256=oZXuNwUnS_hAcMWPgl9ErjGCZSs4beoaivnsOTQzIys,1626 -pandas/tests/util/test_deprecate_kwarg.py,sha256=7T2QkCxXUoJHhCxUjAH_5_hM-BHC6nPWG635LFY35lo,2043 -pandas/tests/util/test_deprecate_nonkeyword_arguments.py,sha256=iE_lNxkFHyuuBf6wsxSwoePYrw9LE_ty3sBOY8mO-hM,2713 -pandas/tests/util/test_doc.py,sha256=u0fxCg4zZWhB4SkJYc2huQ0xv7sKKAt0OlpWldmhh_M,1492 -pandas/tests/util/test_hashing.py,sha256=V73xWNPOG3zoASuO-UsYtvhNpLob8TARB72fRVta1Y4,11014 -pandas/tests/util/test_numba.py,sha256=6eOVcokESth7h6yyeehVizx61FtwDdVbF8wV8j3t-Ic,308 -pandas/tests/util/test_safe_import.py,sha256=UxH90Ju9wyQ7Rs7SduRj3dkxroyehIwaWbBEz3ZzvEw,1020 -pandas/tests/util/test_show_versions.py,sha256=MsjRjtI0WnCNSCSg_7dA4UZwhtOZiogyT5GxV0TiRHs,1241 -pandas/tests/util/test_util.py,sha256=P3fQqMjLt1sL0jOKYj_nYIyeiP2PwDXIy4BPUrf_c6k,1982 -pandas/tests/util/test_validate_args.py,sha256=ygRn_KXeO86BTvXnNDGWmnWsWygqoItKk7Ob1I2-9Wo,1842 -pandas/tests/util/test_validate_args_and_kwargs.py,sha256=OFROeLC6jsezEgz3SypUMUl5VHPQGgj49eZYsMOQ9-s,2391 -pandas/tests/util/test_validate_kwargs.py,sha256=SfWa0wBgTLx8XcdFvWj8OhN3ll6Lt6KuXnZeGW3Qxdw,1742 -pandas/tests/window/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/window/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/window/__pycache__/conftest.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_api.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_apply.cpython-38.pyc,, 
-pandas/tests/window/__pycache__/test_base_indexer.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_dtypes.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_ewm.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_expanding.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_groupby.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_numba.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_pairwise.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_rolling.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_timeseries_window.cpython-38.pyc,, -pandas/tests/window/__pycache__/test_win_type.cpython-38.pyc,, -pandas/tests/window/conftest.py,sha256=by8tHGSEfGhk59Ri7FQhLwvtXPCRBiJCcvlI5hYd64E,8986 -pandas/tests/window/moments/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -pandas/tests/window/moments/__pycache__/__init__.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_consistency_ewm.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_consistency_expanding.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_consistency_rolling.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_ewm.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_rolling.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_rolling_apply.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_rolling_functions.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_rolling_quantile.cpython-38.pyc,, -pandas/tests/window/moments/__pycache__/test_moments_rolling_skew_kurt.cpython-38.pyc,, -pandas/tests/window/moments/test_moments_consistency_ewm.py,sha256=1yQhwx1fBUwIaN0wnzGQRLRD6n3tMjIvz1JQHMDPsRw,11399 -pandas/tests/window/moments/test_moments_consistency_expanding.py,sha256=H9JvJFoPvJzHlPj-MU8gleyhPKeRmdg18h_V1hNYTHw,18385 -pandas/tests/window/moments/test_moments_consistency_rolling.py,sha256=jD4JQoFpBYCd9OOblZuFU0AcUiIZRP3chxrxaJTXcn4,24236 -pandas/tests/window/moments/test_moments_ewm.py,sha256=PG8Bp1f9iwQityez9s9b8wLhDmvRGhp4h_9vKvM7wcM,10398 -pandas/tests/window/moments/test_moments_rolling.py,sha256=7k4-WTGhWdNcajGgKPAAGR8jRtju_tjFTyP3aotPzZg,15849 -pandas/tests/window/moments/test_moments_rolling_apply.py,sha256=O3_wdl7yUEGoTb3ui8m2yjVW5kwI-ezlDDH19vBYqac,4426 -pandas/tests/window/moments/test_moments_rolling_functions.py,sha256=gBDsWp-RzCVQk3io4LsCSMylIW9RCuHLquBMinkMc78,9762 -pandas/tests/window/moments/test_moments_rolling_quantile.py,sha256=7ZA4JX-zbDQm5c_fRdwXWuP_1UC2VR_X6dGGa828foY,5059 -pandas/tests/window/moments/test_moments_rolling_skew_kurt.py,sha256=708JmdaXAjhKqZf7shLBy_xGdqEBSBGjQ8IZwSrWzUY,5409 -pandas/tests/window/test_api.py,sha256=zef8Wr70tAVgZZPUrY1AoIGdWK6tl8Wxez_HXsTN6Xs,9525 -pandas/tests/window/test_apply.py,sha256=TmKplJzaMWigloT2CaM2Rgz4QHW4pxvsEpW0N9-ojqc,4869 -pandas/tests/window/test_base_indexer.py,sha256=DNlgJNAHSck7mSLtSLeaM68_bjuGTFiXyaIFyPSvDLg,9636 -pandas/tests/window/test_dtypes.py,sha256=u9l_1J-gCehy5DOsdHvfHXlVxcnGnCa1esAavYp6mPY,5064 -pandas/tests/window/test_ewm.py,sha256=PPflJAb5o8gUQ80zffX14EmZ76hb7pYWHDLW8rGwCtA,4117 -pandas/tests/window/test_expanding.py,sha256=iRUpQVrZ6WxyHHxMYnLkQ_dUsGdgmhYyPlrcpL2QPUU,8137 -pandas/tests/window/test_groupby.py,sha256=I9308aQhgh36F6_EMAsoPoaC6xD0L8kQV4MNcG9LSeM,26540 -pandas/tests/window/test_numba.py,sha256=UZdbNMWjO2OwgKumFnl-zNvLLRc3HOiYk8mjulNXp0g,4370 -pandas/tests/window/test_pairwise.py,sha256=I8W4iInD2X6FNHxjOGqgSksFOwXoiPwWoFc31aPsYjs,8724 
diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/REQUESTED b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/REQUESTED deleted file mode 100644 index e69de29..0000000 diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/WHEEL b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/WHEEL deleted file mode 100644 index d70ba8e..0000000 --- a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/WHEEL +++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: bdist_wheel (0.36.2) -Root-Is-Purelib: false -Tag: cp38-cp38-macosx_10_9_x86_64 - diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/entry_points.txt b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/entry_points.txt deleted file mode 100644 index 3c1b523..0000000 --- a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/entry_points.txt +++ /dev/null @@ -1,3 +0,0 @@ -[pandas_plotting_backends] -matplotlib = pandas:plotting._matplotlib - diff --git a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/top_level.txt b/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/top_level.txt deleted file mode 100644 index fb6c7ed..0000000 ---
a/venv/lib/python3.8/site-packages/pandas-1.2.0.dist-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -pandas diff --git a/venv/lib/python3.8/site-packages/pandas/__init__.py b/venv/lib/python3.8/site-packages/pandas/__init__.py index cc5d835..d6584bf 100644 --- a/venv/lib/python3.8/site-packages/pandas/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/__init__.py @@ -20,9 +20,10 @@ del hard_dependencies, dependency, missing_dependencies # numpy compat from pandas.compat.numpy import ( - np_version_under1p17 as _np_version_under1p17, - np_version_under1p18 as _np_version_under1p18, - is_numpy_dev as _is_numpy_dev, + _np_version_under1p16, + _np_version_under1p17, + _np_version_under1p18, + _is_numpy_dev, ) try: @@ -33,7 +34,7 @@ except ImportError as e: # pragma: no cover raise ImportError( f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext --force' to build the C extensions first." + "'python setup.py build_ext --inplace --force' to build the C extensions first." ) from e from pandas._config import ( @@ -58,8 +59,6 @@ from pandas.core.api import ( UInt16Dtype, UInt32Dtype, UInt64Dtype, - Float32Dtype, - Float64Dtype, CategoricalDtype, PeriodDtype, IntervalDtype, @@ -102,7 +101,6 @@ from pandas.core.api import ( to_datetime, to_timedelta, # misc - Flags, Grouper, factorize, unique, @@ -187,61 +185,181 @@ __version__ = v.get("closest-tag", v["version"]) __git_version__ = v.get("full-revisionid") del get_versions, v - # GH 27101 -def __getattr__(name): - import warnings +# TODO: remove Panel compat in 1.0 +if pandas.compat.PY37: - if name == "datetime": - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. " - "Import from datetime module instead.", - FutureWarning, - stacklevel=2, - ) + def __getattr__(name): + import warnings + + if name == "Panel": + + warnings.warn( + "The Panel class is removed from pandas. Accessing it " + "from the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) + + class Panel: + pass + + return Panel + + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) + + from datetime import datetime as dt + + return dt + + elif name == "np": + + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np + + return np + + elif name in {"SparseSeries", "SparseDataFrame"}: + warnings.warn( + f"The {name} class is removed from pandas. Accessing it from " + "the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) + + return type(name, (), {}) + + elif name == "SparseArray": + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. 
" + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray + + return _SparseArray + + raise AttributeError(f"module 'pandas' has no attribute '{name}'") + + +else: + + class Panel: + pass + + class SparseDataFrame: + pass + + class SparseSeries: + pass + + class __numpy: + def __init__(self): + import numpy as np + import warnings + + self.np = np + self.warnings = warnings + + def __getattr__(self, item): + self.warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + + try: + return getattr(self.np, item) + except AttributeError as err: + raise AttributeError(f"module numpy has no attribute {item}") from err + + np = __numpy() + + class __Datetime(type): from datetime import datetime as dt - return dt + datetime = dt - elif name == "np": + def __getattr__(cls, item): + cls.emit_warning() - warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. " - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - import numpy as np + try: + return getattr(cls.datetime, item) + except AttributeError as err: + raise AttributeError( + f"module datetime has no attribute {item}" + ) from err - return np + def __instancecheck__(cls, other): + return isinstance(other, cls.datetime) - elif name in {"SparseSeries", "SparseDataFrame"}: - warnings.warn( - f"The {name} class is removed from pandas. Accessing it from " - "the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) + class __DatetimeSub(metaclass=__Datetime): + def emit_warning(dummy=0): + import warnings - return type(name, (), {}) + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime instead.", + FutureWarning, + stacklevel=3, + ) - elif name == "SparseArray": + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from datetime import datetime as dt - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=2, - ) - from pandas.core.arrays.sparse import SparseArray as _SparseArray + return dt(*args, **kwargs) - return _SparseArray + datetime = __DatetimeSub - raise AttributeError(f"module 'pandas' has no attribute '{name}'") + class __SparseArray(type): + + from pandas.core.arrays.sparse import SparseArray as sa + + SparseArray = sa + + def __instancecheck__(cls, other): + return isinstance(other, cls.SparseArray) + + class __SparseArraySub(metaclass=__SparseArray): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. 
" + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from pandas.core.arrays.sparse import SparseArray as sa + + return sa(*args, **kwargs) + + SparseArray = __SparseArraySub # module level doc-string diff --git a/venv/lib/python3.8/site-packages/pandas/_config/config.py b/venv/lib/python3.8/site-packages/pandas/_config/config.py index 512b638..d7b73a0 100644 --- a/venv/lib/python3.8/site-packages/pandas/_config/config.py +++ b/venv/lib/python3.8/site-packages/pandas/_config/config.py @@ -392,7 +392,7 @@ class option_context(ContextDecorator): """ def __init__(self, *args): - if len(args) % 2 != 0 or len(args) < 2: + if not (len(args) % 2 == 0 and len(args) >= 2): raise ValueError( "Need to invoke as option_context(pat, val, [(pat, val), ...])." ) @@ -460,7 +460,9 @@ def register_option( path = key.split(".") for k in path: - if not re.match("^" + tokenize.Name + "$", k): + # NOTE: tokenize.Name is not a public constant + # error: Module has no attribute "Name" [attr-defined] + if not re.match("^" + tokenize.Name + "$", k): # type: ignore raise ValueError(f"{k} is not a valid identifier") if keyword.iskeyword(k): raise ValueError(f"{k} is a python keyword") @@ -648,7 +650,7 @@ def _build_option_description(k: str) -> str: s += f"\n [default: {o.defval}] [currently: {_get_option(k, True)}]" if d: - rkey = d.rkey or "" + rkey = d.rkey if d.rkey else "" s += "\n (Deprecated" s += f", use `{rkey}` instead." s += ")" diff --git a/venv/lib/python3.8/site-packages/pandas/_config/display.py b/venv/lib/python3.8/site-packages/pandas/_config/display.py index e4553a2..ef319f4 100644 --- a/venv/lib/python3.8/site-packages/pandas/_config/display.py +++ b/venv/lib/python3.8/site-packages/pandas/_config/display.py @@ -22,7 +22,7 @@ def detect_console_encoding() -> str: encoding = None try: encoding = sys.stdout.encoding or sys.stdin.encoding - except (AttributeError, OSError): + except (AttributeError, IOError): pass # try again for something better diff --git a/venv/lib/python3.8/site-packages/pandas/_config/localization.py b/venv/lib/python3.8/site-packages/pandas/_config/localization.py index bc76aca..66865e1 100644 --- a/venv/lib/python3.8/site-packages/pandas/_config/localization.py +++ b/venv/lib/python3.8/site-packages/pandas/_config/localization.py @@ -88,18 +88,17 @@ def _valid_locales(locales, normalize): valid_locales : list A list of valid locales. 
""" - return [ - loc - for loc in ( - locale.normalize(loc.strip()) if normalize else loc.strip() - for loc in locales - ) - if can_set_locale(loc) - ] + if normalize: + normalizer = lambda x: locale.normalize(x.strip()) + else: + normalizer = lambda x: x.strip() + + return list(filter(can_set_locale, map(normalizer, locales))) def _default_locale_getter(): - return subprocess.check_output(["locale -a"], shell=True) + raw_locales = subprocess.check_output(["locale -a"], shell=True) + return raw_locales def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_getter): diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/algos.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/algos.cpython-38-darwin.so index 2c43108..a16c5e5 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/algos.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/algos.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/groupby.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/groupby.cpython-38-darwin.so index 6e2d5b5..cb342d5 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/groupby.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/groupby.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/hashing.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/hashing.cpython-38-darwin.so index 12fdecc..6a4d9bb 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/hashing.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/hashing.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/hashtable.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/hashtable.cpython-38-darwin.so index 6308964..975c760 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/hashtable.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/hashtable.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/index.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/index.cpython-38-darwin.so index cbbeb12..01e59bb 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/index.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/index.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/indexing.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/indexing.cpython-38-darwin.so index a3c8a2e..76699b0 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/indexing.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/indexing.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-darwin.so index ee23c01..0554577 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/internals.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/interval.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/interval.cpython-38-darwin.so index 65bcf02..2560b46 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/interval.cpython-38-darwin.so and 
b/venv/lib/python3.8/site-packages/pandas/_libs/interval.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/join.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/join.cpython-38-darwin.so index 350ec8e..18c1b9b 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/join.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/join.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/json.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/json.cpython-38-darwin.so index 2eb5e7f..40191f9 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/json.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/json.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/lib.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/lib.cpython-38-darwin.so index 27e007f..8be32c8 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/lib.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/lib.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/missing.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/missing.cpython-38-darwin.so index 08c23ab..c65a74b 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/missing.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/missing.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/ops.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/ops.cpython-38-darwin.so index 818ffe2..50e45ac 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/ops.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/ops.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/ops_dispatch.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/ops_dispatch.cpython-38-darwin.so index 8a91f83..09dcbb6 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/ops_dispatch.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/ops_dispatch.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/parsers.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/parsers.cpython-38-darwin.so index 03dff5a..f8b362c 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/parsers.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/parsers.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/properties.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/properties.cpython-38-darwin.so index 12fda2d..2986fdf 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/properties.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/properties.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/reduction.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/reduction.cpython-38-darwin.so index 15195d6..567e951 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/reduction.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/reduction.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/reshape.cpython-38-darwin.so 
b/venv/lib/python3.8/site-packages/pandas/_libs/reshape.cpython-38-darwin.so index 5cb006e..d5694d1 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/reshape.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/reshape.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/sparse.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/sparse.cpython-38-darwin.so index 655a8ef..e685660 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/sparse.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/sparse.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/testing.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/testing.cpython-38-darwin.so index 41faf38..5b317cd 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/testing.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/testing.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslib.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslib.cpython-38-darwin.so index b945769..c2390d2 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslib.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslib.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/base.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/base.cpython-38-darwin.so index 84dbf26..59fbb8f 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/base.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/base.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/ccalendar.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/ccalendar.cpython-38-darwin.so index 407addd..6c0b210 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/ccalendar.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/ccalendar.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/conversion.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/conversion.cpython-38-darwin.so index 765daec..ae7b370 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/conversion.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/conversion.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/dtypes.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/dtypes.cpython-38-darwin.so index dd863b3..ed97959 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/dtypes.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/dtypes.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/fields.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/fields.cpython-38-darwin.so index 52a3297..35b560c 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/fields.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/fields.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/nattype.cpython-38-darwin.so 
b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/nattype.cpython-38-darwin.so index ac3b01d..3a10941 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/nattype.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/nattype.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/np_datetime.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/np_datetime.cpython-38-darwin.so index 6ce9ac5..3a7389a 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/np_datetime.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/np_datetime.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/offsets.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/offsets.cpython-38-darwin.so index 8553e44..6c25d43 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/offsets.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/offsets.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/parsing.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/parsing.cpython-38-darwin.so index aa28d33..e9d3e0e 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/parsing.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/parsing.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/period.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/period.cpython-38-darwin.so index e30584f..210797c 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/period.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/period.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/strptime.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/strptime.cpython-38-darwin.so index 747bc0d..01a1541 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/strptime.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/strptime.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timedeltas.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timedeltas.cpython-38-darwin.so index 63074c8..68ca772 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timedeltas.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timedeltas.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timestamps.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timestamps.cpython-38-darwin.so index a710885..ccaf616 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timestamps.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timestamps.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timezones.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timezones.cpython-38-darwin.so index 50bbafb..f515317 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timezones.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/timezones.cpython-38-darwin.so 
differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/tzconversion.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/tzconversion.cpython-38-darwin.so index 60ad820..6b40f7d 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/tzconversion.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/tzconversion.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/vectorized.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/vectorized.cpython-38-darwin.so index 03ce051..54053a6 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/vectorized.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/tslibs/vectorized.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/window/aggregations.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/window/aggregations.cpython-38-darwin.so index 31572af..ef4b5e0 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/window/aggregations.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/window/aggregations.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/window/indexers.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/window/indexers.cpython-38-darwin.so index 45e0847..879513e 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/window/indexers.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/window/indexers.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_libs/writers.cpython-38-darwin.so b/venv/lib/python3.8/site-packages/pandas/_libs/writers.cpython-38-darwin.so index 28a911a..c1b8b6e 100755 Binary files a/venv/lib/python3.8/site-packages/pandas/_libs/writers.cpython-38-darwin.so and b/venv/lib/python3.8/site-packages/pandas/_libs/writers.cpython-38-darwin.so differ diff --git a/venv/lib/python3.8/site-packages/pandas/_testing.py b/venv/lib/python3.8/site-packages/pandas/_testing.py index 73b1dcf..ef6232f 100644 --- a/venv/lib/python3.8/site-packages/pandas/_testing.py +++ b/venv/lib/python3.8/site-packages/pandas/_testing.py @@ -6,7 +6,6 @@ from functools import wraps import gzip import operator import os -import re from shutil import rmtree import string import tempfile @@ -26,7 +25,7 @@ from pandas._config.localization import ( # noqa:F401 from pandas._libs.lib import no_default import pandas._libs.testing as _testing from pandas._typing import Dtype, FilePathOrBuffer, FrameOrSeries -from pandas.compat import get_lzma_file, import_lzma +from pandas.compat import _get_lzma_file, _import_lzma from pandas.core.dtypes.common import ( is_bool, @@ -71,7 +70,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing -lzma = import_lzma() +lzma = _import_lzma() _N = 30 _K = 4 @@ -85,7 +84,6 @@ ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES FLOAT_DTYPES: List[Dtype] = [float, "float32", "float64"] -FLOAT_EA_DTYPES: List[Dtype] = ["Float32", "Float64"] COMPLEX_DTYPES: List[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: List[Dtype] = [str, "str", "U"] @@ -108,8 +106,6 @@ ALL_NUMPY_DTYPES = ( + BYTES_DTYPES ) -NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA] - # 
set testing_mode _testing_mode_warnings = (DeprecationWarning, ResourceWarning) @@ -119,24 +115,14 @@ def set_testing_mode(): # set the testing mode filters testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") if "deprecate" in testing_mode: - # pandas\_testing.py:119: error: Argument 2 to "simplefilter" has - # incompatible type "Tuple[Type[DeprecationWarning], - # Type[ResourceWarning]]"; expected "Type[Warning]" - warnings.simplefilter( - "always", _testing_mode_warnings # type: ignore[arg-type] - ) + warnings.simplefilter("always", _testing_mode_warnings) def reset_testing_mode(): # reset the testing mode filters testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") if "deprecate" in testing_mode: - # pandas\_testing.py:126: error: Argument 2 to "simplefilter" has - # incompatible type "Tuple[Type[DeprecationWarning], - # Type[ResourceWarning]]"; expected "Type[Warning]" - warnings.simplefilter( - "ignore", _testing_mode_warnings # type: ignore[arg-type] - ) + warnings.simplefilter("ignore", _testing_mode_warnings) set_testing_mode() @@ -253,22 +239,16 @@ def decompress_file(path, compression): if compression is None: f = open(path, "rb") elif compression == "gzip": - # pandas\_testing.py:243: error: Incompatible types in assignment - # (expression has type "IO[Any]", variable has type "BinaryIO") - f = gzip.open(path, "rb") # type: ignore[assignment] + f = gzip.open(path, "rb") elif compression == "bz2": - # pandas\_testing.py:245: error: Incompatible types in assignment - # (expression has type "BZ2File", variable has type "BinaryIO") - f = bz2.BZ2File(path, "rb") # type: ignore[assignment] + f = bz2.BZ2File(path, "rb") elif compression == "xz": - f = get_lzma_file(lzma)(path, "rb") + f = _get_lzma_file(lzma)(path, "rb") elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() if len(zip_names) == 1: - # pandas\_testing.py:252: error: Incompatible types in assignment - # (expression has type "IO[bytes]", variable has type "BinaryIO") - f = zip_file.open(zip_names.pop()) # type: ignore[assignment] + f = zip_file.open(zip_names.pop()) else: raise ValueError(f"ZIP file {path} error. 
Only one file per ZIP.") else: @@ -304,17 +284,11 @@ def write_to_compressed(compression, path, data, dest="test"): if compression == "zip": compress_method = zipfile.ZipFile elif compression == "gzip": - # pandas\_testing.py:288: error: Incompatible types in assignment - # (expression has type "Type[GzipFile]", variable has type - # "Type[ZipFile]") - compress_method = gzip.GzipFile # type: ignore[assignment] + compress_method = gzip.GzipFile elif compression == "bz2": - # pandas\_testing.py:290: error: Incompatible types in assignment - # (expression has type "Type[BZ2File]", variable has type - # "Type[ZipFile]") - compress_method = bz2.BZ2File # type: ignore[assignment] + compress_method = bz2.BZ2File elif compression == "xz": - compress_method = get_lzma_file(lzma) + compress_method = _get_lzma_file(lzma) else: raise ValueError(f"Unrecognized compression type: {compression}") @@ -324,10 +298,7 @@ def write_to_compressed(compression, path, data, dest="test"): method = "writestr" else: mode = "wb" - # pandas\_testing.py:302: error: Incompatible types in assignment - # (expression has type "Tuple[Any]", variable has type "Tuple[Any, - # Any]") - args = (data,) # type: ignore[assignment] + args = (data,) method = "write" with compress_method(path, mode=mode) as f: @@ -694,7 +665,6 @@ def assert_index_equal( check_less_precise: Union[bool, int] = no_default, check_exact: bool = True, check_categorical: bool = True, - check_order: bool = True, rtol: float = 1.0e-5, atol: float = 1.0e-8, obj: str = "Index", @@ -724,12 +694,6 @@ def assert_index_equal( Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. - check_order : bool, default True - Whether to compare the order of index entries as well as their values. - If True, both indexes must contain the same elements, in the same order. - If False, both indexes must contain the same elements, but in any order. - - .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -741,36 +705,30 @@ def assert_index_equal( obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. 
- - Examples - -------- - >>> from pandas.testing import assert_index_equal - >>> a = pd.Index([1, 2, 3]) - >>> b = pd.Index([1, 2, 3]) - >>> assert_index_equal(a, b) """ __tracebackhide__ = True - def _check_types(left, right, obj="Index"): + def _check_types(l, r, obj="Index"): if exact: - assert_class_equal(left, right, exact=exact, obj=obj) + assert_class_equal(l, r, exact=exact, obj=obj) # Skip exact dtype checking when `check_categorical` is False if check_categorical: - assert_attr_equal("dtype", left, right, obj=obj) + assert_attr_equal("dtype", l, r, obj=obj) # allow string-like to have different inferred_types - if left.inferred_type in ("string"): - assert right.inferred_type in ("string") + if l.inferred_type in ("string"): + assert r.inferred_type in ("string") else: - assert_attr_equal("inferred_type", left, right, obj=obj) + assert_attr_equal("inferred_type", l, r, obj=obj) def _get_ilevel_values(index, level): # accept level number only unique = index.levels[level] level_codes = index.codes[level] filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) - return unique._shallow_copy(filled, name=index.names[level]) + values = unique._shallow_copy(filled, name=index.names[level]) + return values if check_less_precise is not no_default: warnings.warn( @@ -802,11 +760,6 @@ def assert_index_equal( msg3 = f"{len(right)}, {right}" raise_assert_detail(obj, msg1, msg2, msg3) - # If order doesn't matter then sort the index entries - if not check_order: - left = left.sort_values() - right = right.sort_values() - # MultiIndex special comparison for little-friendly error messages if left.nlevels > 1: left = cast(MultiIndex, left) @@ -986,7 +939,7 @@ def assert_categorical_equal( if check_category_order: assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") assert_numpy_array_equal( - left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes" + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", ) else: try: @@ -995,7 +948,9 @@ def assert_categorical_equal( except TypeError: # e.g. 
'<' not supported between instances of 'int' and 'str' lc, rc = left.categories, right.categories - assert_index_equal(lc, rc, obj=f"{obj}.categories") + assert_index_equal( + lc, rc, obj=f"{obj}.categories", + ) assert_index_equal( left.categories.take(left.codes), right.categories.take(right.codes), @@ -1023,14 +978,8 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") """ _check_isinstance(left, right, IntervalArray) - kwargs = {} - if left._left.dtype.kind in ["m", "M"]: - # We have a DatetimeArray or TimedeltaArray - kwargs["check_freq"] = False - - assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs) - assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs) - + assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") + assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") assert_attr_equal("closed", left, right, obj=obj) @@ -1041,22 +990,20 @@ def assert_period_array_equal(left, right, obj="PeriodArray"): assert_attr_equal("freq", left, right, obj=obj) -def assert_datetime_array_equal(left, right, obj="DatetimeArray", check_freq=True): +def assert_datetime_array_equal(left, right, obj="DatetimeArray"): __tracebackhide__ = True _check_isinstance(left, right, DatetimeArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - if check_freq: - assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("freq", left, right, obj=obj) assert_attr_equal("tz", left, right, obj=obj) -def assert_timedelta_array_equal(left, right, obj="TimedeltaArray", check_freq=True): +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): __tracebackhide__ = True _check_isinstance(left, right, TimedeltaArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - if check_freq: - assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("freq", left, right, obj=obj) def raise_assert_detail(obj, message, left, right, diff=None, index_values=None): @@ -1145,13 +1092,13 @@ def assert_numpy_array_equal( if err_msg is None: if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape + obj, f"{obj} shapes are different", left.shape, right.shape, ) diff = 0 - for left_arr, right_arr in zip(left, right): + for l, r in zip(left, right): # count up differences - if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan): + if not array_equivalent(l, r, strict_nan=strict_nan): diff += 1 diff = diff * 100.0 / left.size @@ -1214,13 +1161,6 @@ def assert_extension_array_equal( Missing values are checked separately from valid values. A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. - - Examples - -------- - >>> from pandas.testing import assert_extension_array_equal - >>> a = pd.Series([1, 2, 3, 4]) - >>> b, c = a.array, a.array - >>> assert_extension_array_equal(b, c) """ if check_less_precise is not no_default: warnings.warn( @@ -1287,7 +1227,6 @@ def assert_series_equal( check_categorical=True, check_category_order=True, check_freq=True, - check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", @@ -1334,11 +1273,6 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. - check_flags : bool, default True - Whether to check the `flags` attribute. - - .. 
versionadded:: 1.2.0 - rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -1350,13 +1284,6 @@ def assert_series_equal( obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. - - Examples - -------- - >>> from pandas.testing import assert_series_equal - >>> a = pd.Series([1, 2, 3, 4]) - >>> b = pd.Series([1, 2, 3, 4]) - >>> assert_series_equal(a, b) """ __tracebackhide__ = True @@ -1382,9 +1309,6 @@ def assert_series_equal( msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) - if check_flags: - assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" - # index comparison assert_index_equal( left.index, @@ -1458,16 +1382,7 @@ def assert_series_equal( check_dtype=check_dtype, index_values=np.asarray(left.index), ) - elif is_extension_array_dtype_and_needs_i8_conversion( - left.dtype, right.dtype - ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype): - assert_extension_array_equal( - left._values, - right._values, - check_dtype=check_dtype, - index_values=np.asarray(left.index), - ) - elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): + elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, @@ -1516,7 +1431,6 @@ def assert_frame_equal( check_categorical=True, check_like=False, check_freq=True, - check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="DataFrame", @@ -1578,8 +1492,6 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. - check_flags : bool, default True - Whether to check the `flags` attribute. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -1647,11 +1559,11 @@ def assert_frame_equal( # shape comparison if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}" + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", ) - if check_flags: - assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + if check_like: + left, right = left.reindex_like(right), right # index comparison assert_index_equal( @@ -1661,7 +1573,6 @@ def assert_frame_equal( check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, - check_order=not check_like, rtol=rtol, atol=atol, obj=f"{obj}.index", @@ -1675,15 +1586,11 @@ def assert_frame_equal( check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, - check_order=not check_like, rtol=rtol, atol=atol, obj=f"{obj}.columns", ) - if check_like: - left, right = left.reindex_like(right), right - # compare by blocks if by_blocks: rblocks = right._to_dict_of_blocks() @@ -1779,7 +1686,7 @@ def box_expected(expected, box_cls, transpose=True): elif box_cls is pd.DataFrame: expected = pd.Series(expected).to_frame() if transpose: - # for vector operations, we need a DataFrame to be a single-row, + # for vector operations, we we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame # vectors of the same length. 
expected = expected.T @@ -1877,20 +1784,6 @@ def assert_copy(iter1, iter2, **eql_kwargs): assert elem1 is not elem2, msg -def is_extension_array_dtype_and_needs_i8_conversion(left_dtype, right_dtype) -> bool: - """ - Checks that we have the combination of an ExtensionArraydtype and - a dtype that should be converted to int64 - - Returns - ------- - bool - - Related to issue #37609 - """ - return is_extension_array_dtype(left_dtype) and needs_i8_conversion(right_dtype) - - def getCols(k): return string.ascii_uppercase[:k] @@ -1955,7 +1848,8 @@ def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): def makePeriodIndex(k=10, name=None, **kwargs): dt = datetime(2000, 1, 1) - return pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) + dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) + return dr def makeMultiIndex(k=10, names=None, **kwargs): @@ -2053,7 +1947,8 @@ def index_subclass_makers_generator(): makeCategoricalIndex, makeMultiIndex, ] - yield from make_index_funcs + for make_index_func in make_index_funcs: + yield make_index_func def all_timeseries_index_generator(k=10): @@ -2067,8 +1962,7 @@ def all_timeseries_index_generator(k=10): """ make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] for make_index_func in make_index_funcs: - # pandas\_testing.py:1986: error: Cannot call function of unknown type - yield make_index_func(k=k) # type: ignore[operator] + yield make_index_func(k=k) # make series @@ -2192,18 +2086,17 @@ def makeCustomIndex( names = [names] # specific 1D index type requested? - idx_func = { - "i": makeIntIndex, - "f": makeFloatIndex, - "s": makeStringIndex, - "u": makeUnicodeIndex, - "dt": makeDateIndex, - "td": makeTimedeltaIndex, - "p": makePeriodIndex, - }.get(idx_type) + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) if idx_func: - # pandas\_testing.py:2120: error: Cannot call function of unknown type - idx = idx_func(nentries) # type: ignore[operator] + idx = idx_func(nentries) # but we need to fill in the name if names: idx.name = names[0] @@ -2231,8 +2124,7 @@ def makeCustomIndex( # build a list of lists to create the index from div_factor = nentries // ndupe_l[i] + 1 - # pandas\_testing.py:2148: error: Need type annotation for 'cnt' - cnt = Counter() # type: ignore[var-annotated] + cnt = Counter() for j in range(div_factor): label = f"{prefix}_l{i}_g{j}" cnt[label] = ndupe_l[i] @@ -2390,14 +2282,7 @@ def _create_missing_idx(nrows, ncols, density, random_state=None): def makeMissingDataframe(density=0.9, random_state=None): df = makeDataFrame() - # pandas\_testing.py:2306: error: "_create_missing_idx" gets multiple - # values for keyword argument "density" [misc] - - # pandas\_testing.py:2306: error: "_create_missing_idx" gets multiple - # values for keyword argument "random_state" [misc] - i, j = _create_missing_idx( # type: ignore[misc] - *df.shape, density=density, random_state=random_state - ) + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) df.values[i, j] = np.nan return df @@ -2422,10 +2307,7 @@ def optional_args(decorator): is_decorating = not kwargs and len(args) == 1 and callable(args[0]) if is_decorating: f = args[0] - # pandas\_testing.py:2331: error: Incompatible types in assignment - # (expression has type "List[]", variable has type - # "Tuple[Any, ...]") - args = [] # type: ignore[assignment] + args = [] return dec(f) 
else: return dec @@ -2509,7 +2391,7 @@ def can_connect(url, error_classes=None): @optional_args def network( t, - url="https://www.google.com", + url="http://www.google.com", raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, check_before_test=False, error_classes=None, @@ -2533,7 +2415,7 @@ def network( The test requiring network connectivity. url : path The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'https://www.google.com'. + for connectivity. Defaults to 'http://www.google.com'. raise_on_error : bool If True, never catches errors. check_before_test : bool @@ -2577,7 +2459,7 @@ def network( You can specify alternative URLs:: - >>> @network("https://www.yahoo.com") + >>> @network("http://www.yahoo.com") ... def test_something_with_yahoo(): ... raise IOError("Failure Message") >>> test_something_with_yahoo() @@ -2607,20 +2489,15 @@ def network( @wraps(t) def wrapper(*args, **kwargs): - if ( - check_before_test - and not raise_on_error - and not can_connect(url, error_classes) - ): - skip() + if check_before_test and not raise_on_error: + if not can_connect(url, error_classes): + skip() try: return t(*args, **kwargs) except Exception as err: errno = getattr(err, "errno", None) if not errno and hasattr(errno, "reason"): - # pandas\_testing.py:2521: error: "Exception" has no attribute - # "reason" - errno = getattr(err.reason, "errno", None) # type: ignore[attr-defined] + errno = getattr(err.reason, "errno", None) if errno in skip_errnos: skip(f"Skipping test due to known errno and error {err}") @@ -2648,11 +2525,10 @@ with_connectivity_check = network @contextmanager def assert_produces_warning( - expected_warning: Optional[Union[Type[Warning], bool]] = Warning, + expected_warning=Warning, filter_level="always", - check_stacklevel: bool = True, - raise_on_extra_warnings: bool = True, - match: Optional[str] = None, + check_stacklevel=True, + raise_on_extra_warnings=True, ): """ Context manager for running code expected to either raise a specific @@ -2687,8 +2563,6 @@ def assert_produces_warning( raise_on_extra_warnings : bool, default True Whether extra warnings not of the type `expected_warning` should cause the test to fail. - match : str, optional - Match warning message. Examples -------- @@ -2715,28 +2589,28 @@ def assert_produces_warning( with warnings.catch_warnings(record=True) as w: saw_warning = False - matched_message = False - warnings.simplefilter(filter_level) yield w extra_warnings = [] for actual_warning in w: - if not expected_warning: - continue - - expected_warning = cast(Type[Warning], expected_warning) - if issubclass(actual_warning.category, expected_warning): + if expected_warning and issubclass( + actual_warning.category, expected_warning + ): saw_warning = True if check_stacklevel and issubclass( actual_warning.category, (FutureWarning, DeprecationWarning) ): - _assert_raised_with_correct_stacklevel(actual_warning) - - if match is not None and re.search(match, str(actual_warning.message)): - matched_message = True + from inspect import getframeinfo, stack + caller = getframeinfo(stack()[2][0]) + msg = ( + "Warning not set with correct stacklevel. " + f"File where warning is raised: {actual_warning.filename} != " + f"{caller.filename}. 
Warning message: {actual_warning.message}" + ) + assert actual_warning.filename == caller.filename, msg else: extra_warnings.append( ( @@ -2746,41 +2620,18 @@ def assert_produces_warning( actual_warning.lineno, ) ) - if expected_warning: - expected_warning = cast(Type[Warning], expected_warning) - if not saw_warning: - raise AssertionError( - f"Did not see expected warning of class " - f"{repr(expected_warning.__name__)}" - ) - - if match and not matched_message: - raise AssertionError( - f"Did not see warning {repr(expected_warning.__name__)} " - f"matching {match}" - ) - + msg = ( + f"Did not see expected warning of class " + f"{repr(expected_warning.__name__)}" + ) + assert saw_warning, msg if raise_on_extra_warnings and extra_warnings: raise AssertionError( f"Caused unexpected warning(s): {repr(extra_warnings)}" ) -def _assert_raised_with_correct_stacklevel( - actual_warning: warnings.WarningMessage, -) -> None: - from inspect import getframeinfo, stack - - caller = getframeinfo(stack()[3][0]) - msg = ( - "Warning not set with correct stacklevel. " - f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" - ) - assert actual_warning.filename == caller.filename, msg - - class RNGContext: """ Context manager to set the numpy random number generator speed. Returns @@ -2849,7 +2700,7 @@ def use_numexpr(use, min_elements=None): if min_elements is None: min_elements = expr._MIN_ELEMENTS - olduse = expr.USE_NUMEXPR + olduse = expr._USE_NUMEXPR oldmin = expr._MIN_ELEMENTS expr.set_use_numexpr(use) expr._MIN_ELEMENTS = min_elements @@ -3029,10 +2880,11 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): Expected output of to_csv() in current OS. """ sep = os.linesep - return sep.join(rows_list) + sep + expected = sep.join(rows_list) + sep + return expected -def external_error_raised(expected_exception: Type[Exception]) -> ContextManager: +def external_error_raised(expected_exception: Type[Exception],) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message. 
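For reference, the _testing.py hunks above rework assert_produces_warning, the context manager pandas' own test suite uses to check that a block of code emits (or does not emit) a given warning category. A minimal usage sketch in Python, assuming only the behaviour visible in the hunks (pandas._testing is an internal module, and the exact keyword set differs between the two pandas versions being diffed):

import warnings

import pandas._testing as tm

# The with-block must emit a UserWarning, otherwise an AssertionError is raised.
with tm.assert_produces_warning(UserWarning):
    warnings.warn("something changed", UserWarning)

# check_stacklevel only applies to FutureWarning/DeprecationWarning; for those
# categories the helper additionally asserts the warning is attributed to the
# caller's file, which is what the stacklevel-checking code above implements.
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
    warnings.warn("will be removed in a future version", FutureWarning)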
diff --git a/venv/lib/python3.8/site-packages/pandas/_typing.py b/venv/lib/python3.8/site-packages/pandas/_typing.py index 09c490e..76ec527 100644 --- a/venv/lib/python3.8/site-packages/pandas/_typing.py +++ b/venv/lib/python3.8/site-packages/pandas/_typing.py @@ -1,7 +1,5 @@ from datetime import datetime, timedelta, tzinfo -from io import BufferedIOBase, RawIOBase, TextIOBase, TextIOWrapper -from mmap import mmap -from os import PathLike +from pathlib import Path from typing import ( IO, TYPE_CHECKING, @@ -14,8 +12,6 @@ from typing import ( List, Mapping, Optional, - Sequence, - Tuple, Type, TypeVar, Union, @@ -27,27 +23,16 @@ import numpy as np # and use a string literal forward reference to it in subsequent types # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: - from typing import final + from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 - from pandas._libs import Period, Timedelta, Timestamp + from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 - from pandas.core.dtypes.dtypes import ExtensionDtype - - from pandas import Interval + from pandas import Interval # noqa: F401 from pandas.core.arrays.base import ExtensionArray # noqa: F401 - from pandas.core.frame import DataFrame + from pandas.core.frame import DataFrame # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 - from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy - from pandas.core.indexes.base import Index - from pandas.core.resample import Resampler - from pandas.core.series import Series - from pandas.core.window.rolling import BaseWindow - - from pandas.io.formats.format import EngFormatter -else: - # typing.final does not exist until py38 - final = lambda x: x - + from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 # array-like @@ -74,9 +59,10 @@ Timezone = Union[str, tzinfo] # other Dtype = Union[ - "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] +FilePathOrBuffer = Union[str, Path, IO[AnyStr]] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -92,9 +78,7 @@ FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") Axis = Union[str, int] Label = Optional[Hashable] -IndexLabel = Union[Label, Sequence[Label]] Level = Union[Label, int] -Shape = Tuple[int, ...] 
Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] Axes = Collection @@ -117,34 +101,8 @@ IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] # types of `func` kwarg for DataFrame.aggregate and Series.aggregate AggFuncTypeBase = Union[Callable, str] -AggFuncTypeDict = Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] AggFuncType = Union[ AggFuncTypeBase, List[AggFuncTypeBase], - AggFuncTypeDict, + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], ] -AggObjType = Union[ - "Series", - "DataFrame", - "SeriesGroupBy", - "DataFrameGroupBy", - "BaseWindow", - "Resampler", -] - -# filenames and file-like-objects -Buffer = Union[IO[AnyStr], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap] -FileOrBuffer = Union[str, Buffer[T]] -FilePathOrBuffer = Union["PathLike[str]", FileOrBuffer[T]] - -# for arbitrary kwargs passed during reading/writing files -StorageOptions = Optional[Dict[str, Any]] - - -# compression keywords and compression -CompressionDict = Dict[str, Any] -CompressionOptions = Optional[Union[str, CompressionDict]] - - -# type of float formatter in DataFrameFormatter -FloatFormatType = Union[str, Callable, "EngFormatter"] diff --git a/venv/lib/python3.8/site-packages/pandas/_version.py b/venv/lib/python3.8/site-packages/pandas/_version.py index a49e58d..5922b44 100644 --- a/venv/lib/python3.8/site-packages/pandas/_version.py +++ b/venv/lib/python3.8/site-packages/pandas/_version.py @@ -1,18 +1,20 @@ -# This file was generated by 'versioneer.py' (0.19) from +# This file was generated by 'versioneer.py' (0.15) from # revision-control system data, or from the parent directory name of an # unpacked source archive. Distribution tarballs contain a pre-generated copy # of this file. -import json +from warnings import catch_warnings +with catch_warnings(record=True): + import json +import sys version_json = ''' { - "date": "2020-12-26T13:47:00+0000", "dirty": false, "error": null, - "full-revisionid": "3e89b4c4b1580aa890023fc550774e63d499da25", - "version": "1.2.0" + "full-revisionid": "b5958ee1999e9aead1938c0bba2b674378807b3d", + "version": "1.1.5" } ''' # END VERSION_JSON diff --git a/venv/lib/python3.8/site-packages/pandas/api/types/__init__.py b/venv/lib/python3.8/site-packages/pandas/api/types/__init__.py index fb1abdd..3495b49 100644 --- a/venv/lib/python3.8/site-packages/pandas/api/types/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/api/types/__init__.py @@ -4,7 +4,7 @@ Public toolkit API. 
from pandas._libs.lib import infer_dtype -from pandas.core.dtypes.api import * # noqa: F401, F403 +from pandas.core.dtypes.api import * # noqa: F403, F401 from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ( CategoricalDtype, diff --git a/venv/lib/python3.8/site-packages/pandas/arrays/__init__.py b/venv/lib/python3.8/site-packages/pandas/arrays/__init__.py index 0fa070b..61832a8 100644 --- a/venv/lib/python3.8/site-packages/pandas/arrays/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/arrays/__init__.py @@ -7,7 +7,6 @@ from pandas.core.arrays import ( BooleanArray, Categorical, DatetimeArray, - FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -21,7 +20,6 @@ __all__ = [ "BooleanArray", "Categorical", "DatetimeArray", - "FloatingArray", "IntegerArray", "IntervalArray", "PandasArray", diff --git a/venv/lib/python3.8/site-packages/pandas/compat/__init__.py b/venv/lib/python3.8/site-packages/pandas/compat/__init__.py index 2ac9b9e..b5a1dc2 100644 --- a/venv/lib/python3.8/site-packages/pandas/compat/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/compat/__init__.py @@ -8,17 +8,27 @@ Other items: * platform checker """ import platform +import struct import sys import warnings from pandas._typing import F +PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) PYPY = platform.python_implementation() == "PyPy" IS64 = sys.maxsize > 2 ** 32 +# ---------------------------------------------------------------------------- +# functions largely based / taken from the six module + +# Much of the code in this module comes from Benjamin Peterson's six library. +# The license for this library can be found in LICENSES/SIX and the code can be +# found at https://bitbucket.org/gutworth/six + + def set_function_name(f: F, name: str, cls) -> F: """ Bind the name/qualname attributes of the function. @@ -29,6 +39,7 @@ def set_function_name(f: F, name: str, cls) -> F: return f +# https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian() -> bool: """ Checking if the running platform is little endian. @@ -50,7 +61,7 @@ def is_platform_windows() -> bool: bool True if the running platform is windows. """ - return sys.platform in ["win32", "cygwin"] + return sys.platform == "win32" or sys.platform == "cygwin" def is_platform_linux() -> bool: @@ -62,7 +73,7 @@ def is_platform_linux() -> bool: bool True if the running platform is linux. """ - return sys.platform == "linux" + return sys.platform == "linux2" def is_platform_mac() -> bool: @@ -77,7 +88,19 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" -def import_lzma(): +def is_platform_32bit() -> bool: + """ + Checking if the running platform is 32-bit. + + Returns + ------- + bool + True if the running platform is 32-bit. + """ + return struct.calcsize("P") * 8 < 64 + + +def _import_lzma(): """ Importing the `lzma` module. @@ -97,7 +120,7 @@ def import_lzma(): warnings.warn(msg) -def get_lzma_file(lzma): +def _get_lzma_file(lzma): """ Importing the `LZMAFile` class from the `lzma` module. 
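The compat/__init__.py hunks above rename the lzma helpers back to their underscore-prefixed 1.1.x spellings (_import_lzma / _get_lzma_file), matching the call sites changed in _testing.py earlier in this diff. A condensed sketch of the guarded-import pattern those helpers implement, paraphrased rather than copied from the patch (the message wording here is illustrative):

import warnings


def _import_lzma():
    # lzma is in the stdlib but can be missing if Python was built without
    # the xz headers; degrade to a warning instead of failing at import time.
    try:
        import lzma
        return lzma
    except ImportError:
        warnings.warn(
            "Could not import the lzma module; xz compression will be unavailable."
        )


def _get_lzma_file(lzma):
    # Fail loudly only at the point where xz compression is actually requested.
    if lzma is None:
        raise RuntimeError("lzma module not available in this Python build")
    return lzma.LZMAFile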
diff --git a/venv/lib/python3.8/site-packages/pandas/compat/_optional.py b/venv/lib/python3.8/site-packages/pandas/compat/_optional.py index 533e67a..9a5e54d 100644 --- a/venv/lib/python3.8/site-packages/pandas/compat/_optional.py +++ b/venv/lib/python3.8/site-packages/pandas/compat/_optional.py @@ -11,24 +11,25 @@ VERSIONS = { "fsspec": "0.7.4", "fastparquet": "0.3.2", "gcsfs": "0.6.0", - "lxml.etree": "4.3.0", - "matplotlib": "2.2.3", - "numexpr": "2.6.8", + "lxml.etree": "3.8.0", + "matplotlib": "2.2.2", + "numexpr": "2.6.2", "odfpy": "1.3.0", "openpyxl": "2.5.7", "pandas_gbq": "0.12.0", - "pyarrow": "0.15.0", + "pyarrow": "0.13.0", + "pytables": "3.4.3", "pytest": "5.0.1", "pyxlsb": "1.0.6", "s3fs": "0.4.0", "scipy": "1.2.0", - "sqlalchemy": "1.2.8", - "tables": "3.5.1", + "sqlalchemy": "1.1.4", + "tables": "3.4.3", "tabulate": "0.8.3", - "xarray": "0.12.3", - "xlrd": "1.2.0", - "xlwt": "1.3.0", - "xlsxwriter": "1.0.2", + "xarray": "0.8.2", + "xlrd": "1.1.0", + "xlwt": "1.2.0", + "xlsxwriter": "0.9.8", "numba": "0.46.0", } diff --git a/venv/lib/python3.8/site-packages/pandas/compat/numpy/__init__.py b/venv/lib/python3.8/site-packages/pandas/compat/numpy/__init__.py index a2444b7..789a466 100644 --- a/venv/lib/python3.8/site-packages/pandas/compat/numpy/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/compat/numpy/__init__.py @@ -8,19 +8,19 @@ import numpy as np # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -np_version_under1p17 = _nlv < LooseVersion("1.17") -np_version_under1p18 = _nlv < LooseVersion("1.18") +_np_version_under1p16 = _nlv < LooseVersion("1.16") +_np_version_under1p17 = _nlv < LooseVersion("1.17") +_np_version_under1p18 = _nlv < LooseVersion("1.18") _np_version_under1p19 = _nlv < LooseVersion("1.19") _np_version_under1p20 = _nlv < LooseVersion("1.20") -is_numpy_dev = ".dev" in str(_nlv) -_min_numpy_ver = "1.16.5" +_is_numpy_dev = ".dev" in str(_nlv) -if _nlv < _min_numpy_ver: +if _nlv < "1.15.4": raise ImportError( - f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n" + "this version of pandas is incompatible with numpy < 1.15.4\n" f"your numpy version is {_np_version}.\n" - f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version" + "Please upgrade numpy to >= 1.15.4 to use this pandas version" ) @@ -65,6 +65,7 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = [ "np", "_np_version", - "np_version_under1p17", - "is_numpy_dev", + "_np_version_under1p16", + "_np_version_under1p17", + "_is_numpy_dev", ] diff --git a/venv/lib/python3.8/site-packages/pandas/compat/numpy/function.py b/venv/lib/python3.8/site-packages/pandas/compat/numpy/function.py index c47c31f..d7a14c2 100644 --- a/venv/lib/python3.8/site-packages/pandas/compat/numpy/function.py +++ b/venv/lib/python3.8/site-packages/pandas/compat/numpy/function.py @@ -1,24 +1,27 @@ """ -For compatibility with numpy libraries, pandas functions or methods have to -accept '*args' and '**kwargs' parameters to accommodate numpy arguments that -are not actually used or respected in the pandas implementation. +For compatibility with numpy libraries, pandas functions or +methods have to accept '*args' and '**kwargs' parameters to +accommodate numpy arguments that are not actually used or +respected in the pandas implementation. -To ensure that users do not abuse these parameters, validation is performed in -'validators.py' to make sure that any extra parameters passed correspond ONLY -to those in the numpy signature. 
Part of that validation includes whether or -not the user attempted to pass in non-default values for these extraneous -parameters. As we want to discourage users from relying on these parameters -when calling the pandas implementation, we want them only to pass in the -default values for these parameters. +To ensure that users do not abuse these parameters, validation +is performed in 'validators.py' to make sure that any extra +parameters passed correspond ONLY to those in the numpy signature. +Part of that validation includes whether or not the user attempted +to pass in non-default values for these extraneous parameters. As we +want to discourage users from relying on these parameters when calling +the pandas implementation, we want them only to pass in the default values +for these parameters. -This module provides a set of commonly used default arguments for functions and -methods that are spread throughout the codebase. This module will make it +This module provides a set of commonly used default arguments for functions +and methods that are spread throughout the codebase. This module will make it easier to adjust to future upstream changes in the analogous numpy signatures. """ +from collections import OrderedDict from distutils.version import LooseVersion from typing import Any, Dict, Optional, Union -from numpy import __version__, ndarray +from numpy import __version__ as _np_version, ndarray from pandas._libs.lib import is_bool, is_integer from pandas.errors import UnsupportedFunctionCall @@ -71,7 +74,7 @@ class CompatValidator: raise ValueError(f"invalid validation method '{method}'") -ARGMINMAX_DEFAULTS = {"out": None} +ARGMINMAX_DEFAULTS = dict(out=None) validate_argmin = CompatValidator( ARGMINMAX_DEFAULTS, fname="argmin", method="both", max_fname_arg_count=1 ) @@ -90,10 +93,11 @@ def process_skipna(skipna, args): def validate_argmin_with_skipna(skipna, args, kwargs): """ - If 'Series.argmin' is called via the 'numpy' library, the third parameter - in its signature is 'out', which takes either an ndarray or 'None', so - check if the 'skipna' parameter is either an instance of ndarray or is - None, since 'skipna' itself should be a boolean + If 'Series.argmin' is called via the 'numpy' library, + the third parameter in its signature is 'out', which + takes either an ndarray or 'None', so check if the + 'skipna' parameter is either an instance of ndarray or + is None, since 'skipna' itself should be a boolean """ skipna, args = process_skipna(skipna, args) validate_argmin(args, kwargs) @@ -102,22 +106,23 @@ def validate_argmin_with_skipna(skipna, args, kwargs): def validate_argmax_with_skipna(skipna, args, kwargs): """ - If 'Series.argmax' is called via the 'numpy' library, the third parameter - in its signature is 'out', which takes either an ndarray or 'None', so - check if the 'skipna' parameter is either an instance of ndarray or is - None, since 'skipna' itself should be a boolean + If 'Series.argmax' is called via the 'numpy' library, + the third parameter in its signature is 'out', which + takes either an ndarray or 'None', so check if the + 'skipna' parameter is either an instance of ndarray or + is None, since 'skipna' itself should be a boolean """ skipna, args = process_skipna(skipna, args) validate_argmax(args, kwargs) return skipna -ARGSORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} +ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] 
= None -if LooseVersion(__version__) >= LooseVersion("1.17.0"): +if LooseVersion(_np_version) >= LooseVersion("1.17.0"): # GH-26361. NumPy added radix sort and changed default to None. ARGSORT_DEFAULTS["kind"] = None @@ -126,9 +131,9 @@ validate_argsort = CompatValidator( ARGSORT_DEFAULTS, fname="argsort", max_fname_arg_count=0, method="both" ) -# two different signatures of argsort, this second validation for when the -# `kind` param is supported -ARGSORT_DEFAULTS_KIND: Dict[str, Optional[int]] = {} +# two different signatures of argsort, this second validation +# for when the `kind` param is supported +ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -138,10 +143,11 @@ validate_argsort_kind = CompatValidator( def validate_argsort_with_ascending(ascending, args, kwargs): """ - If 'Categorical.argsort' is called via the 'numpy' library, the first - parameter in its signature is 'axis', which takes either an integer or - 'None', so check if the 'ascending' parameter has either integer type or is - None, since 'ascending' itself should be a boolean + If 'Categorical.argsort' is called via the 'numpy' library, the + first parameter in its signature is 'axis', which takes either + an integer or 'None', so check if the 'ascending' parameter has + either integer type or is None, since 'ascending' itself should + be a boolean """ if is_integer(ascending) or ascending is None: args = (ascending,) + args @@ -151,7 +157,7 @@ def validate_argsort_with_ascending(ascending, args, kwargs): return ascending -CLIP_DEFAULTS: Dict[str, Any] = {"out": None} +CLIP_DEFAULTS: Dict[str, Any] = dict(out=None) validate_clip = CompatValidator( CLIP_DEFAULTS, fname="clip", method="both", max_fname_arg_count=3 ) @@ -159,10 +165,10 @@ validate_clip = CompatValidator( def validate_clip_with_axis(axis, args, kwargs): """ - If 'NDFrame.clip' is called via the numpy library, the third parameter in - its signature is 'out', which can takes an ndarray, so check if the 'axis' - parameter is an instance of ndarray, since 'axis' itself should either be - an integer or None + If 'NDFrame.clip' is called via the numpy library, the third + parameter in its signature is 'out', which can takes an ndarray, + so check if the 'axis' parameter is an instance of ndarray, since + 'axis' itself should either be an integer or None """ if isinstance(axis, ndarray): args = (axis,) + args @@ -172,7 +178,7 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -CUM_FUNC_DEFAULTS: Dict[str, Any] = {} +CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -185,9 +191,10 @@ validate_cumsum = CompatValidator( def validate_cum_func_with_skipna(skipna, args, kwargs, name): """ - If this function is called via the 'numpy' library, the third parameter in - its signature is 'dtype', which takes either a 'numpy' dtype or 'None', so - check if the 'skipna' parameter is a boolean or not + If this function is called via the 'numpy' library, the third + parameter in its signature is 'dtype', which takes either a + 'numpy' dtype or 'None', so check if the 'skipna' parameter is + a boolean or not """ if not is_bool(skipna): args = (skipna,) + args @@ -197,7 +204,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS: Dict[str, Optional[bool]] = {} 
+ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() ALLANY_DEFAULTS["dtype"] = None ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -208,10 +215,10 @@ validate_any = CompatValidator( ALLANY_DEFAULTS, fname="any", method="both", max_fname_arg_count=1 ) -LOGICAL_FUNC_DEFAULTS = {"out": None, "keepdims": False} +LOGICAL_FUNC_DEFAULTS = dict(out=None, keepdims=False) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method="kwargs") -MINMAX_DEFAULTS = {"axis": None, "out": None, "keepdims": False} +MINMAX_DEFAULTS = dict(axis=None, out=None, keepdims=False) validate_min = CompatValidator( MINMAX_DEFAULTS, fname="min", method="both", max_fname_arg_count=1 ) @@ -219,28 +226,28 @@ validate_max = CompatValidator( MINMAX_DEFAULTS, fname="max", method="both", max_fname_arg_count=1 ) -RESHAPE_DEFAULTS: Dict[str, str] = {"order": "C"} +RESHAPE_DEFAULTS: Dict[str, str] = dict(order="C") validate_reshape = CompatValidator( RESHAPE_DEFAULTS, fname="reshape", method="both", max_fname_arg_count=1 ) -REPEAT_DEFAULTS: Dict[str, Any] = {"axis": None} +REPEAT_DEFAULTS: Dict[str, Any] = dict(axis=None) validate_repeat = CompatValidator( REPEAT_DEFAULTS, fname="repeat", method="both", max_fname_arg_count=1 ) -ROUND_DEFAULTS: Dict[str, Any] = {"out": None} +ROUND_DEFAULTS: Dict[str, Any] = dict(out=None) validate_round = CompatValidator( ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} +SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS: Dict[str, Optional[Any]] = {} +STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = OrderedDict() STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -274,13 +281,13 @@ validate_median = CompatValidator( MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS: Dict[str, Optional[bool]] = {} +STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS: Dict[str, Optional[str]] = {} +TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") @@ -288,9 +295,10 @@ validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") def validate_take_with_convert(convert, args, kwargs): """ - If this function is called via the 'numpy' library, the third parameter in - its signature is 'axis', which takes either an ndarray or 'None', so check - if the 'convert' parameter is either an instance of ndarray or is None + If this function is called via the 'numpy' library, the third + parameter in its signature is 'axis', which takes either an + ndarray or 'None', so check if the 'convert' parameter is either + an instance of ndarray or is None """ if isinstance(convert, ndarray) or convert is None: args = (convert,) + args @@ -300,7 +308,7 @@ def validate_take_with_convert(convert, args, kwargs): return convert -TRANSPOSE_DEFAULTS = {"axes": None} +TRANSPOSE_DEFAULTS = dict(axes=None) validate_transpose = 
CompatValidator( TRANSPOSE_DEFAULTS, fname="transpose", method="both", max_fname_arg_count=0 ) @@ -353,9 +361,10 @@ def validate_expanding_func(name, args, kwargs) -> None: def validate_groupby_func(name, args, kwargs, allowed=None) -> None: """ - 'args' and 'kwargs' should be empty, except for allowed kwargs because all - of their necessary parameters are explicitly listed in the function - signature + 'args' and 'kwargs' should be empty, except for allowed + kwargs because all of + their necessary parameters are explicitly listed in + the function signature """ if allowed is None: allowed = [] @@ -374,8 +383,9 @@ RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") def validate_resampler_func(method: str, args, kwargs) -> None: """ - 'args' and 'kwargs' should be empty because all of their necessary - parameters are explicitly listed in the function signature + 'args' and 'kwargs' should be empty because all of + their necessary parameters are explicitly listed in + the function signature """ if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: @@ -387,20 +397,20 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: +def validate_minmax_axis(axis: Optional[int]) -> None: """ - Ensure that the axis argument passed to min, max, argmin, or argmax is zero - or None, as otherwise it will be incorrectly ignored. + Ensure that the axis argument passed to min, max, argmin, or argmax is + zero or None, as otherwise it will be incorrectly ignored. Parameters ---------- axis : int or None - ndim : int, default 1 Raises ------ ValueError """ + ndim = 1 # hard-coded for Index if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): diff --git a/venv/lib/python3.8/site-packages/pandas/compat/pickle_compat.py b/venv/lib/python3.8/site-packages/pandas/compat/pickle_compat.py index 80ee1f2..015b203 100644 --- a/venv/lib/python3.8/site-packages/pandas/compat/pickle_compat.py +++ b/venv/lib/python3.8/site-packages/pandas/compat/pickle_compat.py @@ -64,7 +64,7 @@ class _LoadSparseSeries: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "Series", but must return # a subtype of "_LoadSparseSeries") - def __new__(cls) -> "Series": # type: ignore[misc] + def __new__(cls) -> "Series": # type: ignore from pandas import Series warnings.warn( @@ -82,7 +82,7 @@ class _LoadSparseFrame: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "DataFrame", but must # return a subtype of "_LoadSparseFrame") - def __new__(cls) -> "DataFrame": # type: ignore[misc] + def __new__(cls) -> "DataFrame": # type: ignore from pandas import DataFrame warnings.warn( @@ -181,7 +181,7 @@ _class_locations_map = { # functions for compat and uses a non-public class of the pickle module. 
# error: Name 'pkl._Unpickler' is not defined -class Unpickler(pkl._Unpickler): # type: ignore[name-defined] +class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass key = (module, name) @@ -274,7 +274,7 @@ def patch_pickle(): """ orig_loads = pkl.loads try: - setattr(pkl, "loads", loads) + pkl.loads = loads yield finally: - setattr(pkl, "loads", orig_loads) + pkl.loads = orig_loads diff --git a/venv/lib/python3.8/site-packages/pandas/conftest.py b/venv/lib/python3.8/site-packages/pandas/conftest.py index d84a72d..74cab2e 100644 --- a/venv/lib/python3.8/site-packages/pandas/conftest.py +++ b/venv/lib/python3.8/site-packages/pandas/conftest.py @@ -33,10 +33,8 @@ from pytz import FixedOffset, utc import pandas.util._test_decorators as td -from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype - import pandas as pd -from pandas import DataFrame, Interval, Period, Series, Timedelta, Timestamp +from pandas import DataFrame import pandas._testing as tm from pandas.core import ops from pandas.core.indexes.api import Index, MultiIndex @@ -57,9 +55,6 @@ def pytest_configure(config): ) config.addinivalue_line("markers", "high_memory: mark a test as a high-memory only") config.addinivalue_line("markers", "clipboard: mark a pd.read_clipboard test") - config.addinivalue_line( - "markers", "arm_slow: mark a test as slow for arm64 architecture" - ) def pytest_addoption(parser): @@ -176,6 +171,14 @@ def axis(request): axis_frame = axis +@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") +def axis_series(request): + """ + Fixture for returning the axis numbers of a Series. + """ + return request.param + + @pytest.fixture(params=[True, False, None]) def observed(request): """ @@ -266,7 +269,7 @@ def nselect_method(request): # ---------------------------------------------------------------- # Missing values & co. # ---------------------------------------------------------------- -@pytest.fixture(params=tm.NULL_OBJECTS, ids=str) +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), pd.NA], ids=str) def nulls_fixture(request): """ Fixture for each null type in pandas. @@ -288,22 +291,11 @@ def unique_nulls_fixture(request): # Generate cartesian product of unique_nulls_fixture: unique_nulls_fixture2 = unique_nulls_fixture + # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- - - -@pytest.fixture(params=[pd.DataFrame, pd.Series]) -def frame_or_series(request): - """ - Fixture to parametrize over DataFrame and Series. 
- """ - return request.param - - -@pytest.fixture( - params=[pd.Index, pd.Series], ids=["index", "series"] # type: ignore[list-item] -) +@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) def index_or_series(request): """ Fixture to parametrize over Index and Series, made necessary by a mypy @@ -320,16 +312,6 @@ def index_or_series(request): index_or_series2 = index_or_series -@pytest.fixture( - params=[pd.Index, pd.Series, pd.array], ids=["index", "series", "array"] -) -def index_or_series_or_array(request): - """ - Fixture to parametrize over Index, Series, and ExtensionArray - """ - return request.param - - @pytest.fixture def dict_subclass(): """ @@ -377,24 +359,11 @@ def multiindex_year_month_day_dataframe_random_data(): tdf = tm.makeTimeDataFrame(100) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work - ymd.index = ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels]) + ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) ymd.index.set_names(["year", "month", "day"], inplace=True) return ymd -@pytest.fixture -def multiindex_dataframe_random_data(): - """DataFrame with 2 level MultiIndex with random data""" - index = MultiIndex( - levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=["first", "second"], - ) - return DataFrame( - np.random.randn(10, 3), index=index, columns=Index(["A", "B", "C"], name="exp") - ) - - def _create_multiindex(): """ MultiIndex used to test the general functionality of this object @@ -407,12 +376,13 @@ def _create_multiindex(): major_codes = np.array([0, 0, 1, 2, 3, 3]) minor_codes = np.array([0, 1, 0, 1, 0, 1]) index_names = ["first", "second"] - return MultiIndex( + mi = MultiIndex( levels=[major_axis, minor_axis], codes=[major_codes, minor_codes], names=index_names, verify_integrity=False, ) + return mi def _create_mi_with_dt64tz_level(): @@ -467,29 +437,6 @@ def index(request): index_fixture2 = index -@pytest.fixture(params=indices_dict.keys()) -def index_with_missing(request): - """ - Fixture for indices with missing values - """ - if request.param in ["int", "uint", "range", "empty", "repeats"]: - pytest.xfail("missing values not supported") - # GH 35538. Use deep copy to avoid illusive bug on np-dev - # Azure pipeline that writes into indices_dict despite copy - ind = indices_dict[request.param].copy(deep=True) - vals = ind.values - if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: - # For setting missing values in the top level of MultiIndex - vals = ind.tolist() - vals[0] = (None,) + vals[0][1:] - vals[-1] = (None,) + vals[-1][1:] - return MultiIndex.from_tuples(vals) - else: - vals[0] = None - vals[-1] = None - return type(ind)(vals) - - # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- @@ -549,23 +496,6 @@ def series_with_simple_index(index): return _create_series(index) -@pytest.fixture -def series_with_multilevel_index(): - """ - Fixture with a Series with a 2-level MultiIndex. 
- """ - arrays = [ - ["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - tuples = zip(*arrays) - index = MultiIndex.from_tuples(tuples) - data = np.random.randn(8) - ser = Series(data, index=index) - ser[3] = np.NaN - return ser - - _narrow_dtypes = [ np.float16, np.float32, @@ -698,26 +628,6 @@ def float_frame(): return DataFrame(tm.getSeriesData()) -# ---------------------------------------------------------------- -# Scalars -# ---------------------------------------------------------------- -@pytest.fixture( - params=[ - (Interval(left=0, right=5), IntervalDtype("int64")), - (Interval(left=0.1, right=0.5), IntervalDtype("float64")), - (Period("2012-01", freq="M"), "period[M]"), - (Period("2012-02-01", freq="D"), "period[D]"), - ( - Timestamp("2011-01-01", tz="US/Eastern"), - DatetimeTZDtype(tz="US/Eastern"), - ), - (Timedelta(seconds=500), "timedelta64[ns]"), - ] -) -def ea_scalar_and_dtype(request): - return request.param - - # ---------------------------------------------------------------- # Operators & Operations # ---------------------------------------------------------------- @@ -747,43 +657,6 @@ def all_arithmetic_operators(request): return request.param -@pytest.fixture( - params=[ - operator.add, - ops.radd, - operator.sub, - ops.rsub, - operator.mul, - ops.rmul, - operator.truediv, - ops.rtruediv, - operator.floordiv, - ops.rfloordiv, - operator.mod, - ops.rmod, - operator.pow, - ops.rpow, - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, - operator.and_, - ops.rand_, - operator.xor, - ops.rxor, - operator.or_, - ops.ror_, - ] -) -def all_binary_operators(request): - """ - Fixture for operator and roperator arithmetic, comparison, and logical ops. - """ - return request.param - - @pytest.fixture( params=[ operator.add, @@ -964,10 +837,6 @@ TIMEZONES = [ "Asia/Tokyo", "dateutil/US/Pacific", "dateutil/Asia/Singapore", - "+01:15", - "-02:15", - "UTC+01:15", - "UTC-02:15", tzutc(), tzlocal(), FixedOffset(300), @@ -1089,31 +958,6 @@ def float_dtype(request): return request.param -@pytest.fixture(params=tm.FLOAT_EA_DTYPES) -def float_ea_dtype(request): - """ - Parameterized fixture for float dtypes. - - * 'Float32' - * 'Float64' - """ - return request.param - - -@pytest.fixture(params=tm.FLOAT_DTYPES + tm.FLOAT_EA_DTYPES) -def any_float_allowed_nullable_dtype(request): - """ - Parameterized fixture for float dtypes. - - * float - * 'float32' - * 'float64' - * 'Float32' - * 'Float64' - """ - return request.param - - @pytest.fixture(params=tm.COMPLEX_DTYPES) def complex_dtype(request): """ @@ -1188,26 +1032,6 @@ def any_nullable_int_dtype(request): return request.param -@pytest.fixture(params=tm.ALL_EA_INT_DTYPES + tm.FLOAT_EA_DTYPES) -def any_numeric_dtype(request): - """ - Parameterized fixture for any nullable integer dtype and - any float ea dtypes. 
- - * 'UInt8' - * 'Int8' - * 'UInt16' - * 'Int16' - * 'UInt32' - * 'Int32' - * 'UInt64' - * 'Int64' - * 'Float32' - * 'Float64' - """ - return request.param - - @pytest.fixture(params=tm.SIGNED_EA_INT_DTYPES) def any_signed_nullable_int_dtype(request): """ @@ -1370,13 +1194,7 @@ def ip(): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.interactiveshell import InteractiveShell - # GH#35711 make sure sqlite history file handle is not leaked - from traitlets.config import Config # isort:skip - - c = Config() - c.HistoryManager.hist_file = ":memory:" - - return InteractiveShell(config=c) + return InteractiveShell() @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) @@ -1389,6 +1207,15 @@ def spmatrix(request): return getattr(sparse, request.param + "_matrix") +@pytest.fixture(params=list(tm.cython_table)) +def cython_table_items(request): + """ + Yields a tuple of a function and its corresponding name. Correspond to + the list of aggregator "Cython functions" used on selected table items. + """ + return request.param + + @pytest.fixture( params=[ getattr(pd.offsets, o) @@ -1410,39 +1237,3 @@ def sort_by_key(request): Tests None (no key) and the identity key. """ return request.param - - -@pytest.fixture() -def fsspectest(): - pytest.importorskip("fsspec") - from fsspec import register_implementation - from fsspec.implementations.memory import MemoryFileSystem - from fsspec.registry import _registry as registry - - class TestMemoryFS(MemoryFileSystem): - protocol = "testmem" - test = [None] - - def __init__(self, **kwargs): - self.test[0] = kwargs.pop("test", None) - super().__init__(**kwargs) - - register_implementation("testmem", TestMemoryFS, clobber=True) - yield TestMemoryFS() - registry.pop("testmem", None) - TestMemoryFS.test[0] = None - TestMemoryFS.store.clear() - - -@pytest.fixture( - params=[ - ("foo", None, None), - ("Egon", "Venkman", None), - ("NCC1701D", "NCC1701D", "NCC1701D"), - ] -) -def names(request): - """ - A 3-tuple of names, the first two for operands, the last for a result. - """ - return request.param diff --git a/venv/lib/python3.8/site-packages/pandas/core/accessor.py b/venv/lib/python3.8/site-packages/pandas/core/accessor.py index 15c2a4a..2caf1f7 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/accessor.py +++ b/venv/lib/python3.8/site-packages/pandas/core/accessor.py @@ -4,7 +4,7 @@ accessor.py contains base classes for implementing accessor properties that can be mixed into or pinned onto other pandas classes. """ -from typing import FrozenSet, List, Set +from typing import FrozenSet, Set import warnings from pandas.util._decorators import doc @@ -12,21 +12,28 @@ from pandas.util._decorators import doc class DirNamesMixin: _accessors: Set[str] = set() - _hidden_attrs: FrozenSet[str] = frozenset() + _deprecations: FrozenSet[str] = frozenset() - def _dir_deletions(self) -> Set[str]: + def _dir_deletions(self): """ Delete unwanted __dir__ for this object. """ - return self._accessors | self._hidden_attrs + return self._accessors | self._deprecations - def _dir_additions(self) -> Set[str]: + def _dir_additions(self): """ Add additional __dir__ for this object. """ - return {accessor for accessor in self._accessors if hasattr(self, accessor)} + rv = set() + for accessor in self._accessors: + try: + getattr(self, accessor) + rv.add(accessor) + except AttributeError: + pass + return rv - def __dir__(self) -> List[str]: + def __dir__(self): """ Provide method name lookup and completion. 
@@ -34,7 +41,7 @@ class DirNamesMixin: ----- Only provide 'public' methods. """ - rv = set(super().__dir__()) + rv = set(dir(type(self))) rv = (rv - self._dir_deletions()) | self._dir_additions() return sorted(rv) diff --git a/venv/lib/python3.8/site-packages/pandas/core/aggregation.py b/venv/lib/python3.8/site-packages/pandas/core/aggregation.py index c64f0bd..73e470e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/aggregation.py +++ b/venv/lib/python3.8/site-packages/pandas/core/aggregation.py @@ -6,46 +6,32 @@ kwarg aggregations in groupby and DataFrame/Series aggregation from collections import defaultdict from functools import partial from typing import ( - TYPE_CHECKING, Any, Callable, DefaultDict, Dict, - Iterable, List, Optional, Sequence, Tuple, Union, - cast, ) -from pandas._typing import ( - AggFuncType, - AggFuncTypeBase, - AggFuncTypeDict, - AggObjType, - Axis, - FrameOrSeries, - FrameOrSeriesUnion, - Label, -) +from pandas._typing import AggFuncType, Label -from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import is_dict_like, is_list_like -from pandas.core.dtypes.generic import ABCDataFrame, ABCNDFrame, ABCSeries -from pandas.core.base import DataError, SpecificationError +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index - -if TYPE_CHECKING: - from pandas.core.series import Series +from pandas.core.series import FrameOrSeriesUnion, Series def reconstruct_func( - func: Optional[AggFuncType], **kwargs -) -> Tuple[bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]]]: + func: Optional[AggFuncType], **kwargs, +) -> Tuple[ + bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], +]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. @@ -291,13 +277,12 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: def relabel_result( - result: FrameOrSeries, + result: FrameOrSeriesUnion, func: Dict[str, List[Union[Callable, str]]], - columns: Iterable[Label], - order: Iterable[int], -) -> Dict[Label, "Series"]: - """ - Internal function to reorder result if relabelling is True for + columns: Tuple, + order: List[int], +) -> Dict[Label, Series]: + """Internal function to reorder result if relabelling is True for dataframe.agg, and return the reordered result in dict. Parameters: @@ -322,10 +307,10 @@ def relabel_result( reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] - reordered_result_in_dict: Dict[Label, "Series"] = {} + reordered_result_in_dict: Dict[Label, Series] = {} idx = 0 - reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1 + reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 for col, fun in func.items(): s = result[col].dropna() @@ -388,7 +373,7 @@ def validate_func_kwargs( (['one', 'two'], ['min', 'max']) """ no_arg_message = "Must provide 'func' or named aggregation **kwargs." - tuple_given_message = "func is expected but received {} in **kwargs." + tuple_given_message = "func is expected but recieved {} in **kwargs." 
columns = list(kwargs) func = [] for col_func in kwargs.values(): @@ -398,390 +383,3 @@ def validate_func_kwargs( if not columns: raise TypeError(no_arg_message) return columns, func - - -def transform( - obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs -) -> FrameOrSeriesUnion: - """ - Transform a DataFrame or Series - - Parameters - ---------- - obj : DataFrame or Series - Object to compute the transform on. - func : string, function, list, or dictionary - Function(s) to compute the transform with. - axis : {0 or 'index', 1 or 'columns'} - Axis along which the function is applied: - - * 0 or 'index': apply function to each column. - * 1 or 'columns': apply function to each row. - - Returns - ------- - DataFrame or Series - Result of applying ``func`` along the given axis of the - Series or DataFrame. - - Raises - ------ - ValueError - If the transform function fails or does not transform. - """ - is_series = obj.ndim == 1 - - if obj._get_axis_number(axis) == 1: - assert not is_series - return transform(obj.T, func, 0, *args, **kwargs).T - - if is_list_like(func) and not is_dict_like(func): - func = cast(List[AggFuncTypeBase], func) - # Convert func equivalent dict - if is_series: - func = {com.get_callable_name(v) or v: v for v in func} - else: - func = {col: func for col in obj} - - if is_dict_like(func): - func = cast(AggFuncTypeDict, func) - return transform_dict_like(obj, func, *args, **kwargs) - - # func is either str or callable - func = cast(AggFuncTypeBase, func) - try: - result = transform_str_or_callable(obj, func, *args, **kwargs) - except Exception: - raise ValueError("Transform function failed") - - # Functions that transform may return empty Series/DataFrame - # when the dtype is not appropriate - if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: - raise ValueError("Transform function failed") - if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( - obj.index - ): - raise ValueError("Function did not transform") - - return result - - -def transform_dict_like( - obj: FrameOrSeries, - func: AggFuncTypeDict, - *args, - **kwargs, -): - """ - Compute transform in the case of a dict-like func - """ - from pandas.core.reshape.concat import concat - - if len(func) == 0: - raise ValueError("No transform functions were provided") - - if obj.ndim != 1: - # Check for missing columns on a frame - cols = sorted(set(func.keys()) - set(obj.columns)) - if len(cols) > 0: - raise SpecificationError(f"Column(s) {cols} do not exist") - - # Can't use func.values(); wouldn't work for a Series - if any(is_dict_like(v) for _, v in func.items()): - # GH 15931 - deprecation of renaming keys - raise SpecificationError("nested renamer is not supported") - - results: Dict[Label, FrameOrSeriesUnion] = {} - for name, how in func.items(): - colg = obj._gotitem(name, ndim=1) - try: - results[name] = transform(colg, how, 0, *args, **kwargs) - except Exception as err: - if ( - str(err) == "Function did not transform" - or str(err) == "No transform functions were provided" - ): - raise err - - # combine results - if len(results) == 0: - raise ValueError("Transform function failed") - return concat(results, axis=1) - - -def transform_str_or_callable( - obj: FrameOrSeries, func: AggFuncTypeBase, *args, **kwargs -) -> FrameOrSeriesUnion: - """ - Compute transform in the case of a string or callable func - """ - if isinstance(func, str): - return obj._try_aggregate_string_function(func, *args, **kwargs) - - if not args and not kwargs: - f = 
obj._get_cython_func(func) - if f: - return getattr(obj, f)() - - # Two possible ways to use a UDF - apply or call directly - try: - return obj.apply(func, args=args, **kwargs) - except Exception: - return func(obj, *args, **kwargs) - - -def aggregate( - obj: AggObjType, - arg: AggFuncType, - *args, - **kwargs, -): - """ - Provide an implementation for the aggregators. - - Parameters - ---------- - obj : Pandas object to compute aggregation on. - arg : string, dict, function. - *args : args to pass on to the function. - **kwargs : kwargs to pass on to the function. - - Returns - ------- - tuple of result, how. - - Notes - ----- - how can be a string describe the required post-processing, or - None if not required. - """ - _axis = kwargs.pop("_axis", None) - if _axis is None: - _axis = getattr(obj, "axis", 0) - - if isinstance(arg, str): - return obj._try_aggregate_string_function(arg, *args, **kwargs), None - elif is_dict_like(arg): - arg = cast(AggFuncTypeDict, arg) - return agg_dict_like(obj, arg, _axis), True - elif is_list_like(arg): - # we require a list, but not an 'str' - arg = cast(List[AggFuncTypeBase], arg) - return agg_list_like(obj, arg, _axis=_axis), None - else: - result = None - - if callable(arg): - f = obj._get_cython_func(arg) - if f and not args and not kwargs: - return getattr(obj, f)(), None - - # caller can react - return result, True - - -def agg_list_like( - obj: AggObjType, - arg: List[AggFuncTypeBase], - _axis: int, -) -> FrameOrSeriesUnion: - """ - Compute aggregation in the case of a list-like argument. - - Parameters - ---------- - obj : Pandas object to compute aggregation on. - arg : list - Aggregations to compute. - _axis : int, 0 or 1 - Axis to compute aggregation on. - - Returns - ------- - Result of aggregation. - """ - from pandas.core.reshape.concat import concat - - if _axis != 0: - raise NotImplementedError("axis other than 0 is not supported") - - if obj._selected_obj.ndim == 1: - selected_obj = obj._selected_obj - else: - selected_obj = obj._obj_with_exclusions - - results = [] - keys = [] - - # degenerate case - if selected_obj.ndim == 1: - for a in arg: - colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) - try: - new_res = colg.aggregate(a) - - except TypeError: - pass - else: - results.append(new_res) - - # make sure we find a good name - name = com.get_callable_name(a) or a - keys.append(name) - - # multiples - else: - for index, col in enumerate(selected_obj): - colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) - try: - new_res = colg.aggregate(arg) - except (TypeError, DataError): - pass - except ValueError as err: - # cannot aggregate - if "Must produce aggregated value" in str(err): - # raised directly in _aggregate_named - pass - elif "no results" in str(err): - # raised directly in _aggregate_multiple_funcs - pass - else: - raise - else: - results.append(new_res) - keys.append(col) - - # if we are empty - if not len(results): - raise ValueError("no results") - - try: - return concat(results, keys=keys, axis=1, sort=False) - except TypeError as err: - - # we are concatting non-NDFrame objects, - # e.g. 
a list of scalars - - from pandas import Series - - result = Series(results, index=keys, name=obj.name) - if is_nested_object(result): - raise ValueError( - "cannot combine transform and aggregation operations" - ) from err - return result - - -def agg_dict_like( - obj: AggObjType, - arg: AggFuncTypeDict, - _axis: int, -) -> FrameOrSeriesUnion: - """ - Compute aggregation in the case of a dict-like argument. - - Parameters - ---------- - obj : Pandas object to compute aggregation on. - arg : dict - label-aggregation pairs to compute. - _axis : int, 0 or 1 - Axis to compute aggregation on. - - Returns - ------- - Result of aggregation. - """ - is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) - - if _axis != 0: # pragma: no cover - raise ValueError("Can only pass dict with axis=0") - - selected_obj = obj._selected_obj - - # if we have a dict of any non-scalars - # eg. {'A' : ['mean']}, normalize all to - # be list-likes - if any(is_aggregator(x) for x in arg.values()): - new_arg: AggFuncTypeDict = {} - for k, v in arg.items(): - if not isinstance(v, (tuple, list, dict)): - new_arg[k] = [v] - else: - new_arg[k] = v - - # the keys must be in the columns - # for ndim=2, or renamers for ndim=1 - - # ok for now, but deprecated - # {'A': { 'ra': 'mean' }} - # {'A': { 'ra': ['mean'] }} - # {'ra': ['mean']} - - # not ok - # {'ra' : { 'A' : 'mean' }} - if isinstance(v, dict): - raise SpecificationError("nested renamer is not supported") - elif isinstance(selected_obj, ABCSeries): - raise SpecificationError("nested renamer is not supported") - elif ( - isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns - ): - raise KeyError(f"Column '{k}' does not exist!") - - arg = new_arg - - else: - # deprecation of renaming keys - # GH 15931 - keys = list(arg.keys()) - if isinstance(selected_obj, ABCDataFrame) and len( - selected_obj.columns.intersection(keys) - ) != len(keys): - cols = sorted(set(keys) - set(selected_obj.columns.intersection(keys))) - raise SpecificationError(f"Column(s) {cols} do not exist") - - from pandas.core.reshape.concat import concat - - if selected_obj.ndim == 1: - # key only used for output - colg = obj._gotitem(obj._selection, ndim=1) - results = {key: colg.agg(how) for key, how in arg.items()} - else: - # key used for column selection and output - results = {key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()} - - # set the final keys - keys = list(arg.keys()) - - # Avoid making two isinstance calls in all and any below - is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] - - # combine results - if all(is_ndframe): - keys_to_use = [k for k in keys if not results[k].empty] - # Have to check, if at least one DataFrame is not empty. 
- keys_to_use = keys_to_use if keys_to_use != [] else keys - axis = 0 if isinstance(obj, ABCSeries) else 1 - result = concat({k: results[k] for k in keys_to_use}, axis=axis) - elif any(is_ndframe): - # There is a mix of NDFrames and scalars - raise ValueError( - "cannot perform both aggregation " - "and transformation operations " - "simultaneously" - ) - else: - from pandas import Series - - # we have a dict of scalars - # GH 36212 use name only if obj is a series - if obj.ndim == 1: - obj = cast("Series", obj) - name = obj.name - else: - name = None - - result = Series(results, name=name) - - return result diff --git a/venv/lib/python3.8/site-packages/pandas/core/algorithms.py b/venv/lib/python3.8/site-packages/pandas/core/algorithms.py index 1d411f3..32b5eae 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/algorithms.py +++ b/venv/lib/python3.8/site-packages/pandas/core/algorithms.py @@ -2,17 +2,15 @@ Generic data algorithms. This module is experimental at the moment and not intended for public consumption """ -from __future__ import annotations - import operator from textwrap import dedent -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn import numpy as np from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib -from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion +from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -50,9 +48,9 @@ from pandas.core.dtypes.common import ( from pandas.core.dtypes.generic import ( ABCDatetimeArray, ABCExtensionArray, + ABCIndex, ABCIndexClass, ABCMultiIndex, - ABCRangeIndex, ABCSeries, ABCTimedeltaArray, ) @@ -62,7 +60,7 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Categorical, DataFrame, Index, Series + from pandas import Series _shared_docs: Dict[str, str] = {} @@ -71,7 +69,7 @@ _shared_docs: Dict[str, str] = {} # dtype access # # --------------- # def _ensure_data( - values: ArrayLike, dtype: Optional[DtypeObj] = None + values, dtype: Optional[DtypeObj] = None ) -> Tuple[np.ndarray, DtypeObj]: """ routine to ensure that our data is of the correct @@ -97,12 +95,6 @@ def _ensure_data( pandas_dtype : np.dtype or ExtensionDtype """ - if dtype is not None: - # We only have non-None dtype when called from `isin`, and - # both Datetimelike and Categorical dispatch before getting here. 
- assert not needs_i8_conversion(dtype) - assert not is_categorical_dtype(dtype) - if not isinstance(values, ABCMultiIndex): # extract_array would raise values = extract_array(values, extract_numpy=True) @@ -139,20 +131,21 @@ def _ensure_data( return ensure_object(values), np.dtype("object") # datetimelike - if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype): - if is_period_dtype(values.dtype) or is_period_dtype(dtype): + vals_dtype = getattr(values, "dtype", None) + if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype): + if is_period_dtype(vals_dtype) or is_period_dtype(dtype): from pandas import PeriodIndex - values = PeriodIndex(values)._data + values = PeriodIndex(values) dtype = values.dtype - elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex - values = TimedeltaIndex(values)._data + values = TimedeltaIndex(values) dtype = values.dtype else: # Datetime - if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype): + if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 # TODO(EA2D): special case not needed with 2D EAs @@ -162,15 +155,14 @@ def _ensure_data( from pandas import DatetimeIndex - values = DatetimeIndex(values)._data + values = DatetimeIndex(values) dtype = values.dtype return values.asi8, dtype - elif is_categorical_dtype(values.dtype) and ( + elif is_categorical_dtype(vals_dtype) and ( is_categorical_dtype(dtype) or dtype is None ): - values = cast("Categorical", values) values = values.codes dtype = pandas_dtype("category") @@ -234,8 +226,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ["mixed", "string", "mixed-integer"]: - # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160 + if inferred in ["mixed", "string"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -253,11 +244,11 @@ _hashtables = { } -def _get_hashtable_algo(values: np.ndarray): +def _get_hashtable_algo(values): """ Parameters ---------- - values : np.ndarray + values : arraylike Returns ------- @@ -271,15 +262,15 @@ def _get_hashtable_algo(values: np.ndarray): return htable, values -def _get_values_for_rank(values: ArrayLike): +def _get_values_for_rank(values): if is_categorical_dtype(values): - values = cast("Categorical", values)._values_for_rank() + values = values._values_for_rank() values, _ = _ensure_data(values) return values -def get_data_algo(values: ArrayLike): +def _get_data_algo(values): values = _get_values_for_rank(values) ndtype = _check_object_for_strings(values) @@ -295,6 +286,7 @@ def _check_object_for_strings(values) -> str: Parameters ---------- values : ndarray + ndtype : str Returns ------- @@ -437,64 +429,54 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: f"to isin(), you passed a [{type(values).__name__}]" ) - if not isinstance( - values, (ABCIndexClass, ABCSeries, ABCExtensionArray, np.ndarray) - ): - values = _ensure_arraylike(list(values)) - elif isinstance(values, ABCMultiIndex): - # Avoid raising in extract_array - values = np.array(values) - else: - values = extract_array(values, extract_numpy=True) + if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)): + values = 
construct_1d_object_array_from_listlike(list(values)) + # TODO: could use ensure_arraylike here - comps = _ensure_arraylike(comps) comps = extract_array(comps, extract_numpy=True) - if is_categorical_dtype(comps.dtype): + if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return cast("Categorical", comps).isin(values) + return comps.isin(values) # type: ignore - if needs_i8_conversion(comps.dtype): - # Dispatch to DatetimeLikeArrayMixin.isin - return array(comps).isin(values) - elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype): - # e.g. comps are integers and values are datetime64s - return np.zeros(comps.shape, dtype=bool) - # TODO: not quite right ... Sparse/Categorical - elif needs_i8_conversion(values.dtype): - return isin(comps, values.astype(object)) + comps, dtype = _ensure_data(comps) + values, _ = _ensure_data(values, dtype=dtype) - elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype( - values.dtype - ): - return isin(np.asarray(comps), np.asarray(values)) + # faster for larger cases to use np.in1d + f = htable.ismember_object # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception - # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), - # in1d is faster for small sizes - if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps): - # If the values include nan we need to check for nan explicitly + if len(comps) > 1_000_000 and not is_object_dtype(comps): + # If the the values include nan we need to check for nan explicitly # since np.nan it not equal to np.nan if isna(values).any(): f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) else: f = np.in1d + elif is_integer_dtype(comps): + try: + values = values.astype("int64", copy=False) + comps = comps.astype("int64", copy=False) + f = htable.ismember_int64 + except (TypeError, ValueError, OverflowError): + values = values.astype(object) + comps = comps.astype(object) - else: - common = np.find_common_type([values.dtype, comps.dtype], []) - values = values.astype(common, copy=False) - comps = comps.astype(common, copy=False) - name = common.name - if name == "bool": - name = "uint8" - f = getattr(htable, f"ismember_{name}") + elif is_float_dtype(comps): + try: + values = values.astype("float64", copy=False) + comps = comps.astype("float64", copy=False) + f = htable.ismember_float64 + except (TypeError, ValueError): + values = values.astype(object) + comps = comps.astype(object) return f(comps, values) -def factorize_array( - values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None +def _factorize_array( + values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None, ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. @@ -522,7 +504,7 @@ def factorize_array( codes : ndarray uniques : ndarray """ - hash_klass, values = get_data_algo(values) + hash_klass, values = _get_data_algo(values) table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( @@ -560,7 +542,7 @@ def factorize( sort: bool = False, na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, -) -> Tuple[np.ndarray, Union[np.ndarray, "Index"]]: +) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: """ Encode the object as an enumerated type or categorical variable. @@ -680,9 +662,6 @@ def factorize( # responsible only for factorization. All data coercion, sorting and boxing # should happen here. 
- if isinstance(values, ABCRangeIndex): - return values.factorize(sort=sort) - values = _ensure_arraylike(values) original = values if not isinstance(values, ABCMultiIndex): @@ -719,7 +698,7 @@ def factorize( else: na_value = None - codes, uniques = factorize_array( + codes, uniques = _factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value ) @@ -740,8 +719,6 @@ def factorize( # return original tenor if isinstance(original, ABCIndexClass): - if original.dtype.kind in ["m", "M"] and isinstance(uniques, np.ndarray): - uniques = type(original._data)._simple_new(uniques, dtype=original.dtype) uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index @@ -758,7 +735,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -) -> Series: +) -> "Series": """ Compute a histogram of the counts of non-null values. @@ -817,7 +794,7 @@ def value_counts( counts = result._values else: - keys, counts = value_counts_arraylike(values, dropna) + keys, counts = _value_counts_arraylike(values, dropna) result = Series(counts, index=keys, name=name) @@ -830,8 +807,8 @@ def value_counts( return result -# Called once from SparseArray, otherwise could be private -def value_counts_arraylike(values, dropna: bool): +# Called once from SparseArray +def _value_counts_arraylike(values, dropna: bool): """ Parameters ---------- @@ -875,7 +852,7 @@ def value_counts_arraylike(values, dropna: bool): return keys, counts -def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray: +def duplicated(values, keep="first") -> np.ndarray: """ Return boolean ndarray denoting duplicate values. @@ -900,7 +877,7 @@ def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray: return f(values, keep=keep) -def mode(values, dropna: bool = True) -> Series: +def mode(values, dropna: bool = True) -> "Series": """ Returns the mode(s) of an array. 
@@ -1068,10 +1045,11 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] - ).any() or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() + ) if to_raise: raise OverflowError("Overflow in int64 addition") @@ -1176,9 +1154,6 @@ class SelectN: if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') - def compute(self, method: str) -> FrameOrSeriesUnion: - raise NotImplementedError - def nlargest(self): return self.compute("nlargest") @@ -1211,7 +1186,7 @@ class SelectNSeries(SelectN): nordered : Series """ - def compute(self, method: str) -> Series: + def compute(self, method): n = self.n dtype = self.obj.dtype @@ -1225,8 +1200,10 @@ class SelectNSeries(SelectN): # slow method if n >= len(self.obj): + reverse_it = self.keep == "last" or method == "nlargest" ascending = method == "nsmallest" - return dropped.sort_values(ascending=ascending).head(n) + slc = np.s_[::-1] if reverse_it else np.s_[:] + return dropped[slc].sort_values(ascending=ascending).head(n) # fast method arr, pandas_dtype = _ensure_data(dropped.values) @@ -1283,7 +1260,7 @@ class SelectNFrame(SelectN): columns = list(columns) self.columns = columns - def compute(self, method: str) -> DataFrame: + def compute(self, method): from pandas import Int64Index @@ -1571,6 +1548,8 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) """ Take elements from an array. + .. versionadded:: 0.23.0 + Parameters ---------- arr : sequence @@ -1588,7 +1567,7 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other - negative values raise a ``ValueError``. + other negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indices when `allow_fill` is True. @@ -1694,8 +1673,7 @@ def take_nd( """ mask_info = None - if isinstance(arr, ABCExtensionArray): - # Check for EA to catch DatetimeArray, TimedeltaArray + if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) arr = extract_array(arr) @@ -1826,7 +1804,7 @@ def take_2d_multi(arr, indexer, fill_value=np.nan): # ------------ # -def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: +def searchsorted(arr, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. 
@@ -1875,7 +1853,7 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: if ( isinstance(arr, np.ndarray) - and is_integer_dtype(arr.dtype) + and is_integer_dtype(arr) and (is_integer(value) or is_integer_dtype(value)) ): # if `arr` and `value` have different dtypes, `arr` would be @@ -1953,8 +1931,6 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if is_extension_array_dtype(dtype): if hasattr(arr, f"__{op.__name__}__"): - if axis != 0: - raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") return op(arr, arr.shift(n)) else: warn( @@ -1969,26 +1945,18 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False is_bool = False if needs_i8_conversion(arr.dtype): - dtype = np.int64 + dtype = np.float64 arr = arr.view("i8") na = iNaT is_timedelta = True elif is_bool_dtype(dtype): - # We have to cast in order to be able to hold np.nan dtype = np.object_ is_bool = True elif is_integer_dtype(dtype): - # We have to cast in order to be able to hold np.nan dtype = np.float64 - orig_ndim = arr.ndim - if orig_ndim == 1: - # reshape so we can always use algos.diff_2d - arr = arr.reshape(-1, 1) - # TODO: require axis == 0 - dtype = np.dtype(dtype) out_arr = np.empty(arr.shape, dtype=dtype) @@ -1999,7 +1967,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if arr.ndim == 2 and arr.dtype.name in _diff_special: # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) + algos.diff_2d(arr, out_arr, n, axis) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. @@ -2033,10 +2001,8 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] if is_timedelta: - out_arr = out_arr.view("timedelta64[ns]") + out_arr = out_arr.astype("int64").view("timedelta64[ns]") - if orig_ndim == 1: - out_arr = out_arr[:, 0] return out_arr @@ -2100,30 +2066,32 @@ def safe_sort( "Only list-like objects are allowed to be passed to safe_sort as values" ) - if not isinstance(values, (np.ndarray, ABCExtensionArray)): + if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) - sorter = None + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + sorter = None if ( not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == "mixed-integer" ): - ordered = _sort_mixed(values) + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: - # Previous sorters failed or were not applicable, try `_sort_mixed` - # which would work, but which fails for special case of 1d arrays - # with tuples. 
- if values.size and isinstance(values[0], tuple): - ordered = _sort_tuples(values) - else: - ordered = _sort_mixed(values) + # try this anyway + ordered = sort_mixed(values) # codes: @@ -2142,7 +2110,7 @@ def safe_sort( if sorter is None: # mixed types - hash_klass, values = get_data_algo(values) + hash_klass, values = _get_data_algo(values) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) @@ -2170,26 +2138,3 @@ def safe_sort( np.putmask(new_codes, mask, na_sentinel) return ordered, ensure_platform_int(new_codes) - - -def _sort_mixed(values): - """ order ints before strings in 1d arrays, safe in py3 """ - str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) - - -def _sort_tuples(values: np.ndarray[tuple]): - """ - Convert array of tuples (1d) to array or array (2d). - We need to keep the columns separately as they contain different types and - nans (can't use `np.sort` as it may fail when str and nan are mixed in a - column as types cannot be compared). - """ - from pandas.core.internals.construction import to_arrays - from pandas.core.sorting import lexsort_indexer - - arrays, _ = to_arrays(values, None) - indexer = lexsort_indexer(arrays, orders=True) - return values[indexer] diff --git a/venv/lib/python3.8/site-packages/pandas/core/api.py b/venv/lib/python3.8/site-packages/pandas/core/api.py index 67e86c2..b0b65f9 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/api.py +++ b/venv/lib/python3.8/site-packages/pandas/core/api.py @@ -14,7 +14,6 @@ from pandas.core.dtypes.missing import isna, isnull, notna, notnull from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.floating import Float32Dtype, Float64Dtype from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -27,7 +26,6 @@ from pandas.core.arrays.integer import ( ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array -from pandas.core.flags import Flags from pandas.core.groupby import Grouper, NamedAgg from pandas.core.indexes.api import ( CategoricalIndex, diff --git a/venv/lib/python3.8/site-packages/pandas/core/apply.py b/venv/lib/python3.8/site-packages/pandas/core/apply.py index 6d9e11e..af47174 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/apply.py +++ b/venv/lib/python3.8/site-packages/pandas/core/apply.py @@ -1,12 +1,12 @@ import abc import inspect -from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type, Union import numpy as np from pandas._config import option_context -from pandas._typing import Axis, FrameOrSeriesUnion +from pandas._typing import Axis from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -31,6 +31,7 @@ def frame_apply( axis: Axis = 0, raw: bool = False, result_type: Optional[str] = None, + ignore_failures: bool = False, args=None, kwds=None, ): @@ -47,6 +48,7 @@ def frame_apply( func, raw=raw, result_type=result_type, + ignore_failures=ignore_failures, args=args, kwds=kwds, ) @@ -76,7 +78,7 @@ class FrameApply(metaclass=abc.ABCMeta): @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> FrameOrSeriesUnion: + ) -> Union["Series", 
"DataFrame"]: pass # --------------------------------------------------------------- @@ -87,11 +89,13 @@ class FrameApply(metaclass=abc.ABCMeta): func, raw: bool, result_type: Optional[str], + ignore_failures: bool, args, kwds, ): self.obj = obj self.raw = raw + self.ignore_failures = ignore_failures self.args = args or () self.kwds = kwds or {} @@ -142,11 +146,7 @@ class FrameApply(metaclass=abc.ABCMeta): """ compute the results """ # dispatch to agg if is_list_like(self.f) or is_dict_like(self.f): - # pandas\core\apply.py:144: error: "aggregate" of "DataFrame" gets - # multiple values for keyword argument "axis" - return self.obj.aggregate( # type: ignore[misc] - self.f, axis=self.axis, *self.args, **self.kwds - ) + return self.obj.aggregate(self.f, axis=self.axis, *self.args, **self.kwds) # all empty if len(self.columns) == 0 and len(self.index) == 0: @@ -284,18 +284,35 @@ class FrameApply(metaclass=abc.ABCMeta): results = {} - with option_context("mode.chained_assignment", None): + if self.ignore_failures: + successes = [] for i, v in enumerate(series_gen): - # ignore SettingWithCopy here in case the user mutates - results[i] = self.f(v) - if isinstance(results[i], ABCSeries): - # If we have a view on v, we need to make a copy because - # series_generator will swap out the underlying data - results[i] = results[i].copy(deep=False) + try: + results[i] = self.f(v) + except Exception: + pass + else: + successes.append(i) + + # so will work with MultiIndex + if len(successes) < len(res_index): + res_index = res_index.take(successes) + + else: + with option_context("mode.chained_assignment", None): + for i, v in enumerate(series_gen): + # ignore SettingWithCopy here in case the user mutates + results[i] = self.f(v) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index - def wrap_results(self, results: ResType, res_index: "Index") -> FrameOrSeriesUnion: + def wrap_results( + self, results: ResType, res_index: "Index" + ) -> Union["Series", "DataFrame"]: from pandas import Series # see if we can infer the results @@ -339,7 +356,7 @@ class FrameRowApply(FrameApply): def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> FrameOrSeriesUnion: + ) -> Union["Series", "DataFrame"]: """ return the results for the rows """ if self.result_type == "reduce": @@ -352,10 +369,8 @@ class FrameRowApply(FrameApply): isinstance(x, dict) for x in results.values() ): # Our operation was a to_dict op e.g. 
- # test_apply_dict GH#8735, test_apply_reduce_to_dict GH#25196 #37544 - res = self.obj._constructor_sliced(results) - res.index = res_index - return res + # test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196 + return self.obj._constructor_sliced(results) try: result = self.obj._constructor(data=results) @@ -422,9 +437,9 @@ class FrameColumnApply(FrameApply): def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> FrameOrSeriesUnion: + ) -> Union["Series", "DataFrame"]: """ return the results for the columns """ - result: FrameOrSeriesUnion + result: Union["Series", "DataFrame"] # we have requested to expand if self.result_type == "expand": diff --git a/venv/lib/python3.8/site-packages/pandas/core/array_algos/masked_reductions.py b/venv/lib/python3.8/site-packages/pandas/core/array_algos/masked_reductions.py index bce6f1a..1b9ed01 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/array_algos/masked_reductions.py +++ b/venv/lib/python3.8/site-packages/pandas/core/array_algos/masked_reductions.py @@ -8,7 +8,7 @@ from typing import Callable import numpy as np from pandas._libs import missing as libmissing -from pandas.compat.numpy import np_version_under1p17 +from pandas.compat.numpy import _np_version_under1p17 from pandas.core.nanops import check_below_min_count @@ -17,7 +17,6 @@ def _sumprod( func: Callable, values: np.ndarray, mask: np.ndarray, - *, skipna: bool = True, min_count: int = 0, ): @@ -47,31 +46,25 @@ def _sumprod( if check_below_min_count(values.shape, mask, min_count): return libmissing.NA - if np_version_under1p17: + if _np_version_under1p17: return func(values[~mask]) else: return func(values, where=~mask) -def sum( - values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 -): +def sum(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): return _sumprod( np.sum, values=values, mask=mask, skipna=skipna, min_count=min_count ) -def prod( - values: np.ndarray, mask: np.ndarray, *, skipna: bool = True, min_count: int = 0 -): +def prod(values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0): return _sumprod( np.prod, values=values, mask=mask, skipna=skipna, min_count=min_count ) -def _minmax( - func: Callable, values: np.ndarray, mask: np.ndarray, *, skipna: bool = True -): +def _minmax(func: Callable, values: np.ndarray, mask: np.ndarray, skipna: bool = True): """ Reduction for 1D masked array. @@ -101,9 +94,9 @@ def _minmax( return libmissing.NA -def min(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def min(values: np.ndarray, mask: np.ndarray, skipna: bool = True): return _minmax(np.min, values=values, mask=mask, skipna=skipna) -def max(values: np.ndarray, mask: np.ndarray, *, skipna: bool = True): +def max(values: np.ndarray, mask: np.ndarray, skipna: bool = True): return _minmax(np.max, values=values, mask=mask, skipna=skipna) diff --git a/venv/lib/python3.8/site-packages/pandas/core/array_algos/replace.py b/venv/lib/python3.8/site-packages/pandas/core/array_algos/replace.py deleted file mode 100644 index 76d723b..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/array_algos/replace.py +++ /dev/null @@ -1,133 +0,0 @@ -""" -Methods used by Block.replace and related methods. 
-""" -import operator -import re -from typing import Optional, Pattern, Union - -import numpy as np - -from pandas._typing import ArrayLike, Scalar - -from pandas.core.dtypes.common import ( - is_datetimelike_v_numeric, - is_numeric_v_string_like, - is_re, - is_scalar, -) -from pandas.core.dtypes.missing import isna - - -def compare_or_regex_search( - a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike -) -> Union[ArrayLike, bool]: - """ - Compare two array_like inputs of the same shape or two scalar values - - Calls operator.eq or re.search, depending on regex argument. If regex is - True, perform an element-wise regex matching. - - Parameters - ---------- - a : array_like - b : scalar or regex pattern - regex : bool - mask : array_like - - Returns - ------- - mask : array_like of bool - """ - - def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] - ): - """ - Raises an error if the two arrays (a,b) cannot be compared. - Otherwise, returns the comparison result as expected. - """ - if is_scalar(result) and isinstance(a, np.ndarray): - type_names = [type(a).__name__, type(b).__name__] - - if isinstance(a, np.ndarray): - type_names[0] = f"ndarray(dtype={a.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - - if not regex: - op = lambda x: operator.eq(x, b) - else: - op = np.vectorize( - lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False - ) - - # GH#32621 use mask to avoid comparing to NAs - if isinstance(a, np.ndarray): - a = a[mask] - - if is_numeric_v_string_like(a, b): - # GH#29553 avoid deprecation warnings from numpy - return np.zeros(a.shape, dtype=bool) - - elif is_datetimelike_v_numeric(a, b): - # GH#29553 avoid deprecation warnings from numpy - _check_comparison_types(False, a, b) - return False - - result = op(a) - - if isinstance(result, np.ndarray) and mask is not None: - # The shape of the mask can differ to that of the result - # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool_) - tmp[mask] = result - result = tmp - - _check_comparison_types(result, a, b) - return result - - -def replace_regex(values: ArrayLike, rx: re.Pattern, value, mask: Optional[np.ndarray]): - """ - Parameters - ---------- - values : ArrayLike - Object dtype. - rx : re.Pattern - value : Any - mask : np.ndarray[bool], optional - - Notes - ----- - Alters values in-place. 
- """ - - # deal with replacing values with objects (strings) that match but - # whose replacement is not a string (numeric, nan, object) - if isna(value) or not isinstance(value, str): - - def re_replacer(s): - if is_re(rx) and isinstance(s, str): - return value if rx.search(s) is not None else s - else: - return s - - else: - # value is guaranteed to be a string here, s can be either a string - # or null if it's null it gets returned - def re_replacer(s): - if is_re(rx) and isinstance(s, str): - return rx.sub(value, s) - else: - return s - - f = np.vectorize(re_replacer, otypes=[values.dtype]) - - if mask is None: - values[:] = f(values) - else: - values[mask] = f(values[mask]) diff --git a/venv/lib/python3.8/site-packages/pandas/core/arraylike.py b/venv/lib/python3.8/site-packages/pandas/core/arraylike.py deleted file mode 100644 index 6b28f8f..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/arraylike.py +++ /dev/null @@ -1,284 +0,0 @@ -""" -Methods that can be shared by many array-like classes or subclasses: - Series - Index - ExtensionArray -""" -import operator -from typing import Any, Callable -import warnings - -import numpy as np - -from pandas._libs import lib - -from pandas.core.construction import extract_array -from pandas.core.ops import maybe_dispatch_ufunc_to_dunder_op, roperator -from pandas.core.ops.common import unpack_zerodim_and_defer - - -class OpsMixin: - # ------------------------------------------------------------- - # Comparisons - - def _cmp_method(self, other, op): - return NotImplemented - - @unpack_zerodim_and_defer("__eq__") - def __eq__(self, other): - return self._cmp_method(other, operator.eq) - - @unpack_zerodim_and_defer("__ne__") - def __ne__(self, other): - return self._cmp_method(other, operator.ne) - - @unpack_zerodim_and_defer("__lt__") - def __lt__(self, other): - return self._cmp_method(other, operator.lt) - - @unpack_zerodim_and_defer("__le__") - def __le__(self, other): - return self._cmp_method(other, operator.le) - - @unpack_zerodim_and_defer("__gt__") - def __gt__(self, other): - return self._cmp_method(other, operator.gt) - - @unpack_zerodim_and_defer("__ge__") - def __ge__(self, other): - return self._cmp_method(other, operator.ge) - - # ------------------------------------------------------------- - # Logical Methods - - def _logical_method(self, other, op): - return NotImplemented - - @unpack_zerodim_and_defer("__and__") - def __and__(self, other): - return self._logical_method(other, operator.and_) - - @unpack_zerodim_and_defer("__rand__") - def __rand__(self, other): - return self._logical_method(other, roperator.rand_) - - @unpack_zerodim_and_defer("__or__") - def __or__(self, other): - return self._logical_method(other, operator.or_) - - @unpack_zerodim_and_defer("__ror__") - def __ror__(self, other): - return self._logical_method(other, roperator.ror_) - - @unpack_zerodim_and_defer("__xor__") - def __xor__(self, other): - return self._logical_method(other, operator.xor) - - @unpack_zerodim_and_defer("__rxor__") - def __rxor__(self, other): - return self._logical_method(other, roperator.rxor) - - # ------------------------------------------------------------- - # Arithmetic Methods - - def _arith_method(self, other, op): - return NotImplemented - - @unpack_zerodim_and_defer("__add__") - def __add__(self, other): - return self._arith_method(other, operator.add) - - @unpack_zerodim_and_defer("__radd__") - def __radd__(self, other): - return self._arith_method(other, roperator.radd) - - @unpack_zerodim_and_defer("__sub__") - 
def __sub__(self, other): - return self._arith_method(other, operator.sub) - - @unpack_zerodim_and_defer("__rsub__") - def __rsub__(self, other): - return self._arith_method(other, roperator.rsub) - - @unpack_zerodim_and_defer("__mul__") - def __mul__(self, other): - return self._arith_method(other, operator.mul) - - @unpack_zerodim_and_defer("__rmul__") - def __rmul__(self, other): - return self._arith_method(other, roperator.rmul) - - @unpack_zerodim_and_defer("__truediv__") - def __truediv__(self, other): - return self._arith_method(other, operator.truediv) - - @unpack_zerodim_and_defer("__rtruediv__") - def __rtruediv__(self, other): - return self._arith_method(other, roperator.rtruediv) - - @unpack_zerodim_and_defer("__floordiv__") - def __floordiv__(self, other): - return self._arith_method(other, operator.floordiv) - - @unpack_zerodim_and_defer("__rfloordiv") - def __rfloordiv__(self, other): - return self._arith_method(other, roperator.rfloordiv) - - @unpack_zerodim_and_defer("__mod__") - def __mod__(self, other): - return self._arith_method(other, operator.mod) - - @unpack_zerodim_and_defer("__rmod__") - def __rmod__(self, other): - return self._arith_method(other, roperator.rmod) - - @unpack_zerodim_and_defer("__divmod__") - def __divmod__(self, other): - return self._arith_method(other, divmod) - - @unpack_zerodim_and_defer("__rdivmod__") - def __rdivmod__(self, other): - return self._arith_method(other, roperator.rdivmod) - - @unpack_zerodim_and_defer("__pow__") - def __pow__(self, other): - return self._arith_method(other, operator.pow) - - @unpack_zerodim_and_defer("__rpow__") - def __rpow__(self, other): - return self._arith_method(other, roperator.rpow) - - -def array_ufunc(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any): - """ - Compatibility with numpy ufuncs. - - See also - -------- - numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ - """ - from pandas.core.generic import NDFrame - from pandas.core.internals import BlockManager - - cls = type(self) - - # for binary ops, use our custom dunder methods - result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs) - if result is not NotImplemented: - return result - - # Determine if we should defer. - no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) - - for item in inputs: - higher_priority = ( - hasattr(item, "__array_priority__") - and item.__array_priority__ > self.__array_priority__ - ) - has_array_ufunc = ( - hasattr(item, "__array_ufunc__") - and type(item).__array_ufunc__ not in no_defer - and not isinstance(item, self._HANDLED_TYPES) - ) - if higher_priority or has_array_ufunc: - return NotImplemented - - # align all the inputs. - types = tuple(type(x) for x in inputs) - alignable = [x for x, t in zip(inputs, types) if issubclass(t, NDFrame)] - - if len(alignable) > 1: - # This triggers alignment. - # At the moment, there aren't any ufuncs with more than two inputs - # so this ends up just being x1.index | x2.index, but we write - # it to handle *args. - - if len(set(types)) > 1: - # We currently don't handle ufunc(DataFrame, Series) - # well. Previously this raised an internal ValueError. We might - # support it someday, so raise a NotImplementedError. - raise NotImplementedError( - "Cannot apply ufunc {} to mixed DataFrame and Series " - "inputs.".format(ufunc) - ) - axes = self.axes - for obj in alignable[1:]: - # this relies on the fact that we aren't handling mixed - # series / frame ufuncs. 
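The alignment loop being deleted here is what gives NumPy ufuncs on multiple pandas objects their label-aligned behaviour. A small demonstration of that user-visible effect, using only public API and assuming a pandas build that still performs the alignment:

```python
import numpy as np
import pandas as pd

s1 = pd.Series([1.0, 2.0], index=["a", "b"])
s2 = pd.Series([10.0, 20.0], index=["b", "c"])

# Inputs are reindexed to the union of the indexes before the ufunc runs,
# so labels present in only one operand come back as NaN.
print(np.add(s1, s2))
# a     NaN
# b    12.0
# c     NaN
# dtype: float64
```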
- for i, (ax1, ax2) in enumerate(zip(axes, obj.axes)): - axes[i] = ax1.union(ax2) - - reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes)) - inputs = tuple( - x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x - for x, t in zip(inputs, types) - ) - else: - reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes)) - - if self.ndim == 1: - names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] - name = names[0] if len(set(names)) == 1 else None - reconstruct_kwargs = {"name": name} - else: - reconstruct_kwargs = {} - - def reconstruct(result): - if lib.is_scalar(result): - return result - if result.ndim != self.ndim: - if method == "outer": - if self.ndim == 2: - # we already deprecated for Series - msg = ( - "outer method for ufunc {} is not implemented on " - "pandas objects. Returning an ndarray, but in the " - "future this will raise a 'NotImplementedError'. " - "Consider explicitly converting the DataFrame " - "to an array with '.to_numpy()' first." - ) - warnings.warn(msg.format(ufunc), FutureWarning, stacklevel=4) - return result - raise NotImplementedError - return result - if isinstance(result, BlockManager): - # we went through BlockManager.apply - result = self._constructor(result, **reconstruct_kwargs, copy=False) - else: - # we converted an array, lost our axes - result = self._constructor( - result, **reconstruct_axes, **reconstruct_kwargs, copy=False - ) - # TODO: When we support multiple values in __finalize__, this - # should pass alignable to `__fianlize__` instead of self. - # Then `np.add(a, b)` would consider attrs from both a and b - # when a and b are NDFrames. - if len(alignable) == 1: - result = result.__finalize__(self) - return result - - if self.ndim > 1 and ( - len(inputs) > 1 or ufunc.nout > 1 # type: ignore[attr-defined] - ): - # Just give up on preserving types in the complex case. - # In theory we could preserve them for them. - # * nout>1 is doable if BlockManager.apply took nout and - # returned a Tuple[BlockManager]. - # * len(inputs) > 1 is doable when we know that we have - # aligned blocks / dtypes. - inputs = tuple(np.asarray(x) for x in inputs) - result = getattr(ufunc, method)(*inputs) - elif self.ndim == 1: - # ufunc(series, ...) 
- inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) - result = getattr(ufunc, method)(*inputs, **kwargs) - else: - # ufunc(dataframe) - mgr = inputs[0]._mgr - result = mgr.apply(getattr(ufunc, method)) - - if ufunc.nout > 1: # type: ignore[attr-defined] - result = tuple(reconstruct(x) for x in result) - else: - result = reconstruct(result) - return result diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/__init__.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/__init__.py index e5258a6..1d53882 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/__init__.py @@ -6,10 +6,8 @@ from pandas.core.arrays.base import ( from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.datetimes import DatetimeArray -from pandas.core.arrays.floating import FloatingArray from pandas.core.arrays.integer import IntegerArray, integer_array from pandas.core.arrays.interval import IntervalArray -from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.numpy_ import PandasArray, PandasDtype from pandas.core.arrays.period import PeriodArray, period_array from pandas.core.arrays.sparse import SparseArray @@ -20,11 +18,9 @@ __all__ = [ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", - "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", - "FloatingArray", "IntegerArray", "integer_array", "IntervalArray", diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/_arrow_utils.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/_arrow_utils.py index 959a13d..4a33e0e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/_arrow_utils.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/_arrow_utils.py @@ -4,7 +4,7 @@ import json import numpy as np import pyarrow -from pandas.core.arrays.interval import VALID_CLOSED +from pandas.core.arrays.interval import _VALID_CLOSED _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") @@ -30,7 +30,7 @@ def pyarrow_array_to_numpy_and_mask(arr, dtype): bitmask = buflist[0] if bitmask is not None: mask = pyarrow.BooleanArray.from_buffers( - pyarrow.bool_(), len(arr), [None, bitmask], offset=arr.offset + pyarrow.bool_(), len(arr), [None, bitmask] ) mask = np.asarray(mask) else: @@ -83,7 +83,7 @@ if _pyarrow_version_ge_015: def __init__(self, subtype, closed): # attributes need to be set first before calling # super init (as that calls serialize) - assert closed in VALID_CLOSED + assert closed in _VALID_CLOSED self._closed = closed if not isinstance(subtype, pyarrow.DataType): subtype = pyarrow.type_for_alias(str(subtype)) diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py index 02214ff..832d09b 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py @@ -1,30 +1,15 @@ -from __future__ import annotations - -from typing import Any, Optional, Sequence, Type, TypeVar, Union +from typing import Any, Sequence, Tuple, TypeVar import numpy as np -from pandas._libs import lib -from pandas._typing import Shape from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly, doc -from pandas.util._validators import validate_fillna_kwargs 
+from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import is_dtype_equal -from pandas.core.dtypes.inference import is_array_like -from pandas.core.dtypes.missing import array_equivalent - -from pandas.core import missing from pandas.core.algorithms import take, unique -from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray -from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer -NDArrayBackedExtensionArrayT = TypeVar( - "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray" -) +_T = TypeVar("_T", bound="NDArrayBackedExtensionArray") class NDArrayBackedExtensionArray(ExtensionArray): @@ -34,9 +19,7 @@ class NDArrayBackedExtensionArray(ExtensionArray): _ndarray: np.ndarray - def _from_backing_data( - self: NDArrayBackedExtensionArrayT, arr: np.ndarray - ) -> NDArrayBackedExtensionArrayT: + def _from_backing_data(self: _T, arr: np.ndarray) -> _T: """ Construct a new ExtensionArray `new_array` with `arr` as its _ndarray. @@ -45,42 +28,26 @@ class NDArrayBackedExtensionArray(ExtensionArray): """ raise AbstractMethodError(self) - def _box_func(self, x): - """ - Wrap numpy type in our dtype.type if necessary. - """ - return x - - def _validate_scalar(self, value): - # used by NDArrayBackedExtensionIndex.insert - raise AbstractMethodError(self) - # ------------------------------------------------------------------------ def take( - self: NDArrayBackedExtensionArrayT, + self: _T, indices: Sequence[int], - *, allow_fill: bool = False, fill_value: Any = None, - axis: int = 0, - ) -> NDArrayBackedExtensionArrayT: + ) -> _T: if allow_fill: fill_value = self._validate_fill_value(fill_value) new_data = take( - self._ndarray, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - axis=axis, + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value, ) return self._from_backing_data(new_data) def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to a representation - suitable for self._ndarray, raising TypeError if this is not possible. + suitable for self._ndarray, raising ValueError if this is not possible. 
Parameters ---------- @@ -92,7 +59,7 @@ class NDArrayBackedExtensionArray(ExtensionArray): Raises ------ - TypeError + ValueError """ raise AbstractMethodError(self) @@ -101,7 +68,7 @@ class NDArrayBackedExtensionArray(ExtensionArray): # TODO: make this a cache_readonly; for that to work we need to remove # the _index_data kludge in libreduction @property - def shape(self) -> Shape: + def shape(self) -> Tuple[int, ...]: return self._ndarray.shape def __len__(self) -> int: @@ -119,42 +86,26 @@ class NDArrayBackedExtensionArray(ExtensionArray): def nbytes(self) -> int: return self._ndarray.nbytes - def reshape( - self: NDArrayBackedExtensionArrayT, *args, **kwargs - ) -> NDArrayBackedExtensionArrayT: + def reshape(self: _T, *args, **kwargs) -> _T: new_data = self._ndarray.reshape(*args, **kwargs) return self._from_backing_data(new_data) - def ravel( - self: NDArrayBackedExtensionArrayT, *args, **kwargs - ) -> NDArrayBackedExtensionArrayT: + def ravel(self: _T, *args, **kwargs) -> _T: new_data = self._ndarray.ravel(*args, **kwargs) return self._from_backing_data(new_data) @property - def T(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: + def T(self: _T) -> _T: new_data = self._ndarray.T return self._from_backing_data(new_data) # ------------------------------------------------------------------------ - def equals(self, other) -> bool: - if type(self) is not type(other): - return False - if not is_dtype_equal(self.dtype, other.dtype): - return False - return bool(array_equivalent(self._ndarray, other._ndarray)) - - def _values_for_argsort(self): - return self._ndarray - - def copy(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: + def copy(self: _T) -> _T: new_data = self._ndarray.copy() return self._from_backing_data(new_data) - def repeat( - self: NDArrayBackedExtensionArrayT, repeats, axis=None - ) -> NDArrayBackedExtensionArrayT: + def repeat(self: _T, repeats, axis=None) -> _T: """ Repeat elements of an array. 
@@ -162,181 +113,10 @@ class NDArrayBackedExtensionArray(ExtensionArray): -------- numpy.ndarray.repeat """ - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) new_data = self._ndarray.repeat(repeats, axis=axis) return self._from_backing_data(new_data) - def unique(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: + def unique(self: _T) -> _T: new_data = unique(self._ndarray) return self._from_backing_data(new_data) - - @classmethod - @doc(ExtensionArray._concat_same_type) - def _concat_same_type( - cls: Type[NDArrayBackedExtensionArrayT], - to_concat: Sequence[NDArrayBackedExtensionArrayT], - axis: int = 0, - ) -> NDArrayBackedExtensionArrayT: - dtypes = {str(x.dtype) for x in to_concat} - if len(dtypes) != 1: - raise ValueError("to_concat must have the same dtype (tz)", dtypes) - - new_values = [x._ndarray for x in to_concat] - new_values = np.concatenate(new_values, axis=axis) - return to_concat[0]._from_backing_data(new_values) - - @doc(ExtensionArray.searchsorted) - def searchsorted(self, value, side="left", sorter=None): - value = self._validate_searchsorted_value(value) - return self._ndarray.searchsorted(value, side=side, sorter=sorter) - - def _validate_searchsorted_value(self, value): - return value - - @doc(ExtensionArray.shift) - def shift(self, periods=1, fill_value=None, axis=0): - - fill_value = self._validate_shift_value(fill_value) - new_values = shift(self._ndarray, periods, axis, fill_value) - - return self._from_backing_data(new_values) - - def _validate_shift_value(self, fill_value): - # TODO: after deprecation in datetimelikearraymixin is enforced, - # we can remove this and ust validate_fill_value directly - return self._validate_fill_value(fill_value) - - def __setitem__(self, key, value): - key = check_array_indexer(self, key) - value = self._validate_setitem_value(value) - self._ndarray[key] = value - - def _validate_setitem_value(self, value): - return value - - def __getitem__( - self: NDArrayBackedExtensionArrayT, key: Union[int, slice, np.ndarray] - ) -> Union[NDArrayBackedExtensionArrayT, Any]: - if lib.is_integer(key): - # fast-path - result = self._ndarray[key] - if self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) - - key = extract_array(key, extract_numpy=True) - key = check_array_indexer(self, key) - result = self._ndarray[key] - if lib.is_scalar(result): - return self._box_func(result) - - result = self._from_backing_data(result) - return result - - @doc(ExtensionArray.fillna) - def fillna( - self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None - ) -> NDArrayBackedExtensionArrayT: - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - # TODO: share this with EA base class implementation - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. 
Got ({len(value)}) " - f" expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - func = missing.get_fill_func(method) - new_values = func(self._ndarray.copy(), limit=limit, mask=mask) - # TODO: PandasArray didnt used to copy, need tests for this - new_values = self._from_backing_data(new_values) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - - # ------------------------------------------------------------------------ - # Reductions - - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): - meth = getattr(self, name, None) - if meth: - return meth(skipna=skipna, **kwargs) - else: - msg = f"'{type(self).__name__}' does not implement reduction '{name}'" - raise TypeError(msg) - - def _wrap_reduction_result(self, axis: Optional[int], result): - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) - - # ------------------------------------------------------------------------ - - def __repr__(self) -> str: - if self.ndim == 1: - return super().__repr__() - - from pandas.io.formats.printing import format_object_summary - - # the short repr has no trailing newline, while the truncated - # repr does. So we include a newline in our template, and strip - # any trailing newlines from format_object_summary - lines = [ - format_object_summary(x, self._formatter(), indent_for_name=False).rstrip( - ", \n" - ) - for x in self - ] - data = ",\n".join(lines) - class_name = f"<{type(self).__name__}>" - return f"{class_name}\n[\n{data}\n]\nShape: {self.shape}, dtype: {self.dtype}" - - # ------------------------------------------------------------------------ - # __array_function__ methods - - def putmask(self, mask, value): - """ - Analogue to np.putmask(self, mask, value) - - Parameters - ---------- - mask : np.ndarray[bool] - value : scalar or listlike - - Raises - ------ - TypeError - If value cannot be cast to self.dtype. - """ - value = self._validate_setitem_value(value) - - np.putmask(self._ndarray, mask, value) - - def where(self, mask, value): - """ - Analogue to np.where(mask, self, value) - - Parameters - ---------- - mask : np.ndarray[bool] - value : scalar or listlike - - Raises - ------ - TypeError - If value cannot be cast to self.dtype. - """ - value = self._validate_setitem_value(value) - - res_values = np.where(mask, self._ndarray, value) - return self._from_backing_data(res_values) diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/base.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/base.py index 9547042..2553a65 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/base.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/base.py @@ -6,26 +6,13 @@ An interface for extending pandas with custom arrays. This is an experimental API and subject to breaking changes without warning. 
""" -from __future__ import annotations - import operator -from typing import ( - Any, - Callable, - Dict, - Optional, - Sequence, - Tuple, - Type, - TypeVar, - Union, - cast, -) +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike, Shape +from pandas._typing import ArrayLike from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -33,25 +20,17 @@ from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import ( - is_array_like, - is_dtype_equal, - is_list_like, - is_scalar, - pandas_dtype, -) +from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.algorithms import factorize_array, unique -from pandas.core.missing import get_fill_func +from pandas.core.algorithms import _factorize_array, unique +from pandas.core.missing import backfill_1d, pad_1d from pandas.core.sorting import nargminmax, nargsort -_extension_array_shared_docs: Dict[str, str] = {} - -ExtensionArrayT = TypeVar("ExtensionArrayT", bound="ExtensionArray") +_extension_array_shared_docs: Dict[str, str] = dict() class ExtensionArray: @@ -62,6 +41,8 @@ class ExtensionArray: with a custom type and will not attempt to coerce them to objects. They may be stored directly inside a :class:`DataFrame` or :class:`Series`. + .. versionadded:: 0.23.0 + Attributes ---------- dtype @@ -189,7 +170,7 @@ class ExtensionArray: # ------------------------------------------------------------------------ @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): """ Construct a new ExtensionArray from a sequence of scalars. @@ -211,7 +192,7 @@ class ExtensionArray: raise AbstractMethodError(cls) @classmethod - def _from_sequence_of_strings(cls, strings, *, dtype=None, copy=False): + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): """ Construct a new ExtensionArray from a sequence of strings. @@ -248,8 +229,8 @@ class ExtensionArray: See Also -------- - factorize : Top-level factorize method that dispatches here. - ExtensionArray.factorize : Encode the extension array as an enumerated type. + factorize + ExtensionArray.factorize """ raise AbstractMethodError(cls) @@ -257,9 +238,8 @@ class ExtensionArray: # Must be a Sequence # ------------------------------------------------------------------------ - def __getitem__( - self, item: Union[int, slice, np.ndarray] - ) -> Union[ExtensionArray, Any]: + def __getitem__(self, item): + # type (Any) -> Any """ Select a subset of self. @@ -355,23 +335,6 @@ class ExtensionArray: for i in range(len(self)): yield self[i] - def __contains__(self, item) -> bool: - """ - Return for `item in self`. - """ - # GH37867 - # comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA] - # would raise a TypeError. The implementation below works around that. 
- if is_scalar(item) and isna(item): - if not self._can_hold_na: - return False - elif item is self.dtype.na_value or isinstance(item, self.dtype.type): - return self.isna().any() - else: - return False - else: - return (item == self).any() - def __eq__(self, other: Any) -> ArrayLike: """ Return for `self == other` (element-wise equality). @@ -437,7 +400,7 @@ class ExtensionArray: raise AbstractMethodError(self) @property - def shape(self) -> Shape: + def shape(self) -> Tuple[int, ...]: """ Return a tuple of the array dimensions. """ @@ -489,19 +452,9 @@ class ExtensionArray: NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype - from pandas.core.arrays.string_arrow import ArrowStringDtype dtype = pandas_dtype(dtype) - if is_dtype_equal(dtype, self.dtype): - if not copy: - return self - else: - return self.copy() - - # FIXME: Really hard-code here? - if isinstance( - dtype, (ArrowStringDtype, StringDtype) - ): # allow conversion to StringArrays + if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) return np.array(self, dtype=dtype, copy=copy) @@ -540,18 +493,13 @@ class ExtensionArray: See Also -------- - ExtensionArray.argsort : Return the indices that would sort this array. + ExtensionArray.argsort """ # Note: this is used in `ExtensionArray.argsort`. return np.array(self) def argsort( - self, - ascending: bool = True, - kind: str = "quicksort", - na_position: str = "last", - *args, - **kwargs, + self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs ) -> np.ndarray: """ Return the indices that would sort this array. @@ -582,14 +530,8 @@ class ExtensionArray: # 2. argsort : total control over sorting. ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - values = self._values_for_argsort() - return nargsort( - values, - kind=kind, - ascending=ascending, - na_position=na_position, - mask=np.asarray(self.isna()), - ) + result = nargsort(self, kind=kind, ascending=ascending, na_position="last") + return result def argmin(self): """ @@ -666,7 +608,7 @@ class ExtensionArray: if mask.any(): if method is not None: - func = get_fill_func(method) + func = pad_1d if method == "pad" else backfill_1d new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: @@ -687,7 +629,7 @@ class ExtensionArray: """ return self[~self.isna()] - def shift(self, periods: int = 1, fill_value: object = None) -> ExtensionArray: + def shift(self, periods: int = 1, fill_value: object = None) -> "ExtensionArray": """ Shift values by desired number. @@ -800,7 +742,7 @@ class ExtensionArray: arr = self.astype(object) return arr.searchsorted(value, side=side, sorter=sorter) - def equals(self, other: object) -> bool: + def equals(self, other: "ExtensionArray") -> bool: """ Return if another array is equivalent to this array. @@ -818,12 +760,11 @@ class ExtensionArray: boolean Whether the arrays are equivalent. 
""" - if type(self) != type(other): + if not type(self) == type(other): return False - other = cast(ExtensionArray, other) - if not is_dtype_equal(self.dtype, other.dtype): + elif not self.dtype == other.dtype: return False - elif len(self) != len(other): + elif not len(self) == len(other): return False else: equal_values = self == other @@ -857,7 +798,7 @@ class ExtensionArray: """ return self.astype(object), np.nan - def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray"]: """ Encode the extension array as an enumerated type. @@ -898,7 +839,7 @@ class ExtensionArray: # Complete control over factorization. arr, na_value = self._values_for_factorize() - codes, uniques = factorize_array( + codes, uniques = _factorize_array( arr, na_sentinel=na_sentinel, na_value=na_value ) @@ -952,7 +893,7 @@ class ExtensionArray: @Substitution(klass="ExtensionArray") @Appender(_extension_array_shared_docs["repeat"]) def repeat(self, repeats, axis=None): - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) ind = np.arange(len(self)).repeat(repeats) return self.take(ind) @@ -961,12 +902,8 @@ class ExtensionArray: # ------------------------------------------------------------------------ def take( - self, - indices: Sequence[int], - *, - allow_fill: bool = False, - fill_value: Any = None, - ) -> ExtensionArray: + self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None + ) -> "ExtensionArray": """ Take elements from an array. @@ -1010,8 +947,8 @@ class ExtensionArray: See Also -------- - numpy.take : Take elements from an array along an axis. - api.extensions.take : Take elements from an array. + numpy.take + api.extensions.take Notes ----- @@ -1055,7 +992,7 @@ class ExtensionArray: # pandas.api.extensions.take raise AbstractMethodError(self) - def copy(self: ExtensionArrayT) -> ExtensionArrayT: + def copy(self) -> "ExtensionArray": """ Return a copy of the array. @@ -1135,20 +1072,7 @@ class ExtensionArray: # Reshaping # ------------------------------------------------------------------------ - def transpose(self, *axes) -> ExtensionArray: - """ - Return a transposed view on this array. - - Because ExtensionArrays are always 1D, this is a no-op. It is included - for compatibility with np.ndarray. - """ - return self[:] - - @property - def T(self) -> ExtensionArray: - return self.transpose() - - def ravel(self, order="C") -> ExtensionArray: + def ravel(self, order="C") -> "ExtensionArray": """ Return a flattened view on this array. @@ -1169,8 +1093,8 @@ class ExtensionArray: @classmethod def _concat_same_type( - cls: Type[ExtensionArrayT], to_concat: Sequence[ExtensionArrayT] - ) -> ExtensionArrayT: + cls, to_concat: Sequence["ExtensionArray"] + ) -> "ExtensionArray": """ Concatenate multiple array of this dtype. @@ -1196,7 +1120,7 @@ class ExtensionArray: # of objects _can_hold_na = True - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. @@ -1237,54 +1161,42 @@ class ExtensionOpsMixin: with NumPy arrays. 
""" - @classmethod - def _create_arithmetic_method(cls, op): - raise AbstractMethodError(cls) - @classmethod def _add_arithmetic_ops(cls): - setattr(cls, "__add__", cls._create_arithmetic_method(operator.add)) - setattr(cls, "__radd__", cls._create_arithmetic_method(ops.radd)) - setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub)) - setattr(cls, "__rsub__", cls._create_arithmetic_method(ops.rsub)) - setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul)) - setattr(cls, "__rmul__", cls._create_arithmetic_method(ops.rmul)) - setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow)) - setattr(cls, "__rpow__", cls._create_arithmetic_method(ops.rpow)) - setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) - setattr(cls, "__rmod__", cls._create_arithmetic_method(ops.rmod)) - setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) - setattr(cls, "__rfloordiv__", cls._create_arithmetic_method(ops.rfloordiv)) - setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) - setattr(cls, "__rtruediv__", cls._create_arithmetic_method(ops.rtruediv)) - setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod)) - setattr(cls, "__rdivmod__", cls._create_arithmetic_method(ops.rdivmod)) - - @classmethod - def _create_comparison_method(cls, op): - raise AbstractMethodError(cls) + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) + cls.__sub__ = cls._create_arithmetic_method(operator.sub) + cls.__rsub__ = cls._create_arithmetic_method(ops.rsub) + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) + cls.__pow__ = cls._create_arithmetic_method(operator.pow) + cls.__rpow__ = cls._create_arithmetic_method(ops.rpow) + cls.__mod__ = cls._create_arithmetic_method(operator.mod) + cls.__rmod__ = cls._create_arithmetic_method(ops.rmod) + cls.__floordiv__ = cls._create_arithmetic_method(operator.floordiv) + cls.__rfloordiv__ = cls._create_arithmetic_method(ops.rfloordiv) + cls.__truediv__ = cls._create_arithmetic_method(operator.truediv) + cls.__rtruediv__ = cls._create_arithmetic_method(ops.rtruediv) + cls.__divmod__ = cls._create_arithmetic_method(divmod) + cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod) @classmethod def _add_comparison_ops(cls): - setattr(cls, "__eq__", cls._create_comparison_method(operator.eq)) - setattr(cls, "__ne__", cls._create_comparison_method(operator.ne)) - setattr(cls, "__lt__", cls._create_comparison_method(operator.lt)) - setattr(cls, "__gt__", cls._create_comparison_method(operator.gt)) - setattr(cls, "__le__", cls._create_comparison_method(operator.le)) - setattr(cls, "__ge__", cls._create_comparison_method(operator.ge)) - - @classmethod - def _create_logical_method(cls, op): - raise AbstractMethodError(cls) + cls.__eq__ = cls._create_comparison_method(operator.eq) + cls.__ne__ = cls._create_comparison_method(operator.ne) + cls.__lt__ = cls._create_comparison_method(operator.lt) + cls.__gt__ = cls._create_comparison_method(operator.gt) + cls.__le__ = cls._create_comparison_method(operator.le) + cls.__ge__ = cls._create_comparison_method(operator.ge) @classmethod def _add_logical_ops(cls): - setattr(cls, "__and__", cls._create_logical_method(operator.and_)) - setattr(cls, "__rand__", cls._create_logical_method(ops.rand_)) - setattr(cls, "__or__", cls._create_logical_method(operator.or_)) - setattr(cls, "__ror__", 
cls._create_logical_method(ops.ror_)) - setattr(cls, "__xor__", cls._create_logical_method(operator.xor)) - setattr(cls, "__rxor__", cls._create_logical_method(ops.rxor)) + cls.__and__ = cls._create_logical_method(operator.and_) + cls.__rand__ = cls._create_logical_method(ops.rand_) + cls.__or__ = cls._create_logical_method(operator.or_) + cls.__ror__ = cls._create_logical_method(ops.ror_) + cls.__xor__ = cls._create_logical_method(operator.xor) + cls.__rxor__ = cls._create_logical_method(ops.rxor) class ExtensionScalarOpsMixin(ExtensionOpsMixin): @@ -1361,7 +1273,7 @@ class ExtensionScalarOpsMixin(ExtensionOpsMixin): ovalues = [param] * len(self) return ovalues - if isinstance(other, (ABCSeries, ABCIndexClass, ABCDataFrame)): + if isinstance(other, (ABCSeries, ABCIndexClass)): # rely on pandas to unbox and dispatch to us return NotImplemented diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/boolean.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/boolean.py index 44cc108..dbce71b 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/boolean.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/boolean.py @@ -6,6 +6,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.core.dtypes.common import ( @@ -19,6 +20,7 @@ from pandas.core.dtypes.common import ( pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -26,7 +28,7 @@ from pandas.core import ops from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: - import pyarrow + import pyarrow # noqa: F401 @register_extension_dtype @@ -57,9 +59,8 @@ class BooleanDtype(BaseMaskedDtype): name = "boolean" - # mypy: https://github.com/python/mypy/issues/4125 @property - def type(self) -> Type: # type: ignore[override] + def type(self) -> Type[np.bool_]: return np.bool_ @property @@ -98,7 +99,7 @@ class BooleanDtype(BaseMaskedDtype): """ Construct BooleanArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow + import pyarrow # noqa: F811 if isinstance(array, pyarrow.Array): chunks = [array] @@ -170,13 +171,12 @@ def coerce_to_array( values[~mask_values] = values_object[~mask_values].astype(bool) # if the values were integer-like, validate it were actually 0/1's - if (inferred_dtype in integer_like) and not ( - np.all( + if inferred_dtype in integer_like: + if not np.all( values[~mask_values].astype(float) == values_object[~mask_values].astype(float) - ) - ): - raise TypeError("Need to pass bool-like values") + ): + raise TypeError("Need to pass bool-like values") if mask is None and mask_values is None: mask = np.zeros(len(values), dtype=bool) @@ -194,9 +194,9 @@ def coerce_to_array( if mask_values is not None: mask = mask | mask_values - if values.ndim != 1: + if not values.ndim == 1: raise ValueError("values must be a 1D list-like") - if mask.ndim != 1: + if not mask.ndim == 1: raise ValueError("mask must be a 1D list-like") return values, mask @@ -273,9 +273,7 @@ class BooleanArray(BaseMaskedArray): return self._dtype @classmethod - def _from_sequence( - cls, scalars, *, dtype=None, copy: bool = False - ) -> "BooleanArray": + def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "BooleanArray": if dtype: assert dtype == "boolean" values, mask = coerce_to_array(scalars, copy=copy) @@ -283,7 +281,7 @@ class BooleanArray(BaseMaskedArray): @classmethod def _from_sequence_of_strings( - cls, strings: List[str], *, dtype=None, copy: bool = False + cls, strings: List[str], dtype=None, copy: bool = False ) -> "BooleanArray": def map_string(s): if isna(s): @@ -296,7 +294,7 @@ class BooleanArray(BaseMaskedArray): raise ValueError(f"{s} cannot be cast to bool") scalars = [map_string(x) for x in strings] - return cls._from_sequence(scalars, dtype=dtype, copy=copy) + return cls._from_sequence(scalars, dtype, copy) _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) @@ -378,10 +376,7 @@ class BooleanArray(BaseMaskedArray): if isinstance(dtype, BooleanDtype): values, mask = coerce_to_array(self, copy=copy) - if not copy: - return self - else: - return BooleanArray(values, mask, copy=False) + return BooleanArray(values, mask, copy=False) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -398,8 +393,9 @@ class BooleanArray(BaseMaskedArray): self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False ) # for integer, error if there are missing values - if is_integer_dtype(dtype) and self._hasna: - raise ValueError("cannot convert NA to integer") + if is_integer_dtype(dtype): + if self._hasna: + raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) na_value = self._na_value @@ -420,13 +416,13 @@ class BooleanArray(BaseMaskedArray): See Also -------- - ExtensionArray.argsort : Return the indices that would sort this array. + ExtensionArray.argsort """ data = self._data.copy() data[self._mask] = -1 return data - def any(self, *, skipna: bool = True, **kwargs): + def any(self, skipna: bool = True, **kwargs): """ Return whether any element is True. @@ -494,7 +490,7 @@ class BooleanArray(BaseMaskedArray): else: return self.dtype.na_value - def all(self, *, skipna: bool = True, **kwargs): + def all(self, skipna: bool = True, **kwargs): """ Return whether all elements are True. 
@@ -561,135 +557,108 @@ class BooleanArray(BaseMaskedArray): else: return self.dtype.na_value - def _logical_method(self, other, op): + @classmethod + def _create_logical_method(cls, op): + def logical_method(self, other): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. + return NotImplemented - assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} - other_is_booleanarray = isinstance(other, BooleanArray) - other_is_scalar = lib.is_scalar(other) - mask = None + assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} + other = lib.item_from_zerodim(other) + other_is_booleanarray = isinstance(other, BooleanArray) + other_is_scalar = lib.is_scalar(other) + mask = None - if other_is_booleanarray: - other, mask = other._data, other._mask - elif is_list_like(other): - other = np.asarray(other, dtype="bool") - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - other, mask = coerce_to_array(other, copy=False) - elif isinstance(other, np.bool_): - other = other.item() + if other_is_booleanarray: + other, mask = other._data, other._mask + elif is_list_like(other): + other = np.asarray(other, dtype="bool") + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + other, mask = coerce_to_array(other, copy=False) + elif isinstance(other, np.bool_): + other = other.item() - if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other): - raise TypeError( - "'other' should be pandas.NA or a bool. " - f"Got {type(other).__name__} instead." - ) + if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): + raise TypeError( + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." 
+ ) - if not other_is_scalar and len(self) != len(other): - raise ValueError("Lengths must match to compare") - - if op.__name__ in {"or_", "ror_"}: - result, mask = ops.kleene_or(self._data, other, self._mask, mask) - elif op.__name__ in {"and_", "rand_"}: - result, mask = ops.kleene_and(self._data, other, self._mask, mask) - elif op.__name__ in {"xor", "rxor"}: - result, mask = ops.kleene_xor(self._data, other, self._mask, mask) - - return BooleanArray(result, mask) - - def _cmp_method(self, other, op): - from pandas.arrays import FloatingArray, IntegerArray - - if isinstance(other, (IntegerArray, FloatingArray)): - return NotImplemented - - mask = None - - if isinstance(other, BooleanArray): - other, mask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): + if not other_is_scalar and len(self) != len(other): raise ValueError("Lengths must match to compare") - if other is libmissing.NA: - # numpy does not handle pd.NA well as "other" scalar (it returns - # a scalar False instead of an array) - result = np.zeros_like(self._data) - mask = np.ones_like(self._data) - else: - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - result = op(self._data, other) + if op.__name__ in {"or_", "ror_"}: + result, mask = ops.kleene_or(self._data, other, self._mask, mask) + elif op.__name__ in {"and_", "rand_"}: + result, mask = ops.kleene_and(self._data, other, self._mask, mask) + elif op.__name__ in {"xor", "rxor"}: + result, mask = ops.kleene_xor(self._data, other, self._mask, mask) - # nans propagate - if mask is None: - mask = self._mask.copy() - else: - mask = self._mask | mask + return BooleanArray(result, mask) - return BooleanArray(result, mask, copy=False) + name = f"__{op.__name__}__" + return set_function_name(logical_method, name, cls) - def _arith_method(self, other, op): - mask = None - op_name = op.__name__ + @classmethod + def _create_comparison_method(cls, op): + def cmp_method(self, other): + from pandas.arrays import IntegerArray - if isinstance(other, BooleanArray): - other, mask = other._data, other._mask + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): + # Rely on pandas to unbox and dispatch to us. 
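The logical-method hunks above route through ops.kleene_or / kleene_and / kleene_xor, i.e. three-valued (Kleene) logic in which a missing operand only matters when it could still change the answer. The user-visible behaviour with the public boolean dtype:

```python
import pandas as pd

a = pd.array([True, False, None], dtype="boolean")

# True | NA is True (the OR is already decided); False | NA stays NA.
print(a | pd.NA)   # [True, <NA>, <NA>]

# False & NA is False (the AND is already decided); True & NA stays NA.
print(a & pd.NA)   # [<NA>, False, <NA>]
```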
+ return NotImplemented - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): - raise ValueError("Lengths must match") + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") - # nans propagate - if mask is None: - mask = self._mask if other is libmissing.NA: - mask |= True - else: - mask = self._mask | mask - - if other is libmissing.NA: - # if other is NA, the result will be all NA and we can't run the - # actual op, so we need to choose the resulting dtype manually - if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}: - dtype = "int8" + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + result = np.zeros_like(self._data) + mask = np.ones_like(self._data) else: - dtype = "bool" - result = np.zeros(len(self._data), dtype=dtype) - else: - if op_name in {"pow", "rpow"} and isinstance(other, np.bool_): - # Avoid DeprecationWarning: In future, it will be an error - # for 'np.bool_' scalars to be interpreted as an index - other = bool(other) + # numpy will show a DeprecationWarning on invalid elementwise + # comparisons, this will raise in the future + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + result = op(self._data, other) - with np.errstate(all="ignore"): - result = op(self._data, other) + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask - # divmod returns a tuple - if op_name == "divmod": - div, mod = result - return ( - self._maybe_mask_result(div, mask, other, "floordiv"), - self._maybe_mask_result(mod, mask, other, "mod"), - ) + return BooleanArray(result, mask, copy=False) - return self._maybe_mask_result(result, mask, other, op_name) + name = f"__{op.__name__}" + return set_function_name(cmp_method, name, cls) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in {"any", "all"}: return getattr(self, name)(skipna=skipna, **kwargs) - return super()._reduce(name, skipna=skipna, **kwargs) + return super()._reduce(name, skipna, **kwargs) def _maybe_mask_result(self, result, mask, other, op_name: str): """ @@ -706,11 +675,10 @@ class BooleanArray(BaseMaskedArray): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - from pandas.core.arrays import FloatingArray + result[mask] = np.nan + return result - return FloatingArray(result, mask, copy=False) - - elif is_bool_dtype(result): + if is_bool_dtype(result): return BooleanArray(result, mask, copy=False) elif is_integer_dtype(result): @@ -720,3 +688,66 @@ class BooleanArray(BaseMaskedArray): else: result[mask] = np.nan return result + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + def boolean_arithmetic_method(self, other): + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + # Rely on pandas to unbox and dispatch to us. 
+ return NotImplemented + + other = lib.item_from_zerodim(other) + mask = None + + if isinstance(other, BooleanArray): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + + # nans propagate + if mask is None: + mask = self._mask + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | mask + + if other is libmissing.NA: + # if other is NA, the result will be all NA and we can't run the + # actual op, so we need to choose the resulting dtype manually + if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}: + dtype = "int8" + else: + dtype = "bool" + result = np.zeros(len(self._data), dtype=dtype) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op_name}__" + return set_function_name(boolean_arithmetic_method, name, cls) + + +BooleanArray._add_logical_ops() +BooleanArray._add_comparison_ops() +BooleanArray._add_arithmetic_ops() diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/categorical.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/categorical.py index 3995e7b..6e5c7bc 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/categorical.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/categorical.py @@ -2,7 +2,7 @@ from csv import QUOTE_NONNUMERIC from functools import partial import operator from shutil import get_terminal_size -from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast +from typing import Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np @@ -10,10 +10,9 @@ import numpy as np from pandas._config import get_option from pandas._libs import NaT, algos as libalgos, hashtable as htable -from pandas._libs.lib import no_default from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv -from pandas.util._decorators import cache_readonly, deprecate_kwarg +from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.dtypes.cast import ( @@ -29,7 +28,6 @@ from pandas.core.dtypes.common import ( is_dict_like, is_dtype_equal, is_extension_array_dtype, - is_hashable, is_integer_dtype, is_list_like, is_object_dtype, @@ -39,36 +37,38 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms -from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject +from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d +from 
pandas.core.array_algos.transforms import shift +from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray +from pandas.core.base import ( + ExtensionArray, + NoNewAttributesMixin, + PandasObject, + _shared_docs, +) import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array -from pandas.core.indexers import deprecate_ndim_indexing +from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort -from pandas.core.strings.object_array import ObjectStringArrayMixin from pandas.io.formats import console -CategoricalT = TypeVar("CategoricalT", bound="Categorical") - def _cat_compare_op(op): opname = f"__{op.__name__}__" - fill_value = True if op is operator.ne else False @unpack_zerodim_and_defer(opname) def func(self, other): - hashable = is_hashable(other) - if is_list_like(other) and len(other) != len(self) and not hashable: - # in hashable case we may have a tuple that is itself a category + if is_list_like(other) and len(other) != len(self): + # TODO: Could this fail if the categories are listlike objects? raise ValueError("Lengths must match.") if not self.ordered: @@ -77,41 +77,58 @@ def _cat_compare_op(op): "Unordered Categoricals can only compare equality or not" ) if isinstance(other, Categorical): - # Two Categoricals can only be compared if the categories are + # Two Categoricals can only be be compared if the categories are # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." - if not self._categories_match_up_to_permutation(other): + if len(self.categories) != len(other.categories): + raise TypeError(msg + " Categories are different lengths") + elif self.ordered and not (self.categories == other.categories).all(): + raise TypeError(msg) + elif not set(self.categories) == set(other.categories): raise TypeError(msg) + if not (self.ordered == other.ordered): + raise TypeError( + "Categoricals can only be compared if 'ordered' is the same" + ) if not self.ordered and not self.categories.equals(other.categories): # both unordered and different order - other_codes = recode_for_categories( - other.codes, other.categories, self.categories, copy=False - ) + other_codes = _get_codes_for_values(other, self.categories) else: other_codes = other._codes - ret = op(self._codes, other_codes) + f = getattr(self._codes, opname) + ret = f(other_codes) mask = (self._codes == -1) | (other_codes == -1) if mask.any(): - ret[mask] = fill_value + # In other series, the leads to False, so do that here too + if opname == "__ne__": + ret[(self._codes == -1) & (other_codes == -1)] = True + else: + ret[mask] = False return ret - if hashable: + if is_scalar(other): if other in self.categories: - i = self._unbox_scalar(other) - ret = op(self._codes, i) + i = self.categories.get_loc(other) + ret = getattr(self._codes, opname)(i) if opname not in {"__eq__", "__ge__", "__gt__"}: - # GH#29820 performance trick; get_loc will always give i>=0, - # so in the cases (__ne__, __le__, __lt__) the setting - # here is a no-op, so can be skipped. 
+ # check for NaN needed if we are not equal or larger mask = self._codes == -1 - ret[mask] = fill_value + ret[mask] = False return ret else: - return ops.invalid_comparison(self, other, op) + if opname == "__eq__": + return np.zeros(len(self), dtype=bool) + elif opname == "__ne__": + return np.ones(len(self), dtype=bool) + else: + raise TypeError( + f"Cannot compare a Categorical for op {opname} with a " + "scalar, which is not a category." + ) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons @@ -184,7 +201,7 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): +class Categorical(NDArrayBackedExtensionArray, PandasObject): """ Represent a categorical variable in classic R / S-plus fashion. @@ -263,19 +280,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] - Missing values are not included as a category. - - >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) - >>> c - [1, 2, 3, 1, 2, 3, NaN] - Categories (3, int64): [1, 2, 3] - - However, their presence is indicated in the `codes` attribute - by code `-1`. - - >>> c.codes - array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) - Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. @@ -293,9 +297,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) + _deprecations = PandasObject._deprecations | frozenset(["tolist"]) _typ = "categorical" - _can_hold_na = True def __init__( self, values, categories=None, ordered=None, dtype=None, fastpath=False @@ -325,7 +328,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # sanitize_array coerces np.nan to a string under certain versions # of numpy values = maybe_infer_to_datetimelike(values, convert_dates=True) - if not isinstance(values, (np.ndarray, ExtensionArray)): + if not isinstance(values, np.ndarray): values = com.convert_to_list_like(values) # By convention, empty lists result in object dtype: @@ -359,7 +362,9 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi dtype = CategoricalDtype(categories, dtype.ordered) elif is_categorical_dtype(values.dtype): - old_codes = extract_array(values).codes + old_codes = ( + values._values.codes if isinstance(values, ABCSeries) else values.codes + ) codes = recode_for_categories( old_codes, values.dtype.categories, dtype.categories ) @@ -376,6 +381,56 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) + @property + def categories(self): + """ + The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! 
+ + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. + add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" + ) + self._dtype = new_dtype + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + """ + return self.dtype.ordered + @property def dtype(self) -> CategoricalDtype: """ @@ -388,9 +443,13 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return Categorical @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) + def _formatter(self, boxed=False): + # Defer to CategoricalFormatter's formatter. + return None + def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype @@ -403,42 +462,20 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi If copy is set to False and dtype is categorical, the original object is returned. """ - if self.dtype is dtype: - result = self.copy() if copy else self - - elif is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): dtype = cast(Union[str, CategoricalDtype], dtype) - # GH 10696/18593/18630 + # GH 10696/18593 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - result = self._set_dtype(dtype) - - # TODO: consolidate with ndarray case? 
- elif is_extension_array_dtype(dtype): - result = array(self, dtype=dtype, copy=copy) - - elif is_integer_dtype(dtype) and self.isna().any(): + if dtype == self.dtype: + return self + return self._set_dtype(dtype) + if is_extension_array_dtype(dtype): + return array(self, dtype=dtype, copy=copy) + if is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - - elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array(self, dtype=dtype, copy=copy) - - else: - # GH8628 (PERF): astype category codes instead of astyping array - try: - astyped_cats = self.categories.astype(dtype=dtype, copy=copy) - except ( - TypeError, # downstream error msg for CategoricalIndex is misleading - ValueError, - ): - msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" - raise ValueError(msg) - - astyped_cats = extract_array(astyped_cats, extract_numpy=True) - result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) - - return result + return np.array(self, dtype=dtype, copy=copy) @cache_readonly def itemsize(self) -> int: @@ -591,59 +628,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return cls(codes, dtype=dtype, fastpath=True) - # ------------------------------------------------------------------ - # Categories/Codes/Ordered - - @property - def categories(self): - """ - The categories of this categorical. - - Setting assigns new values to each category (effectively a rename of - each individual category). - - The assigned value has to be a list-like object. All items must be - unique and the number of items in the new categories must be the same - as the number of items in the old categories. - - Assigning to `categories` is a inplace operation! - - Raises - ------ - ValueError - If the new categories do not validate as categories or if the - number of new categories is unequal the number of old categories - - See Also - -------- - rename_categories : Rename categories. - reorder_categories : Reorder categories. - add_categories : Add new categories. - remove_categories : Remove the specified categories. - remove_unused_categories : Remove categories which are not used. - set_categories : Set the categories to the specified ones. - """ - return self.dtype.categories - - @categories.setter - def categories(self, categories): - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if self.dtype.categories is not None and len(self.dtype.categories) != len( - new_dtype.categories - ): - raise ValueError( - "new categories need to have the same number of " - "items as the old categories!" - ) - self._dtype = new_dtype - - @property - def ordered(self) -> Ordered: - """ - Whether the categories have an ordered relationship. - """ - return self.dtype.ordered - @property def codes(self) -> np.ndarray: """ @@ -748,8 +732,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Returns ------- - Categorical or None - Ordered Categorical or None if ``inplace=True``. + Categorical + Ordered Categorical. """ inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(True, inplace=inplace) @@ -766,8 +750,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Returns ------- - Categorical or None - Unordered Categorical or None if ``inplace=True``. + Categorical + Unordered Categorical. 
""" inplace = validate_bool_kwarg(inplace, "inplace") return self.set_ordered(False, inplace=inplace) @@ -866,6 +850,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi * callable : a callable that is called on all items in the old categories and whose return values comprise the new categories. + .. versionadded:: 0.23.0. + inplace : bool, default False Whether or not to rename the categories inplace or return a copy of this categorical with renamed categories. @@ -873,7 +859,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Returns ------- cat : Categorical or None - Categorical with removed categories or None if ``inplace=True``. + With ``inplace=False``, the new categorical is returned. + With ``inplace=True``, there is no return value. Raises ------ @@ -941,8 +928,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Returns ------- - cat : Categorical or None - Categorical with removed categories or None if ``inplace=True``. + cat : Categorical with reordered categories or None if inplace. Raises ------ @@ -982,8 +968,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Returns ------- - cat : Categorical or None - Categorical with new categories added or None if ``inplace=True``. + cat : Categorical with new categories added or None if inplace. Raises ------ @@ -1033,8 +1018,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Returns ------- - cat : Categorical or None - Categorical with removed categories or None if ``inplace=True``. + cat : Categorical with removed categories or None if inplace. Raises ------ @@ -1069,7 +1053,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi new_categories, ordered=self.ordered, rename=False, inplace=inplace ) - def remove_unused_categories(self, inplace=no_default): + def remove_unused_categories(self, inplace=False): """ Remove categories which are not used. @@ -1079,12 +1063,9 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Whether or not to drop unused categories inplace or return a copy of this categorical with unused categories dropped. - .. deprecated:: 1.2.0 - Returns ------- - cat : Categorical or None - Categorical with unused categories dropped or None if ``inplace=True``. + cat : Categorical with unused categories dropped or None if inplace. See Also -------- @@ -1094,17 +1075,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi remove_categories : Remove the specified categories. set_categories : Set the categories to the specified ones. """ - if inplace is not no_default: - warn( - "The `inplace` parameter in pandas.Categorical." - "remove_unused_categories is deprecated and " - "will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - else: - inplace = False - inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() idx, inv = np.unique(cat._codes, return_inverse=True) @@ -1122,8 +1092,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi if not inplace: return cat - # ------------------------------------------------------------------ - def map(self, mapper): """ Map categories using input correspondence (dict, Series, or function). 
@@ -1212,23 +1180,39 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) - # ------------------------------------------------------------- - # Validators; ideally these can be de-duplicated + def shift(self, periods, fill_value=None): + """ + Shift Categorical by desired number of periods. - def _validate_searchsorted_value(self, value): - # searchsorted is very performance sensitive. By converting codes - # to same dtype as self.codes, we get much faster performance. - if is_scalar(value): - codes = self._unbox_scalar(value) - else: - locs = [self.categories.get_loc(x) for x in value] - codes = np.array(locs, dtype=self.codes.dtype) - return codes + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + fill_value : object, optional + The scalar value to use for newly introduced missing values. + + .. versionadded:: 0.24.0 + + Returns + ------- + shifted : Categorical + """ + # since categoricals always have ndim == 1, an axis parameter + # doesn't make any sense here. + codes = self.codes + if codes.ndim > 1: + raise NotImplementedError("Categorical with ndim > 1.") + + fill_value = self._validate_fill_value(fill_value) + + codes = shift(codes, periods, axis=0, fill_value=fill_value) + + return self._constructor(codes, dtype=self.dtype, fastpath=True) def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our - underlying ndarray, raising TypeError if this is not possible. + underlying ndarray, raising ValueError if this is not possible. Parameters ---------- @@ -1240,24 +1224,20 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Raises ------ - TypeError + ValueError """ - if is_valid_nat_for_dtype(fill_value, self.categories.dtype): + if isna(fill_value): fill_value = -1 elif fill_value in self.categories: - fill_value = self._unbox_scalar(fill_value) + fill_value = self.categories.get_loc(fill_value) else: - raise TypeError( + raise ValueError( f"'fill_value={fill_value}' is not present " "in this Categorical's categories" ) return fill_value - _validate_scalar = _validate_fill_value - - # ------------------------------------------------------------- - def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1269,13 +1249,15 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi if dtype==None (default), the same dtype as categorical.categories.dtype. """ - ret = take_1d(self.categories._values, self._codes) + ret = take_1d(self.categories.values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) - # When we're a Categorical[ExtensionArray], like Interval, - # we need to ensure __array__ gets all the way to an - # ndarray. - return np.asarray(ret) + if is_extension_array_dtype(ret): + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ get's all the way to an + # ndarray. 
+ ret = np.asarray(ret) + return ret def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # for binary ops, use our custom dunder methods @@ -1304,10 +1286,10 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi setattr(self, k, v) @property - def nbytes(self) -> int: + def nbytes(self): return self._codes.nbytes + self.dtype.categories.values.nbytes - def memory_usage(self, deep: bool = False) -> int: + def memory_usage(self, deep=False): """ Memory usage of my values @@ -1332,6 +1314,18 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi """ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) + @doc(_shared_docs["searchsorted"], klass="Categorical") + def searchsorted(self, value, side="left", sorter=None): + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + if is_scalar(value): + codes = self.categories.get_loc(value) + codes = self.codes.dtype.type(codes) + else: + locs = [self.categories.get_loc(x) for x in value] + codes = np.array(locs, dtype=self.codes.dtype) + return self.codes.searchsorted(codes, side=side, sorter=sorter) + def isna(self): """ Detect missing values @@ -1349,7 +1343,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi Categorical.notna : Boolean inverse of Categorical.isna. """ - return self._codes == -1 + ret = self._codes == -1 + return ret isnull = isna @@ -1375,6 +1370,20 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi notnull = notna + def dropna(self): + """ + Return the Categorical without null values. + + Missing values (-1 in .codes) are detected. + + Returns + ------- + valid : Categorical + """ + result = self[self.notna()] + + return result + def value_counts(self, dropna=True): """ Return a Series containing counts of each category. @@ -1397,7 +1406,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories - ncat, mask = (len(cat), code >= 0) + ncat, mask = len(cat), 0 <= code ix, clean = np.arange(ncat), mask.all() if dropna or clean: @@ -1407,7 +1416,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi count = np.bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._from_backing_data(ix) + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) return Series(count, index=CategoricalIndex(ix), dtype="int64") @@ -1439,6 +1448,9 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi "Categorical to an ordered one\n" ) + def _values_for_argsort(self): + return self._codes + def argsort(self, ascending=True, kind="quicksort", **kwargs): """ Return the indices that would sort the Categorical. 
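(Note, not part of the patch: the hunks above reinstate the code-based implementations of `isna`, `dropna`, `value_counts`, and `searchsorted`, where missing values are represented by code -1. A short sketch of that behavior, assuming the pandas version targeted by this diff:)

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(["a", "b", np.nan, "a"])
    print(cat.codes)           # [ 0  1 -1  0] -- NaN is stored as code -1
    print(cat.isna())          # [False False  True False]
    print(cat.dropna())        # ['a', 'b', 'a'], same categories
    print(cat.value_counts())  # per-category counts, NaN dropped by default
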
@@ -1493,7 +1505,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return super().argsort(ascending=ascending, kind=kind, **kwargs) def sort_values( - self, inplace: bool = False, ascending: bool = True, na_position: str = "last" + self, inplace: bool = False, ascending: bool = True, na_position: str = "last", ): """ Sort the Categorical by category value returning a new @@ -1575,10 +1587,11 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: - self._codes[:] = self._codes[sorted_idx] + self._codes = self._codes[sorted_idx] else: - codes = self._codes[sorted_idx] - return self._from_backing_data(codes) + return self._constructor( + values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True + ) def _values_for_rank(self): """ @@ -1613,7 +1626,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi def view(self, dtype=None): if dtype is not None: raise NotImplementedError(dtype) - return self._from_backing_data(self._ndarray) + return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True) def to_dense(self): """ @@ -1665,7 +1678,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi value, method = validate_fillna_kwargs( value, method, validate_scalar_dict_value=False ) - value = extract_array(value, extract_numpy=True) if value is None: value = np.nan @@ -1674,31 +1686,130 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi "specifying a limit for fillna has not been implemented yet" ) + codes = self._codes + + # pad / bfill if method is not None: - # pad / bfill # TODO: dispatch when self.categories is EA-dtype values = np.asarray(self).reshape(-1, len(self)) - values = interpolate_2d(values, method, 0, None).astype( + values = interpolate_2d(values, method, 0, None, value).astype( self.categories.dtype )[0] codes = _get_codes_for_values(values, self.categories) else: - # We copy even if there is nothing to fill - codes = self._ndarray.copy() - mask = self.isna() - new_codes = self._validate_setitem_value(value) - - if isinstance(value, (np.ndarray, Categorical)): + # If value is a dict or a Series (a dict value has already + # been converted to a Series) + if isinstance(value, (np.ndarray, Categorical, ABCSeries)): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here - codes[mask] = new_codes[mask] - else: - codes[mask] = new_codes - return self._from_backing_data(codes) + mask = ~algorithms.isin(value, self.categories) + if not isna(value[mask]).all(): + raise ValueError("fill value must be in categories") + + values_codes = _get_codes_for_values(value, self.categories) + indexer = np.where(codes == -1) + codes = codes.copy() + codes[indexer] = values_codes[indexer] + + # If value is not a dict or Series it should be a scalar + elif is_hashable(value): + if not isna(value) and value not in self.categories: + raise ValueError("fill value must be in categories") + + mask = codes == -1 + if mask.any(): + codes = codes.copy() + if isna(value): + codes[mask] = -1 + else: + codes[mask] = self.categories.get_loc(value) + + else: + raise TypeError( + f"'value' parameter must be a scalar, dict " + f"or Series, but you passed a {type(value).__name__}" + ) + + return self._constructor(codes, dtype=self.dtype, fastpath=True) + + def take(self: _T, indexer, 
allow_fill: bool = False, fill_value=None) -> _T: + """ + Take elements from the Categorical. + + Parameters + ---------- + indexer : sequence of int + The indices in `self` to take. The meaning of negative values in + `indexer` depends on the value of `allow_fill`. + allow_fill : bool, default False + How to handle negative values in `indexer`. + + * False: negative values in `indices` indicate positional indices + from the right. This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate missing values + (the default). These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + .. versionchanged:: 1.0.0 + + Default value changed from ``True`` to ``False``. + + fill_value : object + The value to use for `indices` that are missing (-1), when + ``allow_fill=True``. This should be the category, i.e. a value + in ``self.categories``, not a code. + + Returns + ------- + Categorical + This Categorical will have the same categories and ordered as + `self`. + + See Also + -------- + Series.take : Similar method for Series. + numpy.ndarray.take : Similar method for NumPy arrays. + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'b']) + >>> cat + ['a', 'a', 'b'] + Categories (2, object): ['a', 'b'] + + Specify ``allow_fill==False`` to have negative indices mean indexing + from the right. + + >>> cat.take([0, -1, -2], allow_fill=False) + ['a', 'b', 'a'] + Categories (2, object): ['a', 'b'] + + With ``allow_fill=True``, indices equal to ``-1`` mean "missing" + values that should be filled with the `fill_value`, which is + ``np.nan`` by default. + + >>> cat.take([0, -1, -1], allow_fill=True) + ['a', NaN, NaN] + Categories (2, object): ['a', 'b'] + + The fill value can be specified. + + >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') + ['a', 'a', 'a'] + Categories (2, object): ['a', 'b'] + + Specifying a fill value that's not in ``self.categories`` + will raise a ``ValueError``. + """ + return NDArrayBackedExtensionArray.take( + self, indexer, allow_fill=allow_fill, fill_value=fill_value + ) # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @@ -1710,18 +1821,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi def _from_backing_data(self, arr: np.ndarray) -> "Categorical": return self._constructor(arr, dtype=self.dtype, fastpath=True) - def _box_func(self, i: int): - if i == -1: - return np.NaN - return self.categories[i] - - def _unbox_scalar(self, key) -> int: - # searchsorted is very performance sensitive. By converting codes - # to same dtype as self.codes, we get much faster performance. - code = self.categories.get_loc(key) - code = self._codes.dtype.type(code) - return code - # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1749,13 +1848,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return contains(self, key, container=self._codes) - # ------------------------------------------------------------------ - # Rendering Methods - - def _formatter(self, boxed=False): - # Defer to CategoricalFormatter's formatter. 
- return None - def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default @@ -1854,34 +1946,59 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return result - # ------------------------------------------------------------------ + def _maybe_coerce_indexer(self, indexer): + """ + return an indexer coerced to the codes dtype + """ + if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": + indexer = indexer.astype(self._codes.dtype) + return indexer def __getitem__(self, key): """ Return an item. """ - result = super().__getitem__(key) - if getattr(result, "ndim", 0) > 1: - result = result._ndarray - deprecate_ndim_indexing(result) - return result + if isinstance(key, (int, np.integer)): + i = self._codes[key] + if i == -1: + return np.nan + else: + return self.categories[i] - def _validate_setitem_value(self, value): + key = check_array_indexer(self, key) + + result = self._codes[key] + if result.ndim > 1: + deprecate_ndim_indexing(result) + return result + return self._constructor(result, dtype=self.dtype, fastpath=True) + + def __setitem__(self, key, value): + """ + Item assignment. + + Raises + ------ + ValueError + If (one or more) Value is not in categories or if a assigned + `Categorical` does not have the same categories + """ value = extract_array(value, extract_numpy=True) # require identical categories set if isinstance(value, Categorical): - if not is_dtype_equal(self.dtype, value.dtype): + if not is_dtype_equal(self, value): raise ValueError( "Cannot set a Categorical with another, " "without identical categories" ) - # is_dtype_equal implies categories_match_up_to_permutation - value = self._encode_with_my_categories(value) - return value._codes + if not self.categories.equals(value.categories): + new_codes = recode_for_categories( + value.codes, value.categories, self.categories + ) + value = Categorical.from_codes(new_codes, dtype=self.dtype) - # wrap scalars and hashable-listlikes in list - rvalue = value if not is_hashable(value) else [value] + rvalue = value if is_list_like(value) else [value] from pandas import Index @@ -1895,8 +2012,34 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi "category, set the categories first" ) - codes = self.categories.get_indexer(rvalue) - return codes.astype(self._ndarray.dtype, copy=False) + # set by position + if isinstance(key, (int, np.integer)): + pass + + # tuple of indexers (dataframe) + elif isinstance(key, tuple): + # only allow 1 dimensional slicing, but can + # in a 2-d case be passed (slice(None),....) 
+ if len(key) == 2: + if not com.is_null_slice(key[0]): + raise AssertionError("invalid slicing for a 1-ndim categorical") + key = key[1] + elif len(key) == 1: + key = key[0] + else: + raise AssertionError("invalid slicing for a 1-ndim categorical") + + # slicing in Series or Categorical + elif isinstance(key, slice): + pass + + # else: array of True/False in Series or Categorical + + lindexer = self.categories.get_indexer(rvalue) + lindexer = self._maybe_coerce_indexer(lindexer) + + key = check_array_indexer(self, key) + self._codes[key] = lindexer def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ @@ -1929,13 +2072,18 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi ) counts = counts.cumsum() _result = (r[start:end] for start, end in zip(counts, counts[1:])) - return dict(zip(categories, _result)) + result = dict(zip(categories, _result)) + return result - # ------------------------------------------------------------------ - # Reductions + # reduction ops # + def _reduce(self, name: str, skipna: bool = True, **kwargs): + func = getattr(self, name, None) + if func is None: + raise TypeError(f"Categorical cannot perform the operation {name}") + return func(skipna=skipna, **kwargs) @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") - def min(self, *, skipna=True, **kwargs): + def min(self, skipna=True, **kwargs): """ The minimum value of the object. @@ -1954,7 +2102,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi ------- min : the minimum of this `Categorical` """ - nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_min((), kwargs) self.check_for_ordered("min") @@ -1969,10 +2116,10 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return np.nan else: pointer = self._codes.min() - return self._wrap_reduction_result(None, pointer) + return self.categories[pointer] @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") - def max(self, *, skipna=True, **kwargs): + def max(self, skipna=True, **kwargs): """ The maximum value of the object. @@ -1991,7 +2138,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi ------- max : the maximum of this `Categorical` """ - nv.validate_minmax_axis(kwargs.get("axis", 0)) nv.validate_max((), kwargs) self.check_for_ordered("max") @@ -2006,7 +2152,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return np.nan else: pointer = self._codes.max() - return self._wrap_reduction_result(None, pointer) + return self.categories[pointer] def mode(self, dropna=True): """ @@ -2030,10 +2176,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi good = self._codes != -1 codes = self._codes[good] codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) - return self._from_backing_data(codes) - - # ------------------------------------------------------------------ - # ExtensionArray Interface + return self._constructor(values=codes, dtype=self.dtype, fastpath=True) def unique(self): """ @@ -2053,7 +2196,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi -------- pandas.unique CategoricalIndex.unique - Series.unique : Return unique values of Series object. 
+ Series.unique Examples -------- @@ -2090,7 +2233,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): - return self._ndarray, -1 + codes = self.codes.astype("int64") + return codes, -1 @classmethod def _from_factorized(cls, uniques, original): @@ -2098,7 +2242,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi original.categories.take(uniques), dtype=original.dtype ) - def equals(self, other: object) -> bool: + def equals(self, other): """ Returns True if categorical arrays are equal. @@ -2110,41 +2254,18 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi ------- bool """ - if not isinstance(other, Categorical): - return False - elif self._categories_match_up_to_permutation(other): - other = self._encode_with_my_categories(other) - return np.array_equal(self._codes, other._codes) + if self.is_dtype_equal(other): + if self.categories.equals(other.categories): + # fastpath to avoid re-coding + other_codes = other._codes + else: + other_codes = recode_for_categories( + other.codes, other.categories, self.categories + ) + return np.array_equal(self._codes, other_codes) return False - @classmethod - def _concat_same_type( - cls: Type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 - ) -> CategoricalT: - from pandas.core.dtypes.concat import union_categoricals - - return union_categoricals(to_concat) - - # ------------------------------------------------------------------ - - def _encode_with_my_categories(self, other: "Categorical") -> "Categorical": - """ - Re-encode another categorical using this Categorical's categories. - - Notes - ----- - This assumes we have already checked - self._categories_match_up_to_permutation(other). - """ - # Indexing on codes is more efficient if categories are the same, - # so we can apply some optimizations based on the degree of - # dtype-matching. - codes = recode_for_categories( - other.codes, other.categories, self.categories, copy=False - ) - return self._from_backing_data(codes) - - def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: + def is_dtype_equal(self, other): """ Returns True if categoricals are the same dtype same categories, and same ordered @@ -2157,17 +2278,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi ------- bool """ - return hash(self.dtype) == hash(other.dtype) - - def is_dtype_equal(self, other) -> bool: - warn( - "Categorical.is_dtype_equal is deprecated and will be removed " - "in a future version", - FutureWarning, - stacklevel=2, - ) try: - return self._categories_match_up_to_permutation(other) + return hash(self.dtype) == hash(other.dtype) except (AttributeError, TypeError): return False @@ -2191,7 +2303,18 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi return result - def isin(self, values) -> np.ndarray: + # Implement the ExtensionArray interface + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.core.dtypes.concat import union_categoricals + + return union_categoricals(to_concat) + + def isin(self, values): """ Check whether `values` are contained in Categorical. 
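(Note, not part of the patch: the hunks above restore the reduction paths. `min`/`max` go through `check_for_ordered`, so they require an ordered categorical, while `mode` and `unique` do not. A minimal sketch, assuming the pandas version targeted by this diff:)

    import pandas as pd

    cat = pd.Categorical(["b", "a", "c", "a"],
                         categories=["a", "b", "c"], ordered=True)
    print(cat.min(), cat.max())   # 'a' 'c' -- allowed because ordered=True

    try:
        cat.as_unordered().min()
    except TypeError:
        # check_for_ordered raises TypeError for unordered categoricals
        print("min() requires an ordered Categorical")
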
@@ -2303,25 +2426,6 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi if not inplace: return cat - # ------------------------------------------------------------------------ - # String methods interface - def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): - # Optimization to apply the callable `f` to the categories once - # and rebuild the result by `take`ing from the result with the codes. - # Returns the same type as the object-dtype implementation though. - from pandas.core.arrays import PandasArray - - categories = self.categories - codes = self.codes - result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) - return take_1d(result, codes, fill_value=na_value) - - def _str_get_dummies(self, sep="|"): - # sep may not be in categories. Just bail on this. - from pandas.core.arrays import PandasArray - - return PandasArray(self.astype(str))._str_get_dummies(sep) - # The Series.cat accessor @@ -2492,11 +2596,9 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): # utility routines -def _get_codes_for_values(values, categories) -> np.ndarray: +def _get_codes_for_values(values, categories): """ utility routine to turn values into codes given the specified categories - - If `values` is known to be a Categorical, use recode_for_categories instead. """ dtype_equal = is_dtype_equal(values.dtype, categories.dtype) @@ -2519,16 +2621,14 @@ def _get_codes_for_values(values, categories) -> np.ndarray: # Only hit here when we've already coerced to object dtypee. - hash_klass, vals = get_data_algo(values) - _, cats = get_data_algo(categories) + hash_klass, vals = _get_data_algo(values) + _, cats = _get_data_algo(categories) t = hash_klass(len(cats)) t.map_locations(cats) return coerce_indexer_dtype(t.lookup(vals), cats) -def recode_for_categories( - codes: np.ndarray, old_categories, new_categories, copy: bool = True -) -> np.ndarray: +def recode_for_categories(codes: np.ndarray, old_categories, new_categories): """ Convert a set of codes for to a new set of categories @@ -2536,8 +2636,6 @@ def recode_for_categories( ---------- codes : np.ndarray old_categories, new_categories : Index - copy: bool, default True - Whether to copy if the codes are unchanged. 
Returns ------- @@ -2553,19 +2651,14 @@ def recode_for_categories( """ if len(old_categories) == 0: # All null anyway, so just retain the nulls - if copy: - return codes.copy() - return codes + return codes.copy() elif new_categories.equals(old_categories): # Same categories, so no need to actually recode - if copy: - return codes.copy() - return codes - + return codes.copy() indexer = coerce_indexer_dtype( new_categories.get_indexer(old_categories), new_categories ) - new_codes = take_1d(indexer, codes, fill_value=-1) + new_codes = take_1d(indexer, codes.copy(), fill_value=-1) return new_codes diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimelike.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimelike.py index be98647..a9fe95c 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimelike.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimelike.py @@ -1,19 +1,6 @@ -from __future__ import annotations - from datetime import datetime, timedelta import operator -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Optional, - Sequence, - Tuple, - Type, - TypeVar, - Union, - cast, -) +from typing import Any, Callable, Optional, Sequence, Tuple, Type, TypeVar, Union, cast import warnings import numpy as np @@ -37,9 +24,11 @@ from pandas._libs.tslibs.timestamps import ( round_nsint64, ) from pandas._typing import DatetimeLikeScalar, DtypeObj +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, Substitution +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -59,64 +48,101 @@ from pandas.core.dtypes.common import ( is_unsigned_integer_dtype, pandas_dtype, ) +from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import nanops, ops -from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core import missing, nanops, ops +from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts +from pandas.core.array_algos.transforms import shift +from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array -from pandas.core.indexers import check_array_indexer, check_setitem_lengths +from pandas.core.indexers import check_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op from pandas.tseries import frequencies -if TYPE_CHECKING: - from pandas.core.arrays import DatetimeArray, TimedeltaArray - DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType] -DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") -class InvalidComparison(Exception): +def _datetimelike_array_cmp(cls, op): """ - Raised by _validate_comparison_value to indicate to caller it should - return invalid_comparison. 
+ Wrap comparison operations to convert Timestamp/Timedelta/Period-like to + boxed scalars/arrays. """ + opname = f"__{op.__name__}__" + nat_result = opname == "__ne__" - pass + class InvalidComparison(Exception): + pass + + def _validate_comparison_value(self, other): + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + raise InvalidComparison(other) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) + self._check_compatible_with(other) + + elif not is_list_like(other): + raise InvalidComparison(other) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + try: + other = self._validate_listlike(other, opname, allow_object=True) + except TypeError as err: + raise InvalidComparison(other) from err + + return other + + @unpack_zerodim_and_defer(opname) + def wrapper(self, other): + if self.ndim > 1 and getattr(other, "shape", None) == self.shape: + # TODO: handle 2D-like listlikes + return op(self.ravel(), other.ravel()).reshape(self.shape) + + try: + other = _validate_comparison_value(self, other) + except InvalidComparison: + return invalid_comparison(self, other, op) + + dtype = getattr(other, "dtype", None) + if is_object_dtype(dtype): + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would fail to raise when + # comparing tz-aware and tz-naive + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY(op, self.astype(object), other) + return result + + other_i8 = self._unbox(other) + result = op(self.asi8, other_i8) + + o_mask = isna(other) + if self._hasnans | np.any(o_mask): + result[self._isnan | o_mask] = nat_result + + return result + + return set_function_name(wrapper, opname, cls) -class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): - """ - Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray - - Assumes that __new__/__init__ defines: - _data - _freq - - and that the inheriting class has methods: - _generate_range - """ - - # _infer_matches -> which infer_dtype strings are close enough to our own - _infer_matches: Tuple[str, ...] - _is_recognized_dtype: Callable[[DtypeObj], bool] - _recognized_scalars: Tuple[Type, ...] +class AttributesMixin: _data: np.ndarray - def __init__(self, data, dtype=None, freq=None, copy=False): - raise AbstractMethodError(self) - @classmethod - def _simple_new( - cls: Type[DatetimeLikeArrayT], - values: np.ndarray, - freq: Optional[BaseOffset] = None, - dtype=None, - ) -> DatetimeLikeArrayT: + def _simple_new(cls, values: np.ndarray, **kwargs): raise AbstractMethodError(cls) @property @@ -150,9 +176,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): """ raise AbstractMethodError(self) - def _unbox_scalar( - self, value: DTScalarOrNaT, setitem: bool = False - ) -> Union[np.int64, np.datetime64, np.timedelta64]: + def _unbox_scalar(self, value: DTScalarOrNaT) -> int: """ Unbox the integer value of a scalar `value`. @@ -160,8 +184,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): ---------- value : Period, Timestamp, Timedelta, or NaT Depending on subclass. - setitem : bool, default False - Whether to check compatibility with setitem strictness. 
Returns ------- @@ -199,38 +221,272 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): """ raise AbstractMethodError(self) + +class DatelikeOps: + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + @Substitution( + URL="https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior" + ) + def strftime(self, date_format): + """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format + doc <%(URL)s>`__. + + Parameters + ---------- + date_format : str + Date format string (e.g. "%%Y-%%m-%%d"). + + Returns + ------- + ndarray + NumPy ndarray of formatted strings. + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%%B %%d, %%Y, %%r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """ + result = self._format_native_types(date_format=date_format, na_rep=np.nan) + return result.astype(object) + + +class TimelikeOps: + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + _round_doc = """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times. + + .. versionadded:: 0.24.0 + + nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, \ +default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift_forward' will shift the nonexistent time forward to the + closest existing time + - 'shift_backward' will shift the nonexistent time backward to the + closest existing time + - 'NaT' will return NaT where there are nonexistent times + - timedelta objects will shift nonexistent times by the timedelta + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0 + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. 
+ + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """ + + _round_example = """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + + _floor_example = """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + + _ceil_example = """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + if is_datetime64tz_dtype(self.dtype): + # operate on naive timestamps, then convert back to aware + naive = self.tz_localize(None) + result = naive._round(freq, mode, ambiguous, nonexistent) + aware = result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + return aware + + values = self.view("i8") + result = round_nsint64(values, mode, freq) + result = self._maybe_mask_results(result, fill_value=NaT) + return self._simple_new(result, dtype=self.dtype) + + @Appender((_round_doc + _round_example).format(op="round")) + def round(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil(self, freq, ambiguous="raise", nonexistent="raise"): + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + + def _with_freq(self, freq): + """ + Helper to get a view on the same data, with a new freq. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + + Returns + ------- + Same type as self + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, BaseOffset): + # Always valid. In the TimedeltaArray case, we assume this + # is a Tick offset. + pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + arr = self.view() + arr._freq = freq + return arr + + +DatetimeLikeArrayT = TypeVar("DatetimeLikeArrayT", bound="DatetimeLikeArrayMixin") + + +class DatetimeLikeArrayMixin( + ExtensionOpsMixin, AttributesMixin, NDArrayBackedExtensionArray +): + """ + Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray + + Assumes that __new__/__init__ defines: + _data + _freq + + and that the inheriting class has methods: + _generate_range + """ + + _is_recognized_dtype: Callable[[DtypeObj], bool] + _recognized_scalars: Tuple[Type, ...] 
+ # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat - @cache_readonly + # TODO: make this a cache_readonly; need to get around _index_data + # kludge in libreduction + @property def _ndarray(self) -> np.ndarray: - return self._data + # NB: A bunch of Interval tests fail if we use ._data + return self.asi8 - def _from_backing_data( - self: DatetimeLikeArrayT, arr: np.ndarray - ) -> DatetimeLikeArrayT: + def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` - return type(self)._simple_new(arr, dtype=self.dtype) + return type(self)(arr, dtype=self.dtype) # type: ignore # ------------------------------------------------------------------ - def _box_func(self, x): + @property + def _box_func(self): """ box function to get object from internal representation """ raise AbstractMethodError(self) - def _box_values(self, values) -> np.ndarray: + def _box_values(self, values): """ apply box func to passed values """ return lib.map_infer(values, self._box_func) def __iter__(self): - if self.ndim > 1: - return (self[n] for n in range(len(self))) - else: - return (self._box_func(v) for v in self.asi8) + return (self._box_func(v) for v in self.asi8) @property def asi8(self) -> np.ndarray: @@ -269,21 +525,41 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) - return self._ndarray + return self._data - def __getitem__( - self, key: Union[int, slice, np.ndarray] - ) -> Union[DatetimeLikeArrayMixin, DTScalarOrNaT]: + def __getitem__(self, key): """ This getitem defers to the underlying array, which by-definition can only handle list-likes, slices, and integer scalars """ - result = super().__getitem__(key) - if lib.is_scalar(result): - return result - result._freq = self._get_getitem_freq(key) - return result + if lib.is_integer(key): + # fast-path + result = self._data[key] + if self.ndim == 1: + return self._box_func(result) + return self._simple_new(result, dtype=self.dtype) + + if com.is_bool_indexer(key): + # first convert to boolean, because check_array_indexer doesn't + # allow object dtype + if is_object_dtype(key): + key = np.asarray(key, dtype=bool) + + key = check_array_indexer(self, key) + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + elif isinstance(key, list) and len(key) == 1 and isinstance(key[0], slice): + # see https://github.com/pandas-dev/pandas/issues/31299, need to allow + # this for now (would otherwise raise in check_array_indexer) + pass + else: + key = check_array_indexer(self, key) + + freq = self._get_getitem_freq(key) + result = self._data[key] + if lib.is_scalar(result): + return self._box_func(result) + return self._simple_new(result, dtype=self.dtype, freq=freq) def _get_getitem_freq(self, key): """ @@ -292,10 +568,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq - elif self.ndim != 1: - freq = None else: - key = check_array_indexer(self, key) # maybe ndarray[bool] -> slice freq = None if isinstance(key, slice): if self.freq is not None and key.step is not None: @@ -306,10 +579,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # GH#21282 indexing with Ellipsis is similar to a full slice, # should preserve `freq` attribute freq = self.freq - elif com.is_bool_indexer(key): - new_key = 
lib.maybe_booleans_to_slice(key.view(np.uint8)) - if isinstance(new_key, slice): - return self._get_getitem_freq(new_key) return freq def __setitem__( @@ -322,11 +591,27 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # to a period in from_sequence). For DatetimeArray, it's Timestamp... # I don't know if mypy can do that, possibly with Generics. # https://mypy.readthedocs.io/en/latest/generics.html - no_op = check_setitem_lengths(key, value, self) - if no_op: - return + if is_list_like(value): + is_slice = isinstance(key, slice) - super().__setitem__(key, value) + if lib.is_scalar(key): + raise ValueError("setting an array element with a sequence.") + + if not is_slice: + key = cast(Sequence, key) + if len(key) != len(value) and not com.is_bool_indexer(key): + msg = ( + f"shape mismatch: value array of length '{len(key)}' " + "does not match indexing result of length " + f"'{len(value)}'." + ) + raise ValueError(msg) + elif not len(key): + return + + value = self._validate_setitem_value(value) + key = check_array_indexer(self, key) + self._data[key] = value self._maybe_clear_freq() def _maybe_clear_freq(self): @@ -377,23 +662,26 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): def view(self, dtype=None): if dtype is None or dtype is self.dtype: - return type(self)(self._ndarray, dtype=self.dtype) - return self._ndarray.view(dtype=dtype) + return type(self)(self._data, dtype=self.dtype) + return self._data.view(dtype=dtype) # ------------------------------------------------------------------ # ExtensionArray Interface @classmethod - def _concat_same_type( - cls: Type[DatetimeLikeArrayT], - to_concat: Sequence[DatetimeLikeArrayT], - axis: int = 0, - ) -> DatetimeLikeArrayT: - new_obj = super()._concat_same_type(to_concat, axis) + def _concat_same_type(cls, to_concat, axis: int = 0): + + # do not pass tz to set because tzlocal cannot be hashed + dtypes = {str(x.dtype) for x in to_concat} + if len(dtypes) != 1: + raise ValueError("to_concat must have the same dtype (tz)", dtypes) obj = to_concat[0] dtype = obj.dtype + i8values = [x.asi8 for x in to_concat] + values = np.concatenate(i8values, axis=axis) + new_freq = None if is_period_dtype(dtype): new_freq = obj.freq @@ -407,69 +695,38 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): new_freq = obj.freq - new_obj._freq = new_freq - return new_obj + return cls._simple_new(values, dtype=dtype, freq=new_freq) def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: - new_obj = super().copy() - new_obj._freq = self.freq - return new_obj + values = self.asi8.copy() + return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) def _values_for_factorize(self): - return self._ndarray, iNaT + return self.asi8, iNaT @classmethod - def _from_factorized( - cls: Type[DatetimeLikeArrayT], values, original - ) -> DatetimeLikeArrayT: + def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) + def _values_for_argsort(self): + return self._data + + @Appender(ExtensionArray.shift.__doc__) + def shift(self, periods=1, fill_value=None, axis=0): + + fill_value = self._validate_shift_value(fill_value) + new_values = shift(self._data, periods, axis, fill_value) + + return type(self)._simple_new(new_values, dtype=self.dtype) + # ------------------------------------------------------------------ # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior - def 
_validate_comparison_value(self, other): - if isinstance(other, str): - try: - # GH#18435 strings get a pass from tzawareness compat - other = self._scalar_from_string(other) - except ValueError: - # failed to parse as Timestamp/Timedelta/Period - raise InvalidComparison(other) - - if isinstance(other, self._recognized_scalars) or other is NaT: - # pandas\core\arrays\datetimelike.py:432: error: Too many arguments - # for "object" [call-arg] - other = self._scalar_type(other) # type: ignore[call-arg] - try: - self._check_compatible_with(other) - except TypeError as err: - # e.g. tzawareness mismatch - raise InvalidComparison(other) from err - - elif not is_list_like(other): - raise InvalidComparison(other) - - elif len(other) != len(self): - raise ValueError("Lengths must match") - - else: - try: - other = self._validate_listlike(other, allow_object=True) - self._check_compatible_with(other) - except TypeError as err: - if is_object_dtype(getattr(other, "dtype", None)): - # We will have to operate element-wise - pass - else: - raise InvalidComparison(other) from err - - return other - def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to an i8 representation, - raising TypeError if this is not possible. + raising ValueError if this is not possible. Parameters ---------- @@ -477,31 +734,35 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): Returns ------- - fill_value : np.int64, np.datetime64, or np.timedelta64 + fill_value : np.int64 Raises ------ - TypeError + ValueError """ - return self._validate_scalar(fill_value) + msg = ( + f"'fill_value' should be a {self._scalar_type}. " + f"Got '{str(fill_value)}'." + ) + try: + fill_value = self._validate_scalar(fill_value, msg) + except TypeError as err: + raise ValueError(msg) from err + return self._unbox(fill_value) def _validate_shift_value(self, fill_value): # TODO(2.0): once this deprecation is enforced, use _validate_fill_value if is_valid_nat_for_dtype(fill_value, self.dtype): fill_value = NaT elif isinstance(fill_value, self._recognized_scalars): - # pandas\core\arrays\datetimelike.py:746: error: Too many arguments - # for "object" [call-arg] - fill_value = self._scalar_type(fill_value) # type: ignore[call-arg] + fill_value = self._scalar_type(fill_value) else: # only warn if we're not going to raise if self._scalar_type is Period and lib.is_integer(fill_value): # kludge for #31971 since Period(integer) tries to cast to str - new_fill = Period._from_ordinal(fill_value, freq=self.freq) + new_fill = Period._from_ordinal(fill_value, freq=self.dtype.freq) else: - # pandas\core\arrays\datetimelike.py:753: error: Too many - # arguments for "object" [call-arg] - new_fill = self._scalar_type(fill_value) # type: ignore[call-arg] + new_fill = self._scalar_type(fill_value) # stacklevel here is chosen to be correct when called from # DataFrame.shift or Series.shift @@ -514,15 +775,10 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): ) fill_value = new_fill - return self._unbox(fill_value, setitem=True) + return self._unbox(fill_value) def _validate_scalar( - self, - value, - *, - allow_listlike: bool = False, - setitem: bool = True, - unbox: bool = True, + self, value, msg: Optional[str] = None, cast_str: bool = False ): """ Validate that the input value can be cast to our scalar_type. 
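Illustrative sketch, not part of the patch: _validate_fill_value above is the hook that ExtensionArray.take hits when allow_fill=True, so a compatible scalar is unboxed to i8 and anything else is rejected (as TypeError on one side of this hunk and ValueError on the other). The variable names below are made up for the example:

    >>> import pandas as pd
    >>> arr = pd.array(pd.date_range("2021-01-01", periods=3))
    >>> arr.take([0, -1], allow_fill=True)[-1] is pd.NaT     # default fill is NaT
    True
    >>> filled = arr.take([0, -1], allow_fill=True, fill_value=pd.Timestamp("1999-12-31"))
    >>> filled[-1] == pd.Timestamp("1999-12-31")
    True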
@@ -530,25 +786,22 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): Parameters ---------- value : object - allow_listlike: bool, default False - When raising an exception, whether the message should say - listlike inputs are allowed. - setitem : bool, default True - Whether to check compatibility with setitem strictness. - unbox : bool, default True - Whether to unbox the result before returning. Note: unbox=False - skips the setitem compatibility check. + msg : str, optional. + Message to raise in TypeError on invalid input. + If not provided, `value` is cast to a str and used + as the message. + cast_str : bool, default False + Whether to try to parse string input to scalar_type. Returns ------- self._scalar_type or NaT """ - if isinstance(value, str): + if cast_str and isinstance(value, str): # NB: Careful about tzawareness try: value = self._scalar_from_string(value) except ValueError as err: - msg = self._validation_error_message(value, allow_listlike) raise TypeError(msg) from err elif is_valid_nat_for_dtype(value, self.dtype): @@ -556,48 +809,18 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): value = NaT elif isinstance(value, self._recognized_scalars): - # error: Too many arguments for "object" [call-arg] - value = self._scalar_type(value) # type: ignore[call-arg] + value = self._scalar_type(value) # type: ignore else: - msg = self._validation_error_message(value, allow_listlike) + if msg is None: + msg = str(value) raise TypeError(msg) - if not unbox: - # NB: In general NDArrayBackedExtensionArray will unbox here; - # this option exists to prevent a performance hit in - # TimedeltaIndex.get_loc - return value - return self._unbox_scalar(value, setitem=setitem) + return value - def _validation_error_message(self, value, allow_listlike: bool = False) -> str: - """ - Construct an exception message on validation error. - - Some methods allow only scalar inputs, while others allow either scalar - or listlike. - - Parameters - ---------- - allow_listlike: bool, default False - - Returns - ------- - str - """ - if allow_listlike: - msg = ( - f"value should be a '{self._scalar_type.__name__}', 'NaT', " - f"or array of those. Got '{type(value).__name__}' instead." - ) - else: - msg = ( - f"value should be a '{self._scalar_type.__name__}' or 'NaT'. " - f"Got '{type(value).__name__}' instead." - ) - return msg - - def _validate_listlike(self, value, allow_object: bool = False): + def _validate_listlike( + self, value, opname: str, cast_str: bool = False, allow_object: bool = False + ): if isinstance(value, type(self)): return value @@ -606,7 +829,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): value = array(value) value = extract_array(value, extract_numpy=True) - if is_dtype_equal(value.dtype, "string"): + if cast_str and is_dtype_equal(value.dtype, "string"): # We got a StringArray try: # TODO: Could use from_sequence_of_strings if implemented @@ -620,45 +843,72 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): if is_dtype_equal(value.categories.dtype, self.dtype): # TODO: do we need equal dtype or just comparable? 
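Illustrative sketch, not part of the patch: _validate_scalar/_validate_listlike above are what __setitem__ funnels assigned values through, so NaT-like scalars are accepted while incompatible scalars raise (the exact error message differs between the two sides of this hunk):

    >>> import pandas as pd
    >>> dta = pd.array(pd.date_range("2021-01-01", periods=3))
    >>> dta[0] = None                    # NaT-like scalars are accepted
    >>> dta[0]
    NaT
    >>> dta[1] = pd.Timedelta("1 day")   # not a recognized datetime-like scalar
    Traceback (most recent call last):
      ...
    TypeError: ...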
value = value._internal_get_values() - value = extract_array(value, extract_numpy=True) if allow_object and is_object_dtype(value.dtype): pass elif not type(self)._is_recognized_dtype(value.dtype): - msg = self._validation_error_message(value, True) - raise TypeError(msg) + raise TypeError( + f"{opname} requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) return value def _validate_searchsorted_value(self, value): + msg = "searchsorted requires compatible dtype or scalar" if not is_list_like(value): - return self._validate_scalar(value, allow_listlike=True, setitem=False) + value = self._validate_scalar(value, msg, cast_str=True) else: - value = self._validate_listlike(value) + # TODO: cast_str? we accept it for scalar + value = self._validate_listlike(value, "searchsorted") return self._unbox(value) def _validate_setitem_value(self, value): + msg = ( + f"'value' should be a '{self._scalar_type.__name__}', 'NaT', " + f"or array of those. Got '{type(value).__name__}' instead." + ) if is_list_like(value): - value = self._validate_listlike(value) + value = self._validate_listlike(value, "setitem", cast_str=True) else: - return self._validate_scalar(value, allow_listlike=True) + # TODO: cast_str for consistency? + value = self._validate_scalar(value, msg, cast_str=False) - return self._unbox(value, setitem=True) + self._check_compatible_with(value, setitem=True) + return self._unbox(value) - def _unbox( - self, other, setitem: bool = False - ) -> Union[np.int64, np.datetime64, np.timedelta64, np.ndarray]: + def _validate_insert_value(self, value): + msg = f"cannot insert {type(self).__name__} with incompatible label" + value = self._validate_scalar(value, msg, cast_str=False) + + self._check_compatible_with(value, setitem=True) + # TODO: if we dont have compat, should we raise or astype(object)? + # PeriodIndex does astype(object) + return value + + def _validate_where_value(self, other): + msg = f"Where requires matching dtype, not {type(other)}" + if not is_list_like(other): + other = self._validate_scalar(other, msg) + else: + other = self._validate_listlike(other, "where") + self._check_compatible_with(other, setitem=True) + + self._check_compatible_with(other, setitem=True) + return self._unbox(other) + + def _unbox(self, other) -> Union[np.int64, np.ndarray]: """ Unbox either a scalar with _unbox_scalar or an instance of our own type. """ if lib.is_scalar(other): - other = self._unbox_scalar(other, setitem=setitem) + other = self._unbox_scalar(other) else: # same type as self - self._check_compatible_with(other, setitem=setitem) - other = other._ndarray + self._check_compatible_with(other) + other = other.view("i8") return other # ------------------------------------------------------------------ @@ -666,7 +916,37 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def value_counts(self, dropna: bool = False): + def searchsorted(self, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + Find the indices into a sorted array `self` such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. + + Parameters + ---------- + value : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. 
If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `value`. + """ + value = self._validate_searchsorted_value(value) + + # TODO: Use datetime64 semantics for sorting, xref GH#29844 + return self.asi8.searchsorted(value, side=side, sorter=sorter) + + def value_counts(self, dropna=False): """ Return a Series containing counts of unique values. @@ -682,9 +962,9 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): from pandas import Index, Series if dropna: - values = self[~self.isna()]._ndarray + values = self[~self.isna()]._data else: - values = self._ndarray + values = self._data cls = type(self) @@ -704,86 +984,31 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): return Index(self).map(mapper).array - def isin(self, values) -> np.ndarray: - """ - Compute boolean array of whether each value is found in the - passed set of values. - - Parameters - ---------- - values : set or sequence of values - - Returns - ------- - ndarray[bool] - """ - if not hasattr(values, "dtype"): - values = np.asarray(values) - - if values.dtype.kind in ["f", "i", "u", "c"]: - # TODO: de-duplicate with equals, validate_comparison_value - return np.zeros(self.shape, dtype=bool) - - if not isinstance(values, type(self)): - inferrable = [ - "timedelta", - "timedelta64", - "datetime", - "datetime64", - "date", - "period", - ] - if values.dtype == object: - inferred = lib.infer_dtype(values, skipna=False) - if inferred not in inferrable: - if inferred == "string": - pass - - elif "mixed" in inferred: - return isin(self.astype(object), values) - else: - return np.zeros(self.shape, dtype=bool) - - try: - values = type(self)._from_sequence(values) - except ValueError: - return isin(self.astype(object), values) - - try: - self._check_compatible_with(values) - except (TypeError, ValueError): - # Includes tzawareness mismatch and IncompatibleFrequencyError - return np.zeros(self.shape, dtype=bool) - - return isin(self.asi8, values.asi8) - # ------------------------------------------------------------------ # Null Handling - def isna(self) -> np.ndarray: + def isna(self): return self._isnan @property # NB: override with cache_readonly in immutable subclasses - def _isnan(self) -> np.ndarray: + def _isnan(self): """ return if each value is nan """ return self.asi8 == iNaT @property # NB: override with cache_readonly in immutable subclasses - def _hasnans(self) -> np.ndarray: + def _hasnans(self): """ return if I have any nans; enables various perf speedups """ return bool(self._isnan.any()) - def _maybe_mask_results( - self, result: np.ndarray, fill_value=iNaT, convert=None - ) -> np.ndarray: + def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): """ Parameters ---------- - result : np.ndarray + result : a ndarray fill_value : object, default iNaT convert : str, dtype or None @@ -801,9 +1026,56 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): result = result.astype(convert) if fill_value is None: fill_value = np.nan - np.putmask(result, self._isnan, fill_value) + result[self._isnan] = fill_value return result + def fillna(self, value=None, method=None, limit=None): + # TODO(GH-20300): remove this + # Just overriding to ensure that we avoid an 
astype(object). + # Either 20300 or a `_values_for_fillna` would avoid this duplication. + if isinstance(value, ABCSeries): + value = value.array + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + if method == "pad": + func = missing.pad_1d + else: + func = missing.backfill_1d + + values = self._data + if not is_period_dtype(self.dtype): + # For PeriodArray self._data is i8, which gets copied + # by `func`. Otherwise we need to make a copy manually + # to avoid modifying `self` in-place. + values = values.copy() + + new_values = func(values, limit=limit, mask=mask) + if is_datetime64tz_dtype(self.dtype): + # we need to pass int64 values to the constructor to avoid + # re-localizing incorrectly + new_values = new_values.view("i8") + new_values = type(self)(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + # ------------------------------------------------------------------ # Frequency Properties/Methods @@ -857,8 +1129,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): """ Returns day, hour, minute, second, millisecond or microsecond """ - # error: Item "None" of "Optional[Any]" has no attribute "attrname" - return self._resolution_obj.attrname # type: ignore[union-attr] + return self._resolution_obj.attrname # type: ignore @classmethod def _validate_frequency(cls, index, freq, **kwargs): @@ -899,62 +1170,24 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): f"does not conform to passed frequency {freq.freqstr}" ) from e - @classmethod - def _generate_range( - cls: Type[DatetimeLikeArrayT], start, end, periods, freq, *args, **kwargs - ) -> DatetimeLikeArrayT: - raise AbstractMethodError(cls) - # monotonicity/uniqueness properties are called via frequencies.infer_freq, # see GH#23789 @property - def _is_monotonic_increasing(self) -> bool: + def _is_monotonic_increasing(self): return algos.is_monotonic(self.asi8, timelike=True)[0] @property - def _is_monotonic_decreasing(self) -> bool: + def _is_monotonic_decreasing(self): return algos.is_monotonic(self.asi8, timelike=True)[1] @property - def _is_unique(self) -> bool: + def _is_unique(self): return len(unique1d(self.asi8)) == len(self) # ------------------------------------------------------------------ # Arithmetic Methods - - def _cmp_method(self, other, op): - if self.ndim > 1 and getattr(other, "shape", None) == self.shape: - # TODO: handle 2D-like listlikes - return op(self.ravel(), other.ravel()).reshape(self.shape) - - try: - other = self._validate_comparison_value(other) - except InvalidComparison: - return invalid_comparison(self, other, op) - - dtype = getattr(other, "dtype", None) - if is_object_dtype(dtype): - # We have to use comp_method_OBJECT_ARRAY instead of numpy - # comparison otherwise it would fail to raise when - # comparing tz-aware and tz-naive - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY( - op, np.asarray(self.astype(object)), other - ) - return result - - other_vals = self._unbox(other) - # GH#37462 comparison on i8 values is almost 2x faster than M8/m8 - result = op(self._ndarray.view("i8"), other_vals.view("i8")) - - o_mask = isna(other) - mask = self._isnan | 
o_mask - if mask.any(): - nat_result = op is operator.ne - np.putmask(result, mask, nat_result) - - return result + _create_comparison_method = classmethod(_datetimelike_array_cmp) # pow is invalid for all three subclasses; TimedeltaArray will override # the multiplication and division ops @@ -989,7 +1222,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): raise TypeError(f"cannot subtract Period from a {type(self).__name__}") def _add_period(self, other: Period): - # Overridden by TimedeltaArray + # Overriden by TimedeltaArray raise TypeError(f"cannot add Period to a {type(self).__name__}") def _add_offset(self, offset): @@ -1006,7 +1239,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds new_values = np.empty(self.shape, dtype="i8") - new_values.fill(iNaT) + new_values[:] = iNaT return type(self)(new_values, dtype=self.dtype) inc = delta_to_nanoseconds(other) @@ -1047,8 +1280,8 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): self_i8, other_i8, arr_mask=self._isnan, b_mask=other._isnan ) if self._hasnans or other._hasnans: - mask = self._isnan | other._isnan - np.putmask(new_values, mask, iNaT) + mask = (self._isnan) | (other._isnan) + new_values[mask] = iNaT return type(self)(new_values, dtype=self.dtype) @@ -1063,7 +1296,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes - result = np.empty(self.shape, dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return type(self)(result, dtype=self.dtype, freq=None) @@ -1077,7 +1310,7 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # For datetime64 dtypes by convention we treat NaT as a datetime, so # this subtraction returns a timedelta64 dtype. # For period dtype, timedelta64 is a close-enough return dtype. - result = np.empty(self.shape, dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return result.view("timedelta64[ns]") @@ -1101,8 +1334,9 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): result : same class as self """ assert op in [operator.add, operator.sub] - if len(other) == 1 and self.ndim == 1: + if len(other) == 1: # If both 1D then broadcasting is unambiguous + # TODO(EA2D): require self.ndim == other.ndim here return op(self, other[0]) warnings.warn( @@ -1138,10 +1372,11 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): if isinstance(freq, str): freq = to_offset(freq) offset = periods * freq - return self + offset + result = self + offset + return result - if periods == 0 or len(self) == 0: - # GH#14811 empty case + if periods == 0: + # immutable so OK return self.copy() if self.freq is None: @@ -1291,7 +1526,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # TODO: Can we simplify/generalize these cases at all? raise TypeError(f"cannot subtract {type(self).__name__} from {other.dtype}") elif is_timedelta64_dtype(self.dtype): - self = cast("TimedeltaArray", self) return (-self) + other # We get here with e.g. 
datetime objects @@ -1318,7 +1552,14 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): # -------------------------------------------------------------- # Reductions - def min(self, *, axis=None, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): + op = getattr(self, name, None) + if op: + return op(skipna=skipna, **kwargs) + else: + return super()._reduce(name, skipna, **kwargs) + + def min(self, axis=None, skipna=True, *args, **kwargs): """ Return the minimum value of the Array or minimum along an axis. @@ -1329,25 +1570,16 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): Index.min : Return the minimum value in an Index. Series.min : Return the minimum value in a Series. """ - nv.validate_min((), kwargs) - nv.validate_minmax_axis(axis, self.ndim) + nv.validate_min(args, kwargs) + nv.validate_minmax_axis(axis) - if is_period_dtype(self.dtype): - # pass datetime64 values to nanops to get correct NaT semantics - result = nanops.nanmin( - self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna - ) - if result is NaT: - return NaT - result = result.view("i8") - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) + result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) + if isna(result): + # Period._from_ordinal does not handle np.nan gracefully + return NaT + return self._box_func(result) - result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) - - def max(self, *, axis=None, skipna=True, **kwargs): + def max(self, axis=None, skipna=True, *args, **kwargs): """ Return the maximum value of the Array or maximum along an axis. @@ -1360,25 +1592,26 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): """ # TODO: skipna is broken with max. # See https://github.com/pandas-dev/pandas/issues/24265 - nv.validate_max((), kwargs) - nv.validate_minmax_axis(axis, self.ndim) + nv.validate_max(args, kwargs) + nv.validate_minmax_axis(axis) - if is_period_dtype(self.dtype): - # pass datetime64 values to nanops to get correct NaT semantics - result = nanops.nanmax( - self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna - ) - if result is NaT: - return result - result = result.view("i8") - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) + mask = self.isna() + if skipna: + values = self[~mask].asi8 + elif mask.any(): + return NaT + else: + values = self.asi8 - result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + if not len(values): + # short-circuit for empty max / min + return NaT - def mean(self, *, skipna=True, axis: Optional[int] = 0): + result = nanops.nanmax(values, skipna=skipna) + # Don't have to worry about NA `result`, since no NA went in. + return self._box_func(result) + + def mean(self, skipna=True): """ Return the mean value of the Array. @@ -1388,7 +1621,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): ---------- skipna : bool, default True Whether to ignore any NaT elements. 
- axis : int, optional, default 0 Returns ------- @@ -1412,249 +1644,21 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): "obj.to_timestamp(how='start').mean()" ) - result = nanops.nanmean( - self._ndarray, axis=axis, skipna=skipna, mask=self.isna() - ) - return self._wrap_reduction_result(axis, result) - - def median(self, *, axis: Optional[int] = None, skipna: bool = True, **kwargs): - nv.validate_median((), kwargs) - - if axis is not None and abs(axis) >= self.ndim: - raise ValueError("abs(axis) must be less than ndim") - - if is_period_dtype(self.dtype): - # pass datetime64 values to nanops to get correct NaT semantics - result = nanops.nanmedian( - self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna - ) - result = result.view("i8") - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) - - result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) - - -class DatelikeOps(DatetimeLikeArrayMixin): - """ - Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. - """ - - @Substitution( - URL="https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior" - ) - def strftime(self, date_format): - """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format - doc <%(URL)s>`__. - - Parameters - ---------- - date_format : str - Date format string (e.g. "%%Y-%%m-%%d"). - - Returns - ------- - ndarray - NumPy ndarray of formatted strings. - - See Also - -------- - to_datetime : Convert the given argument to datetime. - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... periods=3, freq='s') - >>> rng.strftime('%%B %%d, %%Y, %%r') - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """ - result = self._format_native_types(date_format=date_format, na_rep=np.nan) - return result.astype(object) - - -_round_doc = """ - Perform {op} operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - :ref:`frequency aliases ` for - a list of possible `freq` values. - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times. - - .. versionadded:: 0.24.0 - - nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta, default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. 
- - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times. - - .. versionadded:: 0.24.0 - - Returns - ------- - DatetimeIndex, TimedeltaIndex, or Series - Index of the same type for a DatetimeIndex or TimedeltaIndex, - or a Series with the same index for a Series. - - Raises - ------ - ValueError if the `freq` cannot be converted. - - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') - """ - -_round_example = """>>> rng.round('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.round("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - -_floor_example = """>>> rng.floor('H') - DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.floor("H") - 0 2018-01-01 11:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - -_ceil_example = """>>> rng.ceil('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 13:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.ceil("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 13:00:00 - dtype: datetime64[ns] - """ - - -class TimelikeOps(DatetimeLikeArrayMixin): - """ - Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. - """ - - def _round(self, freq, mode, ambiguous, nonexistent): - # round the local times - if is_datetime64tz_dtype(self.dtype): - # operate on naive timestamps, then convert back to aware - self = cast("DatetimeArray", self) - naive = self.tz_localize(None) - result = naive._round(freq, mode, ambiguous, nonexistent) - return result.tz_localize( - self.tz, ambiguous=ambiguous, nonexistent=nonexistent - ) - - values = self.view("i8") - result = round_nsint64(values, mode, freq) - result = self._maybe_mask_results(result, fill_value=NaT) - return self._simple_new(result, dtype=self.dtype) - - @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous="raise", nonexistent="raise"): - return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent) - - @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous="raise", nonexistent="raise"): - return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) - - @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, ambiguous="raise", nonexistent="raise"): - return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) - - # -------------------------------------------------------------- - # Frequency Methods - - def _maybe_clear_freq(self): - self._freq = None - - def _with_freq(self, freq): - """ - Helper to get a view on the same data, with a new freq. 
- - Parameters - ---------- - freq : DateOffset, None, or "infer" - - Returns - ------- - Same type as self - """ - # GH#29843 - if freq is None: - # Always valid - pass - elif len(self) == 0 and isinstance(freq, BaseOffset): - # Always valid. In the TimedeltaArray case, we assume this - # is a Tick offset. - pass + mask = self.isna() + if skipna: + values = self[~mask] + elif mask.any(): + return NaT else: - # As an internal method, we can ensure this assertion always holds - assert freq == "infer" - freq = to_offset(self.inferred_freq) + values = self - arr = self.view() - arr._freq = freq - return arr + if not len(values): + # short-circuit for empty max / min + return NaT + + result = nanops.nanmean(values.view("i8"), skipna=skipna) + # Don't have to worry about NA `result`, since no NA went in. + return self._box_func(result) # -------------------------------------------------------------- @@ -1665,16 +1669,14 @@ class TimelikeOps(DatetimeLikeArrayMixin): uniques = self.copy() # TODO: copy or view? if sort and self.freq.n < 0: codes = codes[::-1] - # TODO: overload __getitem__, a slice indexer returns same type as self - # error: Incompatible types in assignment (expression has type - # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable - # has type "TimelikeOps") [assignment] - uniques = uniques[::-1] # type: ignore[assignment] + uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort return super().factorize(na_sentinel=na_sentinel) +DatetimeLikeArrayMixin._add_comparison_ops() + # ------------------------------------------------------------------- # Shared Constructor Helpers diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py index f073fc2..8d6016f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py @@ -1,14 +1,12 @@ from datetime import datetime, time, timedelta, tzinfo -from typing import Optional, Union, cast +from typing import Optional, Union import warnings import numpy as np from pandas._libs import lib, tslib from pandas._libs.tslibs import ( - BaseOffset, NaT, - NaTType, Resolution, Timestamp, conversion, @@ -78,7 +76,9 @@ def tz_to_dtype(tz): def _field_accessor(name, field, docstring=None): def f(self): - values = self._local_timestamps() + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() if field in self._bool_ops: if field.endswith(("start", "end")): @@ -114,7 +114,7 @@ def _field_accessor(name, field, docstring=None): return property(f) -class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): +class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. 
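Illustrative sketch, not part of the patch: the _field_accessor hunk above (and the matching month_name/day_name/time/date/isocalendar hunks further down) is about computing calendar fields on local wall-clock time for tz-aware data via _local_timestamps(), rather than on the raw UTC i8 values. The observable effect, assuming roughly this pandas vintage (where DatetimeIndex fields come back as an Int64Index):

    >>> import pandas as pd
    >>> dti = pd.date_range("2020-12-31 23:00", periods=2, freq="H", tz="UTC")
    >>> dti.dayofyear                                # read in UTC: Dec 31 2020 is day 366
    Int64Index([366, 1], dtype='int64')
    >>> dti.tz_convert("Europe/Berlin").dayofyear    # same instants, local wall-clock fields
    Int64Index([1, 1], dtype='int64')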
@@ -155,7 +155,6 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _scalar_type = Timestamp _recognized_scalars = (datetime, np.datetime64) _is_recognized_dtype = is_datetime64_any_dtype - _infer_matches = ("datetime", "datetime64", "date") # define my properties & methods for delegation _bool_ops = [ @@ -179,9 +178,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): "week", "weekday", "dayofweek", - "day_of_week", "dayofyear", - "day_of_year", "quarter", "days_in_month", "daysinmonth", @@ -288,9 +285,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new( - cls, values, freq: Optional[BaseOffset] = None, dtype=DT64NS_DTYPE - ) -> "DatetimeArray": + def _simple_new(cls, values, freq=None, dtype=DT64NS_DTYPE): assert isinstance(values, np.ndarray) if values.dtype != DT64NS_DTYPE: assert values.dtype == "i8" @@ -303,11 +298,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): return result @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): - return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) - - @classmethod - def _from_sequence_not_strict( + def _from_sequence( cls, data, dtype=None, @@ -428,9 +419,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # index is localized datetime64 array -> have to convert # start/end as well to compare if start is not None: - start = start.tz_localize(tz, ambiguous, nonexistent).asm8 + start = start.tz_localize(tz).asm8 if end is not None: - end = end.tz_localize(tz, ambiguous, nonexistent).asm8 + end = end.tz_localize(tz).asm8 else: # Create a linearly spaced date_range in local time # Nanosecond-granularity timestamps aren't always correctly @@ -446,11 +437,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): ) if not left_closed and len(index) and index[0] == start: - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - index = cast(DatetimeArray, index[1:]) + index = index[1:] if not right_closed and len(index) and index[-1] == end: - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - index = cast(DatetimeArray, index[:-1]) + index = index[:-1] dtype = tz_to_dtype(tz) return cls._simple_new(index.asi8, freq=freq, dtype=dtype) @@ -458,13 +447,12 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # ----------------------------------------------------------------- # DatetimeLike Interface - def _unbox_scalar(self, value, setitem: bool = False) -> np.datetime64: + def _unbox_scalar(self, value): if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timestamp.") if not isna(value): - self._check_compatible_with(value, setitem=setitem) - return value.asm8 - return np.datetime64(value.value, "ns") + self._check_compatible_with(value) + return value.value def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) @@ -476,13 +464,17 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): if setitem: # Stricter check for setitem vs comparison methods if not timezones.tz_compare(self.tz, other.tz): - raise ValueError(f"Timezones don't match. '{self.tz}' != '{other.tz}'") + raise ValueError(f"Timezones don't match. 
'{self.tz} != {other.tz}'") + + def _maybe_clear_freq(self): + self._freq = None # ----------------------------------------------------------------- # Descriptive Properties - def _box_func(self, x) -> Union[Timestamp, NaTType]: - return Timestamp(x, freq=self.freq, tz=self.tz) + @property + def _box_func(self): + return lambda x: Timestamp(x, freq=self.freq, tz=self.tz) @property def dtype(self) -> Union[np.dtype, DatetimeTZDtype]: @@ -563,22 +555,20 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): ------ tstamp : Timestamp """ - if self.ndim > 1: - for i in range(len(self)): - yield self[i] - else: - # convert in chunks of 10k for efficiency - data = self.asi8 - length = len(self) - chunksize = 10000 - chunks = int(length / chunksize) + 1 - for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, length) - converted = ints_to_pydatetime( - data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" - ) - yield from converted + + # convert in chunks of 10k for efficiency + data = self.asi8 + length = len(self) + chunksize = 10000 + chunks = int(length / chunksize) + 1 + for i in range(chunks): + start_i = i * chunksize + end_i = min((i + 1) * chunksize, length) + converted = ints_to_pydatetime( + data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" + ) + for v in converted: + yield v def astype(self, dtype, copy=True): # We handle @@ -613,9 +603,9 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # Rendering Methods def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import get_format_datetime64_from_values + from pandas.io.formats.format import _get_format_datetime64_from_values - fmt = get_format_datetime64_from_values(self, date_format) + fmt = _get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( self.asi8.ravel(), tz=self.tz, format=fmt, na_rep=na_rep @@ -681,7 +671,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): arr_mask = self._isnan | other._isnan new_values = checked_add_with_arr(self_i8, -other_i8, arr_mask=arr_mask) if self._hasnans or other._hasnans: - np.putmask(new_values, arr_mask, iNaT) + new_values[arr_mask] = iNaT return new_values.view("timedelta64[ns]") def _add_offset(self, offset): @@ -739,8 +729,6 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): This is used to calculate time-of-day information as if the timestamps were timezone-naive. """ - if self.tz is None or timezones.is_utc(self.tz): - return self.asi8 return tzconversion.tz_convert_from_utc(self.asi8, self.tz) def tz_convert(self, tz): @@ -1157,6 +1145,8 @@ default 'raise' """ Return the month names of the DateTimeIndex with specified locale. + .. versionadded:: 0.23.0 + Parameters ---------- locale : str, optional @@ -1177,7 +1167,10 @@ default 'raise' >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') """ - values = self._local_timestamps() + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + else: + values = self.asi8 result = fields.get_date_name_field(values, "month_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) @@ -1187,6 +1180,8 @@ default 'raise' """ Return the day names of the DateTimeIndex with specified locale. + .. 
versionadded:: 0.23.0 + Parameters ---------- locale : str, optional @@ -1207,7 +1202,10 @@ default 'raise' >>> idx.day_name() Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') """ - values = self._local_timestamps() + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + else: + values = self.asi8 result = fields.get_date_name_field(values, "day_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) @@ -1221,7 +1219,10 @@ default 'raise' # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - timestamps = self._local_timestamps() + if self.tz is not None and not timezones.is_utc(self.tz): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 return ints_to_pydatetime(timestamps, box="time") @@ -1242,7 +1243,10 @@ default 'raise' # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - timestamps = self._local_timestamps() + if self.tz is not None and not timezones.is_utc(self.tz): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 return ints_to_pydatetime(timestamps, box="date") @@ -1260,10 +1264,8 @@ default 'raise' See Also -------- - Timestamp.isocalendar : Function return a 3-tuple containing ISO year, - week number, and weekday for the given Timestamp object. - datetime.date.isocalendar : Return a named tuple object with - three components: year, week and weekday. + Timestamp.isocalendar + datetime.date.isocalendar Examples -------- @@ -1283,7 +1285,10 @@ default 'raise' """ from pandas import DataFrame - values = self._local_timestamps() + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + else: + values = self.asi8 sarray = fields.build_isocalendar_sarray(values) iso_calendar_df = DataFrame( sarray, columns=["year", "week", "day"], dtype="UInt32" @@ -1536,18 +1541,16 @@ default 'raise' 2017-01-08 6 Freq: D, dtype: int64 """ - day_of_week = _field_accessor("day_of_week", "dow", _dayofweek_doc) - dayofweek = day_of_week - weekday = day_of_week + dayofweek = _field_accessor("dayofweek", "dow", _dayofweek_doc) + weekday = dayofweek - day_of_year = _field_accessor( + dayofyear = _field_accessor( "dayofyear", "doy", """ The ordinal day of the year. 
""", ) - dayofyear = day_of_year quarter = _field_accessor( "quarter", "q", @@ -1861,28 +1864,6 @@ default 'raise' / 24.0 ) - # ----------------------------------------------------------------- - # Reductions - - def std( - self, - axis=None, - dtype=None, - out=None, - ddof: int = 1, - keepdims: bool = False, - skipna: bool = True, - ): - # Because std is translation-invariant, we can get self.std - # by calculating (self - Timestamp(0)).std, and we can do it - # without creating a copy by using a view on self._ndarray - from pandas.core.arrays import TimedeltaArray - - tda = TimedeltaArray(self._ndarray.view("i8")) - return tda.std( - axis=axis, dtype=dtype, out=out, ddof=ddof, keepdims=keepdims, skipna=skipna - ) - # ------------------------------------------------------------------- # Constructor Helpers @@ -1973,13 +1954,7 @@ def sequence_to_dt64ns( data, inferred_tz = objects_to_datetime64ns( data, dayfirst=dayfirst, yearfirst=yearfirst ) - if tz and inferred_tz: - # two timezones: convert to intended from base UTC repr - data = tzconversion.tz_convert_from_utc(data.view("i8"), tz) - data = data.view(DT64NS_DTYPE) - elif inferred_tz: - tz = inferred_tz - + tz = _maybe_infer_tz(tz, inferred_tz) data_dtype = data.dtype # `data` may have originally been a Categorical[datetime64[ns, tz]], @@ -2049,7 +2024,6 @@ def objects_to_datetime64ns( utc : bool, default False Whether to convert timezone-aware timestamps to UTC. errors : {'raise', 'ignore', 'coerce'} - require_iso8601 : bool, default False allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/floating.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/floating.py deleted file mode 100644 index 1077538..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/floating.py +++ /dev/null @@ -1,515 +0,0 @@ -import numbers -from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union -import warnings - -import numpy as np - -from pandas._libs import lib, missing as libmissing -from pandas._typing import ArrayLike, DtypeObj -from pandas.compat.numpy import function as nv -from pandas.util._decorators import cache_readonly - -from pandas.core.dtypes.cast import astype_nansafe -from pandas.core.dtypes.common import ( - is_bool_dtype, - is_datetime64_dtype, - is_float_dtype, - is_integer_dtype, - is_list_like, - is_object_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - -from pandas.core import ops -from pandas.core.ops import invalid_comparison -from pandas.core.tools.numeric import to_numeric - -from .masked import BaseMaskedDtype -from .numeric import NumericArray - -if TYPE_CHECKING: - import pyarrow - - -class FloatingDtype(BaseMaskedDtype): - """ - An ExtensionDtype to hold a single size of floating dtype. - - These specific implementations are subclasses of the non-public - FloatingDtype. For example we have Float32Dtype to represent float32. - - The attributes name & type are set when these subclasses are created. - """ - - def __repr__(self) -> str: - return f"{self.name}Dtype()" - - @property - def _is_numeric(self) -> bool: - return True - - @classmethod - def construct_array_type(cls) -> Type["FloatingArray"]: - """ - Return the array type associated with this dtype. 
- - Returns - ------- - type - """ - return FloatingArray - - def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: - # for now only handle other floating types - if not all(isinstance(t, FloatingDtype) for t in dtypes): - return None - np_dtype = np.find_common_type( - [t.numpy_dtype for t in dtypes], [] # type: ignore[union-attr] - ) - if np.issubdtype(np_dtype, np.floating): - return FLOAT_STR_TO_DTYPE[str(np_dtype)] - return None - - def __from_arrow__( - self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] - ) -> "FloatingArray": - """ - Construct FloatingArray from pyarrow Array/ChunkedArray. - """ - import pyarrow - - from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask - - pyarrow_type = pyarrow.from_numpy_dtype(self.type) - if not array.type.equals(pyarrow_type): - array = array.cast(pyarrow_type) - - if isinstance(array, pyarrow.Array): - chunks = [array] - else: - # pyarrow.ChunkedArray - chunks = array.chunks - - results = [] - for arr in chunks: - data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) - float_arr = FloatingArray(data.copy(), ~mask, copy=False) - results.append(float_arr) - - return FloatingArray._concat_same_type(results) - - -def coerce_to_array( - values, dtype=None, mask=None, copy: bool = False -) -> Tuple[np.ndarray, np.ndarray]: - """ - Coerce the input values array to numpy arrays with a mask. - - Parameters - ---------- - values : 1D list-like - dtype : float dtype - mask : bool 1D array, optional - copy : bool, default False - if True, copy the input - - Returns - ------- - tuple of (values, mask) - """ - # if values is floating numpy array, preserve its dtype - if dtype is None and hasattr(values, "dtype"): - if is_float_dtype(values.dtype): - dtype = values.dtype - - if dtype is not None: - if isinstance(dtype, str) and dtype.startswith("Float"): - # Avoid DeprecationWarning from NumPy about np.dtype("Float64") - # https://github.com/numpy/numpy/pull/7476 - dtype = dtype.lower() - - if not issubclass(type(dtype), FloatingDtype): - try: - dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] - except KeyError as err: - raise ValueError(f"invalid dtype specified {dtype}") from err - - if isinstance(values, FloatingArray): - values, mask = values._data, values._mask - if dtype is not None: - values = values.astype(dtype.numpy_dtype, copy=False) - - if copy: - values = values.copy() - mask = mask.copy() - return values, mask - - values = np.array(values, copy=copy) - if is_object_dtype(values): - inferred_type = lib.infer_dtype(values, skipna=True) - if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) - elif inferred_type not in [ - "floating", - "integer", - "mixed-integer", - "integer-na", - "mixed-integer-float", - ]: - raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") - - elif is_bool_dtype(values) and is_float_dtype(dtype): - values = np.array(values, dtype=float, copy=copy) - - elif not (is_integer_dtype(values) or is_float_dtype(values)): - raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") - - if mask is None: - mask = isna(values) - else: - assert len(mask) == len(values) - - if not values.ndim == 1: - raise TypeError("values must be a 1D list-like") - if not mask.ndim == 1: - raise TypeError("mask must be a 1D list-like") - - # infer dtype if needed - if dtype is None: - dtype = np.dtype("float64") - else: - dtype = dtype.type - - # if we are float, let's make sure that we can - # safely cast - - # we copy as 
need to coerce here - # TODO should this be a safe cast? - if mask.any(): - values = values.copy() - values[mask] = np.nan - values = values.astype(dtype, copy=False) # , casting="safe") - else: - values = values.astype(dtype, copy=False) # , casting="safe") - - return values, mask - - -class FloatingArray(NumericArray): - """ - Array of floating (optional missing) values. - - .. versionadded:: 1.2.0 - - .. warning:: - - FloatingArray is currently experimental, and its API or internal - implementation may change without warning. Expecially the behaviour - regarding NaN (distinct from NA missing values) is subject to change. - - We represent a FloatingArray with 2 numpy arrays: - - - data: contains a numpy float array of the appropriate dtype - - mask: a boolean array holding a mask on the data, True is missing - - To construct an FloatingArray from generic array-like input, use - :func:`pandas.array` with one of the float dtypes (see examples). - - See :ref:`integer_na` for more. - - Parameters - ---------- - values : numpy.ndarray - A 1-d float-dtype array. - mask : numpy.ndarray - A 1-d boolean-dtype array indicating missing values. - copy : bool, default False - Whether to copy the `values` and `mask`. - - Attributes - ---------- - None - - Methods - ------- - None - - Returns - ------- - FloatingArray - - Examples - -------- - Create an FloatingArray with :func:`pandas.array`: - - >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype()) - - [0.1, , 0.3] - Length: 3, dtype: Float32 - - String aliases for the dtypes are also available. They are capitalized. - - >>> pd.array([0.1, None, 0.3], dtype="Float32") - - [0.1, , 0.3] - Length: 3, dtype: Float32 - """ - - # The value used to fill '_data' to avoid upcasting - _internal_fill_value = 0.0 - - @cache_readonly - def dtype(self) -> FloatingDtype: - return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] - - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): - raise TypeError( - "values should be floating numpy array. Use " - "the 'pd.array' function instead" - ) - super().__init__(values, mask, copy=copy) - - @classmethod - def _from_sequence( - cls, scalars, *, dtype=None, copy: bool = False - ) -> "FloatingArray": - values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy) - return FloatingArray(values, mask) - - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype=None, copy: bool = False - ) -> "FloatingArray": - scalars = to_numeric(strings, errors="raise") - return cls._from_sequence(scalars, dtype=dtype, copy=copy) - - _HANDLED_TYPES = (np.ndarray, numbers.Number) - - def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): - # For FloatingArray inputs, we apply the ufunc to ._data - # and mask the result. - if method == "reduce": - # Not clear how to handle missing values in reductions. Raise. 
- raise NotImplementedError("The 'reduce' method is not supported.") - out = kwargs.get("out", ()) - - for x in inputs + out: - if not isinstance(x, self._HANDLED_TYPES + (FloatingArray,)): - return NotImplemented - - # for binary ops, use our custom dunder methods - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - - mask = np.zeros(len(self), dtype=bool) - inputs2 = [] - for x in inputs: - if isinstance(x, FloatingArray): - mask |= x._mask - inputs2.append(x._data) - else: - inputs2.append(x) - - def reconstruct(x): - # we don't worry about scalar `x` here, since we - # raise for reduce up above. - - # TODO - if is_float_dtype(x.dtype): - m = mask.copy() - return FloatingArray(x, m) - else: - x[mask] = np.nan - return x - - result = getattr(ufunc, method)(*inputs2, **kwargs) - if isinstance(result, tuple): - tuple(reconstruct(x) for x in result) - else: - return reconstruct(result) - - def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]: - return coerce_to_array(value, dtype=self.dtype) - - def astype(self, dtype, copy: bool = True) -> ArrayLike: - """ - Cast to a NumPy array or ExtensionArray with 'dtype'. - - Parameters - ---------- - dtype : str or dtype - Typecode or data-type to which the array is cast. - copy : bool, default True - Whether to copy the data, even if not necessary. If False, - a copy is made only if the old dtype does not match the - new dtype. - - Returns - ------- - ndarray or ExtensionArray - NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with - 'dtype' for its dtype. - - Raises - ------ - TypeError - if incompatible type with an FloatingDtype, equivalent of same_kind - casting - """ - from pandas.core.arrays.string_ import StringArray, StringDtype - - dtype = pandas_dtype(dtype) - - # if the dtype is exactly the same, we can fastpath - if self.dtype == dtype: - # return the same object for copy=False - return self.copy() if copy else self - # if we are astyping to another nullable masked dtype, we can fastpath - if isinstance(dtype, BaseMaskedDtype): - # TODO deal with NaNs - data = self._data.astype(dtype.numpy_dtype, copy=copy) - # mask is copied depending on whether the data was copied, and - # not directly depending on the `copy` keyword - mask = self._mask if data is self._data else self._mask.copy() - return dtype.construct_array_type()(data, mask, copy=False) - elif isinstance(dtype, StringDtype): - return StringArray._from_sequence(self, copy=False) - - # coerce - if is_float_dtype(dtype): - # In astype, we consider dtype=float to also mean na_value=np.nan - kwargs = {"na_value": np.nan} - elif is_datetime64_dtype(dtype): - kwargs = {"na_value": np.datetime64("NaT")} - else: - kwargs = {} - - data = self.to_numpy(dtype=dtype, **kwargs) - return astype_nansafe(data, dtype, copy=False) - - def _values_for_argsort(self) -> np.ndarray: - return self._data - - def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray, IntegerArray - - mask = None - - if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)): - other, mask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - - if other is libmissing.NA: - # numpy does not handle pd.NA well as "other" scalar (it returns - # a scalar False instead of an array) - # This may be fixed by NA.__array_ufunc__. 
Revisit this check - # once that's implemented. - result = np.zeros(self._data.shape, dtype="bool") - mask = np.ones(self._data.shape, dtype="bool") - else: - with warnings.catch_warnings(): - # numpy may show a FutureWarning: - # elementwise comparison failed; returning scalar instead, - # but in the future will perform elementwise comparison - # before returning NotImplemented. We fall back to the correct - # behavior today, so that should be fine to ignore. - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op.__name__}__") - result = method(other) - - if result is NotImplemented: - result = invalid_comparison(self._data, other, op) - - # nans propagate - if mask is None: - mask = self._mask.copy() - else: - mask = self._mask | mask - - return BooleanArray(result, mask) - - def sum(self, *, skipna=True, min_count=0, **kwargs): - nv.validate_sum((), kwargs) - return super()._reduce("sum", skipna=skipna, min_count=min_count) - - def prod(self, *, skipna=True, min_count=0, **kwargs): - nv.validate_prod((), kwargs) - return super()._reduce("prod", skipna=skipna, min_count=min_count) - - def min(self, *, skipna=True, **kwargs): - nv.validate_min((), kwargs) - return super()._reduce("min", skipna=skipna) - - def max(self, *, skipna=True, **kwargs): - nv.validate_max((), kwargs) - return super()._reduce("max", skipna=skipna) - - def _maybe_mask_result(self, result, mask, other, op_name: str): - """ - Parameters - ---------- - result : array-like - mask : array-like bool - other : scalar or array-like - op_name : str - """ - # TODO are there cases we don't end up with float? - # if we have a float operand we are by-definition - # a float result - # or our op is a divide - # if (is_float_dtype(other) or is_float(other)) or ( - # op_name in ["rtruediv", "truediv"] - # ): - # result[mask] = np.nan - # return result - - return type(self)(result, mask, copy=False) - - -_dtype_docstring = """ -An ExtensionDtype for {dtype} data. - -This dtype uses ``pd.NA`` as missing value indicator. 
- -Attributes ----------- -None - -Methods -------- -None -""" - -# create the Dtype - - -@register_extension_dtype -class Float32Dtype(FloatingDtype): - type = np.float32 - name = "Float32" - __doc__ = _dtype_docstring.format(dtype="float32") - - -@register_extension_dtype -class Float64Dtype(FloatingDtype): - type = np.float64 - name = "Float64" - __doc__ = _dtype_docstring.format(dtype="float64") - - -FLOAT_STR_TO_DTYPE = { - "float32": Float32Dtype(), - "float64": Float64Dtype(), -} diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/integer.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/integer.py index fa427e9..0cbcd90 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/integer.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/integer.py @@ -4,8 +4,9 @@ import warnings import numpy as np -from pandas._libs import iNaT, lib, missing as libmissing +from pandas._libs import lib, missing as libmissing from pandas._typing import ArrayLike, DtypeObj +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly @@ -15,6 +16,7 @@ from pandas.core.dtypes.common import ( is_datetime64_dtype, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_list_like, is_object_dtype, @@ -23,14 +25,15 @@ from pandas.core.dtypes.common import ( from pandas.core.dtypes.missing import isna from pandas.core import ops +from pandas.core.array_algos import masked_reductions from pandas.core.ops import invalid_comparison +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric from .masked import BaseMaskedArray, BaseMaskedDtype -from .numeric import NumericArray if TYPE_CHECKING: - import pyarrow + import pyarrow # noqa: F401 class _IntegerDtype(BaseMaskedDtype): @@ -43,6 +46,10 @@ class _IntegerDtype(BaseMaskedDtype): The attributes name & type are set when these subclasses are created. """ + name: str + base = None + type: Type + def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" return f"{sign}Int{8 * self.itemsize}Dtype()" @@ -59,6 +66,20 @@ class _IntegerDtype(BaseMaskedDtype): def _is_numeric(self) -> bool: return True + @cache_readonly + def numpy_dtype(self) -> np.dtype: + """ Return an instance of our numpy dtype """ + return np.dtype(self.type) + + @cache_readonly + def kind(self) -> str: + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """ Return the number of bytes in this dtype """ + return self.numpy_dtype.itemsize + @classmethod def construct_array_type(cls) -> Type["IntegerArray"]: """ @@ -85,11 +106,7 @@ class _IntegerDtype(BaseMaskedDtype): [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): - return INT_STR_TO_DTYPE[str(np_dtype)] - elif np.issubdtype(np_dtype, np.floating): - from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE - - return FLOAT_STR_TO_DTYPE[str(np_dtype)] + return _dtypes[str(np_dtype)] return None def __from_arrow__( @@ -98,7 +115,7 @@ class _IntegerDtype(BaseMaskedDtype): """ Construct IntegerArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow + import pyarrow # noqa: F811 from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -121,7 +138,7 @@ class _IntegerDtype(BaseMaskedDtype): return IntegerArray._concat_same_type(results) -def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray": +def integer_array(values, dtype=None, copy: bool = False,) -> "IntegerArray": """ Infer and return an integer array of the values. @@ -165,7 +182,7 @@ def safe_cast(values, dtype, copy: bool): def coerce_to_array( - values, dtype, mask=None, copy: bool = False + values, dtype, mask=None, copy: bool = False, ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask @@ -182,7 +199,7 @@ def coerce_to_array( ------- tuple of (values, mask) """ - # if values is integer numpy array, preserve its dtype + # if values is integer numpy array, preserve it's dtype if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype @@ -197,7 +214,7 @@ def coerce_to_array( if not issubclass(type(dtype), _IntegerDtype): try: - dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] + dtype = _dtypes[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err @@ -262,7 +279,7 @@ def coerce_to_array( return values, mask -class IntegerArray(NumericArray): +class IntegerArray(BaseMaskedArray): """ Array of integer (optional missing) values. @@ -337,7 +354,7 @@ class IntegerArray(NumericArray): @cache_readonly def dtype(self) -> _IntegerDtype: - return INT_STR_TO_DTYPE[str(self._data.dtype)] + return _dtypes[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): @@ -357,17 +374,15 @@ class IntegerArray(NumericArray): return type(self)(np.abs(self._data), self._mask) @classmethod - def _from_sequence( - cls, scalars, *, dtype=None, copy: bool = False - ) -> "IntegerArray": + def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "IntegerArray": return integer_array(scalars, dtype=dtype, copy=copy) @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype=None, copy: bool = False + cls, strings, dtype=None, copy: bool = False ) -> "IntegerArray": scalars = to_numeric(strings, errors="raise") - return cls._from_sequence(scalars, dtype=dtype, copy=copy) + return cls._from_sequence(scalars, dtype, copy) _HANDLED_TYPES = (np.ndarray, numbers.Number) @@ -412,7 +427,7 @@ class IntegerArray(NumericArray): result = getattr(ufunc, method)(*inputs2, **kwargs) if isinstance(result, tuple): - return tuple(reconstruct(x) for x in result) + tuple(reconstruct(x) for x in result) else: return reconstruct(result) @@ -485,73 +500,74 @@ class IntegerArray(NumericArray): See Also -------- - ExtensionArray.argsort : Return the indices that would sort this array. 
+ ExtensionArray.argsort """ data = self._data.copy() if self._mask.any(): data[self._mask] = data.min() - 1 return data - def _cmp_method(self, other, op): - from pandas.core.arrays import BooleanArray + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ - mask = None + @unpack_zerodim_and_defer(op.__name__) + def cmp_method(self, other): + from pandas.arrays import BooleanArray - if isinstance(other, BaseMaskedArray): - other, mask = other._data, other._mask + mask = None - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): - raise ValueError("Lengths must match to compare") + if isinstance(other, (BooleanArray, IntegerArray)): + other, mask = other._data, other._mask - if other is libmissing.NA: - # numpy does not handle pd.NA well as "other" scalar (it returns - # a scalar False instead of an array) - # This may be fixed by NA.__array_ufunc__. Revisit this check - # once that's implemented. - result = np.zeros(self._data.shape, dtype="bool") - mask = np.ones(self._data.shape, dtype="bool") - else: - with warnings.catch_warnings(): - # numpy may show a FutureWarning: - # elementwise comparison failed; returning scalar instead, - # but in the future will perform elementwise comparison - # before returning NotImplemented. We fall back to the correct - # behavior today, so that should be fine to ignore. - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op.__name__}__") - result = method(other) + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match to compare") - if result is NotImplemented: - result = invalid_comparison(self._data, other, op) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. 
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) - # nans propagate - if mask is None: - mask = self._mask.copy() - else: - mask = self._mask | mask + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) - return BooleanArray(result, mask) + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask - def sum(self, *, skipna=True, min_count=0, **kwargs): + return BooleanArray(result, mask) + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + def sum(self, skipna=True, min_count=0, **kwargs): nv.validate_sum((), kwargs) - return super()._reduce("sum", skipna=skipna, min_count=min_count) - - def prod(self, *, skipna=True, min_count=0, **kwargs): - nv.validate_prod((), kwargs) - return super()._reduce("prod", skipna=skipna, min_count=min_count) - - def min(self, *, skipna=True, **kwargs): - nv.validate_min((), kwargs) - return super()._reduce("min", skipna=skipna) - - def max(self, *, skipna=True, **kwargs): - nv.validate_max((), kwargs) - return super()._reduce("max", skipna=skipna) + result = masked_reductions.sum( + values=self._data, mask=self._mask, skipna=skipna, min_count=min_count + ) + return result def _maybe_mask_result(self, result, mask, other, op_name: str): """ @@ -568,18 +584,89 @@ class IntegerArray(NumericArray): if (is_float_dtype(other) or is_float(other)) or ( op_name in ["rtruediv", "truediv"] ): - from pandas.core.arrays import FloatingArray - - return FloatingArray(result, mask, copy=False) - - if result.dtype == "timedelta64[ns]": - from pandas.core.arrays import TimedeltaArray - - result[mask] = iNaT - return TimedeltaArray._simple_new(result) + result[mask] = np.nan + return result return type(self)(result, mask, copy=False) + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def integer_arithmetic_method(self, other): + + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, IntegerArray): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. 
+ mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op.__name__}__" + return set_function_name(integer_arithmetic_method, name, cls) + + +IntegerArray._add_arithmetic_ops() +IntegerArray._add_comparison_ops() + _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. @@ -657,7 +744,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -INT_STR_TO_DTYPE: Dict[str, _IntegerDtype] = { +_dtypes: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/interval.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/interval.py index 53a98fc..ed2437c 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/interval.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/interval.py @@ -1,19 +1,11 @@ -import operator from operator import le, lt import textwrap -from typing import Sequence, Type, TypeVar import numpy as np from pandas._config import get_option -from pandas._libs.interval import ( - VALID_CLOSED, - Interval, - IntervalMixin, - intervals_to_interval_bounds, -) -from pandas._libs.missing import NA +from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -23,6 +15,7 @@ from pandas.core.dtypes.common import ( is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, + is_interval, is_interval_dtype, is_list_like, is_object_dtype, @@ -34,34 +27,27 @@ from pandas.core.dtypes.common import ( from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( ABCDatetimeIndex, + ABCIndexClass, ABCIntervalIndex, ABCPeriodIndex, ABCSeries, ) -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna +from pandas.core.dtypes.missing import isna, notna from pandas.core.algorithms import take, value_counts from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com -from pandas.core.construction import ( - array, - ensure_wrapped_if_datetimelike, - extract_array, -) +from pandas.core.construction import array from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index -from pandas.core.ops import invalid_comparison, unpack_zerodim_and_defer - -IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") +_VALID_CLOSED = {"left", "right", "both", "neither"} _interval_shared_docs = {} -_shared_docs_kwargs = { - "klass": "IntervalArray", - "qualname": "arrays.IntervalArray", - "name": "", -} +_shared_docs_kwargs = dict( + klass="IntervalArray", qualname="arrays.IntervalArray", name="" +) _interval_shared_docs[ @@ -81,6 +67,8 @@ closed : {'left', 'right', 'both', 'neither'}, default 'right' neither. dtype : dtype or None, default None If None, dtype will be inferred. + + .. versionadded:: 0.23.0 copy : bool, default False Copy the input data. %(name)s\ @@ -129,14 +117,14 @@ for more. 
@Appender( _interval_shared_docs["class"] - % { - "klass": "IntervalArray", - "summary": "Pandas array for interval data that are closed on the same side.", - "versionadded": "0.24.0", - "name": "", - "extra_attributes": "", - "extra_methods": "", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side.", + versionadded="0.24.0", + name="", + extra_attributes="", + extra_methods="", + examples=textwrap.dedent( """\ Examples -------- @@ -153,16 +141,13 @@ for more. :meth:`IntervalArray.from_breaks`, and :meth:`IntervalArray.from_tuples`. """ ), - } + ) ) class IntervalArray(IntervalMixin, ExtensionArray): ndim = 1 can_hold_na = True _na_value = _fill_value = np.nan - # --------------------------------------------------------------------- - # Constructors - def __new__( cls, data, @@ -172,14 +157,12 @@ class IntervalArray(IntervalMixin, ExtensionArray): verify_integrity: bool = True, ): - if isinstance(data, (ABCSeries, ABCIntervalIndex)) and is_interval_dtype( - data.dtype - ): - data = data._values # TODO: extract_array? + if isinstance(data, ABCSeries) and is_interval_dtype(data.dtype): + data = data._values - if isinstance(data, cls): - left = data._left - right = data._right + if isinstance(data, (cls, ABCIntervalIndex)): + left = data.left + right = data.right closed = closed or data.closed else: @@ -256,18 +239,6 @@ class IntervalArray(IntervalMixin, ExtensionArray): ) raise ValueError(msg) - # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray - left = ensure_wrapped_if_datetimelike(left) - left = extract_array(left, extract_numpy=True) - right = ensure_wrapped_if_datetimelike(right) - right = extract_array(right, extract_numpy=True) - - lbase = getattr(left, "_ndarray", left).base - rbase = getattr(right, "_ndarray", right).base - if lbase is not None and lbase is rbase: - # If these share data, then setitem could corrupt our IA - right = right.copy() - result._left = left result._right = right result._closed = closed @@ -276,7 +247,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): return result @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars, dtype=dtype, copy=copy) @classmethod @@ -290,40 +261,42 @@ class IntervalArray(IntervalMixin, ExtensionArray): _interval_shared_docs["from_breaks"] = textwrap.dedent( """ - Construct an %(klass)s from an array of splits. + Construct an %(klass)s from an array of splits. - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - copy : bool, default False - Copy the data. - dtype : dtype or None, default None - If None, dtype will be inferred. + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. + dtype : dtype or None, default None + If None, dtype will be inferred. - Returns - ------- - %(klass)s + .. versionadded:: 0.23.0 - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct from a left and right array. 
- %(klass)s.from_tuples : Construct from a sequence of tuples. + Returns + ------- + %(klass)s - %(examples)s\ - """ + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct from a left and right array. + %(klass)s.from_tuples : Construct from a sequence of tuples. + + %(examples)s\ + """ ) @classmethod @Appender( _interval_shared_docs["from_breaks"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ Examples -------- @@ -333,7 +306,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): breaks = maybe_convert_platform_interval(breaks) @@ -358,6 +331,8 @@ class IntervalArray(IntervalMixin, ExtensionArray): dtype : dtype, optional If None, dtype will be inferred. + .. versionadded:: 0.23.0 + Returns ------- %(klass)s @@ -392,9 +367,9 @@ class IntervalArray(IntervalMixin, ExtensionArray): @classmethod @Appender( _interval_shared_docs["from_arrays"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) @@ -402,7 +377,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) @@ -414,42 +389,44 @@ class IntervalArray(IntervalMixin, ExtensionArray): _interval_shared_docs["from_tuples"] = textwrap.dedent( """ - Construct an %(klass)s from an array-like of tuples. + Construct an %(klass)s from an array-like of tuples. - Parameters - ---------- - data : array-like (1-dimensional) - Array of tuples. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - copy : bool, default False - By-default copy the data, this is compat only and ignored. - dtype : dtype or None, default None - If None, dtype will be inferred. + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + By-default copy the data, this is compat only and ignored. + dtype : dtype or None, default None + If None, dtype will be inferred. - Returns - ------- - %(klass)s + .. versionadded:: 0.23.0 - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array. - %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits. + Returns + ------- + %(klass)s - %(examples)s\ - """ + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. 
+ + %(examples)s\ + """ ) @classmethod @Appender( _interval_shared_docs["from_tuples"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ Examples -------- @@ -459,7 +436,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): Length: 2, closed: right, dtype: interval[int64] """ ), - } + ) ) def from_tuples(cls, data, closed="right", copy=False, dtype=None): if len(data): @@ -498,85 +475,95 @@ class IntervalArray(IntervalMixin, ExtensionArray): * left and right have the same missing values * left is always below right """ - if self.closed not in VALID_CLOSED: + if self.closed not in _VALID_CLOSED: msg = f"invalid option for 'closed': {self.closed}" raise ValueError(msg) - if len(self._left) != len(self._right): + if len(self.left) != len(self.right): msg = "left and right must have the same length" raise ValueError(msg) - left_mask = notna(self._left) - right_mask = notna(self._right) + left_mask = notna(self.left) + right_mask = notna(self.right) if not (left_mask == right_mask).all(): msg = ( "missing values must be missing in the same " "location both left and right sides" ) raise ValueError(msg) - if not (self._left[left_mask] <= self._right[left_mask]).all(): + if not (self.left[left_mask] <= self.right[left_mask]).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) - def _shallow_copy(self, left, right): - """ - Return a new IntervalArray with the replacement attributes - - Parameters - ---------- - left : Index - Values to be used for the left-side of the intervals. - right : Index - Values to be used for the right-side of the intervals. - """ - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) - - # --------------------------------------------------------------------- - # Descriptive - - @property - def dtype(self): - return IntervalDtype(self.left.dtype) - - @property - def nbytes(self) -> int: - return self.left.nbytes + self.right.nbytes - - @property - def size(self) -> int: - # Avoid materializing self.values - return self.left.size - - # --------------------------------------------------------------------- - # EA Interface - + # --------- + # Interface + # --------- def __iter__(self): return iter(np.asarray(self)) def __len__(self) -> int: - return len(self._left) + return len(self.left) - def __getitem__(self, key): - key = check_array_indexer(self, key) - left = self._left[key] - right = self._right[key] + def __getitem__(self, value): + value = check_array_indexer(self, value) + left = self.left[value] + right = self.right[value] - if not isinstance(left, (np.ndarray, ExtensionArray)): - # scalar + # scalar + if not isinstance(left, ABCIndexClass): if is_scalar(left) and isna(left): return self._fill_value + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) - if np.ndim(left) > 1: - # GH#30588 multi-dimensional indexer disallowed - raise ValueError("multi-dimensional indexing not allowed") + return self._shallow_copy(left, right) def __setitem__(self, key, value): - value_left, value_right = self._validate_setitem_value(value) + # na value: need special casing to set directly on numpy arrays + needs_float_conversion = False + if is_scalar(value) and isna(value): + if is_integer_dtype(self.dtype.subtype): + # can't set NaN on a numpy integer array + needs_float_conversion = True + elif 
is_datetime64_any_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.datetime64("NaT") + elif is_timedelta64_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.timedelta64("NaT") + value_left, value_right = value, value + + # scalar interval + elif is_interval_dtype(value) or isinstance(value, Interval): + self._check_closed_matches(value, name="value") + value_left, value_right = value.left, value.right + + else: + # list-like of intervals + try: + array = IntervalArray(value) + value_left, value_right = array.left, array.right + except TypeError as err: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." + raise TypeError(msg) from err + + if needs_float_conversion: + raise ValueError("Cannot set float NaN to integer-backed IntervalArray") + key = check_array_indexer(self, key) - self._left[key] = value_left - self._right[key] = value_right + # Need to ensure that left and right are updated atomically, so we're + # forced to copy, update the copy, and swap in the new values. + left = self.left.copy(deep=True) + left._values[key] = value_left + self._left = left - def _cmp_method(self, other, op): + right = self.right.copy(deep=True) + right._values[key] = value_right + self._right = right + + def __eq__(self, other): # ensure pandas array for list-like and eliminate non-interval scalars if is_list_like(other): if len(self) != len(other): @@ -584,7 +571,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): other = array(other) elif not isinstance(other, Interval): # non-interval scalar -> no matches - return invalid_comparison(self, other, op) + return np.zeros(len(self), dtype=bool) # determine the dtype of the elements we want to compare if isinstance(other, Interval): @@ -598,97 +585,33 @@ class IntervalArray(IntervalMixin, ExtensionArray): # extract intervals if we have interval categories with matching closed if is_interval_dtype(other_dtype): if self.closed != other.categories.closed: - return invalid_comparison(self, other, op) - - other = other.categories.take( - other.codes, allow_fill=True, fill_value=other.categories._na_value - ) + return np.zeros(len(self), dtype=bool) + other = other.categories.take(other.codes) # interval-like -> need same closed and matching endpoints if is_interval_dtype(other_dtype): if self.closed != other.closed: - return invalid_comparison(self, other, op) - elif not isinstance(other, Interval): - other = type(self)(other) - - if op is operator.eq: - return (self._left == other.left) & (self._right == other.right) - elif op is operator.ne: - return (self._left != other.left) | (self._right != other.right) - elif op is operator.gt: - return (self._left > other.left) | ( - (self._left == other.left) & (self._right > other.right) - ) - elif op is operator.ge: - return (self == other) | (self > other) - elif op is operator.lt: - return (self._left < other.left) | ( - (self._left == other.left) & (self._right < other.right) - ) - else: - # operator.lt - return (self == other) | (self < other) + return np.zeros(len(self), dtype=bool) + return (self.left == other.left) & (self.right == other.right) # non-interval/non-object dtype -> no matches if not is_object_dtype(other_dtype): - return invalid_comparison(self, other, op) + return np.zeros(len(self), dtype=bool) # object dtype -> iteratively check for intervals result = np.zeros(len(self), dtype=bool) for i, obj in enumerate(other): - try: - result[i] = 
op(self[i], obj) - except TypeError: - if obj is NA: - # comparison with np.nan returns NA - # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092 - result[i] = op is operator.ne - else: - raise + # need object to be an Interval with same closed and endpoints + if ( + isinstance(obj, Interval) + and self.closed == obj.closed + and self.left[i] == obj.left + and self.right[i] == obj.right + ): + result[i] = True + return result - @unpack_zerodim_and_defer("__eq__") - def __eq__(self, other): - return self._cmp_method(other, operator.eq) - - @unpack_zerodim_and_defer("__ne__") - def __ne__(self, other): - return self._cmp_method(other, operator.ne) - - @unpack_zerodim_and_defer("__gt__") - def __gt__(self, other): - return self._cmp_method(other, operator.gt) - - @unpack_zerodim_and_defer("__ge__") - def __ge__(self, other): - return self._cmp_method(other, operator.ge) - - @unpack_zerodim_and_defer("__lt__") - def __lt__(self, other): - return self._cmp_method(other, operator.lt) - - @unpack_zerodim_and_defer("__le__") - def __le__(self, other): - return self._cmp_method(other, operator.le) - - def argsort( - self, - ascending: bool = True, - kind: str = "quicksort", - na_position: str = "last", - *args, - **kwargs, - ) -> np.ndarray: - ascending = nv.validate_argsort_with_ascending(ascending, args, kwargs) - - if ascending and kind == "quicksort" and na_position == "last": - return np.lexsort((self.right, self.left)) - - # TODO: other cases we can use lexsort for? much more performant. - return super().argsort( - ascending=ascending, kind=kind, na_position=na_position, **kwargs - ) - def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -721,12 +644,23 @@ class IntervalArray(IntervalMixin, ExtensionArray): if limit is not None: raise TypeError("limit is not supported for IntervalArray.") - value_left, value_right = self._validate_fill_value(value) + if not isinstance(value, Interval): + msg = ( + "'IntervalArray.fillna' only supports filling with a " + f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." + ) + raise TypeError(msg) - left = self.left.fillna(value=value_left) - right = self.right.fillna(value=value_right) + self._check_closed_matches(value, name="value") + + left = self.left.fillna(value=value.left) + right = self.right.fillna(value=value.right) return self._shallow_copy(left, right) + @property + def dtype(self): + return IntervalDtype(self.left.dtype) + def astype(self, dtype, copy=True): """ Cast to an ExtensionArray or NumPy array with dtype 'dtype'. @@ -746,7 +680,6 @@ class IntervalArray(IntervalMixin, ExtensionArray): array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. 
""" - from pandas import Index from pandas.core.arrays.string_ import StringDtype if dtype is not None: @@ -758,10 +691,8 @@ class IntervalArray(IntervalMixin, ExtensionArray): # need to cast to different subtype try: - # We need to use Index rules for astype to prevent casting - # np.nan entries to int subtypes - new_left = Index(self._left, copy=False).astype(dtype.subtype) - new_right = Index(self._right, copy=False).astype(dtype.subtype) + new_left = self.left.astype(dtype.subtype) + new_right = self.right.astype(dtype.subtype) except TypeError as err: msg = ( f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" @@ -769,7 +700,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): raise TypeError(msg) from err return self._shallow_copy(new_left, new_right) elif is_categorical_dtype(dtype): - return Categorical(np.asarray(self), dtype=dtype) + return Categorical(np.asarray(self)) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) @@ -780,20 +711,8 @@ class IntervalArray(IntervalMixin, ExtensionArray): msg = f"Cannot cast {type(self).__name__} to dtype {dtype}" raise TypeError(msg) from err - def equals(self, other) -> bool: - if type(self) != type(other): - return False - - return bool( - self.closed == other.closed - and self.left.equals(other.left) - and self.right.equals(other.right) - ) - @classmethod - def _concat_same_type( - cls: Type[IntervalArrayT], to_concat: Sequence[IntervalArrayT] - ) -> IntervalArrayT: + def _concat_same_type(cls, to_concat): """ Concatenate multiple IntervalArray @@ -814,7 +733,20 @@ class IntervalArray(IntervalMixin, ExtensionArray): right = np.concatenate([interval.right for interval in to_concat]) return cls._simple_new(left, right, closed=closed, copy=False) - def copy(self: IntervalArrayT) -> IntervalArrayT: + def _shallow_copy(self, left, right): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : Index + Values to be used for the left-side of the intervals. + right : Index + Values to be used for the right-side of the intervals. + """ + return self._simple_new(left, right, closed=self.closed, verify_integrity=False) + + def copy(self): """ Return a copy of the array. @@ -822,14 +754,23 @@ class IntervalArray(IntervalMixin, ExtensionArray): ------- IntervalArray """ - left = self._left.copy() - right = self._right.copy() + left = self.left.copy(deep=True) + right = self.right.copy(deep=True) closed = self.closed # TODO: Could skip verify_integrity here. 
return type(self).from_arrays(left, right, closed=closed) - def isna(self) -> np.ndarray: - return isna(self._left) + def isna(self): + return isna(self.left) + + @property + def nbytes(self) -> int: + return self.left.nbytes + self.right.nbytes + + @property + def size(self) -> int: + # Avoid materializing self.values + return self.left.size def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if not len(self) or periods == 0: @@ -845,9 +786,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): empty_len = min(abs(periods), len(self)) if isna(fill_value): - from pandas import Index - - fill_value = Index(self._left, copy=False)._na_value + fill_value = self.left._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: empty = self._from_sequence([fill_value] * empty_len) @@ -860,7 +799,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): b = empty return self._concat_same_type([a, b]) - def take(self, indices, *, allow_fill=False, fill_value=None, axis=None, **kwargs): + def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. @@ -906,77 +845,32 @@ class IntervalArray(IntervalMixin, ExtensionArray): When `indices` contains negative values other than ``-1`` and `allow_fill` is True. """ - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) fill_left = fill_right = fill_value if allow_fill: - fill_left, fill_right = self._validate_fill_value(fill_value) + if fill_value is None: + fill_left = fill_right = self.left._na_value + elif is_interval(fill_value): + self._check_closed_matches(fill_value, name="fill_value") + fill_left, fill_right = fill_value.left, fill_value.right + elif not is_scalar(fill_value) and notna(fill_value): + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. " + f"Got a '{type(fill_value).__name__}' instead." + ) + raise ValueError(msg) left_take = take( - self._left, indices, allow_fill=allow_fill, fill_value=fill_left + self.left, indices, allow_fill=allow_fill, fill_value=fill_left ) right_take = take( - self._right, indices, allow_fill=allow_fill, fill_value=fill_right + self.right, indices, allow_fill=allow_fill, fill_value=fill_right ) return self._shallow_copy(left_take, right_take) - def _validate_listlike(self, value): - # list-like of intervals - try: - array = IntervalArray(value) - # TODO: self._check_closed_matches(array, name="value") - value_left, value_right = array.left, array.right - except TypeError as err: - # wrong type: not interval or NA - msg = f"'value' should be an interval type, got {type(value)} instead." 
- raise TypeError(msg) from err - return value_left, value_right - - def _validate_scalar(self, value): - if isinstance(value, Interval): - self._check_closed_matches(value, name="value") - left, right = value.left, value.right - elif is_valid_nat_for_dtype(value, self.left.dtype): - # GH#18295 - left = right = value - else: - raise TypeError( - "can only insert Interval objects and NA into an IntervalArray" - ) - return left, right - - def _validate_fill_value(self, value): - return self._validate_scalar(value) - - def _validate_setitem_value(self, value): - needs_float_conversion = False - - if is_valid_nat_for_dtype(value, self.left.dtype): - # na value: need special casing to set directly on numpy arrays - if is_integer_dtype(self.dtype.subtype): - # can't set NaN on a numpy integer array - needs_float_conversion = True - elif is_datetime64_any_dtype(self.dtype.subtype): - # need proper NaT to set directly on the numpy array - value = np.datetime64("NaT") - elif is_timedelta64_dtype(self.dtype.subtype): - # need proper NaT to set directly on the numpy array - value = np.timedelta64("NaT") - value_left, value_right = value, value - - elif is_interval_dtype(value) or isinstance(value, Interval): - # scalar interval - self._check_closed_matches(value, name="value") - value_left, value_right = value.left, value.right - - else: - return self._validate_listlike(value) - - if needs_float_conversion: - raise ValueError("Cannot set float NaN to integer-backed IntervalArray") - return value_left, value_right - def value_counts(self, dropna=True): """ Returns a Series containing counts of each interval. @@ -997,8 +891,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): # TODO: implement this is a non-naive way! return value_counts(np.asarray(self), dropna=dropna) - # --------------------------------------------------------------------- - # Rendering Methods + # Formatting def _format_data(self): @@ -1052,18 +945,13 @@ class IntervalArray(IntervalMixin, ExtensionArray): space = " " * (len(type(self).__name__) + 1) return f"\n{space}" - # --------------------------------------------------------------------- - # Vectorized Interval Properties/Attributes - @property def left(self): """ Return the left endpoints of each Interval in the IntervalArray as an Index. """ - from pandas import Index - - return Index(self._left, copy=False) + return self._left @property def right(self): @@ -1071,9 +959,66 @@ class IntervalArray(IntervalMixin, ExtensionArray): Return the right endpoints of each Interval in the IntervalArray as an Index. """ - from pandas import Index + return self._right - return Index(self._right, copy=False) + @property + def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither. + """ + return self._closed + + _interval_shared_docs["set_closed"] = textwrap.dedent( + """ + Return an %(klass)s identical to the current one, but closed on the + specified side. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. 
+ + Returns + ------- + new_index : %(klass)s + + %(examples)s\ + """ + ) + + @Appender( + _interval_shared_docs["set_closed"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) + >>> index + + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] + >>> index.set_closed('both') + + [[0, 1], [1, 2], [2, 3]] + Length: 3, closed: both, dtype: interval[int64] + """ + ), + ) + ) + def set_closed(self, closed): + if closed not in _VALID_CLOSED: + msg = f"invalid option for 'closed': {closed}" + raise ValueError(msg) + + return type(self)._simple_new( + left=self.left, right=self.right, closed=closed, verify_integrity=False + ) @property def length(self): @@ -1102,6 +1047,202 @@ class IntervalArray(IntervalMixin, ExtensionArray): # datetime safe version return self.left + 0.5 * self.length + _interval_shared_docs[ + "is_non_overlapping_monotonic" + ] = """ + Return True if the %(klass)s is non-overlapping (no Intervals share + points) and is either monotonic increasing or monotonic decreasing, + else False. + """ + + # https://github.com/python/mypy/issues/1362 + # Mypy does not support decorated properties + @property # type: ignore + @Appender( + _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs + ) + def is_non_overlapping_monotonic(self): + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) + # we already require left <= right + + # strict inequality for closed == 'both'; equality implies overlapping + # at a point when both sides of intervals are included + if self.closed == "both": + return bool( + (self.right[:-1] < self.left[1:]).all() + or (self.left[:-1] > self.right[1:]).all() + ) + + # non-strict inequality when closed != 'both'; at least one side is + # not included in the intervals, so equality does not imply overlapping + return bool( + (self.right[:-1] <= self.left[1:]).all() + or (self.left[:-1] >= self.right[1:]).all() + ) + + # Conversion + def __array__(self, dtype=None) -> np.ndarray: + """ + Return the IntervalArray's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self.left + right = self.right + mask = self.isna() + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
+ """ + import pyarrow + + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + try: + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) + except TypeError as err: + raise TypeError( + f"Conversion to arrow with subtype '{self.dtype.subtype}' " + "is not supported" + ) from err + interval_type = ArrowIntervalType(subtype, self.closed) + storage_array = pyarrow.StructArray.from_arrays( + [ + pyarrow.array(self.left, type=subtype, from_pandas=True), + pyarrow.array(self.right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + children=[storage_array.field(0), storage_array.field(1)], + ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + f"different 'subtype' ({self.dtype.subtype} vs {type.subtype}) " + f"and 'closed' ({self.closed} vs {type.closed}) attributes" + ) + else: + raise TypeError( + f"Not supported to convert IntervalArray to '{type}' type" + ) + + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + + _interval_shared_docs[ + "to_tuples" + ] = """ + Return an %(return_type)s of tuples of the form (left, right). + + Parameters + ---------- + na_tuple : bool, default True + Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA + value itself if False, ``nan``. + + .. versionadded:: 0.23.0 + + Returns + ------- + tuples: %(return_type)s + %(examples)s\ + """ + + @Appender( + _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") + ) + def to_tuples(self, na_tuple=True): + tuples = com.asarray_tuplesafe(zip(self.left, self.right)) + if not na_tuple: + # GH 18756 + tuples = np.where(~self.isna(), tuples, np.nan) + return tuples + + @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + left_repeat = self.left.repeat(repeats) + right_repeat = self.right.repeat(repeats) + return self._shallow_copy(left=left_repeat, right=right_repeat) + + _interval_shared_docs["contains"] = textwrap.dedent( + """ + Check elementwise if the Intervals contain the value. + + Return a boolean mask whether the value is contained in the Intervals + of the %(klass)s. + + .. versionadded:: 0.25.0 + + Parameters + ---------- + other : scalar + The value to check whether it is contained in the Intervals. + + Returns + ------- + boolean array + + See Also + -------- + Interval.contains : Check whether Interval object contains value. + %(klass)s.overlaps : Check if an Interval overlaps the values in the + %(klass)s. 
+ + Examples + -------- + %(examples)s + >>> intervals.contains(0.5) + array([ True, False, False]) + """ + ) + + @Appender( + _interval_shared_docs["contains"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + + [(0, 1], (1, 3], (2, 4]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def contains(self, other): + if isinstance(other, Interval): + raise NotImplementedError("contains not implemented for two intervals") + + return (self.left < other if self.open_left else self.left <= other) & ( + other < self.right if self.open_right else other <= self.right + ) + _interval_shared_docs["overlaps"] = textwrap.dedent( """ Check elementwise if an Interval overlaps the values in the %(klass)s. @@ -1146,9 +1287,9 @@ class IntervalArray(IntervalMixin, ExtensionArray): @Appender( _interval_shared_docs["overlaps"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( + % dict( + klass="IntervalArray", + examples=textwrap.dedent( """\ >>> data = [(0, 1), (1, 3), (2, 4)] >>> intervals = pd.arrays.IntervalArray.from_tuples(data) @@ -1158,7 +1299,7 @@ class IntervalArray(IntervalMixin, ExtensionArray): Length: 3, closed: right, dtype: interval[int64] """ ), - } + ) ) def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): @@ -1176,265 +1317,6 @@ class IntervalArray(IntervalMixin, ExtensionArray): # (simplifying the negation allows this to be done in less operations) return op1(self.left, other.right) & op2(other.left, self.right) - # --------------------------------------------------------------------- - - @property - def closed(self): - """ - Whether the intervals are closed on the left-side, right-side, both or - neither. - """ - return self._closed - - _interval_shared_docs["set_closed"] = textwrap.dedent( - """ - Return an %(klass)s identical to the current one, but closed on the - specified side. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - closed : {'left', 'right', 'both', 'neither'} - Whether the intervals are closed on the left-side, right-side, both - or neither. - - Returns - ------- - new_index : %(klass)s - - %(examples)s\ - """ - ) - - @Appender( - _interval_shared_docs["set_closed"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ - Examples - -------- - >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) - >>> index - - [(0, 1], (1, 2], (2, 3]] - Length: 3, closed: right, dtype: interval[int64] - >>> index.set_closed('both') - - [[0, 1], [1, 2], [2, 3]] - Length: 3, closed: both, dtype: interval[int64] - """ - ), - } - ) - def set_closed(self, closed): - if closed not in VALID_CLOSED: - msg = f"invalid option for 'closed': {closed}" - raise ValueError(msg) - - return type(self)._simple_new( - left=self._left, right=self._right, closed=closed, verify_integrity=False - ) - - _interval_shared_docs[ - "is_non_overlapping_monotonic" - ] = """ - Return True if the %(klass)s is non-overlapping (no Intervals share - points) and is either monotonic increasing or monotonic decreasing, - else False. - """ - - # https://github.com/python/mypy/issues/1362 - # Mypy does not support decorated properties - @property # type: ignore[misc] - @Appender( - _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs - ) - def is_non_overlapping_monotonic(self): - # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... 
) - # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) - # we already require left <= right - - # strict inequality for closed == 'both'; equality implies overlapping - # at a point when both sides of intervals are included - if self.closed == "both": - return bool( - (self._right[:-1] < self._left[1:]).all() - or (self._left[:-1] > self._right[1:]).all() - ) - - # non-strict inequality when closed != 'both'; at least one side is - # not included in the intervals, so equality does not imply overlapping - return bool( - (self._right[:-1] <= self._left[1:]).all() - or (self._left[:-1] >= self._right[1:]).all() - ) - - # --------------------------------------------------------------------- - # Conversion - - def __array__(self, dtype=None) -> np.ndarray: - """ - Return the IntervalArray's data as a numpy array of Interval - objects (with dtype='object') - """ - left = self._left - right = self._right - mask = self.isna() - closed = self._closed - - result = np.empty(len(left), dtype=object) - for i in range(len(left)): - if mask[i]: - result[i] = np.nan - else: - result[i] = Interval(left[i], right[i], closed) - return result - - def __arrow_array__(self, type=None): - """ - Convert myself into a pyarrow Array. - """ - import pyarrow - - from pandas.core.arrays._arrow_utils import ArrowIntervalType - - try: - subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) - except TypeError as err: - raise TypeError( - f"Conversion to arrow with subtype '{self.dtype.subtype}' " - "is not supported" - ) from err - interval_type = ArrowIntervalType(subtype, self.closed) - storage_array = pyarrow.StructArray.from_arrays( - [ - pyarrow.array(self._left, type=subtype, from_pandas=True), - pyarrow.array(self._right, type=subtype, from_pandas=True), - ], - names=["left", "right"], - ) - mask = self.isna() - if mask.any(): - # if there are missing values, set validity bitmap also on the array level - null_bitmap = pyarrow.array(~mask).buffers()[1] - storage_array = pyarrow.StructArray.from_buffers( - storage_array.type, - len(storage_array), - [null_bitmap], - children=[storage_array.field(0), storage_array.field(1)], - ) - - if type is not None: - if type.equals(interval_type.storage_type): - return storage_array - elif isinstance(type, ArrowIntervalType): - # ensure we have the same subtype and closed attributes - if not type.equals(interval_type): - raise TypeError( - "Not supported to convert IntervalArray to type with " - f"different 'subtype' ({self.dtype.subtype} vs {type.subtype}) " - f"and 'closed' ({self.closed} vs {type.closed}) attributes" - ) - else: - raise TypeError( - f"Not supported to convert IntervalArray to '{type}' type" - ) - - return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) - - _interval_shared_docs[ - "to_tuples" - ] = """ - Return an %(return_type)s of tuples of the form (left, right). - - Parameters - ---------- - na_tuple : bool, default True - Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA - value itself if False, ``nan``. 
- - Returns - ------- - tuples: %(return_type)s - %(examples)s\ - """ - - @Appender( - _interval_shared_docs["to_tuples"] % {"return_type": "ndarray", "examples": ""} - ) - def to_tuples(self, na_tuple=True): - tuples = com.asarray_tuplesafe(zip(self._left, self._right)) - if not na_tuple: - # GH 18756 - tuples = np.where(~self.isna(), tuples, np.nan) - return tuples - - # --------------------------------------------------------------------- - - @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) - def repeat(self, repeats, axis=None): - nv.validate_repeat((), {"axis": axis}) - left_repeat = self.left.repeat(repeats) - right_repeat = self.right.repeat(repeats) - return self._shallow_copy(left=left_repeat, right=right_repeat) - - _interval_shared_docs["contains"] = textwrap.dedent( - """ - Check elementwise if the Intervals contain the value. - - Return a boolean mask whether the value is contained in the Intervals - of the %(klass)s. - - .. versionadded:: 0.25.0 - - Parameters - ---------- - other : scalar - The value to check whether it is contained in the Intervals. - - Returns - ------- - boolean array - - See Also - -------- - Interval.contains : Check whether Interval object contains value. - %(klass)s.overlaps : Check if an Interval overlaps the values in the - %(klass)s. - - Examples - -------- - %(examples)s - >>> intervals.contains(0.5) - array([ True, False, False]) - """ - ) - - @Appender( - _interval_shared_docs["contains"] - % { - "klass": "IntervalArray", - "examples": textwrap.dedent( - """\ - >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - - [(0, 1], (1, 3], (2, 4]] - Length: 3, closed: right, dtype: interval[int64] - """ - ), - } - ) - def contains(self, other): - if isinstance(other, Interval): - raise NotImplementedError("contains not implemented for two intervals") - - return (self._left < other if self.open_left else self._left <= other) & ( - other < self._right if self.open_right else other <= self._right - ) - def maybe_convert_platform_interval(values): """ diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/masked.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/masked.py index caed932..235840d 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/masked.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/masked.py @@ -1,13 +1,11 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union +from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar import numpy as np from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly, doc +from pandas.util._decorators import doc from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( @@ -19,10 +17,9 @@ from pandas.core.dtypes.common import ( from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops -from pandas.core.algorithms import factorize_array, take +from pandas.core.algorithms import _factorize_array, take from pandas.core.array_algos import masked_reductions -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.indexers import check_array_indexer if TYPE_CHECKING: @@ -37,28 +34,14 @@ class BaseMaskedDtype(ExtensionDtype): Base class for 
dtypes for BasedMaskedArray subclasses. """ - name: str - base = None - type: Type - na_value = libmissing.NA - @cache_readonly + @property def numpy_dtype(self) -> np.dtype: - """ Return an instance of our numpy dtype """ - return np.dtype(self.type) - - @cache_readonly - def kind(self) -> str: - return self.numpy_dtype.kind - - @cache_readonly - def itemsize(self) -> int: - """ Return the number of bytes in this dtype """ - return self.numpy_dtype.itemsize + raise AbstractMethodError @classmethod - def construct_array_type(cls) -> Type[BaseMaskedArray]: + def construct_array_type(cls) -> Type["BaseMaskedArray"]: """ Return the array type associated with this dtype. @@ -69,7 +52,7 @@ class BaseMaskedDtype(ExtensionDtype): raise NotImplementedError -class BaseMaskedArray(OpsMixin, ExtensionArray): +class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): """ Base class for masked arrays (which use _data and _mask to store the data). @@ -86,9 +69,9 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): "mask should be boolean numpy array. Use " "the 'pd.array' function instead" ) - if values.ndim != 1: + if not values.ndim == 1: raise ValueError("values must be a 1D array") - if mask.ndim != 1: + if not mask.ndim == 1: raise ValueError("mask must be a 1D array") if copy: @@ -102,9 +85,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): def dtype(self) -> BaseMaskedDtype: raise AbstractMethodError(self) - def __getitem__( - self, item: Union[int, slice, np.ndarray] - ) -> Union[BaseMaskedArray, Any]: + def __getitem__(self, item): if is_integer(item): if self._mask[item]: return self.dtype.na_value @@ -145,7 +126,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): return type(self)(~self._data, self._mask) def to_numpy( - self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default + self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default, ) -> np.ndarray: """ Convert to a NumPy Array. 
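Reviewer note: the to_numpy signature touched in the hunk above is easiest to sanity-check through the public nullable arrays, which subclass BaseMaskedArray. A minimal sketch, assuming numpy and this vendored pandas import cleanly (values below are illustrative only):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")    # IntegerArray, a BaseMaskedArray subclass
    # na_value controls what the masked slot becomes in the resulting plain ndarray
    out = arr.to_numpy(dtype="float64", na_value=np.nan)
    print(out)                                     # [ 1.  2. nan]
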
@@ -213,8 +194,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): dtype = object if self._hasna: if ( - not is_object_dtype(dtype) - and not is_string_dtype(dtype) + not (is_object_dtype(dtype) or is_string_dtype(dtype)) and na_value is libmissing.NA ): raise ValueError( @@ -265,9 +245,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): return self._data.nbytes + self._mask.nbytes @classmethod - def _concat_same_type( - cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT] - ) -> BaseMaskedArrayT: + def _concat_same_type(cls: Type[BaseMaskedArrayT], to_concat) -> BaseMaskedArrayT: data = np.concatenate([x._data for x in to_concat]) mask = np.concatenate([x._mask for x in to_concat]) return cls(data, mask) @@ -275,7 +253,6 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): def take( self: BaseMaskedArrayT, indexer, - *, allow_fill: bool = False, fill_value: Optional[Scalar] = None, ) -> BaseMaskedArrayT: @@ -310,7 +287,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): arr = self._data mask = self._mask - codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) @@ -364,7 +341,7 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): return Series(counts, index=index) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/numeric.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/numeric.py deleted file mode 100644 index 5447a84..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/numeric.py +++ /dev/null @@ -1,92 +0,0 @@ -import datetime - -import numpy as np - -from pandas._libs import Timedelta, missing as libmissing -from pandas.errors import AbstractMethodError - -from pandas.core.dtypes.common import ( - is_float, - is_float_dtype, - is_integer, - is_integer_dtype, - is_list_like, -) - -from .masked import BaseMaskedArray - - -class NumericArray(BaseMaskedArray): - """ - Base class for IntegerArray and FloatingArray. - """ - - def _maybe_mask_result(self, result, mask, other, op_name: str): - raise AbstractMethodError(self) - - def _arith_method(self, other, op): - op_name = op.__name__ - omask = None - - if getattr(other, "ndim", 0) > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - - if isinstance(other, NumericArray): - other, omask = other._data, other._mask - - elif is_list_like(other): - other = np.asarray(other) - if other.ndim > 1: - raise NotImplementedError("can only perform ops with 1-d structures") - if len(self) != len(other): - raise ValueError("Lengths must match") - if not (is_float_dtype(other) or is_integer_dtype(other)): - raise TypeError("can only perform ops with numeric values") - - elif isinstance(other, (datetime.timedelta, np.timedelta64)): - other = Timedelta(other) - - else: - if not (is_float(other) or is_integer(other) or other is libmissing.NA): - raise TypeError("can only perform ops with numeric values") - - if omask is None: - mask = self._mask.copy() - if other is libmissing.NA: - mask |= True - else: - mask = self._mask | omask - - if op_name == "pow": - # 1 ** x is 1. - mask = np.where((self._data == 1) & ~self._mask, False, mask) - # x ** 0 is 1. 
- if omask is not None: - mask = np.where((other == 0) & ~omask, False, mask) - elif other is not libmissing.NA: - mask = np.where(other == 0, False, mask) - - elif op_name == "rpow": - # 1 ** x is 1. - if omask is not None: - mask = np.where((other == 1) & ~omask, False, mask) - elif other is not libmissing.NA: - mask = np.where(other == 1, False, mask) - # x ** 0 is 1. - mask = np.where((self._data == 0) & ~self._mask, False, mask) - - if other is libmissing.NA: - result = np.ones_like(self._data) - else: - with np.errstate(all="ignore"): - result = op(self._data, other) - - # divmod returns a tuple - if op_name == "divmod": - div, mod = result - return ( - self._maybe_mask_result(div, mask, other, "floordiv"), - self._maybe_mask_result(mod, mask, other, "mod"), - ) - - return self._maybe_mask_result(result, mask, other, op_name) diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/numpy_.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/numpy_.py index 50d1270..f6dfb1f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/numpy_.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/numpy_.py @@ -1,5 +1,5 @@ import numbers -from typing import Tuple, Type, Union +from typing import Optional, Tuple, Type, Union import numpy as np from numpy.lib.mixins import NDArrayOperatorsMixin @@ -7,14 +7,23 @@ from numpy.lib.mixins import NDArrayOperatorsMixin from pandas._libs import lib from pandas._typing import Scalar from pandas.compat.numpy import function as nv +from pandas.util._decorators import doc +from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna -from pandas.core import nanops, ops -from pandas.core.arraylike import OpsMixin +from pandas import compat +from pandas.core import nanops +from pandas.core.algorithms import searchsorted +from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.strings.object_array import ObjectStringArrayMixin +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core.construction import extract_array +from pandas.core.indexers import check_array_indexer +from pandas.core.missing import backfill_1d, pad_1d class PandasDtype(ExtensionDtype): @@ -113,10 +122,7 @@ class PandasDtype(ExtensionDtype): class PandasArray( - OpsMixin, - NDArrayBackedExtensionArray, - NDArrayOperatorsMixin, - ObjectStringArrayMixin, + NDArrayBackedExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin ): """ A pandas ExtensionArray for NumPy data. @@ -144,7 +150,7 @@ class PandasArray( # If you're wondering why pd.Series(cls) doesn't put the array in an # ExtensionBlock, search for `ABCPandasArray`. We check for - # that _typ to ensure that users don't unnecessarily use EAs inside + # that _typ to ensure that that users don't unnecessarily use EAs inside # pandas internals, which turns off things like block consolidation. 
_typ = "npy_extension" __array_priority__ = 1000 @@ -171,9 +177,7 @@ class PandasArray( self._dtype = PandasDtype(values.dtype) @classmethod - def _from_sequence( - cls, scalars, *, dtype=None, copy: bool = False - ) -> "PandasArray": + def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "PandasArray": if isinstance(dtype, PandasDtype): dtype = dtype._dtype @@ -186,6 +190,10 @@ class PandasArray( def _from_factorized(cls, values, original) -> "PandasArray": return cls(values) + @classmethod + def _concat_same_type(cls, to_concat) -> "PandasArray": + return cls(np.concatenate(to_concat)) + def _from_backing_data(self, arr: np.ndarray) -> "PandasArray": return type(self)(arr) @@ -218,16 +226,6 @@ class PandasArray( if not isinstance(x, self._HANDLED_TYPES + (PandasArray,)): return NotImplemented - if ufunc not in [np.logical_or, np.bitwise_or, np.bitwise_xor]: - # For binary ops, use our custom dunder methods - # We haven't implemented logical dunder funcs, so exclude these - # to avoid RecursionError - result = ops.maybe_dispatch_ufunc_to_dunder_op( - self, ufunc, method, *inputs, **kwargs - ) - if result is not NotImplemented: - return result - # Defer to the implementation of the ufunc on unwrapped values. inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs) if out: @@ -257,113 +255,158 @@ class PandasArray( # ------------------------------------------------------------------------ # Pandas ExtensionArray Interface + def __getitem__(self, item): + if isinstance(item, type(self)): + item = item._ndarray + + item = check_array_indexer(self, item) + + result = self._ndarray[item] + if not lib.is_scalar(item): + result = type(self)(result) + return result + + def __setitem__(self, key, value) -> None: + value = extract_array(value, extract_numpy=True) + + key = check_array_indexer(self, key) + scalar_value = lib.is_scalar(value) + + if not scalar_value: + value = np.asarray(value, dtype=self._ndarray.dtype) + + self._ndarray[key] = value + def isna(self) -> np.ndarray: return isna(self._ndarray) + def fillna( + self, value=None, method: Optional[str] = None, limit: Optional[int] = None, + ) -> "PandasArray": + # TODO(_values_for_fillna): remove this + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if is_array_like(value): + if len(value) != len(self): + raise ValueError( + f"Length of 'value' does not match. 
Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == "pad" else backfill_1d + new_values = func(self._ndarray, limit=limit, mask=mask) + new_values = self._from_sequence(new_values, dtype=self.dtype) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + def _validate_fill_value(self, fill_value): if fill_value is None: # Primarily for subclasses fill_value = self.dtype.na_value return fill_value + def _values_for_argsort(self) -> np.ndarray: + return self._ndarray + def _values_for_factorize(self) -> Tuple[np.ndarray, int]: return self._ndarray, -1 # ------------------------------------------------------------------------ # Reductions - def any(self, *, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_any((), {"out": out, "keepdims": keepdims}) - result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + def _reduce(self, name, skipna=True, **kwargs): + meth = getattr(self, name, None) + if meth: + return meth(skipna=skipna, **kwargs) + else: + msg = f"'{type(self).__name__}' does not implement reduction '{name}'" + raise TypeError(msg) - def all(self, *, axis=None, out=None, keepdims=False, skipna=True): - nv.validate_all((), {"out": out, "keepdims": keepdims}) - result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + def any(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_any((), dict(out=out, keepdims=keepdims)) + return nanops.nanany(self._ndarray, axis=axis, skipna=skipna) - def min(self, *, axis=None, skipna: bool = True, **kwargs) -> Scalar: + def all(self, axis=None, out=None, keepdims=False, skipna=True): + nv.validate_all((), dict(out=out, keepdims=keepdims)) + return nanops.nanall(self._ndarray, axis=axis, skipna=skipna) + + def min(self, skipna: bool = True, **kwargs) -> Scalar: nv.validate_min((), kwargs) - result = nanops.nanmin( - values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna + result = masked_reductions.min( + values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return self._wrap_reduction_result(axis, result) + return result - def max(self, *, axis=None, skipna: bool = True, **kwargs) -> Scalar: + def max(self, skipna: bool = True, **kwargs) -> Scalar: nv.validate_max((), kwargs) - result = nanops.nanmax( - values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna + result = masked_reductions.max( + values=self.to_numpy(), mask=self.isna(), skipna=skipna ) - return self._wrap_reduction_result(axis, result) + return result - def sum(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: + def sum(self, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_sum((), kwargs) - result = nanops.nansum( + return nanops.nansum( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) - return self._wrap_reduction_result(axis, result) - def prod(self, *, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: + def prod(self, axis=None, skipna=True, min_count=0, **kwargs) -> Scalar: nv.validate_prod((), kwargs) - result = nanops.nanprod( + return nanops.nanprod( self._ndarray, axis=axis, skipna=skipna, min_count=min_count ) - return self._wrap_reduction_result(axis, result) - def mean(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): - nv.validate_mean((), 
{"dtype": dtype, "out": out, "keepdims": keepdims}) - result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + def mean(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + nv.validate_mean((), dict(dtype=dtype, out=out, keepdims=keepdims)) + return nanops.nanmean(self._ndarray, axis=axis, skipna=skipna) def median( - self, *, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True + self, axis=None, out=None, overwrite_input=False, keepdims=False, skipna=True ): nv.validate_median( - (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims} + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) ) - result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + return nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna) - def std( - self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True - ): + def std(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" ) - result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - return self._wrap_reduction_result(axis, result) + return nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - def var( - self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True - ): + def var(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="var" ) - result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - return self._wrap_reduction_result(axis, result) + return nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - def sem( - self, *, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True - ): + def sem(self, axis=None, dtype=None, out=None, ddof=1, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="sem" ) - result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - return self._wrap_reduction_result(axis, result) + return nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - def kurt(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + def kurt(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="kurt" ) - result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + return nanops.nankurt(self._ndarray, axis=axis, skipna=skipna) - def skew(self, *, axis=None, dtype=None, out=None, keepdims=False, skipna=True): + def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="skew" ) - result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) - return self._wrap_reduction_result(axis, result) + return 
nanops.nanskew(self._ndarray, axis=axis, skipna=skipna) # ------------------------------------------------------------------------ # Additional Methods @@ -381,44 +424,38 @@ class PandasArray( return result + @doc(ExtensionArray.searchsorted) + def searchsorted(self, value, side="left", sorter=None): + return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) + # ------------------------------------------------------------------------ # Ops def __invert__(self): return type(self)(~self._ndarray) - def _cmp_method(self, other, op): - if isinstance(other, PandasArray): - other = other._ndarray + @classmethod + def _create_arithmetic_method(cls, op): + def arithmetic_method(self, other): + if isinstance(other, (ABCIndexClass, ABCSeries)): + return NotImplemented - pd_op = ops.get_array_op(op) - result = pd_op(self._ndarray, other) + elif isinstance(other, cls): + other = other._ndarray - if op is divmod or op is ops.rdivmod: - a, b = result - if isinstance(a, np.ndarray): - # for e.g. op vs TimedeltaArray, we may already - # have an ExtensionArray, in which case we do not wrap - return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b) - return a, b + with np.errstate(all="ignore"): + result = op(self._ndarray, other) - if isinstance(result, np.ndarray): - # for e.g. multiplication vs TimedeltaArray, we may already - # have an ExtensionArray, in which case we do not wrap - return self._wrap_ndarray_result(result) - return result + if op is divmod: + a, b = result + return cls(a), cls(b) - _arith_method = _cmp_method + return cls(result) - def _wrap_ndarray_result(self, result: np.ndarray): - # If we have timedelta64[ns] result, return a TimedeltaArray instead - # of a PandasArray - if result.dtype == "timedelta64[ns]": - from pandas.core.arrays import TimedeltaArray + return compat.set_function_name(arithmetic_method, f"__{op.__name__}__", cls) - return TimedeltaArray._simple_new(result) - return type(self)(result) + _create_comparison_method = _create_arithmetic_method - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = np.nan + +PandasArray._add_arithmetic_ops() +PandasArray._add_comparison_ops() diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/period.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/period.py index 7b0e4ce..4d117a3 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/period.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/period.py @@ -33,7 +33,6 @@ from pandas.core.dtypes.common import ( TD64NS_DTYPE, ensure_object, is_datetime64_dtype, - is_dtype_equal, is_float_dtype, is_period_dtype, pandas_dtype, @@ -49,6 +48,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com @@ -63,13 +63,11 @@ def _field_accessor(name: str, docstring=None): return property(f) -class PeriodArray(PeriodMixin, dtl.DatelikeOps): +class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. - Users should use :func:`period_range` to create new instances. - Alternatively, :func:`array` can be used to create new instances - from a sequence of Period scalars. + Users should use :func:`period_array` to create new instances. 
Parameters ---------- @@ -78,14 +76,14 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): converted to ordinals without inference or copy (PeriodArray, ndarray[int64]), or a box around such an array (Series[period], PeriodIndex). - dtype : PeriodDtype, optional - A PeriodDtype instance from which to extract a `freq`. If both - `freq` and `dtype` are specified, then the frequencies must match. freq : str or DateOffset The `freq` to use for the array. Mostly applicable when `values` is an ndarray of integers, when `freq` is required. When `values` is a PeriodArray (or box around), it's checked that ``values.freq`` matches `freq`. + dtype : PeriodDtype, optional + A PeriodDtype instance from which to extract a `freq`. If both + `freq` and `dtype` are specified, then the frequencies must match. copy : bool, default False Whether to copy the ordinals before storing. @@ -99,10 +97,8 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): See Also -------- - Period: Represents a period of time. + period_array : Create a new PeriodArray. PeriodIndex : Immutable Index for period data. - period_range: Create a fixed-frequency PeriodArray. - array: Construct a pandas array. Notes ----- @@ -124,7 +120,6 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): _scalar_type = Period _recognized_scalars = (Period,) _is_recognized_dtype = is_period_dtype - _infer_matches = ("period",) # Names others delegate to us _other_ops: List[str] = [] @@ -141,9 +136,7 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): "weekday", "week", "dayofweek", - "day_of_week", "dayofyear", - "day_of_year", "quarter", "qyear", "days_in_month", @@ -155,7 +148,7 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): # -------------------------------------------------------------------- # Constructors - def __init__(self, values, dtype=None, freq=None, copy=False): + def __init__(self, values, freq=None, dtype=None, copy=False): freq = validate_dtype_freq(dtype, freq) if freq is not None: @@ -181,19 +174,16 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new( - cls, values: np.ndarray, freq: Optional[BaseOffset] = None, dtype=None - ) -> "PeriodArray": + def _simple_new(cls, values: np.ndarray, freq=None, **kwargs) -> "PeriodArray": # alias for PeriodArray.__init__ assertion_msg = "Should be numpy array of type i8" assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg - return cls(values, freq=freq, dtype=dtype) + return cls(values, freq=freq, **kwargs) @classmethod def _from_sequence( cls: Type["PeriodArray"], scalars: Union[Sequence[Optional[Period]], AnyArrayLike], - *, dtype: Optional[PeriodDtype] = None, copy: bool = False, ) -> "PeriodArray": @@ -209,6 +199,8 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): return scalars periods = np.asarray(scalars, dtype=object) + if copy: + periods = periods.copy() freq = freq or libperiod.extract_freq(periods) ordinals = libperiod.extract_ordinals(periods, freq) @@ -216,9 +208,9 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype=None, copy=False + cls, strings, dtype=None, copy=False ) -> "PeriodArray": - return cls._from_sequence(strings, dtype=dtype, copy=copy) + return cls._from_sequence(strings, dtype, copy) @classmethod def _from_datetime64(cls, data, freq, tz=None) -> "PeriodArray": @@ -262,14 +254,12 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): # ----------------------------------------------------------------- # DatetimeLike 
Interface - def _unbox_scalar( - self, value: Union[Period, NaTType], setitem: bool = False - ) -> int: + def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: if value is NaT: - return np.int64(value.value) + return value.value elif isinstance(value, self._scalar_type): - self._check_compatible_with(value, setitem=setitem) - return np.int64(value.ordinal) + self._check_compatible_with(value) + return value.ordinal else: raise ValueError(f"'value' should be a Period. Got '{value}' instead.") @@ -289,8 +279,8 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): def dtype(self) -> PeriodDtype: return self._dtype - # error: Read-only property cannot override read-write property - @property # type: ignore[misc] + # error: Read-only property cannot override read-write property [misc] + @property # type: ignore def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. @@ -379,13 +369,12 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): """, ) week = weekofyear - day_of_week = _field_accessor( - "day_of_week", + dayofweek = _field_accessor( + "weekday", """ The day of the week with Monday=0, Sunday=6. """, ) - dayofweek = day_of_week weekday = dayofweek dayofyear = day_of_year = _field_accessor( "day_of_year", @@ -492,8 +481,9 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): values[self._isnan] = iNaT return type(self)(values, freq=self.freq) - def _box_func(self, x) -> Union[Period, NaTType]: - return Period._from_ordinal(ordinal=x, freq=self.freq) + @property + def _box_func(self): + return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) def asfreq(self, freq=None, how: str = "E") -> "PeriodArray": """ @@ -588,22 +578,11 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): # We handle Period[T] -> Period[U] # Our parent handles everything else. 
dtype = pandas_dtype(dtype) - if is_dtype_equal(dtype, self._dtype): - if not copy: - return self - else: - return self.copy() + if is_period_dtype(dtype): return self.asfreq(dtype.freq) return super().astype(dtype, copy=copy) - def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: - value = self._validate_searchsorted_value(value).view("M8[ns]") - - # Cast to M8 to get datetime-like NaT placement - m8arr = self._ndarray.view("M8[ns]") - return m8arr.searchsorted(value, side=side, sorter=sorter) - # ------------------------------------------------------------------ # Arithmetic Methods @@ -651,12 +630,12 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): new_values = np.array([self.freq.base * x for x in new_values]) if self._hasnans or other._hasnans: - mask = self._isnan | other._isnan + mask = (self._isnan) | (other._isnan) new_values[mask] = NaT return new_values def _addsub_int_array( - self, other: np.ndarray, op: Callable[[Any, Any], Any] + self, other: np.ndarray, op: Callable[[Any, Any], Any], ) -> "PeriodArray": """ Add or subtract array of integers; equivalent to applying @@ -676,7 +655,7 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): other = -other res_values = algos.checked_add_with_arr(self.asi8, other, arr_mask=self._isnan) res_values = res_values.view("i8") - np.putmask(res_values, self._isnan, iNaT) + res_values[self._isnan] = iNaT return type(self)(res_values, freq=self.freq) def _add_offset(self, other: BaseOffset): @@ -788,6 +767,9 @@ class PeriodArray(PeriodMixin, dtl.DatelikeOps): raise raise_on_incompatible(self, other) + def factorize(self, na_sentinel=-1): + return ExtensionArray.factorize(self, na_sentinel=na_sentinel) + def raise_on_incompatible(left, right): """ @@ -885,7 +867,7 @@ def period_array( if is_datetime64_dtype(data_dtype): return PeriodArray._from_datetime64(data, freq) if is_period_dtype(data_dtype): - return PeriodArray(data, freq=freq) + return PeriodArray(data, freq) # other iterable of some kind if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): @@ -1082,9 +1064,11 @@ def _make_field_arrays(*fields): elif length is None: length = len(x) - return [ + arrays = [ np.asarray(x) if isinstance(x, (np.ndarray, list, ABCSeries)) else np.repeat(x, length) for x in fields ] + + return arrays diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/__init__.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/__init__.py index e9ff4b7..e928db4 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/__init__.py @@ -5,6 +5,6 @@ from pandas.core.arrays.sparse.array import ( BlockIndex, IntIndex, SparseArray, - make_sparse_index, + _make_index, ) from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/accessor.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/accessor.py index ec4b0fd..da8d695 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/accessor.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/accessor.py @@ -88,9 +88,9 @@ class SparseAccessor(BaseAccessor, PandasDelegate): dtype: Sparse[float64, nan] """ from pandas import Series - from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series - result = coo_to_sparse_series(A, dense_index=dense_index) + result = _coo_to_sparse_series(A, 
dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) return result @@ -168,9 +168,9 @@ class SparseAccessor(BaseAccessor, PandasDelegate): >>> columns [('a', 0), ('a', 1), ('b', 0), ('b', 1)] """ - from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo + from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo - A, rows, columns = sparse_series_to_coo( + A, rows, columns = _sparse_series_to_coo( self._parent, row_levels, column_levels, sort_labels=sort_labels ) return A, rows, columns diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/array.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/array.py index b8375af..1531f7b 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/array.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/array.py @@ -4,7 +4,7 @@ SparseArray data structure from collections import abc import numbers import operator -from typing import Any, Callable, Sequence, Type, TypeVar, Union +from typing import Any, Callable, Union import warnings import numpy as np @@ -14,6 +14,7 @@ import pandas._libs.sparse as splib from pandas._libs.sparse import BlockIndex, IntIndex, SparseIndex from pandas._libs.tslibs import NaT from pandas._typing import Scalar +import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning @@ -22,7 +23,6 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, find_common_type, infer_dtype_from_scalar, - maybe_box_datetimelike, ) from pandas.core.dtypes.common import ( is_array_like, @@ -40,8 +40,7 @@ from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.algorithms as algos -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com @@ -50,15 +49,15 @@ from pandas.core.indexers import check_array_indexer from pandas.core.missing import interpolate_2d from pandas.core.nanops import check_below_min_count import pandas.core.ops as ops +from pandas.core.ops.common import unpack_zerodim_and_defer import pandas.io.formats.printing as printing # ---------------------------------------------------------------------------- # Array -SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray") -_sparray_doc_kwargs = {"klass": "SparseArray"} +_sparray_doc_kwargs = dict(klass="SparseArray") def _get_fill(arr: "SparseArray") -> np.ndarray: @@ -196,7 +195,7 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): ) -class SparseArray(OpsMixin, PandasObject, ExtensionArray): +class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): """ An ExtensionArray for storing sparse data. 
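Reviewer note: the SparseArray constructor and indexing hunks that follow are easier to read against a concrete array. A minimal sketch, assuming this vendored pandas imports cleanly:

    import pandas as pd

    sp = pd.arrays.SparseArray([0, 0, 1, 2, 0])   # fill_value should be inferred as 0 for int data
    print(sp.sp_values)                           # [1 2]  -- only non-fill points are stored
    print(sp.density)                             # 0.4    -- sp_index.npoints / sp_index.length
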
@@ -272,7 +271,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ _subtyp = "sparse_array" # register ABCSparseArray - _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"]) + _deprecations = PandasObject._deprecations | frozenset(["get_values"]) _sparse_index: SparseIndex def __init__( @@ -318,8 +317,9 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): raise Exception("must only pass scalars with an index") if is_scalar(data): - if index is not None and data is None: - data = np.nan + if index is not None: + if data is None: + data = np.nan if index is not None: npoints = len(index) @@ -398,11 +398,8 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): @classmethod def _simple_new( - cls: Type[SparseArrayT], - sparse_array: np.ndarray, - sparse_index: SparseIndex, - dtype: SparseDtype, - ) -> SparseArrayT: + cls, sparse_array: np.ndarray, sparse_index: SparseIndex, dtype: SparseDtype + ) -> "SparseArray": new = object.__new__(cls) new._sparse_index = sparse_index new._sparse_values = sparse_array @@ -455,7 +452,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): return cls._simple_new(arr, index, dtype) - def __array__(self, dtype=None) -> np.ndarray: + def __array__(self, dtype=None, copy=True) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -488,7 +485,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): raise TypeError(msg) @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): return cls(scalars, dtype=dtype) @classmethod @@ -579,7 +576,8 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): >>> s.density 0.6 """ - return float(self.sp_index.npoints) / float(self.sp_index.length) + r = float(self.sp_index.npoints) / float(self.sp_index.length) + return r @property def npoints(self) -> int: @@ -737,25 +735,35 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): """ from pandas import Index, Series - keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps - if fcounts > 0 and (not self._null_fill_value or not dropna): - mask = isna(keys) if self._null_fill_value else keys == self.fill_value - if mask.any(): - counts[mask] += fcounts + if fcounts > 0: + if self._null_fill_value and dropna: + pass else: - keys = np.insert(keys, 0, self.fill_value) - counts = np.insert(counts, 0, fcounts) + if self._null_fill_value: + mask = isna(keys) + else: + mask = keys == self.fill_value + + if mask.any(): + counts[mask] += fcounts + else: + keys = np.insert(keys, 0, self.fill_value) + counts = np.insert(counts, 0, fcounts) if not isinstance(keys, ABCIndexClass): keys = Index(keys) - return Series(counts, index=keys) + result = Series(counts, index=keys) + return result # -------- # Indexing # -------- def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer if isinstance(key, tuple): if len(key) > 1: @@ -788,6 +796,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): key = check_array_indexer(self, key) if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): @@ -810,10 +819,10 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): return self.fill_value else: val = 
self.sp_values[sp_loc] - val = maybe_box_datetimelike(val, self.sp_values.dtype) + val = com.maybe_box_datetimelike(val, self.sp_values.dtype) return val - def take(self, indices, *, allow_fill=False, fill_value=None) -> "SparseArray": + def take(self, indices, allow_fill=False, fill_value=None) -> "SparseArray": if is_scalar(indices): raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.") indices = np.asarray(indices, dtype=np.int32) @@ -941,14 +950,12 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): v = np.asarray(v) return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter) - def copy(self: SparseArrayT) -> SparseArrayT: + def copy(self): values = self.sp_values.copy() return self._simple_new(values, self.sp_index, self.dtype) @classmethod - def _concat_same_type( - cls: Type[SparseArrayT], to_concat: Sequence[SparseArrayT] - ) -> SparseArrayT: + def _concat_same_type(cls, to_concat): fill_value = to_concat[0].fill_value values = [] @@ -979,7 +986,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): # get an identical index as concating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting - # BlockIndex may have more blocks. + # BlockIndex may have more blocs. blengths = [] blocs = [] @@ -1056,11 +1063,6 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): IntIndex Indices: array([2, 3], dtype=int32) """ - if is_dtype_equal(dtype, self._dtype): - if not copy: - return self - else: - return self.copy() dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str # TODO copy=False is broken for astype_nansafe with int -> float, so cannot @@ -1162,7 +1164,7 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): # Reductions # ------------------------------------------------------------------------ - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): method = getattr(self, name, None) if method is None: @@ -1307,6 +1309,19 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): nsparse = self.sp_index.ngaps return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + def transpose(self, *axes) -> "SparseArray": + """ + Returns the SparseArray. + """ + return self + + @property + def T(self) -> "SparseArray": + """ + Returns the SparseArray. 
+ """ + return self + # ------------------------------------------------------------------------ # Ufuncs # ------------------------------------------------------------------------ @@ -1370,82 +1385,110 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): # Ops # ------------------------------------------------------------------------ - def _arith_method(self, other, op): + @classmethod + def _create_unary_method(cls, op) -> Callable[["SparseArray"], "SparseArray"]: + def sparse_unary_method(self) -> "SparseArray": + fill_value = op(np.array(self.fill_value)).item() + values = op(self.sp_values) + dtype = SparseDtype(values.dtype, fill_value) + return cls._simple_new(values, self.sp_index, dtype) + + name = f"__{op.__name__}__" + return compat.set_function_name(sparse_unary_method, name, cls) + + @classmethod + def _create_arithmetic_method(cls, op): op_name = op.__name__ - if isinstance(other, SparseArray): - return _sparse_array_op(self, other, op, op_name) + @unpack_zerodim_and_defer(op_name) + def sparse_arithmetic_method(self, other): - elif is_scalar(other): - with np.errstate(all="ignore"): - fill = op(_get_fill(self), np.asarray(other)) - result = op(self.sp_values, other) + if isinstance(other, SparseArray): + return _sparse_array_op(self, other, op, op_name) - if op_name == "divmod": - left, right = result - lfill, rfill = fill - return ( - _wrap_result(op_name, left, self.sp_index, lfill), - _wrap_result(op_name, right, self.sp_index, rfill), - ) + elif is_scalar(other): + with np.errstate(all="ignore"): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) - return _wrap_result(op_name, result, self.sp_index, fill) + if op_name == "divmod": + left, right = result + lfill, rfill = fill + return ( + _wrap_result(op_name, left, self.sp_index, lfill), + _wrap_result(op_name, right, self.sp_index, rfill), + ) - else: - other = np.asarray(other) - with np.errstate(all="ignore"): - # TODO: look into _wrap_result + return _wrap_result(op_name, result, self.sp_index, fill) + + else: + other = np.asarray(other) + with np.errstate(all="ignore"): + # TODO: look into _wrap_result + if len(self) != len(other): + raise AssertionError( + (f"length mismatch: {len(self)} vs. {len(other)}") + ) + if not isinstance(other, SparseArray): + dtype = getattr(other, "dtype", None) + other = SparseArray( + other, fill_value=self.fill_value, dtype=dtype + ) + return _sparse_array_op(self, other, op, op_name) + + name = f"__{op.__name__}__" + return compat.set_function_name(sparse_arithmetic_method, name, cls) + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + if op_name in {"and_", "or_"}: + op_name = op_name[:-1] + + @unpack_zerodim_and_defer(op_name) + def cmp_method(self, other): + + if not is_scalar(other) and not isinstance(other, type(self)): + # convert list-like to ndarray + other = np.asarray(other) + + if isinstance(other, np.ndarray): + # TODO: make this more flexible than just ndarray... if len(self) != len(other): raise AssertionError( f"length mismatch: {len(self)} vs. 
{len(other)}" ) - if not isinstance(other, SparseArray): - dtype = getattr(other, "dtype", None) - other = SparseArray(other, fill_value=self.fill_value, dtype=dtype) + other = SparseArray(other, fill_value=self.fill_value) + + if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) + else: + with np.errstate(all="ignore"): + fill_value = op(self.fill_value, other) + result = op(self.sp_values, other) - def _cmp_method(self, other, op) -> "SparseArray": - if not is_scalar(other) and not isinstance(other, type(self)): - # convert list-like to ndarray - other = np.asarray(other) + return type(self)( + result, + sparse_index=self.sp_index, + fill_value=fill_value, + dtype=np.bool_, + ) - if isinstance(other, np.ndarray): - # TODO: make this more flexible than just ndarray... - if len(self) != len(other): - raise AssertionError(f"length mismatch: {len(self)} vs. {len(other)}") - other = SparseArray(other, fill_value=self.fill_value) + name = f"__{op.__name__}__" + return compat.set_function_name(cmp_method, name, cls) - if isinstance(other, SparseArray): - op_name = op.__name__.strip("_") - return _sparse_array_op(self, other, op, op_name) - else: - with np.errstate(all="ignore"): - fill_value = op(self.fill_value, other) - result = op(self.sp_values, other) + @classmethod + def _add_unary_ops(cls): + cls.__pos__ = cls._create_unary_method(operator.pos) + cls.__neg__ = cls._create_unary_method(operator.neg) + cls.__invert__ = cls._create_unary_method(operator.invert) - return type(self)( - result, - sparse_index=self.sp_index, - fill_value=fill_value, - dtype=np.bool_, - ) - - _logical_method = _cmp_method - - def _unary_method(self, op) -> "SparseArray": - fill_value = op(np.array(self.fill_value)).item() - values = op(self.sp_values) - dtype = SparseDtype(values.dtype, fill_value) - return type(self)._simple_new(values, self.sp_index, dtype) - - def __pos__(self) -> "SparseArray": - return self._unary_method(operator.pos) - - def __neg__(self) -> "SparseArray": - return self._unary_method(operator.neg) - - def __invert__(self) -> "SparseArray": - return self._unary_method(operator.invert) + @classmethod + def _add_comparison_ops(cls): + cls.__and__ = cls._create_comparison_method(operator.and_) + cls.__or__ = cls._create_comparison_method(operator.or_) + cls.__xor__ = cls._create_arithmetic_method(operator.xor) + super()._add_comparison_ops() # ---------- # Formatting @@ -1462,7 +1505,12 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): return None -def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None): +SparseArray._add_arithmetic_ops() +SparseArray._add_comparison_ops() +SparseArray._add_unary_ops() + + +def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format @@ -1508,7 +1556,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None): else: indices = mask.nonzero()[0].astype(np.int32) - index = make_sparse_index(length, indices, kind) + index = _make_index(length, indices, kind) sparsified_values = arr[mask] if dtype is not None: sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) @@ -1516,7 +1564,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None): return sparsified_values, index, fill_value -def make_sparse_index(length, indices, kind): +def _make_index(length, indices, kind): if kind == "block" or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) diff --git 
a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/dtype.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/dtype.py index c066291..ccf2825 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/dtype.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/dtype.py @@ -22,7 +22,7 @@ from pandas.core.dtypes.common import ( from pandas.core.dtypes.missing import isna, na_value_for_dtype if TYPE_CHECKING: - from pandas.core.arrays.sparse.array import SparseArray + from pandas.core.arrays.sparse.array import SparseArray # noqa: F401 @register_extension_dtype @@ -180,7 +180,7 @@ class SparseDtype(ExtensionDtype): ------- type """ - from pandas.core.arrays.sparse.array import SparseArray + from pandas.core.arrays.sparse.array import SparseArray # noqa: F811 return SparseArray diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/scipy_sparse.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/scipy_sparse.py index 56c678c..eafd782 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/scipy_sparse.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/sparse/scipy_sparse.py @@ -85,7 +85,7 @@ def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): return values, i_coord, j_coord, i_labels, j_labels -def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): +def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -113,7 +113,7 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa return sparse_matrix, rows, columns -def coo_to_sparse_series(A, dense_index: bool = False): +def _coo_to_sparse_series(A, dense_index: bool = False): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. 
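Reviewer note: the renames in scipy_sparse.py above only touch private helpers; the public round trip they back is the Series.sparse accessor shown in the accessor.py hunks earlier in this diff. A minimal sketch, assuming scipy is available in this venv:

    import pandas as pd
    from scipy import sparse

    A = sparse.coo_matrix(([3.0, 1.0], ([1, 0], [0, 2])), shape=(3, 4))
    s = pd.Series.sparse.from_coo(A)      # scipy COO -> sparse Series with a (row, col) MultiIndex
    A2, rows, cols = s.sparse.to_coo()    # and back to a coo_matrix plus its row/column labels
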
diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/string_.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/string_.py index cc2013d..b5a83c1 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/string_.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/string_.py @@ -1,32 +1,25 @@ +import operator from typing import TYPE_CHECKING, Type, Union import numpy as np from pandas._libs import lib, missing as libmissing -from pandas._typing import Scalar -from pandas.compat.numpy import function as nv from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype -from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype, - pandas_dtype, -) +from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.inference import is_array_like +from pandas import compat from pandas.core import ops -from pandas.core.array_algos import masked_reductions -from pandas.core.arrays import FloatingArray, IntegerArray, PandasArray -from pandas.core.arrays.floating import FloatingDtype +from pandas.core.arrays import IntegerArray, PandasArray from pandas.core.arrays.integer import _IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna if TYPE_CHECKING: - import pyarrow + import pyarrow # noqa: F401 @register_extension_dtype @@ -87,7 +80,7 @@ class StringDtype(ExtensionDtype): """ Construct StringArray from pyarrow Array/ChunkedArray. """ - import pyarrow + import pyarrow # noqa: F811 if isinstance(array, pyarrow.Array): chunks = [array] @@ -187,10 +180,7 @@ class StringArray(PandasArray): values = extract_array(values) super().__init__(values, copy=copy) - # pandas\core\arrays\string_.py:188: error: Incompatible types in - # assignment (expression has type "StringDtype", variable has type - # "PandasDtype") [assignment] - self._dtype = StringDtype() # type: ignore[assignment] + self._dtype = StringDtype() if not isinstance(values, type(self)): self._validate() @@ -205,35 +195,19 @@ class StringArray(PandasArray): ) @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): + def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - from pandas.core.arrays.masked import BaseMaskedArray + # convert non-na-likes to str, and nan-likes to StringDtype.na_value + result = lib.ensure_string_array( + scalars, na_value=StringDtype.na_value, copy=copy + ) - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - result[na_values] = StringDtype.na_value - - else: - # convert non-na-likes to str, and nan-likes to StringDtype.na_value - result = lib.ensure_string_array( - scalars, na_value=StringDtype.na_value, copy=copy - ) - - # Manually creating new array avoids the validation step in the __init__, so is - # faster. Refactor need for validation? 
- new_string_array = object.__new__(cls) - new_string_array._dtype = StringDtype() - new_string_array._ndarray = result - - return new_string_array + return cls(result) @classmethod - def _from_sequence_of_strings(cls, strings, *, dtype=None, copy=False): + def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): return cls._from_sequence(strings, dtype=dtype, copy=copy) def __arrow_array__(self, type=None): @@ -283,6 +257,10 @@ class StringArray(PandasArray): super().__setitem__(key, value) + def fillna(self, value=None, method=None, limit=None): + # TODO: validate dtype + return super().fillna(value, method, limit) + def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if isinstance(dtype, StringDtype): @@ -295,134 +273,77 @@ class StringArray(PandasArray): arr[mask] = 0 values = arr.astype(dtype.numpy_dtype) return IntegerArray(values, mask, copy=False) - elif isinstance(dtype, FloatingDtype): - arr = self.copy() - mask = self.isna() - arr[mask] = "0" - values = arr.astype(dtype.numpy_dtype) - return FloatingArray(values, mask, copy=False) - elif np.issubdtype(dtype, np.floating): - arr = self._ndarray.copy() - mask = self.isna() - arr[mask] = 0 - values = arr.astype(dtype) - values[mask] = np.nan - return values return super().astype(dtype, copy) - def _reduce(self, name: str, *, skipna: bool = True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in ["min", "max"]: return getattr(self, name)(skipna=skipna) raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: - nv.validate_min((), kwargs) - result = masked_reductions.min( - values=self.to_numpy(), mask=self.isna(), skipna=skipna - ) - return self._wrap_reduction_result(axis, result) - - def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: - nv.validate_max((), kwargs) - result = masked_reductions.max( - values=self.to_numpy(), mask=self.isna(), skipna=skipna - ) - return self._wrap_reduction_result(axis, result) - def value_counts(self, dropna=False): from pandas import value_counts return value_counts(self._ndarray, dropna=dropna).astype("Int64") - def memory_usage(self, deep: bool = False) -> int: + def memory_usage(self, deep=False): result = self._ndarray.nbytes if deep: return result + lib.memory_usage_of_objects(self._ndarray) return result - def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray + # Override parent because we have different return types. + @classmethod + def _create_arithmetic_method(cls, op): + # Note: this handles both arithmetic and comparison methods. 
+ def method(self, other): + from pandas.arrays import BooleanArray - if isinstance(other, StringArray): - other = other._ndarray + assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS - mask = isna(self) | isna(other) - valid = ~mask + if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): + return NotImplemented - if not lib.is_scalar(other): - if len(other) != len(self): - # prevent improper broadcasting when other is 2D - raise ValueError( - f"Lengths of operands do not match: {len(self)} != {len(other)}" - ) + elif isinstance(other, cls): + other = other._ndarray - other = np.asarray(other) - other = other[valid] + mask = isna(self) | isna(other) + valid = ~mask - if op.__name__ in ops.ARITHMETIC_BINOPS: - result = np.empty_like(self._ndarray, dtype="object") - result[mask] = StringDtype.na_value - result[valid] = op(self._ndarray[valid], other) - return StringArray(result) - else: - # logical - result = np.zeros(len(self._ndarray), dtype="bool") - result[valid] = op(self._ndarray[valid], other) - return BooleanArray(result, mask) + if not lib.is_scalar(other): + if len(other) != len(self): + # prevent improper broadcasting when other is 2D + raise ValueError( + f"Lengths of operands do not match: {len(self)} != {len(other)}" + ) - _arith_method = _cmp_method + other = np.asarray(other) + other = other[valid] - # ------------------------------------------------------------------------ - # String methods interface - _str_na_value = StringDtype.na_value - - def _str_map(self, f, na_value=None, dtype=None): - from pandas.arrays import BooleanArray, IntegerArray, StringArray - from pandas.core.arrays.string_ import StringDtype - - if dtype is None: - dtype = StringDtype() - if na_value is None: - na_value = self.dtype.na_value - - mask = isna(self) - arr = np.asarray(self) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray + if op.__name__ in ops.ARITHMETIC_BINOPS: + result = np.empty_like(self._ndarray, dtype="object") + result[mask] = StringDtype.na_value + result[valid] = op(self._ndarray[valid], other) + return StringArray(result) else: - constructor = BooleanArray + # logical + result = np.zeros(len(self._ndarray), dtype="bool") + result[valid] = op(self._ndarray[valid], other) + return BooleanArray(result, mask) - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - f, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) + return compat.set_function_name(method, f"__{op.__name__}__", cls) - if not na_value_is_na: - mask[:] = False + @classmethod + def _add_arithmetic_ops(cls): + cls.__add__ = cls._create_arithmetic_method(operator.add) + cls.__radd__ = cls._create_arithmetic_method(ops.radd) - return constructor(result, mask) + cls.__mul__ = cls._create_arithmetic_method(operator.mul) + cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. StringDtype - result = lib.map_infer_mask( - arr, f, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. 
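For context on the combined arithmetic/comparison method defined above: on a "string"-dtype array, arithmetic operators return another StringArray while comparisons return a BooleanArray, with missing entries propagated as pd.NA in both cases. A minimal illustrative sketch, assuming a pandas version where the "string" dtype is available (exact reprs vary by version):

import pandas as pd

a = pd.array(["a", None, "c"], dtype="string")   # StringArray over an object ndarray
s = pd.Series(a)

(s + "x").tolist()    # ['ax', <NA>, 'cx']  arithmetic keeps the string dtype
(s == "a").tolist()   # [True, <NA>, False] comparisons come back as boolean (BooleanArray) dtype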
- return lib.map_infer_mask(arr, f, mask.view("uint8")) + _create_comparison_method = _create_arithmetic_method + + +StringArray._add_arithmetic_ops() +StringArray._add_comparison_ops() diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/string_arrow.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/string_arrow.py deleted file mode 100644 index 184fbc0..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/string_arrow.py +++ /dev/null @@ -1,625 +0,0 @@ -from __future__ import annotations - -from distutils.version import LooseVersion -from typing import TYPE_CHECKING, Any, Sequence, Type, Union - -import numpy as np - -from pandas._libs import lib, missing as libmissing -from pandas.util._validators import validate_fillna_kwargs - -from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.missing import isna - -from pandas.api.types import ( - is_array_like, - is_bool_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) -from pandas.core.arraylike import OpsMixin -from pandas.core.arrays.base import ExtensionArray -from pandas.core.indexers import check_array_indexer, validate_indices -from pandas.core.missing import get_fill_func - -try: - import pyarrow as pa -except ImportError: - pa = None -else: - # our min supported version of pyarrow, 0.15.1, does not have a compute - # module - try: - import pyarrow.compute as pc - except ImportError: - pass - else: - ARROW_CMP_FUNCS = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } - - -if TYPE_CHECKING: - from pandas import Series - - -@register_extension_dtype -class ArrowStringDtype(ExtensionDtype): - """ - Extension dtype for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.2.0 - - .. warning:: - - ArrowStringDtype is considered experimental. The implementation and - parts of the API may change without warning. - - Attributes - ---------- - None - - Methods - ------- - None - - Examples - -------- - >>> from pandas.core.arrays.string_arrow import ArrowStringDtype - >>> ArrowStringDtype() - ArrowStringDtype - """ - - name = "arrow_string" - - #: StringDtype.na_value uses pandas.NA - na_value = libmissing.NA - - @property - def type(self) -> Type[str]: - return str - - @classmethod - def construct_array_type(cls) -> Type["ArrowStringArray"]: - """ - Return the array type associated with this dtype. - - Returns - ------- - type - """ - return ArrowStringArray - - def __hash__(self) -> int: - return hash("ArrowStringDtype") - - def __repr__(self) -> str: - return "ArrowStringDtype" - - def __from_arrow__( - self, array: Union["pa.Array", "pa.ChunkedArray"] - ) -> "ArrowStringArray": - """ - Construct StringArray from pyarrow Array/ChunkedArray. - """ - return ArrowStringArray(array) - - def __eq__(self, other) -> bool: - """Check whether 'other' is equal to self. - - By default, 'other' is considered equal if - * it's a string matching 'self.name'. - * it's an instance of this type. - - Parameters - ---------- - other : Any - - Returns - ------- - bool - """ - if isinstance(other, ArrowStringDtype): - return True - elif isinstance(other, str) and other == "arrow_string": - return True - else: - return False - - -class ArrowStringArray(OpsMixin, ExtensionArray): - """ - Extension array for string data in a ``pyarrow.ChunkedArray``. - - .. versionadded:: 1.2.0 - - .. warning:: - - ArrowStringArray is considered experimental. 
The implementation and - parts of the API may change without warning. - - Parameters - ---------- - values : pyarrow.Array or pyarrow.ChunkedArray - The array of data. - - Attributes - ---------- - None - - Methods - ------- - None - - See Also - -------- - array - The recommended function for creating a ArrowStringArray. - Series.str - The string methods are available on Series backed by - a ArrowStringArray. - - Notes - ----- - ArrowStringArray returns a BooleanArray for comparison methods. - - Examples - -------- - >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string") - - ['This is', 'some text', , 'data.'] - Length: 4, dtype: arrow_string - """ - - _dtype = ArrowStringDtype() - - def __init__(self, values): - self._chk_pyarrow_available() - if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self._data = values - else: - raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray") - - if not pa.types.is_string(self._data.type): - raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of string type" - ) - - @classmethod - def _chk_pyarrow_available(cls) -> None: - # TODO: maybe update import_optional_dependency to allow a minimum - # version to be specified rather than use the global minimum - if pa is None or LooseVersion(pa.__version__) < "1.0.0": - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." - raise ImportError(msg) - - @classmethod - def _from_sequence(cls, scalars, dtype=None, copy=False): - cls._chk_pyarrow_available() - # convert non-na-likes to str, and nan-likes to ArrowStringDtype.na_value - scalars = lib.ensure_string_array(scalars, copy=False) - return cls(pa.array(scalars, type=pa.string(), from_pandas=True)) - - @classmethod - def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): - return cls._from_sequence(strings, dtype=dtype, copy=copy) - - @property - def dtype(self) -> ArrowStringDtype: - """ - An instance of 'ArrowStringDtype'. - """ - return self._dtype - - def __array__(self, dtype=None) -> np.ndarray: - """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) - - def __arrow_array__(self, type=None): - """Convert myself to a pyarrow Array or ChunkedArray.""" - return self._data - - def to_numpy( - self, dtype=None, copy: bool = False, na_value=lib.no_default - ) -> np.ndarray: - """ - Convert to a NumPy ndarray. - """ - # TODO: copy argument is ignored - - if na_value is lib.no_default: - na_value = self._dtype.na_value - result = self._data.__array__(dtype=dtype) - result[isna(result)] = na_value - return result - - def __len__(self) -> int: - """ - Length of this array. - - Returns - ------- - length : int - """ - return len(self._data) - - @classmethod - def _from_factorized(cls, values, original): - return cls._from_sequence(values) - - @classmethod - def _concat_same_type(cls, to_concat) -> ArrowStringArray: - """ - Concatenate multiple ArrowStringArray. - - Parameters - ---------- - to_concat : sequence of ArrowStringArray - - Returns - ------- - ArrowStringArray - """ - return cls( - pa.chunked_array( - [array for ea in to_concat for array in ea._data.iterchunks()] - ) - ) - - def __getitem__(self, item: Any) -> Any: - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. 
- * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ - item = check_array_indexer(self, item) - - if isinstance(item, np.ndarray): - if not len(item): - return type(self)(pa.chunked_array([], type=pa.string())) - elif is_integer_dtype(item.dtype): - return self.take(item) - elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." - ) - - # We are not an array indexer, so maybe e.g. a slice or integer - # indexer. We dispatch to pyarrow. - value = self._data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - return self._as_pandas_scalar(value) - - def _as_pandas_scalar(self, arrow_scalar: pa.Scalar): - scalar = arrow_scalar.as_py() - if scalar is None: - return self._dtype.na_value - else: - return scalar - - def fillna(self, value=None, method=None, limit=None): - """ - Fill NA/NaN values using the specified method. - - Parameters - ---------- - value : scalar, array-like - If a scalar value is passed it is used to fill all missing values. - Alternatively, an array-like 'value' can be given. It's expected - that the array-like have the same length as 'self'. - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap. - limit : int, default None - If method is specified, this is the maximum number of consecutive - NaN values to forward/backward fill. In other words, if there is - a gap with more than this number of consecutive NaNs, it will only - be partially filled. If method is not specified, this is the - maximum number of entries along the entire axis where NaNs will be - filled. - - Returns - ------- - ExtensionArray - With NA/NaN filled. - """ - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. Got ({len(value)}) " - f"expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - func = get_fill_func(method) - new_values = func(self.to_numpy(object), limit=limit, mask=mask) - new_values = self._from_sequence(new_values) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - - def _reduce(self, name, skipna=True, **kwargs): - if name in ["min", "max"]: - return getattr(self, name)(skipna=skipna) - - raise TypeError(f"Cannot perform reduction '{name}' with string dtype") - - @property - def nbytes(self) -> int: - """ - The number of bytes needed to store this object in memory. - """ - return self._data.nbytes - - def isna(self) -> np.ndarray: - """ - Boolean NumPy array indicating if each value is missing. - - This should return a 1-D array the same length as 'self'. 
- """ - # TODO: Implement .to_numpy for ChunkedArray - return self._data.is_null().to_pandas().values - - def copy(self) -> ArrowStringArray: - """ - Return a shallow copy of the array. - - Returns - ------- - ArrowStringArray - """ - return type(self)(self._data) - - def _cmp_method(self, other, op): - from pandas.arrays import BooleanArray - - pc_func = ARROW_CMP_FUNCS[op.__name__] - if isinstance(other, ArrowStringArray): - result = pc_func(self._data, other._data) - elif isinstance(other, np.ndarray): - result = pc_func(self._data, other) - elif is_scalar(other): - try: - result = pc_func(self._data, pa.scalar(other)) - except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): - mask = isna(self) | isna(other) - valid = ~mask - result = np.zeros(len(self), dtype="bool") - result[valid] = op(np.array(self)[valid], other) - return BooleanArray(result, mask) - else: - return NotImplemented - - # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray - return BooleanArray._from_sequence(result.to_pandas().values) - - def __setitem__(self, key: Union[int, np.ndarray], value: Any) -> None: - """Set one or more values inplace. - - Parameters - ---------- - key : int, ndarray, or slice - When called from, e.g. ``Series.__setitem__``, ``key`` will be - one of - - * scalar int - * ndarray of integers. - * boolean ndarray - * slice object - - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. - - Returns - ------- - None - """ - key = check_array_indexer(self, key) - - if is_integer(key): - if not is_scalar(value): - raise ValueError("Must pass scalars with scalar indexer") - elif isna(value): - value = None - elif not isinstance(value, str): - raise ValueError("Scalar must be NA or str") - - # Slice data and insert inbetween - new_data = [ - *self._data[0:key].chunks, - pa.array([value], type=pa.string()), - *self._data[(key + 1) :].chunks, - ] - self._data = pa.chunked_array(new_data) - else: - # Convert to integer indices and iteratively assign. - # TODO: Make a faster variant of this in Arrow upstream. - # This is probably extremely slow. - - # Convert all possible input key types to an array of integers - if is_bool_dtype(key): - # TODO(ARROW-9430): Directly support setitem(booleans) - key_array = np.argwhere(key).flatten() - elif isinstance(key, slice): - key_array = np.array(range(len(self))[key]) - else: - # TODO(ARROW-9431): Directly support setitem(integers) - key_array = np.asanyarray(key) - - if is_scalar(value): - value = np.broadcast_to(value, len(key_array)) - else: - value = np.asarray(value) - - if len(key_array) != len(value): - raise ValueError("Length of indexer and values mismatch") - - for k, v in zip(key_array, value): - self[k] = v - - def take( - self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None - ) -> "ExtensionArray": - """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of int - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. 
- This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. - - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. `fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. - ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. - """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - indices_array = indices - - if len(self._data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): - raise IndexError("out of bounds value in 'indices'.") - - if allow_fill: - fill_mask = indices_array < 0 - if fill_mask.any(): - validate_indices(indices_array, len(self._data)) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) - if isna(fill_value): - return type(self)(result) - # TODO: ArrowNotImplementedError: Function fill_null has no - # kernel matching input types (array[string], scalar[string]) - result = type(self)(result) - result[fill_mask] = fill_value - return result - # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self._data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) - - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import Index, Series - - vc = self._data.value_counts() - - # Index cannot hold ExtensionArrays yet - index = Index(type(self)(vc.field(0)).astype(object)) - # No missings, so we can adhere to the interface and return a numpy array. 
- counts = np.array(vc.field(1)) - - if dropna and self._data.null_count > 0: - raise NotImplementedError("yo") - - return Series(counts, index=index).astype("Int64") diff --git a/venv/lib/python3.8/site-packages/pandas/core/arrays/timedeltas.py b/venv/lib/python3.8/site-packages/pandas/core/arrays/timedeltas.py index c51882a..92f1f7e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/arrays/timedeltas.py +++ b/venv/lib/python3.8/site-packages/pandas/core/arrays/timedeltas.py @@ -1,33 +1,18 @@ from datetime import timedelta -from typing import List, Optional, Union +from typing import List import numpy as np from pandas._libs import lib, tslibs -from pandas._libs.tslibs import ( - BaseOffset, - NaT, - NaTType, - Period, - Tick, - Timedelta, - Timestamp, - iNaT, - to_offset, -) +from pandas._libs.tslibs import NaT, Period, Tick, Timedelta, Timestamp, iNaT, to_offset from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.fields import get_timedelta_field -from pandas._libs.tslibs.timedeltas import ( - array_to_timedelta64, - ints_to_pytimedelta, - parse_timedelta_unit, -) +from pandas._libs.tslibs.timedeltas import array_to_timedelta64, parse_timedelta_unit from pandas.compat.numpy import function as nv from pandas.core.dtypes.common import ( DT64NS_DTYPE, TD64NS_DTYPE, - is_categorical_dtype, is_dtype_equal, is_float_dtype, is_integer_dtype, @@ -51,8 +36,8 @@ from pandas.core.construction import extract_array from pandas.core.ops.common import unpack_zerodim_and_defer -def _field_accessor(name: str, alias: str, docstring: str): - def f(self) -> np.ndarray: +def _field_accessor(name, alias, docstring=None): + def f(self): values = self.asi8 result = get_timedelta_field(values, alias) if self._hasnans: @@ -67,7 +52,7 @@ def _field_accessor(name: str, alias: str, docstring: str): return property(f) -class TimedeltaArray(dtl.TimelikeOps): +class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): """ Pandas ExtensionArray for timedelta data. @@ -104,7 +89,6 @@ class TimedeltaArray(dtl.TimelikeOps): _scalar_type = Timedelta _recognized_scalars = (timedelta, np.timedelta64, Tick) _is_recognized_dtype = is_timedelta64_dtype - _infer_matches = ("timedelta", "timedelta64") __array_priority__ = 1000 # define my properties & methods for delegation @@ -121,14 +105,15 @@ class TimedeltaArray(dtl.TimelikeOps): "ceil", ] - # Note: ndim must be defined to ensure NaT.__richcmp__(TimedeltaArray) + # Note: ndim must be defined to ensure NaT.__richcmp(TimedeltaArray) # operates pointwise. - def _box_func(self, x) -> Union[Timedelta, NaTType]: - return Timedelta(x, unit="ns") + @property + def _box_func(self): + return lambda x: Timedelta(x, unit="ns") @property - def dtype(self) -> np.dtype: + def dtype(self): """ The dtype for the TimedeltaArray. 
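The `_field_accessor` properties and `_box_func` shown above are what back the public component accessors and element boxing for timedelta data. A small sketch of that behaviour, assuming `pd.to_timedelta` on a list of strings (which returns a TimedeltaIndex wrapping a TimedeltaArray):

import pandas as pd

tdi = pd.to_timedelta(["1 days", "2 days 03:00:00"])
list(tdi.days)      # [1, 2]       days component via get_timedelta_field
list(tdi.seconds)   # [0, 10800]   seconds within the day (3 hours = 10800 s)
tdi[0]              # Timedelta('1 days 00:00:00'), boxed by _box_func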
@@ -203,9 +188,7 @@ class TimedeltaArray(dtl.TimelikeOps): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new( - cls, values, freq: Optional[BaseOffset] = None, dtype=TD64NS_DTYPE - ) -> "TimedeltaArray": + def _simple_new(cls, values, freq=None, dtype=TD64NS_DTYPE): assert dtype == TD64NS_DTYPE, dtype assert isinstance(values, np.ndarray), type(values) if values.dtype != TD64NS_DTYPE: @@ -220,25 +203,8 @@ class TimedeltaArray(dtl.TimelikeOps): @classmethod def _from_sequence( - cls, data, *, dtype=TD64NS_DTYPE, copy: bool = False - ) -> "TimedeltaArray": - if dtype: - _validate_td64_dtype(dtype) - - data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) - freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) - - return cls._simple_new(data, freq=freq) - - @classmethod - def _from_sequence_not_strict( - cls, - data, - dtype=TD64NS_DTYPE, - copy: bool = False, - freq=lib.no_default, - unit=None, - ) -> "TimedeltaArray": + cls, data, dtype=TD64NS_DTYPE, copy=False, freq=lib.no_default, unit=None + ): if dtype: _validate_td64_dtype(dtype) @@ -301,11 +267,11 @@ class TimedeltaArray(dtl.TimelikeOps): # ---------------------------------------------------------------- # DatetimeLike Interface - def _unbox_scalar(self, value, setitem: bool = False) -> np.timedelta64: + def _unbox_scalar(self, value): if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timedelta.") - self._check_compatible_with(value, setitem=setitem) - return np.timedelta64(value.value, "ns") + self._check_compatible_with(value) + return value.value def _scalar_from_string(self, value): return Timedelta(value) @@ -314,10 +280,13 @@ class TimedeltaArray(dtl.TimelikeOps): # we don't have anything to validate. 
pass + def _maybe_clear_freq(self): + self._freq = None + # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def astype(self, dtype, copy: bool = True): + def astype(self, dtype, copy=True): # We handle # --> timedelta64[ns] # --> timedelta64 @@ -331,9 +300,10 @@ class TimedeltaArray(dtl.TimelikeOps): if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) - return self._maybe_mask_results( + values = self._maybe_mask_results( result, fill_value=None, convert="float64" ) + return values result = self._data.astype(dtype, copy=copy) return result.astype("i8") elif is_timedelta64_ns_dtype(dtype): @@ -342,28 +312,11 @@ class TimedeltaArray(dtl.TimelikeOps): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) - def __iter__(self): - if self.ndim > 1: - for i in range(len(self)): - yield self[i] - else: - # convert in chunks of 10k for efficiency - data = self.asi8 - length = len(self) - chunksize = 10000 - chunks = int(length / chunksize) + 1 - for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, length) - converted = ints_to_pytimedelta(data[start_i:end_i], box=True) - yield from converted - # ---------------------------------------------------------------- # Reductions def sum( self, - *, axis=None, dtype=None, out=None, @@ -373,17 +326,20 @@ class TimedeltaArray(dtl.TimelikeOps): min_count: int = 0, ): nv.validate_sum( - (), {"dtype": dtype, "out": out, "keepdims": keepdims, "initial": initial} + (), dict(dtype=dtype, out=out, keepdims=keepdims, initial=initial) ) + if not len(self): + return NaT + if not skipna and self._hasnans: + return NaT result = nanops.nansum( - self._ndarray, axis=axis, skipna=skipna, min_count=min_count + self._data, axis=axis, skipna=skipna, min_count=min_count ) - return self._wrap_reduction_result(axis, result) + return Timedelta(result) def std( self, - *, axis=None, dtype=None, out=None, @@ -392,26 +348,41 @@ class TimedeltaArray(dtl.TimelikeOps): skipna: bool = True, ): nv.validate_stat_ddof_func( - (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std" + (), dict(dtype=dtype, out=out, keepdims=keepdims), fname="std" ) + if not len(self): + return NaT + if not skipna and self._hasnans: + return NaT - result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof) - if axis is None or self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) + result = nanops.nanstd(self._data, axis=axis, skipna=skipna, ddof=ddof) + return Timedelta(result) + + def median( + self, + axis=None, + out=None, + overwrite_input: bool = False, + keepdims: bool = False, + skipna: bool = True, + ): + nv.validate_median( + (), dict(out=out, overwrite_input=overwrite_input, keepdims=keepdims) + ) + return nanops.nanmedian(self._data, axis=axis, skipna=skipna) # ---------------------------------------------------------------- # Rendering Methods def _formatter(self, boxed=False): - from pandas.io.formats.format import get_format_timedelta64 + from pandas.io.formats.format import _get_format_timedelta64 - return get_format_timedelta64(self, box=True) + return _get_format_timedelta64(self, box=True) def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import get_format_timedelta64 + from pandas.io.formats.format import _get_format_timedelta64 - formatter = get_format_timedelta64(self._data, na_rep) + formatter = 
_get_format_timedelta64(self._data, na_rep) return np.array([formatter(x) for x in self._data.ravel()]).reshape(self.shape) # ---------------------------------------------------------------- @@ -478,7 +449,7 @@ class TimedeltaArray(dtl.TimelikeOps): ) from err @unpack_zerodim_and_defer("__mul__") - def __mul__(self, other) -> "TimedeltaArray": + def __mul__(self, other): if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._data * other @@ -513,7 +484,7 @@ class TimedeltaArray(dtl.TimelikeOps): def __truediv__(self, other): # timedelta / X is well-defined for timedelta-like or numeric X - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # specifically timedelta64-NaT @@ -570,7 +541,7 @@ class TimedeltaArray(dtl.TimelikeOps): @unpack_zerodim_and_defer("__rtruediv__") def __rtruediv__(self, other): # X / timedelta is defined only for timedelta-like X - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # specifically timedelta64-NaT @@ -613,7 +584,7 @@ class TimedeltaArray(dtl.TimelikeOps): def __floordiv__(self, other): if is_scalar(other): - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT @@ -628,7 +599,7 @@ class TimedeltaArray(dtl.TimelikeOps): # at this point we should only have numeric scalars; anything # else will raise result = self.asi8 // other - np.putmask(result, self._isnan, iNaT) + result[self._isnan] = iNaT freq = None if self.freq is not None: # Note: freq gets division, not floor-division @@ -653,8 +624,8 @@ class TimedeltaArray(dtl.TimelikeOps): result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.float64) - np.putmask(result, mask, np.nan) + result = result.astype(np.int64) + result[mask] = np.nan return result elif is_object_dtype(other.dtype): @@ -677,7 +648,7 @@ class TimedeltaArray(dtl.TimelikeOps): def __rfloordiv__(self, other): if is_scalar(other): - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT @@ -702,13 +673,14 @@ class TimedeltaArray(dtl.TimelikeOps): elif is_timedelta64_dtype(other.dtype): other = type(self)(other) + # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = other.asi8 // self.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.float64) - np.putmask(result, mask, np.nan) + result = result.astype(np.int64) + result[mask] = np.nan return result elif is_object_dtype(other.dtype): @@ -723,21 +695,21 @@ class TimedeltaArray(dtl.TimelikeOps): @unpack_zerodim_and_defer("__mod__") def __mod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) return self - (self // other) * other @unpack_zerodim_and_defer("__rmod__") def __rmod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) return other - 
(other // self) * self @unpack_zerodim_and_defer("__divmod__") def __divmod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) res1 = self // other @@ -747,29 +719,29 @@ class TimedeltaArray(dtl.TimelikeOps): @unpack_zerodim_and_defer("__rdivmod__") def __rdivmod__(self, other): # Note: This is a naive implementation, can likely be optimized - if isinstance(other, self._recognized_scalars): + if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) res1 = other // self res2 = other - res1 * self return res1, res2 - def __neg__(self) -> "TimedeltaArray": + def __neg__(self): if self.freq is not None: return type(self)(-self._data, freq=-self.freq) return type(self)(-self._data) - def __pos__(self) -> "TimedeltaArray": + def __pos__(self): return type(self)(self._data, freq=self.freq) - def __abs__(self) -> "TimedeltaArray": + def __abs__(self): # Note: freq is not preserved return type(self)(np.abs(self._data)) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timedelta methods - def total_seconds(self) -> np.ndarray: + def total_seconds(self): """ Return total duration of each element expressed in seconds. @@ -947,9 +919,6 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): data = data._data elif isinstance(data, IntegerArray): data = data.to_numpy("int64", na_value=tslibs.iNaT) - elif is_categorical_dtype(data.dtype): - data = data.categories.take(data.codes, fill_value=NaT)._values - copy = False # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): diff --git a/venv/lib/python3.8/site-packages/pandas/core/base.py b/venv/lib/python3.8/site-packages/pandas/core/base.py index f333ee0..1926803 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/base.py +++ b/venv/lib/python3.8/site-packages/pandas/core/base.py @@ -4,57 +4,42 @@ Base and utility classes for pandas objects. 
import builtins import textwrap -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - FrozenSet, - Optional, - TypeVar, - Union, - cast, -) +from typing import Any, Dict, FrozenSet, List, Optional, Union import numpy as np import pandas._libs.lib as lib -from pandas._typing import DtypeObj, IndexLabel from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc +from pandas.core.dtypes.cast import is_nested_object from pandas.core.dtypes.common import ( is_categorical_dtype, is_dict_like, is_extension_array_dtype, + is_list_like, is_object_dtype, is_scalar, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna, remove_na_arraylike +from pandas.core.dtypes.missing import isna -from pandas.core import algorithms +from pandas.core import algorithms, common as com from pandas.core.accessor import DirNamesMixin from pandas.core.algorithms import duplicated, unique1d, value_counts -from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.construction import create_series_with_explicit_dtype import pandas.core.nanops as nanops -if TYPE_CHECKING: - from pandas import Categorical - -_shared_docs: Dict[str, str] = {} -_indexops_doc_kwargs = { - "klass": "IndexOpsMixin", - "inplace": "", - "unique": "IndexOpsMixin", - "duplicated": "IndexOpsMixin", -} - -_T = TypeVar("_T", bound="IndexOpsMixin") +_shared_docs: Dict[str, str] = dict() +_indexops_doc_kwargs = dict( + klass="IndexOpsMixin", + inplace="", + unique="IndexOpsMixin", + duplicated="IndexOpsMixin", +) class PandasObject(DirNamesMixin): @@ -95,9 +80,7 @@ class PandasObject(DirNamesMixin): either a value or Series of values """ if hasattr(self, "memory_usage"): - # pandas\core\base.py:84: error: "PandasObject" has no attribute - # "memory_usage" [attr-defined] - mem = self.memory_usage(deep=True) # type: ignore[attr-defined] + mem = self.memory_usage(deep=True) return int(mem if is_scalar(mem) else mem.sum()) # no memory_usage attribute, so fall back to object's 'sizeof' @@ -152,7 +135,7 @@ class SelectionMixin: object sub-classes need to define: obj, exclusions """ - _selection: Optional[IndexLabel] = None + _selection = None _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) @@ -206,18 +189,10 @@ class SelectionMixin: @cache_readonly def _selected_obj(self): - # pandas\core\base.py:195: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - if self._selection is None or isinstance( - self.obj, ABCSeries # type: ignore[attr-defined] - ): - # pandas\core\base.py:194: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - return self.obj # type: ignore[attr-defined] + if self._selection is None or isinstance(self.obj, ABCSeries): + return self.obj else: - # pandas\core\base.py:204: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - return self.obj[self._selection] # type: ignore[attr-defined] + return self.obj[self._selection] @cache_readonly def ndim(self) -> int: @@ -225,58 +200,31 @@ class SelectionMixin: @cache_readonly def _obj_with_exclusions(self): - # pandas\core\base.py:209: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - if self._selection is not None and isinstance( - self.obj, ABCDataFrame # type: ignore[attr-defined] - ): - # pandas\core\base.py:217: error: "SelectionMixin" has no 
attribute - # "obj" [attr-defined] - return self.obj.reindex( # type: ignore[attr-defined] - columns=self._selection_list - ) + if self._selection is not None and isinstance(self.obj, ABCDataFrame): + return self.obj.reindex(columns=self._selection_list) - # pandas\core\base.py:207: error: "SelectionMixin" has no attribute - # "exclusions" [attr-defined] - if len(self.exclusions) > 0: # type: ignore[attr-defined] - # pandas\core\base.py:208: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - - # pandas\core\base.py:208: error: "SelectionMixin" has no attribute - # "exclusions" [attr-defined] - return self.obj.drop(self.exclusions, axis=1) # type: ignore[attr-defined] + if len(self.exclusions) > 0: + return self.obj.drop(self.exclusions, axis=1) else: - # pandas\core\base.py:210: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - return self.obj # type: ignore[attr-defined] + return self.obj def __getitem__(self, key): if self._selection is not None: raise IndexError(f"Column(s) {self._selection} already selected") if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): - # pandas\core\base.py:217: error: "SelectionMixin" has no attribute - # "obj" [attr-defined] - if len( - self.obj.columns.intersection(key) # type: ignore[attr-defined] - ) != len(key): - # pandas\core\base.py:218: error: "SelectionMixin" has no - # attribute "obj" [attr-defined] - bad_keys = list( - set(key).difference(self.obj.columns) # type: ignore[attr-defined] - ) + if len(self.obj.columns.intersection(key)) != len(key): + bad_keys = list(set(key).difference(self.obj.columns)) raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") return self._gotitem(list(key), ndim=2) elif not getattr(self, "as_index", False): - # error: "SelectionMixin" has no attribute "obj" [attr-defined] - if key not in self.obj.columns: # type: ignore[attr-defined] + if key not in self.obj.columns: raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=2) else: - # error: "SelectionMixin" has no attribute "obj" [attr-defined] - if key not in self.obj: # type: ignore[attr-defined] + if key not in self.obj: raise KeyError(f"Column not found: {key}") return self._gotitem(key, ndim=1) @@ -288,7 +236,7 @@ class SelectionMixin: Parameters ---------- key : str / list of selections - ndim : {1, 2} + ndim : 1,2 requested ndim of result subset : object, default None subset to act on @@ -330,7 +278,289 @@ class SelectionMixin: f"'{arg}' is not a valid function for '{type(self).__name__}' object" ) - def _get_cython_func(self, arg: Callable) -> Optional[str]: + def _aggregate(self, arg, *args, **kwargs): + """ + provide an implementation for the aggregators + + Parameters + ---------- + arg : string, dict, function + *args : args to pass on to the function + **kwargs : kwargs to pass on to the function + + Returns + ------- + tuple of result, how + + Notes + ----- + how can be a string describe the required post-processing, or + None if not required + """ + is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + + _axis = kwargs.pop("_axis", None) + if _axis is None: + _axis = getattr(self, "axis", 0) + + if isinstance(arg, str): + return self._try_aggregate_string_function(arg, *args, **kwargs), None + + if isinstance(arg, dict): + # aggregate based on the passed dict + if _axis != 0: # pragma: no cover + raise ValueError("Can only pass dict with axis=0") + + obj = self._selected_obj + + # if we have a dict of any non-scalars + # eg. 
{'A' : ['mean']}, normalize all to + # be list-likes + if any(is_aggregator(x) for x in arg.values()): + new_arg = {} + for k, v in arg.items(): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + + # the keys must be in the columns + # for ndim=2, or renamers for ndim=1 + + # ok for now, but deprecated + # {'A': { 'ra': 'mean' }} + # {'A': { 'ra': ['mean'] }} + # {'ra': ['mean']} + + # not ok + # {'ra' : { 'A' : 'mean' }} + if isinstance(v, dict): + raise SpecificationError("nested renamer is not supported") + elif isinstance(obj, ABCSeries): + raise SpecificationError("nested renamer is not supported") + elif isinstance(obj, ABCDataFrame) and k not in obj.columns: + raise KeyError(f"Column '{k}' does not exist!") + + arg = new_arg + + else: + # deprecation of renaming keys + # GH 15931 + keys = list(arg.keys()) + if isinstance(obj, ABCDataFrame) and len( + obj.columns.intersection(keys) + ) != len(keys): + cols = sorted(set(keys) - set(obj.columns.intersection(keys))) + raise SpecificationError(f"Column(s) {cols} do not exist") + + from pandas.core.reshape.concat import concat + + def _agg_1dim(name, how, subset=None): + """ + aggregate a 1-dim with how + """ + colg = self._gotitem(name, ndim=1, subset=subset) + if colg.ndim != 1: + raise SpecificationError( + "nested dictionary is ambiguous in aggregation" + ) + return colg.aggregate(how) + + def _agg_2dim(how): + """ + aggregate a 2-dim with how + """ + colg = self._gotitem(self._selection, ndim=2, subset=obj) + return colg.aggregate(how) + + def _agg(arg, func): + """ + run the aggregations over the arg with func + return a dict + """ + result = {} + for fname, agg_how in arg.items(): + result[fname] = func(fname, agg_how) + return result + + # set the final keys + keys = list(arg.keys()) + result = {} + + if self._selection is not None: + + sl = set(self._selection_list) + + # we are a Series like object, + # but may have multiple aggregations + if len(sl) == 1: + + result = _agg( + arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how) + ) + + # we are selecting the same set as we are aggregating + elif not len(sl - set(keys)): + + result = _agg(arg, _agg_1dim) + + # we are a DataFrame, with possibly multiple aggregations + else: + + result = _agg(arg, _agg_2dim) + + # no selection + else: + + try: + result = _agg(arg, _agg_1dim) + except SpecificationError: + + # we are aggregating expecting all 1d-returns + # but we have 2d + result = _agg(arg, _agg_2dim) + + # combine results + + def is_any_series() -> bool: + # return a boolean if we have *any* nested series + return any(isinstance(r, ABCSeries) for r in result.values()) + + def is_any_frame() -> bool: + # return a boolean if we have *any* nested series + return any(isinstance(r, ABCDataFrame) for r in result.values()) + + if isinstance(result, list): + return concat(result, keys=keys, axis=1, sort=True), True + + elif is_any_frame(): + # we have a dict of DataFrames + # return a MI DataFrame + + keys_to_use = [k for k in keys if not result[k].empty] + # Have to check, if at least one DataFrame is not empty. 
+ keys_to_use = keys_to_use if keys_to_use != [] else keys + return ( + concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1), + True, + ) + + elif isinstance(self, ABCSeries) and is_any_series(): + + # we have a dict of Series + # return a MI Series + try: + result = concat(result) + except TypeError as err: + # we want to give a nice error here if + # we have non-same sized objects, so + # we don't automatically broadcast + + raise ValueError( + "cannot perform both aggregation " + "and transformation operations " + "simultaneously" + ) from err + + return result, True + + # fall thru + from pandas import DataFrame, Series + + try: + result = DataFrame(result) + except ValueError: + + # we have a dict of scalars + result = Series(result, name=getattr(self, "name", None)) + + return result, True + elif is_list_like(arg): + # we require a list, but not an 'str' + return self._aggregate_multiple_funcs(arg, _axis=_axis), None + else: + result = None + + f = self._get_cython_func(arg) + if f and not args and not kwargs: + return getattr(self, f)(), None + + # caller can react + return result, True + + def _aggregate_multiple_funcs(self, arg, _axis): + from pandas.core.reshape.concat import concat + + if _axis != 0: + raise NotImplementedError("axis other than 0 is not supported") + + if self._selected_obj.ndim == 1: + obj = self._selected_obj + else: + obj = self._obj_with_exclusions + + results = [] + keys = [] + + # degenerate case + if obj.ndim == 1: + for a in arg: + colg = self._gotitem(obj.name, ndim=1, subset=obj) + try: + new_res = colg.aggregate(a) + + except TypeError: + pass + else: + results.append(new_res) + + # make sure we find a good name + name = com.get_callable_name(a) or a + keys.append(name) + + # multiples + else: + for index, col in enumerate(obj): + colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) + try: + new_res = colg.aggregate(arg) + except (TypeError, DataError): + pass + except ValueError as err: + # cannot aggregate + if "Must produce aggregated value" in str(err): + # raised directly in _aggregate_named + pass + elif "no results" in str(err): + # raised directly in _aggregate_multiple_funcs + pass + else: + raise + else: + results.append(new_res) + keys.append(col) + + # if we are empty + if not len(results): + raise ValueError("no results") + + try: + return concat(results, keys=keys, axis=1, sort=False) + except TypeError as err: + + # we are concatting non-NDFrame objects, + # e.g. 
a list of scalars + + from pandas import Series + + result = Series(results, index=keys, name=self.name) + if is_nested_object(result): + raise ValueError( + "cannot combine transform and aggregation operations" + ) from err + return result + + def _get_cython_func(self, arg: str) -> Optional[str]: """ if we define an internal function for this argument, return it """ @@ -344,28 +574,38 @@ class SelectionMixin: return self._builtin_table.get(arg, arg) -class IndexOpsMixin(OpsMixin): +class ShallowMixin: + _attributes: List[str] = [] + + def _shallow_copy(self, obj, **kwargs): + """ + return a new object with the replacement attributes + """ + if isinstance(obj, self._constructor): + obj = obj.obj + for attr in self._attributes: + if attr not in kwargs: + kwargs[attr] = getattr(self, attr) + return self._constructor(obj, **kwargs) + + +class IndexOpsMixin: """ Common ops mixin to support a unified interface / docs for Series / Index """ # ndarray compatibility __array_priority__ = 1000 - _hidden_attrs: FrozenSet[str] = frozenset( + _deprecations: FrozenSet[str] = frozenset( ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) - @property - def dtype(self) -> DtypeObj: - # must be defined here as a property for mypy - raise AbstractMethodError(self) - @property def _values(self) -> Union[ExtensionArray, np.ndarray]: # must be defined here as a property for mypy raise AbstractMethodError(self) - def transpose(self: _T, *args, **kwargs) -> _T: + def transpose(self, *args, **kwargs): """ Return the transpose, which is by definition self. @@ -403,7 +643,7 @@ class IndexOpsMixin(OpsMixin): def item(self): """ - Return the first element of the underlying data as a Python scalar. + Return the first element of the underlying data as a python scalar. Returns ------- @@ -596,11 +836,7 @@ class IndexOpsMixin(OpsMixin): dtype='datetime64[ns]') """ if is_extension_array_dtype(self.dtype): - # pandas\core\base.py:837: error: Too many arguments for "to_numpy" - # of "ExtensionArray" [call-arg] - return self.array.to_numpy( # type: ignore[call-arg] - dtype, copy=copy, na_value=na_value, **kwargs - ) + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) elif kwargs: bad_keys = list(kwargs.keys())[0] raise TypeError( @@ -616,10 +852,10 @@ class IndexOpsMixin(OpsMixin): return result @property - def empty(self) -> bool: + def empty(self): return not self.size - def max(self, axis=None, skipna: bool = True, *args, **kwargs): + def max(self, axis=None, skipna=True, *args, **kwargs): """ Return the maximum value of the Index. @@ -664,7 +900,7 @@ class IndexOpsMixin(OpsMixin): return nanops.nanmax(self._values, skipna=skipna) @doc(op="max", oppose="min", value="largest") - def argmax(self, axis=None, skipna: bool = True, *args, **kwargs) -> int: + def argmax(self, axis=None, skipna=True, *args, **kwargs): """ Return int position of the {value} value in the Series. @@ -719,7 +955,7 @@ class IndexOpsMixin(OpsMixin): nv.validate_argmax_with_skipna(skipna, args, kwargs) return nanops.nanargmax(self._values, skipna=skipna) - def min(self, axis=None, skipna: bool = True, *args, **kwargs): + def min(self, axis=None, skipna=True, *args, **kwargs): """ Return the minimum value of the Index. 
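A short doctest-style sketch of the min/max/argmax reductions documented above (values chosen purely for illustration):

>>> import pandas as pd
>>> idx = pd.Index([3, 2, 1])
>>> idx.min()
1
>>> idx.max()
3
>>> idx.argmax()   # integer position of the largest value
0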
@@ -764,7 +1000,7 @@ class IndexOpsMixin(OpsMixin): return nanops.nanmin(self._values, skipna=skipna) @doc(argmax, op="min", oppose="max", value="smallest") - def argmin(self, axis=None, skipna=True, *args, **kwargs) -> int: + def argmin(self, axis=None, skipna=True, *args, **kwargs): nv.validate_minmax_axis(axis) nv.validate_argmax_with_skipna(skipna, args, kwargs) return nanops.nanargmin(self._values, skipna=skipna) @@ -819,14 +1055,10 @@ class IndexOpsMixin(OpsMixin): """ return bool(isna(self).any()) - def isna(self): - return isna(self._values) - def _reduce( self, op, name: str, - *, axis=0, skipna=True, numeric_only=None, @@ -892,15 +1124,7 @@ class IndexOpsMixin(OpsMixin): if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values - - # pandas\core\base.py:893: error: Incompatible types in - # assignment (expression has type "Categorical", variable has - # type "IndexOpsMixin") [assignment] - self = cast("Categorical", self) # type: ignore[assignment] - # pandas\core\base.py:894: error: Item "ExtensionArray" of - # "Union[ExtensionArray, Any]" has no attribute "map" - # [union-attr] - return self._values.map(mapper) # type: ignore[union-attr] + return self._values.map(mapper) values = self._values @@ -917,13 +1141,12 @@ class IndexOpsMixin(OpsMixin): raise NotImplementedError map_f = lambda values, f: values.map(f) else: - # pandas\core\base.py:1142: error: "IndexOpsMixin" has no attribute - # "astype" [attr-defined] - values = self.astype(object)._values # type: ignore[attr-defined] + values = self.astype(object)._values if na_action == "ignore": - map_f = lambda values, f: lib.map_infer_mask( - values, f, isna(values).view(np.uint8) - ) + + def map_f(values, f): + return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) + elif na_action is None: map_f = lib.map_infer else: @@ -939,12 +1162,7 @@ class IndexOpsMixin(OpsMixin): return new_values def value_counts( - self, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - bins=None, - dropna: bool = True, + self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): """ Return a Series containing counts of unique values. @@ -983,8 +1201,8 @@ class IndexOpsMixin(OpsMixin): >>> index = pd.Index([3, 1, 2, 3, 4, np.nan]) >>> index.value_counts() 3.0 2 - 2.0 1 4.0 1 + 2.0 1 1.0 1 dtype: int64 @@ -994,8 +1212,8 @@ class IndexOpsMixin(OpsMixin): >>> s = pd.Series([3, 1, 2, 3, 4, np.nan]) >>> s.value_counts(normalize=True) 3.0 0.4 - 2.0 0.2 4.0 0.2 + 2.0 0.2 1.0 0.2 dtype: float64 @@ -1007,8 +1225,8 @@ class IndexOpsMixin(OpsMixin): number of half-open bins. >>> s.value_counts(bins=3) - (0.996, 2.0] 2 (2.0, 3.0] 2 + (0.996, 2.0] 2 (3.0, 4.0] 1 dtype: int64 @@ -1018,9 +1236,9 @@ class IndexOpsMixin(OpsMixin): >>> s.value_counts(dropna=False) 3.0 2 - 2.0 1 NaN 1 4.0 1 + 2.0 1 1.0 1 dtype: int64 """ @@ -1082,8 +1300,11 @@ class IndexOpsMixin(OpsMixin): >>> s.nunique() 4 """ - obj = remove_na_arraylike(self) if dropna else self - return len(obj.unique()) + uniqs = self.unique() + n = len(uniqs) + if dropna and isna(uniqs).any(): + n -= 1 + return n @property def is_unique(self) -> bool: @@ -1138,7 +1359,7 @@ class IndexOpsMixin(OpsMixin): Parameters ---------- - deep : bool, default False + deep : bool Introspect the data deeply, interrogate `object` dtypes for system-level memory consumption. 
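Both variants of `nunique` shown a few hunks above (the `remove_na_arraylike` version and the `unique()`-counting version) agree on the basic cases; a doctest-style sketch:

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
>>> s.nunique()            # NA excluded by default
4
>>> s.nunique(dropna=False)
5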
@@ -1157,9 +1378,7 @@ class IndexOpsMixin(OpsMixin): are not components of the array if deep=False or if used on PyPy """ if hasattr(self.array, "memory_usage"): - # pandas\core\base.py:1379: error: "ExtensionArray" has no - # attribute "memory_usage" [attr-defined] - return self.array.memory_usage(deep=deep) # type: ignore[attr-defined] + return self.array.memory_usage(deep=deep) v = self.array.nbytes if deep and is_object_dtype(self) and not PYPY: @@ -1250,16 +1469,6 @@ class IndexOpsMixin(OpsMixin): >>> ser.searchsorted([1, 3], side='right') array([1, 3]) - >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'])) - >>> ser - 0 2000-03-11 - 1 2000-03-12 - 2 2000-03-13 - dtype: datetime64[ns] - - >>> ser.searchsorted('3/14/2000') - 3 - >>> ser = pd.Categorical( ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True ... ) @@ -1292,11 +1501,20 @@ class IndexOpsMixin(OpsMixin): return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) def drop_duplicates(self, keep="first"): + if isinstance(self, ABCIndexClass): + if self.is_unique: + return self._shallow_copy() + duplicated = self.duplicated(keep=keep) - # pandas\core\base.py:1507: error: Value of type "IndexOpsMixin" is not - # indexable [index] - result = self[np.logical_not(duplicated)] # type: ignore[index] + result = self[np.logical_not(duplicated)] return result def duplicated(self, keep="first"): - return duplicated(self._values, keep=keep) + if isinstance(self, ABCIndexClass): + if self.is_unique: + return np.zeros(len(self), dtype=bool) + return duplicated(self, keep=keep) + else: + return self._constructor( + duplicated(self, keep=keep), index=self.index + ).__finalize__(self, method="duplicated") diff --git a/venv/lib/python3.8/site-packages/pandas/core/common.py b/venv/lib/python3.8/site-packages/pandas/core/common.py index cdcbc43..e7260a9 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/common.py +++ b/venv/lib/python3.8/site-packages/pandas/core/common.py @@ -6,16 +6,17 @@ Note: pandas.core.common is *not* part of the public API. from collections import abc, defaultdict import contextlib +from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Collection, Iterable, Iterator, List, Union, cast +from typing import Any, Collection, Iterable, Iterator, List, Union import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, tslibs from pandas._typing import AnyArrayLike, Scalar, T -from pandas.compat.numpy import np_version_under1p18 +from pandas.compat.numpy import _np_version_under1p18 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -24,8 +25,13 @@ from pandas.core.dtypes.common import ( is_extension_array_dtype, is_integer, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import iterable_not_string +from pandas.core.dtypes.generic import ( + ABCExtensionArray, + ABCIndex, + ABCIndexClass, + ABCSeries, +) +from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -37,13 +43,13 @@ class SettingWithCopyWarning(Warning): pass -def flatten(line): +def flatten(l): """ Flatten an arbitrarily nested sequence. 
Parameters ---------- - line : sequence + l : sequence The non string sequence to flatten Notes @@ -54,11 +60,12 @@ def flatten(line): ------- flattened : generator """ - for element in line: - if iterable_not_string(element): - yield from flatten(element) + for el in l: + if _iterable_not_string(el): + for s in flatten(el): + yield s else: - yield element + yield el def consensus_name_attr(objs): @@ -72,6 +79,21 @@ def consensus_name_attr(objs): return name +def maybe_box_datetimelike(value, dtype=None): + # turn a datetime like into a Timestamp/timedelta as needed + if dtype == object: + # If we dont have datetime64/timedelta64 dtype, we dont want to + # box datetimelike scalars + return value + + if isinstance(value, (np.datetime64, datetime)): + value = tslibs.Timestamp(value) + elif isinstance(value, (np.timedelta64, timedelta)): + value = tslibs.Timedelta(value) + + return value + + def is_bool_indexer(key: Any) -> bool: """ Check whether `key` is a valid boolean indexer. @@ -100,7 +122,7 @@ def is_bool_indexer(key: Any) -> bool: check_array_indexer : Check that `key` is a valid array to index, and convert to an ndarray. """ - if isinstance(key, (ABCSeries, np.ndarray, ABCIndexClass)) or ( + if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): if key.dtype == np.object_: @@ -108,9 +130,7 @@ def is_bool_indexer(key: Any) -> bool: if not lib.is_bool_array(key): na_msg = "Cannot mask with non-boolean array containing NA / NaN values" - if lib.infer_dtype(key) == "boolean" and isna(key).any(): - # Don't raise on e.g. ["A", "B", np.nan], see - # test_loc_getitem_list_of_labels_categoricalindex_with_na + if isna(key).any(): raise ValueError(na_msg) return False return True @@ -257,11 +277,6 @@ def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T """ if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized): return list(obj) - # error: Incompatible return value type (got - # "Union[pandas.core.common., - # pandas.core.common.1, T]", expected - # "Union[Collection[T], T]") [return-value] - obj = cast(Collection, obj) return obj @@ -277,23 +292,20 @@ def is_null_slice(obj) -> bool: ) -def is_true_slices(line): +def is_true_slices(l): """ - Find non-trivial slices in "line": return a list of booleans with same length. + Find non-trivial slices in "l": return a list of booleans with same length. """ - return [isinstance(k, slice) and not is_null_slice(k) for k in line] + return [isinstance(k, slice) and not is_null_slice(k) for k in l] # TODO: used only once in indexing; belongs elsewhere? -def is_full_slice(obj, line) -> bool: +def is_full_slice(obj, l) -> bool: """ We have a full length slice. """ return ( - isinstance(obj, slice) - and obj.start == 0 - and obj.stop == line - and obj.step is None + isinstance(obj, slice) and obj.start == 0 and obj.stop == l and obj.step is None ) @@ -331,6 +343,23 @@ def apply_if_callable(maybe_callable, obj, **kwargs): return maybe_callable +def dict_compat(d): + """ + Helper function to convert datetimelike-keyed dicts + to Timestamp-keyed dict. + + Parameters + ---------- + d: dict like object + + Returns + ------- + dict + + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + def standardize_mapping(into): """ Helper function to standardize a supplied mapping. 
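The common.py hunk above introduces a maybe_box_datetimelike helper that wraps raw datetime64/timedelta64 scalars as Timestamp/Timedelta unless an object dtype is requested. A standalone sketch of that boxing using only public pandas names (an illustration, not the internal helper itself):

from datetime import datetime, timedelta
import numpy as np
import pandas as pd

def box_datetimelike(value, dtype=None):
    # object dtype keeps raw datetime64/timedelta64 scalars unboxed
    if dtype == object:
        return value
    if isinstance(value, (np.datetime64, datetime)):
        return pd.Timestamp(value)
    if isinstance(value, (np.timedelta64, timedelta)):
        return pd.Timedelta(value)
    return value

assert box_datetimelike(np.datetime64("2021-01-01")) == pd.Timestamp("2021-01-01")
assert box_datetimelike(np.timedelta64(1, "D")) == pd.Timedelta(days=1)
assert isinstance(box_datetimelike(np.datetime64("2021-01-01"), dtype=object), np.datetime64)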
@@ -391,7 +420,7 @@ def random_state(state=None): if ( is_integer(state) or is_array_like(state) - or (not np_version_under1p18 and isinstance(state, np.random.BitGenerator)) + or (not _np_version_under1p18 and isinstance(state, np.random.BitGenerator)) ): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): @@ -400,8 +429,10 @@ def random_state(state=None): return np.random else: raise ValueError( - "random_state must be an integer, array-like, a BitGenerator, " - "a numpy RandomState, or None" + ( + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" + ) ) @@ -466,11 +497,8 @@ def convert_to_list_like( Convert list-like or scalar input to list-like. List, numpy and pandas array-like inputs are returned unmodified whereas others are converted to list. """ - if isinstance( - values, (list, np.ndarray, ABCIndexClass, ABCSeries, ABCExtensionArray) - ): - # np.ndarray resolving as Any gives a false positive - return values # type: ignore[return-value] + if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)): + return values elif isinstance(values, abc.Iterable) and not isinstance(values, str): return list(values) diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/align.py b/venv/lib/python3.8/site-packages/pandas/core/computation/align.py index 5ad3e78..82867cf 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/align.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/align.py @@ -1,10 +1,9 @@ """ Core eval alignment algorithms. """ -from __future__ import annotations from functools import partial, wraps -from typing import TYPE_CHECKING, Dict, Optional, Sequence, Tuple, Type, Union +from typing import Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np @@ -18,16 +17,13 @@ from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation.common import result_type_many -if TYPE_CHECKING: - from pandas.core.indexes.api import Index - def _align_core_single_unary_op( term, -) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, Index]]]: +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: typ: Union[partial, Type[FrameOrSeries]] - axes: Optional[Dict[str, Index]] = None + axes: Optional[Dict[str, int]] = None if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) @@ -40,9 +36,10 @@ def _align_core_single_unary_op( def _zip_axes_from_type( - typ: Type[FrameOrSeries], new_axes: Sequence[Index] -) -> Dict[str, Index]: - return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: + axes = {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)} + return axes def _any_pandas_objects(terms) -> bool: @@ -189,11 +186,8 @@ def reconstruct_object(typ, obj, axes, dtype): # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. 
np.array(0) and np.array([0]) - if ( - len(obj.shape) == 1 - and len(obj) == 1 - and not isinstance(ret_value, np.ndarray) - ): - ret_value = np.array([ret_value]).astype(res_t) + if len(obj.shape) == 1 and len(obj) == 1: + if not isinstance(ret_value, np.ndarray): + ret_value = np.array([ret_value]).astype(res_t) return ret_value diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/check.py b/venv/lib/python3.8/site-packages/pandas/core/computation/check.py index 6c7261b..4d20590 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/check.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/check.py @@ -1,10 +1,10 @@ from pandas.compat._optional import import_optional_dependency ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") -NUMEXPR_INSTALLED = ne is not None -if NUMEXPR_INSTALLED: - NUMEXPR_VERSION = ne.__version__ +_NUMEXPR_INSTALLED = ne is not None +if _NUMEXPR_INSTALLED: + _NUMEXPR_VERSION = ne.__version__ else: - NUMEXPR_VERSION = None + _NUMEXPR_VERSION = None -__all__ = ["NUMEXPR_INSTALLED", "NUMEXPR_VERSION"] +__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/common.py b/venv/lib/python3.8/site-packages/pandas/core/computation/common.py index 8a9583c..327ec21 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/common.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/common.py @@ -5,7 +5,7 @@ import numpy as np from pandas._config import get_option -def ensure_decoded(s): +def _ensure_decoded(s): """ If we have bytes, decode them to unicode. """ diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/engines.py b/venv/lib/python3.8/site-packages/pandas/core/computation/engines.py index 77a3783..9c5388f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/engines.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/engines.py @@ -6,11 +6,11 @@ import abc from typing import Dict, Type from pandas.core.computation.align import align_terms, reconstruct_object -from pandas.core.computation.ops import MATHOPS, REDUCTIONS +from pandas.core.computation.ops import _mathops, _reductions import pandas.io.formats.printing as printing -_ne_builtins = frozenset(MATHOPS + REDUCTIONS) +_ne_builtins = frozenset(_mathops + _reductions) class NumExprClobberingError(NameError): @@ -130,7 +130,7 @@ class PythonEngine(AbstractEngine): pass -ENGINES: Dict[str, Type[AbstractEngine]] = { +_engines: Dict[str, Type[AbstractEngine]] = { "numexpr": NumExprEngine, "python": PythonEngine, } diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/eval.py b/venv/lib/python3.8/site-packages/pandas/core/computation/eval.py index 12f1634..b74f99f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/eval.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/eval.py @@ -9,8 +9,8 @@ import warnings from pandas._libs.lib import no_default from pandas.util._validators import validate_bool_kwarg -from pandas.core.computation.engines import ENGINES -from pandas.core.computation.expr import PARSERS, Expr +from pandas.core.computation.engines import _engines +from pandas.core.computation.expr import Expr, _parsers from pandas.core.computation.parsing import tokenize_string from pandas.core.computation.scope import ensure_scope @@ -38,13 +38,13 @@ def _check_engine(engine: Optional[str]) -> str: str Engine name. 
""" - from pandas.core.computation.check import NUMEXPR_INSTALLED + from pandas.core.computation.check import _NUMEXPR_INSTALLED if engine is None: - engine = "numexpr" if NUMEXPR_INSTALLED else "python" + engine = "numexpr" if _NUMEXPR_INSTALLED else "python" - if engine not in ENGINES: - valid_engines = list(ENGINES.keys()) + if engine not in _engines: + valid_engines = list(_engines.keys()) raise KeyError( f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" ) @@ -52,11 +52,12 @@ def _check_engine(engine: Optional[str]) -> str: # TODO: validate this in a more general way (thinking of future engines # that won't necessarily be import-able) # Could potentially be done on engine instantiation - if engine == "numexpr" and not NUMEXPR_INSTALLED: - raise ImportError( - "'numexpr' is not installed or an unsupported version. Cannot use " - "engine='numexpr' for query/eval if 'numexpr' is not installed" - ) + if engine == "numexpr": + if not _NUMEXPR_INSTALLED: + raise ImportError( + "'numexpr' is not installed or an unsupported version. Cannot use " + "engine='numexpr' for query/eval if 'numexpr' is not installed" + ) return engine @@ -74,9 +75,9 @@ def _check_parser(parser: str): KeyError * If an invalid parser is passed """ - if parser not in PARSERS: + if parser not in _parsers: raise KeyError( - f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" + f"Invalid parser '{parser}' passed, valid parsers are {_parsers.keys()}" ) @@ -211,8 +212,7 @@ def eval( truediv : bool, optional Whether to use true division, like in Python >= 3. - - .. deprecated:: 1.0.0 + deprecated:: 1.0.0 local_dict : dict or None, optional A dictionary of local variables, taken from locals() by default. @@ -241,8 +241,7 @@ def eval( Returns ------- - ndarray, numeric scalar, DataFrame, Series, or None - The completion value of evaluating the given code or None if ``inplace=True``. + ndarray, numeric scalar, DataFrame, Series Raises ------ @@ -342,7 +341,7 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) # construct the engine and evaluate the parsed expression - eng = ENGINES[engine] + eng = _engines[engine] eng_inst = eng(parsed_expr) ret = eng_inst.evaluate() diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/expr.py b/venv/lib/python3.8/site-packages/pandas/core/computation/expr.py index 88a25ad..fcccc24 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/expr.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/expr.py @@ -10,17 +10,9 @@ from typing import Callable, Optional, Set, Tuple, Type, TypeVar import numpy as np -from pandas.compat import PY39 - import pandas.core.common as com from pandas.core.computation.ops import ( - ARITH_OPS_SYMS, - BOOL_OPS_SYMS, - CMP_OPS_SYMS, - LOCAL_TAG, - MATHOPS, - REDUCTIONS, - UNARY_OPS_SYMS, + _LOCAL_TAG, BinOp, Constant, Div, @@ -29,6 +21,12 @@ from pandas.core.computation.ops import ( Term, UnaryOp, UndefinedVariableError, + _arith_ops_syms, + _bool_ops_syms, + _cmp_ops_syms, + _mathops, + _reductions, + _unary_ops_syms, is_term, ) from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string @@ -103,7 +101,7 @@ def _replace_locals(tok: Tuple[int, str]) -> Tuple[int, str]: """ toknum, tokval = tok if toknum == tokenize.OP and tokval == "@": - return tokenize.OP, LOCAL_TAG + return tokenize.OP, _LOCAL_TAG return toknum, tokval @@ -153,7 +151,7 @@ def _preparse( the ``tokenize`` module and ``tokval`` is a string. 
""" assert callable(f), "f must be callable" - return tokenize.untokenize(f(x) for x in tokenize_string(source)) + return tokenize.untokenize((f(x) for x in tokenize_string(source))) def _is_type(t): @@ -169,9 +167,10 @@ _is_str = _is_type(str) # partition all AST nodes _all_nodes = frozenset( - node - for node in (getattr(ast, name) for name in dir(ast)) - if isinstance(node, type) and issubclass(node, ast.AST) + filter( + lambda x: isinstance(x, type) and issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)), + ) ) @@ -188,6 +187,7 @@ _mod_nodes = _filter_nodes(ast.mod) _stmt_nodes = _filter_nodes(ast.stmt) _expr_nodes = _filter_nodes(ast.expr) _expr_context_nodes = _filter_nodes(ast.expr_context) +_slice_nodes = _filter_nodes(ast.slice) _boolop_nodes = _filter_nodes(ast.boolop) _operator_nodes = _filter_nodes(ast.operator) _unary_op_nodes = _filter_nodes(ast.unaryop) @@ -198,9 +198,6 @@ _arguments_nodes = _filter_nodes(ast.arguments) _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) -if not PY39: - _slice_nodes = _filter_nodes(ast.slice) - # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) @@ -342,7 +339,7 @@ class BaseExprVisitor(ast.NodeVisitor): const_type: Type[Term] = Constant term_type = Term - binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS + binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms binary_op_nodes = ( "Gt", "Lt", @@ -366,9 +363,9 @@ class BaseExprVisitor(ast.NodeVisitor): ) binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) - unary_ops = UNARY_OPS_SYMS + unary_ops = _unary_ops_syms unary_op_nodes = "UAdd", "USub", "Invert", "Not" - unary_op_nodes_map = {k: v for k, v in zip(unary_ops, unary_op_nodes)} + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) rewrite_map = { ast.Eq: ast.In, @@ -496,14 +493,15 @@ class BaseExprVisitor(ast.NodeVisitor): f"'{lhs.type}' and '{rhs.type}'" ) - if self.engine != "pytables" and ( - res.op in CMP_OPS_SYMS - and getattr(lhs, "is_datetime", False) - or getattr(rhs, "is_datetime", False) - ): - # all date ops must be done in python bc numexpr doesn't work - # well with NaT - return self._maybe_eval(res, self.binary_ops) + if self.engine != "pytables": + if ( + res.op in _cmp_ops_syms + and getattr(lhs, "is_datetime", False) + or getattr(rhs, "is_datetime", False) + ): + # all date ops must be done in python bc numexpr doesn't work + # well with NaT + return self._maybe_eval(res, self.binary_ops) if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python @@ -659,11 +657,7 @@ class BaseExprVisitor(ast.NodeVisitor): raise if res is None: - # pandas\core\computation\expr.py:663: error: "expr" has no - # attribute "id" [attr-defined] - raise ValueError( - f"Invalid function call {node.func.id}" # type: ignore[attr-defined] - ) + raise ValueError(f"Invalid function call {node.func.id}") if hasattr(res, "value"): res = res.value @@ -684,12 +678,7 @@ class BaseExprVisitor(ast.NodeVisitor): for key in node.keywords: if not isinstance(key, ast.keyword): - # pandas\core\computation\expr.py:684: error: "expr" has no - # attribute "id" [attr-defined] - raise ValueError( - "keyword error in function call " # type: ignore[attr-defined] - f"'{node.func.id}'" - ) + raise ValueError(f"keyword error in function call '{node.func.id}'") if key.arg: kwargs[key.arg] = self.visit(key.value).value @@ -738,7 +727,7 @@ class BaseExprVisitor(ast.NodeVisitor): _python_not_supported = 
frozenset(["Dict", "BoolOp", "In", "NotIn"]) -_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS) +_numexpr_supported_calls = frozenset(_reductions + _mathops) @disallow( @@ -794,7 +783,7 @@ class Expr: self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser - self._visitor = PARSERS[parser](self.env, self.engine, self.parser) + self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() @property @@ -826,4 +815,4 @@ class Expr: return frozenset(term.name for term in com.flatten(self.terms)) -PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} +_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/expressions.py b/venv/lib/python3.8/site-packages/pandas/core/computation/expressions.py index e5ede3c..da290db 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/expressions.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/expressions.py @@ -6,7 +6,6 @@ Offer fast expression evaluation through numexpr """ import operator -from typing import List, Set import warnings import numpy as np @@ -15,15 +14,15 @@ from pandas._config import get_option from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.computation.check import NUMEXPR_INSTALLED +from pandas.core.computation.check import _NUMEXPR_INSTALLED from pandas.core.ops import roperator -if NUMEXPR_INSTALLED: +if _NUMEXPR_INSTALLED: import numexpr as ne _TEST_MODE = None -_TEST_RESULT: List[bool] = [] -USE_NUMEXPR = NUMEXPR_INSTALLED +_TEST_RESULT = None +_USE_NUMEXPR = _NUMEXPR_INSTALLED _evaluate = None _where = None @@ -39,21 +38,21 @@ _MIN_ELEMENTS = 10000 def set_use_numexpr(v=True): # set/unset to use numexpr - global USE_NUMEXPR - if NUMEXPR_INSTALLED: - USE_NUMEXPR = v + global _USE_NUMEXPR + if _NUMEXPR_INSTALLED: + _USE_NUMEXPR = v # choose what we are going to do global _evaluate, _where - _evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard - _where = _where_numexpr if USE_NUMEXPR else _where_standard + _evaluate = _evaluate_numexpr if _USE_NUMEXPR else _evaluate_standard + _where = _where_numexpr if _USE_NUMEXPR else _where_standard def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - if NUMEXPR_INSTALLED and USE_NUMEXPR: + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: if n is None: n = ne.detect_number_of_cores() ne.set_num_threads(n) @@ -76,7 +75,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatibility - dtypes: Set[str] = set() + dtypes = set() for o in [a, b]: # Series implements dtypes, check for dimension count as well if hasattr(o, "dtypes") and o.ndim > 1: @@ -231,8 +230,7 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - # error: "None" not callable - return _evaluate(op, op_str, a, b) # type: ignore[misc] + return _evaluate(op, op_str, a, b) # type: ignore return _evaluate_standard(op, op_str, a, b) @@ -248,32 +246,28 @@ def where(cond, a, b, use_numexpr=True): use_numexpr : bool, default True Whether to try to use numexpr. 
""" - assert _where is not None return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) -def set_test_mode(v: bool = True) -> None: +def set_test_mode(v=True): """ - Keeps track of whether numexpr was used. - - Stores an additional ``True`` for every successful use of evaluate with - numexpr since the last ``get_test_result``. + Keeps track of whether numexpr was used. Stores an additional ``True`` + for every successful use of evaluate with numexpr since the last + ``get_test_result`` """ global _TEST_MODE, _TEST_RESULT _TEST_MODE = v _TEST_RESULT = [] -def _store_test_result(used_numexpr: bool) -> None: +def _store_test_result(used_numexpr): global _TEST_RESULT if used_numexpr: _TEST_RESULT.append(used_numexpr) -def get_test_result() -> List[bool]: - """ - Get test result and reset test_results. - """ +def get_test_result(): + """get test result and reset test_results""" global _TEST_RESULT res = _TEST_RESULT _TEST_RESULT = [] diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/ops.py b/venv/lib/python3.8/site-packages/pandas/core/computation/ops.py index 74bee80..e55df1e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/ops.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/ops.py @@ -15,12 +15,12 @@ from pandas._libs.tslibs import Timestamp from pandas.core.dtypes.common import is_list_like, is_scalar import pandas.core.common as com -from pandas.core.computation.common import ensure_decoded, result_type_many -from pandas.core.computation.scope import DEFAULT_GLOBALS +from pandas.core.computation.common import _ensure_decoded, result_type_many +from pandas.core.computation.scope import _DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -REDUCTIONS = ("sum", "prod") +_reductions = ("sum", "prod") _unary_math_ops = ( "sin", @@ -46,10 +46,10 @@ _unary_math_ops = ( ) _binary_math_ops = ("arctan2",) -MATHOPS = _unary_math_ops + _binary_math_ops +_mathops = _unary_math_ops + _binary_math_ops -LOCAL_TAG = "__pd_eval_local_" +_LOCAL_TAG = "__pd_eval_local_" class UndefinedVariableError(NameError): @@ -69,9 +69,7 @@ class UndefinedVariableError(NameError): class Term: def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, str) else cls - # pandas\core\computation\ops.py:72: error: Argument 2 for "super" not - # an instance of argument 1 [misc] - supr_new = super(Term, klass).__new__ # type: ignore[misc] + supr_new = super(Term, klass).__new__ return supr_new(klass) is_local: bool @@ -82,13 +80,13 @@ class Term: self.env = env self.side = side tname = str(name) - self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS + self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS self._value = self._resolve_name() self.encoding = encoding @property def local_name(self) -> str: - return self.name.replace(LOCAL_TAG, "") + return self.name.replace(_LOCAL_TAG, "") def __repr__(self) -> str: return pprint_thing(self.name) @@ -222,7 +220,7 @@ class Op: @property def return_type(self): # clobber types to bool if the op is a boolean operator - if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): + if self.op in (_cmp_ops_syms + _bool_ops_syms): return np.bool_ return result_type_many(*(term.type for term in com.flatten(self))) @@ -282,7 +280,7 @@ def _not_in(x, y): return x not in y -CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +_cmp_ops_syms = (">", "<", ">=", "<=", "==", "!=", "in", "not in") _cmp_ops_funcs = 
( operator.gt, operator.lt, @@ -293,13 +291,13 @@ _cmp_ops_funcs = ( _in, _not_in, ) -_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) -BOOL_OPS_SYMS = ("&", "|", "and", "or") +_bool_ops_syms = ("&", "|", "and", "or") _bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) -_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) -ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") +_arith_ops_syms = ("+", "-", "*", "/", "**", "//", "%") _arith_ops_funcs = ( operator.add, operator.sub, @@ -309,12 +307,12 @@ _arith_ops_funcs = ( operator.floordiv, operator.mod, ) -_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) -SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") +_special_case_arith_ops_syms = ("**", "//", "%") _special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) _special_case_arith_ops_dict = dict( - zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) + zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs) ) _binary_ops_dict = {} @@ -468,7 +466,7 @@ class BinOp(Op): v = rhs.value if isinstance(v, (int, float)): v = stringify(v) - v = Timestamp(ensure_decoded(v)) + v = Timestamp(_ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.rhs.update(v) @@ -477,7 +475,7 @@ class BinOp(Op): v = lhs.value if isinstance(v, (int, float)): v = stringify(v) - v = Timestamp(ensure_decoded(v)) + v = Timestamp(_ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.lhs.update(v) @@ -532,9 +530,9 @@ class Div(BinOp): _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) -UNARY_OPS_SYMS = ("+", "-", "~", "not") +_unary_ops_syms = ("+", "-", "~", "not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) -_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) class UnaryOp(Op): @@ -563,7 +561,7 @@ class UnaryOp(Op): except KeyError as err: raise ValueError( f"Invalid unary operator {repr(op)}, " - f"valid operators are {UNARY_OPS_SYMS}" + f"valid operators are {_unary_ops_syms}" ) from err def __call__(self, env): @@ -591,8 +589,7 @@ class MathCall(Op): self.func = func def __call__(self, env): - # pandas\core\computation\ops.py:592: error: "Op" not callable [operator] - operands = [op(env) for op in self.operands] # type: ignore[operator] + operands = [op(env) for op in self.operands] with np.errstate(all="ignore"): return self.func.func(*operands) @@ -603,11 +600,11 @@ class MathCall(Op): class FuncNode: def __init__(self, name: str): - from pandas.core.computation.check import NUMEXPR_INSTALLED, NUMEXPR_VERSION + from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION - if name not in MATHOPS or ( - NUMEXPR_INSTALLED - and NUMEXPR_VERSION < LooseVersion("2.6.9") + if name not in _mathops or ( + _NUMEXPR_INSTALLED + and _NUMEXPR_VERSION < LooseVersion("2.6.9") and name in ("floor", "ceil") ): raise ValueError(f'"{name}" is not a supported function') diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/parsing.py b/venv/lib/python3.8/site-packages/pandas/core/computation/parsing.py index a1bebc9..c7c7103 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/parsing.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/parsing.py @@ 
-8,8 +8,6 @@ import token import tokenize from typing import Iterator, Tuple -from pandas._typing import Label - # A token value Python's tokenizer probably will never use. BACKTICK_QUOTED_STRING = 100 @@ -39,9 +37,7 @@ def create_valid_python_identifier(name: str) -> str: special_characters_replacements = { char: f"_{token.tok_name[tokval]}_" # The ignore here is because of a bug in mypy that is resolved in 0.740 - for char, tokval in ( - tokenize.EXACT_TOKEN_TYPES.items() # type: ignore[attr-defined] - ) + for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore } special_characters_replacements.update( { @@ -93,7 +89,7 @@ def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]: return toknum, tokval -def clean_column_name(name: "Label") -> "Label": +def clean_column_name(name: str) -> str: """ Function to emulate the cleaning of a backtick quoted name. @@ -104,12 +100,12 @@ def clean_column_name(name: "Label") -> "Label": Parameters ---------- - name : hashable + name : str Name to be cleaned. Returns ------- - name : hashable + name : str Returns the name after tokenizing and cleaning. Notes diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/pytables.py b/venv/lib/python3.8/site-packages/pandas/core/computation/pytables.py index b819886..a3389a8 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/pytables.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/pytables.py @@ -14,7 +14,7 @@ from pandas.core.dtypes.common import is_list_like import pandas as pd import pandas.core.common as com from pandas.core.computation import expr, ops, scope as _scope -from pandas.core.computation.common import ensure_decoded +from pandas.core.computation.common import _ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term from pandas.core.construction import extract_array @@ -35,17 +35,14 @@ class PyTablesScope(_scope.Scope): queryables: Optional[Dict[str, Any]] = None, ): super().__init__(level + 1, global_dict=global_dict, local_dict=local_dict) - self.queryables = queryables or {} + self.queryables = queryables or dict() class Term(ops.Term): env: PyTablesScope def __new__(cls, name, env, side=None, encoding=None): - if isinstance(name, str): - klass = cls - else: - klass = Constant + klass = Constant if not isinstance(name, str) else cls return object.__new__(klass) def __init__(self, name, env: PyTablesScope, side=None, encoding=None): @@ -66,7 +63,7 @@ class Term(ops.Term): return self.name # read-only property overwriting read/write property - @property # type: ignore[misc] + @property # type: ignore def value(self): return self._value @@ -86,7 +83,6 @@ class BinOp(ops.BinOp): op: str queryables: Dict[str, Any] - condition: Optional[str] def __init__(self, op: str, lhs, rhs, queryables: Dict[str, Any], encoding): super().__init__(op, lhs, rhs) @@ -188,15 +184,17 @@ class BinOp(ops.BinOp): def stringify(value): if self.encoding is not None: - return pprint_thing_encoded(value, encoding=self.encoding) - return pprint_thing(value) + encoder = partial(pprint_thing_encoded, encoding=self.encoding) + else: + encoder = pprint_thing + return encoder(value) - kind = ensure_decoded(self.kind) - meta = ensure_decoded(self.meta) + kind = _ensure_decoded(self.kind) + meta = _ensure_decoded(self.meta) if kind == "datetime64" or kind == "datetime": if isinstance(v, (int, float)): v = stringify(v) - v = ensure_decoded(v) + v = _ensure_decoded(v) 
v = Timestamp(v) if v.tz is not None: v = v.tz_convert("UTC") @@ -259,11 +257,9 @@ class FilterBinOp(BinOp): def invert(self): """ invert the filter """ if self.filter is not None: - self.filter = ( - self.filter[0], - self.generate_filter_op(invert=True), - self.filter[2], - ) + f = list(self.filter) + f[1] = self.generate_filter_op(invert=True) + self.filter = tuple(f) return self def format(self): @@ -378,14 +374,14 @@ class UnaryOp(ops.UnaryOp): operand = self.operand operand = operand.prune(klass) - if operand is not None and ( - issubclass(klass, ConditionBinOp) - and operand.condition is not None - or not issubclass(klass, ConditionBinOp) - and issubclass(klass, FilterBinOp) - and operand.filter is not None - ): - return operand.invert() + if operand is not None: + if issubclass(klass, ConditionBinOp): + if operand.condition is not None: + return operand.invert() + elif issubclass(klass, FilterBinOp): + if operand.filter is not None: + return operand.invert() + return None @@ -562,7 +558,7 @@ class PyTablesExpr(expr.Expr): else: w = _validate_where(w) where[idx] = w - _where = " & ".join(f"({w})" for w in com.flatten(where)) + _where = " & ".join((f"({w})" for w in com.flatten(where))) else: _where = where diff --git a/venv/lib/python3.8/site-packages/pandas/core/computation/scope.py b/venv/lib/python3.8/site-packages/pandas/core/computation/scope.py index d2708da..83bf92a 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/computation/scope.py +++ b/venv/lib/python3.8/site-packages/pandas/core/computation/scope.py @@ -53,7 +53,7 @@ def _raw_hex_id(obj) -> str: return "".join(_replacer(x) for x in packed) -DEFAULT_GLOBALS = { +_DEFAULT_GLOBALS = { "Timestamp": Timestamp, "datetime": datetime.datetime, "True": True, @@ -114,7 +114,7 @@ class Scope: # shallow copy because we don't want to keep filling this up with what # was there before if there are multiple calls to Scope/_ensure_scope - self.scope = DeepChainMap(DEFAULT_GLOBALS.copy()) + self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) self.target = target if isinstance(local_dict, Scope): @@ -129,36 +129,23 @@ class Scope: # shallow copy here because we don't want to replace what's in # scope when we align terms (alignment accesses the underlying # numpy array of pandas objects) - - # pandas\core\computation\scope.py:132: error: Incompatible types - # in assignment (expression has type "ChainMap[str, Any]", variable - # has type "DeepChainMap[str, Any]") [assignment] - self.scope = self.scope.new_child( # type: ignore[assignment] - (global_dict or frame.f_globals).copy() - ) + self.scope = self.scope.new_child((global_dict or frame.f_globals).copy()) if not isinstance(local_dict, Scope): - # pandas\core\computation\scope.py:134: error: Incompatible - # types in assignment (expression has type "ChainMap[str, - # Any]", variable has type "DeepChainMap[str, Any]") - # [assignment] - self.scope = self.scope.new_child( # type: ignore[assignment] - (local_dict or frame.f_locals).copy() - ) + self.scope = self.scope.new_child((local_dict or frame.f_locals).copy()) finally: del frame # assumes that resolvers are going from outermost scope to inner if isinstance(local_dict, Scope): - # pandas\core\computation\scope.py:140: error: Cannot determine - # type of 'resolvers' [has-type] - resolvers += tuple(local_dict.resolvers.maps) # type: ignore[has-type] + resolvers += tuple(local_dict.resolvers.maps) self.resolvers = DeepChainMap(*resolvers) self.temps = {} def __repr__(self) -> str: scope_keys = 
_get_pretty_string(list(self.scope.keys())) res_keys = _get_pretty_string(list(self.resolvers.keys())) - return f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + unicode_str = f"{type(self).__name__}(scope={scope_keys}, resolvers={res_keys})" + return unicode_str @property def has_resolvers(self) -> bool: @@ -238,9 +225,7 @@ class Scope: for mapping in maps: if old_key in mapping: - # pandas\core\computation\scope.py:228: error: Unsupported - # target for indexed assignment ("Mapping[Any, Any]") [index] - mapping[new_key] = new_value # type: ignore[index] + mapping[new_key] = new_value return def _get_vars(self, stack, scopes: List[str]): @@ -259,11 +244,7 @@ class Scope: for scope, (frame, _, _, _, _, _) in variables: try: d = getattr(frame, "f_" + scope) - # pandas\core\computation\scope.py:247: error: Incompatible - # types in assignment (expression has type "ChainMap[str, - # Any]", variable has type "DeepChainMap[str, Any]") - # [assignment] - self.scope = self.scope.new_child(d) # type: ignore[assignment] + self.scope = self.scope.new_child(d) finally: # won't remove it, but DECREF it # in Py3 this probably isn't necessary since frame won't be @@ -330,16 +311,5 @@ class Scope: vars : DeepChainMap All variables in this scope. """ - # pandas\core\computation\scope.py:314: error: Unsupported operand - # types for + ("List[Dict[Any, Any]]" and "List[Mapping[Any, Any]]") - # [operator] - - # pandas\core\computation\scope.py:314: error: Unsupported operand - # types for + ("List[Dict[Any, Any]]" and "List[Mapping[str, Any]]") - # [operator] - maps = ( - [self.temps] - + self.resolvers.maps # type: ignore[operator] - + self.scope.maps # type: ignore[operator] - ) + maps = [self.temps] + self.resolvers.maps + self.scope.maps return DeepChainMap(*maps) diff --git a/venv/lib/python3.8/site-packages/pandas/core/config_init.py b/venv/lib/python3.8/site-packages/pandas/core/config_init.py index 7d9664b..2b24311 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/config_init.py +++ b/venv/lib/python3.8/site-packages/pandas/core/config_init.py @@ -85,9 +85,8 @@ with cf.config_prefix("compute"): pc_precision_doc = """ : int - Floating point output precision in terms of number of places after the - decimal, for regular formatting as well as scientific notation. Similar - to ``precision`` in :meth:`numpy.set_printoptions`. + Floating point output precision (number of significant digits). This is + only a suggestion """ pc_colspace_doc = """ @@ -250,7 +249,7 @@ pc_chop_threshold_doc = """ pc_max_seq_items = """ : int or None - When pretty-printing a long sequence, no more then `max_seq_items` + when pretty-printing a long sequence, no more then `max_seq_items` will be printed. If items are omitted, they will be denoted by the addition of "..." to the resulting string. 
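The config_init.py hunk above rewrites the docstring for the display precision option. A short usage illustration of that option (assumes a standard pandas install; the exact rendered output can vary slightly by version):

import pandas as pd

pd.set_option("display.precision", 2)
print(pd.Series([1.23456, 2.71828]))   # values render as 1.23 and 2.72
pd.reset_option("display.precision")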
@@ -315,9 +314,9 @@ pc_latex_multirow = """ def table_schema_cb(key): - from pandas.io.formats.printing import enable_data_resource_formatter + from pandas.io.formats.printing import _enable_data_resource_formatter - enable_data_resource_formatter(cf.get_option(key)) + _enable_data_resource_formatter(cf.get_option(key)) def is_terminal() -> bool: @@ -328,7 +327,7 @@ def is_terminal() -> bool: """ try: # error: Name 'get_ipython' is not defined - ip = get_ipython() # type: ignore[name-defined] + ip = get_ipython() # type: ignore except NameError: # assume standard Python interpreter in a terminal return True else: @@ -581,13 +580,6 @@ with cf.config_prefix("io.excel.xls"): writer_engine_doc.format(ext="xls", others=", ".join(_xls_options)), validator=str, ) -cf.deprecate_option( - "io.excel.xls.writer", - msg="As the xlwt package is no longer maintained, the xlwt engine will be " - "removed in a future version of pandas. This is the only engine in pandas that " - "supports writing in the xls format. Install openpyxl and write to an " - "xlsx file instead.", -) with cf.config_prefix("io.excel.xlsm"): cf.register_option( diff --git a/venv/lib/python3.8/site-packages/pandas/core/construction.py b/venv/lib/python3.8/site-packages/pandas/core/construction.py index 96cf1be..d1c174d 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/construction.py +++ b/venv/lib/python3.8/site-packages/pandas/core/construction.py @@ -4,7 +4,6 @@ and Index.__new__. These should not depend on core.internals. """ -from __future__ import annotations from collections import abc from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast @@ -37,7 +36,6 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, is_sparse, - is_string_dtype, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ( @@ -51,14 +49,16 @@ from pandas.core.dtypes.missing import isna import pandas.core.common as com if TYPE_CHECKING: - from pandas import ExtensionArray, Index, Series + from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.indexes.api import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 def array( data: Union[Sequence[object], AnyArrayLike], dtype: Optional[Dtype] = None, copy: bool = True, -) -> ExtensionArray: +) -> "ExtensionArray": """ Create an array. @@ -102,7 +102,6 @@ def array( :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` - :class:`float` :class:`pandas.arrays.FloatingArray` :class:`str` :class:`pandas.arrays.StringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== @@ -115,11 +114,6 @@ def array( string dtype for string data, and nullable-boolean dtype for boolean data. - .. versionchanged:: 1.2.0 - - Pandas now also infers nullable-floating dtype for float-like - input data - copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -211,11 +205,6 @@ def array( [1, 2, ] Length: 3, dtype: Int64 - >>> pd.array([1.1, 2.2]) - - [1.1, 2.2] - Length: 2, dtype: Float64 - >>> pd.array(["a", None, "c"]) ['a', , 'c'] @@ -242,10 +231,10 @@ def array( If pandas does not infer a dedicated extension type a :class:`arrays.PandasArray` is returned. 
- >>> pd.array([1 + 1j, 3 + 2j]) + >>> pd.array([1.1, 2.2]) - [(1+1j), (3+2j)] - Length: 2, dtype: complex128 + [1.1, 2.2] + Length: 2, dtype: float64 As mentioned in the "Notes" section, new extension types may be added in the future (by pandas or 3rd party libraries), causing the return @@ -269,7 +258,6 @@ def array( from pandas.core.arrays import ( BooleanArray, DatetimeArray, - FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -332,9 +320,6 @@ def array( elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) - elif inferred_dtype in ("floating", "mixed-integer-float"): - return FloatingArray._from_sequence(data, copy=copy) - elif inferred_dtype == "boolean": return BooleanArray._from_sequence(data, copy=copy) @@ -351,7 +336,7 @@ def array( return result -def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayLike]: +def extract_array(obj, extract_numpy: bool = False): """ Extract the ndarray or ExtensionArray from a Series or Index. @@ -402,27 +387,9 @@ def extract_array(obj: object, extract_numpy: bool = False) -> Union[Any, ArrayL return obj -def ensure_wrapped_if_datetimelike(arr): - """ - Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. - """ - if isinstance(arr, np.ndarray): - if arr.dtype.kind == "M": - from pandas.core.arrays import DatetimeArray - - return DatetimeArray._from_sequence(arr) - - elif arr.dtype.kind == "m": - from pandas.core.arrays import TimedeltaArray - - return TimedeltaArray._from_sequence(arr) - - return arr - - def sanitize_array( data, - index: Optional[Index], + index: Optional["Index"], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, @@ -504,7 +471,7 @@ def sanitize_array( # figure out the dtype from the value (upcast if necessary) if dtype is None: - dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True) + dtype, value = infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) @@ -526,7 +493,7 @@ def sanitize_array( elif subarr.ndim > 1: if isinstance(data, np.ndarray): - raise ValueError("Data must be 1-dimensional") + raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) @@ -542,8 +509,7 @@ def sanitize_array( data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype) - if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: + if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) @@ -551,7 +517,9 @@ def sanitize_array( return subarr -def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): +def _try_cast( + arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool, +): """ Convert input to numpy ndarray and optionally cast to a given dtype. @@ -631,13 +599,13 @@ def is_empty_data(data: Any) -> bool: def create_series_with_explicit_dtype( data: Any = None, - index: Optional[Union[ArrayLike, Index]] = None, + index: Optional[Union[ArrayLike, "Index"]] = None, dtype: Optional[Dtype] = None, name: Optional[str] = None, copy: bool = False, fastpath: bool = False, dtype_if_empty: Dtype = object, -) -> Series: +) -> "Series": """ Helper to pass an explicit dtype when instantiating an empty Series. 
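The construction.py hunks above center on pd.array dtype inference (the float case is exactly what differs between the two versions of the file). The integer and string cases shown in the docstring are stable across both; a quick illustration, with hypothetical variable names:

import pandas as pd

ints = pd.array([1, 2, None])
assert str(ints.dtype) == "Int64"      # nullable integer extension dtype

strs = pd.array(["a", None, "c"])
assert str(strs.dtype) == "string"     # StringArray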
diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/base.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/base.py index c2be81c..07c7387 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/base.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/base.py @@ -12,18 +12,19 @@ from pandas.errors import AbstractMethodError from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries if TYPE_CHECKING: - from pandas.core.arrays import ExtensionArray + from pandas.core.arrays import ExtensionArray # noqa: F401 class ExtensionDtype: """ A custom data type, to be paired with an ExtensionArray. + .. versionadded:: 0.23.0 + See Also -------- - extensions.register_extension_dtype: Register an ExtensionType - with pandas as class decorator. - extensions.ExtensionArray: Abstract base class for custom 1-D array types. + extensions.register_extension_dtype + extensions.ExtensionArray Notes ----- @@ -99,8 +100,9 @@ class ExtensionDtype: By default, 'other' is considered equal if either * it's a string matching 'self.name'. - * it's an instance of this type and all of the attributes - in ``self._metadata`` are equal between `self` and `other`. + * it's an instance of this type and all of the + the attributes in ``self._metadata`` are equal between + `self` and `other`. Parameters ---------- diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/cast.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/cast.py index c77991c..bdf294a 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/cast.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/cast.py @@ -2,21 +2,8 @@ Routines for casting. """ -from contextlib import suppress from datetime import date, datetime, timedelta -from typing import ( - TYPE_CHECKING, - Any, - Dict, - List, - Optional, - Sequence, - Set, - Sized, - Tuple, - Type, - Union, -) +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type import numpy as np @@ -27,19 +14,17 @@ from pandas._libs.tslibs import ( Period, Timedelta, Timestamp, - conversion, iNaT, ints_to_pydatetime, - ints_to_pytimedelta, ) from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import AnyArrayLike, ArrayLike, Dtype, DtypeObj, Scalar +from pandas._typing import ArrayLike, Dtype, DtypeObj from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( + _POSSIBLY_CAST_DTYPES, DT64NS_DTYPE, INT64_DTYPE, - POSSIBLY_CAST_DTYPES, TD64NS_DTYPE, ensure_int8, ensure_int16, @@ -88,17 +73,11 @@ from pandas.core.dtypes.generic import ( ABCSeries, ) from pandas.core.dtypes.inference import is_list_like -from pandas.core.dtypes.missing import ( - is_valid_nat_for_dtype, - isna, - na_value_for_dtype, - notna, -) +from pandas.core.dtypes.missing import isna, notna if TYPE_CHECKING: from pandas import Series - from pandas.core.arrays import ExtensionArray - from pandas.core.indexes.base import Index + from pandas.core.arrays import ExtensionArray # noqa: F401 _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -134,31 +113,7 @@ def is_nested_object(obj) -> bool: return False -def maybe_box_datetimelike(value: Scalar, dtype: Optional[Dtype] = None) -> Scalar: - """ - Cast scalar to Timestamp or Timedelta if scalar is datetime-like - and dtype is not object. 
- - Parameters - ---------- - value : scalar - dtype : Dtype, optional - - Returns - ------- - scalar - """ - if dtype == object: - pass - elif isinstance(value, (np.datetime64, datetime)): - value = tslibs.Timestamp(value) - elif isinstance(value, (np.timedelta64, timedelta)): - value = tslibs.Timedelta(value) - - return value - - -def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): +def maybe_downcast_to_dtype(result, dtype): """ try to cast to the specified dtype (e.g. convert back to bool/int or could be an astype of float64->float32 @@ -194,20 +149,12 @@ def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): dtype = np.dtype(dtype) - elif dtype.type is Period: - from pandas.core.arrays import PeriodArray - - with suppress(TypeError): - # e.g. TypeError: int() argument must be a string, a - # bytes-like object or a number, not 'Period - return PeriodArray(result, freq=dtype.freq) - converted = maybe_downcast_numeric(result, dtype, do_round) if converted is not result: return converted # a datetimelike - # GH12821, iNaT is cast to float + # GH12821, iNaT is casted to float if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]: if hasattr(dtype, "tz"): # not a numpy dtype @@ -220,10 +167,21 @@ def maybe_downcast_to_dtype(result, dtype: Union[str, np.dtype]): else: result = result.astype(dtype) + elif dtype.type is Period: + # TODO(DatetimeArray): merge with previous elif + from pandas.core.arrays import PeriodArray + + try: + return PeriodArray(result, freq=dtype.freq) + except TypeError: + # e.g. TypeError: int() argument must be a string, a + # bytes-like object or a number, not 'Period + pass + return result -def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False): +def maybe_downcast_numeric(result, dtype, do_round: bool = False): """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. @@ -296,9 +254,7 @@ def maybe_downcast_numeric(result, dtype: DtypeObj, do_round: bool = False): return result -def maybe_cast_result( - result: ArrayLike, obj: "Series", numeric_only: bool = False, how: str = "" -) -> ArrayLike: +def maybe_cast_result(result, obj: "Series", numeric_only: bool = False, how: str = ""): """ Try casting result to a different type if appropriate @@ -318,23 +274,25 @@ def maybe_cast_result( result : array-like result maybe casted to the dtype. 
""" - dtype = obj.dtype + if obj.ndim > 1: + dtype = obj._values.dtype + else: + dtype = obj.dtype dtype = maybe_cast_result_dtype(dtype, how) - assert not is_scalar(result) + if not is_scalar(result): + if ( + is_extension_array_dtype(dtype) + and not is_categorical_dtype(dtype) + and dtype.kind != "M" + ): + # We have to special case categorical so as not to upcast + # things like counts back to categorical + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) - if ( - is_extension_array_dtype(dtype) - and not is_categorical_dtype(dtype) - and dtype.kind != "M" - ): - # We have to special case categorical so as not to upcast - # things like counts back to categorical - cls = dtype.construct_array_type() - result = maybe_cast_to_extension_array(cls, result, dtype=dtype) - - elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: - result = maybe_downcast_to_dtype(result, dtype) + elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: + result = maybe_downcast_to_dtype(result, dtype) return result @@ -357,24 +315,16 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: The desired dtype of the result. """ from pandas.core.arrays.boolean import BooleanDtype - from pandas.core.arrays.floating import Float64Dtype - from pandas.core.arrays.integer import Int64Dtype, _IntegerDtype + from pandas.core.arrays.integer import Int64Dtype - if how in ["add", "cumsum", "sum", "prod"]: - if dtype == np.dtype(bool): - return np.dtype(np.int64) - elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): - return Int64Dtype() - elif how in ["mean", "median", "var"] and isinstance( - dtype, (BooleanDtype, _IntegerDtype) - ): - return Float64Dtype() + if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)): + return np.dtype(np.int64) + elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): + return Int64Dtype() return dtype -def maybe_cast_to_extension_array( - cls: Type["ExtensionArray"], obj: ArrayLike, dtype: Optional[ExtensionDtype] = None -) -> ArrayLike: +def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None): """ Call to `_from_sequence` that returns the object unchanged on Exception. @@ -390,17 +340,13 @@ def maybe_cast_to_extension_array( ExtensionArray or obj """ from pandas.core.arrays.string_ import StringArray - from pandas.core.arrays.string_arrow import ArrowStringArray assert isinstance(cls, type), f"must pass a type: {cls}" assertion_msg = f"must pass a subclass of ExtensionArray: {cls}" assert issubclass(cls, ABCExtensionArray), assertion_msg - # Everything can be converted to StringArrays, but we may not want to convert - if ( - issubclass(cls, (StringArray, ArrowStringArray)) - and lib.infer_dtype(obj) != "string" - ): + # Everything can be be converted to StringArrays, but we may not want to convert + if issubclass(cls, StringArray) and lib.infer_dtype(obj) != "string": return obj try: @@ -411,9 +357,7 @@ def maybe_cast_to_extension_array( return result -def maybe_upcast_putmask( - result: np.ndarray, mask: np.ndarray, other: Scalar -) -> Tuple[np.ndarray, bool]: +def maybe_upcast_putmask(result: np.ndarray, mask: np.ndarray, other): """ A safe version of putmask that potentially upcasts the result. 
@@ -457,9 +401,12 @@ def maybe_upcast_putmask( # NaN -> NaT # integer or integer array -> date-like array if result.dtype.kind in ["m", "M"]: - if isna(other): - other = result.dtype.type("nat") - elif is_integer(other): + if is_scalar(other): + if isna(other): + other = result.dtype.type("nat") + elif is_integer(other): + other = np.array(other, dtype=result.dtype) + elif is_integer_dtype(other): other = np.array(other, dtype=result.dtype) def changeit(): @@ -492,53 +439,6 @@ def maybe_upcast_putmask( return result, False -def maybe_casted_values( - index: "Index", codes: Optional[np.ndarray] = None -) -> ArrayLike: - """ - Convert an index, given directly or as a pair (level, code), to a 1D array. - - Parameters - ---------- - index : Index - codes : np.ndarray[intp] or None, default None - - Returns - ------- - ExtensionArray or ndarray - If codes is `None`, the values of `index`. - If codes is passed, an array obtained by taking from `index` the indices - contained in `codes`. - """ - - values = index._values - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) - - # if we have the codes, extract the values with a mask - if codes is not None: - mask: np.ndarray = codes == -1 - - if mask.size > 0 and mask.all(): - # we can have situations where the whole mask is -1, - # meaning there is nothing found in codes, so make all nan's - - dtype = index.dtype - fill_value = na_value_for_dtype(dtype) - values = construct_1d_arraylike_from_scalar(fill_value, len(mask), dtype) - - else: - values = values.take(codes) - - if mask.any(): - if isinstance(values, np.ndarray): - values, _ = maybe_upcast_putmask(values, mask, np.nan) - else: - values[mask] = np.nan - - return values - - def maybe_promote(dtype, fill_value=np.nan): """ Find the minimal dtype that can hold both the given dtype and fill_value. @@ -587,7 +487,7 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) else: try: - fill_value = Timestamp(fill_value).to_datetime64() + fill_value = tslibs.Timestamp(fill_value).to_datetime64() except (TypeError, ValueError): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.timedelta64): @@ -600,7 +500,7 @@ def maybe_promote(dtype, fill_value=np.nan): dtype = np.dtype(np.object_) else: try: - fv = Timedelta(fill_value) + fv = tslibs.Timedelta(fill_value) except ValueError: dtype = np.dtype(np.object_) else: @@ -695,7 +595,7 @@ def maybe_promote(dtype, fill_value=np.nan): return dtype, fill_value -def _ensure_dtype_type(value, dtype: DtypeObj): +def _ensure_dtype_type(value, dtype): """ Ensure that the given value is an instance of the given dtype. 
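maybe_promote, touched in the hunk above, finds the smallest dtype that can hold both an existing dtype and a fill value. A user-visible instance of that promotion (illustrative; any NaN-introducing operation would do):

import pandas as pd

s = pd.Series([1, 2, 3])        # int64
r = s.reindex([0, 1, 2, 3])     # label 3 is missing, so NaN must be filled in
assert r.dtype == "float64"     # int64 cannot hold NaN, promoted to float64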
@@ -751,7 +651,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, If False, scalar belongs to pandas extension types is inferred as object """ - dtype: DtypeObj = np.dtype(object) + dtype = np.dtype(object) # a 1-element ndarray if isinstance(val, np.ndarray): @@ -773,8 +673,8 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, dtype = np.dtype(object) elif isinstance(val, (np.datetime64, datetime)): - val = Timestamp(val) - if val is NaT or val.tz is None: + val = tslibs.Timestamp(val) + if val is tslibs.NaT or val.tz is None: dtype = np.dtype("M8[ns]") else: if pandas_dtype: @@ -785,7 +685,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, val = val.value elif isinstance(val, (np.timedelta64, timedelta)): - val = Timedelta(val).value + val = tslibs.Timedelta(val).value dtype = np.dtype("m8[ns]") elif is_bool(val): @@ -814,6 +714,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, elif pandas_dtype: if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) + val = val.ordinal elif lib.is_interval(val): subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] dtype = IntervalDtype(subtype=subtype) @@ -821,25 +722,8 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, return dtype, val -def dict_compat(d: Dict[Scalar, Scalar]) -> Dict[Scalar, Scalar]: - """ - Convert datetimelike-keyed dicts to a Timestamp-keyed dict. - - Parameters - ---------- - d: dict-like object - - Returns - ------- - dict - - """ - return {maybe_box_datetimelike(key): value for key, value in d.items()} - - -def infer_dtype_from_array( - arr, pandas_dtype: bool = False -) -> Tuple[DtypeObj, ArrayLike]: +# TODO: try to make the Any in the return annotation more specific +def infer_dtype_from_array(arr, pandas_dtype: bool = False) -> Tuple[DtypeObj, Any]: """ Infer the dtype from an array. @@ -927,12 +811,7 @@ def maybe_infer_dtype_type(element): return tipo -def maybe_upcast( - values: ArrayLike, - fill_value: Scalar = np.nan, - dtype: Dtype = None, - copy: bool = False, -) -> Tuple[ArrayLike, Scalar]: +def maybe_upcast(values, fill_value=np.nan, dtype=None, copy: bool = False): """ Provide explicit type promotion and coercion. @@ -944,13 +823,6 @@ def maybe_upcast( dtype : if None, then use the dtype of the values, else coerce to this type copy : bool, default True If True always make a copy even if no upcast is required. - - Returns - ------- - values: ndarray or ExtensionArray - the original array, possibly upcast - fill_value: - the fill value, possibly upcast """ if not is_scalar(fill_value) and not is_object_dtype(values.dtype): # We allow arbitrary fill values for object dtype @@ -971,7 +843,7 @@ def maybe_upcast( return values, fill_value -def invalidate_string_dtypes(dtype_set: Set[DtypeObj]): +def invalidate_string_dtypes(dtype_set): """ Change string like dtypes to object for ``DataFrame.select_dtypes()``. 
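The infer_dtype_from_scalar hunks above include the Period branch, which maps a Period scalar to a PeriodDtype when pandas extension types are allowed. The public construction path reflects this (illustrative sketch):

import pandas as pd

s = pd.Series([pd.Period("2021-01", freq="M"), pd.Period("2021-02", freq="M")])
assert isinstance(s.dtype, pd.PeriodDtype)   # period[M]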
@@ -993,9 +865,37 @@ def coerce_indexer_dtype(indexer, categories): return ensure_int64(indexer) -def astype_nansafe( - arr, dtype: DtypeObj, copy: bool = True, skipna: bool = False -) -> ArrayLike: +def coerce_to_dtypes(result, dtypes): + """ + given a dtypes and a result set, coerce the result elements to the + dtypes + """ + if len(result) != len(dtypes): + raise AssertionError("_coerce_to_dtypes requires equal len arrays") + + def conv(r, dtype): + if np.any(isna(r)): + pass + elif dtype == DT64NS_DTYPE: + r = tslibs.Timestamp(r) + elif dtype == TD64NS_DTYPE: + r = tslibs.Timedelta(r) + elif dtype == np.bool_: + # messy. non 0/1 integers do not get converted. + if is_integer(r) and r not in [0, 1]: + return int(r) + r = bool(r) + elif dtype.kind == "f": + r = float(r) + elif dtype.kind == "i": + r = int(r) + + return r + + return [conv(r, dtype) for r, dtype in zip(result, dtypes)] + + +def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): """ Cast the elements of an array to a given dtype a nan-safe manner. @@ -1042,7 +942,7 @@ def astype_nansafe( elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): - return ints_to_pytimedelta(arr.view(np.int64)) + return tslibs.ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -1099,37 +999,99 @@ def astype_nansafe( return arr.view(dtype) +def maybe_convert_objects(values: np.ndarray, convert_numeric: bool = True): + """ + If we have an object dtype array, try to coerce dates and/or numbers. + + Parameters + ---------- + values : ndarray + convert_numeric : bool, default True + + Returns + ------- + ndarray or DatetimeIndex + """ + validate_bool_kwarg(convert_numeric, "convert_numeric") + + orig_values = values + + # convert dates + if is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, convert_datetime=True) + + # convert timedeltas + if is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, convert_timedelta=True) + + # convert to numeric + if is_object_dtype(values.dtype): + if convert_numeric: + try: + new_values = lib.maybe_convert_numeric( + values, set(), coerce_numeric=True + ) + except (ValueError, TypeError): + pass + else: + # if we are all nans then leave me alone + if not isna(new_values).all(): + values = new_values + + else: + # soft-conversion + values = lib.maybe_convert_objects(values) + + if values is orig_values: + values = values.copy() + + return values + + def soft_convert_objects( values: np.ndarray, datetime: bool = True, numeric: bool = True, timedelta: bool = True, + coerce: bool = False, copy: bool = True, ): - """ - Try to coerce datetime, timedelta, and numeric object-dtype columns - to inferred dtype. 
- - Parameters - ---------- - values : np.ndarray[object] - datetime : bool, default True - numeric: bool, default True - timedelta : bool, default True - copy : bool, default True - - Returns - ------- - np.ndarray - """ + """ if we have an object dtype, try to coerce dates and/or numbers """ validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") validate_bool_kwarg(copy, "copy") conversion_count = sum((datetime, numeric, timedelta)) if conversion_count == 0: raise ValueError("At least one of datetime, numeric or timedelta must be True.") + elif conversion_count > 1 and coerce: + raise ValueError( + "Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True." + ) + + if not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + values = values.copy() if copy else values + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + # Immediate return if coerce + if datetime: + from pandas import to_datetime + + return to_datetime(values, errors="coerce").to_numpy() + elif timedelta: + from pandas import to_timedelta + + return to_timedelta(values, errors="coerce").to_numpy() + elif numeric: + from pandas import to_numeric + + return to_numeric(values, errors="coerce") # Soft conversions if datetime: @@ -1158,11 +1120,10 @@ def soft_convert_objects( def convert_dtypes( - input_array: AnyArrayLike, + input_array, convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, - convert_floating: bool = True, ) -> Dtype: """ Convert objects to best possible type, and optionally, @@ -1170,17 +1131,13 @@ def convert_dtypes( Parameters ---------- - input_array : ExtensionArray, Index, Series or np.ndarray + input_array : ExtensionArray or PandasArray convert_string : bool, default True Whether object dtypes should be converted to ``StringDtype()``. convert_integer : bool, default True Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. - convert_floating : bool, defaults True - Whether, if possible, conversion can be done to floating extension types. - If `convert_integer` is also True, preference will be give to integer - dtypes if the floats can be faithfully casted to integers. 
Returns ------- @@ -1188,9 +1145,7 @@ def convert_dtypes( new dtype """ is_extension = is_extension_array_dtype(input_array.dtype) - if ( - convert_string or convert_integer or convert_boolean or convert_floating - ) and not is_extension: + if (convert_string or convert_integer or convert_boolean) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: @@ -1204,11 +1159,9 @@ def convert_dtypes( target_int_dtype = "Int64" if is_integer_dtype(input_array.dtype): - from pandas.core.arrays.integer import INT_STR_TO_DTYPE + from pandas.core.arrays.integer import _dtypes - inferred_dtype = INT_STR_TO_DTYPE.get( - input_array.dtype.name, target_int_dtype - ) + inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( input_array.dtype ): @@ -1218,29 +1171,6 @@ def convert_dtypes( if is_integer_dtype(inferred_dtype): inferred_dtype = input_array.dtype - if convert_floating: - if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( - input_array.dtype - ): - from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE - - inferred_float_dtype = FLOAT_STR_TO_DTYPE.get( - input_array.dtype.name, "Float64" - ) - # if we could also convert to integer, check if all floats - # are actually integers - if convert_integer: - arr = input_array[notna(input_array)] - if (arr.astype(int) == arr).all(): - inferred_dtype = "Int64" - else: - inferred_dtype = inferred_float_dtype - else: - inferred_dtype = inferred_float_dtype - else: - if is_float_dtype(inferred_dtype): - inferred_dtype = input_array.dtype - if convert_boolean: if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" @@ -1254,11 +1184,9 @@ def convert_dtypes( return inferred_dtype -def maybe_castable(arr: np.ndarray) -> bool: +def maybe_castable(arr) -> bool: # return False to force a non-fastpath - assert isinstance(arr, np.ndarray) # GH 37024 - # check datetime64[ns]/timedelta64[ns] are valid # otherwise try to coerce kind = arr.dtype.kind @@ -1267,12 +1195,10 @@ def maybe_castable(arr: np.ndarray) -> bool: elif kind == "m": return is_timedelta64_ns_dtype(arr.dtype) - return arr.dtype.name not in POSSIBLY_CAST_DTYPES + return arr.dtype.name not in _POSSIBLY_CAST_DTYPES -def maybe_infer_to_datetimelike( - value: Union[ArrayLike, Scalar], convert_dates: bool = False -): +def maybe_infer_to_datetimelike(value, convert_dates: bool = False): """ we might have a array (or single object) that is datetime like, and no dtype is passed don't change the value unless we find a @@ -1294,6 +1220,9 @@ def maybe_infer_to_datetimelike( value, (ABCDatetimeIndex, ABCPeriodIndex, ABCDatetimeArray, ABCPeriodArray) ): return value + elif isinstance(value, ABCSeries): + if isinstance(value._values, ABCDatetimeIndex): + return value._values v = value @@ -1306,7 +1235,7 @@ def maybe_infer_to_datetimelike( return value shape = v.shape - if v.ndim != 1: + if not v.ndim == 1: v = v.ravel() if not len(v): @@ -1322,6 +1251,8 @@ def maybe_infer_to_datetimelike( # we might have a sequence of the same-datetimes with tz's # if so coerce to a DatetimeIndex; if they are not the same, # then these stay as object dtype, xref GH19671 + from pandas._libs.tslibs import conversion + from pandas import DatetimeIndex try: @@ -1376,7 +1307,7 @@ def maybe_infer_to_datetimelike( return value -def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): +def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): """ try to cast the array/value to a 
datetimelike dtype, converting float nan to iNaT @@ -1385,6 +1316,9 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): from pandas.core.tools.timedeltas import to_timedelta if dtype is not None: + if isinstance(dtype, str): + dtype = np.dtype(dtype) + is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) @@ -1397,21 +1331,18 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): f"Please pass in '{dtype.name}[ns]' instead." ) - if is_datetime64: - # unpack e.g. SparseDtype - dtype = getattr(dtype, "subtype", dtype) - if not is_dtype_equal(dtype, DT64NS_DTYPE): + if is_datetime64 and not is_dtype_equal( + getattr(dtype, "subtype", dtype), DT64NS_DTYPE + ): - # pandas supports dtype whose granularity is less than [ns] - # e.g., [ps], [fs], [as] - if dtype <= np.dtype("M8[ns]"): - if dtype.name == "datetime64": - raise ValueError(msg) - dtype = DT64NS_DTYPE - else: - raise TypeError( - f"cannot convert datetimelike to dtype [{dtype}]" - ) + # pandas supports dtype whose granularity is less than [ns] + # e.g., [ps], [fs], [as] + if dtype <= np.dtype("M8[ns]"): + if dtype.name == "datetime64": + raise ValueError(msg) + dtype = DT64NS_DTYPE + else: + raise TypeError(f"cannot convert datetimelike to dtype [{dtype}]") elif is_datetime64tz: # our NaT doesn't support tz's @@ -1445,7 +1376,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype): try: if is_datetime64: - value = to_datetime(value, errors="raise") + value = to_datetime(value, errors=errors) # GH 25843: Remove tz information since the dtype # didn't specify one if value.tz is not None: @@ -1457,7 +1388,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): # datetime64tz is assumed to be naive which should # be localized to the timezone. is_dt_string = is_string_dtype(value.dtype) - value = to_datetime(value, errors="raise").array + value = to_datetime(value, errors=errors).array if is_dt_string: # Strings here are naive, so directly localize value = value.tz_localize(dtype.tz) @@ -1466,7 +1397,7 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): # so localize and convert value = value.tz_localize("UTC").tz_convert(dtype.tz) elif is_timedelta64: - value = to_timedelta(value, errors="raise")._values + value = to_timedelta(value, errors=errors)._values except OutOfBoundsDatetime: raise except (AttributeError, ValueError, TypeError): @@ -1495,10 +1426,10 @@ def maybe_cast_to_datetime(value, dtype: Optional[DtypeObj]): dtype = value.dtype if dtype.kind == "M" and dtype != DT64NS_DTYPE: - value = conversion.ensure_datetime64ns(value) + value = tslibs.conversion.ensure_datetime64ns(value) elif dtype.kind == "m" and dtype != TD64NS_DTYPE: - value = conversion.ensure_timedelta64ns(value) + value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this @@ -1564,13 +1495,40 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: if has_bools: for t in types: if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): - return np.dtype("object") + return object return np.find_common_type(types, []) +def cast_scalar_to_array(shape, value, dtype: Optional[DtypeObj] = None) -> np.ndarray: + """ + Create np.ndarray of specified shape and dtype, filled with values. 
+ + Parameters + ---------- + shape : tuple + value : scalar value + dtype : np.dtype, optional + dtype to coerce + + Returns + ------- + ndarray of shape, filled with value, of specified / inferred dtype + + """ + if dtype is None: + dtype, fill_value = infer_dtype_from_scalar(value) + else: + fill_value = value + + values = np.empty(shape, dtype=dtype) + values.fill(fill_value) + + return values + + def construct_1d_arraylike_from_scalar( - value: Scalar, length: int, dtype: DtypeObj + value, length: int, dtype: DtypeObj ) -> ArrayLike: """ create a np.ndarray / pandas type of specified shape and dtype @@ -1599,14 +1557,9 @@ def construct_1d_arraylike_from_scalar( elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): # we need to coerce to object dtype to avoid # to allow numpy to take our string as a scalar value - dtype = np.dtype("object") + dtype = object if not isna(value): value = ensure_str(value) - elif dtype.kind in ["M", "m"] and is_valid_nat_for_dtype(value, dtype): - # GH36541: can't fill array directly with pd.NaT - # > np.empty(10, dtype="datetime64[64]").fill(pd.NaT) - # ValueError: cannot convert float NaN to integer - value = dtype.type("NaT", "ns") subarr = np.empty(length, dtype=dtype) subarr.fill(value) @@ -1614,7 +1567,7 @@ def construct_1d_arraylike_from_scalar( return subarr -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1640,7 +1593,7 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: def construct_1d_ndarray_preserving_na( - values: Sequence, dtype: Optional[DtypeObj] = None, copy: bool = False + values, dtype: Optional[DtypeObj] = None, copy: bool = False ) -> np.ndarray: """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. @@ -1674,7 +1627,7 @@ def construct_1d_ndarray_preserving_na( return subarr -def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): +def maybe_cast_to_integer_array(arr, dtype, copy: bool = False): """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -1716,8 +1669,6 @@ def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): ... ValueError: Trying to coerce float values to integers """ - assert is_integer_dtype(dtype) - try: if not hasattr(arr, "astype"): casted = np.array(arr, dtype=dtype, copy=copy) @@ -1742,11 +1693,11 @@ def maybe_cast_to_integer_array(arr, dtype: Dtype, copy: bool = False): if is_unsigned_integer_dtype(dtype) and (arr < 0).any(): raise OverflowError("Trying to coerce negative values to unsigned integers") - if is_float_dtype(arr) or is_object_dtype(arr): + if is_integer_dtype(dtype) and (is_float_dtype(arr) or is_object_dtype(arr)): raise ValueError("Trying to coerce float values to integers") -def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: +def convert_scalar_for_putitemlike(scalar, dtype: np.dtype): """ Convert datetimelike scalar if we are setting into a datetime64 or timedelta64 ndarray. @@ -1777,7 +1728,7 @@ def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: return scalar -def validate_numeric_casting(dtype: np.dtype, value: Scalar) -> None: +def validate_numeric_casting(dtype: np.dtype, value): """ Check that we can losslessly insert the given value into an array with the given dtype. 
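The cast_scalar_to_array helper reintroduced above allocates an array of the requested shape and fills it with the scalar, inferring a dtype from the scalar when none is given. A simplified sketch of that behaviour, with the dtype inference replaced by a plain numpy fallback:

    import numpy as np

    def sketch_cast_scalar_to_array(shape, value, dtype=None):
        if dtype is None:
            dtype = np.array(value).dtype   # crude stand-in for infer_dtype_from_scalar
        values = np.empty(shape, dtype=dtype)
        values.fill(value)
        return values

    print(sketch_cast_scalar_to_array((2, 3), 1.5))
    # [[1.5 1.5 1.5]
    #  [1.5 1.5 1.5]]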
diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/common.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/common.py index d8b0ad7..a2ca4d8 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/common.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/common.py @@ -9,7 +9,7 @@ import numpy as np from pandas._libs import Interval, Period, algos from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, DtypeObj, Optional +from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.base import registry from pandas.core.dtypes.dtypes import ( @@ -43,7 +43,7 @@ from pandas.core.dtypes.inference import ( # noqa:F401 is_sequence, ) -POSSIBLY_CAST_DTYPES = { +_POSSIBLY_CAST_DTYPES = { np.dtype(t).name for t in [ "O", @@ -83,12 +83,7 @@ def ensure_float(arr): float_arr : The original array cast to the float dtype if possible. Otherwise, the original array is returned. """ - if is_extension_array_dtype(arr.dtype): - if is_float_dtype(arr.dtype): - arr = arr.to_numpy(dtype=arr.dtype.numpy_dtype, na_value=np.nan) - else: - arr = arr.to_numpy(dtype="float64", na_value=np.nan) - elif issubclass(arr.dtype.type, (np.integer, np.bool_)): + if issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr @@ -113,7 +108,7 @@ def ensure_str(value: Union[bytes, Any]) -> str: return value -def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray: +def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: """ Ensure that an dtype array of some integer dtype has an int64 dtype if possible. @@ -141,13 +136,11 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray: """ # TODO: GH27506 potential bug with ExtensionArrays try: - # error: Unexpected keyword argument "casting" for "astype" - return arr.astype("int64", copy=copy, casting="safe") # type: ignore[call-arg] + return arr.astype("int64", copy=copy, casting="safe") # type: ignore except TypeError: pass try: - # error: Unexpected keyword argument "casting" for "astype" - return arr.astype("uint64", copy=copy, casting="safe") # type: ignore[call-arg] + return arr.astype("uint64", copy=copy, casting="safe") # type: ignore except TypeError: if is_extension_array_dtype(arr.dtype): return arr.to_numpy(dtype="float64", na_value=np.nan) @@ -640,8 +633,8 @@ def is_dtype_equal(source, target) -> bool: False """ try: - source = get_dtype(source) - target = get_dtype(target) + source = _get_dtype(source) + target = _get_dtype(target) return source == target except (TypeError, AttributeError): @@ -989,10 +982,10 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - tipo = get_dtype(arr_or_dtype) + tipo = _get_dtype(arr_or_dtype) except TypeError: if is_datetime64tz_dtype(arr_or_dtype): - tipo = get_dtype(arr_or_dtype.dtype) + tipo = _get_dtype(arr_or_dtype.dtype) else: return False return tipo == DT64NS_DTYPE or getattr(tipo, "base", None) == DT64NS_DTYPE @@ -1220,10 +1213,6 @@ def needs_i8_conversion(arr_or_dtype) -> bool: """ if arr_or_dtype is None: return False - if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): - # fastpath - dtype = arr_or_dtype - return dtype.kind in ["m", "M"] or dtype.type is Period return ( is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) @@ -1381,7 +1370,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - dtype = get_dtype(arr_or_dtype) + dtype = 
_get_dtype(arr_or_dtype) except TypeError: return False @@ -1397,7 +1386,8 @@ def is_bool_dtype(arr_or_dtype) -> bool: # guess this return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): - return getattr(dtype, "_is_boolean", False) + dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) + return dtype._is_boolean return issubclass(dtype.type, np.bool_) @@ -1566,13 +1556,13 @@ def _is_dtype(arr_or_dtype, condition) -> bool: if arr_or_dtype is None: return False try: - dtype = get_dtype(arr_or_dtype) + dtype = _get_dtype(arr_or_dtype) except (TypeError, ValueError, UnicodeEncodeError): return False return condition(dtype) -def get_dtype(arr_or_dtype) -> DtypeObj: +def _get_dtype(arr_or_dtype) -> DtypeObj: """ Get the dtype instance associated with an array or dtype object. @@ -1703,7 +1693,7 @@ def infer_dtype_from_object(dtype): try: return infer_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): - # Handles cases like get_dtype(int) i.e., + # Handles cases like _get_dtype(int) i.e., # Python objects that are valid dtypes # (unlike user-defined types, in general) # @@ -1727,7 +1717,7 @@ def _validate_date_like_dtype(dtype) -> None: ------ TypeError : The dtype could not be casted to a date-like dtype. ValueError : The dtype is an illegal date-like dtype (e.g. the - frequency provided is too specific) + the frequency provided is too specific) """ try: typ = np.datetime_data(dtype)[0] @@ -1740,32 +1730,6 @@ def _validate_date_like_dtype(dtype) -> None: ) -def validate_all_hashable(*args, error_name: Optional[str] = None) -> None: - """ - Return None if all args are hashable, else raise a TypeError. - - Parameters - ---------- - *args - Arguments to validate. - error_name : str, optional - The name to use if error - - Raises - ------ - TypeError : If an argument is not hashable - - Returns - ------- - None - """ - if not all(is_hashable(arg) for arg in args): - if error_name: - raise TypeError(f"{error_name} must be a hashable type") - else: - raise TypeError("All elements must be hashable") - - def pandas_dtype(dtype) -> DtypeObj: """ Convert input into a pandas only dtype object or a numpy dtype object. diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/concat.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/concat.py index a9355e3..dd00575 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/concat.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/concat.py @@ -1,7 +1,7 @@ """ Utility functions related to concat. """ -from typing import Set, cast +from typing import cast import numpy as np @@ -9,50 +9,59 @@ from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( + is_bool_dtype, is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, + is_object_dtype, is_sparse, + is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseArray -from pandas.core.construction import array, ensure_wrapped_if_datetimelike +from pandas.core.construction import array -def _get_dtype_kinds(arrays) -> Set[str]: +def get_dtype_kinds(l): """ Parameters ---------- - arrays : list of arrays + l : list of arrays Returns ------- - set[str] - A set of kinds that exist in this list of arrays. 
+ a set of kinds that exist in this list of arrays """ - typs: Set[str] = set() - for arr in arrays: - # Note: we use dtype.kind checks because they are much more performant - # than is_foo_dtype + typs = set() + for arr in l: dtype = arr.dtype - if not isinstance(dtype, np.dtype): - # ExtensionDtype so we get - # e.g. "categorical", "datetime64[ns, US/Central]", "Sparse[itn64, 0]" - typ = str(dtype) + if is_categorical_dtype(dtype): + typ = "category" + elif is_sparse(dtype): + typ = "sparse" elif isinstance(arr, ABCRangeIndex): typ = "range" - elif dtype.kind == "M": + elif is_datetime64tz_dtype(dtype): + # if to_concat contains different tz, + # the result must be object dtype + typ = str(dtype) + elif is_datetime64_dtype(dtype): typ = "datetime" - elif dtype.kind == "m": + elif is_timedelta64_dtype(dtype): typ = "timedelta" - elif dtype.kind in ["O", "b"]: - typ = str(dtype) # i.e. "object", "bool" + elif is_object_dtype(dtype): + typ = "object" + elif is_bool_dtype(dtype): + typ = "bool" + elif is_extension_array_dtype(dtype): + typ = str(dtype) else: typ = dtype.kind - typs.add(typ) return typs @@ -131,7 +140,7 @@ def concat_compat(to_concat, axis: int = 0): if non_empties and axis == 0: to_concat = non_empties - typs = _get_dtype_kinds(to_concat) + typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) all_empty = not len(non_empties) @@ -152,13 +161,13 @@ def concat_compat(to_concat, axis: int = 0): return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: - return _concat_datetime(to_concat, axis=axis) + return concat_datetime(to_concat, axis=axis, typs=typs) elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) - typs = _get_dtype_kinds(to_concat) + typs = get_dtype_kinds(to_concat) if len(typs) != 1: if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): @@ -296,13 +305,19 @@ def union_categoricals( raise TypeError("dtype of categories must be the same") ordered = False - if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]): + if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered - all_codes = [first._encode_with_my_categories(x)._codes for x in to_union] - new_codes = np.concatenate(all_codes) + if all(first.categories.equals(other.categories) for other in to_union[1:]): + new_codes = np.concatenate([c.codes for c in to_union]) + else: + codes = [first.codes] + [ + recode_for_categories(other.codes, other.categories, first.categories) + for other in to_union[1:] + ] + new_codes = np.concatenate(codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with ordered Categoricals") @@ -346,7 +361,7 @@ def _concatenate_2d(to_concat, axis: int): return np.concatenate(to_concat, axis=axis) -def _concat_datetime(to_concat, axis=0): +def concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a single M8[ns], datetime64[ns, tz] or m8[ns] dtype @@ -355,19 +370,21 @@ def _concat_datetime(to_concat, axis=0): ---------- to_concat : array of arrays axis : axis to provide concatenation + typs : set of to_concat dtypes Returns ------- a single array, preserving the combined dtypes """ - to_concat = [ensure_wrapped_if_datetimelike(x) for x in 
to_concat] + if typs is None: + typs = get_dtype_kinds(to_concat) + to_concat = [_wrap_datetimelike(x) for x in to_concat] single_dtype = len({x.dtype for x in to_concat}) == 1 # multiple types, need to coerce to object if not single_dtype: - # ensure_wrapped_if_datetimelike ensures that astype(object) wraps - # in Timestamp/Timedelta + # wrap_datetimelike ensures that astype(object) wraps in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) if axis == 1: @@ -381,3 +398,17 @@ def _concat_datetime(to_concat, axis=0): assert result.shape[0] == 1 result = result[0] return result + + +def _wrap_datetimelike(arr): + """ + Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray. + + DTA/TDA handle .astype(object) correctly. + """ + from pandas.core.construction import array as pd_array, extract_array + + arr = extract_array(arr, extract_numpy=True) + if isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]: + arr = pd_array(arr) + return arr diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py index 3c5421a..404f0b4 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/dtypes.py @@ -29,10 +29,14 @@ from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass from pandas.core.dtypes.inference import is_bool, is_list_like if TYPE_CHECKING: - import pyarrow + import pyarrow # noqa: F401 - from pandas import Categorical - from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray + from pandas import Categorical # noqa: F401 + from pandas.core.arrays import ( # noqa: F401 + DatetimeArray, + IntervalArray, + PeriodArray, + ) str_type = str @@ -47,13 +51,13 @@ class PandasExtensionDtype(ExtensionDtype): type: Any kind: Any # The Any type annotations above are here only because mypy seems to have a - # problem dealing with multiple inheritance from PandasExtensionDtype + # problem dealing with with multiple inheritance from PandasExtensionDtype # and ExtensionDtype's @properties in the subclasses below. The kind and # type variables in those subclasses are explicitly typed below. subdtype = None str: str_type num = 100 - shape: Tuple[int, ...] = () + shape: Tuple[int, ...] = tuple() itemsize = 8 base = None isbuiltin = 0 @@ -371,47 +375,27 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): # but same order is not necessary. There is no distinction between # ordered=False and ordered=None: CDT(., False) and CDT(., None) # will be equal if they have the same categories. - left = self.categories - right = other.categories - - # GH#36280 the ordering of checks here is for performance - if not left.dtype == right.dtype: - return False - - if len(left) != len(right): - return False - - if self.categories.equals(other.categories): + if ( + self.categories.dtype == other.categories.dtype + and self.categories.equals(other.categories) + ): # Check and see if they happen to be identical categories return True - - if left.dtype != object: - # Faster than calculating hash - indexer = left.get_indexer(right) - # Because left and right have the same length and are unique, - # `indexer` not having any -1s implies that there is a - # bijection between `left` and `right`. - return (indexer != -1).all() - - # With object-dtype we need a comparison that identifies - # e.g. 
int(2) as distinct from float(2) return hash(self) == hash(other) def __repr__(self) -> str_type: if self.categories is None: - data = "None" + data = "None, " else: data = self.categories._format_data(name=type(self).__name__) - if data is None: - # self.categories is RangeIndex - data = str(self.categories._range) - data = data.rstrip(", ") - return f"CategoricalDtype(categories={data}, ordered={self.ordered})" + return f"CategoricalDtype(categories={data}ordered={self.ordered})" @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: + from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype + from pandas.core.util.hashing import ( - combine_hash_arrays, + _combine_hash_arrays, hash_array, hash_tuples, ) @@ -432,9 +416,9 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): hashed = hash((tuple(categories), ordered)) return hashed - if DatetimeTZDtype.is_dtype(categories.dtype): + if is_datetime64tz_dtype(categories.dtype): # Avoid future warning. - categories = categories.astype("datetime64[ns]") + categories = categories.astype(DT64NS_DTYPE) cat_array = hash_array(np.asarray(categories), categorize=False) if ordered: @@ -443,7 +427,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): ) else: cat_array = [cat_array] - hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) + hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(hashed) @classmethod @@ -455,7 +439,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): ------- type """ - from pandas import Categorical + from pandas import Categorical # noqa: F811 return Categorical @@ -651,8 +635,7 @@ class DatetimeTZDtype(PandasExtensionDtype): def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None): if isinstance(unit, DatetimeTZDtype): - # error: "str" has no attribute "tz" - unit, tz = unit.unit, unit.tz # type: ignore[attr-defined] + unit, tz = unit.unit, unit.tz # type: ignore if unit != "ns": if isinstance(unit, str) and tz is None: @@ -704,7 +687,7 @@ class DatetimeTZDtype(PandasExtensionDtype): ------- type """ - from pandas.core.arrays import DatetimeArray + from pandas.core.arrays import DatetimeArray # noqa: F811 return DatetimeArray @@ -960,7 +943,7 @@ class PeriodDtype(dtypes.PeriodDtypeBase, PandasExtensionDtype): """ Construct PeriodArray from pyarrow Array/ChunkedArray. """ - import pyarrow + import pyarrow # noqa: F811 from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1016,7 +999,11 @@ class IntervalDtype(PandasExtensionDtype): _cache: Dict[str_type, PandasExtensionDtype] = {} def __new__(cls, subtype=None): - from pandas.core.dtypes.common import is_string_dtype, pandas_dtype + from pandas.core.dtypes.common import ( + is_categorical_dtype, + is_string_dtype, + pandas_dtype, + ) if isinstance(subtype, IntervalDtype): return subtype @@ -1039,7 +1026,7 @@ class IntervalDtype(PandasExtensionDtype): except TypeError as err: raise TypeError("could not construct IntervalDtype") from err - if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype): + if is_categorical_dtype(subtype) or is_string_dtype(subtype): # GH 19016 msg = ( "category, object, and string subtypes are not supported " @@ -1154,7 +1141,7 @@ class IntervalDtype(PandasExtensionDtype): """ Construct IntervalArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow + import pyarrow # noqa: F811 from pandas.core.arrays import IntervalArray diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/generic.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/generic.py index dfbbaa9..36eff21 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/generic.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/generic.py @@ -1,24 +1,4 @@ """ define generic base classes for pandas objects """ -from __future__ import annotations - -from typing import TYPE_CHECKING, Type, cast - -if TYPE_CHECKING: - from pandas import ( - CategoricalIndex, - DataFrame, - DatetimeIndex, - Float64Index, - Int64Index, - IntervalIndex, - MultiIndex, - PeriodIndex, - RangeIndex, - Series, - TimedeltaIndex, - UInt64Index, - ) - from pandas.core.generic import NDFrame # define abstract base classes to enable isinstance type checking on our @@ -27,54 +7,33 @@ def create_pandas_abc_type(name, attr, comp): # https://github.com/python/mypy/issues/1006 # error: 'classmethod' used with a non-method - @classmethod # type: ignore[misc] + @classmethod # type: ignore def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp - dct = {"__instancecheck__": _check, "__subclasscheck__": _check} + dct = dict(__instancecheck__=_check, __subclasscheck__=_check) meta = type("ABCBase", (type,), dct) - return meta(name, (), dct) + return meta(name, tuple(), dct) -ABCInt64Index = cast( - "Type[Int64Index]", - create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)), +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",)) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",)) +ABCUInt64Index = create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)) +ABCRangeIndex = create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)) +ABCDatetimeIndex = create_pandas_abc_type( + "ABCDatetimeIndex", "_typ", ("datetimeindex",) ) -ABCUInt64Index = cast( - "Type[UInt64Index]", - create_pandas_abc_type("ABCUInt64Index", "_typ", ("uint64index",)), +ABCTimedeltaIndex = create_pandas_abc_type( + "ABCTimedeltaIndex", "_typ", ("timedeltaindex",) ) -ABCRangeIndex = cast( - "Type[RangeIndex]", - create_pandas_abc_type("ABCRangeIndex", "_typ", ("rangeindex",)), +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) +ABCCategoricalIndex = create_pandas_abc_type( + "ABCCategoricalIndex", "_typ", ("categoricalindex",) ) -ABCFloat64Index = cast( - "Type[Float64Index]", - create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",)), -) -ABCMultiIndex = cast( - "Type[MultiIndex]", - create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",)), -) -ABCDatetimeIndex = cast( - "Type[DatetimeIndex]", - create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",)), -) -ABCTimedeltaIndex = cast( - "Type[TimedeltaIndex]", - create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)), -) -ABCPeriodIndex = cast( - "Type[PeriodIndex]", - create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)), -) -ABCCategoricalIndex = cast( - "Type[CategoricalIndex]", - create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)), -) -ABCIntervalIndex = cast( - "Type[IntervalIndex]", - create_pandas_abc_type("ABCIntervalIndex", "_typ", ("intervalindex",)), +ABCIntervalIndex = 
create_pandas_abc_type( + "ABCIntervalIndex", "_typ", ("intervalindex",) ) ABCIndexClass = create_pandas_abc_type( "ABCIndexClass", @@ -94,17 +53,8 @@ ABCIndexClass = create_pandas_abc_type( }, ) -ABCNDFrame = cast( - "Type[NDFrame]", - create_pandas_abc_type("ABCNDFrame", "_typ", ("series", "dataframe")), -) -ABCSeries = cast( - "Type[Series]", - create_pandas_abc_type("ABCSeries", "_typ", ("series",)), -) -ABCDataFrame = cast( - "Type[DataFrame]", create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) -) +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", ("categorical")) ABCDatetimeArray = create_pandas_abc_type("ABCDatetimeArray", "_typ", ("datetimearray")) diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/inference.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/inference.py index 329c444..d1607b5 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/inference.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/inference.py @@ -68,7 +68,7 @@ def is_number(obj) -> bool: return isinstance(obj, (Number, np.number)) -def iterable_not_string(obj) -> bool: +def _iterable_not_string(obj) -> bool: """ Check if the object is an iterable but not a string. @@ -83,11 +83,11 @@ def iterable_not_string(obj) -> bool: Examples -------- - >>> iterable_not_string([1, 2, 3]) + >>> _iterable_not_string([1, 2, 3]) True - >>> iterable_not_string("foo") + >>> _iterable_not_string("foo") False - >>> iterable_not_string(1) + >>> _iterable_not_string(1) False """ return isinstance(obj, abc.Iterable) and not isinstance(obj, str) diff --git a/venv/lib/python3.8/site-packages/pandas/core/dtypes/missing.py b/venv/lib/python3.8/site-packages/pandas/core/dtypes/missing.py index 0b4aab0..8551ce9 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/dtypes/missing.py +++ b/venv/lib/python3.8/site-packages/pandas/core/dtypes/missing.py @@ -9,8 +9,8 @@ from pandas._config import get_option from pandas._libs import lib import pandas._libs.missing as libmissing -from pandas._libs.tslibs import NaT, Period, iNaT -from pandas._typing import ArrayLike, DtypeObj +from pandas._libs.tslibs import NaT, iNaT +from pandas._typing import DtypeObj from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -43,9 +43,6 @@ from pandas.core.dtypes.inference import is_list_like isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar -nan_checker = np.isnan -INF_AS_NA = False - def isna(obj): """ @@ -191,12 +188,6 @@ def _use_inf_as_na(key): """ inf_as_na = get_option(key) globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na) - if inf_as_na: - globals()["nan_checker"] = lambda x: ~np.isfinite(x) - globals()["INF_AS_NA"] = True - else: - globals()["nan_checker"] = np.isnan - globals()["INF_AS_NA"] = False def _isna_ndarraylike(obj, inf_as_na: bool = False): @@ -347,7 +338,7 @@ def notna(obj): notnull = notna -def isna_compat(arr, fill_value=np.nan) -> bool: +def _isna_compat(arr, fill_value=np.nan) -> bool: """ Parameters ---------- @@ -493,19 +484,7 @@ def _array_equivalent_object(left, right, strict_nan): return True -def array_equals(left: ArrayLike, right: ArrayLike) -> bool: - """ - ExtensionArray-compatible implementation of array_equivalent. 
- """ - if not is_dtype_equal(left.dtype, right.dtype): - return False - elif isinstance(left, ABCExtensionArray): - return left.equals(right) - else: - return array_equivalent(left, right, dtype_equal=True) - - -def infer_fill_value(val): +def _infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -525,11 +504,11 @@ def infer_fill_value(val): return np.nan -def maybe_fill(arr, fill_value=np.nan): +def _maybe_fill(arr, fill_value=np.nan): """ if we have a compatible fill_value and arr dtype, then fill """ - if isna_compat(arr, fill_value): + if _isna_compat(arr, fill_value): arr.fill(fill_value) return arr @@ -608,37 +587,6 @@ def is_valid_nat_for_dtype(obj, dtype: DtypeObj) -> bool: return not isinstance(obj, np.timedelta64) if dtype.kind == "m": return not isinstance(obj, np.datetime64) - if dtype.kind in ["i", "u", "f", "c"]: - # Numeric - return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) # must be PeriodDType return not isinstance(obj, (np.datetime64, np.timedelta64)) - - -def isna_all(arr: ArrayLike) -> bool: - """ - Optimized equivalent to isna(arr).all() - """ - total_len = len(arr) - - # Usually it's enough to check but a small fraction of values to see if - # a block is NOT null, chunks should help in such cases. - # parameters 1000 and 40 were chosen arbitrarily - chunk_len = max(total_len // 40, 1000) - - dtype = arr.dtype - if dtype.kind == "f": - checker = nan_checker - - elif dtype.kind in ["m", "M"] or dtype.type is Period: - checker = lambda x: np.asarray(x.view("i8")) == iNaT - - else: - checker = lambda x: _isna_ndarraylike(x, inf_as_na=INF_AS_NA) - - for i in range(0, total_len, chunk_len): - if not checker(arr[i : i + chunk_len]).all(): - return False - - return True diff --git a/venv/lib/python3.8/site-packages/pandas/core/flags.py b/venv/lib/python3.8/site-packages/pandas/core/flags.py deleted file mode 100644 index 6a09bfa..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/flags.py +++ /dev/null @@ -1,113 +0,0 @@ -import weakref - - -class Flags: - """ - Flags that apply to pandas objects. - - .. versionadded:: 1.2.0 - - Parameters - ---------- - obj : Series or DataFrame - The object these flags are associated with. - allows_duplicate_labels : bool, default True - Whether to allow duplicate labels in this object. By default, - duplicate labels are permitted. Setting this to ``False`` will - cause an :class:`errors.DuplicateLabelError` to be raised when - `index` (or columns for DataFrame) is not unique, or any - subsequent operation on introduces duplicates. - See :ref:`duplicates.disallow` for more. - - .. warning:: - - This is an experimental feature. Currently, many methods fail to - propagate the ``allows_duplicate_labels`` value. In future versions - it is expected that every method taking or returning one or more - DataFrame or Series objects will propagate ``allows_duplicate_labels``. - - Notes - ----- - Attributes can be set in two ways - - >>> df = pd.DataFrame() - >>> df.flags - - >>> df.flags.allows_duplicate_labels = False - >>> df.flags - - - >>> df.flags['allows_duplicate_labels'] = True - >>> df.flags - - """ - - _keys = {"allows_duplicate_labels"} - - def __init__(self, obj, *, allows_duplicate_labels): - self._allows_duplicate_labels = allows_duplicate_labels - self._obj = weakref.ref(obj) - - @property - def allows_duplicate_labels(self) -> bool: - """ - Whether this object allows duplicate labels. 
- - Setting ``allows_duplicate_labels=False`` ensures that the - index (and columns of a DataFrame) are unique. Most methods - that accept and return a Series or DataFrame will propagate - the value of ``allows_duplicate_labels``. - - See :ref:`duplicates` for more. - - See Also - -------- - DataFrame.attrs : Set global metadata on this object. - DataFrame.set_flags : Set global flags on this object. - - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) - >>> df.allows_duplicate_labels - True - >>> df.allows_duplicate_labels = False - Traceback (most recent call last): - ... - pandas.errors.DuplicateLabelError: Index has duplicates. - positions - label - a [0, 1] - """ - return self._allows_duplicate_labels - - @allows_duplicate_labels.setter - def allows_duplicate_labels(self, value: bool): - value = bool(value) - obj = self._obj() - if obj is None: - raise ValueError("This flag's object has been deleted.") - - if not value: - for ax in obj.axes: - ax._maybe_check_unique() - - self._allows_duplicate_labels = value - - def __getitem__(self, key): - if key not in self._keys: - raise KeyError(key) - - return getattr(self, key) - - def __setitem__(self, key, value): - if key not in self._keys: - raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}") - setattr(self, key, value) - - def __repr__(self): - return f"" - - def __eq__(self, other): - if isinstance(other, type(self)): - return self.allows_duplicate_labels == other.allows_duplicate_labels - return False diff --git a/venv/lib/python3.8/site-packages/pandas/core/frame.py b/venv/lib/python3.8/site-packages/pandas/core/frame.py index 396108b..0b2c99e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/frame.py +++ b/venv/lib/python3.8/site-packages/pandas/core/frame.py @@ -8,14 +8,12 @@ Similar to its R counterpart, data.frame, except providing automatic data alignment and a host of useful data manipulation methods having to do with the labeling information """ -from __future__ import annotations import collections from collections import abc import datetime from io import StringIO import itertools -import mmap from textwrap import dedent from typing import ( IO, @@ -28,6 +26,7 @@ from typing import ( Iterable, Iterator, List, + Mapping, Optional, Sequence, Set, @@ -35,7 +34,6 @@ from typing import ( Type, Union, cast, - overload, ) import warnings @@ -47,11 +45,9 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib, properties from pandas._libs.lib import no_default from pandas._typing import ( - AggFuncType, ArrayLike, Axes, Axis, - CompressionOptions, Dtype, FilePathOrBuffer, FrameOrSeriesUnion, @@ -59,9 +55,9 @@ from pandas._typing import ( Label, Level, Renamer, - StorageOptions, ValueKeyFunc, ) +from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -78,17 +74,18 @@ from pandas.util._validators import ( ) from pandas.core.dtypes.cast import ( + cast_scalar_to_array, + coerce_to_dtypes, construct_1d_arraylike_from_scalar, find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, - maybe_box_datetimelike, maybe_cast_to_datetime, - maybe_casted_values, maybe_convert_platform, maybe_downcast_to_dtype, maybe_infer_to_datetimelike, maybe_upcast, + maybe_upcast_putmask, validate_numeric_casting, ) from pandas.core.dtypes.common import ( @@ -101,7 +98,6 @@ from pandas.core.dtypes.common import ( is_dict_like, 
is_dtype_equal, is_extension_array_dtype, - is_float, is_float_dtype, is_hashable, is_integer, @@ -112,32 +108,24 @@ from pandas.core.dtypes.common import ( is_object_dtype, is_scalar, is_sequence, + needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna -from pandas.core import algorithms, common as com, generic, nanops, ops +from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import ( - aggregate, - reconstruct_func, - relabel_result, - transform, -) -from pandas.core.arraylike import OpsMixin +from pandas.core.aggregation import reconstruct_func, relabel_result from pandas.core.arrays import Categorical, ExtensionArray +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase -from pandas.core.indexes.api import ( - DatetimeIndex, - Index, - PeriodIndex, - ensure_index, - ensure_index_from_sequences, -) +from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences +from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import MultiIndex, maybe_droplevels +from pandas.core.indexes.period import PeriodIndex from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.core.internals import BlockManager from pandas.core.internals.construction import ( @@ -153,16 +141,14 @@ from pandas.core.internals.construction import ( ) from pandas.core.reshape.melt import melt from pandas.core.series import Series -from pandas.core.sorting import get_group_index, lexsort_indexer, nargsort +from pandas.core.sorting import ensure_key_mapped -from pandas.io.common import get_handle +from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import BaseInfo, DataFrameInfo +from pandas.io.formats.info import DataFrameInfo import pandas.plotting if TYPE_CHECKING: - from typing import Literal - from pandas.core.groupby.generic import DataFrameGroupBy from pandas.io.formats.style import Styler @@ -170,27 +156,32 @@ if TYPE_CHECKING: # --------------------------------------------------------------------- # Docstring templates -_shared_doc_kwargs = { - "axes": "index, columns", - "klass": "DataFrame", - "axes_single_arg": "{0 or 'index', 1 or 'columns'}", - "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 +_shared_doc_kwargs = dict( + axes="index, columns", + klass="DataFrame", + axes_single_arg="{0 or 'index', 1 or 'columns'}", + axis="""axis : {0 or 'index', 1 or 'columns'}, default 0 If 0 or 'index': apply function to each column. If 1 or 'columns': apply function to each row.""", - "optional_by": """ + optional_by=""" by : str or list of str Name or list of names to sort by. - if `axis` is 0 or `'index'` then `by` may contain index levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column - levels and/or index labels.""", - "optional_labels": """labels : array-like, optional + levels and/or index labels. + + .. 
versionchanged:: 0.23.0 + + Allow specifying index or column level names.""", + versionadded_to_excel="", + optional_labels="""labels : array-like, optional New labels / index to conform the axis specified by 'axis' to.""", - "optional_axis": """axis : int or str, optional + optional_axis="""axis : int or str, optional Axis to target. Can be either the axis name ('index', 'columns') or number (0, 1).""", -} +) _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. If None, will attempt to use @@ -203,14 +194,12 @@ Merge DataFrame or named Series objects with a database-style join. The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. -When performing a cross merge, no column specifications to merge on are -allowed. Parameters ----------%s right : DataFrame or named Series Object to merge with. -how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' +how : {'left', 'right', 'outer', 'inner'}, default 'inner' Type of merge to be performed. * left: use only keys from left frame, similar to a SQL left outer join; @@ -221,11 +210,6 @@ how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' join; sort keys lexicographically. * inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys. - * cross: creates the cartesian product from both frames, preserves the order - of the left keys. - - .. versionadded:: 1.2.0 - on : label or list Column or index level names to join on. These must be found in both DataFrames. If `on` is None and not merging on indexes then this defaults @@ -346,44 +330,6 @@ Traceback (most recent call last): ... ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') - ->>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) ->>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) ->>> df1 - a b -0 foo 1 -1 bar 2 ->>> df2 - a c -0 foo 3 -1 baz 4 - ->>> df1.merge(df2, how='inner', on='a') - a b c -0 foo 1 3 - ->>> df1.merge(df2, how='left', on='a') - a b c -0 foo 1 3.0 -1 bar 2 NaN - ->>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) ->>> df2 = pd.DataFrame({'right': [7, 8]}) ->>> df1 - left -0 foo -1 bar ->>> df2 - right -0 7 -1 8 - ->>> df1.merge(df2, how='cross') - left right -0 foo 7 -1 foo 8 -2 bar 7 -3 bar 8 """ @@ -391,7 +337,7 @@ ValueError: columns overlap but no suffix specified: # DataFrame class -class DataFrame(NDFrame, OpsMixin): +class DataFrame(NDFrame): """ Two-dimensional, size-mutable, potentially heterogeneous tabular data. @@ -403,11 +349,15 @@ class DataFrame(NDFrame, OpsMixin): Parameters ---------- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame - Dict can contain Series, arrays, constants, dataclass or list-like objects. If - data is a dict, column order follows insertion-order. + Dict can contain Series, arrays, constants, or list-like objects. + + .. versionchanged:: 0.23.0 + If data is a dict, column order follows insertion-order for + Python 3.6 and later. .. versionchanged:: 0.25.0 - If data is a list of dicts, column order follows insertion-order. + If data is a list of dicts, column order follows insertion-order + for Python 3.6 and later. index : Index or array-like Index to use for resulting frame. 
Will default to RangeIndex if @@ -463,28 +413,17 @@ class DataFrame(NDFrame, OpsMixin): 0 1 2 3 1 4 5 6 2 7 8 9 - - Constructing DataFrame from dataclass: - - >>> from dataclasses import make_dataclass - >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) - >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) - x y - 0 0 0 - 1 0 3 - 2 2 3 """ _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set _typ = "dataframe" - _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) @property - def _constructor(self) -> Type[DataFrame]: + def _constructor(self) -> Type["DataFrame"]: return DataFrame _constructor_sliced: Type[Series] = Series - _hidden_attrs: FrozenSet[str] = NDFrame._hidden_attrs | frozenset([]) + _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) _accessors: Set[str] = {"sparse"} @property @@ -522,7 +461,7 @@ class DataFrame(NDFrame, OpsMixin): return mgr = self._init_mgr( - data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy + data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy ) elif isinstance(data, dict): @@ -614,8 +553,9 @@ class DataFrame(NDFrame, OpsMixin): if arr.ndim != 0: raise ValueError("DataFrame constructor not properly called!") - shape = (len(index), len(columns)) - values = np.full(shape, arr) + values = cast_scalar_to_array( + (len(index), len(columns)), data, dtype=dtype + ) mgr = init_ndarray( values, index, columns, dtype=values.dtype, copy=False @@ -649,7 +589,7 @@ class DataFrame(NDFrame, OpsMixin): See Also -------- - ndarray.shape : Tuple of array dimensions. + ndarray.shape Examples -------- @@ -698,6 +638,7 @@ class DataFrame(NDFrame, OpsMixin): if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: + # Note: consolidates inplace return not self._is_mixed_type @property @@ -705,10 +646,10 @@ class DataFrame(NDFrame, OpsMixin): """ Can we transpose this DataFrame without creating any new array objects. """ - if self._mgr.any_extension_types: + if self._data.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False - return len(self._mgr.blocks) == 1 + return len(self._data.blocks) == 1 # ---------------------------------------------------------------------- # Rendering Methods @@ -769,7 +710,7 @@ class DataFrame(NDFrame, OpsMixin): d.to_string(buf=buf) value = buf.getvalue() - repr_width = max(len(line) for line in value.split("\n")) + repr_width = max(len(l) for l in value.split("\n")) return repr_width < width @@ -851,8 +792,10 @@ class DataFrame(NDFrame, OpsMixin): max_cols=max_cols, show_dimensions=show_dimensions, decimal=".", + table_id=None, + render_links=False, ) - return fmt.DataFrameRenderer(formatter).to_html(notebook=True) + return formatter.to_html(notebook=True) else: return None @@ -935,17 +878,14 @@ class DataFrame(NDFrame, OpsMixin): max_cols=max_cols, show_dimensions=show_dimensions, decimal=decimal, - ) - return fmt.DataFrameRenderer(formatter).to_string( - buf=buf, - encoding=encoding, line_width=line_width, ) + return formatter.to_string(buf=buf, encoding=encoding) # ---------------------------------------------------------------------- @property - def style(self) -> Styler: + def style(self) -> "Styler": """ Returns a Styler object. @@ -1034,6 +974,9 @@ class DataFrame(NDFrame, OpsMixin): data : Series The data of the row as a Series. + it : generator + A generator that iterates over the rows of the frame. 
+ See Also -------- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. @@ -1071,7 +1014,7 @@ class DataFrame(NDFrame, OpsMixin): s = klass(v, index=columns, name=k) yield k, s - def itertuples(self, index: bool = True, name: Optional[str] = "Pandas"): + def itertuples(self, index=True, name="Pandas"): """ Iterate over DataFrame rows as namedtuples. @@ -1144,12 +1087,10 @@ class DataFrame(NDFrame, OpsMixin): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - if name is not None: - # https://github.com/python/mypy/issues/9046 - # error: namedtuple() expects a string literal as the first argument - itertuple = collections.namedtuple( # type: ignore[misc] - name, fields, rename=True - ) + # Python versions before 3.7 support at most 255 arguments to constructors + can_return_named_tuples = PY37 or len(self.columns) + index < 255 + if name is not None and can_return_named_tuples: + itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples @@ -1281,20 +1222,13 @@ class DataFrame(NDFrame, OpsMixin): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - try: - return self.T.dot(np.transpose(other)).T - except ValueError as err: - if "shape mismatch" not in str(err): - raise - # GH#21581 give exception message for original shapes - msg = f"shapes {np.shape(other)} and {self.shape} not aligned" - raise ValueError(msg) from err + return self.T.dot(np.transpose(other)).T # ---------------------------------------------------------------------- # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> DataFrame: + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": """ Construct DataFrame from dict of array-like or dicts. @@ -1315,6 +1249,8 @@ class DataFrame(NDFrame, OpsMixin): Column labels to use when ``orient='index'``. Raises a ValueError if used with ``orient='columns'``. + .. versionadded:: 0.23.0 + Returns ------- DataFrame @@ -1581,7 +1517,7 @@ class DataFrame(NDFrame, OpsMixin): ( "data", [ - list(map(maybe_box_datetimelike, t)) + list(map(com.maybe_box_datetimelike, t)) for t in self.itertuples(index=False, name=None) ], ), @@ -1589,7 +1525,7 @@ class DataFrame(NDFrame, OpsMixin): ) elif orient == "series": - return into_c((k, maybe_box_datetimelike(v)) for k, v in self.items()) + return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items()) elif orient == "records": columns = self.columns.tolist() @@ -1598,7 +1534,7 @@ class DataFrame(NDFrame, OpsMixin): for row in self.itertuples(index=False, name=None) ) return [ - into_c((k, maybe_box_datetimelike(v)) for k, v in row.items()) + into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items()) for row in rows ] @@ -1730,7 +1666,7 @@ class DataFrame(NDFrame, OpsMixin): columns=None, coerce_float=False, nrows=None, - ) -> DataFrame: + ) -> "DataFrame": """ Convert structured or record ndarray to DataFrame. 
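As a quick illustration of the from_dict/to_dict round trip referenced in this hunk (a sketch only; the orient values used are the documented ones):

import pandas as pd

data = {"row_1": [3, 2, 1, 0], "row_2": [10, 20, 30, 40]}

# 'columns' is only accepted together with orient='index'.
df = pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])

# Back to plain Python containers, one dict per row.
records = df.to_dict(orient="records")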
@@ -1838,13 +1774,13 @@ class DataFrame(NDFrame, OpsMixin): arrays = [data[k] for k in columns] else: arrays = [] - arr_columns_list = [] + arr_columns = [] for k, v in data.items(): if k in columns: - arr_columns_list.append(k) + arr_columns.append(k) arrays.append(v) - arrays, arr_columns = reorder_arrays(arrays, arr_columns_list, columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) @@ -2071,7 +2007,7 @@ class DataFrame(NDFrame, OpsMixin): index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, - ) -> DataFrame: + ) -> "DataFrame": """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2109,7 +2045,6 @@ class DataFrame(NDFrame, OpsMixin): ) return cls(mgr) - @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_stata( self, @@ -2122,8 +2057,7 @@ class DataFrame(NDFrame, OpsMixin): variable_labels: Optional[Dict[Label, str]] = None, version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, - compression: CompressionOptions = "infer", - storage_options: StorageOptions = None, + compression: Union[str, Mapping[str, str], None] = "infer", ) -> None: """ Export DataFrame object to Stata dta format. @@ -2162,7 +2096,7 @@ class DataFrame(NDFrame, OpsMixin): variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {{114, 117, 118, 119, None}}, default 114 + version : {114, 117, 118, 119, None}, default 114 Version to use in the output dta file. Set to None to let pandas decide between 118 or 119 formats depending on the number of columns in the frame. Version 114 can be read by Stata 10 and @@ -2174,11 +2108,7 @@ class DataFrame(NDFrame, OpsMixin): support Unicode characters, and version 119 supports more than 32,767 variables. - Version 119 should usually only be used when the number of - variables exceeds the capacity of dta format 118. Exporting - smaller datasets in format 119 may have unintended consequences, - and, as of November 2020, Stata SE cannot read version 119 files. - + .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Added support for formats 118 and 119. @@ -2188,23 +2118,22 @@ class DataFrame(NDFrame, OpsMixin): format. Only available if version is 117. Storing strings in the StrL format can produce smaller dta files if strings have more than 8 characters and values are repeated. + + .. versionadded:: 0.23.0 + compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies - compression mode. Compression mode must be one of {{'infer', 'gzip', - 'bz2', 'zip', 'xz', None}}. If compression mode is 'infer' and + compression mode. Compression mode must be one of {'infer', 'gzip', + 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `fname` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no - compression). If dict and compression mode is one of {{'zip', - 'gzip', 'bz2'}}, or inferred as one of the above, other entries + compression). If dict and compression mode is one of {'zip', + 'gzip', 'bz2'}, or inferred as one of the above, other entries passed as additional compression options. .. versionadded:: 1.1.0 - {storage_options} - - .. 
versionadded:: 1.2.0 - Raises ------ NotImplementedError @@ -2224,9 +2153,9 @@ class DataFrame(NDFrame, OpsMixin): Examples -------- - >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', + >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon', ... 'parrot'], - ... 'speed': [350, 18, 361, 15]}}) + ... 'speed': [350, 18, 361, 15]}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ if version not in (114, 117, 118, 119, None): @@ -2237,14 +2166,10 @@ class DataFrame(NDFrame, OpsMixin): from pandas.io.stata import StataWriter as statawriter elif version == 117: # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import ( # type: ignore[no-redef] - StataWriter117 as statawriter, - ) + from pandas.io.stata import StataWriter117 as statawriter # type: ignore else: # versions 118 and 119 # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import ( # type: ignore[no-redef] - StataWriterUTF8 as statawriter, - ) + from pandas.io.stata import StataWriterUTF8 as statawriter # type: ignore kwargs: Dict[str, Any] = {} if version is None or version >= 117: @@ -2255,7 +2180,7 @@ class DataFrame(NDFrame, OpsMixin): kwargs["version"] = version # mypy: Too many arguments for "StataWriter" - writer = statawriter( # type: ignore[call-arg] + writer = statawriter( # type: ignore path, self, convert_dates=convert_dates, @@ -2265,20 +2190,19 @@ class DataFrame(NDFrame, OpsMixin): write_index=write_index, variable_labels=variable_labels, compression=compression, - storage_options=storage_options, **kwargs, ) writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: + def to_feather(self, path, **kwargs) -> None: """ Write a DataFrame to the binary Feather format. Parameters ---------- - path : str or file-like object - If a string, it will be used as Root Directory path. + path : str + String file path. **kwargs : Additional keywords passed to :func:`pyarrow.feather.write_feather`. Starting with pyarrow 0.17, this includes the `compression`, @@ -2293,7 +2217,6 @@ class DataFrame(NDFrame, OpsMixin): @doc( Series.to_markdown, klass=_shared_doc_kwargs["klass"], - storage_options=_shared_docs["storage_options"], examples="""Examples -------- >>> df = pd.DataFrame( @@ -2319,10 +2242,9 @@ class DataFrame(NDFrame, OpsMixin): ) def to_markdown( self, - buf: Optional[Union[IO[str], str]] = None, - mode: str = "wt", + buf: Optional[IO[str]] = None, + mode: Optional[str] = None, index: bool = True, - storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: if "showindex" in kwargs: @@ -2340,24 +2262,21 @@ class DataFrame(NDFrame, OpsMixin): result = tabulate.tabulate(self, **kwargs) if buf is None: return result - - with get_handle(buf, mode, storage_options=storage_options) as handles: - assert not isinstance(handles.handle, (str, mmap.mmap)) - handles.handle.writelines(result) + buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + assert buf is not None # Help mypy. 
+ buf.writelines(result) return None - @doc(storage_options=generic._shared_docs["storage_options"]) @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, - path: Optional[FilePathOrBuffer] = None, + path: FilePathOrBuffer[AnyStr], engine: str = "auto", compression: Optional[str] = "snappy", index: Optional[bool] = None, partition_cols: Optional[List[str]] = None, - storage_options: StorageOptions = None, **kwargs, - ) -> Optional[bytes]: + ) -> None: """ Write a DataFrame to the binary parquet format. @@ -2368,24 +2287,23 @@ class DataFrame(NDFrame, OpsMixin): Parameters ---------- - path : str or file-like object, default None + path : str or file-like object If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, - we refer to objects with a write() method, such as a file handle + we refer to objects with a write() method, such as a file handler (e.g. via builtin open function) or io.BytesIO. The engine - fastparquet does not accept file-like objects. If path is None, - a bytes object is returned. + fastparquet does not accept file-like objects. - .. versionchanged:: 1.2.0 + .. versionchanged:: 1.0.0 Previously this was "fname" - engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' + engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. If 'auto', then the option ``io.parquet.engine`` is used. The default ``io.parquet.engine`` behavior is to try 'pyarrow', falling back to 'fastparquet' if 'pyarrow' is unavailable. - compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' + compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy' Name of the compression to use. Use ``None`` for no compression. index : bool, default None If ``True``, include the dataframe's index(es) in the file output. @@ -2405,18 +2323,10 @@ class DataFrame(NDFrame, OpsMixin): .. versionadded:: 0.24.0 - {storage_options} - - .. versionadded:: 1.2.0 - **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. - Returns - ------- - bytes if no path argument is provided else None - See Also -------- read_parquet : Read a parquet file. @@ -2432,7 +2342,7 @@ class DataFrame(NDFrame, OpsMixin): Examples -------- - >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) >>> df.to_parquet('df.parquet.gzip', ... compression='gzip') # doctest: +SKIP >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP @@ -2452,14 +2362,13 @@ class DataFrame(NDFrame, OpsMixin): """ from pandas.io.parquet import to_parquet - return to_parquet( + to_parquet( self, path, engine, compression=compression, index=index, partition_cols=partition_cols, - storage_options=storage_options, **kwargs, ) @@ -2520,6 +2429,9 @@ class DataFrame(NDFrame, OpsMixin): table_id : str, optional A css id is included in the opening `` tag if specified. + + .. versionadded:: 0.23.0 + render_links : bool, default False Convert URLs to HTML links. 
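A hedged sketch of the to_parquet path whose signature changes above; it needs an optional engine (pyarrow or fastparquet) installed, so treat it as illustrative rather than guaranteed to run:

import pandas as pd

df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
# Compression is inferred from the extension unless given explicitly.
df.to_parquet("df.parquet.gzip", compression="gzip")
print(pd.read_parquet("df.parquet.gzip"))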
@@ -2537,57 +2449,45 @@ class DataFrame(NDFrame, OpsMixin): columns=columns, col_space=col_space, na_rep=na_rep, - header=header, - index=index, formatters=formatters, float_format=float_format, - bold_rows=bold_rows, sparsify=sparsify, justify=justify, index_names=index_names, + header=header, + index=index, + bold_rows=bold_rows, escape=escape, - decimal=decimal, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, + decimal=decimal, + table_id=table_id, + render_links=render_links, ) # TODO: a generic formatter wld b in DataFrameFormatter - return fmt.DataFrameRenderer(formatter).to_html( + return formatter.to_html( buf=buf, classes=classes, notebook=notebook, border=border, encoding=encoding, - table_id=table_id, - render_links=render_links, ) # ---------------------------------------------------------------------- @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=dedent( - """\ - max_cols : int, optional + max_cols_sub=( + """max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used.""" + ``pandas.options.display.max_info_columns`` is used. + """ ), - show_counts_sub=dedent( - """\ - show_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the DataFrame is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - null_counts : bool, optional - .. deprecated:: 1.2.0 - Use show_counts instead.""" - ), - examples_sub=dedent( - """\ + examples_sub=( + """ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -2668,45 +2568,27 @@ class DataFrame(NDFrame, OpsMixin): 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) - memory usage: 165.9 MB""" + memory usage: 188.8 MB""" ), - see_also_sub=dedent( - """\ + see_also_sub=( + """ DataFrame.describe: Generate descriptive statistics of DataFrame columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), - version_added_sub="", ) - @doc(BaseInfo.render) + @doc(DataFrameInfo.info) def info( self, verbose: Optional[bool] = None, buf: Optional[IO[str]] = None, max_cols: Optional[int] = None, memory_usage: Optional[Union[bool, str]] = None, - show_counts: Optional[bool] = None, null_counts: Optional[bool] = None, ) -> None: - if null_counts is not None: - if show_counts is not None: - raise ValueError("null_counts used with show_counts. Use show_counts.") - warnings.warn( - "null_counts is deprecated. Use show_counts instead", - FutureWarning, - stacklevel=2, - ) - show_counts = null_counts - info = DataFrameInfo( - data=self, - memory_usage=memory_usage, - ) - info.render( - buf=buf, - max_cols=max_cols, - verbose=verbose, - show_counts=show_counts, - ) + return DataFrameInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).info() def memory_usage(self, index=True, deep=False) -> Series: """ @@ -2747,16 +2629,16 @@ class DataFrame(NDFrame, OpsMixin): Examples -------- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) + >>> data = dict([(t, np.ones(shape=5000).astype(t)) ... 
for t in dtypes]) >>> df = pd.DataFrame(data) >>> df.head() int64 float64 complex128 object bool - 0 1 1.0 1.0+0.0j 1 True - 1 1 1.0 1.0+0.0j 1 True - 2 1 1.0 1.0+0.0j 1 True - 3 1 1.0 1.0+0.0j 1 True - 4 1 1.0 1.0+0.0j 1 True + 0 1 1.0 1.000000+0.000000j 1 True + 1 1 1.0 1.000000+0.000000j 1 True + 2 1 1.0 1.000000+0.000000j 1 True + 3 1 1.0 1.000000+0.000000j 1 True + 4 1 1.0 1.000000+0.000000j 1 True >>> df.memory_usage() Index 128 @@ -2782,7 +2664,7 @@ class DataFrame(NDFrame, OpsMixin): int64 40000 float64 40000 complex128 80000 - object 180000 + object 160000 bool 5000 dtype: int64 @@ -2790,7 +2672,7 @@ class DataFrame(NDFrame, OpsMixin): many repeated values. >>> df['object'].astype('category').memory_usage(deep=True) - 5244 + 5216 """ result = self._constructor_sliced( [c.memory_usage(index=False, deep=deep) for col, c in self.items()], @@ -2802,7 +2684,7 @@ class DataFrame(NDFrame, OpsMixin): ).append(result) return result - def transpose(self, *args, copy: bool = False) -> DataFrame: + def transpose(self, *args, copy: bool = False) -> "DataFrame": """ Transpose index and columns. @@ -2881,7 +2763,7 @@ class DataFrame(NDFrame, OpsMixin): >>> df2_transposed 0 1 name Alice Bob - score 9.5 8.0 + score 9.5 8 employed False True kids 0 0 @@ -2899,7 +2781,7 @@ class DataFrame(NDFrame, OpsMixin): 1 object dtype: object """ - nv.validate_transpose(args, {}) + nv.validate_transpose(args, dict()) # construct the args dtypes = list(self.dtypes) @@ -2925,7 +2807,7 @@ class DataFrame(NDFrame, OpsMixin): return result.__finalize__(self, method="transpose") @property - def T(self) -> DataFrame: + def T(self) -> "DataFrame": return self.transpose() # ---------------------------------------------------------------------- @@ -2974,7 +2856,7 @@ class DataFrame(NDFrame, OpsMixin): Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) """ - return self._mgr.iget_values(i) + return self._data.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ @@ -2991,7 +2873,7 @@ class DataFrame(NDFrame, OpsMixin): if is_hashable(key): # shortcut if the key is in columns if self.columns.is_unique and key in self.columns: - if isinstance(self.columns, MultiIndex): + if self.columns.nlevels > 1: return self._getitem_multilevel(key) return self._get_item_cache(key) @@ -3041,8 +2923,7 @@ class DataFrame(NDFrame, OpsMixin): # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): - # GH#26490 using data[key] can cause RecursionError - data = data._get_item_cache(key) + data = data[key] return data @@ -3167,7 +3048,7 @@ class DataFrame(NDFrame, OpsMixin): # operates on labels and we need to operate positional for # backwards-compat, xref GH#31469 self._check_setitem_copy() - self.iloc[key] = value + self.iloc._setitem_with_indexer(key, value) def _setitem_array(self, key, value): # also raises Exception if object array with NA values @@ -3179,7 +3060,7 @@ class DataFrame(NDFrame, OpsMixin): key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] self._check_setitem_copy() - self.iloc[indexer] = value + self.iloc._setitem_with_indexer(indexer, value) else: if isinstance(value, DataFrame): if len(value.columns) != len(key): @@ -3187,12 +3068,12 @@ class DataFrame(NDFrame, OpsMixin): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] else: - self.loc._ensure_listlike_indexer(key, axis=1, value=value) + 
self.loc._ensure_listlike_indexer(key, axis=1) indexer = self.loc._get_listlike_indexer( key, axis=1, raise_missing=False )[1] self._check_setitem_copy() - self.iloc[:, indexer] = value + self.iloc._setitem_with_indexer((slice(None), indexer), value) def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. @@ -3326,12 +3207,11 @@ class DataFrame(NDFrame, OpsMixin): in the environment by prefixing them with an '@' character like ``@a + b``. - You can refer to column names that are not valid Python variable names - by surrounding them in backticks. Thus, column names containing spaces - or punctuations (besides underscores) or starting with digits must be - surrounded by backticks. (For example, a column named "Area (cm^2) would - be referenced as `Area (cm^2)`). Column names which are Python keywords - (like "list", "for", "import", etc) cannot be used. + You can refer to column names that contain spaces or operators by + surrounding them in backticks. This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. @@ -3351,9 +3231,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None - DataFrame resulting from the provided query expression or - None if ``inplace=True``. + DataFrame + DataFrame resulting from the provided query expression. See Also -------- @@ -3500,8 +3379,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - ndarray, scalar, pandas object, or None - The result of the evaluation or None if ``inplace=True``. + ndarray, scalar, or pandas object + The result of the evaluation. See Also -------- @@ -3595,7 +3474,7 @@ class DataFrame(NDFrame, OpsMixin): return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None) -> DataFrame: + def select_dtypes(self, include=None, exclude=None) -> "DataFrame": """ Return a subset of the DataFrame's columns based on the column dtypes. @@ -3708,13 +3587,7 @@ class DataFrame(NDFrame, OpsMixin): extracted_dtypes = [ unique_dtype for unique_dtype in unique_dtypes - # error: Argument 1 to "tuple" has incompatible type - # "FrozenSet[Union[ExtensionDtype, str, Any, Type[str], - # Type[float], Type[int], Type[complex], Type[bool]]]"; - # expected "Iterable[Union[type, Tuple[Any, ...]]]" - if issubclass( - unique_dtype.type, tuple(dtypes_set) # type: ignore[arg-type] - ) + if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore ] return extracted_dtypes @@ -3750,16 +3623,11 @@ class DataFrame(NDFrame, OpsMixin): value : int, Series, or array-like allow_duplicates : bool, optional """ - if allow_duplicates and not self.flags.allows_duplicate_labels: - raise ValueError( - "Cannot specify 'allow_duplicates=True' when " - "'self.flags.allows_duplicate_labels' is False." - ) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs) -> DataFrame: + def assign(self, **kwargs) -> "DataFrame": r""" Assign new columns to a DataFrame. @@ -3788,6 +3656,10 @@ class DataFrame(NDFrame, OpsMixin): Later items in '\*\*kwargs' may refer to newly created or modified columns in 'df'; items are computed and assigned into 'df' in order. + .. 
versionchanged:: 0.23.0 + + Keyword argument order is maintained. + Examples -------- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, @@ -3912,11 +3784,15 @@ class DataFrame(NDFrame, OpsMixin): else: # cast ignores pandas dtypes. so save the dtype first - infer_dtype, fill_value = infer_dtype_from_scalar(value, pandas_dtype=True) + infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - value = construct_1d_arraylike_from_scalar( - fill_value, len(self), infer_dtype - ) + # upcast + if is_extension_array_dtype(infer_dtype): + value = construct_1d_arraylike_from_scalar( + value, len(self.index), infer_dtype + ) + else: + value = cast_scalar_to_array(len(self.index), value) value = maybe_cast_to_datetime(value, infer_dtype) @@ -3945,15 +3821,10 @@ class DataFrame(NDFrame, OpsMixin): def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. + Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. - .. deprecated:: 1.2.0 - DataFrame.lookup is deprecated, - use DataFrame.melt and DataFrame.loc instead. - For an example see :meth:`~pandas.DataFrame.lookup` - in the user guide. - Parameters ---------- row_labels : sequence @@ -3966,14 +3837,6 @@ class DataFrame(NDFrame, OpsMixin): numpy.ndarray The found values. """ - msg = ( - "The 'lookup' method is deprecated and will be" - "removed in a future version." - "You can use DataFrame.melt and DataFrame.loc" - "as a substitute." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - n = len(row_labels) if n != len(col_labels): raise ValueError("Row labels must have same size as column labels") @@ -4062,7 +3925,7 @@ class DataFrame(NDFrame, OpsMixin): allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value) -> DataFrame: + def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": """ We are guaranteed non-Nones in the axes. """ @@ -4095,7 +3958,7 @@ class DataFrame(NDFrame, OpsMixin): limit=None, fill_axis=0, broadcast_axis=None, - ) -> DataFrame: + ) -> "DataFrame": return super().align( other, join=join, @@ -4164,7 +4027,7 @@ class DataFrame(NDFrame, OpsMixin): ("tolerance", None), ], ) - def reindex(self, *args, **kwargs) -> DataFrame: + def reindex(self, *args, **kwargs) -> "DataFrame": axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names @@ -4214,9 +4077,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None - DataFrame without the removed index or column labels or - None if ``inplace=True``. + DataFrame + DataFrame without the removed index or column labels. Raises ------ @@ -4327,7 +4189,7 @@ class DataFrame(NDFrame, OpsMixin): inplace: bool = False, level: Optional[Level] = None, errors: str = "ignore", - ) -> Optional[DataFrame]: + ) -> Optional["DataFrame"]: """ Alter axes labels. @@ -4340,7 +4202,7 @@ class DataFrame(NDFrame, OpsMixin): Parameters ---------- mapper : dict-like or function - Dict-like or function transformations to apply to + Dict-like or functions transformations to apply to that axis' values. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and ``columns``. @@ -4370,8 +4232,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None - DataFrame with the renamed axis labels or None if ``inplace=True``. + DataFrame + DataFrame with the renamed axis labels. 
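The rename docstring above distinguishes the mapper/axis style from the index/columns style; a small sketch using only calls shown in that docstring:

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

df.rename(columns={"A": "a", "B": "c"})   # dict-like mapper targeted at columns
df.rename(str.lower, axis="columns")      # axis-style call with a function mapper
df.rename(index=str, columns={"A": "a"})  # remap both axes in one call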
Raises ------ @@ -4421,7 +4283,7 @@ class DataFrame(NDFrame, OpsMixin): Traceback (most recent call last): KeyError: ['C'] not found in axis - Using axis-style parameters: + Using axis-style parameters >>> df.rename(str.lower, axis='columns') a b @@ -4455,7 +4317,7 @@ class DataFrame(NDFrame, OpsMixin): inplace=False, limit=None, downcast=None, - ) -> Optional[DataFrame]: + ) -> Optional["DataFrame"]: return super().fillna( value=value, method=method, @@ -4563,37 +4425,7 @@ class DataFrame(NDFrame, OpsMixin): return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift( - self, periods=1, freq=None, axis=0, fill_value=lib.no_default - ) -> DataFrame: - axis = self._get_axis_number(axis) - - ncols = len(self.columns) - if axis == 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: - # We will infer fill_value to match the closest column - - # Use a column that we know is valid for our column's dtype GH#38434 - label = self.columns[0] - - if periods > 0: - result = self.iloc[:, :-periods] - for col in range(min(ncols, abs(periods))): - # TODO(EA2D): doing this in a loop unnecessary with 2D EAs - # Define filler inside loop so we get a copy - filler = self.iloc[:, 0].shift(len(self)) - result.insert(0, label, filler, allow_duplicates=True) - else: - result = self.iloc[:, -periods:] - for col in range(min(ncols, abs(periods))): - # Define filler inside loop so we get a copy - filler = self.iloc[:, -1].shift(len(self)) - result.insert( - len(result.columns), label, filler, allow_duplicates=True - ) - - result.columns = self.columns.copy() - return result - + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4621,7 +4453,7 @@ class DataFrame(NDFrame, OpsMixin): append : bool, default False Whether to append columns to existing index. inplace : bool, default False - If True, modifies the DataFrame in place (do not create a new object). + Modify the DataFrame in place (do not create a new object). verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this @@ -4629,8 +4461,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None - Changed row labels or None if ``inplace=True``. + DataFrame + Changed row labels. See Also -------- @@ -4691,7 +4523,6 @@ class DataFrame(NDFrame, OpsMixin): 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") - self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -4729,7 +4560,7 @@ class DataFrame(NDFrame, OpsMixin): frame = self.copy() arrays = [] - names: List[Label] = [] + names = [] if append: names = list(self.index.names) if isinstance(self.index, MultiIndex): @@ -4787,30 +4618,6 @@ class DataFrame(NDFrame, OpsMixin): if not inplace: return frame - @overload - # https://github.com/python/mypy/issues/6580 - # Overloaded function signatures 1 and 2 overlap with incompatible return types - def reset_index( # type: ignore[misc] - self, - level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., - drop: bool = ..., - inplace: Literal[False] = ..., - col_level: Hashable = ..., - col_fill: Label = ..., - ) -> DataFrame: - ... 
- - @overload - def reset_index( - self, - level: Optional[Union[Hashable, Sequence[Hashable]]] = ..., - drop: bool = ..., - inplace: Literal[True] = ..., - col_level: Hashable = ..., - col_fill: Label = ..., - ) -> None: - ... - def reset_index( self, level: Optional[Union[Hashable, Sequence[Hashable]]] = None, @@ -4818,7 +4625,7 @@ class DataFrame(NDFrame, OpsMixin): inplace: bool = False, col_level: Hashable = 0, col_fill: Label = "", - ) -> Optional[DataFrame]: + ) -> Optional["DataFrame"]: """ Reset the index, or a level of it. @@ -4961,12 +4768,51 @@ class DataFrame(NDFrame, OpsMixin): monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, "inplace") - self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: new_obj = self else: new_obj = self.copy() + def _maybe_casted_values(index, labels=None): + values = index._values + if not isinstance(index, (PeriodIndex, DatetimeIndex)): + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + + # if we have the labels, extract the values with a mask + if labels is not None: + mask = labels == -1 + + # we can have situations where the whole mask is -1, + # meaning there is nothing found in labels, so make all nan's + if mask.size > 0 and mask.all(): + dtype = index.dtype + fill_value = na_value_for_dtype(dtype) + values = construct_1d_arraylike_from_scalar( + fill_value, len(mask), dtype + ) + else: + values = values.take(labels) + + # TODO(https://github.com/pandas-dev/pandas/issues/24206) + # Push this into maybe_upcast_putmask? + # We can't pass EAs there right now. Looks a bit + # complicated. + # So we unbox the ndarray_values, op, re-box. + values_type = type(values) + values_dtype = values.dtype + + if issubclass(values_type, DatetimeLikeArray): + values = values._data # TODO: can we de-kludge yet? + + if mask.any(): + values, _ = maybe_upcast_putmask(values, mask, np.nan) + + if issubclass(values_type, DatetimeLikeArray): + values = values_type(values, dtype=values_dtype) + + return values + new_index = ibase.default_index(len(new_obj)) if level is not None: if not isinstance(level, (tuple, list)): @@ -5009,7 +4855,7 @@ class DataFrame(NDFrame, OpsMixin): name_lst += [col_fill] * missing name = tuple(name_lst) # to ndarray and maybe infer different dtype - level_values = maybe_casted_values(lev, lab) + level_values = _maybe_casted_values(lev, lab) new_obj.insert(0, name, level_values) new_obj.index = new_index @@ -5022,20 +4868,20 @@ class DataFrame(NDFrame, OpsMixin): # Reindex-based selection methods @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isna(self) -> DataFrame: - result = self._constructor(self._mgr.isna(func=isna)) + def isna(self) -> "DataFrame": + result = self._constructor(self._data.isna(func=isna)) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isnull(self) -> DataFrame: + def isnull(self) -> "DataFrame": return self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notna(self) -> DataFrame: + def notna(self) -> "DataFrame": return ~self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notnull(self) -> DataFrame: + def notnull(self) -> "DataFrame": return ~self.isna() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -5076,8 +4922,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None - DataFrame with NA entries dropped from it or None if ``inplace=True``. 
+ DataFrame + DataFrame with NA entries dropped from it. See Also -------- @@ -5130,10 +4976,9 @@ class DataFrame(NDFrame, OpsMixin): Define in which columns to look for missing values. - >>> df.dropna(subset=['name', 'toy']) + >>> df.dropna(subset=['name', 'born']) name toy born 1 Batman Batmobile 1940-04-25 - 2 Catwoman Bullwhip NaT Keep the DataFrame with valid entries in the same variable. @@ -5186,7 +5031,7 @@ class DataFrame(NDFrame, OpsMixin): keep: Union[str, bool] = "first", inplace: bool = False, ignore_index: bool = False, - ) -> Optional[DataFrame]: + ) -> Optional["DataFrame"]: """ Return DataFrame with duplicate rows removed. @@ -5212,7 +5057,7 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None + DataFrame DataFrame with duplicates removed or None if ``inplace=True``. See Also @@ -5252,7 +5097,7 @@ class DataFrame(NDFrame, OpsMixin): 0 Yum Yum cup 4.0 2 Indomie cup 3.5 - To remove duplicates and keep last occurrences, use ``keep``. + To remove duplicates and keep last occurences, use ``keep``. >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') brand style rating @@ -5264,7 +5109,6 @@ class DataFrame(NDFrame, OpsMixin): return self.copy() inplace = validate_bool_kwarg(inplace, "inplace") - ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") duplicated = self.duplicated(subset, keep=keep) result = self[-duplicated] @@ -5281,7 +5125,7 @@ class DataFrame(NDFrame, OpsMixin): self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", - ) -> Series: + ) -> "Series": """ Return boolean Series denoting duplicate rows. @@ -5370,14 +5214,16 @@ class DataFrame(NDFrame, OpsMixin): 4 True dtype: bool """ - from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 + from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64 + + from pandas.core.sorting import get_group_index if self.empty: return self._constructor_sliced(dtype=bool) def f(vals): labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), SIZE_HINT_LIMIT) + vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) ) return labels.astype("i8", copy=False), len(shape) @@ -5405,16 +5251,14 @@ class DataFrame(NDFrame, OpsMixin): labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) - result = self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) - return result.__finalize__(self, method="duplicated") + return self._constructor_sliced(duplicated_int64(ids, keep), index=self.index) # ---------------------------------------------------------------------- # Sorting # TODO: Just move the sort_values doc here. 
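For the duplicated/drop_duplicates hunks above, a short sketch mirroring the docstring data (standard API only):

import pandas as pd

df = pd.DataFrame({
    "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie"],
    "style": ["cup", "cup", "cup", "pack"],
    "rating": [4.0, 4.0, 3.5, 15.0],
})

df.duplicated()                                    # boolean mask of repeated rows
df.drop_duplicates()                               # keep the first occurrence of each row
df.drop_duplicates(subset=["brand"], keep="last")  # dedupe on a subset of columns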
@Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - # error: Signature of "sort_values" incompatible with supertype "NDFrame" - def sort_values( # type: ignore[override] + def sort_values( # type: ignore[override] # NOQA # issue 27237 self, by, axis=0, @@ -5435,6 +5279,7 @@ class DataFrame(NDFrame, OpsMixin): f"Length of ascending ({len(ascending)}) != length of by ({len(by)})" ) if len(by) > 1: + from pandas.core.sorting import lexsort_indexer keys = [self._get_label_or_level_values(x, axis=axis) for x in by] @@ -5447,6 +5292,7 @@ class DataFrame(NDFrame, OpsMixin): ) indexer = ensure_platform_int(indexer) else: + from pandas.core.sorting import nargsort by = by[0] k = self._get_label_or_level_values(by, axis=axis) @@ -5533,8 +5379,8 @@ class DataFrame(NDFrame, OpsMixin): Returns ------- - DataFrame or None - The original DataFrame sorted by the labels or None if ``inplace=True``. + DataFrame + The original DataFrame sorted by the labels. See Also -------- @@ -5576,17 +5422,62 @@ class DataFrame(NDFrame, OpsMixin): C 3 d 4 """ - return super().sort_index( - axis, - level, - ascending, - inplace, - kind, - na_position, - sort_remaining, - ignore_index, - key, - ) + # TODO: this can be combined with Series.sort_index impl as + # almost identical + + inplace = validate_bool_kwarg(inplace, "inplace") + + axis = self._get_axis_number(axis) + labels = self._get_axis(axis) + labels = ensure_key_mapped(labels, key, levels=level) + + # make sure that the axis is lexsorted to start + # if not we need to reconstruct to get the correct indexer + labels = labels._sort_levels_monotonic() + if level is not None: + new_axis, indexer = labels.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) + + elif isinstance(labels, MultiIndex): + from pandas.core.sorting import lexsort_indexer + + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) + else: + from pandas.core.sorting import nargsort + + # Check monotonic-ness before sort an index + # GH11080 + if (ascending and labels.is_monotonic_increasing) or ( + not ascending and labels.is_monotonic_decreasing + ): + if inplace: + return + else: + return self.copy() + + indexer = nargsort( + labels, kind=kind, ascending=ascending, na_position=na_position + ) + + baxis = self._get_block_manager_axis(axis) + new_data = self._mgr.take(indexer, axis=baxis, verify=False) + + # reconstruct axis if needed + new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + + result = self._constructor(new_data) + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_index") def value_counts( self, @@ -5641,8 +5532,8 @@ class DataFrame(NDFrame, OpsMixin): >>> df.value_counts() num_legs num_wings 4 0 2 - 2 2 1 6 0 1 + 2 2 1 dtype: int64 >>> df.value_counts(sort=False) @@ -5662,8 +5553,8 @@ class DataFrame(NDFrame, OpsMixin): >>> df.value_counts(normalize=True) num_legs num_wings 4 0 0.50 - 2 2 0.25 6 0 0.25 + 2 2 0.25 dtype: float64 """ if subset is None: @@ -5684,7 +5575,7 @@ class DataFrame(NDFrame, OpsMixin): return counts - def nlargest(self, n, columns, keep="first") -> DataFrame: + def nlargest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in descending order. 
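To make the value_counts/nlargest behaviour discussed here concrete, a sketch (DataFrame.value_counts assumes pandas >= 1.1):

import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
                  index=["falcon", "dog", "cat", "ant"])

df.value_counts()                 # counts per unique (num_legs, num_wings) pair
df.value_counts(normalize=True)   # relative frequencies instead of raw counts
df.nlargest(2, "num_legs")        # first n rows ordered by a column, descending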
@@ -5793,7 +5684,7 @@ class DataFrame(NDFrame, OpsMixin): """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first") -> DataFrame: + def nsmallest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in ascending order. @@ -5863,7 +5754,7 @@ class DataFrame(NDFrame, OpsMixin): population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI - Iceland 337000 17036 IS + Iceland 337000 17036 IS When using ``keep='last'``, ties are resolved in reverse order: @@ -5895,7 +5786,7 @@ class DataFrame(NDFrame, OpsMixin): self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0) -> DataFrame: + def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": """ Swap levels i and j in a MultiIndex on a particular axis. @@ -5926,7 +5817,7 @@ class DataFrame(NDFrame, OpsMixin): result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0) -> DataFrame: + def reorder_levels(self, order, axis=0) -> "DataFrame": """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -5957,92 +5848,9 @@ class DataFrame(NDFrame, OpsMixin): return result # ---------------------------------------------------------------------- - # Arithmetic Methods + # Arithmetic / combination related - def _cmp_method(self, other, op): - axis = 1 # only relevant for Series other case - - self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) - - # See GH#4537 for discussion of scalar op behavior - new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) - - def _arith_method(self, other, op): - if ops.should_reindex_frame_op(self, other, op, 1, 1, None, None): - return ops.frame_arith_method_with_reindex(self, other, op) - - axis = 1 # only relevant for Series other case - - self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) - - new_data = self._dispatch_frame_op(other, op, axis=axis) - return self._construct_result(new_data) - - _logical_method = _arith_method - - def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): - """ - Evaluate the frame operation func(left, right) by evaluating - column-by-column, dispatching to the Series implementation. - - Parameters - ---------- - right : scalar, Series, or DataFrame - func : arithmetic or comparison operator - axis : {None, 0, 1} - - Returns - ------- - DataFrame - """ - # Get the appropriate array-op to apply to each column/block's values. - array_op = ops.get_array_op(func) - - right = lib.item_from_zerodim(right) - if not is_list_like(right): - # i.e. 
scalar, faster than checking np.ndim(right) == 0 - bm = self._mgr.apply(array_op, right=right) - return type(self)(bm) - - elif isinstance(right, DataFrame): - assert self.index.equals(right.index) - assert self.columns.equals(right.columns) - # TODO: The previous assertion `assert right._indexed_same(self)` - # fails in cases with empty columns reached via - # _frame_arith_method_with_reindex - - bm = self._mgr.operate_blockwise(right._mgr, array_op) - return type(self)(bm) - - elif isinstance(right, Series) and axis == 1: - # axis=1 means we want to operate row-by-row - assert right.index.equals(self.columns) - - right = right._values - # maybe_align_as_frame ensures we do not have an ndarray here - assert not isinstance(right, np.ndarray) - - arrays = [ - array_op(_left, _right) - for _left, _right in zip(self._iter_column_arrays(), right) - ] - - elif isinstance(right, Series): - assert right.index.equals(self.index) # Handle other cases later - right = right._values - - arrays = [array_op(left, right) for left in self._iter_column_arrays()] - - else: - # Remaining cases have less-obvious dispatch rules - raise NotImplementedError(right) - - return type(self)._from_arrays( - arrays, self.columns, self.index, verify_integrity=False - ) - - def _combine_frame(self, other: DataFrame, func, fill_value=None): + def _combine_frame(self, other: "DataFrame", func, fill_value=None): # at this point we have `self._indexed_same(other)` if fill_value is None: @@ -6059,10 +5867,10 @@ class DataFrame(NDFrame, OpsMixin): left, right = ops.fill_binop(left, right, fill_value) return func(left, right) - new_data = self._dispatch_frame_op(other, _arith_op) + new_data = ops.dispatch_to_series(self, other, _arith_op) return new_data - def _construct_result(self, result) -> DataFrame: + def _construct_result(self, result) -> "DataFrame": """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -6081,23 +5889,7 @@ class DataFrame(NDFrame, OpsMixin): out.index = self.index return out - def __divmod__(self, other) -> Tuple[DataFrame, DataFrame]: - # Naive implementation, room for optimization - div = self // other - mod = self - div * other - return div, mod - - def __rdivmod__(self, other) -> Tuple[DataFrame, DataFrame]: - # Naive implementation, room for optimization - div = other // self - mod = other - div * self - return div, mod - - # ---------------------------------------------------------------------- - # Combination-Related - - @doc( - _shared_docs["compare"], + @Appender( """ Returns ------- @@ -6107,31 +5899,22 @@ DataFrame The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. -Raises ------- -ValueError - When the two DataFrames don't have identical labels or shape. - See Also -------- Series.compare : Compare with another Series and show differences. -DataFrame.equals : Test whether two objects contain the same elements. Notes ----- Matching NaNs will not appear as a difference. -Can only compare identically-labeled -(i.e. same shape, identical row and column labels) DataFrames - Examples -------- >>> df = pd.DataFrame( -... {{ +... { ... "col1": ["a", "a", "b", "b", "a"], ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] -... }}, +... }, ... columns=["col1", "col2", "col3"], ... 
) >>> df @@ -6199,16 +5982,16 @@ Keep all original rows and columns and also all original values 2 b b 3.0 3.0 3.0 4.0 3 b b NaN NaN 4.0 4.0 4 a a 5.0 5.0 5.0 5.0 -""", - klass=_shared_doc_kwargs["klass"], +""" ) + @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, - other: DataFrame, + other: "DataFrame", align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - ) -> DataFrame: + ) -> "DataFrame": return super().compare( other=other, align_axis=align_axis, @@ -6217,8 +6000,8 @@ Keep all original rows and columns and also all original values ) def combine( - self, other: DataFrame, func, fill_value=None, overwrite=True - ) -> DataFrame: + self, other: "DataFrame", func, fill_value=None, overwrite=True + ) -> "DataFrame": """ Perform column-wise combine with another DataFrame. @@ -6378,14 +6161,14 @@ Keep all original rows and columns and also all original values otherSeries = otherSeries.astype(new_dtype) arr = func(series, otherSeries) - arr = maybe_downcast_to_dtype(arr, new_dtype) + arr = maybe_downcast_to_dtype(arr, this_dtype) result[col] = arr # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other: DataFrame) -> DataFrame: + def combine_first(self, other: "DataFrame") -> "DataFrame": """ Update null elements with value in the same location in `other`. @@ -6429,11 +6212,29 @@ Keep all original rows and columns and also all original values """ import pandas.core.computation.expressions as expressions - def combiner(x, y): - mask = extract_array(isna(x)) + def extract_values(arr): + # Does two things: + # 1. maybe gets the values from the Series / Index + # 2. convert datelike to i8 + # TODO: extract_array? + if isinstance(arr, (Index, Series)): + arr = arr._values - x_values = extract_array(x, extract_numpy=True) - y_values = extract_array(y, extract_numpy=True) + if needs_i8_conversion(arr.dtype): + if is_extension_array_dtype(arr.dtype): + arr = arr.asi8 + else: + arr = arr.view("i8") + return arr + + def combiner(x, y): + mask = isna(x) + # TODO: extract_array? + if isinstance(mask, (Index, Series)): + mask = mask._values + + x_values = extract_values(x) + y_values = extract_values(y) # If the column y in other DataFrame is not in first DataFrame, # just return y_values. @@ -6496,7 +6297,7 @@ Keep all original rows and columns and also all original values See Also -------- dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-column(s) operations. + DataFrame.merge : For column(s)-on-columns(s) operations. Examples -------- @@ -6524,7 +6325,7 @@ Keep all original rows and columns and also all original values 1 b e 2 c f - For Series, its name attribute must be set. + For Series, it's name attribute must be set. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], ... 'B': ['x', 'y', 'z']}) @@ -6692,7 +6493,7 @@ NaN 12.3 33.0 squeeze: bool = no_default, observed: bool = False, dropna: bool = True, - ) -> DataFrameGroupBy: + ) -> "DataFrameGroupBy": from pandas.core.groupby.generic import DataFrameGroupBy if squeeze is not no_default: @@ -6755,6 +6556,9 @@ NaN 12.3 33.0 specified, all remaining columns will be used and the result will have hierarchically indexed columns. + .. versionchanged:: 0.23.0 + Also accept list of column names. + Returns ------- DataFrame @@ -6772,8 +6576,6 @@ NaN 12.3 33.0 duplicate values for one index/column pair. DataFrame.unstack : Pivot based on the index values instead of a column. 
- wide_to_long : Wide panel to long format. Less flexible but more - user-friendly than melt. Notes ----- @@ -6870,7 +6672,7 @@ NaN 12.3 33.0 @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None) -> DataFrame: + def pivot(self, index=None, columns=None, values=None) -> "DataFrame": from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -6928,10 +6730,6 @@ NaN 12.3 33.0 -------- DataFrame.pivot : Pivot without aggregation that can handle non-numeric data. - DataFrame.melt: Unpivot a DataFrame from wide to long format, - optionally leaving identifiers set. - wide_to_long : Wide panel to long format. Less flexible but more - user-friendly than melt. Examples -------- @@ -7022,7 +6820,7 @@ NaN 12.3 33.0 dropna=True, margins_name="All", observed=False, - ) -> DataFrame: + ) -> "DataFrame": from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -7202,15 +7000,13 @@ NaN 12.3 33.0 from pandas.core.reshape.reshape import stack, stack_multiple if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna) + return stack_multiple(self, level, dropna=dropna) else: - result = stack(self, level, dropna=dropna) - - return result.__finalize__(self, method="stack") + return stack(self, level, dropna=dropna) def explode( self, column: Union[str, Tuple], ignore_index: bool = False - ) -> DataFrame: + ) -> "DataFrame": """ Transform each element of a list-like to a row, replicating index values. @@ -7245,11 +7041,10 @@ NaN 12.3 33.0 Notes ----- - This routine will explode list-likes including lists, tuples, sets, + This routine will explode list-likes including lists, tuples, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged, and empty list-likes will - result in a np.nan for that row. In addition, the ordering of rows in the - output will be non-deterministic when exploding sets. + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. Examples -------- @@ -7277,6 +7072,8 @@ NaN 12.3 33.0 raise ValueError("columns must be unique") df = self.reset_index(drop=True) + # TODO: use overload to refine return type of reset_index + assert df is not None # needed for mypy result = df[column].explode() result = df.drop([column], axis=1).join(result) if ignore_index: @@ -7346,11 +7143,16 @@ NaN 12.3 33.0 """ from pandas.core.reshape.reshape import unstack - result = unstack(self, level, fill_value) + return unstack(self, level, fill_value) - return result.__finalize__(self, method="unstack") - - @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) + @Appender( + _shared_docs["melt"] + % dict( + caller="df.melt(", + versionadded="\n .. 
versionadded:: 0.20.0\n", + other="melt", + ) + ) def melt( self, id_vars=None, @@ -7359,7 +7161,7 @@ NaN 12.3 33.0 value_name="value", col_level=None, ignore_index=True, - ) -> DataFrame: + ) -> "DataFrame": return melt( self, @@ -7408,13 +7210,13 @@ NaN 12.3 33.0 Difference with previous column >>> df.diff(axis=1) - a b c - 0 NaN 0 0 - 1 NaN -1 3 - 2 NaN -1 7 - 3 NaN -1 13 - 4 NaN 0 20 - 5 NaN 2 28 + a b c + 0 NaN 0.0 0.0 + 1 NaN -1.0 3.0 + 2 NaN -1.0 7.0 + 3 NaN -1.0 13.0 + 4 NaN 0.0 20.0 + 5 NaN 2.0 28.0 Difference with 3rd previous row @@ -7447,26 +7249,23 @@ NaN 12.3 33.0 1 255.0""" ), ) - def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: - if not isinstance(periods, int): - if not (is_float(periods) and periods.is_integer()): - raise ValueError("periods must be an integer") - periods = int(periods) + def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": bm_axis = self._get_block_manager_axis(axis) + self._consolidate_inplace() if bm_axis == 0 and periods != 0: - return self - self.shift(periods, axis=axis) + return self.T.diff(periods, axis=0).T new_data = self._mgr.diff(n=periods, axis=bm_axis) - return self._constructor(new_data).__finalize__(self, "diff") + return self._constructor(new_data) # ---------------------------------------------------------------------- # Function application def _gotitem( self, - key: Union[Label, List[Label]], + key: Union[str, List[str]], ndim: int, subset: Optional[FrameOrSeriesUnion] = None, ) -> FrameOrSeriesUnion: @@ -7534,18 +7333,9 @@ NaN 12.3 33.0 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) A B - sum 12.0 NaN - min 1.0 2.0 max NaN 8.0 - - Aggregate different functions over the columns and rename the index of the resulting - DataFrame. - - >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) - A B C - x 7.0 NaN NaN - y NaN 2.0 NaN - z NaN NaN 6.0 + min 1.0 2.0 + sum 12.0 NaN Aggregate over the columns. @@ -7564,6 +7354,7 @@ NaN 12.3 33.0 axis=_shared_doc_kwargs["axis"], see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, + versionadded="\n.. versionadded:: 0.20.0\n", ) def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) @@ -7585,12 +7376,6 @@ NaN 12.3 33.0 if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged - - # For the return values of reconstruct_func, if relabeling is - # False, columns and order will be None. 
- assert columns is not None - assert order is not None - result_in_dict = relabel_result(result, func, columns, order) result = DataFrame(result_in_dict, index=columns) @@ -7600,24 +7385,23 @@ NaN 12.3 33.0 if axis == 1: # NDFrame.aggregate returns a tuple, and we need to transpose # only result - result, how = aggregate(self.T, arg, *args, **kwargs) + result, how = self.T._aggregate(arg, *args, **kwargs) result = result.T if result is not None else result return result, how - return aggregate(self, arg, *args, **kwargs) + return super()._aggregate(arg, *args, **kwargs) agg = aggregate @doc( - _shared_docs["transform"], + NDFrame.transform, klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform( - self, func: AggFuncType, axis: Axis = 0, *args, **kwargs - ) -> DataFrame: - result = transform(self, func, axis, *args, **kwargs) - assert isinstance(result, DataFrame) - return result + def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": + axis = self._get_axis_number(axis) + if axis == 1: + return self.T.transform(func, *args, **kwargs).T + return super().transform(func, *args, **kwargs) def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ @@ -7663,6 +7447,9 @@ NaN 12.3 33.0 applied function: list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns. + + .. versionadded:: 0.23.0 + args : tuple Positional arguments to pass to `func` in addition to the array/series. @@ -7764,7 +7551,7 @@ NaN 12.3 33.0 ) return op.get_result() - def applymap(self, func, na_action: Optional[str] = None) -> DataFrame: + def applymap(self, func) -> "DataFrame": """ Apply a function to a Dataframe elementwise. @@ -7775,10 +7562,6 @@ NaN 12.3 33.0 ---------- func : callable Python function, returns a single value from a single value. - na_action : {None, 'ignore'}, default None - If ‘ignore’, propagate NaN values, without passing them to func. - - .. versionadded:: 1.2 Returns ------- @@ -7802,15 +7585,6 @@ NaN 12.3 33.0 0 3 4 1 5 5 - Like Series.map, NA values can be ignored: - - >>> df_copy = df.copy() - >>> df_copy.iloc[0, 0] = pd.NA - >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') - 0 1 - 0 4 - 1 5 5 - Note that a vectorized version of `func` often exists, which will be much faster. You could square each number elementwise. @@ -7826,26 +7600,20 @@ NaN 12.3 33.0 0 1.000000 4.494400 1 11.262736 20.857489 """ - if na_action not in {"ignore", None}: - raise ValueError( - f"na_action must be 'ignore' or None. Got {repr(na_action)}" - ) - ignore_na = na_action == "ignore" - # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): if x.empty: - return lib.map_infer(x, func, ignore_na=ignore_na) - return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) + return lib.map_infer(x, func) + return lib.map_infer(x.astype(object)._values, func) - return self.apply(infer).__finalize__(self, "applymap") + return self.apply(infer) # ---------------------------------------------------------------------- # Merging / joining methods def append( self, other, ignore_index=False, verify_integrity=False, sort=False - ) -> DataFrame: + ) -> "DataFrame": """ Append rows of `other` to the end of caller, returning a new object. @@ -7862,6 +7630,7 @@ NaN 12.3 33.0 sort : bool, default False Sort columns if the columns of `self` and `other` are not aligned. + .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed to not sort by default. 
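A brief sketch of the append behaviour documented above; note that later pandas releases deprecate DataFrame.append in favour of pd.concat, so the concat form is shown alongside it:

import pandas as pd

df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
df2 = pd.DataFrame({"A": [5, 6], "B": [7, 8]})

df1.append(df2, ignore_index=True)        # rows of df2 stacked under df1
pd.concat([df1, df2], ignore_index=True)  # equivalent result via concat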
@@ -7975,18 +7744,16 @@ NaN 12.3 33.0 to_concat = [self, *other] else: to_concat = [self, other] - return ( - concat( - to_concat, - ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, - ) - ).__finalize__(self, method="append") + return concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False - ) -> DataFrame: + ) -> "DataFrame": """ Join columns of another DataFrame. @@ -8032,7 +7799,7 @@ NaN 12.3 33.0 See Also -------- - DataFrame.merge : For column(s)-on-column(s) operations. + DataFrame.merge : For column(s)-on-columns(s) operations. Notes ----- @@ -8120,15 +7887,6 @@ NaN 12.3 33.0 other = DataFrame({other.name: other}) if isinstance(other, DataFrame): - if how == "cross": - return merge( - self, - other, - how=how, - on=on, - suffixes=(lsuffix, rsuffix), - sort=sort, - ) return merge( self, other, @@ -8186,7 +7944,7 @@ NaN 12.3 33.0 copy=True, indicator=False, validate=None, - ) -> DataFrame: + ) -> "DataFrame": from pandas.core.reshape.merge import merge return merge( @@ -8205,7 +7963,7 @@ NaN 12.3 33.0 validate=validate, ) - def round(self, decimals=0, *args, **kwargs) -> DataFrame: + def round(self, decimals=0, *args, **kwargs) -> "DataFrame": """ Round a DataFrame to a variable number of decimal places. @@ -8319,7 +8077,7 @@ NaN 12.3 33.0 # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1) -> DataFrame: + def corr(self, method="pearson", min_periods=1) -> "DataFrame": """ Compute pairwise correlation of columns, excluding NA/null values. @@ -8410,7 +8168,7 @@ NaN 12.3 33.0 def cov( self, min_periods: Optional[int] = None, ddof: Optional[int] = 1 - ) -> DataFrame: + ) -> "DataFrame": """ Compute pairwise covariance of columns, excluding NA/null values. @@ -8650,7 +8408,6 @@ NaN 12.3 33.0 See Also -------- Series.count: Number of non-NA elements in a Series. - DataFrame.value_counts: Count unique combinations of columns. DataFrame.shape: Number of DataFrame rows and columns (including NA elements). 
DataFrame.isna: Boolean same-sized DataFrame showing places of NA @@ -8775,7 +8532,6 @@ NaN 12.3 33.0 self, op, name: str, - *, axis=0, skipna=True, numeric_only=None, @@ -8784,12 +8540,12 @@ NaN 12.3 33.0 ): assert filter_type is None or filter_type == "bool", filter_type - out_dtype = "bool" if filter_type == "bool" else None - - own_dtypes = [arr.dtype for arr in self._iter_column_arrays()] dtype_is_dt = np.array( - [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], + [ + is_datetime64_any_dtype(values.dtype) + for values in self._iter_column_arrays() + ], dtype=bool, ) if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): @@ -8798,90 +8554,144 @@ NaN 12.3 33.0 "will include datetime64 and datetime64tz columns in a " "future version.", FutureWarning, - stacklevel=5, + stacklevel=3, ) cols = self.columns[~dtype_is_dt] self = self[cols] - # TODO: Make other agg func handle axis=None properly GH#21597 - axis = self._get_axis_number(axis) - labels = self._get_agg_axis(axis) - assert axis in [0, 1] + if axis is None and filter_type == "bool": + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor - def func(values: np.ndarray): - # We only use this in the case that operates on self.values - return op(values, axis=axis, skipna=skipna, **kwds) - - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) + def func(values): + if is_extension_array_dtype(values.dtype): + return extract_array(values)._reduce(name, skipna=skipna, **kwds) else: - return op(values, axis=1, skipna=skipna, **kwds) + return op(values, axis=axis, skipna=skipna, **kwds) - def _get_data() -> DataFrame: + def _get_data(axis_matters): if filter_type is None: data = self._get_numeric_data() - else: - # GH#25101, GH#24434 - assert filter_type == "bool" - data = self._get_bool_data() + elif filter_type == "bool": + if axis_matters: + # GH#25101, GH#24434 + data = self._get_bool_data() if axis == 0 else self + else: + data = self._get_bool_data() + else: # pragma: no cover + msg = ( + f"Generating numeric_only data with filter_type {filter_type} " + "not supported." + ) + raise NotImplementedError(msg) return data - if numeric_only is not None or axis == 0: - # For numeric_only non-None and axis non-None, we know - # which blocks to use and no try/except is needed. 
- # For numeric_only=None only the case with axis==0 and no object - # dtypes are unambiguous can be handled with BlockManager.reduce - # Case with EAs see GH#35881 + if numeric_only is not None and axis in [0, 1]: df = self if numeric_only is True: - df = _get_data() + df = _get_data(axis_matters=True) if axis == 1: df = df.T axis = 0 - ignore_failures = numeric_only is None + out_dtype = "bool" if filter_type == "bool" else None + + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) - out = df._constructor(res).iloc[0] - if out_dtype is not None: - out = out.astype(out_dtype) - if axis == 0 and len(self) == 0 and name in ["sum", "prod"]: - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) + # simple case where we can use BlockManager._reduce + res = df._mgr.reduce(blk_func) + assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + if axis == 0 and is_object_dtype(out.dtype): + out[:] = coerce_to_dtypes(out.values, df.dtypes) return out - assert numeric_only is None + if not self._is_homogeneous_type or self._mgr.any_extension_types: + # try to avoid self.values call - data = self - values = data.values + if filter_type is None and axis == 0 and len(self) > 0: + # operate column-wise - try: - result = func(values) + # numeric_only must be None here, as other cases caught above + # require len(self) > 0 bc frame_apply messes up empty prod/sum - except TypeError: - # e.g. in nanops trying to convert strs to float + # this can end up with a non-reduction + # but not always. if the types are mixed + # with datelike then need to make sure a series - data = _get_data() - labels = data._get_agg_axis(axis) + # we only end up here if we have not specified + # numeric_only and yet we have tried a + # column-by-column reduction, where we have mixed type. + # So let's just do what we can + from pandas.core.apply import frame_apply + opa = frame_apply( + self, func=func, result_type="expand", ignore_failures=True + ) + result = opa.get_result() + if result.ndim == self.ndim: + result = result.iloc[0].rename(None) + return result + + if numeric_only is None: + data = self values = data.values - with np.errstate(all="ignore"): + + try: result = func(values) - if filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) - elif filter_type is None and is_object_dtype(result.dtype): + except TypeError: + # e.g. in nanops trying to convert strs to float + + # TODO: why doesnt axis matter here? + data = _get_data(axis_matters=False) + labels = data._get_agg_axis(axis) + + values = data.values + with np.errstate(all="ignore"): + result = func(values) + + else: + if numeric_only: + data = _get_data(axis_matters=True) + labels = data._get_agg_axis(axis) + + values = data.values + else: + data = self + values = data.values + result = func(values) + + if filter_type == "bool" and is_object_dtype(values) and axis is None: + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: can we de-duplicate parts of this with the next blocK? 
+ result = np.bool_(result) + elif hasattr(result, "dtype") and is_object_dtype(result.dtype): try: - result = result.astype(np.float64) + if filter_type is None: + result = result.astype(np.float64) + elif filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) except (ValueError, TypeError): # try to coerce to the original dtypes item by item if we can - pass + if axis == 0: + result = coerce_to_dtypes(result, data.dtypes) - result = self._constructor_sliced(result, index=labels) + if constructor is not None: + result = self._constructor_sliced(result, index=labels) return result def nunique(self, axis=0, dropna=True) -> Series: @@ -8986,11 +8796,7 @@ NaN 12.3 33.0 dtype: object """ axis = self._get_axis_number(axis) - - res = self._reduce( - nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False - ) - indices = res._values + indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame @@ -9063,11 +8869,7 @@ NaN 12.3 33.0 dtype: object """ axis = self._get_axis_number(axis) - - res = self._reduce( - nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False - ) - indices = res._values + indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) # indices will always be np.ndarray since axis is not None and # values is a 2d array for DataFrame @@ -9089,7 +8891,7 @@ NaN 12.3 33.0 else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True) -> DataFrame: + def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": """ Get the mode(s) of each element along the selected axis. @@ -9137,8 +8939,8 @@ NaN 12.3 33.0 ostrich bird 2 NaN By default, missing values are not considered, and the mode of wings - are both 0 and 2. Because the resulting DataFrame has two rows, - the second row of ``species`` and ``legs`` contains ``NaN``. + are both 0 and 2. The second row of species and legs contains ``NaN``, + because they have only one mode, but the DataFrame has two rows. >>> df.mode() species legs wings @@ -9274,7 +9076,7 @@ NaN 12.3 33.0 def to_timestamp( self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True - ) -> DataFrame: + ) -> "DataFrame": """ Cast to DatetimeIndex of timestamps, at *beginning* of period. @@ -9298,15 +9100,12 @@ NaN 12.3 33.0 axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) - if not isinstance(old_ax, PeriodIndex): - raise TypeError(f"unsupported Type {type(old_ax).__name__}") - new_ax = old_ax.to_timestamp(freq=freq, how=how) setattr(new_obj, axis_name, new_ax) return new_obj - def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame: + def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame": """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -9330,15 +9129,12 @@ NaN 12.3 33.0 axis_name = self._get_axis_name(axis) old_ax = getattr(self, axis_name) - if not isinstance(old_ax, DatetimeIndex): - raise TypeError(f"unsupported Type {type(old_ax).__name__}") - new_ax = old_ax.to_period(freq=freq) setattr(new_obj, axis_name, new_ax) return new_obj - def isin(self, values) -> DataFrame: + def isin(self, values) -> "DataFrame": """ Whether each element in the DataFrame is contained in values. 
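To make the `isin` hunk above concrete, a short doctest-style sketch mirroring stock pandas usage; the frame and the membership values are illustrative:

>>> import pandas as pd
>>> df = pd.DataFrame({"num_legs": [2, 4], "num_wings": [2, 0]},
...                   index=["falcon", "dog"])
>>> # element-wise membership test against an iterable of values
>>> df.isin([0, 2])
        num_legs  num_wings
falcon      True       True
dog        False       True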
@@ -9445,10 +9241,10 @@ NaN 12.3 33.0 _info_axis_number = 1 _info_axis_name = "columns" - index: Index = properties.AxisProperty( + index: "Index" = properties.AxisProperty( axis=1, doc="The index (row labels) of the DataFrame." ) - columns: Index = properties.AxisProperty( + columns: "Index" = properties.AxisProperty( axis=0, doc="The column labels of the DataFrame." ) @@ -9473,12 +9269,14 @@ NaN 12.3 33.0 DataFrame._add_numeric_operations() +DataFrame._add_series_or_dataframe_operations() ops.add_flex_arithmetic_methods(DataFrame) +ops.add_special_arithmetic_methods(DataFrame) -def _from_nested_dict(data) -> collections.defaultdict: - new_data: collections.defaultdict = collections.defaultdict(dict) +def _from_nested_dict(data): + new_data = collections.defaultdict(dict) for index, s in data.items(): for col, v in s.items(): new_data[col][index] = v diff --git a/venv/lib/python3.8/site-packages/pandas/core/generic.py b/venv/lib/python3.8/site-packages/pandas/core/generic.py index e43edf1..1c6248a 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/generic.py +++ b/venv/lib/python3.8/site-packages/pandas/core/generic.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import collections from datetime import timedelta import functools @@ -8,6 +6,7 @@ import json import operator import pickle import re +from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -23,7 +22,6 @@ from typing import ( Tuple, Type, Union, - cast, ) import warnings import weakref @@ -33,28 +31,29 @@ import numpy as np from pandas._config import config from pandas._libs import lib -from pandas._libs.tslibs import Period, Tick, Timestamp, to_offset +from pandas._libs.tslibs import Tick, Timestamp, to_offset from pandas._typing import ( Axis, - CompressionOptions, FilePathOrBuffer, FrameOrSeries, - IndexKeyFunc, - IndexLabel, JSONSerializable, Label, Level, Renamer, - StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, - final, ) +from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, InvalidIndexError -from pandas.util._decorators import doc, rewrite_axis_style_signature +from pandas.util._decorators import ( + Appender, + Substitution, + doc, + rewrite_axis_style_signature, +) from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -70,7 +69,6 @@ from pandas.core.dtypes.common import ( is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, - is_dtype_equal, is_extension_array_dtype, is_float, is_list_like, @@ -87,56 +85,69 @@ from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas.core import arraylike, indexing, missing, nanops +from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.flags import Flags -from pandas.core.indexes import base as ibase -from pandas.core.indexes.api import ( - DatetimeIndex, - Index, - MultiIndex, - PeriodIndex, - RangeIndex, - ensure_index, -) +from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.period import Period, PeriodIndex +import pandas.core.indexing as 
indexing from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index -from pandas.core.ops import align_method_FRAME +from pandas.core.ops import _align_method_FRAME from pandas.core.shared_docs import _shared_docs -from pandas.core.sorting import get_indexer_indexer -from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt -from pandas.io.formats.format import ( - DataFrameFormatter, - DataFrameRenderer, - format_percentiles, -) +from pandas.io.formats.format import DataFrameFormatter, format_percentiles from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas._libs.tslibs import BaseOffset - - from pandas.core.frame import DataFrame from pandas.core.resample import Resampler - from pandas.core.series import Series - from pandas.core.window.indexers import BaseIndexer + from pandas.core.series import Series # noqa: F401 # goal is to be able to define the docs close to function, while still being # able to share -_shared_docs = {**_shared_docs} -_shared_doc_kwargs = { - "axes": "keywords for axes", - "klass": "Series/DataFrame", - "axes_single_arg": "int or labels for object", - "args_transpose": "axes to permute (int or label for object)", - "optional_by": """ +_shared_doc_kwargs = dict( + axes="keywords for axes", + klass="Series/DataFrame", + axes_single_arg="int or labels for object", + args_transpose="axes to permute (int or label for object)", + optional_by=""" by : str or list of str Name or list of names to sort by""", -} +) + + +def _single_replace(self, to_replace, method, inplace, limit): + """ + Replaces values in a Series using the fill method specified when no + replacement value is given in the replace method + """ + if self.ndim != 1: + raise TypeError( + f"cannot replace {to_replace} with method {method} on a " + f"{type(self).__name__}" + ) + + orig_dtype = self.dtype + result = self if inplace else self.copy() + fill_f = missing.get_fill_func(method) + + mask = missing.mask_missing(result.values, to_replace) + values = fill_f(result.values, limit=limit, mask=mask) + + if values.dtype == orig_dtype and inplace: + return + + result = pd.Series(values, index=self.index, dtype=self.dtype).__finalize__(self) + + if inplace: + self._update_inplace(result) + return + + return result bool_t = bool # Need alias because NDFrame has def bool: @@ -168,11 +179,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): "_metadata", "__array_struct__", "__array_interface__", - "_flags", ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _hidden_attrs: FrozenSet[str] = frozenset(["get_values", "tshift"]) + _deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"]) _metadata: List[str] = [] _is_copy = None _mgr: BlockManager @@ -198,7 +208,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: attrs = dict(attrs) object.__setattr__(self, "_attrs", attrs) - object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @classmethod def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @@ -219,20 +228,15 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return mgr # ---------------------------------------------------------------------- - # attrs and flags @property def attrs(self) -> Dict[Optional[Hashable], Any]: """ - Dictionary of global attributes of this dataset. 
+ Dictionary of global attributes on this object. .. warning:: attrs is experimental and may change without warning. - - See Also - -------- - DataFrame.flags : Global flags applying to this object. """ if self._attrs is None: self._attrs = {} @@ -242,99 +246,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) - @final - @property - def flags(self) -> Flags: - """ - Get the properties associated with this pandas object. - - The available flags are - - * :attr:`Flags.allows_duplicate_labels` - - See Also - -------- - Flags : Flags that apply to pandas objects. - DataFrame.attrs : Global metadata applying to this dataset. - - Notes - ----- - "Flags" differ from "metadata". Flags reflect properties of the - pandas object (the Series or DataFrame). Metadata refer to properties - of the dataset, and should be stored in :attr:`DataFrame.attrs`. - - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2]}) - >>> df.flags - - - Flags can be get or set using ``.`` - - >>> df.flags.allows_duplicate_labels - True - >>> df.flags.allows_duplicate_labels = False - - Or by slicing with a key - - >>> df.flags["allows_duplicate_labels"] - False - >>> df.flags["allows_duplicate_labels"] = True - """ - return self._flags - - @final - def set_flags( - self: FrameOrSeries, - *, - copy: bool = False, - allows_duplicate_labels: Optional[bool] = None, - ) -> FrameOrSeries: - """ - Return a new object with updated flags. - - Parameters - ---------- - allows_duplicate_labels : bool, optional - Whether the returned object allows duplicate labels. - - Returns - ------- - Series or DataFrame - The same type as the caller. - - See Also - -------- - DataFrame.attrs : Global metadata applying to this dataset. - DataFrame.flags : Global flags applying to this object. - - Notes - ----- - This method returns a new object that's a view on the same data - as the input. Mutating the input or the output values will be reflected - in the other. - - This method is intended to be used in method chains. - - "Flags" differ from "metadata". Flags reflect properties of the - pandas object (the Series or DataFrame). Metadata refer to properties - of the dataset, and should be stored in :attr:`DataFrame.attrs`. - - Examples - -------- - >>> df = pd.DataFrame({"A": [1, 2]}) - >>> df.flags.allows_duplicate_labels - True - >>> df2 = df.set_flags(allows_duplicate_labels=False) - >>> df2.flags.allows_duplicate_labels - False - """ - df = self.copy(deep=copy) - if allows_duplicate_labels is not None: - df.flags["allows_duplicate_labels"] = allows_duplicate_labels - return df - - @final @classmethod def _validate_dtype(cls, dtype): """ validate the passed dtype """ @@ -380,7 +291,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Internals - @final @property def _data(self): # GH#33054 retained because some downstream packages uses this, @@ -402,23 +312,25 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): @property def _AXIS_NUMBERS(self) -> Dict[str, int]: """.. deprecated:: 1.1.0""" - warnings.warn("_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3) + warnings.warn( + "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3, + ) return {"index": 0} @property def _AXIS_NAMES(self) -> Dict[int, str]: """.. 
deprecated:: 1.1.0""" - warnings.warn("_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3) + warnings.warn( + "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3, + ) return {0: "index"} - @final def _construct_axes_dict(self, axes=None, **kwargs): """Return an axes dictionary for myself.""" d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)} d.update(kwargs) return d - @final @classmethod def _construct_axes_from_arguments( cls, args, kwargs, require_all: bool = False, sentinel=None @@ -450,7 +362,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): axes = {a: kwargs.pop(a, sentinel) for a in cls._AXIS_ORDERS} return axes, kwargs - @final @classmethod def _get_axis_number(cls, axis: Axis) -> int: try: @@ -458,19 +369,16 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): except KeyError: raise ValueError(f"No axis named {axis} for object type {cls.__name__}") - @final @classmethod def _get_axis_name(cls, axis: Axis) -> str: axis_number = cls._get_axis_number(axis) return cls._AXIS_ORDERS[axis_number] - @final def _get_axis(self, axis: Axis) -> Index: axis_number = self._get_axis_number(axis) assert axis_number in {0, 1} return self.index if axis_number == 0 else self.columns - @final @classmethod def _get_block_manager_axis(cls, axis: Axis) -> int: """Map the axis to the block_manager axis.""" @@ -480,11 +388,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return m - axis return axis - @final - def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]]: + def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: # index or columns axis_index = getattr(self, axis) - d = {} + d = dict() prefix = axis[0] for i, name in enumerate(axis_index.names): @@ -511,18 +418,16 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): d[axis] = dindex return d - @final - def _get_index_resolvers(self) -> Dict[Label, Union[Series, MultiIndex]]: + def _get_index_resolvers(self) -> Dict[str, ABCSeries]: from pandas.core.computation.parsing import clean_column_name - d: Dict[str, Union[Series, MultiIndex]] = {} + d: Dict[str, ABCSeries] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} - @final - def _get_cleaned_column_resolvers(self) -> Dict[Label, Series]: + def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: """ Return the special character free column resolvers of a dataframe. @@ -610,13 +515,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ return np.prod(self.shape) - @final @property def _selected_obj(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ return self - @final @property def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries: """ internal compat with SelectionMixin """ @@ -643,18 +546,12 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Returns ------- renamed : %(klass)s or None - An object of type %(klass)s or None if ``inplace=True``. + An object of type %(klass)s if inplace=False, None otherwise. See Also -------- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. 
""" - self._check_inplace_and_allows_duplicate_labels(inplace) - return self._set_axis_nocheck(labels, axis, inplace) - - @final - def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool): - # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy. if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -667,7 +564,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self._mgr.set_axis(axis, labels) self._clear_item_cache() - @final def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: """ Interchange axes and swap values axes appropriately. @@ -693,11 +589,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ignore needed because of NDFrame constructor is different than # DataFrame/Series constructors. - return self._constructor( - new_values, *new_axes # type: ignore[arg-type] - ).__finalize__(self, method="swapaxes") + return self._constructor(new_values, *new_axes).__finalize__( # type: ignore + self, method="swapaxes" + ) - @final def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ Return DataFrame with requested index / column level(s) removed. @@ -762,7 +657,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self, item: Label) -> Union[Series, Any]: + def pop(self, item: Label) -> Union["Series", Any]: result = self[item] del self[item] if self.ndim == 2: @@ -770,7 +665,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return result - @final def squeeze(self, axis=None): """ Squeeze 1 dimensional axis objects into scalars. @@ -1027,7 +921,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: index = mapper - self._check_inplace_and_allows_duplicate_labels(inplace) result = self if inplace else self.copy(deep=copy) for axis_no, replacements in enumerate((index, columns)): @@ -1052,7 +945,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): raise KeyError(f"{missing_labels} not found in axis") new_index = ax._transform_index(f, level) - result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) + result.set_axis(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: @@ -1094,7 +987,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Returns ------- Series, DataFrame, or None - The same type as the caller or None if ``inplace=True``. + The same type as the caller or None if `inplace` is True. See Also -------- @@ -1114,7 +1007,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): In this case, the parameter ``copy`` is ignored. The second calling convention will modify the names of the - corresponding index if mapper is a list or a scalar. + the corresponding index if mapper is a list or a scalar. However, if mapper is dict-like or a function, it will use the deprecated behavior of modifying the axis *labels*. @@ -1235,7 +1128,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if not inplace: return result - @final def _set_axis_name(self, name, axis=0, inplace=False): """ Set the name(s) of the axis. 
@@ -1298,24 +1190,20 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Comparison Methods - @final def _indexed_same(self, other) -> bool: return all( self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS ) - @final - def equals(self, other: object) -> bool: + def equals(self, other): """ Test whether two objects contain the same elements. This function allows two Series or DataFrames to be compared against each other to see if they have the same shape and elements. NaNs in - the same location are considered equal. - - The row/column index do not need to have the same type, as long - as the values are considered equal. Corresponding columns must be of - the same dtype. + the same location are considered equal. The column headers do not + need to have the same type, but the elements within the columns must + be the same dtype. Parameters ---------- @@ -1344,6 +1232,13 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): numpy.array_equal : Return True if two arrays have the same shape and elements, False otherwise. + Notes + ----- + This function requires that the elements have the same dtype as their + respective elements in the other Series or DataFrame. However, the + column labels do not need to have the same type, as long as they are + still considered equal. + Examples -------- >>> df = pd.DataFrame({1: [10], 2: [20]}) @@ -1385,13 +1280,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ if not (isinstance(other, type(self)) or isinstance(self, type(other))): return False - other = cast(NDFrame, other) return self._mgr.equals(other._mgr) # ------------------------------------------------------------------------- # Unary Methods - @final def __neg__(self): values = self._values if is_bool_dtype(values): @@ -1406,7 +1299,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): raise TypeError(f"Unary negative expects numeric dtype, not {values.dtype}") return self.__array_wrap__(arr) - @final def __pos__(self): values = self._values if is_bool_dtype(values): @@ -1424,7 +1316,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) return self.__array_wrap__(arr) - @final def __invert__(self): if not self.size: # inv fails with 0 len @@ -1434,7 +1325,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result = self._constructor(new_data).__finalize__(self, method="__invert__") return result - @final def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. " @@ -1443,7 +1333,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): __bool__ = __nonzero__ - @final def bool(self): """ Return the bool of a single element Series or DataFrame. @@ -1488,11 +1377,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self.__nonzero__() - @final def __abs__(self: FrameOrSeries) -> FrameOrSeries: return self.abs() - @final def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: return self.round(decimals) @@ -1504,7 +1391,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # operations should utilize/extend these methods when possible so that we # have consistent precedence and validation logic throughout the library. - @final def _is_level_reference(self, key, axis=0): """ Test whether a key is a level reference for a given axis. 
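Since the hunks above reword the `equals` docstring, a small doctest-style illustration of the dtype sensitivity it describes; the True/False results are what I'd expect from pandas 1.x, not output captured from this diff:

>>> import pandas as pd
>>> df = pd.DataFrame({1: [10], 2: [20]})
>>> # identical values and identical dtypes compare equal
>>> df.equals(pd.DataFrame({1: [10], 2: [20]}))
True
>>> # same values stored as floats: not equal, because column dtypes must match
>>> df.equals(pd.DataFrame({1: [10.0], 2: [20.0]}))
False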
@@ -1535,7 +1421,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): and not self._is_label_reference(key, axis=axis) ) - @final def _is_label_reference(self, key, axis=0) -> bool_t: """ Test whether a key is a label reference for a given axis. @@ -1565,7 +1450,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): and any(key in self.axes[ax] for ax in other_axes) ) - @final def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t: """ Test whether a key is a label or level reference for a given axis. @@ -1590,7 +1474,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): key, axis=axis ) - @final def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None: """ Check whether `key` is ambiguous. @@ -1634,7 +1517,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) raise ValueError(msg) - @final def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: """ Return a 1-D array of values associated with `key`, a label or level @@ -1694,12 +1576,14 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): label_axis_name = "column" if axis == 0 else "index" raise ValueError( - f"The {label_axis_name} label '{key}' is not unique.{multi_message}" + ( + f"The {label_axis_name} label '{key}' " + f"is not unique.{multi_message}" + ) ) return values - @final def _drop_labels_or_levels(self, keys, axis: int = 0): """ Drop labels and/or levels for the given `axis`. @@ -1736,8 +1620,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if invalid_keys: raise ValueError( - "The following keys are not valid labels or " - f"levels for axis {axis}: {invalid_keys}" + ( + "The following keys are not valid labels or " + f"levels for axis {axis}: {invalid_keys}" + ) ) # Compute levels and labels to drop @@ -1778,7 +1664,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Iteration - def __hash__(self) -> int: + def __hash__(self): raise TypeError( f"{repr(type(self).__name__)} objects are mutable, " f"thus they cannot be hashed" @@ -1830,7 +1716,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """Returns length of info axis""" return len(self._info_axis) - @final def __contains__(self, key) -> bool_t: """True if the key is in the info axis""" return key in self._info_axis @@ -1895,28 +1780,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._values, dtype=dtype) - def __array_wrap__( - self, - result: np.ndarray, - context: Optional[Tuple[Callable, Tuple[Any, ...], int]] = None, - ): - """ - Gets called after a ufunc and other functions. - - Parameters - ---------- - result: np.ndarray - The result of the ufunc or other function called on the NumPy array - returned by __array__ - context: tuple of (func, tuple, int) - This parameter is returned by ufuncs as a 3-element tuple: (name of the - ufunc, arguments of the ufunc, domain of the ufunc), but is not set by - other numpy functions.q - - Notes - ----- - Series implements __array_ufunc_ so this not called for ufunc on Series. - """ + def __array_wrap__(self, result, context=None): result = lib.item_from_zerodim(result) if is_scalar(result): # e.g. 
we get here with np.ptp(series) @@ -1927,11 +1791,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self, method="__array_wrap__" ) - def __array_ufunc__( - self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any - ): - return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs) - # ideally we would define this to avoid the getattr checks, but # is slower # @property @@ -1943,20 +1802,18 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Picklability - @final def __getstate__(self) -> Dict[str, Any]: meta = {k: getattr(self, k, None) for k in self._metadata} - return { - "_mgr": self._mgr, - "_typ": self._typ, - "_metadata": self._metadata, - "attrs": self.attrs, - "_flags": {k: self.flags[k] for k in self.flags._keys}, + return dict( + _mgr=self._mgr, + _typ=self._typ, + _metadata=self._metadata, + attrs=self.attrs, **meta, - } + ) - @final def __setstate__(self, state): + if isinstance(state, BlockManager): self._mgr = state elif isinstance(state, dict): @@ -1967,8 +1824,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) - flags = state.get("_flags", {"allows_duplicate_labels": True}) - object.__setattr__(self, "_flags", Flags(self, **flags)) # set in the order of internal names # to avoid definitional recursion @@ -1976,7 +1831,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # defined meta = set(self._internal_names + self._metadata) for k in list(meta): - if k in state and k != "_flags": + if k in state: v = state[k] object.__setattr__(self, k, v) @@ -2000,7 +1855,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): prepr = f"[{','.join(map(pprint_thing, self))}]" return f"{type(self).__name__}({prepr})" - @final def _repr_latex_(self): """ Returns a LaTeX representation for a particular object. @@ -2011,7 +1865,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: return None - @final def _repr_data_resource_(self): """ Not a real Jupyter special repr method, but we use the same @@ -2019,23 +1872,21 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ if config.get_option("display.html.table_schema"): data = self.head(config.get_option("display.max_rows")) - - as_json = data.to_json(orient="table") - as_json = cast(str, as_json) - payload = json.loads(as_json, object_pairs_hook=collections.OrderedDict) + payload = json.loads( + data.to_json(orient="table"), object_pairs_hook=collections.OrderedDict + ) return payload # ---------------------------------------------------------------------- # I/O Methods - @final - @doc(klass="object", storage_options=_shared_docs["storage_options"]) + @doc(klass="object") def to_excel( self, excel_writer, - sheet_name: str = "Sheet1", - na_rep: str = "", - float_format: Optional[str] = None, + sheet_name="Sheet1", + na_rep="", + float_format=None, columns=None, header=True, index=True, @@ -2048,7 +1899,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): inf_rep="inf", verbose=True, freeze_panes=None, - storage_options: StorageOptions = None, ) -> None: """ Write {klass} to an Excel sheet. 
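A brief usage sketch for the `to_excel` signature shown above, assuming an `openpyxl` or `xlsxwriter` engine is installed; the file name and frame are illustrative:

>>> import pandas as pd
>>> df = pd.DataFrame([["a", "b"], ["c", "d"]],
...                   index=["row 1", "row 2"], columns=["col 1", "col 2"])
>>> # one sheet per call; freeze_panes pins the header row and the index column
>>> df.to_excel("output.xlsx", sheet_name="Sheet1", freeze_panes=(1, 1))  # doctest: +SKIP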
@@ -2065,7 +1915,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Parameters ---------- - excel_writer : path-like, file-like, or ExcelWriter object + excel_writer : str or ExcelWriter object File path or existing ExcelWriter. sheet_name : str, default 'Sheet1' Name of sheet which will contain DataFrame. @@ -2093,13 +1943,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``. - - .. deprecated:: 1.2.0 - - As the `xlwt `__ package is no longer - maintained, the ``xlwt`` engine will be removed in a future version - of pandas. - merge_cells : bool, default True Write MultiIndex and Hierarchical Rows as merged cells. encoding : str, optional @@ -2113,9 +1956,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): freeze_panes : tuple of int (length 2), optional Specifies the one-based bottommost row and rightmost column that is to be frozen. - {storage_options} - - .. versionadded:: 1.2.0 See Also -------- @@ -2190,11 +2030,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): startcol=startcol, freeze_panes=freeze_panes, engine=engine, - storage_options=storage_options, ) - @final - @doc(storage_options=_shared_docs["storage_options"]) def to_json( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -2205,10 +2042,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool_t = False, - compression: CompressionOptions = "infer", + compression: Optional[str] = "infer", index: bool_t = True, indent: Optional[int] = None, - storage_options: StorageOptions = None, ) -> Optional[str]: """ Convert the object to a JSON string. @@ -2227,27 +2063,29 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): * Series: - default is 'index' - - allowed values are: {{'split', 'records', 'index', 'table'}}. + - allowed values are: {'split','records','index','table'}. * DataFrame: - default is 'columns' - - allowed values are: {{'split', 'records', 'index', 'columns', - 'values', 'table'}}. + - allowed values are: {'split', 'records', 'index', 'columns', + 'values', 'table'}. * The format of the JSON string: - - 'split' : dict like {{'index' -> [index], 'columns' -> [columns], - 'data' -> [values]}} - - 'records' : list like [{{column -> value}}, ... , {{column -> value}}] - - 'index' : dict like {{index -> {{column -> value}}}} - - 'columns' : dict like {{column -> {{index -> value}}}} + - 'split' : dict like {'index' -> [index], 'columns' -> [columns], + 'data' -> [values]} + - 'records' : list like [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + - 'columns' : dict like {column -> {index -> value}} - 'values' : just the values array - - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}} + - 'table' : dict like {'schema': {schema}, 'data': {data}} Describing the data, where data component is like ``orient='records'``. - date_format : {{None, 'epoch', 'iso'}} + .. versionchanged:: 0.20.0 + + date_format : {None, 'epoch', 'iso'} Type of date conversion. 'epoch' = epoch milliseconds, 'iso' = ISO8601. The default depends on the `orient`. For ``orient='table'``, the default is 'iso'. 
For all other orients, @@ -2270,7 +2108,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): throw ValueError if incorrect 'orient' since others are not list like. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}} + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} A string representing the compression to use in the output file, only used when the first argument is a filename. By default, the @@ -2282,15 +2120,14 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when orient is 'split' or 'table'. + + .. versionadded:: 0.23.0 + indent : int, optional Length of whitespace used to indent each record. .. versionadded:: 1.0.0 - {storage_options} - - .. versionadded:: 1.2.0 - Returns ------- None or str @@ -2308,10 +2145,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): and the default ``indent=None`` are equivalent in pandas, though this may change in a future release. - ``orient='table'`` contains a 'pandas_version' field under 'schema'. - This stores the version of `pandas` used in the latest revision of the - schema. - Examples -------- >>> import json @@ -2324,7 +2157,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> result = df.to_json(orient="split") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ + { "columns": [ "col 1", "col 2" @@ -2343,7 +2176,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): "d" ] ] - }} + } Encoding/decoding a Dataframe using ``'records'`` formatted JSON. Note that index labels are not preserved with this encoding. @@ -2352,14 +2185,14 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP [ - {{ + { "col 1": "a", "col 2": "b" - }}, - {{ + }, + { "col 1": "c", "col 2": "d" - }} + } ] Encoding/decoding a Dataframe using ``'index'`` formatted JSON: @@ -2367,32 +2200,32 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> result = df.to_json(orient="index") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ - "row 1": {{ + { + "row 1": { "col 1": "a", "col 2": "b" - }}, - "row 2": {{ + }, + "row 2": { "col 1": "c", "col 2": "d" - }} - }} + } + } Encoding/decoding a Dataframe using ``'columns'`` formatted JSON: >>> result = df.to_json(orient="columns") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ - "col 1": {{ + { + "col 1": { "row 1": "a", "row 2": "c" - }}, - "col 2": {{ + }, + "col 2": { "row 1": "b", "row 2": "d" - }} - }} + } + } Encoding/decoding a Dataframe using ``'values'`` formatted JSON: @@ -2415,40 +2248,40 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> result = df.to_json(orient="table") >>> parsed = json.loads(result) >>> json.dumps(parsed, indent=4) # doctest: +SKIP - {{ - "schema": {{ + { + "schema": { "fields": [ - {{ + { "name": "index", "type": "string" - }}, - {{ + }, + { "name": "col 1", "type": "string" - }}, - {{ + }, + { "name": "col 2", "type": "string" - }} + } ], "primaryKey": [ "index" ], "pandas_version": "0.20.0" - }}, + }, "data": [ - {{ + { "index": "row 1", "col 1": "a", "col 2": "b" - }}, - {{ + }, + { "index": "row 2", "col 1": "c", "col 2": "d" - }} + } ] - }} + } """ from pandas.io import json @@ -2473,10 +2306,8 @@ 
class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): compression=compression, index=index, indent=indent, - storage_options=storage_options, ) - @final def to_hdf( self, path_or_buf, @@ -2618,7 +2449,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): encoding=encoding, ) - @final def to_sql( self, name: str, @@ -2729,8 +2559,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> engine.execute("SELECT * FROM users").fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] - An `sqlalchemy.engine.Connection` can also be passed to `con`: - + An `sqlalchemy.engine.Connection` can also be passed to to `con`: >>> with engine.begin() as connection: ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) ... df1.to_sql('users', con=connection, if_exists='append') @@ -2786,14 +2615,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): method=method, ) - @final - @doc(storage_options=_shared_docs["storage_options"]) def to_pickle( self, path, - compression: CompressionOptions = "infer", + compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, - storage_options: StorageOptions = None, ) -> None: """ Pickle (serialize) object to file. @@ -2802,29 +2628,18 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ---------- path : str File path where the pickled object will be stored. - compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, \ + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, \ default 'infer' A string representing the compression to use in the output file. By default, infers from the file extension in specified path. - Compression mode may be any of the following possible - values: {{‘infer’, ‘gzip’, ‘bz2’, ‘zip’, ‘xz’, None}}. If compression - mode is ‘infer’ and path_or_buf is path-like, then detect - compression mode from the following extensions: - ‘.gz’, ‘.bz2’, ‘.zip’ or ‘.xz’. (otherwise no compression). - If dict given and mode is ‘zip’ or inferred as ‘zip’, other entries - passed as additional compression options. protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible - values are 0, 1, 2, 3, 4, 5. A negative value for the protocol + values are 0, 1, 2, 3, 4. A negative value for the protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. .. [1] https://docs.python.org/3/library/pickle.html. - {storage_options} - - .. versionadded:: 1.2.0 - See Also -------- read_pickle : Load pickled pandas object (or any object) from file. 
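To ground the `protocol` and `compression` notes above, a hedged sketch; `pickle.HIGHEST_PROTOCOL` being 5 assumes the Python 3.8 interpreter this venv targets, and the file name is illustrative:

>>> import pickle
>>> import pandas as pd
>>> df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
>>> pickle.HIGHEST_PROTOCOL
5
>>> # pin an older protocol for older readers; compression is inferred from the suffix
>>> df.to_pickle("frame.pkl.gz", protocol=4)  # doctest: +SKIP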
@@ -2834,7 +2649,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Examples -------- - >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) + >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar 0 0 5 @@ -2858,15 +2673,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ from pandas.io.pickle import to_pickle - to_pickle( - self, - path, - compression=compression, - protocol=protocol, - storage_options=storage_options, - ) + to_pickle(self, path, compression=compression, protocol=protocol) - @final def to_clipboard( self, excel: bool_t = True, sep: Optional[str] = None, **kwargs ) -> None: @@ -2928,7 +2736,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) - @final def to_xarray(self): """ Return an xarray object from the pandas object. @@ -3012,8 +2819,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: return xarray.Dataset.from_dataframe(self) - @final - @doc(returns=fmt.return_docstring) + @Substitution(returns=fmt.return_docstring) def to_latex( self, buf=None, @@ -3037,21 +2843,20 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): multirow=None, caption=None, label=None, - position=None, ): r""" Render object to a LaTeX tabular, longtable, or nested table/tabular. - Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted + Requires ``\usepackage{booktabs}``. The output can be copy/pasted into a main LaTeX document or read from an external file - with ``\input{{table.tex}}``. + with ``\input{table.tex}``. + + .. versionchanged:: 0.20.2 + Added to Series. .. versionchanged:: 1.0.0 Added caption and label arguments. - .. versionchanged:: 1.2.0 - Added position argument, changed meaning of caption argument. - Parameters ---------- buf : str, Path or StringIO-like, optional, default None @@ -3067,13 +2872,13 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Write row names (index). na_rep : str, default 'NaN' Missing data representation. - formatters : list of functions or dict of {{str: function}}, optional + formatters : list of functions or dict of {str: function}, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string. List must be of length equal to the number of columns. float_format : one-parameter function or str, optional, default None Formatter for floating point numbers. For example - ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will + ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will both result in 0.1234 being formatted as 0.12. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print @@ -3091,7 +2896,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): longtable : bool, optional By default, the value will be read from the pandas config module. Use a longtable environment instead of tabular. Requires - adding a \usepackage{{longtable}} to your LaTeX preamble. + adding a \usepackage{longtable} to your LaTeX preamble. escape : bool, optional By default, the value will be read from the pandas config module. When set to False prevents from escaping latex special @@ -3109,31 +2914,21 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): The default will be read from the config module. 
multirow : bool, default False Use \multirow to enhance MultiIndex rows. Requires adding a - \usepackage{{multirow}} to your LaTeX preamble. Will print + \usepackage{multirow} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. - caption : str or tuple, optional - Tuple (full_caption, short_caption), - which results in ``\caption[short_caption]{{full_caption}}``; - if a single string is passed, no short caption will be set. + caption : str, optional + The LaTeX caption to be placed inside ``\caption{}`` in the output. .. versionadded:: 1.0.0 - .. versionchanged:: 1.2.0 - Optionally allow caption to be a tuple ``(full_caption, short_caption)``. - label : str, optional - The LaTeX label to be placed inside ``\label{{}}`` in the output. - This is used with ``\ref{{}}`` in the main ``.tex`` file. + The LaTeX label to be placed inside ``\label{}`` in the output. + This is used with ``\ref{}`` in the main ``.tex`` file. .. versionadded:: 1.0.0 - position : str, optional - The LaTeX positional argument for tables, to be placed after - ``\begin{{}}`` in the output. - - .. versionadded:: 1.2.0 - {returns} + %(returns)s See Also -------- DataFrame.to_string : Render a DataFrame to a console-friendly @@ -3142,18 +2937,18 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Examples -------- - >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], - ... mask=['red', 'purple'], - ... weapon=['sai', 'bo staff'])) + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 'weapon': ['sai', 'bo staff']}) >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE - \begin{{tabular}}{{lll}} + \begin{tabular}{lll} \toprule name & mask & weapon \\ \midrule Raphael & red & sai \\ Donatello & purple & bo staff \\ \bottomrule - \end{{tabular}} + \end{tabular} """ # Get defaults from the pandas config if self.ndim == 1: @@ -3169,7 +2964,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if multirow is None: multirow = config.get_option("display.latex.multirow") - self = cast("DataFrame", self) formatter = DataFrameFormatter( self, columns=columns, @@ -3185,7 +2979,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): escape=escape, decimal=decimal, ) - return DataFrameRenderer(formatter).to_latex( + return formatter.to_latex( buf=buf, column_format=column_format, longtable=longtable, @@ -3195,11 +2989,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): multirow=multirow, caption=caption, label=label, - position=position, ) - @final - @doc(storage_options=_shared_docs["storage_options"]) def to_csv( self, path_or_buf: Optional[FilePathOrBuffer] = None, @@ -3209,10 +3000,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): columns: Optional[Sequence[Label]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: Optional[IndexLabel] = None, + index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, - compression: CompressionOptions = "infer", + compression: Optional[Union[str, Mapping[str, str]]] = "infer", quoting: Optional[int] = None, quotechar: str = '"', line_terminator: Optional[str] = None, @@ -3220,9 +3011,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): date_format: Optional[str] = None, doublequote: 
bool_t = True, escapechar: Optional[str] = None, - decimal: str = ".", + decimal: Optional[str] = ".", errors: str = "strict", - storage_options: StorageOptions = None, ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3234,18 +3024,13 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ---------- path_or_buf : str or file handle, default None File path or object, if None is provided the result is returned as - a string. If a non-binary file object is passed, it should be opened - with `newline=''`, disabling universal newlines. If a binary - file object is passed, `mode` might need to contain a `'b'`. + a string. If a file object is passed it should be opened with + `newline=''`, disabling universal newlines. .. versionchanged:: 0.24.0 Was previously named "path" for Series. - .. versionchanged:: 1.2.0 - - Support for binary file objects was introduced. - sep : str, default ',' String of length 1. Field delimiter for the output file. na_rep : str, default '' @@ -3274,16 +3059,15 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Python write mode, default 'w'. encoding : str, optional A string representing the encoding to use in the output file, - defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` - is a non-binary file object. + defaults to 'utf-8'. compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following - possible values: {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}. If + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' and `path_or_buf` is path-like, then detect compression mode from the following extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given - and mode is one of {{'zip', 'gzip', 'bz2'}}, or inferred as + and mode is one of {'zip', 'gzip', 'bz2'}, or inferred as one of the above, other entries passed as additional compression options. @@ -3299,16 +3083,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): supported for compression modes 'gzip' and 'bz2' as well as 'zip'. - .. versionchanged:: 1.2.0 - - Compression is supported for binary file objects. - - .. versionchanged:: 1.2.0 - - Previous versions forwarded dict entries for 'gzip' to - `gzip.open` instead of `gzip.GzipFile` which prevented - setting `mtime`. - quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC @@ -3340,10 +3114,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): .. versionadded:: 1.1.0 - {storage_options} - - .. versionadded:: 1.2.0 - Returns ------- None or str @@ -3357,9 +3127,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Examples -------- - >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'], + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}}) + ... 
'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' @@ -3372,16 +3142,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() - formatter = DataFrameFormatter( - frame=df, - header=header, - index=index, - na_rep=na_rep, - float_format=float_format, - decimal=decimal, - ) + from pandas.io.formats.csvs import CSVFormatter - return DataFrameRenderer(formatter).to_csv( + formatter = CSVFormatter( + df, path_or_buf, line_terminator=line_terminator, sep=sep, @@ -3389,7 +3153,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): errors=errors, compression=compression, quoting=quoting, - columns=columns, + na_rep=na_rep, + float_format=float_format, + cols=columns, + header=header, + index=index, index_label=index_label, mode=mode, chunksize=chunksize, @@ -3397,13 +3165,18 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): date_format=date_format, doublequote=doublequote, escapechar=escapechar, - storage_options=storage_options, + decimal=decimal, ) + formatter.save() + + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + + return None # ---------------------------------------------------------------------- # Lookup Caching - @final def _set_as_cached(self, item, cacher) -> None: """ Set the _cacher attribute on the calling object with a weakref to @@ -3411,7 +3184,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ self._cacher = (item, weakref.ref(cacher)) - @final def _reset_cacher(self) -> None: """ Reset the cacher. @@ -3419,7 +3191,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if hasattr(self, "_cacher"): del self._cacher - @final def _maybe_cache_changed(self, item, value) -> None: """ The object has called back to us saying maybe it has changed. 
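For reference only (not part of the patch): a small sketch of the two to_csv() paths touched in the hunks above, returning a string when no target is given versus writing to a file. The file name "out.csv" is invented for illustration.

import pandas as pd

df = pd.DataFrame({"name": ["Raphael", "Donatello"],
                   "mask": ["red", "purple"],
                   "weapon": ["sai", "bo staff"]})

csv_text = df.to_csv(index=False)   # path_or_buf=None: the CSV text is returned
print(csv_text)

df.to_csv("out.csv", index=False)   # path given: file is written, None returned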
@@ -3427,13 +3198,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): loc = self._info_axis.get_loc(item) self._mgr.iset(loc, value) - @final @property def _is_cached(self) -> bool_t: """Return boolean indicating if self is cached or not.""" return getattr(self, "_cacher", None) is not None - @final def _get_cacher(self): """return my cacher or None""" cacher = getattr(self, "_cacher", None) @@ -3441,7 +3210,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): cacher = cacher[1]() return cacher - @final def _maybe_update_cacher( self, clear: bool_t = False, verify_is_copy: bool_t = True ) -> None: @@ -3460,7 +3228,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if cacher is not None: ref = cacher[1]() - # we are trying to reference a dead referent, hence + # we are trying to reference a dead referant, hence # a copy if ref is None: del self._cacher @@ -3474,12 +3242,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ref._item_cache.pop(cacher[0], None) if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t="referent") + self._check_setitem_copy(stacklevel=5, t="referant") if clear: self._clear_item_cache() - @final def _clear_item_cache(self) -> None: self._item_cache.clear() @@ -3576,7 +3343,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): stacklevel=2, ) - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) self._consolidate_inplace() @@ -3585,7 +3352,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) return self._constructor(new_data).__finalize__(self, method="take") - @final def _take_with_is_copy(self: FrameOrSeries, indices, axis=0) -> FrameOrSeries: """ Internal version of the `take` method that sets the `_is_copy` @@ -3600,7 +3366,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result._set_is_copy(self) return result - @final def xs(self, key, axis=0, level=None, drop_level: bool_t = True): """ Return cross-section from the Series/DataFrame. 
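Illustrative sketch (data made up) of DataFrame.xs, whose axis and MultiIndex handling is rewritten in the hunk below.

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [("mammal", "cat"), ("mammal", "dog"), ("bird", "hawk")],
    names=["class", "animal"],
)
df = pd.DataFrame({"legs": [4, 4, 2], "wings": [0, 0, 2]}, index=idx)

print(df.xs("mammal"))               # cross-section on the first index level
print(df.xs("dog", level="animal"))  # cross-section on a named level
print(df.xs("legs", axis=1))         # column cross-section returns a Series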
@@ -3717,23 +3482,15 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return result if axis == 1: - if drop_level: - return self[key] - index = self.columns - else: - index = self.index + return self[key] self._consolidate_inplace() + index = self.index if isinstance(index, MultiIndex): - try: - loc, new_index = index._get_loc_level( - key, level=0, drop_level=drop_level - ) - except TypeError as e: - raise TypeError(f"Expected label or tuple of labels, got {key}") from e + loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) else: - loc = index.get_loc(key) + loc = self.index.get_loc(key) if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: @@ -3743,9 +3500,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return self._take_with_is_copy(loc, axis=axis) if not is_scalar(loc): - new_index = index[loc] + new_index = self.index[loc] - if is_scalar(loc) and axis == 0: + if is_scalar(loc): # In this case loc should be an integer if self.ndim == 1: # if we encounter an array-like and we only have 1 dim @@ -3761,10 +3518,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): name=self.index[loc], dtype=new_values.dtype, ) - elif is_scalar(loc): - result = self.iloc[:, slice(loc, loc + 1)] - elif axis == 1: - result = self.iloc[:, loc] + else: result = self.iloc[loc] result.index = new_index @@ -3777,7 +3531,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def __getitem__(self, item): raise AbstractMethodError(self) - @final def _get_item_cache(self, item): """Return the cached item, item represents a label indexer.""" cache = self._item_cache @@ -3788,7 +3541,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): loc = self.columns.get_loc(item) values = self._mgr.iget(loc) - res = self._box_col_values(values, loc).__finalize__(self) + res = self._box_col_values(values, loc) cache[item] = res res._set_as_cached(item, self) @@ -3828,7 +3581,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): NDFrame._iset_item(self, loc, value) - @final def _set_is_copy(self, ref, copy: bool_t = True) -> None: if not copy: self._is_copy = None @@ -3836,7 +3588,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): assert ref is not None self._is_copy = weakref.ref(ref) - @final def _check_is_chained_assignment_possible(self) -> bool_t: """ Check if we are a view, have a cacher, and are of mixed type. 
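Not part of the diff: a minimal example of the chained-assignment situation that _check_setitem_copy() guards against, and the single .loc write that avoids it.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

# Chained indexing writes into a possibly-temporary object and may emit
# SettingWithCopyWarning:
# df[df["a"] > 1]["b"] = 0

# A single .loc call updates df itself:
df.loc[df["a"] > 1, "b"] = 0
print(df)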
@@ -3851,13 +3602,12 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if self._is_view and self._is_cached: ref = self._get_cacher() if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t="referent", force=True) + self._check_setitem_copy(stacklevel=4, t="referant", force=True) return True elif self._is_copy: - self._check_setitem_copy(stacklevel=4, t="referent") + self._check_setitem_copy(stacklevel=4, t="referant") return False - @final def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): """ @@ -3900,7 +3650,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # the copy weakref if self._is_copy is not None and not isinstance(self._is_copy, str): r = self._is_copy() - if not gc.get_referents(r) or (r is not None and r.shape == self.shape): + if not gc.get_referents(r) or r.shape == self.shape: self._is_copy = None return @@ -3908,7 +3658,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if isinstance(self._is_copy, str): t = self._is_copy - elif t == "referent": + elif t == "referant": t = ( "\n" "A value is trying to be set on a copy of a slice from a " @@ -3972,15 +3722,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Unsorted - @final - def _check_inplace_and_allows_duplicate_labels(self, inplace): - if inplace and not self.flags.allows_duplicate_labels: - raise ValueError( - "Cannot specify 'inplace=True' when " - "'self.flags.allows_duplicate_labels' is False." - ) - - @final def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). @@ -4000,13 +3741,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): except (KeyError, ValueError, IndexError): return default - @final @property def _is_view(self) -> bool_t: """Return boolean indicating if self is view of another array """ return self._mgr.is_view - @final def reindex_like( self: FrameOrSeries, other, @@ -4045,7 +3784,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Maximum number of consecutive labels to fill for inexact matches. tolerance : optional Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations must + matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Tolerance may be a scalar value, which applies the same tolerance @@ -4154,7 +3893,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: return obj - @final def _drop_axis( self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" ) -> FrameOrSeries: @@ -4210,7 +3948,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return result - @final def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: """ Replace self internals with result. @@ -4228,7 +3965,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self._mgr = result._mgr self._maybe_update_cacher(verify_is_copy=verify_is_copy) - @final def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: """ Prefix labels with string `prefix`. 
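Usage sketch (invented data) for add_prefix/add_suffix, which the hunk below reimplements via rename(): column labels are prefixed for a DataFrame, index labels for a Series.

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
print(df.add_prefix("col_"))   # columns become col_A, col_B
print(df.add_suffix("_raw"))   # columns become A_raw, B_raw

s = pd.Series([1, 2, 3])
print(s.add_prefix("item_"))   # index labels become item_0, item_1, item_2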
@@ -4286,13 +4022,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - # error: Incompatible return value type (got "Optional[FrameOrSeries]", - # expected "FrameOrSeries") - # error: Argument 1 to "rename" of "NDFrame" has incompatible type - # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" - return self.rename(**mapper) # type: ignore[return-value, arg-type] + return self.rename(**mapper) # type: ignore - @final def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ Suffix labels with string `suffix`. @@ -4350,11 +4081,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - # error: Incompatible return value type (got "Optional[FrameOrSeries]", - # expected "FrameOrSeries") - # error: Argument 1 to "rename" of "NDFrame" has incompatible type - # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" - return self.rename(**mapper) # type: ignore[return-value, arg-type] + return self.rename(**mapper) # type: ignore def sort_values( self, @@ -4405,7 +4132,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Returns ------- DataFrame or None - DataFrame with sorted values or None if ``inplace=True``. + DataFrame with sorted values if inplace=False, None otherwise. See Also -------- @@ -4483,79 +4210,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): 3 NaN 8 4 D 4 D 7 2 e 5 C 4 3 F - - Natural sort with the key argument, - using the `natsort ` package. - - >>> df = pd.DataFrame({ - ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], - ... "value": [10, 20, 30, 40, 50] - ... }) - >>> df - time value - 0 0hr 10 - 1 128hr 20 - 2 72hr 30 - 3 48hr 40 - 4 96hr 50 - >>> from natsort import index_natsorted - >>> df.sort_values( - ... by="time", - ... key=lambda x: np.argsort(index_natsorted(df["time"])) - ... 
) - time value - 0 0hr 10 - 3 48hr 40 - 2 72hr 30 - 4 96hr 50 - 1 128hr 20 """ raise AbstractMethodError(self) - def sort_index( - self, - axis=0, - level=None, - ascending: bool_t = True, - inplace: bool_t = False, - kind: str = "quicksort", - na_position: str = "last", - sort_remaining: bool_t = True, - ignore_index: bool_t = False, - key: IndexKeyFunc = None, - ): - - inplace = validate_bool_kwarg(inplace, "inplace") - axis = self._get_axis_number(axis) - target = self._get_axis(axis) - - indexer = get_indexer_indexer( - target, level, ascending, kind, na_position, sort_remaining, key - ) - - if indexer is None: - if inplace: - return - else: - return self.copy() - - baxis = self._get_block_manager_axis(axis) - new_data = self._mgr.take(indexer, axis=baxis, verify=False) - - # reconstruct axis if needed - new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() - - if ignore_index: - axis = 1 if isinstance(self, ABCDataFrame) else 0 - new_data.axes[axis] = ibase.default_index(len(indexer)) - - result = self._constructor(new_data) - - if inplace: - return self._update_inplace(result) - else: - return result.__finalize__(self, method="sort_index") - @doc( klass=_shared_doc_kwargs["klass"], axes=_shared_doc_kwargs["axes"], @@ -4807,7 +4464,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): axes, level, limit, tolerance, method, fill_value, copy ).__finalize__(self, method="reindex") - @final def _reindex_axes( self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy ) -> FrameOrSeries: @@ -4833,7 +4489,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return obj - @final def _needs_reindex_multi(self, axes, method, level) -> bool_t: """Check if we do need a multi reindex.""" return ( @@ -4846,7 +4501,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def _reindex_multi(self, axes, copy, fill_value): raise AbstractMethodError(self) - @final def _reindex_with_indexers( self: FrameOrSeries, reindexers, @@ -4971,15 +4625,14 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return self.reindex(**{name: [r for r in items if r in labels]}) elif like: - def f(x) -> bool: - assert like is not None # needed for mypy + def f(x): return like in ensure_str(x) values = labels.map(f) return self.loc(axis=axis)[values] elif regex: - def f(x) -> bool: + def f(x): return matcher.search(ensure_str(x)) is not None matcher = re.compile(regex) @@ -4988,7 +4641,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: raise TypeError("Must pass either `items`, `like`, or `regex`") - @final def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ Return the first `n` rows. @@ -5061,7 +4713,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ return self.iloc[:n] - @final def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: """ Return the last `n` rows. @@ -5136,7 +4787,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return self.iloc[0:0] return self.iloc[-n:] - @final def sample( self: FrameOrSeries, n=None, @@ -5345,7 +4995,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis) - @final @doc(klass=_shared_doc_kwargs["klass"]) def pipe(self, func, *args, **kwargs): r""" @@ -5397,13 +5046,54 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ... .pipe(g, arg1=a) ... 
.pipe((func, 'arg2'), arg1=a, arg3=c) ... ) # doctest: +SKIP - """ + """ return com.pipe(self, func, *args, **kwargs) + _shared_docs["aggregate"] = dedent( + """ + Aggregate using one or more operations over the specified axis. + {versionadded} + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + {axis} + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. + {see_also} + Notes + ----- + `agg` is an alias for `aggregate`. Use the alias. + + A passed user-defined-function will be passed a Series for evaluation. + {examples}""" + ) + # ---------------------------------------------------------------------- # Attribute access - @final def __finalize__( self: FrameOrSeries, other, method: Optional[str] = None, **kwargs ) -> FrameOrSeries: @@ -5418,7 +5108,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): A passed method name providing context on where ``__finalize__`` was called. - .. warning:: + .. warning: The value passed as `method` are not currently considered stable across pandas releases. @@ -5426,19 +5116,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if isinstance(other, NDFrame): for name in other.attrs: self.attrs[name] = other.attrs[name] - - self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. - for name in set(self._metadata) & set(other._metadata): + for name in self._metadata: assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) - - if method == "concat": - allows_duplicate_labels = all( - x.flags.allows_duplicate_labels for x in other.objs - ) - self.flags.allows_duplicate_labels = allows_duplicate_labels - return self def __getattr__(self, name: str): @@ -5500,21 +5181,21 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) object.__setattr__(self, name, value) - @final - def _dir_additions(self) -> Set[str]: + def _dir_additions(self): """ add the string-like attributes from the info_axis. - If info_axis is a MultiIndex, its first level values are used. + If info_axis is a MultiIndex, it's first level values are used. 
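A quick sketch (not from the patch, data invented) of the func combinations listed in the "aggregate" shared docstring added above: a single function, a list of names, and a per-column dict.

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

print(df.agg(np.sum))               # single function -> Series
print(df.agg(["sum", "mean"]))      # list of functions -> DataFrame
print(df.agg({"A": "min", "B": "max"}))   # dict of column -> function

s = pd.Series([1, 2, 3])
print(s.agg("sum"))                 # Series.agg with one function -> scalar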
""" - additions = super()._dir_additions() - if self._info_axis._can_hold_strings: - additions.update(self._info_axis._dir_additions_for_owner) - return additions + additions = { + c + for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, str) and c.isidentifier() + } + return super()._dir_additions().union(additions) # ---------------------------------------------------------------------- # Consolidation of internals - @final def _protect_consolidate(self, f): """ Consolidate _mgr -- if the blocks have changed, then clear the @@ -5526,7 +5207,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self._clear_item_cache() return result - @final def _consolidate_inplace(self) -> None: """Consolidate data in place and return None""" @@ -5535,34 +5215,33 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self._protect_consolidate(f) - @final - def _consolidate(self): + def _consolidate(self, inplace: bool_t = False): """ Compute NDFrame with "consolidated" internals (data of each dtype grouped together in a single ndarray). + Parameters + ---------- + inplace : bool, default False + If False return new object, otherwise modify existing object. + Returns ------- consolidated : same type as caller """ - f = lambda: self._mgr.consolidate() - cons_data = self._protect_consolidate(f) - return self._constructor(cons_data).__finalize__(self) + inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + self._consolidate_inplace() + else: + f = lambda: self._mgr.consolidate() + cons_data = self._protect_consolidate(f) + return self._constructor(cons_data).__finalize__(self) - @final @property def _is_mixed_type(self) -> bool_t: - if self._mgr.is_single_block: - return False + f = lambda: self._mgr.is_mixed_type + return self._protect_consolidate(f) - if self._mgr.any_extension_types: - # Even if they have the same dtype, we cant consolidate them, - # so we pretend this is "mixed'" - return True - - return self.dtypes.nunique() > 1 - - @final def _check_inplace_setting(self, value) -> bool_t: """ check whether we allow in-place setting with this type of value """ if self._is_mixed_type: @@ -5579,11 +5258,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return True - @final def _get_numeric_data(self): return self._constructor(self._mgr.get_numeric_data()).__finalize__(self) - @final def _get_bool_data(self): return self._constructor(self._mgr.get_bool_data()).__finalize__(self) @@ -5703,7 +5380,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): data = self._mgr.get_dtypes() return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) - @final def _to_dict_of_blocks(self, copy: bool_t = True): """ Return a dict of dtype -> Constructor Types that @@ -5869,7 +5545,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) return self._constructor(new_data).__finalize__(self, method="astype") # GH 33113: handle empty frame or series @@ -5881,7 +5557,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result.columns = self.columns return result - @final def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: """ Make a copy of this object's indices and data. 
@@ -5991,11 +5666,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self._clear_item_cache() return self._constructor(data).__finalize__(self, method="copy") - @final def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: return self.copy(deep=deep) - @final def __deepcopy__(self: FrameOrSeries, memo=None) -> FrameOrSeries: """ Parameters @@ -6005,12 +5678,12 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ return self.copy(deep=True) - @final def _convert( self: FrameOrSeries, datetime: bool_t = False, numeric: bool_t = False, timedelta: bool_t = False, + coerce: bool_t = False, ) -> FrameOrSeries: """ Attempt to infer better dtype for object columns @@ -6024,6 +5697,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): unconvertible values becoming NaN. timedelta : bool, default False If True, convert to timedelta where possible. + coerce : bool, default False + If True, force conversion with unconvertible values converted to + nulls (NaN or NaT). Returns ------- @@ -6032,16 +5708,17 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): validate_bool_kwarg(datetime, "datetime") validate_bool_kwarg(numeric, "numeric") validate_bool_kwarg(timedelta, "timedelta") + validate_bool_kwarg(coerce, "coerce") return self._constructor( self._mgr.convert( datetime=datetime, numeric=numeric, timedelta=timedelta, + coerce=coerce, copy=True, ) ).__finalize__(self) - @final def infer_objects(self: FrameOrSeries) -> FrameOrSeries: """ Attempt to infer better dtypes for object columns. @@ -6084,17 +5761,17 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # python objects will still be converted to # native numpy numeric types return self._constructor( - self._mgr.convert(datetime=True, numeric=False, timedelta=True, copy=True) + self._mgr.convert( + datetime=True, numeric=False, timedelta=True, coerce=False, copy=True + ) ).__finalize__(self, method="infer_objects") - @final def convert_dtypes( self: FrameOrSeries, infer_objects: bool_t = True, convert_string: bool_t = True, convert_integer: bool_t = True, convert_boolean: bool_t = True, - convert_floating: bool_t = True, ) -> FrameOrSeries: """ Convert columns to best possible dtypes using dtypes supporting ``pd.NA``. @@ -6111,12 +5788,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. - convert_floating : bool, defaults True - Whether, if possible, conversion can be done to floating extension types. - If `convert_integer` is also True, preference will be give to integer - dtypes if the floats can be faithfully casted to integers. - - .. versionadded:: 1.2.0 Returns ------- @@ -6134,25 +5805,19 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ----- By default, ``convert_dtypes`` will attempt to convert a Series (or each Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options - ``convert_string``, ``convert_integer``, ``convert_boolean`` and - ``convert_boolean``, it is possible to turn off individual conversions - to ``StringDtype``, the integer extension types, ``BooleanDtype`` - or floating extension types, respectively. 
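Hypothetical data illustrating convert_dtypes(); note that whether float columns become Float64 depends on which side of this hunk (with or without convert_floating) is in effect.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3],
                   "b": ["x", "y", None],
                   "c": [True, False, np.nan]})

dfn = df.convert_dtypes()
print(dfn.dtypes)   # e.g. Int64, string, boolean -- nullable extension dtypes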
+ ``convert_string``, ``convert_integer``, and ``convert_boolean``, it is + possible to turn off individual conversions to ``StringDtype``, the integer + extension types or ``BooleanDtype``, respectively. For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference rules as during normal Series/DataFrame construction. Then, if possible, - convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer - or floating extension type, otherwise leave as ``object``. + convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer extension + type, otherwise leave as ``object``. If the dtype is integer, convert to an appropriate integer extension type. If the dtype is numeric, and consists of all integers, convert to an - appropriate integer extension type. Otherwise, convert to an - appropriate floating extension type. - - .. versionchanged:: 1.2 - Starting with pandas 1.2, this method also converts float columns - to the nullable floating extension type. + appropriate integer extension type. In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes. @@ -6192,7 +5857,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> dfn = df.convert_dtypes() >>> dfn a b c d e f - 0 1 x True h 10 + 0 1 x True h 10 NaN 1 2 y False i 100.5 2 3 z 20 200.0 @@ -6202,7 +5867,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): c boolean d string e Int64 - f Float64 + f float64 dtype: object Start with a Series of strings and missing data represented by ``np.nan``. @@ -6224,20 +5889,12 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ if self.ndim == 1: return self._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, + infer_objects, convert_string, convert_integer, convert_boolean ) else: results = [ col._convert_dtypes( - infer_objects, - convert_string, - convert_integer, - convert_boolean, - convert_floating, + infer_objects, convert_string, convert_integer, convert_boolean ) for col_name, col in self.items() ] @@ -6365,7 +6022,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): axis = self._get_axis_number(axis) if value is None: - if not self._mgr.is_single_block and axis == 1: + + if self._is_mixed_type and axis == 1: if inplace: raise NotImplementedError() result = self.T.fillna(method=method, limit=limit).T @@ -6435,7 +6093,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: return result.__finalize__(self, method="fillna") - @final def ffill( self: FrameOrSeries, axis=None, @@ -6457,7 +6114,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): pad = ffill - @final def bfill( self: FrameOrSeries, axis=None, @@ -6484,8 +6140,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self, to_replace=None, value=None, - inplace: bool_t = False, - limit: Optional[int] = None, + inplace=False, + limit=None, regex=False, method="pad", ): @@ -6561,7 +6217,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. - limit : int or None, default None + limit : int, default None Maximum size gap to forward or backward fill. 
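For context (invented values): forward/backward filling with a limit, as referenced by the fillna()/ffill()/bfill() hunks and the `limit` parameter above.

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, np.nan, 5.0])

print(s.ffill(limit=1))   # only the first NaN after a valid value is filled
print(s.bfill())          # NaNs are filled from the next valid value
print(s.fillna(0))        # constant fill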
regex : bool or same types as `to_replace`, default False Whether to interpret `to_replace` and/or `value` as regular @@ -6573,10 +6229,13 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): The method to use when for replacement, when `to_replace` is a scalar, list or tuple and `value` is ``None``. + .. versionchanged:: 0.23.0 + Added to DataFrame. + Returns ------- - {klass} or None - Object after replacement or None if ``inplace=True``. + {klass} + Object after replacement. Raises ------ @@ -6731,6 +6390,20 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): 1 new new 2 bait xyz + Note that when replacing multiple ``bool`` or ``datetime64`` objects, + the data types in the `to_replace` parameter must match the data + type of the value being replaced: + + >>> df = pd.DataFrame({{'A': [True, False, True], + ... 'B': [False, True, False]}}) + >>> df.replace({{'a string': 'new value', True: False}}) # raises + Traceback (most recent call last): + ... + TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' + + This raises a ``TypeError`` because one of the ``dict`` keys is not of + the correct type for replacement. + Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: @@ -6764,7 +6437,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): 3 b 4 b dtype: object - """ + """ if not ( is_scalar(to_replace) or is_re_compilable(to_replace) @@ -6778,7 +6451,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): inplace = validate_bool_kwarg(inplace, "inplace") if not is_bool(regex) and to_replace is not None: - raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") + raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") self._consolidate_inplace() @@ -6790,14 +6463,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if isinstance(to_replace, (tuple, list)): if isinstance(self, ABCDataFrame): - from pandas import Series - return self.apply( - Series._replace_single, - args=(to_replace, method, inplace, limit), + _single_replace, args=(to_replace, method, inplace, limit) ) - self = cast("Series", self) - return self._replace_single(to_replace, method, inplace, limit) + return _single_replace(self, to_replace, method, inplace, limit) if not is_dict_like(to_replace): if not is_dict_like(regex): @@ -6810,10 +6479,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): regex = True items = list(to_replace.items()) - if items: - keys, values = zip(*items) - else: - keys, values = ([], []) + keys, values = zip(*items) if items else ([], []) are_mappings = [is_dict_like(v) for v in values] @@ -6844,14 +6510,12 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # need a non-zero len on all axes if not self.size: - if inplace: - return - return self.copy() + return self if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} # Note: Checking below for `in foo.keys()` instead of - # `in foo` is needed for when we have a Series and not dict + # `in foo`is needed for when we have a Series and not dict mapping = { col: (to_replace[col], value[col]) for col in to_replace.keys() @@ -6874,25 +6538,25 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: raise TypeError("value argument must be scalar, dict, or Series") - elif is_list_like(to_replace): - if not is_list_like(value): - # e.g. 
to_replace = [NA, ''] and value is 0, - # so we replace NA with 0 and then replace '' with 0 - value = [value] * len(to_replace) - - # e.g. we have to_replace = [NA, ''] and value = [0, 'missing'] - if len(to_replace) != len(value): - raise ValueError( - f"Replacement lists must match in length. " - f"Expecting {len(to_replace)} got {len(value)} " + elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if is_list_like(value): + if len(to_replace) != len(value): + raise ValueError( + f"Replacement lists must match in length. " + f"Expecting {len(to_replace)} got {len(value)} " + ) + self._consolidate_inplace() + new_data = self._mgr.replace_list( + src_list=to_replace, + dest_list=value, + inplace=inplace, + regex=regex, ) - new_data = self._mgr.replace_list( - src_list=to_replace, - dest_list=value, - inplace=inplace, - regex=regex, - ) + else: # [NA, ''] -> 0 + new_data = self._mgr.replace( + to_replace=to_replace, value=value, inplace=inplace, regex=regex + ) elif to_replace is None: if not ( is_re_compilable(regex) @@ -6935,7 +6599,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: return result.__finalize__(self, method="replace") - @final def interpolate( self: FrameOrSeries, method: str = "linear", @@ -6948,8 +6611,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): **kwargs, ) -> Optional[FrameOrSeries]: """ - Fill NaN values using an interpolation method. - Please note that only ``method='linear'`` is supported for DataFrame/Series with a MultiIndex. @@ -6977,7 +6638,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): `scipy.interpolate.BPoly.from_derivatives` which replaces 'piecewise_polynomial' interpolation method in scipy 0.18. - axis : {{0 or 'index', 1 or 'columns', None}}, default None Axis to interpolate along. limit : int, optional @@ -7012,16 +6672,18 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): (interpolate). * 'outside': Only fill NaNs outside valid values (extrapolate). + .. versionadded:: 0.23.0 + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. - ``**kwargs`` : optional + **kwargs Keyword arguments to pass on to the interpolating function. Returns ------- - Series or DataFrame or None + Series or DataFrame Returns the same object type as the caller, interpolated at - some or all ``NaN`` values or None if ``inplace=True``. + some or all ``NaN`` values. See Also -------- @@ -7189,7 +6851,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if method == "linear": # prior default index = np.arange(len(obj.index)) - index = Index(index) else: index = obj.index methods = {"index", "values", "nearest", "time"} @@ -7235,7 +6896,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Timeseries methods Methods - @final def asof(self, where, subset=None): """ Return the last row(s) without any NaNs before `where`. 
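Illustrative use of Series.asof (last valid value at or before `where`), matching the docstring that follows; data is made up.

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan, 4.0], index=[10, 20, 30, 40])

print(s.asof(20))        # 2.0
print(s.asof(30))        # 2.0 -- the NaN at 30 is skipped, last valid is at 20
print(s.asof([5, 25]))   # NaN for 5 (before any data), 2.0 for 25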
@@ -7383,13 +7043,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): nulls = self.isna() if is_series else self[subset].isna().any(1) if nulls.all(): if is_series: - self = cast("Series", self) return self._constructor(np.nan, index=where, name=self.name) elif is_list: - self = cast("DataFrame", self) return self._constructor(np.nan, index=where, columns=self.columns) else: - self = cast("DataFrame", self) return self._constructor_sliced( np.nan, index=self.columns, name=where[0] ) @@ -7422,7 +7079,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ------- {klass} Mask of bool values for each element in {klass} that - indicates whether an element is an NA value. + indicates whether an element is not an NA value. See Also -------- @@ -7435,11 +7092,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], - ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... name=['Alfred', 'Batman', ''], - ... toy=[None, 'Batmobile', 'Joker'])) + >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], + ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}}) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7502,11 +7159,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], - ... born=[pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... name=['Alfred', 'Batman', ''], - ... toy=[None, 'Batmobile', 'Joker'])) + >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], + ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), + ... pd.Timestamp('1940-04-25')], + ... 'name': ['Alfred', 'Batman', ''], + ... 'toy': [None, 'Batmobile', 'Joker']}}) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7540,7 +7197,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def notnull(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self, method="notnull") - @final def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): if (lower is not None and np.any(isna(lower))) or ( upper is not None and np.any(isna(upper)) @@ -7566,7 +7222,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: return result - @final def _clip_with_one_bound(self, threshold, method, axis, inplace): if axis is not None: @@ -7587,10 +7242,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if isinstance(self, ABCSeries): threshold = self._constructor(threshold, index=self.index) else: - threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] + threshold = _align_method_FRAME(self, threshold, axis, flex=None)[1] return self.where(subset, threshold, axis=axis, inplace=inplace) - @final def clip( self: FrameOrSeries, lower=None, @@ -7625,9 +7279,9 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Returns ------- - Series or DataFrame or None + Series or DataFrame Same type as calling object with the values outside the - clip boundaries replaced or None if ``inplace=True``. + clip boundaries replaced. 
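Sketch of clip() with scalar and aligned Series bounds, mirroring the docstring edited above; data is made up.

import pandas as pd

df = pd.DataFrame({"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]})

print(df.clip(-4, 6))            # scalar lower/upper bounds

t = pd.Series([2, -4, -1, 6, 3])
print(df.clip(t, t + 4, axis=0))  # per-row bounds aligned on the index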
See Also -------- @@ -7717,7 +7371,77 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return result - @final + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the + object, applying a function, and combining the results. This can be + used to group large amounts of data and compute operations on these + groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If an ndarray is passed, the + values are used as-is determine the groups. A label or list of + labels may be passed to group by the columns in ``self``. Notice + that a tuple is interpreted as a (single) key. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Split along rows (0) or columns (1). + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. + sort : bool, default True + Sort group keys. Get better performance by turning this off. + Note this does not influence the order of observations within each + group. Groupby preserves the order of rows within each group. + group_keys : bool, default True + When calling apply, add group keys to index to identify pieces. + squeeze : bool, default False + Reduce the dimensionality of the return type if possible, + otherwise return a consistent type. + + .. deprecated:: 1.1.0 + + observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionadded:: 0.23.0 + dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. + If False, NA values will also be treated as the key in groups + + .. versionadded:: 1.1.0 + + Returns + ------- + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. + + See Also + -------- + resample : Convenience method for frequency conversion and resampling + of time series. + + Notes + ----- + See the `user guide + `_ for more. 
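Minimal split-apply-combine sketch matching the groupby shared docstring above; the animal data is invented for illustration.

import pandas as pd

df = pd.DataFrame({"Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
                   "Max Speed": [380.0, 370.0, 24.0, 26.0]})

print(df.groupby("Animal").mean())                         # group by a column label
print(df.groupby("Animal", as_index=False).agg({"Max Speed": "max"}))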
+ """ + def asfreq( self: FrameOrSeries, freq, @@ -7829,7 +7553,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): fill_value=fill_value, ) - @final def at_time( self: FrameOrSeries, time, asof: bool_t = False, axis=None ) -> FrameOrSeries: @@ -7888,7 +7611,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): indexer = index.indexer_at_time(time, asof=asof) return self._take_with_is_copy(indexer, axis=axis) - @final def between_time( self: FrameOrSeries, start_time, @@ -7969,11 +7691,10 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): raise TypeError("Index must be DatetimeIndex") indexer = index.indexer_between_time( - start_time, end_time, include_start=include_start, include_end=include_end + start_time, end_time, include_start=include_start, include_end=include_end, ) return self._take_with_is_copy(indexer, axis=axis) - @final def resample( self, rule, @@ -7988,7 +7709,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, - ) -> Resampler: + ) -> "Resampler": """ Resample time-series data. @@ -8224,8 +7945,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): For DataFrame objects, the keyword `on` can be used to specify the column instead of the index for resampling. - >>> d = {'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]} + >>> d = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) >>> df = pd.DataFrame(d) >>> df['week_starting'] = pd.date_range('01/01/2018', ... periods=8, @@ -8250,8 +7971,8 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): specify on which level the resampling needs to take place. >>> days = pd.date_range('1/1/2000', periods=4, freq='D') - >>> d2 = {'price': [10, 11, 9, 13, 14, 18, 17, 19], - ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]} + >>> d2 = dict({'price': [10, 11, 9, 13, 14, 18, 17, 19], + ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}) >>> df2 = pd.DataFrame(d2, ... index=pd.MultiIndex.from_product([days, ... ['morning', @@ -8375,7 +8096,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): offset=offset, ) - @final def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ Select initial periods of time series data based on a date offset. @@ -8444,7 +8164,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return self.loc[:end] - @final def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Select final periods of time series data based on a date offset. @@ -8508,7 +8227,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): start = self.index.searchsorted(start_date, side="right") return self.iloc[start:] - @final def rank( self: FrameOrSeries, axis=0, @@ -8632,7 +8350,36 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return ranker(data) - @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) + _shared_docs[ + "compare" + ] = """ + Compare to another %(klass)s and show the differences. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + other : %(klass)s + Object to compare with. + + align_axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine which axis to align the comparison on. 
+ + * 0, or 'index' : Resulting differences are stacked vertically + with rows drawn alternately from self and other. + * 1, or 'columns' : Resulting differences are aligned horizontally + with columns drawn alternately from self and other. + + keep_shape : bool, default False + If true, all rows and columns are kept. + Otherwise, only the ones with different values are kept. + + keep_equal : bool, default False + If true, the result keeps values that are equal. + Otherwise, equal values are shown as NaNs. + """ + + @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, other, @@ -8829,7 +8576,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): else: # pragma: no cover raise TypeError(f"unsupported type: {type(other)}") - @final def _align_frame( self, other, @@ -8887,10 +8633,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if is_datetime64tz_dtype(left.index.dtype): if left.index.tz != right.index.tz: if join_index is not None: - # GH#33671 ensure we don't change the index on - # our original Series (NB: by default deep=False) - left = left.copy() - right = right.copy() left.index = join_index right.index = join_index @@ -8899,7 +8641,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): right.__finalize__(other), ) - @final def _align_series( self, other, @@ -8979,10 +8720,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): if is_datetime64tz_dtype(left.index.dtype): if left.index.tz != right.index.tz: if join_index is not None: - # GH#33671 ensure we don't change the index on - # our original Series (NB: by default deep=False) - left = left.copy() - right = right.copy() left.index = join_index right.index = join_index @@ -8991,7 +8728,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): right.__finalize__(other), ) - @final def _where( self, cond, @@ -9041,6 +8777,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): cond = -cond if inplace else cond # try to align with other + try_quick = True if isinstance(other, NDFrame): # align with me @@ -9079,11 +8816,12 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # match True cond to other elif len(cond[icond]) == len(other): - # try to not change dtype at first - new_other = np.asarray(self) - new_other = new_other.copy() - new_other[icond] = other - other = new_other + # try to not change dtype at first (if try_quick) + if try_quick: + new_other = np.asarray(self) + new_other = new_other.copy() + new_other[icond] = other + other = new_other else: raise ValueError( @@ -9120,7 +8858,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): self._check_inplace_setting(other) new_data = self._mgr.putmask( - mask=cond, new=other, align=align, axis=block_axis + mask=cond, new=other, align=align, axis=block_axis, ) result = self._constructor(new_data) return self._update_inplace(result) @@ -9137,7 +8875,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result = self._constructor(new_data) return result.__finalize__(self) - @final @doc( klass=_shared_doc_kwargs["klass"], cond="True", @@ -9190,7 +8927,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): Returns ------- - Same type as caller or None if ``inplace=True``. 
+ Same type as caller See Also -------- @@ -9221,6 +8958,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): 3 3.0 4 4.0 dtype: float64 + >>> s.mask(s > 0) 0 0.0 1 NaN @@ -9236,13 +8974,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): 3 3 4 4 dtype: int64 - >>> s.mask(s > 1, 10) - 0 0 - 1 1 - 2 10 - 3 10 - 4 10 - dtype: int64 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) >>> df @@ -9280,7 +9011,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): cond, other, inplace, axis, level, errors=errors, try_cast=try_cast ) - @final @doc( where, klass=_shared_doc_kwargs["klass"], @@ -9342,7 +9072,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): extend the index when shifting and preserve the original data. If `freq` is specified as "infer" then it will be inferred from the freq or inferred_freq attributes of the index. If neither of - those attributes exist, a ValueError is thrown. + those attributes exist, a ValueError is thrown axis : {{0 or 'index', 1 or 'columns', None}}, default None Shift direction. fill_value : object, optional @@ -9391,11 +9121,11 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): >>> df.shift(periods=1, axis="columns") Col1 Col2 Col3 - 2020-01-01 NaN 10 13 - 2020-01-02 NaN 20 23 - 2020-01-03 NaN 15 18 - 2020-01-04 NaN 30 33 - 2020-01-05 NaN 45 48 + 2020-01-01 NaN 10.0 13.0 + 2020-01-02 NaN 20.0 23.0 + 2020-01-03 NaN 15.0 18.0 + 2020-01-04 NaN 30.0 33.0 + 2020-01-05 NaN 45.0 48.0 >>> df.shift(periods=3, fill_value=0) Col1 Col2 Col3 @@ -9463,17 +9193,13 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result = self.set_axis(new_ax, axis) return result.__finalize__(self, method="shift") - @final def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ Equivalent to `shift` without copying data. + The shifted data will not include the dropped periods and the shifted axis will be smaller than the original. - .. deprecated:: 1.2.0 - slice_shift is deprecated, - use DataFrame/Series.shift instead. - Parameters ---------- periods : int @@ -9488,14 +9214,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): While the `slice_shift` is faster than `shift`, you may pay for it later during alignment. """ - - msg = ( - "The 'slice_shift' method is deprecated " - "and will be removed in a future version. 
" - "You can use DataFrame/Series.shift instead" - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - if periods == 0: return self @@ -9512,7 +9230,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return new_obj.__finalize__(self, method="slice_shift") - @final def tshift( self: FrameOrSeries, periods: int = 1, freq=None, axis: Axis = 0 ) -> FrameOrSeries: @@ -9688,7 +9405,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # if we have a date index, convert to dates, otherwise # treat like a slice - if ax._is_all_dates: + if ax.is_all_dates: from pandas.core.tools.datetimes import to_datetime before = to_datetime(before) @@ -9713,7 +9430,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return result - @final def tz_convert( self: FrameOrSeries, tz, axis=0, level=None, copy: bool_t = True ) -> FrameOrSeries: @@ -9771,7 +9487,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): result = result.set_axis(ax, axis=axis, inplace=False) return result.__finalize__(self, method="tz_convert") - @final def tz_localize( self: FrameOrSeries, tz, @@ -9944,8 +9659,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): # ---------------------------------------------------------------------- # Numeric Methods - - @final def abs(self: FrameOrSeries) -> FrameOrSeries: """ Return a Series/DataFrame with absolute numeric value of each element. @@ -10015,7 +9728,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ return np.abs(self) - @final def describe( self: FrameOrSeries, percentiles=None, @@ -10236,7 +9948,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): categorical count 3 unique 3 - top d + top f freq 1 Excluding numeric columns from a ``DataFrame`` description. 
@@ -10289,7 +10001,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): formatted_percentiles = format_percentiles(percentiles) - def describe_numeric_1d(series) -> "Series": + def describe_numeric_1d(series): stat_index = ( ["count", "mean", "std", "min"] + formatted_percentiles + ["max"] ) @@ -10300,7 +10012,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) return pd.Series(d, index=stat_index, name=series.name) - def describe_categorical_1d(data) -> "Series": + def describe_categorical_1d(data): names = ["count", "unique"] objcounts = data.value_counts() count_unique = len(objcounts[objcounts != 0]) @@ -10349,7 +10061,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return pd.Series(result, index=names, name=data.name, dtype=dtype) - def describe_timestamp_1d(data) -> "Series": + def describe_timestamp_1d(data): # GH-30164 stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] d = ( @@ -10359,7 +10071,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) return pd.Series(d, index=stat_index, name=data.name) - def describe_1d(data) -> "Series": + def describe_1d(data): if is_bool_dtype(data.dtype): return describe_categorical_1d(data) elif is_numeric_dtype(data): @@ -10372,9 +10084,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return describe_categorical_1d(data) if self.ndim == 1: - # Incompatible return value type - # (got "Series", expected "FrameOrSeries") [return-value] - return describe_1d(self) # type:ignore[return-value] + return describe_1d(self) elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics default_include = [np.number] @@ -10404,7 +10114,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): d.columns = data.columns.copy() return d - @final def pct_change( self: FrameOrSeries, periods=1, @@ -10543,7 +10252,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): rs = rs.reindex_like(data) return rs - @final def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): if axis is None: raise ValueError("Must specify 'axis' when aggregating by level.") @@ -10555,292 +10263,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) return grouped.aggregate(applyf) - @final - def _logical_func( - self, name: str, func, axis=0, bool_only=None, skipna=True, level=None, **kwargs - ): - nv.validate_logical_func((), kwargs, fname=name) - if level is not None: - if bool_only is not None: - raise NotImplementedError( - "Option bool_only is not implemented with option level." 
- ) - return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - - if self.ndim > 1 and axis is None: - # Reduce along one dimension then the other, to simplify DataFrame._reduce - res = self._logical_func( - name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs - ) - return res._logical_func(name, func, skipna=skipna, **kwargs) - - return self._reduce( - func, - name=name, - axis=axis, - skipna=skipna, - numeric_only=bool_only, - filter_type="bool", - ) - - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - return self._logical_func( - "any", nanops.nanany, axis, bool_only, skipna, level, **kwargs - ) - - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - return self._logical_func( - "all", nanops.nanall, axis, bool_only, skipna, level, **kwargs - ) - - @final - def _accum_func(self, name: str, func, axis=None, skipna=True, *args, **kwargs): - skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) - if axis is None: - axis = self._stat_axis_number - else: - axis = self._get_axis_number(axis) - - if axis == 1: - return self.T._accum_func( - name, func, axis=0, skipna=skipna, *args, **kwargs - ).T - - def block_accum_func(blk_values): - values = blk_values.T if hasattr(blk_values, "T") else blk_values - - result = nanops.na_accum_func(values, func, skipna=skipna) - - result = result.T if hasattr(result, "T") else result - return result - - result = self._mgr.apply(block_accum_func) - - return self._constructor(result).__finalize__(self, method=name) - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - return self._accum_func( - "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs - ) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - return self._accum_func( - "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs - ) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs) - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs) - - @final - def _stat_function_ddof( - self, - name: str, - func, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - nv.validate_stat_ddof_func((), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level( - name, axis=axis, level=level, skipna=skipna, ddof=ddof - ) - return self._reduce( - func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof - ) - - def sem( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - return self._stat_function_ddof( - "sem", nanops.nansem, axis, skipna, level, ddof, numeric_only, **kwargs - ) - - def var( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - return self._stat_function_ddof( - "var", nanops.nanvar, axis, skipna, level, ddof, numeric_only, **kwargs - ) - - def std( - self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs - ): - return self._stat_function_ddof( - "std", nanops.nanstd, axis, skipna, level, ddof, numeric_only, **kwargs - ) - - @final - def _stat_function( - self, - name: str, - func, - axis=None, - skipna=None, - level=None, - numeric_only=None, - **kwargs, - ): - if name == "median": - nv.validate_median((), kwargs) - else: - nv.validate_stat_func((), 
kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce( - func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only - ) - - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._stat_function( - "min", nanops.nanmin, axis, skipna, level, numeric_only, **kwargs - ) - - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._stat_function( - "max", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs - ) - - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._stat_function( - "mean", nanops.nanmean, axis, skipna, level, numeric_only, **kwargs - ) - - def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._stat_function( - "median", nanops.nanmedian, axis, skipna, level, numeric_only, **kwargs - ) - - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._stat_function( - "skew", nanops.nanskew, axis, skipna, level, numeric_only, **kwargs - ) - - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return self._stat_function( - "kurt", nanops.nankurt, axis, skipna, level, numeric_only, **kwargs - ) - - kurtosis = kurt - - @final - def _min_count_stat_function( - self, - name: str, - func, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - if name == "sum": - nv.validate_sum((), kwargs) - elif name == "prod": - nv.validate_prod((), kwargs) - else: - nv.validate_stat_func((), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level( - name, axis=axis, level=level, skipna=skipna, min_count=min_count - ) - return self._reduce( - func, - name=name, - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - min_count=min_count, - ) - - def sum( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - return self._min_count_stat_function( - "sum", nanops.nansum, axis, skipna, level, numeric_only, min_count, **kwargs - ) - - def prod( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - return self._min_count_stat_function( - "prod", - nanops.nanprod, - axis, - skipna, - level, - numeric_only, - min_count, - **kwargs, - ) - - product = prod - - def mad(self, axis=None, skipna=None, level=None): - """ - {desc} - - Parameters - ---------- - axis : {axis_descr} - Axis for the function to be applied on. - skipna : bool, default None - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. 
- - Returns - ------- - {name1} or {name2} (if level specified)\ - {see_also}\ - {examples} - """ - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number - if level is not None: - return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) - - data = self._get_numeric_data() - if axis == 0: - demeaned = data - data.mean(axis=0) - else: - demeaned = data.sub(data.mean(axis=1), axis=0) - return np.abs(demeaned).mean(axis=axis, skipna=skipna) - @classmethod def _add_numeric_operations(cls): """ @@ -10848,49 +10270,34 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ axis_descr, name1, name2 = _doc_parms(cls) - @doc( - _bool_doc, - desc=_any_desc, + cls.any = _make_logical_function( + cls, + "any", name1=name1, name2=name2, axis_descr=axis_descr, + desc=_any_desc, + func=nanops.nanany, see_also=_any_see_also, examples=_any_examples, empty_value=False, ) - def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - return NDFrame.any(self, axis, bool_only, skipna, level, **kwargs) - - # pandas\core\generic.py:10725: error: Cannot assign to a method - # [assignment] - cls.any = any # type: ignore[assignment] - - @doc( - _bool_doc, - desc=_all_desc, + cls.all = _make_logical_function( + cls, + "all", name1=name1, name2=name2, axis_descr=axis_descr, + desc=_all_desc, + func=nanops.nanall, see_also=_all_see_also, examples=_all_examples, empty_value=True, ) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - return NDFrame.all(self, axis, bool_only, skipna, level, **kwargs) - - # pandas\core\generic.py:10719: error: Cannot assign to a method - # [assignment] - - # pandas\core\generic.py:10719: error: Incompatible types in assignment - # (expression has type "Callable[[Iterable[object]], bool]", variable - # has type "Callable[[NDFrame, Any, Any, Any, Any, KwArg(Any)], Any]") - # [assignment] - cls.all = all # type: ignore[assignment] @doc( - NDFrame.mad, desc="Return the mean absolute deviation of the values " - "over the requested axis.", + "for the requested axis.", name1=name1, name2=name2, axis_descr=axis_descr, @@ -10898,331 +10305,248 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): examples="", ) def mad(self, axis=None, skipna=None, level=None): - return NDFrame.mad(self, axis, skipna, level) + """ + {desc} - # pandas\core\generic.py:10736: error: Cannot assign to a method - # [assignment] - cls.mad = mad # type: ignore[assignment] + Parameters + ---------- + axis : {axis_descr} + Axis for the function to be applied on. + skipna : bool, default None + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a {name1}. 
- @doc( - _num_ddof_doc, + Returns + ------- + {name1} or {name2} (if level specified)\ + {see_also}\ + {examples} + """ + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level("mad", axis=axis, level=level, skipna=skipna) + + data = self._get_numeric_data() + if axis == 0: + demeaned = data - data.mean(axis=0) + else: + demeaned = data.sub(data.mean(axis=1), axis=0) + return np.abs(demeaned).mean(axis=axis, skipna=skipna) + + cls.mad = mad + + cls.sem = _make_stat_function_ddof( + cls, + "sem", + name1=name1, + name2=name2, + axis_descr=axis_descr, desc="Return unbiased standard error of the mean over requested " "axis.\n\nNormalized by N-1 by default. This can be changed " "using the ddof argument", + func=nanops.nansem, + ) + cls.var = _make_stat_function_ddof( + cls, + "var", name1=name1, name2=name2, axis_descr=axis_descr, - ) - def sem( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - return NDFrame.sem(self, axis, skipna, level, ddof, numeric_only, **kwargs) - - # pandas\core\generic.py:10758: error: Cannot assign to a method - # [assignment] - cls.sem = sem # type: ignore[assignment] - - @doc( - _num_ddof_doc, desc="Return unbiased variance over requested axis.\n\nNormalized by " "N-1 by default. This can be changed using the ddof argument", + func=nanops.nanvar, + ) + cls.std = _make_stat_function_ddof( + cls, + "std", name1=name1, name2=name2, axis_descr=axis_descr, - ) - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - return NDFrame.var(self, axis, skipna, level, ddof, numeric_only, **kwargs) - - # pandas\core\generic.py:10779: error: Cannot assign to a method - # [assignment] - cls.var = var # type: ignore[assignment] - - @doc( - _num_ddof_doc, desc="Return sample standard deviation over requested axis." "\n\nNormalized by N-1 by default. 
This can be changed using the " "ddof argument", - name1=name1, - name2=name2, - axis_descr=axis_descr, + func=nanops.nanstd, ) - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - return NDFrame.std(self, axis, skipna, level, ddof, numeric_only, **kwargs) - # pandas\core\generic.py:10801: error: Cannot assign to a method - # [assignment] - cls.std = std # type: ignore[assignment] - - @doc( - _cnum_doc, - desc="minimum", + cls.cummin = _make_cum_function( + cls, + "cummin", name1=name1, name2=name2, axis_descr=axis_descr, + desc="minimum", + accum_func=np.minimum.accumulate, accum_func_name="min", examples=_cummin_examples, ) - def cummin(self, axis=None, skipna=True, *args, **kwargs): - return NDFrame.cummin(self, axis, skipna, *args, **kwargs) - - # pandas\core\generic.py:10815: error: Cannot assign to a method - # [assignment] - cls.cummin = cummin # type: ignore[assignment] - - @doc( - _cnum_doc, - desc="maximum", + cls.cumsum = _make_cum_function( + cls, + "cumsum", name1=name1, name2=name2, axis_descr=axis_descr, - accum_func_name="max", - examples=_cummax_examples, - ) - def cummax(self, axis=None, skipna=True, *args, **kwargs): - return NDFrame.cummax(self, axis, skipna, *args, **kwargs) - - # pandas\core\generic.py:10829: error: Cannot assign to a method - # [assignment] - cls.cummax = cummax # type: ignore[assignment] - - @doc( - _cnum_doc, desc="sum", - name1=name1, - name2=name2, - axis_descr=axis_descr, + accum_func=np.cumsum, accum_func_name="sum", examples=_cumsum_examples, ) - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) - - # pandas\core\generic.py:10843: error: Cannot assign to a method - # [assignment] - cls.cumsum = cumsum # type: ignore[assignment] - - @doc( - _cnum_doc, - desc="product", + cls.cumprod = _make_cum_function( + cls, + "cumprod", name1=name1, name2=name2, axis_descr=axis_descr, + desc="product", + accum_func=np.cumprod, accum_func_name="prod", examples=_cumprod_examples, ) - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) - - # pandas\core\generic.py:10857: error: Cannot assign to a method - # [assignment] - cls.cumprod = cumprod # type: ignore[assignment] - - @doc( - _num_doc, - desc="Return the sum of the values over the requested axis.\n\n" - "This is equivalent to the method ``numpy.sum``.", + cls.cummax = _make_cum_function( + cls, + "cummax", name1=name1, name2=name2, axis_descr=axis_descr, - min_count=_min_count_stub, + desc="maximum", + accum_func=np.maximum.accumulate, + accum_func_name="max", + examples=_cummax_examples, + ) + + cls.sum = _make_min_count_stat_function( + cls, + "sum", + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the sum of the values for the requested axis.\n\n" + "This is equivalent to the method ``numpy.sum``.", + func=nanops.nansum, see_also=_stat_func_see_also, examples=_sum_examples, ) - def sum( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - return NDFrame.sum( - self, axis, skipna, level, numeric_only, min_count, **kwargs - ) - - # pandas\core\generic.py:10883: error: Cannot assign to a method - # [assignment] - cls.sum = sum # type: ignore[assignment] - - @doc( - _num_doc, - desc="Return the product of the values over the requested axis.", + cls.mean = _make_stat_function( + cls, + "mean", name1=name1, name2=name2, 
axis_descr=axis_descr, - min_count=_min_count_stub, - see_also=_stat_func_see_also, - examples=_prod_examples, + desc="Return the mean of the values for the requested axis.", + func=nanops.nanmean, ) - def prod( - self, - axis=None, - skipna=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - return NDFrame.prod( - self, axis, skipna, level, numeric_only, min_count, **kwargs - ) - - # pandas\core\generic.py:10908: error: Cannot assign to a method - # [assignment] - cls.prod = prod # type: ignore[assignment] - cls.product = prod - - @doc( - _num_doc, - desc="Return the mean of the values over the requested axis.", + cls.skew = _make_stat_function( + cls, + "skew", name1=name1, name2=name2, axis_descr=axis_descr, - min_count="", - see_also="", - examples="", - ) - def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return NDFrame.mean(self, axis, skipna, level, numeric_only, **kwargs) - - # pandas\core\generic.py:10924: error: Cannot assign to a method - # [assignment] - cls.mean = mean # type: ignore[assignment] - - @doc( - _num_doc, desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.", + func=nanops.nanskew, + ) + cls.kurt = _make_stat_function( + cls, + "kurt", name1=name1, name2=name2, axis_descr=axis_descr, - min_count="", - see_also="", - examples="", - ) - def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return NDFrame.skew(self, axis, skipna, level, numeric_only, **kwargs) - - # pandas\core\generic.py:10939: error: Cannot assign to a method - # [assignment] - cls.skew = skew # type: ignore[assignment] - - @doc( - _num_doc, desc="Return unbiased kurtosis over requested axis.\n\n" "Kurtosis obtained using Fisher's definition of\n" "kurtosis (kurtosis of normal == 0.0). Normalized " "by N-1.", + func=nanops.nankurt, + ) + cls.kurtosis = cls.kurt + cls.prod = _make_min_count_stat_function( + cls, + "prod", name1=name1, name2=name2, axis_descr=axis_descr, - min_count="", - see_also="", - examples="", + desc="Return the product of the values for the requested axis.", + func=nanops.nanprod, + examples=_prod_examples, ) - def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return NDFrame.kurt(self, axis, skipna, level, numeric_only, **kwargs) - - # pandas\core\generic.py:10957: error: Cannot assign to a method - # [assignment] - cls.kurt = kurt # type: ignore[assignment] - cls.kurtosis = kurt - - @doc( - _num_doc, - desc="Return the median of the values over the requested axis.", + cls.product = cls.prod + cls.median = _make_stat_function( + cls, + "median", name1=name1, name2=name2, axis_descr=axis_descr, - min_count="", - see_also="", - examples="", + desc="Return the median of the values for the requested axis.", + func=nanops.nanmedian, ) - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - return NDFrame.median(self, axis, skipna, level, numeric_only, **kwargs) - - # pandas\core\generic.py:10975: error: Cannot assign to a method - # [assignment] - cls.median = median # type: ignore[assignment] - - @doc( - _num_doc, - desc="Return the maximum of the values over the requested axis.\n\n" + cls.max = _make_stat_function( + cls, + "max", + name1=name1, + name2=name2, + axis_descr=axis_descr, + desc="Return the maximum of the values for the requested axis.\n\n" "If you want the *index* of the maximum, use ``idxmax``. 
This is" "the equivalent of the ``numpy.ndarray`` method ``argmax``.", - name1=name1, - name2=name2, - axis_descr=axis_descr, - min_count="", + func=nanops.nanmax, see_also=_stat_func_see_also, examples=_max_examples, ) - def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs) - - # pandas\core\generic.py:10992: error: Cannot assign to a method - # [assignment] - cls.max = max # type: ignore[assignment] - - @doc( - _num_doc, - desc="Return the minimum of the values over the requested axis.\n\n" - "If you want the *index* of the minimum, use ``idxmin``. This is" - "the equivalent of the ``numpy.ndarray`` method ``argmin``.", + cls.min = _make_stat_function( + cls, + "min", name1=name1, name2=name2, axis_descr=axis_descr, - min_count="", + desc="Return the minimum of the values for the requested axis.\n\n" + "If you want the *index* of the minimum, use ``idxmin``. This is" + "the equivalent of the ``numpy.ndarray`` method ``argmin``.", + func=nanops.nanmin, see_also=_stat_func_see_also, examples=_min_examples, ) - def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - return NDFrame.min(self, axis, skipna, level, numeric_only, **kwargs) - # pandas\core\generic.py:11009: error: Cannot assign to a method - # [assignment] - cls.min = min # type: ignore[assignment] + @classmethod + def _add_series_or_dataframe_operations(cls): + """ + Add the series or dataframe only operations to the cls; evaluate + the doc strings again. + """ + from pandas.core.window import ( + Expanding, + ExponentialMovingWindow, + Rolling, + Window, + ) - @final - @doc(Rolling) - def rolling( - self, - window: Union[int, timedelta, BaseOffset, BaseIndexer], - min_periods: Optional[int] = None, - center: bool_t = False, - win_type: Optional[str] = None, - on: Optional[str] = None, - axis: Axis = 0, - closed: Optional[str] = None, - ): - axis = self._get_axis_number(axis) + @doc(Rolling) + def rolling( + self, + window, + min_periods=None, + center=False, + win_type=None, + on=None, + axis=0, + closed=None, + ): + axis = self._get_axis_number(axis) - if win_type is not None: - return Window( + if win_type is not None: + return Window( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + return Rolling( self, window=window, min_periods=min_periods, @@ -11233,129 +10557,131 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): closed=closed, ) - return Rolling( + cls.rolling = rolling + + @doc(Expanding) + def expanding(self, min_periods=1, center=None, axis=0): + axis = self._get_axis_number(axis) + if center is not None: + warnings.warn( + "The `center` argument on `expanding` " + "will be removed in the future", + FutureWarning, + stacklevel=2, + ) + else: + center = False + + return Expanding(self, min_periods=min_periods, center=center, axis=axis) + + cls.expanding = expanding + + @doc(ExponentialMovingWindow) + def ewm( self, - window=window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - ) - - @final - @doc(Expanding) - def expanding( - self, min_periods: int = 1, center: Optional[bool_t] = None, axis: Axis = 0 - ) -> Expanding: - axis = self._get_axis_number(axis) - if center is not None: - warnings.warn( - "The `center` argument on `expanding` will be removed in the future", - FutureWarning, - stacklevel=2, - ) - else: - center = False - 
- return Expanding(self, min_periods=min_periods, center=center, axis=axis) - - @final - @doc(ExponentialMovingWindow) - def ewm( - self, - com: Optional[float] = None, - span: Optional[float] = None, - halflife: Optional[Union[float, TimedeltaConvertibleTypes]] = None, - alpha: Optional[float] = None, - min_periods: int = 0, - adjust: bool_t = True, - ignore_na: bool_t = False, - axis: Axis = 0, - times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, - ) -> ExponentialMovingWindow: - axis = self._get_axis_number(axis) - return ExponentialMovingWindow( - self, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - ) - - # ---------------------------------------------------------------------- - # Arithmetic Methods - - @final - def _inplace_method(self, other, op): - """ - Wrap arithmetic method to operate inplace. - """ - result = op(self, other) - - if ( - self.ndim == 1 - and result._indexed_same(self) - and is_dtype_equal(result.dtype, self.dtype) + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + times=None, ): - # GH#36498 this inplace op can _actually_ be inplace. - self._values[:] = result._values - return self + axis = self._get_axis_number(axis) + return ExponentialMovingWindow( + self, + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + ) - # Delete cacher - self._reset_cacher() + cls.ewm = ewm - # this makes sure that we are aligned like the input - # we are updating inplace so we want to ignore is_copy - self._update_inplace( - result.reindex_like(self, copy=False), verify_is_copy=False - ) - return self + @doc(klass=_shared_doc_kwargs["klass"], axis="") + def transform(self, func, *args, **kwargs): + """ + Call ``func`` on self producing a {klass} with transformed values. - def __iadd__(self, other): - return self._inplace_method(other, type(self).__add__) # type: ignore[operator] + Produced {klass} will have same axis length as self. - def __isub__(self, other): - return self._inplace_method(other, type(self).__sub__) # type: ignore[operator] + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. - def __imul__(self, other): - return self._inplace_method(other, type(self).__mul__) # type: ignore[operator] + Accepted combinations are: - def __itruediv__(self, other): - return self._inplace_method( - other, type(self).__truediv__ # type: ignore[operator] - ) + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + {axis} + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. - def __ifloordiv__(self, other): - return self._inplace_method( - other, type(self).__floordiv__ # type: ignore[operator] - ) + Returns + ------- + {klass} + A {klass} that must have the same length as self. - def __imod__(self, other): - return self._inplace_method(other, type(self).__mod__) # type: ignore[operator] + Raises + ------ + ValueError : If the returned {klass} has a different length than self. 
- def __ipow__(self, other): - return self._inplace_method(other, type(self).__pow__) # type: ignore[operator] + See Also + -------- + {klass}.agg : Only perform aggregating type operations. + {klass}.apply : Invoke function on a {klass}. - def __iand__(self, other): - return self._inplace_method(other, type(self).__and__) # type: ignore[operator] + Examples + -------- + >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) + >>> df + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1) + A B + 0 1 2 + 1 2 3 + 2 3 4 - def __ior__(self, other): - return self._inplace_method(other, type(self).__or__) # type: ignore[operator] + Even though the resulting {klass} must have the same length as the + input {klass}, it is possible to provide several input functions: - def __ixor__(self, other): - return self._inplace_method(other, type(self).__xor__) # type: ignore[operator] + >>> s = pd.Series(range(3)) + >>> s + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([np.sqrt, np.exp]) + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ + result = self.agg(func, *args, **kwargs) + if is_scalar(result) or len(result) != len(self): + raise ValueError("transforms cannot produce aggregated results") + + return result # ---------------------------------------------------------------------- # Misc methods - @final def _find_valid_index(self, how: str): """ Retrieves the index of the first valid value. @@ -11374,7 +10700,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): return None return self.index[idxpos] - @final @doc(position="first", klass=_shared_doc_kwargs["klass"]) def first_valid_index(self): """ @@ -11391,7 +10716,6 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ return self._find_valid_index("first") - @final @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"]) def last_valid_index(self): return self._find_valid_index("last") @@ -11408,43 +10732,43 @@ def _doc_parms(cls): _num_doc = """ -{desc} +%(desc)s Parameters ---------- -axis : {axis_descr} +axis : %(axis_descr)s Axis for the function to be applied on. skipna : bool, default True Exclude NA/null values when computing the result. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. + particular level, collapsing into a %(name1)s. numeric_only : bool, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. -{min_count}\ +%(min_count)s\ **kwargs Additional keyword arguments to be passed to the function. Returns ------- -{name1} or {name2} (if level specified)\ -{see_also}\ -{examples} +%(name1)s or %(name2)s (if level specified)\ +%(see_also)s\ +%(examples)s """ _num_ddof_doc = """ -{desc} +%(desc)s Parameters ---------- -axis : {axis_descr} +axis : %(axis_descr)s skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. + particular level, collapsing into a %(name1)s. ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. 
@@ -11454,19 +10778,14 @@ numeric_only : bool, default None Returns ------- -{name1} or {name2} (if level specified) - -Notes ------ -To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the -default `ddof=1`)\n""" +%(name1)s or %(name2)s (if level specified)\n""" _bool_doc = """ -{desc} +%(desc)s Parameters ---------- -axis : {{0 or 'index', 1 or 'columns', None}}, default 0 +axis : {0 or 'index', 1 or 'columns', None}, default 0 Indicate which axis or axes should be reduced. * 0 / 'index' : reduce the index, return a Series whose index is the @@ -11480,24 +10799,24 @@ bool_only : bool, default None then use only boolean data. Not implemented for Series. skipna : bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is - True, then the result will be {empty_value}, as for an empty row/column. + True, then the result will be %(empty_value)s, as for an empty row/column. If skipna is False, then NA are treated as True, because these are not equal to zero. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a {name1}. + particular level, collapsing into a %(name1)s. **kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. Returns ------- -{name1} or {name2} - If level is specified, then, {name2} is returned; otherwise, {name1} +%(name1)s or %(name2)s + If level is specified, then, %(name2)s is returned; otherwise, %(name1)s is returned. -{see_also} -{examples}""" +%(see_also)s +%(examples)s""" _all_desc = """\ Return whether all elements are True, potentially over an axis. @@ -11560,14 +10879,14 @@ DataFrame.any : Return True if one (or more) elements are True. """ _cnum_doc = """ -Return cumulative {desc} over a DataFrame or Series axis. +Return cumulative %(desc)s over a DataFrame or Series axis. Returns a DataFrame or Series of the same size containing the cumulative -{desc}. +%(desc)s. Parameters ---------- -axis : {{0 or 'index', 1 or 'columns'}}, default 0 +axis : {0 or 'index', 1 or 'columns'}, default 0 The index or the name of the axis. 0 is equivalent to None or 'index'. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result @@ -11578,21 +10897,21 @@ skipna : bool, default True Returns ------- -{name1} or {name2} - Return cumulative {desc} of {name1} or {name2}. +%(name1)s or %(name2)s + Return cumulative %(desc)s of %(name1)s or %(name2)s. See Also -------- -core.window.Expanding.{accum_func_name} : Similar functionality +core.window.Expanding.%(accum_func_name)s : Similar functionality but ignores ``NaN`` values. -{name2}.{accum_func_name} : Return the {desc} over - {name2} axis. -{name2}.cummax : Return cumulative maximum over {name2} axis. -{name2}.cummin : Return cumulative minimum over {name2} axis. -{name2}.cumsum : Return cumulative sum over {name2} axis. -{name2}.cumprod : Return cumulative product over {name2} axis. +%(name2)s.%(accum_func_name)s : Return the %(desc)s over + %(name2)s axis. +%(name2)s.cummax : Return cumulative maximum over %(name2)s axis. +%(name2)s.cummin : Return cumulative minimum over %(name2)s axis. +%(name2)s.cumsum : Return cumulative sum over %(name2)s axis. +%(name2)s.cumprod : Return cumulative product over %(name2)s axis. -{examples}""" +%(examples)s""" _cummin_examples = """\ Examples @@ -11867,7 +11186,7 @@ DataFrame.all : Return whether all elements are True over requested axis. 
_any_desc = """\ Return whether any element is True, potentially over an axis. -Returns False unless there is at least one element within a series or +Returns False unless there at least one element within a series or along a Dataframe axis that is True or equivalent (e.g. non-zero or non-empty).""" @@ -12053,4 +11372,218 @@ _min_count_stub = """\ min_count : int, default 0 The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + + .. versionadded:: 0.22.0 + + Added with the default being 0. This means the sum of an all-NA + or empty Series is 0, and the product of an all-NA or empty + Series is 1. """ + + +def _make_min_count_stat_function( + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + func: Callable, + see_also: str = "", + examples: str = "", +) -> Callable: + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count=_min_count_stub, + see_also=see_also, + examples=examples, + ) + @Appender(_num_doc) + def stat_func( + self, + axis=None, + skipna=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + if name == "sum": + nv.validate_sum(tuple(), kwargs) + elif name == "prod": + nv.validate_prod(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, min_count=min_count + ) + return self._reduce( + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=numeric_only, + min_count=min_count, + ) + + return set_function_name(stat_func, name, cls) + + +def _make_stat_function( + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + func: Callable, + see_also: str = "", + examples: str = "", +) -> Callable: + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + min_count="", + see_also=see_also, + examples=examples, + ) + @Appender(_num_doc) + def stat_func( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + if name == "median": + nv.validate_median(tuple(), kwargs) + else: + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only + ) + + return set_function_name(stat_func, name, cls) + + +def _make_stat_function_ddof( + cls, name: str, name1: str, name2: str, axis_descr: str, desc: str, func: Callable +) -> Callable: + @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) + @Appender(_num_ddof_doc) + def stat_func( + self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs + ): + nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level( + name, axis=axis, level=level, skipna=skipna, ddof=ddof + ) + return self._reduce( + func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof + ) + + return set_function_name(stat_func, name, cls) + + +def _make_cum_function( + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + accum_func: 
Callable, + accum_func_name: str, + examples: str, +) -> Callable: + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + accum_func_name=accum_func_name, + examples=examples, + ) + @Appender(_cnum_doc) + def cum_func(self, axis=None, skipna=True, *args, **kwargs): + skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) + if axis is None: + axis = self._stat_axis_number + else: + axis = self._get_axis_number(axis) + + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def block_accum_func(blk_values): + values = blk_values.T if hasattr(blk_values, "T") else blk_values + + result = nanops.na_accum_func(values, accum_func, skipna=skipna) + + result = result.T if hasattr(result, "T") else result + return result + + result = self._mgr.apply(block_accum_func) + + return self._constructor(result).__finalize__(self, method=name) + + return set_function_name(cum_func, name, cls) + + +def _make_logical_function( + cls, + name: str, + name1: str, + name2: str, + axis_descr: str, + desc: str, + func: Callable, + see_also: str, + examples: str, + empty_value: bool, +) -> Callable: + @Substitution( + desc=desc, + name1=name1, + name2=name2, + axis_descr=axis_descr, + see_also=see_also, + examples=examples, + empty_value=empty_value, + ) + @Appender(_bool_doc) + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): + nv.validate_logical_func(tuple(), kwargs, fname=name) + if level is not None: + if bool_only is not None: + raise NotImplementedError( + "Option bool_only is not implemented with option level." + ) + return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + return self._reduce( + func, + name=name, + axis=axis, + skipna=skipna, + numeric_only=bool_only, + filter_type="bool", + ) + + return set_function_name(logical_func, name, cls) diff --git a/venv/lib/python3.8/site-packages/pandas/core/groupby/base.py b/venv/lib/python3.8/site-packages/pandas/core/groupby/base.py index 99426c5..e71b2f9 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/groupby/base.py +++ b/venv/lib/python3.8/site-packages/pandas/core/groupby/base.py @@ -4,41 +4,17 @@ hold the allowlist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. """ import collections -from typing import List - -from pandas._typing import final from pandas.core.dtypes.common import is_list_like, is_scalar -from pandas.core.base import PandasObject - OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) -class ShallowMixin(PandasObject): - _attributes: List[str] = [] - - @final - def _shallow_copy(self, obj, **kwargs): - """ - return a new object with the replacement attributes - """ - if isinstance(obj, self._constructor): - obj = obj.obj - for attr in self._attributes: - if attr not in kwargs: - kwargs[attr] = getattr(self, attr) - return self._constructor(obj, **kwargs) - - -class GotItemMixin(PandasObject): +class GroupByMixin: """ Provide the groupby facilities to the mixed object. """ - _attributes: List[str] - - @final def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. 
@@ -46,16 +22,14 @@ class GotItemMixin(PandasObject): Parameters ---------- key : string / list of selections - ndim : {1, 2} + ndim : 1,2 requested ndim of result subset : object, default None subset to act on """ # create a new object to prevent aliasing if subset is None: - # pandas\core\groupby\base.py:52: error: "GotItemMixin" has no - # attribute "obj" [attr-defined] - subset = self.obj # type: ignore[attr-defined] + subset = self.obj # we need to make a shallow copy of ourselves # with the same groupby @@ -63,28 +37,15 @@ class GotItemMixin(PandasObject): # Try to select from a DataFrame, falling back to a Series try: - # pandas\core\groupby\base.py:60: error: "GotItemMixin" has no - # attribute "_groupby" [attr-defined] - groupby = self._groupby[key] # type: ignore[attr-defined] + groupby = self._groupby[key] except IndexError: - # pandas\core\groupby\base.py:62: error: "GotItemMixin" has no - # attribute "_groupby" [attr-defined] - groupby = self._groupby # type: ignore[attr-defined] + groupby = self._groupby - # pandas\core\groupby\base.py:64: error: Too many arguments for - # "GotItemMixin" [call-arg] - - # pandas\core\groupby\base.py:64: error: Unexpected keyword argument - # "groupby" for "GotItemMixin" [call-arg] - - # pandas\core\groupby\base.py:64: error: Unexpected keyword argument - # "parent" for "GotItemMixin" [call-arg] - self = type(self)( - subset, groupby=groupby, parent=self, **kwargs # type: ignore[call-arg] - ) + self = type(self)(subset, groupby=groupby, parent=self, **kwargs) self._reset_cache() - if subset.ndim == 2 and (is_scalar(key) and key in subset or is_list_like(key)): - self._selection = key + if subset.ndim == 2: + if is_scalar(key) and key in subset or is_list_like(key): + self._selection = key return self @@ -112,8 +73,15 @@ common_apply_allowlist = ( ) series_apply_allowlist = ( - common_apply_allowlist - | {"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"} + ( + common_apply_allowlist + | { + "nlargest", + "nsmallest", + "is_monotonic_increasing", + "is_monotonic_decreasing", + } + ) ) | frozenset(["dtype", "unique"]) dataframe_apply_allowlist = common_apply_allowlist | frozenset(["dtypes", "corrwith"]) @@ -196,7 +164,6 @@ groupby_other_methods = frozenset( "describe", "dtypes", "expanding", - "ewm", "filter", "get_group", "groups", diff --git a/venv/lib/python3.8/site-packages/pandas/core/groupby/categorical.py b/venv/lib/python3.8/site-packages/pandas/core/groupby/categorical.py index 64037f5..db734bb 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/groupby/categorical.py +++ b/venv/lib/python3.8/site-packages/pandas/core/groupby/categorical.py @@ -1,5 +1,3 @@ -from typing import Optional, Tuple - import numpy as np from pandas.core.algorithms import unique1d @@ -8,12 +6,9 @@ from pandas.core.arrays.categorical import ( CategoricalDtype, recode_for_categories, ) -from pandas.core.indexes.api import CategoricalIndex -def recode_for_groupby( - c: Categorical, sort: bool, observed: bool -) -> Tuple[Categorical, Optional[Categorical]]: +def recode_for_groupby(c: Categorical, sort: bool, observed: bool): """ Code the categories to ensure we can groupby for categoricals. 
@@ -48,9 +43,6 @@ def recode_for_groupby( """ # we only care about observed values if observed: - # In cases with c.ordered, this is equivalent to - # return c.remove_unused_categories(), c - unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] @@ -81,9 +73,7 @@ def recode_for_groupby( return c.reorder_categories(cat.categories), None -def recode_from_groupby( - c: Categorical, sort: bool, ci: CategoricalIndex -) -> CategoricalIndex: +def recode_from_groupby(c: Categorical, sort: bool, ci): """ Reverse the codes_to_groupby to account for sort / observed. @@ -101,10 +91,7 @@ def recode_from_groupby( """ # we re-order to the original category orderings if sort: - # error: "CategoricalIndex" has no attribute "set_categories" - return ci.set_categories(c.categories) # type: ignore[attr-defined] + return ci.set_categories(c.categories) # we are not sorting, so add unobserved to the end - new_cats = c.categories[~c.categories.isin(ci.categories)] - # error: "CategoricalIndex" has no attribute "add_categories" - return ci.add_categories(new_cats) # type: ignore[attr-defined] + return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) diff --git a/venv/lib/python3.8/site-packages/pandas/core/groupby/generic.py b/venv/lib/python3.8/site-packages/pandas/core/groupby/generic.py index 07ffb88..2e35bb9 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/groupby/generic.py +++ b/venv/lib/python3.8/site-packages/pandas/core/groupby/generic.py @@ -9,6 +9,7 @@ from collections import abc, namedtuple import copy from functools import partial from textwrap import dedent +import typing from typing import ( TYPE_CHECKING, Any, @@ -20,8 +21,8 @@ from typing import ( Mapping, Optional, Sequence, + Tuple, Type, - TypeVar, Union, cast, ) @@ -29,14 +30,16 @@ import warnings import numpy as np -from pandas._libs import lib, reduction as libreduction -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label +from pandas._libs import lib +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( - find_common_type, + maybe_cast_result, maybe_cast_result_dtype, + maybe_convert_objects, maybe_downcast_numeric, + maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -45,25 +48,23 @@ from pandas.core.dtypes.common import ( is_integer_dtype, is_interval_dtype, is_numeric_dtype, + is_object_dtype, is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna, notna -from pandas.core import algorithms, nanops from pandas.core.aggregation import ( - agg_list_like, - aggregate, maybe_mangle_lambdas, reconstruct_func, validate_func_kwargs, ) -from pandas.core.arrays import Categorical, ExtensionArray +import pandas.core.algorithms as algorithms from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame -from pandas.core.generic import NDFrame +from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame from pandas.core.groupby import base from pandas.core.groupby.groupby import ( GroupBy, @@ -71,13 +72,17 @@ from pandas.core.groupby.groupby import ( _apply_docs, _transform_template, get_groupby, - group_selection_context, ) from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase -from pandas.core.internals 
import BlockManager +from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series -from pandas.core.util.numba_ import maybe_use_numba +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + generate_numba_func, + maybe_use_numba, + split_for_numba, +) from pandas.plotting import boxplot_frame_groupby @@ -91,7 +96,7 @@ AggScalar = Union[str, Callable[..., Any]] # TODO: validate types on ScalarResult and move to _typing # Blocked from using by https://github.com/python/mypy/issues/1484 # See note at _mangle_lambda_list -ScalarResult = TypeVar("ScalarResult") +ScalarResult = typing.TypeVar("ScalarResult") def generate_property(name: str, klass: Type[FrameOrSeries]): @@ -220,17 +225,11 @@ class SeriesGroupBy(GroupBy[Series]): def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @doc(_agg_template, examples=_agg_examples_doc, klass="Series") + @doc( + _agg_template, examples=_agg_examples_doc, klass="Series", + ) def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - if maybe_use_numba(engine): - with group_selection_context(self): - data = self._selected_obj - result, index = self._aggregate_with_numba( - data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - return self.obj._constructor(result.ravel(), index=index, name=data.name) - relabeling = func is None columns = None if relabeling: @@ -253,13 +252,18 @@ class SeriesGroupBy(GroupBy[Series]): return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general(func, *args, **kwargs) + return self._python_agg_general( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) try: - return self._python_agg_general(func, *args, **kwargs) + return self._python_agg_general( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) except (ValueError, KeyError): + # Do not catch Numba errors here, we want to raise and not fall back. # TODO: KeyError is raised in _python_agg_general, - # see test_groupby.test_basic + # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) @@ -302,7 +306,7 @@ class SeriesGroupBy(GroupBy[Series]): arg = zip(columns, arg) - results: Dict[base.OutputKey, FrameOrSeriesUnion] = {} + results: Dict[base.OutputKey, Union[Series, DataFrame]] = {} for idx, (name, func) in enumerate(arg): obj = self @@ -326,7 +330,7 @@ class SeriesGroupBy(GroupBy[Series]): self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Optional[Index], - ) -> FrameOrSeriesUnion: + ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -349,7 +353,7 @@ class SeriesGroupBy(GroupBy[Series]): indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result: FrameOrSeriesUnion + result: Union[Series, DataFrame] if len(output) > 1: result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns @@ -367,7 +371,7 @@ class SeriesGroupBy(GroupBy[Series]): self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Optional[Index], - ) -> FrameOrSeriesUnion: + ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. 
@@ -470,19 +474,12 @@ class SeriesGroupBy(GroupBy[Series]): def _aggregate_named(self, func, *args, **kwargs): result = {} - initialized = False for name, group in self: - # Each step of this loop corresponds to - # libreduction._BaseGrouper._apply_to_group - group.name = name # NB: libreduction does not pin name - + group.name = name output = func(group, *args, **kwargs) - output = libreduction.extract_result(output) - if not initialized: - # We only do this validation on the first iteration - libreduction.check_result_array(output, 0) - initialized = True + if isinstance(output, (Series, Index, np.ndarray)): + raise ValueError("Must produce aggregated value") result[name] = output return result @@ -490,28 +487,22 @@ class SeriesGroupBy(GroupBy[Series]): @Substitution(klass="Series") @Appender(_transform_template) def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - - if maybe_use_numba(engine): - with group_selection_context(self): - data = self._selected_obj - result = self._transform_with_numba( - data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - return self.obj._constructor( - result.ravel(), index=data.index, name=data.name - ) - func = self._get_cython_func(func) or func if not isinstance(func, str): - return self._transform_general(func, *args, **kwargs) + return self._transform_general( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) - elif func in base.cythonized_kernels or func in base.transformation_kernels: + elif func in base.cythonized_kernels: # cythonized transform or canned "agg+broadcast" return getattr(self, func)(*args, **kwargs) + elif func in base.transformation_kernels: + return getattr(self, func)(*args, **kwargs) + # If func is a reduction, we need to broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. @@ -520,37 +511,52 @@ class SeriesGroupBy(GroupBy[Series]): result = getattr(self, func)(*args, **kwargs) return self._transform_fast(result) - def _transform_general(self, func, *args, **kwargs): + def _transform_general( + self, func, *args, engine="cython", engine_kwargs=None, **kwargs + ): """ Transform with a non-str `func`. 
""" + + if maybe_use_numba(engine): + numba_func, cache_key = generate_numba_func( + func, engine_kwargs, kwargs, "groupby_transform" + ) + klass = type(self._selected_obj) results = [] for name, group in self: object.__setattr__(group, "name", name) - res = func(group, *args, **kwargs) + if maybe_use_numba(engine): + values, index = split_for_numba(group) + res = numba_func(values, index, *args) + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_func + else: + res = func(group, *args, **kwargs) - if isinstance(res, (DataFrame, Series)): + if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - results.append(klass(res, index=group.index)) + indexer = self._get_index(name) + ser = klass(res, indexer) + results.append(ser) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat - concatenated = concat(results) - result = self._set_result_index_ordered(concatenated) + result = concat(results).sort_index() else: result = self.obj._constructor(dtype=np.float64) + # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - if is_numeric_dtype(result.dtype): - common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) - if common_dtype is result.dtype: - result = maybe_downcast_numeric(result, self._selected_obj.dtype) + dtype = self._selected_obj.dtype + if is_numeric_dtype(dtype): + result = maybe_downcast_to_dtype(result, dtype) result.name = self._selected_obj.name result.index = self._selected_obj.index @@ -601,8 +607,8 @@ class SeriesGroupBy(GroupBy[Series]): wrapper = lambda x: func(x, *args, **kwargs) # Interpret np.nan as False. - def true_and_notna(x) -> bool: - b = wrapper(x) + def true_and_notna(x, *args, **kwargs) -> bool: + b = wrapper(x, *args, **kwargs) return b and notna(b) try: @@ -679,7 +685,7 @@ class SeriesGroupBy(GroupBy[Series]): self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.merge import get_join_indexers + from pandas.core.reshape.merge import _get_join_indexers from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): @@ -708,7 +714,7 @@ class SeriesGroupBy(GroupBy[Series]): # lab is a Categorical with categories an IntervalIndex lab = cut(Series(val), bins, include_lowest=True) lev = lab.cat.categories - lab = lev.take(lab.cat.codes, allow_fill=True, fill_value=lev._na_value) + lab = lev.take(lab.cat.codes) llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] if is_interval_dtype(lab.dtype): @@ -781,7 +787,7 @@ class SeriesGroupBy(GroupBy[Series]): right = [diff.cumsum() - 1, codes[-1]] - _, idx = get_join_indexers(left, right, sort=False, how="left") + _, idx = _get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) if sort: @@ -928,21 +934,21 @@ class DataFrameGroupBy(GroupBy[DataFrame]): See :ref:`groupby.aggregate.named` for more.""" ) - @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") + @doc( + _agg_template, examples=_agg_examples_doc, klass="DataFrame", + ) def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - if maybe_use_numba(engine): - with group_selection_context(self): - data = self._selected_obj - result, index = self._aggregate_with_numba( - data, func, *args, engine_kwargs=engine_kwargs, **kwargs - ) - return self.obj._constructor(result, index=index, 
- assert self._grouper is not None - if self._indexer is not None: - reverse_indexer = self._indexer.argsort() - unsorted_ax = self._grouper.take(reverse_indexer) - ax = unsorted_ax.take(obj.index) - else: - ax = self._grouper.take(obj.index) + if getattr(self.grouper, "name", None) == key and isinstance( + obj, ABCSeries + ): + ax = self._grouper.take(obj.index) else: if key not in obj._info_axis: raise KeyError(f"The grouper name {key} is not found") @@ -391,14 +370,10 @@ class Grouper: self.grouper = ax return self.grouper - @final @property def groups(self): - # pandas\core\groupby\grouper.py:382: error: Item "None" of - # "Optional[Any]" has no attribute "groups" [union-attr] - return self.grouper.groups # type: ignore[union-attr] + return self.grouper.groups - @final def __repr__(self) -> str: attrs_list = ( f"{attr_name}={repr(getattr(self, attr_name))}" @@ -410,7 +385,6 @@ class Grouper: return f"{cls_name}({attrs})" -@final class Grouping: """ Holds the grouping information for a single key @@ -419,7 +393,7 @@ class Grouping: ---------- index : Index grouper : - obj : DataFrame or Series + obj Union[DataFrame, Series]: name : Label level : observed : bool, default False @@ -599,9 +573,7 @@ class Grouping: @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: - group_idx = self.group_index - assert isinstance(group_idx, CategoricalIndex) # set in __init__ - return recode_from_groupby(self.all_grouper, self.sort, group_idx) + return recode_from_groupby(self.all_grouper, self.sort, self.group_index) return self.group_index @property @@ -612,25 +584,23 @@ class Grouping: return self._group_index def _make_codes(self) -> None: - if self._codes is not None and self._group_index is not None: - return - - # we have a list of groupers - if isinstance(self.grouper, ops.BaseGrouper): - codes = self.grouper.codes_info - uniques = self.grouper.result_index - else: - # GH35667, replace dropna=False with na_sentinel=None - if not self.dropna: - na_sentinel = None + if self._codes is None or self._group_index is None: + # we have a list of groupers + if isinstance(self.grouper, ops.BaseGrouper): + codes = self.grouper.codes_info + uniques = self.grouper.result_index else: - na_sentinel = -1 - codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, na_sentinel=na_sentinel - ) - uniques = Index(uniques, name=self.name) - self._codes = codes - self._group_index = uniques + # GH35667, replace dropna=False with na_sentinel=None + if not self.dropna: + na_sentinel = None + else: + na_sentinel = -1 + codes, uniques = algorithms.factorize( + self.grouper, sort=self.sort, na_sentinel=na_sentinel + ) + uniques = Index(uniques, name=self.name) + self._codes = codes + self._group_index = uniques @cache_readonly def groups(self) -> Dict[Hashable, np.ndarray]: @@ -647,7 +617,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> Tuple["ops.BaseGrouper", Set[Label], FrameOrSeries]: +) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. 
@@ -723,13 +693,13 @@ def get_grouper( if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: - return grouper, set(), obj + return grouper, [], obj else: - return grouper, {key.key}, obj + return grouper, [key.key], obj # already have a BaseGrouper, just return it elif isinstance(key, ops.BaseGrouper): - return key, set(), obj + return key, [], obj if not isinstance(key, list): keys = [key] @@ -772,7 +742,7 @@ def get_grouper( levels = [level] * len(keys) groupings: List[Grouping] = [] - exclusions: Set[Label] = set() + exclusions: List[Hashable] = [] # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: @@ -793,30 +763,30 @@ def get_grouper( return False try: return gpr is obj[gpr.name] - except (KeyError, IndexError): - # IndexError reached in e.g. test_skip_group_keys when we pass - # lambda here + except (KeyError, IndexError, ValueError): + # TODO: ValueError: Given date string not likely a datetime. + # should be KeyError? return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name - exclusions.add(name) + exclusions.append(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] - exclusions.add(name) + exclusions.append(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions - exclusions.add(gpr.key) + exclusions.append(gpr.key) in_axis, name = False, None else: in_axis, name = False, None @@ -853,9 +823,7 @@ def get_grouper( groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper( - group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna - ) + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj diff --git a/venv/lib/python3.8/site-packages/pandas/core/groupby/numba_.py b/venv/lib/python3.8/site-packages/pandas/core/groupby/numba_.py deleted file mode 100644 index 76f50f1..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/groupby/numba_.py +++ /dev/null @@ -1,178 +0,0 @@ -"""Common utilities for Numba operations with groupby ops""" -import inspect -from typing import Any, Callable, Dict, Optional, Tuple - -import numpy as np - -from pandas._typing import Scalar -from pandas.compat._optional import import_optional_dependency - -from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, - NumbaUtilError, - get_jit_arguments, - jit_user_function, -) - - -def validate_udf(func: Callable) -> None: - """ - Validate user defined function for ops when using Numba with groupby ops. - - The first signature arguments should include: - - def f(values, index, ...): - ... 
- - Parameters - ---------- - func : function, default False - user defined function - - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - udf_signature = list(inspect.signature(func).parameters.keys()) - expected_args = ["values", "index"] - min_number_args = len(expected_args) - if ( - len(udf_signature) < min_number_args - or udf_signature[:min_number_args] != expected_args - ): - raise NumbaUtilError( - f"The first {min_number_args} arguments to {func.__name__} must be " - f"{expected_args}" - ) - - -def generate_numba_agg_func( - args: Tuple, - kwargs: Dict[str, Any], - func: Callable[..., Scalar], - engine_kwargs: Optional[Dict[str, bool]], -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: - """ - Generate a numba jitted agg function specified by values from engine_kwargs. - - 1. jit the user's function - 2. Return a groupby agg function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the groupby evaluation loop. - - Parameters - ---------- - args : tuple - *args to be passed into the function - kwargs : dict - **kwargs to be passed into the function - func : function - function to be applied to each window and will be JITed - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - - Returns - ------- - Numba function - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs) - - validate_udf(func) - cache_key = (func, "groupby_agg") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] - - numba_func = jit_user_function(func, nopython, nogil, parallel) - numba = import_optional_dependency("numba") - if parallel: - loop_range = numba.prange - else: - loop_range = range - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def group_agg( - values: np.ndarray, - index: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - num_groups: int, - num_columns: int, - ) -> np.ndarray: - result = np.empty((num_groups, num_columns)) - for i in loop_range(num_groups): - group_index = index[begin[i] : end[i]] - for j in loop_range(num_columns): - group = values[begin[i] : end[i], j] - result[i, j] = numba_func(group, group_index, *args) - return result - - return group_agg - - -def generate_numba_transform_func( - args: Tuple, - kwargs: Dict[str, Any], - func: Callable[..., np.ndarray], - engine_kwargs: Optional[Dict[str, bool]], -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: - """ - Generate a numba jitted transform function specified by values from engine_kwargs. - - 1. jit the user's function - 2. Return a groupby transform function with the jitted function inline - - Configurations specified in engine_kwargs apply to both the user's - function _AND_ the groupby evaluation loop. 
- - Parameters - ---------- - args : tuple - *args to be passed into the function - kwargs : dict - **kwargs to be passed into the function - func : function - function to be applied to each window and will be JITed - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - - Returns - ------- - Numba function - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs) - - validate_udf(func) - cache_key = (func, "groupby_transform") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] - - numba_func = jit_user_function(func, nopython, nogil, parallel) - numba = import_optional_dependency("numba") - if parallel: - loop_range = numba.prange - else: - loop_range = range - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def group_transform( - values: np.ndarray, - index: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - num_groups: int, - num_columns: int, - ) -> np.ndarray: - result = np.empty((len(values), num_columns)) - for i in loop_range(num_groups): - group_index = index[begin[i] : end[i]] - for j in loop_range(num_columns): - group = values[begin[i] : end[i], j] - result[begin[i] : end[i], j] = numba_func(group, group_index, *args) - return result - - return group_transform diff --git a/venv/lib/python3.8/site-packages/pandas/core/groupby/ops.py b/venv/lib/python3.8/site-packages/pandas/core/groupby/ops.py index 7724e39..5ea4f0c 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/groupby/ops.py +++ b/venv/lib/python3.8/site-packages/pandas/core/groupby/ops.py @@ -7,34 +7,19 @@ are contained *in* the SeriesGroupBy and DataFrameGroupBy objects. """ import collections -from typing import ( - Dict, - Generic, - Hashable, - Iterator, - List, - Optional, - Sequence, - Tuple, - Type, -) +from typing import List, Optional, Sequence, Tuple, Type import numpy as np from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction -from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape, final +from pandas._typing import F, FrameOrSeries, Label from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.cast import ( - maybe_cast_result, - maybe_cast_result_dtype, - maybe_downcast_to_dtype, -) +from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( - ensure_float, ensure_float64, ensure_int64, ensure_int_or_float, @@ -45,7 +30,6 @@ from pandas.core.dtypes.common import ( is_datetime64_any_dtype, is_datetime64tz_dtype, is_extension_array_dtype, - is_float_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype, @@ -53,7 +37,7 @@ from pandas.core.dtypes.common import ( is_timedelta64_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.missing import isna, maybe_fill +from pandas.core.dtypes.missing import _maybe_fill, isna import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin @@ -66,11 +50,17 @@ from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, - get_flattened_list, + get_flattened_iterator, get_group_index, get_group_index_sorter, get_indexer_dict, ) +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + generate_numba_func, + maybe_use_numba, + split_for_numba, +) class BaseGrouper: @@ -98,12 +88,11 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence["grouper.Grouping"], + groupings: 
"Sequence[grouper.Grouping]", sort: bool = True, group_keys: bool = True, mutated: bool = False, indexer: Optional[np.ndarray] = None, - dropna: bool = True, ): assert isinstance(axis, Index), axis @@ -114,14 +103,13 @@ class BaseGrouper: self.group_keys = group_keys self.mutated = mutated self.indexer = indexer - self.dropna = dropna @property def groupings(self) -> List["grouper.Grouping"]: return self._groupings @property - def shape(self) -> Shape: + def shape(self) -> Tuple[int, ...]: return tuple(ping.ngroups for ping in self.groupings) def __iter__(self): @@ -131,9 +119,7 @@ class BaseGrouper: def nkeys(self) -> int: return len(self.groupings) - def get_iterator( - self, data: FrameOrSeries, axis: int = 0 - ) -> Iterator[Tuple[Label, FrameOrSeries]]: + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -147,14 +133,13 @@ class BaseGrouper: for key, (i, group) in zip(keys, splitter): yield key, group.__finalize__(data, method="groupby") - @final def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": """ Returns ------- Generator yielding subsetted objects - __finalize__ has not been called for the subsetted objects returned. + __finalize__ has not been called for the the subsetted objects returned. """ comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -168,7 +153,6 @@ class BaseGrouper: """ return self.groupings[0].grouper - @final def _get_group_keys(self): if len(self.groupings) == 1: return self.levels[0] @@ -176,9 +160,8 @@ class BaseGrouper: comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) + return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - @final def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) @@ -235,7 +218,7 @@ class BaseGrouper: # group might be modified group_axes = group.axes res = f(group) - if not _is_indexed_like(res, group_axes, axis): + if not _is_indexed_like(res, group_axes): mutated = True result_values.append(res) @@ -244,9 +227,12 @@ class BaseGrouper: @cache_readonly def indices(self): """ dict {group name -> group indices} """ - codes_list = [ping.codes for ping in self.groupings] - keys = [ping.group_index for ping in self.groupings] - return get_indexer_dict(codes_list, keys) + if len(self.groupings) == 1: + return self.groupings[0].indices + else: + codes_list = [ping.codes for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return get_indexer_dict(codes_list, keys) @property def codes(self) -> List[np.ndarray]: @@ -260,7 +246,6 @@ class BaseGrouper: def names(self) -> List[Label]: return [ping.name for ping in self.groupings] - @final def size(self) -> Series: """ Compute group sizes. 
@@ -274,7 +259,7 @@ class BaseGrouper: return Series(out, index=self.result_index, dtype="int64") @cache_readonly - def groups(self) -> Dict[Hashable, np.ndarray]: + def groups(self): """ dict {group name -> group labels} """ if len(self.groupings) == 1: return self.groupings[0].groups @@ -283,7 +268,6 @@ class BaseGrouper: to_groupby = Index(to_groupby) return self.axis.groupby(to_groupby) - @final @cache_readonly def is_monotonic(self) -> bool: # return if my group orderings are monotonic @@ -297,7 +281,6 @@ class BaseGrouper: comp_ids = ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups - @final @cache_readonly def codes_info(self) -> np.ndarray: # return the codes of items in original grouped axis @@ -307,7 +290,6 @@ class BaseGrouper: codes = codes[sorter] return codes - @final def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]: all_codes = self.codes if len(all_codes) > 1: @@ -317,7 +299,6 @@ class BaseGrouper: ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index)) - @final @cache_readonly def ngroups(self) -> int: return len(self.result_index) @@ -335,11 +316,11 @@ class BaseGrouper: codes = self.reconstructed_codes levels = [ping.result_index for ping in self.groupings] - return MultiIndex( + result = MultiIndex( levels=levels, codes=codes, verify_integrity=False, names=self.names ) + return result - @final def get_group_levels(self) -> List[Index]: if not self.compressed and len(self.groupings) == 1: return [self.groupings[0].result_index] @@ -380,7 +361,8 @@ class BaseGrouper: _cython_arity = {"ohlc": 4} # OHLC - @final + _name_functions = {"ohlc": ["open", "high", "low", "close"]} + def _is_builtin_func(self, arg): """ if we define a builtin function for this argument, return it, @@ -388,7 +370,6 @@ class BaseGrouper: """ return SelectionMixin._builtin_table.get(arg, arg) - @final def _get_cython_function( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -425,7 +406,6 @@ class BaseGrouper: return func - @final def _get_cython_func_and_vals( self, kind: str, how: str, values: np.ndarray, is_numeric: bool ): @@ -460,94 +440,17 @@ class BaseGrouper: raise return func, values - @final - def _disallow_invalid_ops(self, values: ArrayLike, how: str): - """ - Check if we can do this operation with our cython functions. - - Raises - ------ - NotImplementedError - This is either not a valid function for this dtype, or - valid but not implemented in cython. - """ - dtype = values.dtype - - if is_categorical_dtype(dtype) or is_sparse(dtype): - # categoricals are only 1d, so we - # are not setup for dim transforming - raise NotImplementedError(f"{dtype} dtype not supported") - elif is_datetime64_any_dtype(dtype): - # we raise NotImplemented if this is an invalid operation - # entirely, e.g. adding datetimes - if how in ["add", "prod", "cumsum", "cumprod"]: - raise NotImplementedError( - f"datetime64 type does not support {how} operations" - ) - elif is_timedelta64_dtype(dtype): - if how in ["prod", "cumprod"]: - raise NotImplementedError( - f"timedelta64 type does not support {how} operations" - ) - - @final - def _ea_wrap_cython_operation( + def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ - If we have an ExtensionArray, unwrap, call _cython_operation, and - re-wrap if appropriate. + Returns the values of a cython operation as a Tuple of [data, names]. 
+ + Names is only useful when dealing with 2D results, like ohlc + (see self._name_functions). """ - # TODO: general case implementation overrideable by EAs. - orig_values = values - - if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): - # All of the functions implemented here are ordinal, so we can - # operate on the tz-naive equivalents - values = values.view("M8[ns]") - res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs - ) - if how in ["rank"]: - # preserve float64 dtype - return res_values - - res_values = res_values.astype("i8", copy=False) - result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype) - return result - - elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): - # IntegerArray or BooleanArray - values = ensure_int_or_float(values) - res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs - ) - dtype = maybe_cast_result_dtype(orig_values.dtype, how) - if is_extension_array_dtype(dtype): - cls = dtype.construct_array_type() - return cls._from_sequence(res_values, dtype=dtype) - return res_values - - elif is_float_dtype(values.dtype): - # FloatingArray - values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan) - res_values = self._cython_operation( - kind, values, how, axis, min_count, **kwargs - ) - result = type(orig_values)._from_sequence(res_values) - return result - - raise NotImplementedError(values.dtype) - - @final - def _cython_operation( - self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs - ) -> np.ndarray: - """ - Returns the values of a cython operation. - """ - orig_values = values assert kind in ["transform", "aggregate"] + orig_values = values if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") @@ -558,12 +461,30 @@ class BaseGrouper: # can we do this operation with our cython functions # if not raise NotImplementedError - self._disallow_invalid_ops(values, how) - if is_extension_array_dtype(values.dtype): - return self._ea_wrap_cython_operation( - kind, values, how, axis, min_count, **kwargs - ) + # we raise NotImplemented if this is an invalid operation + # entirely, e.g. adding datetimes + + # categoricals are only 1d, so we + # are not setup for dim transforming + if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): + raise NotImplementedError(f"{values.dtype} dtype not supported") + elif is_datetime64_any_dtype(values.dtype): + if how in ["add", "prod", "cumsum", "cumprod"]: + raise NotImplementedError( + f"datetime64 type does not support {how} operations" + ) + elif is_timedelta64_dtype(values.dtype): + if how in ["prod", "cumprod"]: + raise NotImplementedError( + f"timedelta64 type does not support {how} operations" + ) + + if is_datetime64tz_dtype(values.dtype): + # Cast to naive; we'll cast back at the end of the function + # TODO: possible need to reshape? + # TODO(EA2D):kludge can be avoided when 2D EA is allowed. 
+ values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) @@ -581,7 +502,7 @@ class BaseGrouper: else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): - values = ensure_float64(ensure_float(values)) + values = ensure_float64(values) else: values = values.astype(object) @@ -616,11 +537,13 @@ class BaseGrouper: codes, _, _ = self.group_info if kind == "aggregate": - result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) + result = _maybe_fill( + np.empty(out_shape, dtype=out_dtype), fill_value=np.nan + ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": - result = maybe_fill( + result = _maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) @@ -642,30 +565,48 @@ class BaseGrouper: if vdim == 1 and arity == 1: result = result[:, 0] + names: Optional[List[str]] = self._name_functions.get(how, None) + if swapped: result = result.swapaxes(0, axis) - if how not in base.cython_cast_blocklist: - # e.g. if we are int64 and need to restore to datetime64/timedelta64 - # "rank" is the only member of cython_cast_blocklist we get here - dtype = maybe_cast_result_dtype(orig_values.dtype, how) - result = maybe_downcast_to_dtype(result, dtype) + if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( + orig_values.dtype + ): + # We need to use the constructors directly for these dtypes + # since numpy won't recognize them + # https://github.com/pandas-dev/pandas/issues/31471 + result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) + elif is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) - return result + if is_extension_array_dtype(orig_values.dtype): + result = maybe_cast_result(result=result, obj=orig_values, how=how) + + return result, names + + def aggregate( + self, values, how: str, axis: int = 0, min_count: int = -1 + ) -> Tuple[np.ndarray, Optional[List[str]]]: + return self._cython_operation( + "aggregate", values, how, axis, min_count=min_count + ) + + def transform(self, values, how: str, axis: int = 0, **kwargs): + return self._cython_operation("transform", values, how, axis, **kwargs) - @final def _aggregate( - self, result, counts, values, comp_ids, agg_func, min_count: int = -1 + self, result, counts, values, comp_ids, agg_func, min_count: int = -1, ): if agg_func is libgroupby.group_nth: # different signature from the others - agg_func(result, counts, values, comp_ids, min_count, rank=1) + # TODO: should we be using min_count instead of hard-coding it? 
+ agg_func(result, counts, values, comp_ids, rank=1, min_count=-1) else: agg_func(result, counts, values, comp_ids, min_count) return result - @final def _transform( self, result, values, comp_ids, transform_func, is_datetimelike: bool, **kwargs ): @@ -675,10 +616,22 @@ class BaseGrouper: return result - def agg_series(self, obj: Series, func: F): + def agg_series( + self, + obj: Series, + func: F, + *args, + engine: str = "cython", + engine_kwargs=None, + **kwargs, + ): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 + if maybe_use_numba(engine): + return self._aggregate_series_pure_python( + obj, func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) @@ -697,14 +650,13 @@ class BaseGrouper: try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "Must produce aggregated value" in str(err): + if "Function does not reduce" in str(err): # raised in libreduction pass else: raise return self._aggregate_series_pure_python(obj, func) - @final def _aggregate_series_fast(self, obj: Series, func: F): # At this point we have already checked that # - obj.index is not a MultiIndex @@ -724,33 +676,53 @@ class BaseGrouper: result, counts = grouper.get_result() return result, counts - @final - def _aggregate_series_pure_python(self, obj: Series, func: F): + def _aggregate_series_pure_python( + self, + obj: Series, + func: F, + *args, + engine: str = "cython", + engine_kwargs=None, + **kwargs, + ): + + if maybe_use_numba(engine): + numba_func, cache_key = generate_numba_func( + func, engine_kwargs, kwargs, "groupby_agg" + ) + group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) - result = np.empty(ngroups, dtype="O") - initialized = False + result = None splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: + if maybe_use_numba(engine): + values, index = split_for_numba(group) + res = numba_func(values, index, *args) + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_func + else: + res = func(group, *args, **kwargs) - # Each step of this loop corresponds to - # libreduction._BaseGrouper._apply_to_group - res = func(group) - res = libreduction.extract_result(res) - - if not initialized: - # We only do this validation on the first iteration - libreduction.check_result_array(res, 0) - initialized = True + if result is None: + if isinstance(res, (Series, Index, np.ndarray)): + if len(res) == 1: + # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) + # FIXME: are we potentially losing important res.index info? + res = res.item() + else: + raise ValueError("Function does not reduce") + result = np.empty(ngroups, dtype="O") counts[label] = group.shape[0] result[label] = res + assert result is not None result = lib.maybe_convert_objects(result, try_float=0) - result = maybe_cast_result(result, obj, numeric_only=True) + # TODO: maybe_cast_to_extension_array? 
return result, counts @@ -910,7 +882,15 @@ class BinGrouper(BaseGrouper): for lvl, name in zip(self.levels, self.names) ] - def agg_series(self, obj: Series, func: F): + def agg_series( + self, + obj: Series, + func: F, + *args, + engine: str = "cython", + engine_kwargs=None, + **kwargs, + ): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result @@ -924,13 +904,13 @@ class BinGrouper(BaseGrouper): return grouper.get_result() -def _is_indexed_like(obj, axes, axis: int) -> bool: +def _is_indexed_like(obj, axes) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.axes[axis].equals(axes[axis]) + return obj.index.equals(axes[0]) elif isinstance(obj, DataFrame): - return obj.axes[axis].equals(axes[axis]) + return obj.index.equals(axes[0]) return False @@ -939,7 +919,7 @@ def _is_indexed_like(obj, axes, axis: int) -> bool: # Splitting / application -class DataSplitter(Generic[FrameOrSeries]): +class DataSplitter: def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) diff --git a/venv/lib/python3.8/site-packages/pandas/core/index.py b/venv/lib/python3.8/site-packages/pandas/core/index.py index 44f434e..a315b96 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/index.py +++ b/venv/lib/python3.8/site-packages/pandas/core/index.py @@ -19,7 +19,7 @@ from pandas.core.indexes.api import ( # noqa:F401 ensure_index_from_sequences, get_objs_combined_axis, ) -from pandas.core.indexes.multi import sparsify_labels # noqa:F401 +from pandas.core.indexes.multi import _sparsify # noqa:F401 # GH#30193 warnings.warn( diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexers.py b/venv/lib/python3.8/site-packages/pandas/core/indexers.py index da4654b..d9aa02d 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexers.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexers.py @@ -79,9 +79,6 @@ def is_scalar_indexer(indexer, ndim: int) -> bool: ------- bool """ - if ndim == 1 and is_integer(indexer): - # GH37748: allow indexer to be an integer for Series - return True if isinstance(indexer, tuple): if len(indexer) == ndim: return all( @@ -108,7 +105,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: return True if arr_value.ndim == 1: if not isinstance(indexer, tuple): - indexer = (indexer,) + indexer = tuple([indexer]) return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) return False @@ -117,7 +114,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: # Indexer Validation -def check_setitem_lengths(indexer, value, values) -> bool: +def check_setitem_lengths(indexer, value, values) -> None: """ Validate that value and indexer are the same length. @@ -136,46 +133,34 @@ def check_setitem_lengths(indexer, value, values) -> bool: Returns ------- - bool - Whether this is an empty listlike setting which is a no-op. + None Raises ------ ValueError When the indexer is an ndarray or list and the lengths don't match. """ - no_op = False - + # boolean with truth values == len of the value is ok too if isinstance(indexer, (np.ndarray, list)): - # We can ignore other listlikes because they are either - # a) not necessarily 1-D indexers, e.g. tuple - # b) boolean indexers e.g. 
BoolArray - if is_list_like(value): - if len(indexer) != len(value): - # boolean with truth values == len of the value is ok too - if not ( - isinstance(indexer, np.ndarray) - and indexer.dtype == np.bool_ - and len(indexer[indexer]) == len(value) - ): - raise ValueError( - "cannot set using a list-like indexer " - "with a different length than the value" - ) - if not len(indexer): - no_op = True + if is_list_like(value) and len(indexer) != len(value): + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) elif isinstance(indexer, slice): - if is_list_like(value): + # slice + if is_list_like(value) and len(values): if len(value) != length_of_indexer(indexer, values): raise ValueError( "cannot set using a slice indexer with a " "different length than the value" ) - if not len(value): - no_op = True - - return no_op def validate_indices(indices: np.ndarray, n: int) -> None: diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/accessors.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/accessors.py index c97778f..881d5ce 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/accessors.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/accessors.py @@ -24,15 +24,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex if TYPE_CHECKING: - from pandas import Series + from pandas import Series # noqa:F401 class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): - _hidden_attrs = PandasObject._hidden_attrs | { - "orig", - "name", - } - def __init__(self, data: "Series", orig): if not isinstance(data, ABCSeries): raise TypeError( @@ -83,7 +78,7 @@ class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): else: index = self._parent.index # return the result as a Series, which is by definition a copy - result = Series(result, index=index, name=self.name).__finalize__(self._parent) + result = Series(result, index=index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( @@ -111,9 +106,7 @@ class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): if not is_list_like(result): return result - result = Series(result, index=self._parent.index, name=self.name).__finalize__( - self._parent - ) + result = Series(result, index=self._parent.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( @@ -241,10 +234,8 @@ class DatetimeProperties(Properties): See Also -------- - Timestamp.isocalendar : Function return a 3-tuple containing ISO year, - week number, and weekday for the given Timestamp object. - datetime.date.isocalendar : Return a named tuple object with - three components: year, week and weekday. + Timestamp.isocalendar + datetime.date.isocalendar Examples -------- @@ -333,8 +324,7 @@ class TimedeltaProperties(Properties): See Also -------- - datetime.timedelta : A duration expressing the difference - between two date, time, or datetime. 
+ datetime.timedelta Examples -------- @@ -381,11 +371,7 @@ class TimedeltaProperties(Properties): 3 0 0 0 3 0 0 0 4 0 0 0 4 0 0 0 """ - return ( - self._get_values() - .components.set_index(self._parent.index) - .__finalize__(self._parent) - ) + return self._get_values().components.set_index(self._parent.index) @property def freq(self): diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/api.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/api.py index 18981a2..678753f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/api.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/api.py @@ -4,12 +4,12 @@ from typing import List, Set from pandas._libs import NaT, lib from pandas.errors import InvalidIndexError +import pandas.core.common as com from pandas.core.indexes.base import ( Index, _new_Index, ensure_index, ensure_index_from_sequences, - get_unanimous_names, ) from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex @@ -57,7 +57,7 @@ __all__ = [ "ensure_index_from_sequences", "get_objs_combined_axis", "union_indexes", - "get_unanimous_names", + "get_consensus_names", "all_indexes_same", ] @@ -218,12 +218,13 @@ def union_indexes(indexes, sort=True) -> Index: return result elif kind == "array": index = indexes[0] - if not all(index.equals(other) for other in indexes[1:]): - index = _unique_indices(indexes) + for other in indexes[1:]: + if not index.equals(other): + return _unique_indices(indexes) - name = get_unanimous_names(*indexes)[0] + name = get_consensus_names(indexes)[0] if name != index.name: - index = index.rename(name) + index = index._shallow_copy(name=name) return index else: # kind='list' return _unique_indices(indexes) @@ -267,6 +268,30 @@ def _sanitize_and_check(indexes): return indexes, "array" +def get_consensus_names(indexes): + """ + Give a consensus 'names' to indexes. + + If there's exactly one non-empty 'names', return this, + otherwise, return empty. + + Parameters + ---------- + indexes : list of Index objects + + Returns + ------- + list + A list representing the consensus 'names' found. + """ + # find the non-none names, need to tupleify to make + # the set hashable, then reverse on return + consensus_names = {tuple(i.names) for i in indexes if com.any_not_none(*i.names)} + if len(consensus_names) == 1: + return list(list(consensus_names)[0]) + return [None] * indexes[0].nlevels + + def all_indexes_same(indexes): """ Determine if all indexes contain the same elements. 
diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/base.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/base.py index 11d1915..b0f64bd 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/base.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from copy import copy as copy_func from datetime import datetime -from itertools import zip_longest import operator +from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -9,14 +9,8 @@ from typing import ( FrozenSet, Hashable, List, - NewType, Optional, - Sequence, - Set, - Tuple, - TypeVar, Union, - cast, ) import warnings @@ -25,15 +19,17 @@ import numpy as np from pandas._libs import algos as libalgos, index as libindex, lib import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array, no_default -from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label, Shape, final +from pandas._typing import DtypeObj, Label +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.errors import DuplicateLabelError, InvalidIndexError -from pandas.util._decorators import Appender, cache_readonly, doc +from pandas.errors import InvalidIndexError +from pandas.util._decorators import Appender, Substitution, cache_readonly, doc +from pandas.core.dtypes import concat as _concat from pandas.core.dtypes.cast import ( - find_common_type, maybe_cast_to_integer_array, validate_numeric_casting, ) @@ -41,6 +37,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_object, ensure_platform_int, + is_bool, is_bool_dtype, is_categorical_dtype, is_datetime64_any_dtype, @@ -60,34 +57,35 @@ from pandas.core.dtypes.common import ( is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, - needs_i8_conversion, pandas_dtype, - validate_all_hashable, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( + ABCCategorical, + ABCDataFrame, ABCDatetimeIndex, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, + ABCRangeIndex, ABCSeries, ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.core import missing, ops +from pandas.core import ops from pandas.core.accessor import CachedAccessor import pandas.core.algorithms as algos from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimes import tz_to_dtype, validate_tz_from_dtype from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.frozen import FrozenList +import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ensure_key_mapped, nargsort +from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -99,29 +97,78 @@ from pandas.io.formats.printing import ( ) if TYPE_CHECKING: - from pandas import MultiIndex, RangeIndex, Series + from pandas import Series __all__ = ["Index"] _unsortable_types = frozenset(("mixed", 
"mixed-integer")) -_index_doc_kwargs = { - "klass": "Index", - "inplace": "", - "target_klass": "Index", - "raises_section": "", - "unique": "Index", - "duplicated": "np.ndarray", -} -_index_shared_docs = {} +_index_doc_kwargs = dict( + klass="Index", + inplace="", + target_klass="Index", + raises_section="", + unique="Index", + duplicated="np.ndarray", +) +_index_shared_docs = dict() str_t = str +def _make_comparison_op(op, cls): + def cmp_method(self, other): + if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): + if other.ndim > 0 and len(self) != len(other): + raise ValueError("Lengths must match to compare") + + if is_object_dtype(self.dtype) and isinstance(other, ABCCategorical): + left = type(other)(self._values, dtype=other.dtype) + return op(left, other) + elif is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): + # e.g. PeriodArray + with np.errstate(all="ignore"): + result = op(self._values, other) + + elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex): + # don't pass MultiIndex + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) + + elif is_interval_dtype(self.dtype): + with np.errstate(all="ignore"): + result = op(self._values, np.asarray(other)) + + else: + with np.errstate(all="ignore"): + result = ops.comparison_op(self._values, np.asarray(other), op) + + if is_bool_dtype(result): + return result + return ops.invalid_comparison(self, other, op) + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + +def _make_arithmetic_op(op, cls): + def index_arithmetic_method(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCTimedeltaIndex)): + return NotImplemented + + from pandas import Series + + result = op(Series(self), other) + if isinstance(result, tuple): + return (Index(result[0]), Index(result[1])) + return Index(result) + + name = f"__{op.__name__}__" + return set_function_name(index_arithmetic_method, name, cls) + + _o_dtype = np.dtype(object) - - -_Identity = NewType("_Identity", object) +_Identity = object def _new_Index(cls, d): @@ -144,12 +191,9 @@ def _new_Index(cls, d): return cls.__new__(cls, **d) -_IndexT = TypeVar("_IndexT", bound="Index") - - class Index(IndexOpsMixin, PandasObject): """ - Immutable sequence used for indexing and alignment. The basic object + Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects. Parameters @@ -193,9 +237,9 @@ class Index(IndexOpsMixin, PandasObject): """ # tolist is not actually deprecated, just suppressed in the __dir__ - _hidden_attrs: FrozenSet[str] = ( - PandasObject._hidden_attrs - | IndexOpsMixin._hidden_attrs + _deprecations: FrozenSet[str] = ( + PandasObject._deprecations + | IndexOpsMixin._deprecations | frozenset(["contains", "set_value"]) ) @@ -220,7 +264,7 @@ class Index(IndexOpsMixin, PandasObject): _typ = "index" _data: Union[ExtensionArray, np.ndarray] - _id: Optional[_Identity] = None + _id = None _name: Label = None # MultiIndex.levels previously allowed setting the index name. We # don't allow this anymore, and raise if it happens rather than @@ -230,7 +274,6 @@ class Index(IndexOpsMixin, PandasObject): _attributes = ["name"] _is_numeric_dtype = False _can_hold_na = True - _can_hold_strings = True # would we like our indexing holder to defer to us _defer_to_indexing = False @@ -415,11 +458,6 @@ class Index(IndexOpsMixin, PandasObject): ndarray An ndarray with int64 dtype. 
""" - warnings.warn( - "Index.asi8 is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) return None @classmethod @@ -441,66 +479,16 @@ class Index(IndexOpsMixin, PandasObject): result._index_data = values result._name = name result._cache = {} - result._reset_identity() - return result + return result._reset_identity() @cache_readonly def _constructor(self): return type(self) - @final - def _maybe_check_unique(self): - """ - Check that an Index has no duplicates. - - This is typically only called via - `NDFrame.flags.allows_duplicate_labels.setter` when it's set to - True (duplicates aren't allowed). - - Raises - ------ - DuplicateLabelError - When the index is not unique. - """ - if not self.is_unique: - msg = """Index has duplicates.""" - duplicates = self._format_duplicate_message() - msg += f"\n{duplicates}" - - raise DuplicateLabelError(msg) - - @final - def _format_duplicate_message(self): - """ - Construct the DataFrame for a DuplicateLabelError. - - This returns a DataFrame indicating the labels and positions - of duplicates in an index. This should only be called when it's - already known that duplicates are present. - - Examples - -------- - >>> idx = pd.Index(['a', 'b', 'a']) - >>> idx._format_duplicate_message() - positions - label - a [0, 2] - """ - from pandas import Series - - duplicates = self[self.duplicated(keep="first")].unique() - assert len(duplicates) - - out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] - if self.nlevels == 1: - out = out.rename_axis("label") - return out.to_frame(name="positions") - # -------------------------------------------------------------------- # Index Internals Methods - @final def _get_attributes_dict(self): """ Return an attributes dict for my class. @@ -521,15 +509,14 @@ class Index(IndexOpsMixin, PandasObject): name : Label, defaults to self.name """ name = self.name if name is no_default else name + cache = self._cache.copy() if values is None else {} + if values is None: + values = self._values - if values is not None: - return self._simple_new(values, name=name) - - result = self._simple_new(self._values, name=name) - result._cache = self._cache + result = self._simple_new(values, name=name) + result._cache = cache return result - @final def is_(self, other) -> bool: """ More flexible, faster check like ``is`` but that works through views. @@ -551,23 +538,16 @@ class Index(IndexOpsMixin, PandasObject): -------- Index.identical : Works like ``Index.is_`` but also checks metadata. """ - if self is other: - return True - elif not hasattr(other, "_id"): - return False - elif self._id is None or other._id is None: - return False - else: - return self._id is other._id + # use something other than None to be clearer + return self._id is getattr(other, "_id", Ellipsis) and self._id is not None - @final - def _reset_identity(self) -> None: + def _reset_identity(self): """ Initializes or resets ``_id`` attribute with new object. """ - self._id = _Identity(object()) + self._id = _Identity() + return self - @final def _cleanup(self): self._engine.clear_mapping() @@ -580,19 +560,6 @@ class Index(IndexOpsMixin, PandasObject): target_values = self._get_engine_target() return self._engine_type(lambda: target_values, len(self)) - @cache_readonly - def _dir_additions_for_owner(self) -> Set[str_t]: - """ - Add the string-like labels to the owner dataframe/series dir output. - - If this is a MultiIndex, it's first level values are used. 
- """ - return { - c - for c in self.unique(level=0)[:100] - if isinstance(c, str) and c.isidentifier() - } - # -------------------------------------------------------------------- # Array-Like Methods @@ -611,7 +578,7 @@ class Index(IndexOpsMixin, PandasObject): def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc and other functions. + Gets called after a ufunc. """ result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: @@ -627,7 +594,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self._data.dtype - @final def ravel(self, order="C"): """ Return an ndarray of the flattened values of the underlying data. @@ -639,14 +605,8 @@ class Index(IndexOpsMixin, PandasObject): See Also -------- - numpy.ndarray.ravel : Return a flattened array. + numpy.ndarray.ravel """ - warnings.warn( - "Index.ravel returning ndarray is deprecated; in a future version " - "this will return a view on self.", - FutureWarning, - stacklevel=2, - ) values = self._get_engine_target() return values.ravel(order=order) @@ -667,7 +627,7 @@ class Index(IndexOpsMixin, PandasObject): Create an Index with values cast to dtypes. The class of a new Index is determined by dtype. When conversion is - impossible, a TypeError exception is raised. + impossible, a ValueError exception is raised. Parameters ---------- @@ -735,45 +695,52 @@ class Index(IndexOpsMixin, PandasObject): See Also -------- - numpy.ndarray.take: Return an array formed from the - elements of a at the given indices. + numpy.ndarray.take """ @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if kwargs: - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) - allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) - - # Note: we discard fill_value and use self._na_value, only relevant - # in the case where allow_fill is True and fill_value is not None - taken = algos.take( - self._values, indices, allow_fill=allow_fill, fill_value=self._na_value - ) - return self._shallow_copy(taken) - - def _maybe_disallow_fill(self, allow_fill: bool, fill_value, indices) -> bool: - """ - We only use pandas-style take when allow_fill is True _and_ - fill_value is not None. - """ - if allow_fill and fill_value is not None: - # only fill if we are passing a non-None fill_value - if self._can_hold_na: - if (indices < -1).any(): - raise ValueError( - "When allow_fill=True and fill_value is not None, " - "all indices must be >= -1" - ) - else: + if self._can_hold_na: + taken = self._assert_take_fillable( + self._values, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) + else: + if allow_fill and fill_value is not None: cls_name = type(self).__name__ raise ValueError( f"Unable to fill values because {cls_name} cannot contain NA" ) + taken = self._values.take(indices) + return self._shallow_copy(taken) + + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan + ): + """ + Internal method to handle NA filling of take. 
+ """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + raise ValueError( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + taken = algos.take( + values, indices, allow_fill=allow_fill, fill_value=na_value + ) else: - allow_fill = False - return allow_fill + taken = values.take(indices) + return taken _index_shared_docs[ "repeat" @@ -817,19 +784,13 @@ class Index(IndexOpsMixin, PandasObject): @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): repeats = ensure_platform_int(repeats) - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) return self._shallow_copy(self._values.repeat(repeats)) # -------------------------------------------------------------------- # Copying Methods - def copy( - self: _IndexT, - name: Optional[Label] = None, - deep: bool = False, - dtype: Optional[Dtype] = None, - names: Optional[Sequence[Label]] = None, - ) -> _IndexT: + def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -842,9 +803,6 @@ class Index(IndexOpsMixin, PandasObject): deep : bool, default False dtype : numpy dtype or pandas type, optional Set dtype for new object. - - .. deprecated:: 1.2.0 - use ``astype`` method instead. names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. @@ -858,27 +816,21 @@ class Index(IndexOpsMixin, PandasObject): In most cases, there should be no functional difference from using ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ - name = self._validate_names(name=name, names=names, deep=deep)[0] if deep: - new_index = self._shallow_copy(self._data.copy(), name=name) + new_index = self._shallow_copy(self._data.copy()) else: - new_index = self._shallow_copy(name=name) + new_index = self._shallow_copy() + + names = self._validate_names(name=name, names=names, deep=deep) + new_index = new_index.set_names(names) if dtype: - warnings.warn( - "parameter dtype is deprecated and will be removed in a future " - "version. Use the astype method instead.", - FutureWarning, - stacklevel=2, - ) new_index = new_index.astype(dtype) return new_index - @final def __copy__(self, **kwargs): return self.copy(**kwargs) - @final def __deepcopy__(self, memo=None): """ Parameters @@ -937,8 +889,7 @@ class Index(IndexOpsMixin, PandasObject): if self.inferred_type == "string": is_justify = False elif self.inferred_type == "categorical": - # error: "Index" has no attribute "categories" - if is_object_dtype(self.categories): # type: ignore[attr-defined] + if is_object_dtype(self.categories): # type: ignore is_justify = False return format_object_summary( @@ -993,9 +944,9 @@ class Index(IndexOpsMixin, PandasObject): # could have nans mask = isna(values) if mask.any(): - result_arr = np.array(result) - result_arr[mask] = na_rep - result = result_arr.tolist() + result = np.array(result) + result[mask] = na_rep + result = result.tolist() # type: ignore else: result = trim_front(format_array(values, None, justify="left")) return header + result @@ -1004,8 +955,6 @@ class Index(IndexOpsMixin, PandasObject): """ Format specified values of `self` and return them. - .. deprecated:: 1.2.0 - Parameters ---------- slicer : int, array-like @@ -1027,12 +976,6 @@ class Index(IndexOpsMixin, PandasObject): numpy.ndarray Formatted values. 
""" - warnings.warn( - "The 'to_native_types' method is deprecated and will be removed in " - "a future version. Use 'astype(str)' instead.", - FutureWarning, - stacklevel=2, - ) values = self if slicer is not None: values = values[slicer] @@ -1247,8 +1190,7 @@ class Index(IndexOpsMixin, PandasObject): maybe_extract_name(value, None, type(self)) self._name = value - @final - def _validate_names(self, name=None, names=None, deep: bool = False) -> List[Label]: + def _validate_names(self, name=None, names=None, deep: bool = False): """ Handles the quirks of having a singular 'name' parameter for general Index and plural 'names' parameter for MultiIndex. @@ -1258,25 +1200,15 @@ class Index(IndexOpsMixin, PandasObject): if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") elif names is None and name is None: - new_names = deepcopy(self.names) if deep else self.names + return deepcopy(self.names) if deep else self.names elif names is not None: if not is_list_like(names): raise TypeError("Must pass list-like as `names`.") - new_names = names - elif not is_list_like(name): - new_names = [name] + return names else: - new_names = name - - if len(new_names) != len(self.names): - raise ValueError( - f"Length of new names must be {len(self.names)}, got {len(new_names)}" - ) - - # All items in 'new_names' need to be hashable - validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name") - - return new_names + if not is_list_like(name): + return [name] + return name def _get_names(self): return FrozenList((self.name,)) @@ -1304,13 +1236,13 @@ class Index(IndexOpsMixin, PandasObject): # GH 20527 # All items in 'name' need to be hashable: - validate_all_hashable(*values, error_name=f"{type(self).__name__}.name") - + for name in values: + if not is_hashable(name): + raise TypeError(f"{type(self).__name__}.name must be a hashable type") self._name = values[0] names = property(fset=_set_names, fget=_get_names) - @final def set_names(self, names, level=None, inplace: bool = False): """ Set Index or MultiIndex name. @@ -1330,8 +1262,8 @@ class Index(IndexOpsMixin, PandasObject): Returns ------- - Index or None - The same type as the caller or None if ``inplace=True``. + Index + The same type as the caller or None if inplace is True. See Also -------- @@ -1406,8 +1338,8 @@ class Index(IndexOpsMixin, PandasObject): Returns ------- - Index or None - The same type as the caller or None if ``inplace=True``. + Index + The same type as the caller or None if inplace is True. See Also -------- @@ -1456,7 +1388,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self - @final def _validate_index_level(self, level): """ Validate index level. @@ -1486,7 +1417,7 @@ class Index(IndexOpsMixin, PandasObject): def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - For internal compatibility with the Index API. + For internal compatibility with with the Index API. Sort the Index. 
This is for compat with MultiIndex @@ -1501,20 +1432,6 @@ class Index(IndexOpsMixin, PandasObject): ------- Index """ - if not isinstance(ascending, (list, bool)): - raise TypeError( - "ascending must be a single bool value or" - "a list of bool values of length 1" - ) - - if isinstance(ascending, list): - if len(ascending) != 1: - raise TypeError("ascending must be a list of bool values of length 1") - ascending = ascending[0] - - if not isinstance(ascending, bool): - raise TypeError("ascending must be a bool value") - return self.sort_values(return_indexer=True, ascending=ascending) def _get_level_values(self, level): @@ -1558,7 +1475,6 @@ class Index(IndexOpsMixin, PandasObject): get_level_values = _get_level_values - @final def droplevel(self, level=0): """ Return index with requested level(s) removed. @@ -1566,6 +1482,8 @@ class Index(IndexOpsMixin, PandasObject): If resulting index has only 1 level left, the result will be of Index type, not MultiIndex. + .. versionadded:: 0.23.1 (support for non-MultiIndex) + Parameters ---------- level : int, str, or list-like, default 0 @@ -1575,55 +1493,20 @@ class Index(IndexOpsMixin, PandasObject): Returns ------- Index or MultiIndex - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays( - ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) - >>> mi - MultiIndex([(1, 3, 5), - (2, 4, 6)], - names=['x', 'y', 'z']) - - >>> mi.droplevel() - MultiIndex([(3, 5), - (4, 6)], - names=['y', 'z']) - - >>> mi.droplevel(2) - MultiIndex([(1, 3), - (2, 4)], - names=['x', 'y']) - - >>> mi.droplevel('z') - MultiIndex([(1, 3), - (2, 4)], - names=['x', 'y']) - - >>> mi.droplevel(['x', 'y']) - Int64Index([5, 6], dtype='int64', name='z') """ if not isinstance(level, (tuple, list)): level = [level] levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] - return self._drop_level_numbers(levnums) - - def _drop_level_numbers(self, levnums: List[int]): - """ - Drop MultiIndex levels by level _number_, not name. - """ - - if len(levnums) == 0: + if len(level) == 0: return self - if len(levnums) >= self.nlevels: + if len(level) >= self.nlevels: raise ValueError( - f"Cannot remove {len(levnums)} levels from an index with " - f"{self.nlevels} levels: at least one level must be left." + f"Cannot remove {len(level)} levels from an index with {self.nlevels} " + "levels: at least one level must be left." ) # The two checks above guarantee that here self is a MultiIndex - self = cast("MultiIndex", self) new_levels = list(self.levels) new_codes = list(self.codes) @@ -1685,7 +1568,6 @@ class Index(IndexOpsMixin, PandasObject): # -------------------------------------------------------------------- # Introspection Methods - @final @property def is_monotonic(self) -> bool: """ @@ -1800,7 +1682,6 @@ class Index(IndexOpsMixin, PandasObject): """ return not self.is_unique - @final def is_boolean(self) -> bool: """ Check if the Index only consists of booleans. @@ -1836,7 +1717,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self.inferred_type in ["boolean"] - @final def is_integer(self) -> bool: """ Check if the Index only consists of integers. @@ -1872,7 +1752,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self.inferred_type in ["integer"] - @final def is_floating(self) -> bool: """ Check if the Index is a floating type. 
@@ -1916,7 +1795,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] - @final def is_numeric(self) -> bool: """ Check if the Index only consists of numeric data. @@ -1960,7 +1838,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self.inferred_type in ["integer", "floating"] - @final def is_object(self) -> bool: """ Check if the Index is of the object dtype. @@ -2001,7 +1878,6 @@ class Index(IndexOpsMixin, PandasObject): """ return is_object_dtype(self.dtype) - @final def is_categorical(self) -> bool: """ Check if the Index holds categorical data. @@ -2045,7 +1921,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self.inferred_type in ["categorical"] - @final def is_interval(self) -> bool: """ Check if the Index holds Interval objects. @@ -2079,7 +1954,6 @@ class Index(IndexOpsMixin, PandasObject): """ return self.inferred_type in ["interval"] - @final def is_mixed(self) -> bool: """ Check if the Index holds data with mixed data types. @@ -2117,7 +1991,6 @@ class Index(IndexOpsMixin, PandasObject): ) return self.inferred_type in ["mixed"] - @final def holds_integer(self) -> bool: """ Whether the type is an integer type. @@ -2132,30 +2005,17 @@ class Index(IndexOpsMixin, PandasObject): return lib.infer_dtype(self._values, skipna=False) @cache_readonly - def _is_all_dates(self) -> bool: + def is_all_dates(self) -> bool: """ Whether or not the index values only consist of dates. """ return is_datetime_array(ensure_object(self._values)) - @cache_readonly - def is_all_dates(self): - """ - Whether or not the index values only consist of dates. - """ - warnings.warn( - "Index.is_all_dates is deprecated, will be removed in a future version. " - "check index.inferred_type instead", - FutureWarning, - stacklevel=2, - ) - return self._is_all_dates - # -------------------------------------------------------------------- # Pickle Methods def __reduce__(self): - d = {"data": self._data} + d = dict(data=self._data) d.update(self._get_attributes_dict()) return _new_Index, (type(self), d), None @@ -2179,7 +2039,6 @@ class Index(IndexOpsMixin, PandasObject): return values @cache_readonly - @final def _nan_idxs(self): if self._can_hold_na: return self._isnan.nonzero()[0] @@ -2196,7 +2055,6 @@ class Index(IndexOpsMixin, PandasObject): else: return False - @final def isna(self): """ Detect missing values. @@ -2254,7 +2112,6 @@ class Index(IndexOpsMixin, PandasObject): isnull = isna - @final def notna(self): """ Detect existing (non-missing) values. @@ -2324,7 +2181,7 @@ class Index(IndexOpsMixin, PandasObject): DataFrame.fillna : Fill NaN values of a DataFrame. Series.fillna : Fill NaN Values of a Series. """ - value = self._require_scalar(value) + self._assert_can_do_op(value) if self.hasnans: result = self.putmask(self._isnan, value) if downcast is None: @@ -2368,25 +2225,22 @@ class Index(IndexOpsMixin, PandasObject): level : int or str, optional, default None Only return values from specified level (for MultiIndex). + .. versionadded:: 0.23.0 + Returns ------- Index without duplicates See Also -------- - unique : Numpy array of unique values in that column. - Series.unique : Return unique values of Series object. + unique + Series.unique """ if level is not None: self._validate_index_level(level) - - if self.is_unique: - return self._shallow_copy() - result = super().unique() return self._shallow_copy(result) - @final def drop_duplicates(self, keep="first"): """ Return Index with duplicate values removed. 
@@ -2433,9 +2287,6 @@ class Index(IndexOpsMixin, PandasObject): >>> idx.drop_duplicates(keep=False) Index(['cow', 'beetle', 'hippo'], dtype='object') """ - if self.is_unique: - return self._shallow_copy() - return super().drop_duplicates(keep=keep) def duplicated(self, keep="first"): @@ -2492,9 +2343,6 @@ class Index(IndexOpsMixin, PandasObject): >>> idx.duplicated(keep=False) array([ True, False, True, False, True]) """ - if self.is_unique: - # fastpath available bc we are immutable - return np.zeros(len(self), dtype=bool) return super().duplicated(keep=keep) def _get_unique_index(self, dropna: bool = False): @@ -2521,54 +2369,52 @@ class Index(IndexOpsMixin, PandasObject): else: values = self._values - if dropna and not isinstance(self, ABCMultiIndex): - # isna not defined for MultiIndex - if self.hasnans: - values = values[~isna(values)] + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass return self._shallow_copy(values) # -------------------------------------------------------------------- # Arithmetic & Logical Methods + def __add__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + from pandas import Series + + return Index(Series(self) + other) + + def __radd__(self, other): + from pandas import Series + + return Index(other + Series(self)) + def __iadd__(self, other): # alias for __add__ return self + other - @final + def __sub__(self, other): + return Index(np.array(self) - other) + + def __rsub__(self, other): + # wrap Series to ensure we pin name correctly + from pandas import Series + + return Index(other - Series(self)) + def __and__(self, other): - warnings.warn( - "Index.__and__ operating as a set operation is deprecated, " - "in the future this will be a logical operation matching " - "Series.__and__. Use index.intersection(other) instead", - FutureWarning, - stacklevel=2, - ) return self.intersection(other) - @final def __or__(self, other): - warnings.warn( - "Index.__or__ operating as a set operation is deprecated, " - "in the future this will be a logical operation matching " - "Series.__or__. Use index.union(other) instead", - FutureWarning, - stacklevel=2, - ) return self.union(other) - @final def __xor__(self, other): - warnings.warn( - "Index.__xor__ operating as a set operation is deprecated, " - "in the future this will be a logical operation matching " - "Series.__xor__. Use index.symmetric_difference(other) instead", - FutureWarning, - stacklevel=2, - ) return self.symmetric_difference(other) - @final def __nonzero__(self): raise ValueError( f"The truth value of a {type(self).__name__} is ambiguous. 
" @@ -2580,7 +2426,6 @@ class Index(IndexOpsMixin, PandasObject): # -------------------------------------------------------------------- # Set Operation Methods - @final def _get_reconciled_name_object(self, other): """ If the result of a set operation will be self, @@ -2589,10 +2434,9 @@ class Index(IndexOpsMixin, PandasObject): """ name = get_op_result_name(self, other) if self.name != name: - return self.rename(name) + return self._shallow_copy(name=name) return self - @final def _union_incompatible_dtypes(self, other, sort): """ Casts this and other index to object dtype to allow the formation @@ -2617,7 +2461,7 @@ class Index(IndexOpsMixin, PandasObject): other = Index(other).astype(object, copy=False) return Index.union(this, other, sort=sort).astype(object, copy=False) - def _can_union_without_object_cast(self, other) -> bool: + def _is_compatible_with_other(self, other) -> bool: """ Check whether this and the other dtype are compatible with each other. Meaning a union can be formed between them without needing to be cast @@ -2633,7 +2477,6 @@ class Index(IndexOpsMixin, PandasObject): """ return type(self) is type(other) and is_dtype_equal(self.dtype, other.dtype) - @final def _validate_sort_keyword(self, sort): if sort not in [None, False]: raise ValueError( @@ -2694,14 +2537,11 @@ class Index(IndexOpsMixin, PandasObject): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) - if not self._can_union_without_object_cast(other): + if not self._is_compatible_with_other(other): return self._union_incompatible_dtypes(other, sort=sort) - result = self._union(other, sort=sort) - - return self._wrap_setop_result(other, result) + return self._union(other, sort=sort) def _union(self, other, sort): """ @@ -2723,10 +2563,10 @@ class Index(IndexOpsMixin, PandasObject): Index """ if not len(other) or self.equals(other): - return self + return self._get_reconciled_name_object(other) if not len(self): - return other + return other._get_reconciled_name_object(self) # TODO(EA): setops-refactor, clean all this up lvals = self._values @@ -2768,24 +2608,12 @@ class Index(IndexOpsMixin, PandasObject): stacklevel=3, ) - return result + # for subclasses + return self._wrap_setop_result(other, result) - @final def _wrap_setop_result(self, other, result): - if isinstance(self, (ABCDatetimeIndex, ABCTimedeltaIndex)) and isinstance( - result, np.ndarray - ): - result = type(self._data)._simple_new(result, dtype=self.dtype) - elif is_categorical_dtype(self.dtype) and isinstance(result, np.ndarray): - result = Categorical(result, dtype=self.dtype) - name = get_op_result_name(self, other) - if isinstance(result, Index): - if result.name != name: - return result.rename(name) - return result - else: - return self._shallow_copy(result, name=name) + return self._shallow_copy(result, name=name) # TODO: standardize return type of non-union setops type(self vs other) def intersection(self, other, sort=False): @@ -2824,26 +2652,16 @@ class Index(IndexOpsMixin, PandasObject): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + other = ensure_index(other) - if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) + if self.equals(other) and not self.has_duplicates: return self._get_reconciled_name_object(other) if not is_dtype_equal(self.dtype, other.dtype): - dtype = find_common_type([self.dtype, other.dtype]) - this = 
self.astype(dtype, copy=False) - other = other.astype(dtype, copy=False) + this = self.astype("O") + other = other.astype("O") return this.intersection(other, sort=sort) - result = self._intersection(other, sort=sort) - return self._wrap_setop_result(other, result) - - def _intersection(self, other, sort=False): - """ - intersection specialized to the case with matching dtypes. - """ # TODO(EA): setops-refactor, clean all this up lvals = self._values rvals = other._values @@ -2854,7 +2672,7 @@ class Index(IndexOpsMixin, PandasObject): except TypeError: pass else: - return algos.unique1d(result) + return self._wrap_setop_result(other, algos.unique1d(result)) try: indexer = Index(rvals).get_indexer(lvals) @@ -2865,15 +2683,18 @@ class Index(IndexOpsMixin, PandasObject): indexer = algos.unique1d(Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] - result = other.take(indexer).unique()._values + taken = other.take(indexer).unique() + res_name = get_op_result_name(self, other) if sort is None: - result = algos.safe_sort(result) + taken = algos.safe_sort(taken.values) + return self._shallow_copy(taken, name=res_name) # Intersection has to be unique - assert Index(result).is_unique + assert algos.unique(taken._values).shape == taken._values.shape - return result + taken.name = res_name + return taken def difference(self, other, sort=None): """ @@ -2915,15 +2736,12 @@ class Index(IndexOpsMixin, PandasObject): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) if self.equals(other): - return self[:0].rename(result_name) + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - result = self._difference(other, sort=sort) - return self._wrap_setop_result(other, result) - - def _difference(self, other, sort): + other, result_name = self._convert_can_do_setop(other) this = self._get_unique_index() @@ -2931,14 +2749,14 @@ class Index(IndexOpsMixin, PandasObject): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - the_diff = this._values.take(label_diff) + the_diff = this.values.take(label_diff) if sort is None: try: the_diff = algos.safe_sort(the_diff) except TypeError: pass - return the_diff + return this._shallow_copy(the_diff, name=result_name) def symmetric_difference(self, other, result_name=None, sort=None): """ @@ -3048,7 +2866,7 @@ class Index(IndexOpsMixin, PandasObject): distances are broken by preferring the larger index value. tolerance : int or float, optional Maximum distance from index value for inexact matches. The value of - the index at the matching location must satisfy the equation + the index at the matching location most satisfy the equation ``abs(index[loc] - key) <= tolerance``. Returns @@ -3113,7 +2931,7 @@ class Index(IndexOpsMixin, PandasObject): inexact matches. tolerance : optional Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations must + matches. The values of the index at the matching locations most satisfy the equation ``abs(index[indexer] - target) <= tolerance``. 
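The union/intersection/difference paths changed above correspond to the following user-facing behaviour; a minimal sketch assuming two integer indexes with a shared name:

    import pandas as pd

    left = pd.Index([1, 2, 3, 4], name="x")
    right = pd.Index([3, 4, 5, 6], name="x")

    print(left.union(right))         # 1..6, name kept because both inputs agree
    print(left.intersection(right))  # [3, 4]
    print(left.difference(right))    # [1, 2]

    # when the names differ, the result name falls back to None
    print(left.union(right.rename("y")).name)   # None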
Tolerance may be a scalar value, which applies the same tolerance @@ -3199,7 +3017,6 @@ class Index(IndexOpsMixin, PandasObject): raise ValueError("list-like tolerance size must match target index size") return tolerance - @final def _get_fill_indexer( self, target: "Index", method: str_t, limit=None, tolerance=None ) -> np.ndarray: @@ -3215,11 +3032,10 @@ class Index(IndexOpsMixin, PandasObject): indexer = engine_method(target_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) - if tolerance is not None and len(self): + if tolerance is not None: indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer - @final def _get_fill_indexer_searchsorted( self, target: "Index", method: str_t, limit=None ) -> np.ndarray: @@ -3253,28 +3069,18 @@ class Index(IndexOpsMixin, PandasObject): indexer[indexer == len(self)] = -1 return indexer - @final def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: """ Get the indexer for the nearest index labels; requires an index with values that can be subtracted from each other (e.g., not strings or tuples). """ - if not len(self): - return self._get_fill_indexer(target, "pad") - left_indexer = self.get_indexer(target, "pad", limit=limit) right_indexer = self.get_indexer(target, "backfill", limit=limit) target_values = target._values - # error: Unsupported left operand type for - ("ExtensionArray") - left_distances = np.abs( - self._values[left_indexer] - target_values # type: ignore[operator] - ) - # error: Unsupported left operand type for - ("ExtensionArray") - right_distances = np.abs( - self._values[right_indexer] - target_values # type: ignore[operator] - ) + left_distances = np.abs(self._values[left_indexer] - target_values) + right_distances = np.abs(self._values[right_indexer] - target_values) op = operator.lt if self.is_monotonic_increasing else operator.le indexer = np.where( @@ -3286,15 +3092,13 @@ class Index(IndexOpsMixin, PandasObject): indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance) return indexer - @final def _filter_indexer_tolerance( self, target: Union["Index", np.ndarray, ExtensionArray], indexer: np.ndarray, tolerance, ) -> np.ndarray: - # error: Unsupported left operand type for - ("ExtensionArray") - distance = abs(self._values[indexer] - target) # type: ignore[operator] + distance = abs(self._values[indexer] - target) indexer = np.where(distance <= tolerance, indexer, -1) return indexer @@ -3311,7 +3115,6 @@ class Index(IndexOpsMixin, PandasObject): # GH#10331 return key - @final def _validate_positional_slice(self, key: slice): """ For positional indexing, a slice must have either int or None @@ -3406,7 +3209,7 @@ class Index(IndexOpsMixin, PandasObject): Return tuple-safe keys. """ if isinstance(keyarr, Index): - pass + keyarr = self._convert_index_indexer(keyarr) else: keyarr = self._convert_arr_indexer(keyarr) @@ -3429,6 +3232,21 @@ class Index(IndexOpsMixin, PandasObject): keyarr = com.asarray_tuplesafe(keyarr) return keyarr + def _convert_index_indexer(self, keyarr): + """ + Convert an Index indexer to the appropriate dtype. + + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. + + Returns + ------- + converted_keyarr : Index (or sub-class) + """ + return keyarr + def _convert_list_indexer(self, keyarr): """ Convert a list-like indexer to the appropriate dtype. 
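The fill/nearest indexer helpers above back Index.get_indexer's method and tolerance options; roughly, on a monotonic index (illustrative only):

    import pandas as pd

    idx = pd.Index([10, 20, 30])

    # exact matching: -1 marks target labels not present in the index
    print(idx.get_indexer([10, 26]))                  # [ 0 -1]

    # 'pad' takes the last position whose label is <= the target,
    # 'nearest' picks whichever neighbour is closer
    print(idx.get_indexer([26], method="pad"))        # [1]  -> label 20
    print(idx.get_indexer([26], method="nearest"))    # [2]  -> label 30

    # tolerance turns near misses back into -1
    print(idx.get_indexer([26], method="nearest", tolerance=2))   # [-1]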
@@ -3445,12 +3263,11 @@ class Index(IndexOpsMixin, PandasObject): """ return None - @final - def _invalid_indexer(self, form: str_t, key) -> TypeError: + def _invalid_indexer(self, form: str_t, key): """ Consistent invalid indexer message. """ - return TypeError( + raise TypeError( f"cannot do {form} indexing on {type(self).__name__} with these " f"indexers [{key}] of type {type(key).__name__}" ) @@ -3458,7 +3275,6 @@ class Index(IndexOpsMixin, PandasObject): # -------------------------------------------------------------------- # Reindex Methods - @final def _can_reindex(self, indexer): """ Check if we are allowing reindexing with this particular indexer. @@ -3472,7 +3288,7 @@ class Index(IndexOpsMixin, PandasObject): ValueError if its a duplicate axis """ # trying to reindex on an axis with duplicates - if not self._index_as_unique and len(indexer): + if not self.is_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None, tolerance=None): @@ -3498,7 +3314,11 @@ class Index(IndexOpsMixin, PandasObject): target = ensure_has_len(target) # target may be an iterator if not isinstance(target, Index) and len(target) == 0: - target = self[:0] + if isinstance(self, ABCRangeIndex): + values = range(0) + else: + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, name=self.name) else: target = ensure_index(target) @@ -3512,7 +3332,8 @@ class Index(IndexOpsMixin, PandasObject): if self.equals(target): indexer = None else: - if self._index_as_unique: + # check is_overlapping for IntervalIndex compat + if self.is_unique and not getattr(self, "is_overlapping", False): indexer = self.get_indexer( target, method=method, limit=limit, tolerance=tolerance ) @@ -3548,10 +3369,6 @@ class Index(IndexOpsMixin, PandasObject): """ target = ensure_index(target) - if len(target) == 0: - # GH#13691 - return self[:0], np.array([], dtype=np.intp), None - indexer, missing = self.get_indexer_non_unique(target) check = indexer != -1 new_labels = self.take(indexer[check]) @@ -3566,7 +3383,7 @@ class Index(IndexOpsMixin, PandasObject): cur_labels = self.take(indexer[check]).values cur_indexer = ensure_int64(length[check]) - new_labels = np.empty((len(indexer),), dtype=object) + new_labels = np.empty(tuple([len(indexer)]), dtype=object) new_labels[cur_indexer] = cur_labels new_labels[missing_indexer] = missing_labels @@ -3720,29 +3537,27 @@ class Index(IndexOpsMixin, PandasObject): else: return join_index - @final def _join_multi(self, other, how, return_indexers=True): from pandas.core.indexes.multi import MultiIndex - from pandas.core.reshape.merge import restore_dropped_levels_multijoin + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin # figure out join names - self_names_list = list(com.not_none(*self.names)) - other_names_list = list(com.not_none(*other.names)) - self_names_order = self_names_list.index - other_names_order = other_names_list.index - self_names = set(self_names_list) - other_names = set(other_names_list) + self_names = set(com.not_none(*self.names)) + other_names = set(com.not_none(*other.names)) overlap = self_names & other_names # need at least 1 in common if not overlap: raise ValueError("cannot join with no overlapping index names") - if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + self_is_mi = isinstance(self, ABCMultiIndex) + other_is_mi = isinstance(other, ABCMultiIndex) + + if self_is_mi and other_is_mi: # Drop the 
non-matching levels from left and right respectively - ldrop_names = sorted(self_names - overlap, key=self_names_order) - rdrop_names = sorted(other_names - overlap, key=other_names_order) + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) # if only the order differs if not len(ldrop_names + rdrop_names): @@ -3763,7 +3578,7 @@ class Index(IndexOpsMixin, PandasObject): # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names - levels, codes, names = restore_dropped_levels_multijoin( + levels, codes, names = _restore_dropped_levels_multijoin( self, other, dropped_names, join_idx, lidx, ridx ) @@ -3784,7 +3599,7 @@ class Index(IndexOpsMixin, PandasObject): # Case where only one index is multi # make the indices into mi's that match flip_order = False - if isinstance(self, MultiIndex): + if self_is_mi: self, other = other, self flip_order = True # flip if join method is right or left @@ -3800,9 +3615,8 @@ class Index(IndexOpsMixin, PandasObject): return result[0], result[2], result[1] return result - @final def _join_non_unique(self, other, how="left", return_indexers=False): - from pandas.core.reshape.merge import get_join_indexers + from pandas.core.reshape.merge import _get_join_indexers # We only get here if dtypes match assert self.dtype == other.dtype @@ -3810,7 +3624,7 @@ class Index(IndexOpsMixin, PandasObject): lvalues = self._get_engine_target() rvalues = other._get_engine_target() - left_idx, right_idx = get_join_indexers( + left_idx, right_idx = _get_join_indexers( [lvalues], [rvalues], how=how, sort=True ) @@ -3828,7 +3642,6 @@ class Index(IndexOpsMixin, PandasObject): else: return join_index - @final def _join_level( self, other, level, how="left", return_indexers=False, keep_order=True ): @@ -3876,8 +3689,6 @@ class Index(IndexOpsMixin, PandasObject): left, right = right, left how = {"right": "left", "left": "right"}.get(how, how) - assert isinstance(left, MultiIndex) - level = left._get_level_number(level) old_level = left.levels[level] @@ -3901,9 +3712,9 @@ class Index(IndexOpsMixin, PandasObject): else: left_lev_indexer = ensure_int64(left_lev_indexer) rev_indexer = lib.get_reverse_indexer(left_lev_indexer, len(old_level)) - old_codes = left.codes[level] + new_lev_codes = algos.take_nd( - rev_indexer, old_codes[old_codes != -1], allow_fill=False + rev_indexer, left.codes[level], allow_fill=False ) new_codes = list(left.codes) @@ -3972,7 +3783,6 @@ class Index(IndexOpsMixin, PandasObject): else: return join_index - @final def _join_monotonic(self, other, how="left", return_indexers=False): # We only get here with matching dtypes assert other.dtype == self.dtype @@ -4021,16 +3831,9 @@ class Index(IndexOpsMixin, PandasObject): else: return join_index - def _wrap_joined_index( - self: _IndexT, joined: np.ndarray, other: _IndexT - ) -> _IndexT: - assert other.dtype == self.dtype - - if isinstance(self, ABCMultiIndex): - name = self.names if self.names == other.names else None - else: - name = get_op_result_name(self, other) - return self._constructor(joined, name=name) + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Index(joined, name=name) # -------------------------------------------------------------------- # Uncategorized Methods @@ -4074,7 +3877,7 @@ class Index(IndexOpsMixin, PandasObject): This is an ndarray or ExtensionArray. - ``_values`` are consistent between ``Series`` and ``Index``. + ``_values`` are consistent between``Series`` and ``Index``. 
It may differ from the public '.values' method. @@ -4089,7 +3892,7 @@ class Index(IndexOpsMixin, PandasObject): See Also -------- - values : Values + values """ return self._data @@ -4142,19 +3945,25 @@ class Index(IndexOpsMixin, PandasObject): if other is None: other = self._na_value + dtype = self.dtype values = self.values - try: - self._validate_fill_value(other) - except (ValueError, TypeError): - return self.astype(object).where(cond, other) + if is_bool(other) or is_bool_dtype(other): + + # bools force casting + values = values.astype(object) + dtype = None values = np.where(cond, values, other) - return Index(values, name=self.name) + if self._is_numeric_dtype and np.any(isna(values)): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return Index(values, dtype=dtype, name=self.name) # construction helpers - @final @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor @@ -4164,7 +3973,6 @@ class Index(IndexOpsMixin, PandasObject): f"kind, {repr(data)} was passed" ) - @final @classmethod def _string_data_error(cls, data): raise TypeError( @@ -4172,7 +3980,6 @@ class Index(IndexOpsMixin, PandasObject): "to explicitly cast to a numeric type" ) - @final def _coerce_scalar_to_index(self, item): """ We need to coerce a scalar to a compat for our index type. @@ -4190,22 +3997,24 @@ class Index(IndexOpsMixin, PandasObject): return Index([item], dtype=dtype, **self._get_attributes_dict()) - def _validate_fill_value(self, value): + def _to_safe_for_reshape(self): """ - Check if the value can be inserted into our array, and convert - it to an appropriate native type if necessary. + Convert to object if we are a categorical. + """ + return self + + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. """ return value - @final - def _require_scalar(self, value): + def _assert_can_do_op(self, value): """ - Check that this is a scalar value that we can use for setitem-like - operations without changing dtype. + Check value is valid for scalar op. """ if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") - return value @property def _has_complex_internals(self) -> bool: @@ -4221,7 +4030,7 @@ class Index(IndexOpsMixin, PandasObject): """ return self.is_object() - def is_type_compatible(self, kind: str_t) -> bool: + def is_type_compatible(self, kind) -> bool: """ Whether the index type is compatible with the provided type. """ @@ -4268,11 +4077,9 @@ class Index(IndexOpsMixin, PandasObject): except (OverflowError, TypeError, ValueError): return False - @final def __hash__(self): raise TypeError(f"unhashable type: {repr(type(self).__name__)}") - @final def __setitem__(self, key, value): raise TypeError("Index does not support mutable operations") @@ -4313,7 +4120,6 @@ class Index(IndexOpsMixin, PandasObject): else: return result - @final def _can_hold_identifiers_and_holds_name(self, name) -> bool: """ Faster check for ``name in self`` when we know `name` is a Python @@ -4355,13 +4161,13 @@ class Index(IndexOpsMixin, PandasObject): return self._concat(to_concat, name) - def _concat(self, to_concat: List["Index"], name: Label) -> "Index": + def _concat(self, to_concat, name): """ Concatenate multiple Index objects. 
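Index.where, whose casting rules are reworked in the hunk above, behaves much like numpy.where applied to the index's own values; a small sketch (the resulting dtype depends on whether the replacement fits the original dtype):

    import pandas as pd

    idx = pd.Index([1, 2, 3, 4])

    # keep values where the condition holds, replace the rest
    print(idx.where(idx > 2, other=0))   # [0, 0, 3, 4], still integer

    # with the default replacement (NaN) the result can no longer stay integer
    print(idx.where(idx > 2))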
""" - to_concat_vals = [x._values for x in to_concat] + to_concat = [x._values if isinstance(x, Index) else x for x in to_concat] - result = concat_compat(to_concat_vals) + result = _concat.concat_compat(to_concat) return Index(result, name=name) def putmask(self, mask, value): @@ -4374,12 +4180,15 @@ class Index(IndexOpsMixin, PandasObject): See Also -------- - numpy.ndarray.putmask : Changes elements of an array - based on conditional and input values. + numpy.ndarray.putmask """ - values = self._values.copy() + values = self.values.copy() try: - converted = self._validate_fill_value(value) + np.putmask(values, mask, self._convert_for_op(value)) + if is_period_dtype(self.dtype): + # .values cast to object, so we need to cast back + values = type(self)(values)._data + return self._shallow_copy(values) except (ValueError, TypeError) as err: if is_object_dtype(self): raise err @@ -4387,10 +4196,7 @@ class Index(IndexOpsMixin, PandasObject): # coerces to object return self.astype(object).putmask(mask, value) - np.putmask(values, mask, converted) - return self._shallow_copy(values) - - def equals(self, other: object) -> bool: + def equals(self, other: Any) -> bool: """ Determine if two Index object are equal. @@ -4469,7 +4275,6 @@ class Index(IndexOpsMixin, PandasObject): return array_equivalent(self._values, other._values) - @final def identical(self, other) -> bool: """ Similar to equals, but checks that object attributes and types are also equal. @@ -4483,13 +4288,14 @@ class Index(IndexOpsMixin, PandasObject): return ( self.equals(other) and all( - getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables + ( + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables + ) ) and type(self) == type(other) ) - @final def asof(self, label): """ Return the label from the index, or, if not present, the previous one. @@ -4555,7 +4361,7 @@ class Index(IndexOpsMixin, PandasObject): loc = loc.indices(len(self))[-1] return self[loc] - def asof_locs(self, where: "Index", mask) -> np.ndarray: + def asof_locs(self, where, mask): """ Return the locations (indices) of labels in the index. @@ -4583,24 +4389,18 @@ class Index(IndexOpsMixin, PandasObject): which correspond to the return values of the `asof` function for every element in `where`. """ - locs = self._values[mask].searchsorted(where._values, side="right") + locs = self.values[mask].searchsorted(where.values, side="right") locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) - # TODO: overload return type of ExtensionArray.__getitem__ - first_value = cast(Any, self._values[mask.argmax()]) - result[(locs == 0) & (where._values < first_value)] = -1 + first = mask.argmax() + result[(locs == 0) & (where.values < self.values[first])] = -1 return result - @final def sort_values( - self, - return_indexer: bool = False, - ascending: bool = True, - na_position: str_t = "last", - key: Optional[Callable] = None, + self, return_indexer=False, ascending=True, key: Optional[Callable] = None ): """ Return a sorted copy of the index. @@ -4614,12 +4414,6 @@ class Index(IndexOpsMixin, PandasObject): Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. - na_position : {'first' or 'last'}, default 'last' - Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at - the end. - - .. 
versionadded:: 1.2.0 - key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -4660,16 +4454,9 @@ class Index(IndexOpsMixin, PandasObject): """ idx = ensure_key_mapped(self, key) - # GH 35584. Sort missing values according to na_position kwarg - # ignore na_position for MultiIndex - if not isinstance(self, ABCMultiIndex): - _as = nargsort( - items=idx, ascending=ascending, na_position=na_position, key=key - ) - else: - _as = idx.argsort() - if not ascending: - _as = _as[::-1] + _as = idx.argsort() + if not ascending: + _as = _as[::-1] sorted_index = self.take(_as) @@ -4678,7 +4465,6 @@ class Index(IndexOpsMixin, PandasObject): else: return sorted_index - @final def sort(self, *args, **kwargs): """ Use sort_values instead. @@ -4741,10 +4527,7 @@ class Index(IndexOpsMixin, PandasObject): '2012-03-01'], dtype='datetime64[ns]', freq='MS') """ - raise NotImplementedError( - f"This method is only implemented for DatetimeIndex, PeriodIndex and " - f"TimedeltaIndex; Got type {type(self).__name__}" - ) + raise NotImplementedError(f"Not supported for type {type(self).__name__}") def argsort(self, *args, **kwargs) -> np.ndarray: """ @@ -4781,15 +4564,13 @@ class Index(IndexOpsMixin, PandasObject): >>> idx[order] Index(['a', 'b', 'c', 'd'], dtype='object') """ - if needs_i8_conversion(self.dtype): - # TODO: these do not match the underlying EA argsort methods GH#37863 - return self.asi8.argsort(*args, **kwargs) + result = self.asi8 - # This works for either ndarray or EA, is overriden - # by RangeIndex, MultIIndex - return self._data.argsort(*args, **kwargs) + if result is None: + result = np.array(self) + + return result.argsort(*args, **kwargs) - @final def get_value(self, series: "Series", key): """ Fast lookup of value from 1-dimensional ndarray. @@ -4856,7 +4637,6 @@ class Index(IndexOpsMixin, PandasObject): return series.iloc[loc] - @final def set_value(self, arr, key, value): """ Fast lookup of value from 1-dimensional ndarray. @@ -4904,36 +4684,13 @@ class Index(IndexOpsMixin, PandasObject): @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ensure_index(target) - - if target.is_boolean() and self.is_numeric(): - # Treat boolean labels passed to a numeric index as not found. Without - # this fix False and True would be treated as 0 and 1 respectively. - # (GH #16877) - return self._get_indexer_non_comparable(target, method=None, unique=False) - pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if not self._should_compare(target): - return self._get_indexer_non_comparable(target, method=None, unique=False) - - if not is_dtype_equal(self.dtype, target.dtype): - # TODO: if object, could use infer_dtype to pre-empt costly - # conversion if still non-comparable? - dtype = find_common_type([self.dtype, target.dtype]) - if ( - dtype.kind in ["i", "u"] - and is_categorical_dtype(target.dtype) - and target.hasnans - ): - # FIXME: find_common_type incorrect with Categorical GH#38240 - # FIXME: some cases where float64 cast can be lossy? 
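The sort_values/argsort changes above keep the long-standing contract that return_indexer=True hands back the same ordering argsort computes; for example:

    import pandas as pd

    idx = pd.Index([10, 100, 1, 1000])

    sorted_idx, order = idx.sort_values(return_indexer=True)
    print(sorted_idx)            # [1, 10, 100, 1000]
    print(order)                 # [2 0 1 3]
    print(idx[idx.argsort()])    # same ordering as sorted_idx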
- dtype = np.dtype(np.float64) - - this = self.astype(dtype, copy=False) - that = target.astype(dtype, copy=False) - return this.get_indexer_non_unique(that) + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) @@ -4943,7 +4700,6 @@ class Index(IndexOpsMixin, PandasObject): indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), missing - @final def get_indexer_for(self, target, **kwargs): """ Guaranteed return of an indexer even when non-unique. @@ -4956,59 +4712,11 @@ class Index(IndexOpsMixin, PandasObject): numpy.ndarray List of indices. """ - if self._index_as_unique: + if self.is_unique: return self.get_indexer(target, **kwargs) - indexer, _ = self.get_indexer_non_unique(target) + indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer - def _get_indexer_non_comparable(self, target: "Index", method, unique: bool = True): - """ - Called from get_indexer or get_indexer_non_unique when the target - is of a non-comparable dtype. - - For get_indexer lookups with method=None, get_indexer is an _equality_ - check, so non-comparable dtypes mean we will always have no matches. - - For get_indexer lookups with a method, get_indexer is an _inequality_ - check, so non-comparable dtypes mean we will always raise TypeError. - - Parameters - ---------- - target : Index - method : str or None - unique : bool, default True - * True if called from get_indexer. - * False if called from get_indexer_non_unique. - - Raises - ------ - TypeError - If doing an inequality check, i.e. method is not None. - """ - if method is not None: - other = unpack_nested_dtype(target) - raise TypeError(f"Cannot compare dtypes {self.dtype} and {other.dtype}") - - no_matches = -1 * np.ones(target.shape, dtype=np.intp) - if unique: - # This is for get_indexer - return no_matches - else: - # This is for get_indexer_non_unique - missing = np.arange(len(target), dtype=np.intp) - return no_matches, missing - - @property - def _index_as_unique(self): - """ - Whether we should treat this as unique for the sake of - get_indexer vs get_indexer_non_unique. - - For IntervalIndex compat. - """ - return self.is_unique - - @final def _maybe_promote(self, other: "Index"): """ When dealing with an object-dtype Index and a non-object Index, see @@ -5033,21 +4741,12 @@ class Index(IndexOpsMixin, PandasObject): return self, other - def _should_compare(self, other: "Index") -> bool: - """ - Check if `self == other` can ever have non-False entries. - """ - other = unpack_nested_dtype(other) - dtype = other.dtype - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ return True - @final def groupby(self, values) -> PrettyDict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. @@ -5117,7 +4816,6 @@ class Index(IndexOpsMixin, PandasObject): return Index(new_values, **attributes) # TODO: De-duplicate with map, xref GH#32349 - @final def _transform_index(self, func, level=None) -> "Index": """ Apply function to all values found in index. 
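get_indexer_non_unique and get_indexer_for, modified above, are the non-unique counterparts of get_indexer; a short sketch of what they return:

    import pandas as pd

    idx = pd.Index(["a", "b", "a", "c"])

    # get_indexer_for works on non-unique indexes and returns every match
    print(idx.get_indexer_for(["a"]))            # [0 2]

    # get_indexer_non_unique also reports which target positions had no match
    indexer, missing = idx.get_indexer_non_unique(["a", "z"])
    print(indexer)   # [ 0  2 -1]
    print(missing)   # [1]  -> "z" (position 1 in the target) was not found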
@@ -5221,20 +4919,14 @@ class Index(IndexOpsMixin, PandasObject): """ if level is not None: self._validate_index_level(level) - return algos.isin(self._values, values) + return algos.isin(self, values) - def _get_string_slice(self, key: str_t): + def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = True): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError - def slice_indexer( - self, - start: Optional[Label] = None, - end: Optional[Label] = None, - step: Optional[int] = None, - kind: Optional[str_t] = None, - ) -> slice: + def slice_indexer(self, start=None, end=None, step=None, kind=None): """ Compute the slice indexer for input labels and step. @@ -5293,7 +4985,6 @@ class Index(IndexOpsMixin, PandasObject): return com.cast_scalar_indexer(key) return key - @final def _validate_indexer(self, form: str_t, key, kind: str_t): """ If we are positional indexer, validate that we have appropriate @@ -5306,7 +4997,7 @@ class Index(IndexOpsMixin, PandasObject): elif is_integer(key): pass else: - raise self._invalid_indexer(form, key) + self._invalid_indexer(form, key) def _maybe_cast_slice_bound(self, label, side: str_t, kind): """ @@ -5333,9 +5024,14 @@ class Index(IndexOpsMixin, PandasObject): # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes - # reject them, if index does not contain label - if (is_float(label) or is_integer(label)) and label not in self.values: - raise self._invalid_indexer("slice", label) + # reject them + if is_float(label): + self._invalid_indexer("slice", label) + + # we are trying to find integer bounds on a non-integer based index + # this is rejected (generally .loc gets you here) + elif is_integer(label): + self._invalid_indexer("slice", label) return label @@ -5584,7 +5280,7 @@ class Index(IndexOpsMixin, PandasObject): """ arr_dtype = "object" if self.dtype == "object" else None labels = com.index_labels_to_array(labels, dtype=arr_dtype) - indexer = self.get_indexer_for(labels) + indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): if errors != "ignore": @@ -5595,135 +5291,120 @@ class Index(IndexOpsMixin, PandasObject): # -------------------------------------------------------------------- # Generated Arithmetic, Comparison, and Unary Methods - def _cmp_method(self, other, op): + @classmethod + def _add_comparison_methods(cls): """ - Wrapper used to dispatch comparison operations. + Add in comparison methods. """ - if self.is_(other): - # fastpath - if op in {operator.eq, operator.le, operator.ge}: - arr = np.ones(len(self), dtype=bool) - if self._can_hold_na and not isinstance(self, ABCMultiIndex): - # TODO: should set MultiIndex._can_hold_na = False? 
- arr[self.isna()] = False - return arr - elif op in {operator.ne, operator.lt, operator.gt}: - return np.zeros(len(self), dtype=bool) + cls.__eq__ = _make_comparison_op(operator.eq, cls) + cls.__ne__ = _make_comparison_op(operator.ne, cls) + cls.__lt__ = _make_comparison_op(operator.lt, cls) + cls.__gt__ = _make_comparison_op(operator.gt, cls) + cls.__le__ = _make_comparison_op(operator.le, cls) + cls.__ge__ = _make_comparison_op(operator.ge, cls) - if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): - if len(self) != len(other): - raise ValueError("Lengths must match to compare") - - if not isinstance(other, ABCMultiIndex): - other = extract_array(other, extract_numpy=True) - else: - other = np.asarray(other) - - if is_object_dtype(self.dtype) and isinstance(other, ExtensionArray): - # e.g. PeriodArray, Categorical - with np.errstate(all="ignore"): - result = op(self._values, other) - - elif is_object_dtype(self.dtype) and not isinstance(self, ABCMultiIndex): - # don't pass MultiIndex - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) - - elif is_interval_dtype(self.dtype): - with np.errstate(all="ignore"): - result = op(self._values, np.asarray(other)) - - else: - with np.errstate(all="ignore"): - result = ops.comparison_op(self._values, other, op) - - return result - - def _arith_method(self, other, op): + @classmethod + def _add_numeric_methods_add_sub_disabled(cls): """ - Wrapper used to dispatch arithmetic operations. + Add in the numeric add/sub methods to disable. + """ + cls.__add__ = make_invalid_op("__add__") + cls.__radd__ = make_invalid_op("__radd__") + cls.__iadd__ = make_invalid_op("__iadd__") + cls.__sub__ = make_invalid_op("__sub__") + cls.__rsub__ = make_invalid_op("__rsub__") + cls.__isub__ = make_invalid_op("__isub__") + + @classmethod + def _add_numeric_methods_disabled(cls): + """ + Add in numeric methods to disable other than add/sub. + """ + cls.__pow__ = make_invalid_op("__pow__") + cls.__rpow__ = make_invalid_op("__rpow__") + cls.__mul__ = make_invalid_op("__mul__") + cls.__rmul__ = make_invalid_op("__rmul__") + cls.__floordiv__ = make_invalid_op("__floordiv__") + cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") + cls.__truediv__ = make_invalid_op("__truediv__") + cls.__rtruediv__ = make_invalid_op("__rtruediv__") + cls.__mod__ = make_invalid_op("__mod__") + cls.__divmod__ = make_invalid_op("__divmod__") + cls.__neg__ = make_invalid_op("__neg__") + cls.__pos__ = make_invalid_op("__pos__") + cls.__abs__ = make_invalid_op("__abs__") + cls.__inv__ = make_invalid_op("__inv__") + + @classmethod + def _add_numeric_methods_binary(cls): + """ + Add in numeric methods. + """ + cls.__add__ = _make_arithmetic_op(operator.add, cls) + cls.__radd__ = _make_arithmetic_op(ops.radd, cls) + cls.__sub__ = _make_arithmetic_op(operator.sub, cls) + cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls) + cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls) + cls.__pow__ = _make_arithmetic_op(operator.pow, cls) + + cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) + cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) + + # TODO: rmod? rdivmod? 
+ cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) + cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) + cls.__divmod__ = _make_arithmetic_op(divmod, cls) + cls.__mul__ = _make_arithmetic_op(operator.mul, cls) + cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) + + @classmethod + def _add_numeric_methods_unary(cls): + """ + Add in numeric unary methods. """ - from pandas import Series + def _make_evaluate_unary(op, opstr: str_t): + def _evaluate_numeric_unary(self): - result = op(Series(self), other) - if isinstance(result, tuple): - return (Index(result[0]), Index(result[1])) - return Index(result) + attrs = self._get_attributes_dict() + return Index(op(self.values), **attrs) - def _unary_method(self, op): - result = op(self._values) - return Index(result, name=self.name) + _evaluate_numeric_unary.__name__ = opstr + return _evaluate_numeric_unary - def __abs__(self): - return self._unary_method(operator.abs) + cls.__neg__ = _make_evaluate_unary(operator.neg, "__neg__") + cls.__pos__ = _make_evaluate_unary(operator.pos, "__pos__") + cls.__abs__ = _make_evaluate_unary(np.abs, "__abs__") + cls.__inv__ = _make_evaluate_unary(lambda x: -x, "__inv__") - def __neg__(self): - return self._unary_method(operator.neg) + @classmethod + def _add_numeric_methods(cls): + cls._add_numeric_methods_unary() + cls._add_numeric_methods_binary() - def __pos__(self): - return self._unary_method(operator.pos) - - def __inv__(self): - # TODO: why not operator.inv? - # TODO: __inv__ vs __invert__? - return self._unary_method(lambda x: -x) - - def any(self, *args, **kwargs): + @classmethod + def _add_logical_methods(cls): """ - Return whether any element is Truthy. + Add in logical methods. + """ + _doc = """ + %(desc)s Parameters ---------- *args - These parameters will be passed to numpy.any. + These parameters will be passed to numpy.%(outname)s. **kwargs - These parameters will be passed to numpy.any. + These parameters will be passed to numpy.%(outname)s. Returns ------- - any : bool or array_like (if axis is specified) - A single element array_like may be converted to bool. + %(outname)s : bool or array_like (if axis is specified) + A single element array_like may be converted to bool.""" - See Also - -------- - Index.all : Return whether all elements are True. - Series.all : Return whether all elements are True. - - Notes - ----- - Not a Number (NaN), positive infinity and negative infinity - evaluate to True because these are not equal to zero. - - Examples - -------- - >>> index = pd.Index([0, 1, 2]) - >>> index.any() - True - - >>> index = pd.Index([0, 0, 0]) - >>> index.any() - False - """ - # FIXME: docstr inaccurate, args/kwargs not passed - self._maybe_disable_logical_methods("any") - return np.any(self.values) - - def all(self): - """ - Return whether all elements are Truthy. - - Parameters - ---------- - *args - These parameters will be passed to numpy.all. - **kwargs - These parameters will be passed to numpy.all. - - Returns - ------- - all : bool or array_like (if axis is specified) - A single element array_like may be converted to bool. 
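The logical-method factory above wires Index.any/Index.all to their NumPy counterparts; in use:

    import pandas as pd

    print(pd.Index([0, 1, 2]).any())   # True  (at least one truthy value)
    print(pd.Index([0, 0, 0]).any())   # False
    print(pd.Index([1, 2, 3]).all())   # True
    print(pd.Index([0, 1, 2]).all())   # False (zero is falsy)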
+ _index_shared_docs["index_all"] = dedent( + """ See Also -------- @@ -5762,28 +5443,68 @@ class Index(IndexOpsMixin, PandasObject): >>> pd.Index([0, 0, 0]).any() False """ - # FIXME: docstr inaccurate, args/kwargs not passed + ) - self._maybe_disable_logical_methods("all") - return np.all(self.values) + _index_shared_docs["index_any"] = dedent( + """ - @final - def _maybe_disable_logical_methods(self, opname: str_t): + See Also + -------- + Index.all : Return whether all elements are True. + Series.all : Return whether all elements are True. + + Notes + ----- + Not a Number (NaN), positive infinity and negative infinity + evaluate to True because these are not equal to zero. + + Examples + -------- + >>> index = pd.Index([0, 1, 2]) + >>> index.any() + True + + >>> index = pd.Index([0, 0, 0]) + >>> index.any() + False """ - raise if this Index subclass does not support any or all. + ) + + def _make_logical_function(name: str_t, desc: str_t, f): + @Substitution(outname=name, desc=desc) + @Appender(_index_shared_docs["index_" + name]) + @Appender(_doc) + def logical_func(self, *args, **kwargs): + result = f(self.values) + if ( + isinstance(result, (np.ndarray, ABCSeries, Index)) + and result.ndim == 0 + ): + # return NumPy type + return result.dtype.type(result.item()) + else: # pragma: no cover + return result + + logical_func.__name__ = name + return logical_func + + cls.all = _make_logical_function( + "all", "Return whether all elements are True.", np.all + ) + cls.any = _make_logical_function( + "any", "Return whether any element is True.", np.any + ) + + @classmethod + def _add_logical_methods_disabled(cls): """ - if ( - isinstance(self, ABCMultiIndex) - or needs_i8_conversion(self.dtype) - or is_interval_dtype(self.dtype) - or is_categorical_dtype(self.dtype) - or is_float_dtype(self.dtype) - ): - # This call will raise - make_invalid_op(opname)(self) + Add in logical methods to disable. + """ + cls.all = make_invalid_op("all") + cls.any = make_invalid_op("any") @property - def shape(self) -> Shape: + def shape(self): """ Return a tuple of the shape of the underlying data. """ @@ -5793,6 +5514,11 @@ class Index(IndexOpsMixin, PandasObject): return self._values.shape +Index._add_numeric_methods_disabled() +Index._add_logical_methods() +Index._add_comparison_methods() + + def ensure_index_from_sequences(sequences, names=None): """ Construct an index from sequences of data. @@ -5833,9 +5559,7 @@ def ensure_index_from_sequences(sequences, names=None): return MultiIndex.from_arrays(sequences, names=names) -def ensure_index( - index_like: Union[AnyArrayLike, Sequence], copy: bool = False -) -> Index: +def ensure_index(index_like, copy: bool = False): """ Ensure that we have an index from some index-like object. 
@@ -5871,18 +5595,7 @@ def ensure_index( index_like = index_like.copy() return index_like if hasattr(index_like, "name"): - # https://github.com/python/mypy/issues/1424 - # error: Item "ExtensionArray" of "Union[ExtensionArray, - # Sequence[Any]]" has no attribute "name" [union-attr] - # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" - # has no attribute "name" [union-attr] - # error: "Sequence[Any]" has no attribute "name" [attr-defined] - # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no - # attribute "name" [union-attr] - # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no - # attribute "name" [union-attr] - name = index_like.name # type: ignore[union-attr, attr-defined] - return Index(index_like, name=name, copy=copy) + return Index(index_like, name=index_like.name, copy=copy) if is_iterator(index_like): index_like = list(index_like) @@ -5900,13 +5613,6 @@ def ensure_index( return MultiIndex.from_arrays(converted) else: - if isinstance(converted, np.ndarray) and converted.dtype == np.int64: - # Check for overflows if we should actually be uint64 - # xref GH#35481 - alt = np.asarray(index_like) - if alt.dtype == np.uint64: - converted = alt - index_like = converted else: # clean_index_list does the equivalent of copying @@ -5944,7 +5650,7 @@ def _validate_join_method(method: str): raise ValueError(f"do not recognize join method {method}") -def default_index(n: int) -> "RangeIndex": +def default_index(n): from pandas.core.indexes.range import RangeIndex return RangeIndex(0, n, name=None) @@ -6169,43 +5875,3 @@ def _maybe_asobject(dtype, klass, data, copy: bool, name: Label, **kwargs): return index.astype(object) return klass(data, dtype=dtype, copy=copy, name=name, **kwargs) - - -def get_unanimous_names(*indexes: Index) -> Tuple[Label, ...]: - """ - Return common name if all indices agree, otherwise None (level-by-level). - - Parameters - ---------- - indexes : list of Index objects - - Returns - ------- - list - A list representing the unanimous 'names' found. - """ - name_tups = [tuple(i.names) for i in indexes] - name_sets = [{*ns} for ns in zip_longest(*name_tups)] - names = tuple(ns.pop() if len(ns) == 1 else None for ns in name_sets) - return names - - -def unpack_nested_dtype(other: Index) -> Index: - """ - When checking if our dtype is comparable with another, we need - to unpack CategoricalDtype to look at its categories.dtype. - - Parameters - ---------- - other : Index - - Returns - ------- - Index - """ - dtype = other.dtype - if is_categorical_dtype(dtype): - # If there is ever a SparseIndex, this could get dispatched - # here too. 
- return dtype.categories - return other diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/category.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/category.py index 377fff5..8af6ee5 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/category.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/category.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional +from typing import Any, List import warnings import numpy as np @@ -6,28 +6,35 @@ import numpy as np from pandas._config import get_option from pandas._libs import index as libindex +from pandas._libs.hashtable import duplicated_int64 from pandas._libs.lib import no_default -from pandas._typing import ArrayLike, Label +from pandas._typing import Label from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_platform_int, is_categorical_dtype, + is_interval_dtype, + is_list_like, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor -from pandas.core.arrays.categorical import Categorical, contains +from pandas.core.algorithms import take_1d +from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories +import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name -from pandas.core.indexes.extension import NDArrayBackedExtensionIndex, inherit_names +from pandas.core.indexes.extension import ExtensionIndex, inherit_names import pandas.core.missing as missing +from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update({"target_klass": "CategoricalIndex"}) +_index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) @inherit_names( @@ -61,7 +68,7 @@ _index_doc_kwargs.update({"target_klass": "CategoricalIndex"}) typ="method", overwrite=True, ) -class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): +class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. 
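The class declaration above is where `CategoricalIndex` switches base classes in this diff; for orientation, a small sketch of what such an index is made of (integer codes plus a separate `categories` Index), independent of which base class backs it:

import pandas as pd

# A CategoricalIndex stores integer codes that point into a categories Index.
ci = pd.CategoricalIndex(["a", "b", "a"], categories=["a", "b", "c"])
print(ci.codes)        # [0 1 0]
print(ci.categories)   # Index(['a', 'b', 'c'], dtype='object')
print("c" in ci)       # False -- "c" is a category but not a value in the index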
@@ -156,14 +163,9 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): _typ = "categoricalindex" - @property - def _can_hold_strings(self): - return self.categories._can_hold_strings - codes: np.ndarray categories: Index _data: Categorical - _values: Categorical @property def _engine_type(self): @@ -210,6 +212,29 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): return cls._simple_new(data, name=name) + def _create_from_codes(self, codes, dtype=None, name=None): + """ + *this is an internal non-public method* + + create the correct categorical from codes + + Parameters + ---------- + codes : new codes + dtype: CategoricalDtype, defaults to existing + name : optional name attribute, defaults to existing + + Returns + ------- + CategoricalIndex + """ + if dtype is None: + dtype = self.dtype + if name is None: + name = self.name + cat = Categorical.from_codes(codes, dtype=dtype) + return CategoricalIndex(cat, name=name) + @classmethod def _simple_new(cls, values: Categorical, name: Label = None): assert isinstance(values, Categorical), type(values) @@ -220,74 +245,52 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): result._cache = {} result._reset_identity() + result._no_setting_name = False return result # -------------------------------------------------------------------- - # error: Argument 1 of "_shallow_copy" is incompatible with supertype - # "ExtensionIndex"; supertype defines the argument type as - # "Optional[ExtensionArray]" [override] @doc(Index._shallow_copy) - def _shallow_copy( # type:ignore[override] - self, - values: Optional[Categorical] = None, - name: Label = no_default, - ): + def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name if values is not None: - # In tests we only get here with Categorical objects that - # have matching .ordered, and values.categories a subset of - # our own. However we do _not_ have a dtype match in general. values = Categorical(values, dtype=self.dtype) return super()._shallow_copy(values=values, name=name) - def _is_dtype_compat(self, other) -> Categorical: + def _is_dtype_compat(self, other) -> bool: """ *this is an internal non-public method* provide a comparison between the dtype of self and other (coercing if needed) - Parameters - ---------- - other : Index - - Returns - ------- - Categorical - Raises ------ TypeError if the dtypes are not compatible """ if is_categorical_dtype(other): - other = extract_array(other) - if not other._categories_match_up_to_permutation(self): + if isinstance(other, CategoricalIndex): + other = other._values + if not other.is_dtype_equal(self): raise TypeError( "categories must match existing categories when appending" ) else: values = other - + if not is_list_like(values): + values = [values] cat = Categorical(other, dtype=self.dtype) other = CategoricalIndex(cat) if not other.isin(values).all(): raise TypeError( "cannot append a non-category item to a CategoricalIndex" ) - other = other._values - - if not ((other == values) | (isna(other) & isna(values))).all(): - # GH#37667 see test_equals_non_category - raise TypeError( - "categories must match existing categories when appending" - ) return other - def equals(self, other: object) -> bool: + def equals(self, other) -> bool: """ Determine if two CategoricalIndex objects contain the same elements. 
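`_is_dtype_compat`, rewritten above, is what `append` and `equals` lean on; a short sketch of the contract it enforces, reusing the error message visible in the hunk:

import pandas as pd

ci = pd.CategoricalIndex(["a", "b"], categories=["a", "b"])

# Values that are existing categories can be appended.
print(ci.append(pd.Index(["a"])))      # CategoricalIndex(['a', 'b', 'a'], ...)

# Values outside the categories are rejected.
try:
    ci.append(pd.Index(["z"]))
except TypeError as err:
    print(err)   # cannot append a non-category item to a CategoricalIndex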
@@ -305,10 +308,13 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): try: other = self._is_dtype_compat(other) + if isinstance(other, type(self)): + other = other._data + return self._data.equals(other) except (TypeError, ValueError): - return False + pass - return self._data.equals(other) + return False # -------------------------------------------------------------------- # Rendering Methods @@ -331,9 +337,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): "categories", ibase.default_pprint(self.categories, max_seq_items=max_categories), ), - # pandas\core\indexes\category.py:315: error: "CategoricalIndex" - # has no attribute "ordered" [attr-defined] - ("ordered", self.ordered), # type: ignore[attr-defined] + ("ordered", self.ordered), ] if self.name is not None: attrs.append(("name", ibase.default_pprint(self.name))) @@ -363,6 +367,11 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): """ return the underlying data, which is a Categorical """ return self._data + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return True + @doc(Index.__contains__) def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. @@ -373,14 +382,30 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): @doc(Index.astype) def astype(self, dtype, copy=True): - res_data = self._data.astype(dtype, copy=copy) - return Index(res_data, name=self.name) + if dtype is not None: + dtype = pandas_dtype(dtype) + + if is_interval_dtype(dtype): + from pandas import IntervalIndex + + return IntervalIndex(np.array(self)) + elif is_categorical_dtype(dtype): + # GH 18630 + dtype = self.dtype.update_dtype(dtype) + if dtype == self.dtype: + return self.copy() if copy else self + + return Index.astype(self, dtype=dtype, copy=copy) + + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + return self._data.codes == -1 @doc(Index.fillna) def fillna(self, value, downcast=None): - value = self._require_scalar(value) - cat = self._data.fillna(value) - return type(self)._simple_new(cat, name=self.name) + self._assert_can_do_op(value) + return CategoricalIndex(self._data.fillna(value), name=self.name) @cache_readonly def _engine(self): @@ -399,6 +424,32 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): # of result, not self. return type(self)._simple_new(result, name=self.name) + @doc(Index.duplicated) + def duplicated(self, keep="first"): + codes = self.codes.astype("i8") + return duplicated_int64(codes, keep) + + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.astype("object") + + def _maybe_cast_indexer(self, key): + code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) + return code + + @doc(Index.where) + def where(self, cond, other=None): + # TODO: Investigate an alternative implementation with + # 1. copy the underlying Categorical + # 2. setitem with `cond` and `other` + # 3. Rebuild CategoricalIndex. 
+ if other is None: + other = self._na_value + values = np.where(cond, self._values, other) + cat = Categorical(values, dtype=self.dtype) + return type(self)._simple_new(cat, name=self.name) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -450,8 +501,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): codes = new_target.codes.copy() codes[indexer == -1] = cats[missing] - cat = self._data._from_backing_data(codes) - new_target = type(self)._simple_new(cat, name=self.name) + new_target = self._create_from_codes(codes) # we always want to return an Index type here # to be consistent with .reindex for other index types (e.g. they don't @@ -460,8 +510,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) if is_categorical_dtype(target): - new_target = Categorical(new_target, dtype=target.dtype) - new_target = type(self)._simple_new(new_target, name=self.name) + new_target = target._shallow_copy(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -484,52 +533,59 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): if not (cats == -1).any(): # .reindex returns normal Index. Revert to CategoricalIndex if # all targets are included in my categories - new_target = Categorical(new_target, dtype=self.dtype) - new_target = type(self)._simple_new(new_target, name=self.name) + new_target = self._shallow_copy(new_target) return new_target, indexer, new_indexer - # -------------------------------------------------------------------- - # Indexing Methods - - def _maybe_cast_indexer(self, key) -> int: - return self._data._unbox_scalar(key) - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase.ensure_index(target) - self._check_indexing_method(method) - if self.is_unique and self.equals(target): return np.arange(len(self), dtype="intp") - return self._get_indexer_non_unique(target._values)[0] + if method == "pad" or method == "backfill": + raise NotImplementedError( + "method='pad' and method='backfill' not " + "implemented yet for CategoricalIndex" + ) + elif method == "nearest": + raise NotImplementedError( + "method='nearest' not implemented yet for CategoricalIndex" + ) + + if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): + if self._values.equals(target._values): + # we have the same codes + codes = target.codes + else: + codes = recode_for_categories( + target.codes, target.categories, self._values.categories + ) + else: + if isinstance(target, CategoricalIndex): + code_indexer = self.categories.get_indexer(target.categories) + codes = take_1d(code_indexer, target.codes, fill_value=-1) + else: + codes = self.categories.get_indexer(target) + + indexer, _ = self._engine.get_indexer_non_unique(codes) + return ensure_platform_int(indexer) @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - return self._get_indexer_non_unique(target._values) - def _get_indexer_non_unique(self, values: ArrayLike): - """ - get_indexer_non_unique but after unrapping the target Index object. 
- """ - # Note: we use engine.get_indexer_non_unique for get_indexer in addition - # to get_indexer_non_unique because, even if `target` is unique, any - # non-category entries in it will be encoded as -1 so `codes` may - # not be unique. - - if isinstance(values, Categorical): - # Indexing on codes is more efficient if categories are the same, - # so we can apply some optimizations based on the degree of - # dtype-matching. - cat = self._data._encode_with_my_categories(values) - codes = cat._codes - else: - codes = self.categories.get_indexer(values) + if isinstance(target, CategoricalIndex): + # Indexing on codes is more efficient if categories are the same: + if target.categories is self.categories: + target = target.codes + indexer, missing = self._engine.get_indexer_non_unique(target) + return ensure_platform_int(indexer), missing + target = target._values + codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing @@ -539,23 +595,29 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): # the categories if self.categories._defer_to_indexing: - # See tests.indexing.interval.test_interval:test_loc_getitem_frame indexer = self.categories._convert_list_indexer(keyarr) return Index(self.codes).get_indexer_for(indexer) - return self.get_indexer_for(keyarr) + indexer = self.categories.get_indexer(np.asarray(keyarr)) + if (indexer == -1).any(): + raise KeyError( + "a list-indexer must only include values that are in the categories" + ) - @doc(Index._maybe_cast_slice_bound) - def _maybe_cast_slice_bound(self, label, side: str, kind): - if kind == "loc": - return label + return self.get_indexer(keyarr) - return super()._maybe_cast_slice_bound(label, side, kind) + @doc(Index._convert_arr_indexer) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) - # -------------------------------------------------------------------- + if self.categories._defer_to_indexing: + return keyarr - def _is_comparable_dtype(self, dtype): - return self.categories._is_comparable_dtype(dtype) + return self._shallow_copy(keyarr) + + @doc(Index._convert_index_indexer) + def _convert_index_indexer(self, keyarr): + return self._shallow_copy(keyarr) def take_nd(self, *args, **kwargs): """Alias for `take`""" @@ -566,6 +628,13 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): ) return self.take(*args, **kwargs) + @doc(Index._maybe_cast_slice_bound) + def _maybe_cast_slice_bound(self, label, side, kind): + if kind == "loc": + return label + + return super()._maybe_cast_slice_bound(label, side, kind) + def map(self, mapper): """ Map values using input correspondence (a dict, Series, or function). 
@@ -636,19 +705,53 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): mapped = self._values.map(mapper) return Index(mapped, name=self.name) - def _concat(self, to_concat: List["Index"], name: Label) -> Index: - # if calling index is category, don't check dtype of others - try: - codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - except TypeError: - # not all to_concat elements are among our categories (or NA) - from pandas.core.dtypes.concat import concat_compat + def delete(self, loc): + """ + Make new Index with passed location(-s) deleted - res = concat_compat(to_concat) - return Index(res, name=name) - else: - cat = self._data._from_backing_data(codes) - return type(self)._simple_new(cat, name=name) + Returns + ------- + new_index : Index + """ + return self._create_from_codes(np.delete(self.codes, loc)) + + def insert(self, loc: int, item): + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + + Raises + ------ + ValueError if the item is not in the categories + + """ + code = self.categories.get_indexer([item]) + if (code == -1) and not (is_scalar(item) and isna(item)): + raise TypeError( + "cannot insert an item into a CategoricalIndex " + "that is not already an existing category" + ) + + codes = self.codes + codes = np.concatenate((codes[:loc], code, codes[loc:])) + return self._create_from_codes(codes) + + def _concat(self, to_concat, name): + # if calling index is category, don't check dtype of others + codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) + result = self._create_from_codes(codes, name=name) + # if name is None, _create_from_codes sets self.name + result.name = name + return result def _delegate_method(self, name: str, *args, **kwargs): """ method delegation to the ._values """ @@ -659,3 +762,14 @@ class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate): if is_scalar(res): return res return CategoricalIndex(res, name=self.name) + + def _wrap_joined_index( + self, joined: np.ndarray, other: "CategoricalIndex" + ) -> "CategoricalIndex": + name = get_op_result_name(self, other) + return self._create_from_codes(joined, name=name) + + +CategoricalIndex._add_numeric_methods_add_sub_disabled() +CategoricalIndex._add_numeric_methods_disabled() +CategoricalIndex._add_logical_methods_disabled() diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimelike.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimelike.py index f0d4d36..b30ef37 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimelike.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimelike.py @@ -2,19 +2,20 @@ Base and utility classes for tseries type pandas objects. 
""" from datetime import datetime -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, TypeVar, Union, cast +from typing import Any, List, Optional, TypeVar, Union, cast import numpy as np from pandas._libs import NaT, Timedelta, iNaT, join as libjoin, lib -from pandas._libs.tslibs import BaseOffset, Resolution, Tick +from pandas._libs.tslibs import BaseOffset, Resolution, Tick, timezones +from pandas._libs.tslibs.parsing import DateParseError from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( is_bool_dtype, - is_categorical_dtype, is_dtype_equal, is_integer, is_list_like, @@ -22,24 +23,26 @@ from pandas.core.dtypes.common import ( is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.base import IndexOpsMixin import pandas.core.common as com +from pandas.core.construction import array as pd_array, extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.extension import ( - NDArrayBackedExtensionIndex, + ExtensionIndex, inherit_names, make_wrapped_arith_op, ) from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name +from pandas.core.sorting import ensure_key_mapped from pandas.core.tools.timedeltas import to_timedelta -if TYPE_CHECKING: - from pandas import CategoricalIndex - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _T = TypeVar("_T", bound="DatetimeIndexOpsMixin") @@ -50,25 +53,18 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): Create the join wrapper methods. 
""" - # error: 'staticmethod' used with a non-method - @staticmethod # type: ignore[misc] + @staticmethod # type: ignore def wrapper(left, right): - # Note: these only get called with left.dtype == right.dtype - if isinstance( - left, (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin) - ): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): left = left.view("i8") - if isinstance( - right, - (np.ndarray, DatetimeIndexOpsMixin, ABCSeries, DatetimeLikeArrayMixin), - ): + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): right = right.view("i8") results = joinf(left, right) if with_indexers: # dtype should be timedelta64[ns] for TimedeltaIndex # and datetime64[ns] for DatetimeIndex - dtype = cast(np.dtype, left.dtype).base + dtype = left.dtype.base join_index, left_indexer, right_indexer = results join_index = join_index.view(dtype) @@ -79,65 +75,43 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): @inherit_names( - ["inferred_freq", "_resolution_obj", "resolution"], + ["inferred_freq", "_isnan", "_resolution_obj", "resolution"], DatetimeLikeArrayMixin, cache=True, ) -@inherit_names(["mean", "asi8", "freq", "freqstr"], DatetimeLikeArrayMixin) -class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): +@inherit_names( + ["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin, +) +class DatetimeIndexOpsMixin(ExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. """ - _can_hold_strings = False _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] - _data_cls: Union[Type[DatetimeArray], Type[TimedeltaArray], Type[PeriodArray]] freq: Optional[BaseOffset] freqstr: Optional[str] _resolution_obj: Resolution _bool_ops: List[str] = [] _field_ops: List[str] = [] - # error: "Callable[[Any], Any]" has no attribute "fget" - hasnans = cache_readonly( - DatetimeLikeArrayMixin._hasnans.fget # type: ignore[attr-defined] - ) + hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code - @classmethod - def _simple_new( - cls, - values: Union[DatetimeArray, TimedeltaArray, PeriodArray], - name: Label = None, - ): - assert isinstance(values, cls._data_cls), type(values) - - result = object.__new__(cls) - result._data = values - result._name = name - result._cache = {} - - # For groupby perf. See note in indexes/base about _index_data - result._index_data = values._data - - result._reset_identity() - return result - @property - def _is_all_dates(self) -> bool: + def is_all_dates(self) -> bool: return True # ------------------------------------------------------------------------ # Abstract data attributes @property - def values(self) -> np.ndarray: + def values(self): # Note: PeriodArray overrides this to return an ndarray of objects. return self._data._data def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc and other functions. + Gets called after a ufunc. """ result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result): @@ -151,35 +125,24 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): # ------------------------------------------------------------------------ - def equals(self, other: object) -> bool: + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. 
""" if self.is_(other): return True - if not isinstance(other, Index): - return False - elif other.dtype.kind in ["f", "i", "u", "c"]: + if not isinstance(other, ABCIndexClass): return False elif not isinstance(other, type(self)): - should_try = False - inferrable = self._data._infer_matches - if other.dtype == object: - should_try = other.inferred_type in inferrable - elif is_categorical_dtype(other.dtype): - other = cast("CategoricalIndex", other) - should_try = other.categories.inferred_type in inferrable - - if should_try: - try: - other = type(self)(other) - except (ValueError, TypeError, OverflowError): - # e.g. - # ValueError -> cannot parse str entry, or OutOfBoundsDatetime - # TypeError -> trying to convert IntervalIndex to DatetimeIndex - # OverflowError -> Index([very_large_timedeltas]) - return False + try: + other = type(self)(other) + except (ValueError, TypeError, OverflowError): + # e.g. + # ValueError -> cannot parse str entry, or OutOfBoundsDatetime + # TypeError -> trying to convert IntervalIndex to DatetimeIndex + # OverflowError -> Index([very_large_timedeltas]) + return False if not is_dtype_equal(self.dtype, other.dtype): # have different timezone @@ -198,20 +161,46 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res)) ) + def sort_values(self, return_indexer=False, ascending=True, key=None): + """ + Return sorted copy of Index. + """ + idx = ensure_key_mapped(self, key) + + _as = idx.argsort() + if not ascending: + _as = _as[::-1] + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) indices = np.asarray(indices, dtype=np.intp) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) + if isinstance(maybe_slice, slice): + return self[maybe_slice] - result = NDArrayBackedExtensionIndex.take( + return ExtensionIndex.take( self, indices, axis, allow_fill, fill_value, **kwargs ) - if isinstance(maybe_slice, slice): - freq = self._data._get_getitem_freq(maybe_slice) - result._data._freq = freq - return result + + @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") + def searchsorted(self, value, side="left", sorter=None): + if isinstance(value, str): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + if isinstance(value, Index): + value = value._data + + return self._data.searchsorted(value, side=side, sorter=sorter) _can_hold_na = True @@ -248,23 +237,23 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): return self._na_value i8 = self.asi8 - - if len(i8) and self.is_monotonic_increasing: + try: # quick check - if i8[0] != iNaT: - return self._data._box_func(i8[0]) + if len(i8) and self.is_monotonic: + if i8[0] != iNaT: + return self._box_func(i8[0]) - if self.hasnans: - if not skipna: - return self._na_value - i8 = i8[~self._isnan] - - if not len(i8): + if self.hasnans: + if skipna: + min_stamp = self[~self._isnan].asi8.min() + else: + return self._na_value + else: + min_stamp = i8.min() + return self._box_func(min_stamp) + except ValueError: return self._na_value - min_stamp = i8.min() - return self._data._box_func(min_stamp) - def argmin(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the minimum values along an axis. 
@@ -305,23 +294,23 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): return self._na_value i8 = self.asi8 - - if len(i8) and self.is_monotonic: + try: # quick check - if i8[-1] != iNaT: - return self._data._box_func(i8[-1]) + if len(i8) and self.is_monotonic: + if i8[-1] != iNaT: + return self._box_func(i8[-1]) - if self.hasnans: - if not skipna: - return self._na_value - i8 = i8[~self._isnan] - - if not len(i8): + if self.hasnans: + if skipna: + max_stamp = self[~self._isnan].asi8.max() + else: + return self._na_value + else: + max_stamp = i8.max() + return self._box_func(max_stamp) + except ValueError: return self._na_value - max_stamp = i8.max() - return self._data._box_func(max_stamp) - def argmax(self, axis=None, skipna=True, *args, **kwargs): """ Returns the indices of the maximum values along an axis. @@ -380,7 +369,7 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): @property def _formatter_func(self): - return self._data._formatter() + raise AbstractMethodError(self) def _format_attrs(self): """ @@ -395,6 +384,123 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): attrs.append(("freq", freq)) return attrs + # -------------------------------------------------------------------- + # Indexing Methods + + def _validate_partial_date_slice(self, reso: Resolution): + raise NotImplementedError + + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): + raise NotImplementedError + + def _partial_date_slice( + self, + reso: Resolution, + parsed: datetime, + use_lhs: bool = True, + use_rhs: bool = True, + ): + """ + Parameters + ---------- + reso : Resolution + parsed : datetime + use_lhs : bool, default True + use_rhs : bool, default True + + Returns + ------- + slice or ndarray[intp] + """ + self._validate_partial_date_slice(reso) + + t1, t2 = self._parsed_string_to_bounds(reso, parsed) + i8vals = self.asi8 + unbox = self._data._unbox_scalar + + if self.is_monotonic: + + if len(self) and ( + (use_lhs and t1 < self[0] and t2 < self[0]) + or ((use_rhs and t1 > self[-1] and t2 > self[-1])) + ): + # we are out of range + raise KeyError + + # TODO: does this depend on being monotonic _increasing_? 
+ + # a monotonic (sorted) series can be sliced + # Use asi8.searchsorted to avoid re-validating Periods/Timestamps + left = i8vals.searchsorted(unbox(t1), side="left") if use_lhs else None + right = i8vals.searchsorted(unbox(t2), side="right") if use_rhs else None + return slice(left, right) + + else: + lhs_mask = (i8vals >= unbox(t1)) if use_lhs else True + rhs_mask = (i8vals <= unbox(t2)) if use_rhs else True + + # try to find the dates + return (lhs_mask & rhs_mask).nonzero()[0] + + # -------------------------------------------------------------------- + # Arithmetic Methods + + __add__ = make_wrapped_arith_op("__add__") + __sub__ = make_wrapped_arith_op("__sub__") + __radd__ = make_wrapped_arith_op("__radd__") + __rsub__ = make_wrapped_arith_op("__rsub__") + __pow__ = make_wrapped_arith_op("__pow__") + __rpow__ = make_wrapped_arith_op("__rpow__") + __mul__ = make_wrapped_arith_op("__mul__") + __rmul__ = make_wrapped_arith_op("__rmul__") + __floordiv__ = make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") + __mod__ = make_wrapped_arith_op("__mod__") + __rmod__ = make_wrapped_arith_op("__rmod__") + __divmod__ = make_wrapped_arith_op("__divmod__") + __rdivmod__ = make_wrapped_arith_op("__rdivmod__") + __truediv__ = make_wrapped_arith_op("__truediv__") + __rtruediv__ = make_wrapped_arith_op("__rtruediv__") + + def isin(self, values, level=None): + """ + Compute boolean array of whether each index value is found in the + passed set of values. + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + if level is not None: + self._validate_index_level(level) + + if not isinstance(values, type(self)): + try: + values = type(self)(values) + except ValueError: + return self.astype(object).isin(values) + + return algorithms.isin(self.asi8, values.asi8) + + @Appender(Index.where.__doc__) + def where(self, cond, other=None): + values = self.view("i8") + + try: + other = self._data._validate_where_value(other) + except (TypeError, ValueError) as err: + # Includes tzawareness mismatch and IncompatibleFrequencyError + oth = getattr(other, "dtype", other) + raise TypeError(f"Where requires matching dtype, not {oth}") from err + + result = np.where(cond, values, other).astype("i8") + arr = type(self._data)._simple_new(result, dtype=self.dtype) + return type(self)._simple_new(arr, name=self.name) + def _summary(self, name=None) -> str: """ Return a summarized representation. @@ -425,78 +531,6 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): result = result.replace("'", "") return result - # -------------------------------------------------------------------- - # Indexing Methods - - def _validate_partial_date_slice(self, reso: Resolution): - raise NotImplementedError - - def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): - raise NotImplementedError - - def _partial_date_slice( - self, - reso: Resolution, - parsed: datetime, - ): - """ - Parameters - ---------- - reso : Resolution - parsed : datetime - - Returns - ------- - slice or ndarray[intp] - """ - self._validate_partial_date_slice(reso) - - t1, t2 = self._parsed_string_to_bounds(reso, parsed) - vals = self._data._ndarray - unbox = self._data._unbox - - if self.is_monotonic_increasing: - - if len(self) and ( - (t1 < self[0] and t2 < self[0]) or (t1 > self[-1] and t2 > self[-1]) - ): - # we are out of range - raise KeyError - - # TODO: does this depend on being monotonic _increasing_? 
- - # a monotonic (sorted) series can be sliced - left = vals.searchsorted(unbox(t1), side="left") - right = vals.searchsorted(unbox(t2), side="right") - return slice(left, right) - - else: - lhs_mask = vals >= unbox(t1) - rhs_mask = vals <= unbox(t2) - - # try to find the dates - return (lhs_mask & rhs_mask).nonzero()[0] - - # -------------------------------------------------------------------- - # Arithmetic Methods - - __add__ = make_wrapped_arith_op("__add__") - __sub__ = make_wrapped_arith_op("__sub__") - __radd__ = make_wrapped_arith_op("__radd__") - __rsub__ = make_wrapped_arith_op("__rsub__") - __pow__ = make_wrapped_arith_op("__pow__") - __rpow__ = make_wrapped_arith_op("__rpow__") - __mul__ = make_wrapped_arith_op("__mul__") - __rmul__ = make_wrapped_arith_op("__rmul__") - __floordiv__ = make_wrapped_arith_op("__floordiv__") - __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") - __mod__ = make_wrapped_arith_op("__mod__") - __rmod__ = make_wrapped_arith_op("__rmod__") - __divmod__ = make_wrapped_arith_op("__divmod__") - __rdivmod__ = make_wrapped_arith_op("__rdivmod__") - __truediv__ = make_wrapped_arith_op("__truediv__") - __rtruediv__ = make_wrapped_arith_op("__rtruediv__") - def shift(self, periods=1, freq=None): """ Shift index by desired number of time frequency increments. @@ -535,95 +569,58 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): # -------------------------------------------------------------------- # List-like Methods - def _get_delete_freq(self, loc: int): - """ - Find the `freq` for self.delete(loc). - """ - freq = None - if is_period_dtype(self.dtype): - freq = self.freq - elif self.freq is not None: - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice( - np.asarray(loc, dtype=np.intp), len(self) - ) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - return freq - - def _get_insert_freq(self, loc, item): - """ - Find the `freq` for self.insert(loc, item). 
- """ - value = self._data._validate_scalar(item) - item = self._data._box_func(value) - - freq = None - if is_period_dtype(self.dtype): - freq = self.freq - elif self.freq is not None: - # freq can be preserved on edge cases - if self.size: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - else: - # Adding a single item to an empty index may preserve freq - if self.freq.is_on_offset(item): - freq = self.freq - return freq - - @doc(NDArrayBackedExtensionIndex.delete) def delete(self, loc): - result = super().delete(loc) - result._data._freq = self._get_delete_freq(loc) - return result + new_i8s = np.delete(self.asi8, loc) - @doc(NDArrayBackedExtensionIndex.insert) - def insert(self, loc: int, item): - result = super().insert(loc, item) + freq = None + if is_period_dtype(self.dtype): + freq = self.freq + elif is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + loc = lib.maybe_indices_to_slice( + np.asarray(loc, dtype=np.intp), len(self) + ) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq - result._data._freq = self._get_insert_freq(loc, item) - return result + arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) + return type(self)._simple_new(arr, name=self.name) # -------------------------------------------------------------------- # Join/Set Methods - def _can_union_without_object_cast(self, other) -> bool: - return is_dtype_equal(self.dtype, other.dtype) + def _wrap_joined_index(self, joined: np.ndarray, other): + assert other.dtype == self.dtype, (other.dtype, self.dtype) + name = get_op_result_name(self, other) - def _get_join_freq(self, other): - """ - Get the freq to attach to the result of a join operation. 
- """ if is_period_dtype(self.dtype): freq = self.freq else: self = cast(DatetimeTimedeltaMixin, self) freq = self.freq if self._can_fast_union(other) else None - return freq + new_data = type(self._data)._simple_new(joined, dtype=self.dtype, freq=freq) - def _wrap_joined_index(self, joined: np.ndarray, other): - assert other.dtype == self.dtype, (other.dtype, self.dtype) - - result = super()._wrap_joined_index(joined, other) - result._data._freq = self._get_join_freq(other) - return result + return type(self)._simple_new(new_data, name=name) @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): - try: - return self._data._validate_listlike(keyarr, allow_object=True) - except (ValueError, TypeError): - return com.asarray_tuplesafe(keyarr) + if lib.infer_dtype(keyarr) == "string": + # Weak reasoning that indexer is a list of strings + # representing datetime or timedelta or period + try: + extension_arr = pd_array(keyarr, self.dtype) + except (ValueError, DateParseError): + # Fail to infer keyarr from self.dtype + return keyarr + + converted_arr = extract_array(extension_arr, extract_numpy=True) + else: + converted_arr = com.asarray_tuplesafe(keyarr) + return converted_arr class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): @@ -641,13 +638,20 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): arr = self._data._with_freq(freq) return type(self)._simple_new(arr, name=self.name) - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return False + def _shallow_copy(self, values=None, name: Label = lib.no_default): + name = self.name if name is lib.no_default else name + cache = self._cache.copy() if values is None else {} - def is_type_compatible(self, kind: str) -> bool: - return kind in self._data._infer_matches + if values is None: + values = self._data + + if isinstance(values, np.ndarray): + # TODO: We would rather not get here + values = type(self._data)(values, dtype=self.dtype) + + result = type(self)._simple_new(values, name=name) + result._cache = cache + return result # -------------------------------------------------------------------- # Set Operation Methods @@ -686,35 +690,33 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + res_name = get_op_result_name(self, other) if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) - return self._intersection(other, sort=sort) - - def _intersection(self, other: Index, sort=False) -> Index: - """ - intersection specialized to the case with matching dtypes. - """ if len(self) == 0: - return self.copy()._get_reconciled_name_object(other) + return self.copy() if len(other) == 0: - return other.copy()._get_reconciled_name_object(self) + return other.copy() if not isinstance(other, type(self)): result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + # TODO: no tests rely on this; needed? 
+ result = result._with_freq("infer") + result.name = res_name return result elif not self._can_fast_intersect(other): - result = Index._intersection(self, other, sort=sort) - # We need to invalidate the freq because Index._intersection + result = Index.intersection(self, other, sort=sort) + # We need to invalidate the freq because Index.intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. - result = self._wrap_setop_result(other, result) - return result._with_freq(None)._with_freq("infer") + result = result._with_freq(None)._with_freq("infer") + result.name = res_name + return result # to make our life easier, "sort" the two ranges if self[0] <= other[0]: @@ -728,16 +730,11 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): start = right[0] if end < start: - result = self[:0] + return type(self)(data=[], dtype=self.dtype, freq=self.freq, name=res_name) else: lslice = slice(*left.slice_locs(start, end)) left_chunk = left._values[lslice] - # error: Argument 1 to "_simple_new" of "DatetimeIndexOpsMixin" has - # incompatible type "Union[ExtensionArray, Any]"; expected - # "Union[DatetimeArray, TimedeltaArray, PeriodArray]" [arg-type] - result = type(self)._simple_new(left_chunk) # type: ignore[arg-type] - - return self._wrap_setop_result(other, result) + return type(self)._simple_new(left_chunk, name=res_name) def _can_fast_intersect(self: _T, other: _T) -> bool: if self.freq is None: @@ -832,7 +829,7 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): # The can_fast_union check ensures that the result.freq # should match self.freq dates = type(self._data)(dates, freq=self.freq) - result = type(self)._simple_new(dates) + result = type(self)._simple_new(dates, name=self.name) return result else: return left @@ -857,14 +854,10 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): result = result._with_freq("infer") return result else: - i8self = Int64Index._simple_new(self.asi8) - i8other = Int64Index._simple_new(other.asi8) + i8self = Int64Index._simple_new(self.asi8, name=self.name) + i8other = Int64Index._simple_new(other.asi8, name=other.name) i8result = i8self._union(i8other, sort=sort) - # pandas\core\indexes\datetimelike.py:887: error: Unexpected - # keyword argument "freq" for "DatetimeTimedeltaMixin" [call-arg] - result = type(self)( - i8result, dtype=self.dtype, freq="infer" # type: ignore[call-arg] - ) + result = type(self)(i8result, dtype=self.dtype, freq="infer") return result # -------------------------------------------------------------------- @@ -884,11 +877,11 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): """ See Index.join """ - pself, pother = self._maybe_promote(other) - if pself is not self or pother is not other: - return pself.join( - pother, how=how, level=level, return_indexers=return_indexers, sort=sort - ) + if self._is_convertible_to_index_for_join(other): + try: + other = type(self)(other) + except (TypeError, ValueError): + pass this, other = self._maybe_utc_convert(other) return Index.join( @@ -900,18 +893,84 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): sort=sort, ) - def _maybe_utc_convert(self: _T, other: Index) -> Tuple[_T, Index]: - # Overridden by DatetimeIndex - return self, other + def _maybe_utc_convert(self, other): + this = self + if not hasattr(self, "tz"): + return this, other + + if isinstance(other, type(self)): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive 
with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other + + @classmethod + def _is_convertible_to_index_for_join(cls, other: Index) -> bool: + """ + return a boolean whether I can attempt conversion to a + DatetimeIndex/TimedeltaIndex + """ + if isinstance(other, cls): + return False + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "integer-na", + "mixed-integer-float", + "mixed", + ): + return True + return False # -------------------------------------------------------------------- # List-Like Methods - @Appender(DatetimeIndexOpsMixin.insert.__doc__) def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. + + Returns + ------- + new_index : Index + """ if isinstance(item, str): # TODO: Why are strings special? # TODO: Should we attempt _scalar_from_string? return self.astype(object).insert(loc, item) - return DatetimeIndexOpsMixin.insert(self, loc, item) + item = self._data._validate_insert_value(item) + + freq = None + # check freq can be preserved on edge cases + if self.freq is not None: + if self.size: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + else: + # Adding a single item to an empty index may preserve freq + if self.freq.is_on_offset(item): + freq = self.freq + + item = self._data._unbox_scalar(item) + + new_i8s = np.concatenate([self[:loc].asi8, [item], self[loc:].asi8]) + arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) + return type(self)._simple_new(arr, name=self.name) diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimes.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimes.py index 8329c41..6d2e592 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimes.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/datetimes.py @@ -1,6 +1,6 @@ from datetime import date, datetime, time, timedelta, tzinfo import operator -from typing import TYPE_CHECKING, Optional, Tuple +from typing import Optional import warnings import numpy as np @@ -14,28 +14,28 @@ from pandas._libs.tslibs import ( to_offset, ) from pandas._libs.tslibs.offsets import prefix_mapping -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( DT64NS_DTYPE, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_float, + is_integer, is_scalar, ) from pandas.core.dtypes.missing import is_valid_nat_for_dtype from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype import pandas.core.common as com -from pandas.core.indexes.base import Index, get_unanimous_names, maybe_extract_name +from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin from pandas.core.indexes.extension import inherit_names from pandas.core.tools.times import to_time -if TYPE_CHECKING: - from 
pandas import DataFrame, Float64Index, PeriodIndex, TimedeltaIndex - def _new_DatetimeIndex(cls, d): """ @@ -70,11 +70,12 @@ def _new_DatetimeIndex(cls, d): @inherit_names( - DatetimeArray._field_ops + ["to_perioddelta", "to_julian_date", "strftime", "isocalendar"] + + DatetimeArray._field_ops + [ method for method in DatetimeArray._datetimelike_methods - if method not in ("tz_localize", "tz_convert") + if method not in ("tz_localize",) ], DatetimeArray, wrap=True, @@ -96,7 +97,6 @@ def _new_DatetimeIndex(cls, d): "date", "time", "timetz", - "std", ] + DatetimeArray._bool_ops, DatetimeArray, @@ -162,11 +162,9 @@ class DatetimeIndex(DatetimeTimedeltaMixin): time timetz dayofyear - day_of_year weekofyear week dayofweek - day_of_week weekday quarter tz @@ -199,7 +197,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): month_name day_name mean - std See Also -------- @@ -217,7 +214,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _typ = "datetimeindex" - _data_cls = DatetimeArray _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True @@ -227,21 +223,10 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _is_numeric_dtype = False _data: DatetimeArray - inferred_freq: Optional[str] tz: Optional[tzinfo] # -------------------------------------------------------------------- - # methods that dispatch to DatetimeArray and wrap result - - @doc(DatetimeArray.strftime) - def strftime(self, date_format) -> Index: - arr = self._data.strftime(date_format) - return Index(arr, name=self.name) - - @doc(DatetimeArray.tz_convert) - def tz_convert(self, tz) -> "DatetimeIndex": - arr = self._data.tz_convert(tz) - return type(self)._simple_new(arr, name=self.name) + # methods that dispatch to array and wrap result in DatetimeIndex @doc(DatetimeArray.tz_localize) def tz_localize( @@ -251,30 +236,9 @@ class DatetimeIndex(DatetimeTimedeltaMixin): return type(self)._simple_new(arr, name=self.name) @doc(DatetimeArray.to_period) - def to_period(self, freq=None) -> "PeriodIndex": - from pandas.core.indexes.api import PeriodIndex - + def to_period(self, freq=None) -> "DatetimeIndex": arr = self._data.to_period(freq) - return PeriodIndex._simple_new(arr, name=self.name) - - @doc(DatetimeArray.to_perioddelta) - def to_perioddelta(self, freq) -> "TimedeltaIndex": - from pandas.core.indexes.api import TimedeltaIndex - - arr = self._data.to_perioddelta(freq) - return TimedeltaIndex._simple_new(arr, name=self.name) - - @doc(DatetimeArray.to_julian_date) - def to_julian_date(self) -> "Float64Index": - from pandas.core.indexes.api import Float64Index - - arr = self._data.to_julian_date() - return Float64Index._simple_new(arr, name=self.name) - - @doc(DatetimeArray.isocalendar) - def isocalendar(self) -> "DataFrame": - df = self._data.isocalendar() - return df.set_index(self) + return type(self)._simple_new(arr, name=self.name) # -------------------------------------------------------------------- # Constructors @@ -304,7 +268,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): name = maybe_extract_name(name, data, cls) - dtarr = DatetimeArray._from_sequence_not_strict( + dtarr = DatetimeArray._from_sequence( data, dtype=dtype, copy=copy, @@ -318,6 +282,20 @@ class DatetimeIndex(DatetimeTimedeltaMixin): subarr = cls._simple_new(dtarr, name=name) return subarr + @classmethod + def _simple_new(cls, values: DatetimeArray, name: Label = None): + assert isinstance(values, DatetimeArray), type(values) + + result = object.__new__(cls) + result._data = values + result.name = name + result._cache = {} + 
result._no_setting_name = False + # For groupby perf. See note in indexes/base about _index_data + result._index_data = values._data + result._reset_identity() + return result + # -------------------------------------------------------------------- @cache_readonly @@ -329,29 +307,33 @@ class DatetimeIndex(DatetimeTimedeltaMixin): ------- bool """ - from pandas.io.formats.format import is_dates_only + from pandas.io.formats.format import _is_dates_only - return self.tz is None and is_dates_only(self._values) + return self.tz is None and _is_dates_only(self._values) def __reduce__(self): # we use a special reduce here because we need # to simply set the .tz (and not reinterpret it) - d = {"data": self._data} + d = dict(data=self._data) d.update(self._get_attributes_dict()) return _new_DatetimeIndex, (type(self), d), None - def _validate_fill_value(self, value): + def _convert_for_op(self, value): """ Convert value to be insertable to ndarray. """ - return self._data._validate_setitem_value(value) + if self._has_same_tz(value): + return Timestamp(value).asm8 + raise ValueError("Passed item and index have different timezone") def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ + if not is_datetime64_any_dtype(dtype): + return False if self.tz is not None: # If we have tz, we can compare to tzaware return is_datetime64tz_dtype(dtype) @@ -367,10 +349,10 @@ class DatetimeIndex(DatetimeTimedeltaMixin): @property def _formatter_func(self): - from pandas.io.formats.format import get_format_datetime64 + from pandas.io.formats.format import _get_format_datetime64 - formatter = get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: f"'{formatter(x)}'" + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- # Set Operation Methods @@ -398,27 +380,8 @@ class DatetimeIndex(DatetimeTimedeltaMixin): this = this._fast_union(other) else: this = Index.union(this, other) - - res_name = get_unanimous_names(self, *others)[0] - if this.name != res_name: - return this.rename(res_name) return this - def _maybe_utc_convert(self, other: Index) -> Tuple["DatetimeIndex", Index]: - this = self - - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other - # -------------------------------------------------------------------- def _get_time_micros(self): @@ -429,7 +392,9 @@ class DatetimeIndex(DatetimeTimedeltaMixin): ------- ndarray[int64_t] """ - values = self._data._local_timestamps() + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._data._local_timestamps() nanos = values % (24 * 3600 * 1_000_000_000) micros = nanos // 1000 @@ -539,9 +504,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): dta = DatetimeArray(snapped, dtype=self.dtype) return DatetimeIndex._simple_new(dta, name=self.name) - # -------------------------------------------------------------------- - # Indexing Methods - def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): """ Calculate datetime bounds for parsed time string and its 
resolution. @@ -612,28 +574,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # _parsed_string_to_bounds allows it. raise KeyError - def _deprecate_mismatched_indexing(self, key): - # GH#36148 - # we get here with isinstance(key, self._data._recognized_scalars) - try: - self._data._assert_tzawareness_compat(key) - except TypeError: - if self.tz is None: - msg = ( - "Indexing a timezone-naive DatetimeIndex with a " - "timezone-aware datetime is deprecated and will " - "raise KeyError in a future version. " - "Use a timezone-naive object instead." - ) - else: - msg = ( - "Indexing a timezone-aware DatetimeIndex with a " - "timezone-naive datetime is deprecated and will " - "raise KeyError in a future version. " - "Use a timezone-aware object instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=5) - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -651,7 +591,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): if isinstance(key, self._data._recognized_scalars): # needed to localize naive datetimes - self._deprecate_mismatched_indexing(key) key = self._maybe_cast_for_get_loc(key) elif isinstance(key, str): @@ -688,7 +627,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): raise KeyError(orig_key) from err def _maybe_cast_for_get_loc(self, key) -> Timestamp: - # needed to localize naive datetimes or dates (GH 35690) + # needed to localize naive datetimes key = Timestamp(key) if key.tzinfo is None: key = key.tz_localize(self.tz) @@ -716,13 +655,12 @@ class DatetimeIndex(DatetimeTimedeltaMixin): """ assert kind in ["loc", "getitem", None] + if is_float(label) or isinstance(label, time) or is_integer(label): + self._invalid_indexer("slice", label) + if isinstance(label, str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) - try: - parsed, reso = parsing.parse_time_string(label, freq) - except parsing.DateParseError as err: - raise self._invalid_indexer("slice", label) from err - + parsed, reso = parsing.parse_time_string(label, freq) reso = Resolution.from_attrname(reso) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: @@ -734,18 +672,14 @@ class DatetimeIndex(DatetimeTimedeltaMixin): if self._is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == "left" else lower return lower if side == "left" else upper - elif isinstance(label, (self._data._recognized_scalars, date)): - self._deprecate_mismatched_indexing(label) else: - raise self._invalid_indexer("slice", label) + return label - return self._maybe_cast_for_get_loc(label) - - def _get_string_slice(self, key: str): + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) parsed, reso = parsing.parse_time_string(key, freq) reso = Resolution.from_attrname(reso) - loc = self._partial_date_slice(reso, parsed) + loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc def slice_indexer(self, start=None, end=None, step=None, kind=None): @@ -789,26 +723,15 @@ class DatetimeIndex(DatetimeTimedeltaMixin): if (start is None or isinstance(start, str)) and ( end is None or isinstance(end, str) ): - mask = np.array(True) - deprecation_mask = np.array(True) + mask = True if start is not None: start_casted = self._maybe_cast_slice_bound(start, "left", kind) mask = start_casted <= self - deprecation_mask = start_casted == self if end is not None: end_casted = 
self._maybe_cast_slice_bound(end, "right", kind) mask = (self <= end_casted) & mask - deprecation_mask = (end_casted == self) | deprecation_mask - if not deprecation_mask.any(): - warnings.warn( - "Value based partial slicing on non-monotonic DatetimeIndexes " - "with non-existing keys is deprecated and will raise a " - "KeyError in a future Version.", - FutureWarning, - stacklevel=5, - ) indexer = mask.nonzero()[0][::step] if len(indexer) == len(self): return slice(None) @@ -819,6 +742,9 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # -------------------------------------------------------------------- + def is_type_compatible(self, typ) -> bool: + return typ == self.inferred_type or typ == "datetime" + @property def inferred_type(self) -> str: # b/c datetime is represented as microseconds since the epoch, make @@ -916,6 +842,10 @@ class DatetimeIndex(DatetimeTimedeltaMixin): return mask.nonzero()[0] +DatetimeIndex._add_numeric_methods_disabled() +DatetimeIndex._add_logical_methods_disabled() + + def date_range( start=None, end=None, diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/extension.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/extension.py index 92bd82f..c9367b7 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/extension.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/extension.py @@ -1,12 +1,10 @@ """ Shared methods for Index subclasses backed by ExtensionArray. """ -from typing import List, Optional, TypeVar +from typing import List import numpy as np -from pandas._libs import lib -from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc @@ -15,13 +13,10 @@ from pandas.core.dtypes.common import is_dtype_equal, is_object_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.arrays import ExtensionArray -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.base import Index from pandas.core.ops import get_op_result_name -_T = TypeVar("_T", bound="NDArrayBackedExtensionIndex") - def inherit_from_data(name: str, delegate, cache: bool = False, wrap: bool = False): """ @@ -213,24 +208,6 @@ class ExtensionIndex(Index): __le__ = _make_wrapped_comparison_op("__le__") __ge__ = _make_wrapped_comparison_op("__ge__") - @doc(Index._shallow_copy) - def _shallow_copy( - self, values: Optional[ExtensionArray] = None, name: Label = lib.no_default - ): - name = self.name if name is lib.no_default else name - - if values is not None: - return self._simple_new(values, name=name) - - result = self._simple_new(self._data, name=name) - result._cache = self._cache - return result - - @property - def _has_complex_internals(self) -> bool: - # used to avoid libreduction code paths, which raise or require conversion - return True - # --------------------------------------------------------------------- # NDarray-Like Methods @@ -240,42 +217,24 @@ class ExtensionIndex(Index): if result.ndim == 1: return type(self)(result, name=self.name) # Unpack to ndarray for MPL compat - # pandas\core\indexes\extension.py:220: error: "ExtensionArray" has - # no attribute "_data" [attr-defined] - result = result._data # type: ignore[attr-defined] + result = result._data # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) return result - def searchsorted(self, value, 
side="left", sorter=None) -> np.ndarray: - # overriding IndexOpsMixin improves performance GH#38083 - return self._data.searchsorted(value, side=side, sorter=sorter) - # --------------------------------------------------------------------- - def _check_indexing_method(self, method): - """ - Raise if we have a get_indexer `method` that is not supported or valid. - """ - # GH#37871 for now this is only for IntervalIndex and CategoricalIndex - if method is None: - return - - if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: - raise NotImplementedError( - f"method {method} not yet implemented for {type(self).__name__}" - ) - - raise ValueError("Invalid fill method") - def _get_engine_target(self) -> np.ndarray: - return np.asarray(self._data) + # NB: _values_for_argsort happens to match the desired engine targets + # for all of our existing EA-backed indexes, but in general + # cannot be relied upon to exist. + return self._data._values_for_argsort() def repeat(self, repeats, axis=None): - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) result = self._data.repeat(repeats, axis=axis) - return type(self)._simple_new(result, name=self.name) + return self._shallow_copy(result) def insert(self, loc: int, item): # ExtensionIndex subclasses must override Index.insert @@ -318,85 +277,3 @@ class ExtensionIndex(Index): # pass copy=False because any copying will be done in the # _data.astype call above return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - - @cache_readonly - def _isnan(self) -> np.ndarray: - return self._data.isna() - - @doc(Index.equals) - def equals(self, other) -> bool: - # Dispatch to the ExtensionArray's .equals method. - if self.is_(other): - return True - - if not isinstance(other, type(self)): - return False - - return self._data.equals(other._data) - - -class NDArrayBackedExtensionIndex(ExtensionIndex): - """ - Index subclass for indexes backed by NDArrayBackedExtensionArray. - """ - - _data: NDArrayBackedExtensionArray - - def _get_engine_target(self) -> np.ndarray: - return self._data._ndarray - - def delete(self, loc): - """ - Make new Index with passed location(-s) deleted - - Returns - ------- - new_index : Index - """ - new_vals = np.delete(self._data._ndarray, loc) - arr = self._data._from_backing_data(new_vals) - return type(self)._simple_new(arr, name=self.name) - - def insert(self, loc: int, item): - """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values. - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - new_index : Index - - Raises - ------ - ValueError if the item is not valid for this dtype. 
- """ - arr = self._data - code = arr._validate_scalar(item) - - new_vals = np.concatenate((arr._ndarray[:loc], [code], arr._ndarray[loc:])) - new_arr = arr._from_backing_data(new_vals) - return type(self)._simple_new(new_arr, name=self.name) - - @doc(Index.where) - def where(self, cond, other=None): - res_values = self._data.where(cond, other) - return type(self)._simple_new(res_values, name=self.name) - - def putmask(self, mask, value): - res_values = self._data.copy() - try: - res_values.putmask(mask, value) - except (TypeError, ValueError): - return self.astype(object).putmask(mask, value) - - return type(self)._simple_new(res_values, name=self.name) - - def _wrap_joined_index(self: _T, joined: np.ndarray, other: _T) -> _T: - name = get_op_result_name(self, other) - arr = self._data._from_backing_data(joined) - return type(self)._simple_new(arr, name=name) diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/frozen.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/frozen.py index 8c4437f..909643d 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/frozen.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/frozen.py @@ -103,7 +103,5 @@ class FrozenList(PandasObject, list): def __repr__(self) -> str: return f"{type(self).__name__}({str(self)})" - __setitem__ = __setslice__ = _disabled # type: ignore[assignment] - __delitem__ = __delslice__ = _disabled # type: ignore[assignment] - pop = append = extend = _disabled # type: ignore[assignment] - remove = sort = insert = _disabled # type: ignore[assignment] + __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled + pop = append = extend = remove = sort = insert = _disabled diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/interval.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/interval.py index f252cea..dcf89f2 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/interval.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/interval.py @@ -1,8 +1,7 @@ """ define the IntervalIndex """ -from functools import wraps from operator import le, lt import textwrap -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, cast +from typing import Any, List, Optional, Tuple, Union import numpy as np @@ -11,7 +10,7 @@ from pandas._config import get_option from pandas._libs import lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset -from pandas._typing import AnyArrayLike, DtypeObj, Label +from pandas._typing import AnyArrayLike, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception @@ -19,7 +18,6 @@ from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( find_common_type, infer_dtype_from_scalar, - maybe_box_datetimelike, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -38,7 +36,7 @@ from pandas.core.dtypes.common import ( is_object_dtype, is_scalar, ) -from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs @@ -51,7 +49,6 @@ from pandas.core.indexes.base import ( default_pprint, ensure_index, maybe_extract_name, - unpack_nested_dtype, ) from pandas.core.indexes.datetimes import DatetimeIndex, 
date_range from pandas.core.indexes.extension import ExtensionIndex, inherit_names @@ -59,23 +56,21 @@ from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name -if TYPE_CHECKING: - from pandas import CategoricalIndex - +_VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - { - "klass": "IntervalIndex", - "qualname": "IntervalIndex", - "target_klass": "IntervalIndex or list of Intervals", - "name": textwrap.dedent( + dict( + klass="IntervalIndex", + qualname="IntervalIndex", + target_klass="IntervalIndex or list of Intervals", + name=textwrap.dedent( """\ name : object, optional Name to be stored in the index. """ ), - } + ) ) @@ -115,40 +110,55 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -def setop_check(method): +class SetopCheck: """ This is called to decorate the set operations of IntervalIndex to perform the type check in advance. """ - op_name = method.__name__ - @wraps(method) - def wrapped(self, other, sort=False): - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + def __init__(self, op_name): + self.op_name = op_name - if not isinstance(other, IntervalIndex): - result = getattr(self.astype(object), op_name)(other) - if op_name in ("difference",): - result = result.astype(self.dtype) - return result + def __call__(self, setop): + def func(intvidx_self, other, sort=False): + intvidx_self._assert_can_do_setop(other) + other = ensure_index(other) - return method(self, other, sort) + if not isinstance(other, IntervalIndex): + result = getattr(intvidx_self.astype(object), self.op_name)(other) + if self.op_name in ("difference",): + result = result.astype(intvidx_self.dtype) + return result + elif intvidx_self.closed != other.closed: + raise ValueError( + "can only do set operations between two IntervalIndex " + "objects that are closed on the same side" + ) - return wrapped + # GH 19016: ensure set op will not return a prohibited dtype + subtypes = [intvidx_self.dtype.subtype, other.dtype.subtype] + common_subtype = find_common_type(subtypes) + if is_object_dtype(common_subtype): + raise TypeError( + f"can only do {self.op_name} between two IntervalIndex " + "objects that have compatible dtypes" + ) + + return setop(intvidx_self, other, sort) + + return func @Appender( _interval_shared_docs["class"] - % { - "klass": "IntervalIndex", - "summary": "Immutable index of intervals that are closed on the same side.", - "name": _index_doc_kwargs["name"], - "versionadded": "0.20.0", - "extra_attributes": "is_overlapping\nvalues\n", - "extra_methods": "", - "examples": textwrap.dedent( + % dict( + klass="IntervalIndex", + summary="Immutable index of intervals that are closed on the same side.", + name=_index_doc_kwargs["name"], + versionadded="0.20.0", + extra_attributes="is_overlapping\nvalues\n", + extra_methods="", + examples=textwrap.dedent( """\ Examples -------- @@ -168,11 +178,15 @@ def setop_check(method): mentioned constructor methods. 
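# Editor's usage example (not part of this patch) for the IntervalIndex constructors whose
# doctests appear in the hunks above; assumes a pandas 1.x install like the one vendored here.
import pandas as pd

idx = pd.IntervalIndex.from_breaks([0, 1, 2, 3])
print(idx)  # three right-closed intervals (0, 1], (1, 2], (2, 3]

same = pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
print(idx.equals(same))  # True

tuples = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3)])
print(idx.equals(tuples))  # True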
""" ), - } + ) ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) -@inherit_names(["__array__", "overlaps", "contains"], IntervalArray) -@inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) +@inherit_names( + ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray, +) +@inherit_names( + ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True, +) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" _comparables = ["name"] @@ -181,10 +195,10 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): # we would like our indexing holder to defer to us _defer_to_indexing = True - _data: IntervalArray - _values: IntervalArray - _can_hold_strings = False + # Immutable, so we are able to cache computations like isna in '_mask' + _mask = None + _data: IntervalArray # -------------------------------------------------------------------- # Constructors @@ -228,15 +242,16 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): result._data = array result.name = name result._cache = {} + result._no_setting_name = False result._reset_identity() return result @classmethod @Appender( _interval_shared_docs["from_breaks"] - % { - "klass": "IntervalIndex", - "examples": textwrap.dedent( + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( """\ Examples -------- @@ -246,7 +261,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): dtype='interval[int64]') """ ), - } + ) ) def from_breaks( cls, breaks, closed: str = "right", name=None, copy: bool = False, dtype=None @@ -260,9 +275,9 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): @classmethod @Appender( _interval_shared_docs["from_arrays"] - % { - "klass": "IntervalIndex", - "examples": textwrap.dedent( + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( """\ Examples -------- @@ -272,7 +287,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): dtype='interval[int64]') """ ), - } + ) ) def from_arrays( cls, @@ -292,9 +307,9 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): @classmethod @Appender( _interval_shared_docs["from_tuples"] - % { - "klass": "IntervalIndex", - "examples": textwrap.dedent( + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( """\ Examples -------- @@ -304,7 +319,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): dtype='interval[int64]') """ ), - } + ) ) def from_tuples( cls, data, closed: str = "right", name=None, copy: bool = False, dtype=None @@ -315,6 +330,26 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): # -------------------------------------------------------------------- + @Appender(Index._shallow_copy.__doc__) + def _shallow_copy(self, values=None, name: Label = lib.no_default): + name = self.name if name is lib.no_default else name + cache = self._cache.copy() if values is None else {} + if values is None: + values = self._data + + result = self._simple_new(values, name=name) + result._cache = cache + return result + + @cache_readonly + def _isnan(self): + """ + Return a mask indicating if each value is NA. 
+ """ + if self._mask is None: + self._mask = isna(self.left) + return self._mask + @cache_readonly def _engine(self): left = self._maybe_convert_i8(self.left) @@ -355,20 +390,27 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): """ return self._data + @property + def _has_complex_internals(self) -> bool: + # used to avoid libreduction code paths, which raise or require conversion + return True + def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result def __reduce__(self): - d = {"left": self.left, "right": self.right} + d = dict(left=self.left, right=self.right) d.update(self._get_attributes_dict()) return _new_IntervalIndex, (type(self), d), None @Appender(Index.astype.__doc__) - def astype(self, dtype, copy: bool = True): + def astype(self, dtype, copy=True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) - return Index(new_values, dtype=new_values.dtype, name=self.name) + if is_interval_dtype(new_values.dtype): + return self._shallow_copy(new_values) + return Index.astype(self, dtype, copy=copy) @property def inferred_type(self) -> str: @@ -392,7 +434,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return self[::-1].is_monotonic_increasing @cache_readonly - def is_unique(self) -> bool: + def is_unique(self): """ Return True if the IntervalIndex contains unique elements, else False. """ @@ -470,11 +512,49 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): # GH 23309 return self._engine.is_overlapping + def _should_fallback_to_positional(self) -> bool: + # integer lookups in Series.__getitem__ are unambiguously + # positional in this case + return self.dtype.subtype.kind in ["m", "M"] + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(Index._convert_list_indexer.__doc__) + def _convert_list_indexer(self, keyarr): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. + """ + locs = self.get_indexer_for(keyarr) + + # we have missing values + if (locs == -1).any(): + raise KeyError + + return locs + + def _can_reindex(self, indexer: np.ndarray) -> None: + """ + Check if we are allowing reindexing with this particular indexer. + + Parameters + ---------- + indexer : an integer indexer + + Raises + ------ + ValueError if its a duplicate axis + """ + # trying to reindex on an axis with duplicates + if self.is_overlapping and len(indexer): + raise ValueError("cannot reindex from an overlapping axis") + def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. An - Interval-like requires conversion if its endpoints are one of the + Interval-like requires conversion if it's endpoints are one of the aforementioned types. Assumes that any list-like data has already been cast to an Index. @@ -496,7 +576,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): def _maybe_convert_i8(self, key): """ - Maybe convert a given key to its equivalent i8 value(s). Used as a + Maybe convert a given key to it's equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. 
@@ -529,13 +609,11 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) - if lib.is_period(key): - key_i8 = key.ordinal else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: - # convert NaT from its i8 value to np.nan so it's not viewed + # convert NaT from it's i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. is_overlapping) key_i8 = key_i8.where(~key._isnan) @@ -550,6 +628,17 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return key_i8 + def _check_method(self, method): + if method is None: + return + + if method in ["bfill", "backfill", "pad", "ffill", "nearest"]: + raise NotImplementedError( + f"method {method} not yet implemented for IntervalIndex" + ) + + raise ValueError("Invalid fill method") + def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: raise KeyError( @@ -576,9 +665,6 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return sub_idx._searchsorted_monotonic(label, side) - # -------------------------------------------------------------------- - # Indexing Methods - def get_loc( self, key, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: @@ -620,7 +706,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): >>> index.get_loc(pd.Interval(0, 1)) 0 """ - self._check_indexing_method(method) + self._check_method(method) if not is_scalar(key): raise InvalidIndexError(key) @@ -671,7 +757,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): tolerance: Optional[Any] = None, ) -> np.ndarray: - self._check_indexing_method(method) + self._check_method(method) if self.is_overlapping: raise InvalidIndexError( @@ -686,8 +772,11 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): if self.equals(target_as_index): return np.arange(len(self), dtype="intp") - if self._is_non_comparable_own_type(target_as_index): - # different closed or incompatible subtype -> no matches + # different closed or incompatible subtype -> no matches + common_subtype = find_common_type( + [self.dtype.subtype, target_as_index.dtype.subtype] + ) + if self.closed != target_as_index.closed or is_object_dtype(common_subtype): return np.repeat(np.intp(-1), len(target_as_index)) # non-overlapping -> at most one match per interval in target_as_index @@ -697,7 +786,6 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): right_indexer = self.right.get_indexer(target_as_index.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) elif is_categorical_dtype(target_as_index.dtype): - target_as_index = cast("CategoricalIndex", target_as_index) # get an indexer for unique categories then propagate to codes via take_1d categories_indexer = self.get_indexer(target_as_index.categories) indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) @@ -707,7 +795,17 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): indexer = self._engine.get_indexer(target_as_index.values) else: # heterogeneous scalar index: defer elementwise to get_loc - return self._get_indexer_pointwise(target_as_index)[0] + # (non-overlapping so get_loc guarantees scalar of KeyError) + indexer = [] + for key in target_as_index: + try: + loc = self.get_loc(key) + except KeyError: + loc = -1 + except InvalidIndexError as err: + # i.e. 
non-scalar key + raise TypeError(key) from err + indexer.append(loc) return ensure_platform_int(indexer) @@ -719,8 +817,10 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): # check that target_as_index IntervalIndex is compatible if isinstance(target_as_index, IntervalIndex): - - if self._is_non_comparable_own_type(target_as_index): + common_subtype = find_common_type( + [self.dtype.subtype, target_as_index.dtype.subtype] + ) + if self.closed != target_as_index.closed or is_object_dtype(common_subtype): # different closed or incompatible subtype -> no matches return ( np.repeat(-1, len(target_as_index)), @@ -731,8 +831,18 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): target_as_index, IntervalIndex ): # target_as_index might contain intervals: defer elementwise to get_loc - return self._get_indexer_pointwise(target_as_index) - + indexer, missing = [], [] + for i, key in enumerate(target_as_index): + try: + locs = self.get_loc(key) + if isinstance(locs, slice): + locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + locs = np.array(locs, ndmin=1) + except KeyError: + missing.append(i) + locs = np.array([-1]) + indexer.append(locs) + indexer = np.concatenate(indexer) else: target_as_index = self._maybe_convert_i8(target_as_index) indexer, missing = self._engine.get_indexer_non_unique( @@ -741,33 +851,21 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return ensure_platform_int(indexer), ensure_platform_int(missing) - def _get_indexer_pointwise(self, target: Index) -> Tuple[np.ndarray, np.ndarray]: + def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: """ - pointwise implementation for get_indexer and get_indexer_non_unique. + Guaranteed return of an indexer even when overlapping. + + This dispatches to get_indexer or get_indexer_non_unique + as appropriate. + + Returns + ------- + numpy.ndarray + List of indices. """ - indexer, missing = [], [] - for i, key in enumerate(target): - try: - locs = self.get_loc(key) - if isinstance(locs, slice): - # Only needed for get_indexer_non_unique - locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") - locs = np.array(locs, ndmin=1) - except KeyError: - missing.append(i) - locs = np.array([-1]) - except InvalidIndexError as err: - # i.e. non-scalar key - raise TypeError(key) from err - - indexer.append(locs) - - indexer = np.concatenate(indexer) - return ensure_platform_int(indexer), ensure_platform_int(missing) - - @property - def _index_as_unique(self): - return not self.is_overlapping + if self.is_overlapping: + return self.get_indexer_non_unique(target)[0] + return self.get_indexer(target, **kwargs) def _convert_slice_indexer(self, key: slice, kind: str): if not (key.step is None or key.step == 1): @@ -783,91 +881,13 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return super()._convert_slice_indexer(key, kind) - def _should_fallback_to_positional(self) -> bool: - # integer lookups in Series.__getitem__ are unambiguously - # positional in this case - return self.dtype.subtype.kind in ["m", "M"] - - def _maybe_cast_slice_bound(self, label, side: str, kind): - return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - - @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr): - """ - we are passed a list-like indexer. Return the - indexer for matching intervals. 
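# Editor's example (not part of this patch) of the non-overlapping get_indexer path discussed
# above: each target scalar matches at most one interval, and misses map to -1. Assumes pandas.
import pandas as pd

ii = pd.IntervalIndex.from_breaks([0, 1, 2, 3])   # (0, 1], (1, 2], (2, 3]
print(ii.get_indexer([0.5, 1.5, 10]))             # [ 0  1 -1]
print(ii.get_indexer_for([0.5, 1.5, 10]))         # same result here, since ii is not overlapping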
- """ - locs = self.get_indexer_for(keyarr) - - # we have missing values - if (locs == -1).any(): - raise KeyError(keyarr[locs == -1].tolist()) - - return locs - - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - if not isinstance(dtype, IntervalDtype): - return False - common_subtype = find_common_type([self.dtype.subtype, dtype.subtype]) - return not is_object_dtype(common_subtype) - - def _should_compare(self, other) -> bool: - if not super()._should_compare(other): - return False - other = unpack_nested_dtype(other) - return other.closed == self.closed - - # TODO: use should_compare and get rid of _is_non_comparable_own_type - def _is_non_comparable_own_type(self, other: "IntervalIndex") -> bool: - # different closed or incompatible subtype -> no matches - - # TODO: once closed is part of IntervalDtype, we can just define - # is_comparable_dtype GH#19371 - if self.closed != other.closed: - return True - return not self._is_comparable_dtype(other.dtype) - - # -------------------------------------------------------------------- - - @cache_readonly - def left(self) -> Index: - return Index(self._data.left, copy=False) - - @cache_readonly - def right(self) -> Index: - return Index(self._data.right, copy=False) - - @cache_readonly - def mid(self): - return Index(self._data.mid, copy=False) - - @property - def length(self): - return Index(self._data.length, copy=False) - - def putmask(self, mask, value): - arr = self._data.copy() - try: - value_left, value_right = arr._validate_setitem_value(value) - except (ValueError, TypeError): - return self.astype(object).putmask(mask, value) - - if isinstance(self._data._left, np.ndarray): - np.putmask(arr._left, mask, value_left) - np.putmask(arr._right, mask, value_right) - else: - # TODO: special case not needed with __array_function__ - arr._left.putmask(mask, value_left) - arr._right.putmask(mask, value_right) - return type(self)._simple_new(arr, name=self.name) - @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: other = self._na_value values = np.where(cond, self._values, other) result = IntervalArray(values) - return type(self)._simple_new(result, name=self.name) + return self._shallow_copy(result) def delete(self, loc): """ @@ -880,7 +900,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): new_left = self.left.delete(loc) new_right = self.right.delete(loc) result = self._data._shallow_copy(new_left, new_right) - return type(self)._simple_new(result, name=self.name) + return self._shallow_copy(result) def insert(self, loc, item): """ @@ -897,12 +917,32 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): ------- IntervalIndex """ - left_insert, right_insert = self._data._validate_scalar(item) + if isinstance(item, Interval): + if item.closed != self.closed: + raise ValueError( + "inserted item must be closed on the same side as the index" + ) + left_insert = item.left + right_insert = item.right + elif is_scalar(item) and isna(item): + # GH 18295 + left_insert = right_insert = item + else: + raise ValueError( + "can only insert Interval objects and NA into an IntervalIndex" + ) new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) result = self._data._shallow_copy(new_left, new_right) - return type(self)._simple_new(result, name=self.name) + return self._shallow_copy(result) + + @Appender(_index_shared_docs["take"] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + result = self._data.take( + indices, 
axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs + ) + return self._shallow_copy(result) # -------------------------------------------------------------------- # Rendering Methods @@ -961,41 +1001,35 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return f"\n{space}" # -------------------------------------------------------------------- - # Set Operations - def _assert_can_do_setop(self, other): - super()._assert_can_do_setop(other) + def argsort(self, *args, **kwargs) -> np.ndarray: + return np.lexsort((self.right, self.left)) - if isinstance(other, IntervalIndex) and self._is_non_comparable_own_type(other): - # GH#19016: ensure set op will not return a prohibited dtype - raise TypeError( - "can only do set operations between two IntervalIndex " - "objects that are closed on the same side " - "and have compatible dtypes" - ) + def equals(self, other) -> bool: + """ + Determines if two IntervalIndex objects contain the same elements. + """ + if self.is_(other): + return True + + # if we can coerce to an II + # then we can compare + if not isinstance(other, IntervalIndex): + if not is_interval_dtype(other): + return False + other = Index(other) + + return ( + self.left.equals(other.left) + and self.right.equals(other.right) + and self.closed == other.closed + ) @Appender(Index.intersection.__doc__) - def intersection(self, other, sort=False) -> Index: - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) - - if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) - return self._get_reconciled_name_object(other) - - if not isinstance(other, IntervalIndex): - return self.astype(object).intersection(other) - - result = self._intersection(other, sort=sort) - return self._wrap_setop_result(other, result) - - def _intersection(self, other, sort): - """ - intersection specialized to the case with matching dtypes. - """ - # For IntervalIndex we also know other.closed == self.closed + @SetopCheck(op_name="intersection") + def intersection( + self, other: "IntervalIndex", sort: bool = False + ) -> "IntervalIndex": if self.left.is_unique and self.right.is_unique: taken = self._intersection_unique(other) elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: @@ -1014,7 +1048,7 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does not have any common endpoint, - no matter left or right. + no mater left or right. Return the intersection with another IntervalIndex. 
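# Editor's usage example (not part of this patch) for the IntervalIndex set operations being
# reworked above: both operands must be closed on the same side. Assumes pandas.
import pandas as pd

a = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3)])
b = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)])
print(a.intersection(b))   # the shared intervals (1, 2] and (2, 3]
print(a.union(b))          # all four distinct intervals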
Parameters @@ -1061,11 +1095,8 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return self[mask] def _setop(op_name: str, sort=None): + @SetopCheck(op_name=op_name) def func(self, other, sort=sort): - # At this point we are assured - # isinstance(other, IntervalIndex) - # other.closed == self.closed - result = getattr(self._multiindex, op_name)(other._multiindex, sort=sort) result_name = get_op_result_name(self, other) @@ -1077,25 +1108,38 @@ class IntervalIndex(IntervalMixin, ExtensionIndex): return type(self).from_tuples(result, closed=self.closed, name=result_name) - func.__name__ = op_name - return setop_check(func) - - _union = _setop("union") - difference = _setop("difference") - symmetric_difference = _setop("symmetric_difference") - - # -------------------------------------------------------------------- + return func @property - def _is_all_dates(self) -> bool: + def is_all_dates(self) -> bool: """ This is False even when left/right contain datetime-like objects, as the check is done on the Interval itself """ return False + union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") + # TODO: arithmetic operations + # GH#30817 until IntervalArray implements inequalities, get them from Index + def __lt__(self, other): + return Index.__lt__(self, other) + + def __le__(self, other): + return Index.__le__(self, other) + + def __gt__(self, other): + return Index.__gt__(self, other) + + def __ge__(self, other): + return Index.__ge__(self, other) + + +IntervalIndex._add_logical_methods_disabled() + def _is_valid_endpoint(endpoint) -> bool: """ @@ -1215,8 +1259,8 @@ def interval_range( IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], closed='both', dtype='interval[int64]') """ - start = maybe_box_datetimelike(start) - end = maybe_box_datetimelike(end) + start = com.maybe_box_datetimelike(start) + end = com.maybe_box_datetimelike(end) endpoint = start if start is not None else end if freq is None and com.any_none(periods, start, end): @@ -1280,8 +1324,10 @@ def interval_range( else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): - breaks = date_range(start=start, end=end, periods=periods, freq=freq) + range_func = date_range else: - breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) + range_func = timedelta_range + + breaks = range_func(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/multi.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/multi.py index a9d93f4..6ad82e8 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/multi.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/multi.py @@ -1,4 +1,3 @@ -from functools import wraps from sys import getsizeof from typing import ( TYPE_CHECKING, @@ -20,7 +19,7 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike, DtypeObj, Label, Scalar, Shape +from pandas._typing import AnyArrayLike, Scalar from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -47,16 +46,10 @@ from pandas.core.arrays import Categorical from pandas.core.arrays.categorical import factorize_from_iterables 
import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import ( - Index, - _index_shared_docs, - ensure_index, - get_unanimous_names, -) +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.core.indexes.frozen import FrozenList from pandas.core.indexes.numeric import Int64Index import pandas.core.missing as missing -from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( get_group_index, indexer_from_factorized, @@ -70,11 +63,11 @@ from pandas.io.formats.printing import ( ) if TYPE_CHECKING: - from pandas import Series + from pandas import Series # noqa:F401 _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( - {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"} + dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") ) @@ -158,25 +151,6 @@ class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectE return np.bitwise_or.reduce(codes, axis=1) -def names_compat(meth): - """ - A decorator to allow either `name` or `names` keyword but not both. - - This makes it easier to share code with base class. - """ - - @wraps(meth) - def new_meth(self_or_cls, *args, **kwargs): - if "name" in kwargs and "names" in kwargs: - raise TypeError("Can only provide one of `names` and `name`") - elif "name" in kwargs: - kwargs["names"] = kwargs.pop("name") - - return meth(self_or_cls, *args, **kwargs) - - return new_meth - - class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects. @@ -258,7 +232,7 @@ class MultiIndex(Index): of the mentioned helper methods. """ - _hidden_attrs = Index._hidden_attrs | frozenset() + _deprecations = Index._deprecations | frozenset() # initialize to zero-length tuples to make everything work _typ = "multiindex" @@ -268,6 +242,7 @@ class MultiIndex(Index): _comparables = ["names"] rename = Index.set_names + _tuples = None sortorder: Optional[int] # -------------------------------------------------------------------- @@ -283,6 +258,7 @@ class MultiIndex(Index): copy=False, name=None, verify_integrity: bool = True, + _set_identity: bool = True, ): # compat with Index @@ -316,7 +292,8 @@ class MultiIndex(Index): new_codes = result._verify_integrity() result._codes = new_codes - result._reset_identity() + if _set_identity: + result._reset_identity() return result @@ -463,7 +440,7 @@ class MultiIndex(Index): if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] - return cls( + return MultiIndex( levels=levels, codes=codes, sortorder=sortorder, @@ -472,13 +449,7 @@ class MultiIndex(Index): ) @classmethod - @names_compat - def from_tuples( - cls, - tuples, - sortorder: Optional[int] = None, - names: Optional[Sequence[Label]] = None, - ): + def from_tuples(cls, tuples, sortorder=None, names=None): """ Convert list of tuples to MultiIndex. 
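# Editor's example (not part of this patch) for the MultiIndex classmethods touched in the
# hunks above (from_tuples / from_arrays / from_product). Assumes pandas.
import pandas as pd

mi = pd.MultiIndex.from_tuples(
    [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["letter", "number"]
)
print(mi.nlevels)    # 2
print(mi.levshape)   # (2, 2)

same = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["letter", "number"])
print(mi.equals(same))          # True
print(same.get_loc(("b", 2)))   # 3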
@@ -519,7 +490,6 @@ class MultiIndex(Index): elif is_iterator(tuples): tuples = list(tuples) - arrays: List[Sequence[Label]] if len(tuples) == 0: if names is None: raise TypeError("Cannot infer number of levels from empty list") @@ -534,7 +504,7 @@ class MultiIndex(Index): else: arrays = zip(*tuples) - return cls.from_arrays(arrays, sortorder=sortorder, names=names) + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod def from_product(cls, iterables, sortorder=None, names=lib.no_default): @@ -593,7 +563,7 @@ class MultiIndex(Index): # codes are all ndarrays, so cartesian_product is lossless codes = cartesian_product(codes) - return cls(levels, codes, sortorder=sortorder, names=names) + return MultiIndex(levels, codes, sortorder=sortorder, names=names) @classmethod def from_frame(cls, df, sortorder=None, names=None): @@ -663,9 +633,16 @@ class MultiIndex(Index): # -------------------------------------------------------------------- - @cache_readonly + @property def _values(self): # We override here, since our parent uses _data, which we don't use. + return self.values + + @property + def values(self): + if self._tuples is not None: + return self._tuples + values = [] for i in range(self.nlevels): @@ -679,12 +656,8 @@ class MultiIndex(Index): vals = np.array(vals, copy=False) values.append(vals) - arr = lib.fast_zip(values) - return arr - - @property - def values(self): - return self._values + self._tuples = lib.fast_zip(values) + return self._tuples @property def array(self): @@ -702,7 +675,7 @@ class MultiIndex(Index): ) @property - def shape(self) -> Shape: + def shape(self): """ Return a tuple of the shape of the underlying data. """ @@ -730,13 +703,8 @@ class MultiIndex(Index): return FrozenList(result) def _set_levels( - self, - levels, - level=None, - copy: bool = False, - validate: bool = True, - verify_integrity: bool = False, - ) -> None: + self, levels, level=None, copy=False, validate=True, verify_integrity=False + ): # This is NOT part of the levels property because it should be # externally not allowed to set levels. User beware if you change # _levels directly @@ -754,10 +722,10 @@ class MultiIndex(Index): ) else: level_numbers = [self._get_level_number(lev) for lev in level] - new_levels_list = list(self._levels) + new_levels = list(self._levels) for lev_num, lev in zip(level_numbers, levels): - new_levels_list[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() - new_levels = FrozenList(new_levels_list) + new_levels[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() + new_levels = FrozenList(new_levels) if verify_integrity: new_codes = self._verify_integrity(levels=new_levels) @@ -768,9 +736,10 @@ class MultiIndex(Index): if any(names): self._set_names(names) + self._tuples = None self._reset_cache() - def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): + def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): """ Set new levels on MultiIndex. Defaults to returning new index. @@ -782,15 +751,12 @@ class MultiIndex(Index): Level(s) to set (None for all levels). inplace : bool If True, mutates in place. - - .. deprecated:: 1.2.0 verify_integrity : bool, default True If True, checks that levels and codes are compatible. Returns ------- - new index (of same type and class...etc) or None - The same type as the caller or None if ``inplace=True``. 
+ new index (of same type and class...etc) Examples -------- @@ -855,15 +821,6 @@ class MultiIndex(Index): >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ - if inplace is not None: - warnings.warn( - "inplace is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - else: - inplace = False - if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) @@ -893,15 +850,6 @@ class MultiIndex(Index): def nlevels(self) -> int: """ Integer number of levels in this MultiIndex. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) - >>> mi - MultiIndex([('a', 'b', 'c')], - ) - >>> mi.nlevels - 3 """ return len(self._levels) @@ -909,15 +857,6 @@ class MultiIndex(Index): def levshape(self): """ A tuple with the length of each level. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']]) - >>> mi - MultiIndex([('a', 'b', 'c')], - ) - >>> mi.levshape - (1, 1, 1) """ return tuple(len(x) for x in self.levels) @@ -929,13 +868,8 @@ class MultiIndex(Index): return self._codes def _set_codes( - self, - codes, - level=None, - copy: bool = False, - validate: bool = True, - verify_integrity: bool = False, - ) -> None: + self, codes, level=None, copy=False, validate=True, verify_integrity=False + ): if validate: if level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") @@ -949,22 +883,21 @@ class MultiIndex(Index): ) else: level_numbers = [self._get_level_number(lev) for lev in level] - new_codes_list = list(self._codes) + new_codes = list(self._codes) for lev_num, level_codes in zip(level_numbers, codes): lev = self.levels[lev_num] - new_codes_list[lev_num] = _coerce_indexer_frozen( - level_codes, lev, copy=copy - ) - new_codes = FrozenList(new_codes_list) + new_codes[lev_num] = _coerce_indexer_frozen(level_codes, lev, copy=copy) + new_codes = FrozenList(new_codes) if verify_integrity: new_codes = self._verify_integrity(codes=new_codes) self._codes = new_codes + self._tuples = None self._reset_cache() - def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): + def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): """ Set new codes on MultiIndex. Defaults to returning new index. @@ -980,15 +913,12 @@ class MultiIndex(Index): Level(s) to set (None for all levels). inplace : bool If True, mutates in place. - - .. deprecated:: 1.2.0 verify_integrity : bool (default True) If True, checks that levels and codes are compatible. Returns ------- - new index (of same type and class...etc) or None - The same type as the caller or None if ``inplace=True``. 
+ new index (of same type and class...etc) Examples -------- @@ -1027,15 +957,6 @@ class MultiIndex(Index): (1, 'two')], names=['foo', 'bar']) """ - if inplace is not None: - warnings.warn( - "inplace is deprecated and will be removed in a future version.", - FutureWarning, - stacklevel=2, - ) - else: - inplace = False - if level is not None and not is_list_like(level): if not is_list_like(codes): raise TypeError("Codes must be list-like") @@ -1063,7 +984,7 @@ class MultiIndex(Index): def _engine(self): # Calculate the number of bits needed to represent labels in each # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) + sizes = np.ceil(np.log2([len(l) + 1 for l in self.levels])) # Sum bit counts, starting from the _right_.... lev_bits = np.cumsum(sizes[::-1])[::-1] @@ -1083,26 +1004,57 @@ class MultiIndex(Index): @property def _constructor(self): - return type(self).from_tuples + return MultiIndex.from_tuples @doc(Index._shallow_copy) - def _shallow_copy(self, values=None, name=lib.no_default): - names = name if name is not lib.no_default else self.names + def _shallow_copy( + self, + values=None, + name=lib.no_default, + levels=None, + codes=None, + dtype=None, + sortorder=None, + names=lib.no_default, + _set_identity: bool = True, + ): + if names is not lib.no_default and name is not lib.no_default: + raise TypeError("Can only provide one of `names` and `name`") + elif names is lib.no_default: + names = name if name is not lib.no_default else self.names if values is not None: - return type(self).from_tuples(values, sortorder=None, names=names) + assert levels is None and codes is None and dtype is None + return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) - result = type(self)( - levels=self.levels, - codes=self.codes, - sortorder=None, + levels = levels if levels is not None else self.levels + codes = codes if codes is not None else self.codes + + result = MultiIndex( + levels=levels, + codes=codes, + dtype=dtype, + sortorder=sortorder, names=names, verify_integrity=False, + _set_identity=_set_identity, ) result._cache = self._cache.copy() result._cache.pop("levels", None) # GH32669 return result + def symmetric_difference(self, other, result_name=None, sort=None): + # On equal symmetric_difference MultiIndexes the difference is empty. + # Therefore, an empty MultiIndex is returned GH13490 + tups = Index.symmetric_difference(self, other, result_name, sort) + if len(tups) == 0: + return MultiIndex( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + names=tups.name, + ) + return type(self).from_tuples(tups, names=tups.name) + # -------------------------------------------------------------------- def copy( @@ -1113,6 +1065,7 @@ class MultiIndex(Index): codes=None, deep=False, name=None, + _set_identity=False, ): """ Make a copy of this object. Names, dtype, levels and codes can be @@ -1122,14 +1075,8 @@ class MultiIndex(Index): ---------- names : sequence, optional dtype : numpy dtype or pandas type, optional - - .. deprecated:: 1.2.0 levels : sequence, optional - - .. deprecated:: 1.2.0 codes : sequence, optional - - .. deprecated:: 1.2.0 deep : bool, default False name : Label Kept for compatibility with 1-dimensional Index. Should not be used. @@ -1145,21 +1092,6 @@ class MultiIndex(Index): This could be potentially expensive on large MultiIndex objects. 
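# Editor's sketch (not part of this patch): set_levels / set_codes default to returning a new
# MultiIndex rather than mutating the caller, which is what the docstring and inplace changes
# above are about (the inplace keyword carries a deprecation note in newer pandas). Assumes pandas.
import pandas as pd

mi = pd.MultiIndex.from_tuples(
    [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"]
)
renamed = mi.set_levels(["a", "b"], level=0)
print(renamed.levels[0].tolist())   # ['a', 'b']
print(mi.levels[0].tolist())        # [1, 2] -- the original is unchanged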
""" names = self._validate_names(name=name, names=names, deep=deep) - if levels is not None: - warnings.warn( - "parameter levels is deprecated and will be removed in a future " - "version. Use the set_levels method instead.", - FutureWarning, - stacklevel=2, - ) - if codes is not None: - warnings.warn( - "parameter codes is deprecated and will be removed in a future " - "version. Use the set_codes method instead.", - FutureWarning, - stacklevel=2, - ) - if deep: from copy import deepcopy @@ -1168,28 +1100,14 @@ class MultiIndex(Index): if codes is None: codes = deepcopy(self.codes) - levels = levels if levels is not None else self.levels - codes = codes if codes is not None else self.codes - - new_index = type(self)( + return self._shallow_copy( levels=levels, codes=codes, - sortorder=self.sortorder, names=names, - verify_integrity=False, + dtype=dtype, + sortorder=self.sortorder, + _set_identity=_set_identity, ) - new_index._cache = self._cache.copy() - new_index._cache.pop("levels", None) # GH32669 - - if dtype: - warnings.warn( - "parameter dtype is deprecated and will be removed in a future " - "version. Use the astype method instead.", - FutureWarning, - stacklevel=2, - ) - new_index = new_index.astype(dtype) - return new_index def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ @@ -1217,10 +1135,10 @@ class MultiIndex(Index): def _is_memory_usage_qualified(self) -> bool: """ return a boolean if we need a qualified .info display """ - def f(level): - return "mixed" in level or "string" in level or "unicode" in level + def f(l): + return "mixed" in l or "string" in l or "unicode" in l - return any(f(level) for level in self._inferred_type_levels) + return any(f(l) for l in self._inferred_type_levels) @doc(Index.memory_usage) def memory_usage(self, deep: bool = False) -> int: @@ -1375,14 +1293,14 @@ class MultiIndex(Index): if sparsify in [False, lib.no_default]: sentinel = sparsify # little bit of a kludge job for #1217 - result_levels = sparsify_labels( + result_levels = _sparsify( result_levels, start=int(names), sentinel=sentinel ) if adjoin: - from pandas.io.formats.format import get_adjustment + from pandas.io.formats.format import _get_adjustment - adj = get_adjustment() + adj = _get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: return result_levels @@ -1446,30 +1364,13 @@ class MultiIndex(Index): raise TypeError( f"{type(self).__name__}.name must be a hashable type" ) - # pandas\core\indexes\multi.py:1448: error: Cannot determine type - # of '__setitem__' [has-type] - self._names[lev] = name # type: ignore[has-type] + self._names[lev] = name # If .levels has been accessed, the names in our cache will be stale. self._reset_cache() names = property( - fset=_set_names, - fget=_get_names, - doc=""" - Names of levels in MultiIndex. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays( - ... 
[[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z']) - >>> mi - MultiIndex([(1, 3, 5), - (2, 4, 6)], - names=['x', 'y', 'z']) - >>> mi.names - FrozenList(['x', 'y', 'z']) - """, + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" ) # -------------------------------------------------------------------- @@ -1612,7 +1513,7 @@ class MultiIndex(Index): raise ValueError(f"invalid how option: {how}") new_codes = [level_codes[~indexer] for level_codes in self.codes] - return self.set_codes(codes=new_codes) + return self.copy(codes=new_codes, deep=True) def _get_level_values(self, level, unique=False): """ @@ -1684,6 +1585,10 @@ class MultiIndex(Index): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) + def _to_safe_for_reshape(self): + """ convert to object if we are a categorical """ + return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) + def to_frame(self, index=True, name=None): """ Create a DataFrame with the levels of the MultiIndex as columns. @@ -1709,32 +1614,6 @@ class MultiIndex(Index): -------- DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']]) - >>> mi - MultiIndex([('a', 'c'), - ('b', 'd')], - ) - - >>> df = mi.to_frame() - >>> df - 0 1 - a c a c - b d b d - - >>> df = mi.to_frame(index=False) - >>> df - 0 1 - 0 a c - 1 b d - - >>> df = mi.to_frame(name=['x', 'y']) - >>> df - x y - a c a c - b d b d """ from pandas import DataFrame @@ -1792,7 +1671,7 @@ class MultiIndex(Index): return Index(self._values, tupleize_cols=False) @property - def _is_all_dates(self) -> bool: + def is_all_dates(self) -> bool: return False def is_lexsorted(self) -> bool: @@ -2007,12 +1886,12 @@ class MultiIndex(Index): def __reduce__(self): """Necessary for making this object picklable""" - d = { - "levels": list(self.levels), - "codes": list(self.codes), - "sortorder": self.sortorder, - "names": list(self.names), - } + d = dict( + levels=list(self.levels), + codes=list(self.codes), + sortorder=self.sortorder, + names=list(self.names), + ) return ibase._new_Index, (type(self), d), None # -------------------------------------------------------------------- @@ -2052,15 +1931,31 @@ class MultiIndex(Index): @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) + taken = self._assert_take_fillable( + self.codes, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=-1, + ) + return MultiIndex( + levels=self.levels, codes=taken, names=self.names, verify_integrity=False + ) + def _assert_take_fillable( + self, values, indices, allow_fill=True, fill_value=None, na_value=None + ): + """ Internal method to handle NA filling of take """ # only fill if we are passing a non-None fill_value - allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices) - - na_value = -1 - - if allow_fill: + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ( + "When allow_fill=True and fill_value is not None, " + "all indices must be >= -1" + ) + raise ValueError(msg) taken = [lab.take(indices) for lab in self.codes] mask = indices == -1 if mask.any(): @@ -2072,10 +1967,7 @@ class MultiIndex(Index): taken = masked else: taken = [lab.take(indices) for lab in self.codes] - - return MultiIndex( - 
levels=self.levels, codes=taken, names=self.names, verify_integrity=False - ) + return taken def append(self, other): """ @@ -2116,7 +2008,7 @@ class MultiIndex(Index): @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) def repeat(self, repeats, axis=None): - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) repeats = ensure_platform_int(repeats) return MultiIndex( levels=self.levels, @@ -2139,7 +2031,7 @@ class MultiIndex(Index): Parameters ---------- codes : array-like - Must be a list of tuples when level is not specified + Must be a list of tuples level : int or level name, default None errors : str, default 'raise' @@ -2165,8 +2057,7 @@ class MultiIndex(Index): if isinstance(loc, int): inds.append(loc) elif isinstance(loc, slice): - step = loc.step if loc.step is not None else 1 - inds.extend(range(loc.start, loc.stop, step)) + inds.extend(range(loc.start, loc.stop)) elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( @@ -2191,17 +2082,10 @@ class MultiIndex(Index): i = self._get_level_number(level) index = self.levels[i] values = index.get_indexer(codes) - # If nan should be dropped it will equal -1 here. We have to check which values - # are not nan and equal -1, this means they are missing in the index - nan_codes = isna(codes) - values[(np.equal(nan_codes, False)) & (values == -1)] = -2 - if index.shape[0] == self.shape[0]: - values[np.equal(nan_codes, True)] = -2 - not_found = codes[values == -2] - if len(not_found) != 0 and errors != "ignore": - raise KeyError(f"labels {not_found} not found in level") mask = ~algos.isin(self.codes[i], values) + if mask.all() and errors != "ignore": + raise KeyError(f"labels {codes} not found in level") return self[mask] @@ -2276,24 +2160,6 @@ class MultiIndex(Index): Returns ------- MultiIndex - - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y']) - >>> mi - MultiIndex([(1, 3), - (2, 4)], - names=['x', 'y']) - - >>> mi.reorder_levels(order=[1, 0]) - MultiIndex([(3, 1), - (4, 2)], - names=['y', 'x']) - - >>> mi.reorder_levels(order=['y', 'x']) - MultiIndex([(3, 1), - (4, 2)], - names=['y', 'x']) """ order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: @@ -2311,7 +2177,7 @@ class MultiIndex(Index): def _get_codes_for_sorting(self): """ - we are categorizing our codes by using the + we categorizing our codes by using the available categories (all, not just observed) excluding any missing ones (-1); this is in preparation for sorting, where we need to disambiguate that -1 is not @@ -2352,34 +2218,6 @@ class MultiIndex(Index): Resulting index. indexer : np.ndarray Indices of output values in original index. 
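# Editor's example (not part of this patch) of the take/fill behaviour implemented above:
# with the default fill_value=None a -1 index is plain positional indexing, while a non-None
# fill_value should turn -1 entries into missing rows. Assumes pandas; exact repr may vary.
import numpy as np
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2), ("c", 3)])
print(mi.take([2, 0]).tolist())    # [('c', 3), ('a', 1)]
print(mi.take([0, -1]).tolist())   # [('a', 1), ('c', 3)] -- -1 means "last" when not filling
print(mi.take([0, -1], allow_fill=True, fill_value=np.nan))  # second row becomes a missing entry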
- - Examples - -------- - >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]]) - >>> mi - MultiIndex([(0, 2), - (0, 1)], - ) - - >>> mi.sortlevel() - (MultiIndex([(0, 1), - (0, 2)], - ), array([1, 0])) - - >>> mi.sortlevel(sort_remaining=False) - (MultiIndex([(0, 2), - (0, 1)], - ), array([0, 1])) - - >>> mi.sortlevel(1) - (MultiIndex([(0, 1), - (0, 2)], - ), array([1, 0])) - - >>> mi.sortlevel(1, ascending=False) - (MultiIndex([(0, 2), - (0, 1)], - ), array([0, 1])) """ if isinstance(level, (str, int)): level = [level] @@ -2527,10 +2365,6 @@ class MultiIndex(Index): if is_scalar(loc): return new_values - if len(new_values) == 1 and not self.nlevels > 1: - # If more than one level left, we can not return a scalar - return new_values[0] - new_index = self[loc] new_index = maybe_droplevels(new_index, key) new_ser = series._constructor(new_values, index=new_index, name=series.name) @@ -2578,7 +2412,7 @@ class MultiIndex(Index): if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing: # Convert key '2016-01-01' to # ('2016-01-01'[, slice(None, None, None)]+) - key = (key,) + (slice(None),) * (len(self.levels) - 1) + key = tuple([key] + [slice(None)] * (len(self.levels) - 1)) if isinstance(key, tuple): # Convert (..., '2016-01-01', ...) in tuple to @@ -2637,6 +2471,10 @@ class MultiIndex(Index): return ensure_platform_int(indexer) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + return super().get_indexer_non_unique(target) + def get_slice_bound( self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str ) -> int: @@ -2768,17 +2606,9 @@ class MultiIndex(Index): return start + section.searchsorted(loc, side=side) idx = self._get_loc_single_level_index(lev, lab) - if isinstance(idx, slice) and k < n - 1: - # Get start and end value from slice, necessary when a non-integer - # interval is given as input GH#37707 - start = idx.start - end = idx.stop - elif k < n - 1: + if k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") - elif isinstance(idx, slice): - idx = idx.start - return start + section.searchsorted(idx, side=side) else: return start + section.searchsorted(idx, side=side) @@ -2956,29 +2786,16 @@ class MultiIndex(Index): >>> mi.get_loc_level(['b', 'e']) (1, None) """ - if not isinstance(level, (list, tuple)): - level = self._get_level_number(level) - else: - level = [self._get_level_number(lev) for lev in level] - return self._get_loc_level(key, level=level, drop_level=drop_level) - - def _get_loc_level( - self, key, level: Union[int, List[int]] = 0, drop_level: bool = True - ): - """ - get_loc_level but with `level` known to be positional, not name-based. 
- """ - # different name to distinguish from maybe_droplevels def maybe_mi_droplevels(indexer, levels, drop_level: bool): if not drop_level: return self[indexer] # kludge around orig_index = new_index = self[indexer] - + levels = [self._get_level_number(i) for i in levels] for i in sorted(levels, reverse=True): try: - new_index = new_index._drop_level_numbers([i]) + new_index = new_index.droplevel(i) except ValueError: # no dropping here @@ -2992,7 +2809,7 @@ class MultiIndex(Index): ) result = None for lev, k in zip(level, key): - loc, new_index = self._get_loc_level(k, level=lev) + loc, new_index = self.get_loc_level(k, level=lev) if isinstance(loc, slice): mask = np.zeros(len(self), dtype=bool) mask[loc] = True @@ -3002,6 +2819,8 @@ class MultiIndex(Index): return result, maybe_mi_droplevels(result, level, drop_level) + level = self._get_level_number(level) + # kludge for #1796 if isinstance(key, list): key = tuple(key) @@ -3066,8 +2885,7 @@ class MultiIndex(Index): indexer = self._get_level_indexer(key, level=level) return indexer, maybe_mi_droplevels(indexer, [level], drop_level) - def _get_level_indexer(self, key, level: int = 0, indexer=None): - # `level` kwarg is _always_ positional, never name + def _get_level_indexer(self, key, level=0, indexer=None): # return an indexer, boolean array or a slice showing where the key is # in the totality of values # if the indexer is provided, then use this @@ -3079,11 +2897,8 @@ class MultiIndex(Index): # given the inputs and the codes/indexer, compute an indexer set # if we have a provided indexer, then this need not consider # the entire labels set - if step is not None and step < 0: - # Switch elements for negative step size - start, stop = stop - 1, start - 1 - r = np.arange(start, stop, step) + r = np.arange(start, stop, step) if indexer is not None and len(indexer) != len(codes): # we have an indexer which maps the locations in the labels @@ -3117,8 +2932,6 @@ class MultiIndex(Index): start = 0 if key.stop is not None: stop = level_index.get_loc(key.stop) - elif isinstance(start, slice): - stop = len(level_index) else: stop = len(level_index) - 1 step = key.step @@ -3153,27 +2966,22 @@ class MultiIndex(Index): else: - idx = self._get_loc_single_level_index(level_index, key) + code = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted - locs = np.array(level_codes == idx, dtype=bool, copy=False) + locs = np.array(level_codes == code, dtype=bool, copy=False) if not locs.any(): # The label is present in self.levels[level] but unused: raise KeyError(key) return locs - if isinstance(idx, slice): - start = idx.start - end = idx.stop - else: - start = level_codes.searchsorted(idx, side="left") - end = level_codes.searchsorted(idx, side="right") - - if start == end: + i = level_codes.searchsorted(code, side="left") + j = level_codes.searchsorted(code, side="right") + if i == j: # The label is present in self.levels[level] but unused: raise KeyError(key) - return slice(start, end) + return slice(i, j) def get_locs(self, seq): """ @@ -3209,6 +3017,7 @@ class MultiIndex(Index): >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP array([2], dtype=int64) """ + from pandas.core.indexes.numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -3238,53 +3047,44 @@ class MultiIndex(Index): r = r.nonzero()[0] return Int64Index(r) - def _update_indexer( - idxr: Optional[Index], 
indexer: Optional[Index], key - ) -> Index: + def _update_indexer(idxr, indexer=indexer): if indexer is None: indexer = Index(np.arange(n)) if idxr is None: return indexer - indexer_intersection = indexer.intersection(idxr) - if indexer_intersection.empty and not idxr.empty and not indexer.empty: - raise KeyError(key) - return indexer_intersection + return indexer & idxr for i, k in enumerate(seq): if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - indexer = _update_indexer( - _convert_to_indexer(k), indexer=indexer, key=seq - ) + indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) - indexers: Optional[Int64Index] = None + indexers = None for x in k: try: idxrs = _convert_to_indexer( self._get_level_indexer(x, level=i, indexer=indexer) ) - indexers = (idxrs if indexers is None else indexers).union( - idxrs, sort=False - ) + indexers = idxrs if indexers is None else indexers | idxrs except KeyError: # ignore not founds continue if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer, key=seq) + indexer = _update_indexer(indexers, indexer=indexer) else: # no matches we are done return np.array([], dtype=np.int64) elif com.is_null_slice(k): # empty slice - indexer = _update_indexer(None, indexer=indexer, key=seq) + indexer = _update_indexer(None, indexer=indexer) elif isinstance(k, slice): @@ -3294,7 +3094,6 @@ class MultiIndex(Index): self._get_level_indexer(k, level=i, indexer=indexer) ), indexer=indexer, - key=seq, ) else: # a single label @@ -3303,7 +3102,6 @@ class MultiIndex(Index): self.get_loc_level(k, level=i, drop_level=False)[0] ), indexer=indexer, - key=seq, ) # empty indexer @@ -3315,8 +3113,6 @@ class MultiIndex(Index): return indexer._values - # -------------------------------------------------------------------- - def _reorder_indexer( self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], @@ -3346,21 +3142,16 @@ class MultiIndex(Index): k_codes = k_codes[k_codes >= 0] # Filter absent keys # True if the given codes are not ordered need_sort = (k_codes[:-1] > k_codes[1:]).any() - elif isinstance(k, slice) and k.step is not None and k.step < 0: - need_sort = True # Bail out if both index and seq are sorted if not need_sort: return indexer n = len(self) - keys: Tuple[np.ndarray, ...] = () + keys: Tuple[np.ndarray, ...] 
= tuple() # For each level of the sequence in seq, map the level codes with the # order they appears in a list-like sequence # This mapping is then use to reorder the indexer for i, k in enumerate(seq): - if is_scalar(k): - # GH#34603 we want to treat a scalar the same as an all equal list - k = [k] if com.is_bool_indexer(k): new_order = np.arange(n)[indexer] elif is_list_like(k): @@ -3374,11 +3165,6 @@ class MultiIndex(Index): key_order_map[level_indexer] = np.arange(len(level_indexer)) new_order = key_order_map[self.codes[i][indexer]] - elif isinstance(k, slice) and k.step is not None and k.step < 0: - new_order = np.arange(n)[k][indexer] - elif isinstance(k, slice) and k.start is None and k.stop is None: - # slice(None) should not determine order GH#31330 - new_order = np.ones((n,))[indexer] else: # For all other case, use the same order as the level new_order = np.arange(n)[indexer] @@ -3422,7 +3208,7 @@ class MultiIndex(Index): verify_integrity=False, ) - def equals(self, other: object) -> bool: + def equals(self, other) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3437,19 +3223,21 @@ class MultiIndex(Index): if not isinstance(other, Index): return False - if len(self) != len(other): - return False - if not isinstance(other, MultiIndex): # d-level MultiIndex can equal d-tuple Index if not is_object_dtype(other.dtype): # other cannot contain tuples, so cannot match self return False + elif len(self) != len(other): + return False return array_equivalent(self._values, other._values) if self.nlevels != other.nlevels: return False + if len(self) != len(other): + return False + for i in range(self.nlevels): self_codes = self.codes[i] self_codes = self_codes[self_codes != -1] @@ -3463,10 +3251,11 @@ class MultiIndex(Index): np.asarray(other.levels[i]._values), other_codes, allow_fill=False ) - # since we use NaT both datetime64 and timedelta64 we can have a - # situation where a level is typed say timedelta64 in self (IOW it - # has other values than NaT) but types datetime64 in other (where - # its all NaT) but these are equivalent + # since we use NaT both datetime64 and timedelta64 + # we can have a situation where a level is typed say + # timedelta64 in self (IOW it has other values than NaT) + # but types datetime64 in other (where its all NaT) + # but these are equivalent if len(self_values) == 0 and len(other_values) == 0: continue @@ -3567,12 +3356,7 @@ class MultiIndex(Index): other, result_names = self._convert_can_do_setop(other) if len(other) == 0 or self.equals(other): - return self.rename(result_names) - - return self._union(other, sort=sort) - - def _union(self, other, sort): - other, result_names = self._convert_can_do_setop(other) + return self # TODO: Index.union returns other when `len(self)` is 0. @@ -3588,9 +3372,6 @@ class MultiIndex(Index): zip(*uniq_tuples), sortorder=0, names=result_names ) - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - return is_object_dtype(dtype) - def intersection(self, other, sort=False): """ Form the intersection of two MultiIndex objects. 
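Illustration only, not part of the patch: the hunks above rework MultiIndex.union/intersection, and the following minimal sketch shows the public behaviour they touch, assuming a recent pandas 1.x build and made-up index values.

import pandas as pd

left = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["key", "num"])
right = pd.MultiIndex.from_tuples([("b", 2), ("c", 3)], names=["key", "num"])

# intersection keeps only the tuples present in both indexes; the level names
# are preserved here because both operands carry the same names
print(left.intersection(right))   # MultiIndex([('b', 2)], names=['key', 'num'])

# union combines the tuples; with sort left at its default the result is sorted
print(left.union(right))          # ('a', 1), ('b', 2), ('c', 3)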
@@ -3618,17 +3399,18 @@ class MultiIndex(Index): if self.equals(other): if self.has_duplicates: - return self.unique().rename(result_names) - return self.rename(result_names) + return self.unique() + return self - return self._intersection(other, sort=sort) - - def _intersection(self, other, sort=False): - other, result_names = self._convert_can_do_setop(other) - - if not self._is_comparable_dtype(other.dtype): + if not is_object_dtype(other.dtype): # The intersection is empty - return self[:0].rename(result_names) + # TODO: we have no tests that get here + return MultiIndex( + levels=self.levels, + codes=[[]] * self.nlevels, + names=result_names, + verify_integrity=False, + ) lvals = self._values rvals = other._values @@ -3646,13 +3428,8 @@ class MultiIndex(Index): if uniq_tuples is None: other_uniq = set(rvals) seen = set() - # pandas\core\indexes\multi.py:3503: error: "add" of "set" does not - # return a value [func-returns-value] uniq_tuples = [ - x - for x in lvals - if x in other_uniq - and not (x in seen or seen.add(x)) # type: ignore[func-returns-value] + x for x in lvals if x in other_uniq and not (x in seen or seen.add(x)) ] if sort is None: @@ -3696,7 +3473,7 @@ class MultiIndex(Index): other, result_names = self._convert_can_do_setop(other) if len(other) == 0: - return self.rename(result_names) + return self if self.equals(other): return MultiIndex( @@ -3732,32 +3509,21 @@ class MultiIndex(Index): if not isinstance(other, Index): if len(other) == 0: - return self[:0], self.names + other = MultiIndex( + levels=[[]] * self.nlevels, + codes=[[]] * self.nlevels, + verify_integrity=False, + ) else: msg = "other must be a MultiIndex or a list of tuples" try: - other = MultiIndex.from_tuples(other, names=self.names) - except (ValueError, TypeError) as err: - # ValueError raised by tuples_to_object_array if we - # have non-object dtype + other = MultiIndex.from_tuples(other) + except TypeError as err: raise TypeError(msg) from err else: - result_names = get_unanimous_names(self, other) - + result_names = self.names if self.names == other.names else None return other, result_names - def symmetric_difference(self, other, result_name=None, sort=None): - # On equal symmetric_difference MultiIndexes the difference is empty. 
- # Therefore, an empty MultiIndex is returned GH13490 - tups = Index.symmetric_difference(self, other, result_name, sort) - if len(tups) == 0: - return type(self)( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - names=tups.name, - ) - return type(self).from_tuples(tups, names=tups.name) - # -------------------------------------------------------------------- @doc(Index.astype) @@ -3768,22 +3534,13 @@ class MultiIndex(Index): raise NotImplementedError(msg) elif not is_object_dtype(dtype): raise TypeError( - "Setting a MultiIndex dtype to anything other than object " - "is not supported" + f"Setting {type(self)} dtype to anything other " + "than object is not supported" ) elif copy is True: return self._shallow_copy() return self - def _validate_fill_value(self, item): - if not isinstance(item, tuple): - # Pad the key with empty strings if lower levels of the key - # aren't specified: - item = (item,) + ("",) * (self.nlevels - 1) - elif len(item) != self.nlevels: - raise ValueError("Item must have length equal to number of levels.") - return item - def insert(self, loc: int, item): """ Make new MultiIndex inserting new item at location @@ -3798,7 +3555,12 @@ class MultiIndex(Index): ------- new_index : Index """ - item = self._validate_fill_value(item) + # Pad the key with empty strings if lower levels of the key + # aren't specified: + if not isinstance(item, tuple): + item = (item,) + ("",) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") new_levels = [] new_codes = [] @@ -3808,12 +3570,7 @@ class MultiIndex(Index): # must insert at end otherwise you have to recompute all the # other codes lev_loc = len(level) - try: - level = level.insert(lev_loc, k) - except TypeError: - # TODO: Should this be done inside insert? - # TODO: smarter casting rules? 
- level = level.astype(object).insert(lev_loc, k) + level = level.insert(lev_loc, k) else: lev_loc = level.get_loc(k) @@ -3840,6 +3597,10 @@ class MultiIndex(Index): verify_integrity=False, ) + def _wrap_joined_index(self, joined, other): + names = self.names if self.names == other.names else None + return MultiIndex.from_tuples(joined, names=names) + @doc(Index.isin) def isin(self, values, level=None): if level is None: @@ -3853,35 +3614,13 @@ class MultiIndex(Index): return np.zeros(len(levs), dtype=np.bool_) return levs.isin(values) - # --------------------------------------------------------------- - # Arithmetic/Numeric Methods - Disabled - __add__ = make_invalid_op("__add__") - __radd__ = make_invalid_op("__radd__") - __iadd__ = make_invalid_op("__iadd__") - __sub__ = make_invalid_op("__sub__") - __rsub__ = make_invalid_op("__rsub__") - __isub__ = make_invalid_op("__isub__") - __pow__ = make_invalid_op("__pow__") - __rpow__ = make_invalid_op("__rpow__") - __mul__ = make_invalid_op("__mul__") - __rmul__ = make_invalid_op("__rmul__") - __floordiv__ = make_invalid_op("__floordiv__") - __rfloordiv__ = make_invalid_op("__rfloordiv__") - __truediv__ = make_invalid_op("__truediv__") - __rtruediv__ = make_invalid_op("__rtruediv__") - __mod__ = make_invalid_op("__mod__") - __rmod__ = make_invalid_op("__rmod__") - __divmod__ = make_invalid_op("__divmod__") - __rdivmod__ = make_invalid_op("__rdivmod__") - # Unary methods disabled - __neg__ = make_invalid_op("__neg__") - __pos__ = make_invalid_op("__pos__") - __abs__ = make_invalid_op("__abs__") - __inv__ = make_invalid_op("__inv__") +MultiIndex._add_numeric_methods_disabled() +MultiIndex._add_numeric_methods_add_sub_disabled() +MultiIndex._add_logical_methods_disabled() -def sparsify_labels(label_list, start: int = 0, sentinel=""): +def _sparsify(label_list, start: int = 0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) @@ -3931,13 +3670,13 @@ def maybe_droplevels(index, key): if isinstance(key, tuple): for _ in key: try: - index = index._drop_level_numbers([0]) + index = index.droplevel(0) except ValueError: # we have dropped too much, so back out return original_index else: try: - index = index._drop_level_numbers([0]) + index = index.droplevel(0) except ValueError: pass diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/numeric.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/numeric.py index ed76e26..7319079 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/numeric.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/numeric.py @@ -1,11 +1,10 @@ from typing import Any -import warnings import numpy as np from pandas._libs import index as libindex, lib -from pandas._typing import Dtype, DtypeObj, Label -from pandas.util._decorators import doc +from pandas._typing import Dtype, Label +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( @@ -16,20 +15,27 @@ from pandas.core.dtypes.common import ( is_float, is_float_dtype, is_integer_dtype, - is_numeric_dtype, is_scalar, is_signed_integer_dtype, is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.generic import ( + ABCFloat64Index, + ABCInt64Index, + ABCRangeIndex, + ABCSeries, + ABCUInt64Index, +) +from pandas.core.dtypes.missing import isna +from pandas.core import algorithms 
import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name +from pandas.core.ops import get_op_result_name -_num_index_shared_docs = {} +_num_index_shared_docs = dict() class NumericIndex(Index): @@ -39,10 +45,7 @@ class NumericIndex(Index): This is an abstract class. """ - _default_dtype: np.dtype - _is_numeric_dtype = True - _can_hold_strings = False def __new__(cls, data=None, dtype=None, copy=False, name=None): cls._validate_dtype(dtype) @@ -92,18 +95,13 @@ class NumericIndex(Index): f"Incorrect `dtype` passed: expected {expected}, received {dtype}" ) - # ---------------------------------------------------------------- - # Indexing Methods - @doc(Index._maybe_cast_slice_bound) - def _maybe_cast_slice_bound(self, label, side: str, kind): + def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) - # ---------------------------------------------------------------- - @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = lib.no_default): if values is not None and not self._can_hold_na and values.dtype.kind == "f": @@ -112,7 +110,7 @@ class NumericIndex(Index): return Float64Index._simple_new(values, name=name) return super()._shallow_copy(values=values, name=name) - def _validate_fill_value(self, value): + def _convert_for_op(self, value): """ Convert value to be insertable to ndarray. """ @@ -120,14 +118,6 @@ class NumericIndex(Index): # force conversion to object # so we don't lose the bools raise TypeError - elif isinstance(value, str) or lib.is_complex(value): - raise TypeError - elif is_scalar(value) and isna(value): - if is_valid_nat_for_dtype(value, self.dtype): - value = self._na_value - else: - # NaT, np.datetime64("NaT"), np.timedelta64("NaT") - raise TypeError return value @@ -148,10 +138,6 @@ class NumericIndex(Index): ) return tolerance - def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: - # If we ever have BoolIndex or ComplexIndex, this may need to be tightened - return is_numeric_dtype(dtype) - @classmethod def _assert_safe_casting(cls, data, subarr): """ @@ -162,7 +148,7 @@ class NumericIndex(Index): pass @property - def _is_all_dates(self) -> bool: + def is_all_dates(self) -> bool: """ Checks that all the labels are datetime objects. """ @@ -170,11 +156,9 @@ class NumericIndex(Index): @doc(Index.insert) def insert(self, loc: int, item): - try: - item = self._validate_fill_value(item) - except TypeError: - return self.astype(object).insert(loc, item) - + # treat NA values as nans: + if is_scalar(item) and isna(item): + item = self._na_value return super().insert(loc, item) def _union(self, other, sort): @@ -198,7 +182,7 @@ class NumericIndex(Index): _num_index_shared_docs[ "class_descr" ] = """ - Immutable sequence used for indexing and alignment. The basic object + Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects. %(klass)s is a special case of `Index` with purely %(ltype)s labels. %(extra)s. @@ -228,12 +212,7 @@ _num_index_shared_docs[ An Index instance can **only** contain hashable objects. 
""" -_int64_descr_args = { - "klass": "Int64Index", - "ltype": "integer", - "dtype": "int64", - "extra": "", -} +_int64_descr_args = dict(klass="Int64Index", ltype="integer", dtype="int64", extra="") class IntegerIndex(NumericIndex): @@ -242,20 +221,6 @@ class IntegerIndex(NumericIndex): """ _default_dtype: np.dtype - _can_hold_na = False - - @classmethod - def _assert_safe_casting(cls, data, subarr): - """ - Ensure incoming data can be represented with matching signed-ness. - """ - if data.dtype.kind != cls._default_dtype.kind: - if not np.array_equal(data, subarr): - raise TypeError("Unsafe NumPy casting, you must explicitly cast") - - def _can_union_without_object_cast(self, other) -> bool: - # See GH#26778, further casting may occur in NumericIndex._union - return other.dtype == "f8" or other.dtype == self.dtype def __contains__(self, key) -> bool: """ @@ -279,11 +244,6 @@ class IntegerIndex(NumericIndex): @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak - warnings.warn( - "Index.asi8 is deprecated and will be removed in a future version", - FutureWarning, - stacklevel=2, - ) return self._values.view(self._default_dtype) @@ -291,28 +251,46 @@ class Int64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args _typ = "int64index" + _can_hold_na = False _engine_type = libindex.Int64Engine _default_dtype = np.dtype(np.int64) + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Int64Index(joined, name=name) -_uint64_descr_args = { - "klass": "UInt64Index", - "ltype": "unsigned integer", - "dtype": "uint64", - "extra": "", -} + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as ints. + """ + if not issubclass(data.dtype.type, np.signedinteger): + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _is_compatible_with_other(self, other) -> bool: + return super()._is_compatible_with_other(other) or all( + isinstance(obj, (ABCInt64Index, ABCFloat64Index, ABCRangeIndex)) + for obj in [self, other] + ) + + +Int64Index._add_numeric_methods() +Int64Index._add_logical_methods() + +_uint64_descr_args = dict( + klass="UInt64Index", ltype="unsigned integer", dtype="uint64", extra="" +) class UInt64Index(IntegerIndex): __doc__ = _num_index_shared_docs["class_descr"] % _uint64_descr_args _typ = "uint64index" + _can_hold_na = False _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) - # ---------------------------------------------------------------- - # Indexing Methods - @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so that the values returned @@ -325,13 +303,40 @@ class UInt64Index(IntegerIndex): return com.asarray_tuplesafe(keyarr, dtype=dtype) + @doc(Index._convert_index_indexer) + def _convert_index_indexer(self, keyarr): + # Cast the indexer to uint64 if possible so + # that the values returned from indexing are + # also uint64. + if keyarr.is_integer(): + return keyarr.astype(np.uint64) + return keyarr -_float64_descr_args = { - "klass": "Float64Index", - "dtype": "float64", - "ltype": "float", - "extra": "", -} + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return UInt64Index(joined, name=name) + + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as uints. 
+ """ + if not issubclass(data.dtype.type, np.unsignedinteger): + if not np.array_equal(data, subarr): + raise TypeError("Unsafe NumPy casting, you must explicitly cast") + + def _is_compatible_with_other(self, other) -> bool: + return super()._is_compatible_with_other(other) or all( + isinstance(obj, (ABCUInt64Index, ABCFloat64Index)) for obj in [self, other] + ) + + +UInt64Index._add_numeric_methods() +UInt64Index._add_logical_methods() + +_float64_descr_args = dict( + klass="Float64Index", dtype="float64", ltype="float", extra="" +) class Float64Index(NumericIndex): @@ -339,7 +344,7 @@ class Float64Index(NumericIndex): _typ = "float64index" _engine_type = libindex.Float64Engine - _default_dtype = np.dtype(np.float64) + _default_dtype = np.float64 @property def inferred_type(self) -> str: @@ -378,22 +383,6 @@ class Float64Index(NumericIndex): # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) - @doc(Index.get_loc) - def get_loc(self, key, method=None, tolerance=None): - if is_bool(key): - # Catch this to avoid accidentally casting to 1.0 - raise KeyError(key) - - if is_float(key) and np.isnan(key): - nan_idxs = self._nan_idxs - if not len(nan_idxs): - raise KeyError(key) - elif len(nan_idxs) == 1: - return nan_idxs[0] - return nan_idxs - - return super().get_loc(key, method=method, tolerance=tolerance) - # ---------------------------------------------------------------- def _format_native_types( @@ -418,6 +407,40 @@ class Float64Index(NumericIndex): return is_float(other) and np.isnan(other) and self.hasnans - def _can_union_without_object_cast(self, other) -> bool: - # See GH#26778, further casting may occur in NumericIndex._union - return is_numeric_dtype(other.dtype) + @doc(Index.get_loc) + def get_loc(self, key, method=None, tolerance=None): + if is_bool(key): + # Catch this to avoid accidentally casting to 1.0 + raise KeyError(key) + + if is_float(key) and np.isnan(key): + nan_idxs = self._nan_idxs + if not len(nan_idxs): + raise KeyError(key) + elif len(nan_idxs) == 1: + return nan_idxs[0] + return nan_idxs + + return super().get_loc(key, method=method, tolerance=tolerance) + + @cache_readonly + def is_unique(self) -> bool: + return super().is_unique and self._nan_idxs.size < 2 + + @doc(Index.isin) + def isin(self, values, level=None): + if level is not None: + self._validate_index_level(level) + return algorithms.isin(np.array(self), values) + + def _is_compatible_with_other(self, other) -> bool: + return super()._is_compatible_with_other(other) or all( + isinstance( + obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + ) + for obj in [self, other] + ) + + +Float64Index._add_numeric_methods() +Float64Index._add_logical_methods_disabled() diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/period.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/period.py index b60828b..03e11b6 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/period.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/period.py @@ -1,17 +1,18 @@ from datetime import datetime, timedelta from typing import Any -import warnings import numpy as np -from pandas._libs import index as libindex, lib +from pandas._libs import index as libindex +from pandas._libs.lib import no_default from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Label from 
pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( + ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -43,7 +44,7 @@ from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name _index_doc_kwargs = dict(ibase._index_doc_kwargs) -_index_doc_kwargs.update({"target_klass": "PeriodIndex or list of Periods"}) +_index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) # --- Period index sketch @@ -60,12 +61,12 @@ def _new_PeriodIndex(cls, **d): @inherit_names( - ["strftime", "start_time", "end_time"] + PeriodArray._field_ops, + ["strftime", "to_timestamp", "start_time", "end_time"] + PeriodArray._field_ops, PeriodArray, wrap=True, ) @inherit_names(["is_leap_year", "_format_native_types"], PeriodArray) -class PeriodIndex(DatetimeIndexOpsMixin): +class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in time. @@ -95,9 +96,7 @@ class PeriodIndex(DatetimeIndexOpsMixin): ---------- day dayofweek - day_of_week dayofyear - day_of_year days_in_month daysinmonth end_time @@ -146,42 +145,17 @@ class PeriodIndex(DatetimeIndexOpsMixin): _data: PeriodArray freq: BaseOffset - _data_cls = PeriodArray _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True # -------------------------------------------------------------------- - # methods that dispatch to array and wrap result in Index - # These are defined here instead of via inherit_names for mypy + # methods that dispatch to array and wrap result in PeriodIndex @doc(PeriodArray.asfreq) def asfreq(self, freq=None, how: str = "E") -> "PeriodIndex": arr = self._data.asfreq(freq, how) return type(self)._simple_new(arr, name=self.name) - @doc(PeriodArray.to_timestamp) - def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: - arr = self._data.to_timestamp(freq, how) - return DatetimeIndex._simple_new(arr, name=self.name) - - # error: Decorated property not supported [misc] - @property # type:ignore[misc] - @doc(PeriodArray.hour.fget) - def hour(self) -> Int64Index: - return Int64Index(self._data.hour, name=self.name) - - # error: Decorated property not supported [misc] - @property # type:ignore[misc] - @doc(PeriodArray.minute.fget) - def minute(self) -> Int64Index: - return Int64Index(self._data.minute, name=self.name) - - # error: Decorated property not supported [misc] - @property # type:ignore[misc] - @doc(PeriodArray.second.fget) - def second(self) -> Int64Index: - return Int64Index(self._data.second, name=self.name) - # ------------------------------------------------------------------------ # Index Constructors @@ -235,7 +209,7 @@ class PeriodIndex(DatetimeIndexOpsMixin): if data is None and ordinal is not None: # we strangely ignore `ordinal` if data is passed. ordinal = np.asarray(ordinal, dtype=np.int64) - data = PeriodArray(ordinal, freq=freq) + data = PeriodArray(ordinal, freq) else: # don't pass copy here, since we copy later. data = period_array(data=data, freq=freq) @@ -245,12 +219,49 @@ class PeriodIndex(DatetimeIndexOpsMixin): return cls._simple_new(data, name=name) + @classmethod + def _simple_new(cls, values: PeriodArray, name: Label = None): + """ + Create a new PeriodIndex. + + Parameters + ---------- + values : PeriodArray + Values that can be converted to a PeriodArray without inference + or coercion. 
+ """ + assert isinstance(values, PeriodArray), type(values) + + result = object.__new__(cls) + result._data = values + # For groupby perf. See note in indexes/base about _index_data + result._index_data = values._data + result.name = name + result._cache = {} + result._reset_identity() + return result + # ------------------------------------------------------------------------ # Data @property - def values(self) -> np.ndarray: - return np.asarray(self, dtype=object) + def values(self): + return np.asarray(self) + + @property + def _has_complex_internals(self): + # used to avoid libreduction code paths, which raise or require conversion + return True + + def _shallow_copy(self, values=None, name: Label = no_default): + name = name if name is not no_default else self.name + cache = self._cache.copy() if values is None else {} + if values is None: + values = self._data + + result = self._simple_new(values, name=name) + result._cache = cache + return result def _maybe_convert_timedelta(self, other): """ @@ -303,6 +314,10 @@ class PeriodIndex(DatetimeIndexOpsMixin): # how to represent ourselves to matplotlib return self.astype(object)._values + @property + def _formatter_func(self): + return self.array._formatter(boxed=False) + # ------------------------------------------------------------------------ # Indexing @@ -330,13 +345,10 @@ class PeriodIndex(DatetimeIndexOpsMixin): def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc and other functions. + Gets called after a ufunc. Needs additional handling as + PeriodIndex stores internal data as int dtype - Needs additional handling as PeriodIndex stores internal data as int - dtype - - Replace this to __numpy_ufunc__ in future version and implement - __array_function__ for Indexes + Replace this to __numpy_ufunc__ in future version """ if isinstance(context, tuple) and len(context) > 0: func = context[0] @@ -364,39 +376,39 @@ class PeriodIndex(DatetimeIndexOpsMixin): # cannot pass _simple_new as it is return type(self)(result, freq=self.freq, name=self.name) - def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: + def asof_locs(self, where, mask: np.ndarray) -> np.ndarray: """ where : array of timestamps mask : array of booleans where data is not NA """ - if isinstance(where, DatetimeIndex): - where = PeriodIndex(where._values, freq=self.freq) - elif not isinstance(where, PeriodIndex): + where_idx = where + if isinstance(where_idx, DatetimeIndex): + where_idx = PeriodIndex(where_idx._values, freq=self.freq) + elif not isinstance(where_idx, PeriodIndex): raise TypeError("asof_locs `where` must be DatetimeIndex or PeriodIndex") + elif where_idx.freq != self.freq: + raise raise_on_incompatible(self, where_idx) - return super().asof_locs(where, mask) + locs = self.asi8[mask].searchsorted(where_idx.asi8, side="right") + + locs = np.where(locs > 0, locs - 1, 0) + result = np.arange(len(self))[mask].take(locs) + + first = mask.argmax() + result[(locs == 0) & (where_idx.asi8 < self.asi8[first])] = -1 + + return result @doc(Index.astype) - def astype(self, dtype, copy: bool = True, how=lib.no_default): + def astype(self, dtype, copy=True, how="start"): dtype = pandas_dtype(dtype) - if how is not lib.no_default: - # GH#37982 - warnings.warn( - "The 'how' keyword in PeriodIndex.astype is deprecated and " - "will be removed in a future version. 
" - "Use index.to_timestamp(how=how) instead", - FutureWarning, - stacklevel=2, - ) - else: - how = "start" - if is_datetime64_any_dtype(dtype): # 'how' is index-specific, isn't part of the EA interface. tz = getattr(dtype, "tz", None) return self.to_timestamp(how=how).tz_localize(tz) + # TODO: should probably raise on `how` here, so we don't ignore it. return super().astype(dtype, copy=copy) @property @@ -407,7 +419,7 @@ class PeriodIndex(DatetimeIndexOpsMixin): """ if len(self) == 0: return True - if not self.is_monotonic_increasing: + if not self.is_monotonic: raise ValueError("Index is not monotonic") values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() @@ -418,45 +430,17 @@ class PeriodIndex(DatetimeIndexOpsMixin): # indexing return "period" - def insert(self, loc: int, item): - if not isinstance(item, Period) or self.freq != item.freq: - return self.astype(object).insert(loc, item) - - return DatetimeIndexOpsMixin.insert(self, loc, item) - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - self._assert_can_do_setop(other) - - if not isinstance(other, PeriodIndex): - return self.astype(object).join( - other, how=how, level=level, return_indexers=return_indexers, sort=sort - ) - - # _assert_can_do_setop ensures we have matching dtype - result = super().join( - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - return result - - # ------------------------------------------------------------------------ - # Indexing Methods - @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if not self._should_compare(target): - return self._get_indexer_non_comparable(target, method, unique=True) - if isinstance(target, PeriodIndex): - target = target._get_engine_target() # i.e. target.asi8 + if target.freq != self.freq: + # No matches + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches + + target = target.asi8 self_index = self._int64index else: self_index = self @@ -469,6 +453,19 @@ class PeriodIndex(DatetimeIndexOpsMixin): return Index.get_indexer(self_index, target, method, limit, tolerance) + @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + + target = target.asi8 + + indexer, missing = self._int64index.get_indexer_non_unique(target) + return ensure_platform_int(indexer), missing + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label. 
@@ -504,7 +501,7 @@ class PeriodIndex(DatetimeIndexOpsMixin): try: asdt, reso = parse_time_string(key, self.freq) - except (ValueError, DateParseError) as err: + except DateParseError as err: # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") from err @@ -575,9 +572,10 @@ class PeriodIndex(DatetimeIndexOpsMixin): return bounds[0 if side == "left" else 1] except ValueError as err: # string cannot be parsed as datetime-like - raise self._invalid_indexer("slice", label) from err + # TODO: we need tests for this case + raise KeyError(label) from err elif is_integer(label) or is_float(label): - raise self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) return label @@ -597,14 +595,47 @@ class PeriodIndex(DatetimeIndexOpsMixin): # why is that check not needed? raise ValueError - def _get_string_slice(self, key: str): + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + # TODO: Check for non-True use_lhs/use_rhs parsed, reso = parse_time_string(key, self.freq) reso = Resolution.from_attrname(reso) try: - return self._partial_date_slice(reso, parsed) + return self._partial_date_slice(reso, parsed, use_lhs, use_rhs) except KeyError as err: raise KeyError(key) from err + def insert(self, loc, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.astype(object).insert(loc, item) + + i8result = np.concatenate( + (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) + ) + arr = type(self._data)._simple_new(i8result, dtype=self.dtype) + return type(self)._simple_new(arr, name=self.name) + + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + if not isinstance(other, PeriodIndex): + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + # _assert_can_do_setop ensures we have matching dtype + result = Int64Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + return result + # ------------------------------------------------------------------------ # Set Operation Methods @@ -636,21 +667,15 @@ class PeriodIndex(DatetimeIndexOpsMixin): def intersection(self, other, sort=False): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, _ = self._convert_can_do_setop(other) + other = ensure_index(other) if self.equals(other): - if self.has_duplicates: - return self.unique()._get_reconciled_name_object(other) return self._get_reconciled_name_object(other) - return self._intersection(other, sort=sort) - - def _intersection(self, other, sort=False): - - if is_object_dtype(other.dtype): + elif is_object_dtype(other.dtype): return self.astype("O").intersection(other, sort=sort) - elif not self._is_comparable_dtype(other.dtype): + elif not is_dtype_equal(self.dtype, other.dtype): # We can infer that the intersection is empty. 
# assert_can_do_setop ensures that this is not just a mismatched freq this = self[:0].astype("O") @@ -662,14 +687,11 @@ class PeriodIndex(DatetimeIndexOpsMixin): def difference(self, other, sort=None): self._validate_sort_keyword(sort) self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) + other = ensure_index(other) if self.equals(other): - return self[:0].rename(result_name) - - return self._difference(other, sort=sort) - - def _difference(self, other, sort): + # pass an empty PeriodArray with the appropriate dtype + return type(self)._simple_new(self._data[:0], name=self.name) if is_object_dtype(other): return self.astype(object).difference(other).astype(self.dtype) @@ -695,13 +717,17 @@ class PeriodIndex(DatetimeIndexOpsMixin): # ------------------------------------------------------------------------ - def memory_usage(self, deep: bool = False) -> int: + def memory_usage(self, deep=False): result = super().memory_usage(deep=deep) if hasattr(self, "_cache") and "_int64index" in self._cache: result += self._int64index.memory_usage(deep=deep) return result +PeriodIndex._add_numeric_methods_disabled() +PeriodIndex._add_logical_methods_disabled() + + def period_range( start=None, end=None, periods=None, freq=None, name=None ) -> PeriodIndex: diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/range.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/range.py index ec896d9..dcc0bdd 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/range.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List, Optional, Tuple +from typing import Any, List, Optional import warnings import numpy as np @@ -9,6 +9,7 @@ import numpy as np from pandas._libs import index as libindex from pandas._libs.lib import no_default from pandas._typing import Label +import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, cache_readonly, doc @@ -17,9 +18,9 @@ from pandas.core.dtypes.common import ( ensure_python_int, is_float, is_integer, + is_integer_dtype, is_list_like, is_scalar, - is_signed_integer_dtype, is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCTimedeltaIndex @@ -29,7 +30,7 @@ import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name -from pandas.core.indexes.numeric import Float64Index, Int64Index +from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer _empty_range = range(0) @@ -52,12 +53,10 @@ class RangeIndex(Int64Index): If int and "stop" is not given, interpreted as "stop" instead. stop : int (default: 0) step : int (default: 1) - dtype : np.int64 - Unused, accepted for homogeneity with other index types. - copy : bool, default False - Unused, accepted for homogeneity with other index types. name : object, optional Name to be stored in the index. + copy : bool, default False + Unused, accepted for homogeneity with other index types. 
Attributes ---------- @@ -79,11 +78,13 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine _range: range + # check whether self._data has been called + _cached_data: Optional[np.ndarray] = None # -------------------------------------------------------------------- # Constructors def __new__( - cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, ): cls._validate_dtype(dtype) @@ -149,14 +150,20 @@ class RangeIndex(Int64Index): """ return the class to use for construction """ return Int64Index - @cache_readonly + @property def _data(self): """ An int array that for performance reasons is created only when needed. - The constructed array is saved in ``_cache``. + The constructed array is saved in ``_cached_data``. This allows us to + check if the array has been created without accessing ``_data`` and + triggering the construction. """ - return np.arange(self.start, self.stop, self.step, dtype=np.int64) + if self._cached_data is None: + self._cached_data = np.arange( + self.start, self.stop, self.step, dtype=np.int64 + ) + return self._cached_data @cache_readonly def _int64index(self) -> Int64Index: @@ -339,9 +346,6 @@ class RangeIndex(Int64Index): return False return key in self._range - # -------------------------------------------------------------------- - # Indexing Methods - @doc(Int64Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if method is None and tolerance is None: @@ -369,7 +373,7 @@ class RangeIndex(Int64Index): start, stop, step = reverse.start, reverse.stop, reverse.step target_array = np.asarray(target) - if not (is_signed_integer_dtype(target_array) and target_array.ndim == 1): + if not (is_integer_dtype(target_array) and target_array.ndim == 1): # checks/conversions/roundings are delegated to general method return super().get_indexer(target, method=method, tolerance=tolerance) @@ -383,42 +387,26 @@ class RangeIndex(Int64Index): locs[valid] = len(self) - 1 - locs[valid] return ensure_platform_int(locs) - # -------------------------------------------------------------------- - def tolist(self): return list(self._range) - @doc(Int64Index.__iter__) - def __iter__(self): - yield from self._range - @doc(Int64Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name - if values is not None: - if values.dtype.kind == "f": - return Float64Index(values, name=name) + if values is None: + result = self._simple_new(self._range, name=name) + result._cache = self._cache.copy() + return result + else: return Int64Index._simple_new(values, name=name) - result = self._simple_new(self._range, name=name) - result._cache = self._cache - return result - @doc(Int64Index.copy) - def copy(self, name=None, deep=False, dtype=None, names=None): - name = self._validate_names(name=name, names=names, deep=deep)[0] - new_index = self._shallow_copy(name=name) - - if dtype: - warnings.warn( - "parameter dtype is deprecated and will be removed in a future " - "version. 
Use the astype method instead.", - FutureWarning, - stacklevel=2, - ) - new_index = new_index.astype(dtype) - return new_index + def copy(self, name=None, deep=False, dtype=None, **kwargs): + self._validate_dtype(dtype) + if name is None: + name = self.name + return self.from_range(self._range, name=name) def _minmax(self, meth: str): no_steps = len(self) - 1 @@ -461,17 +449,7 @@ class RangeIndex(Int64Index): else: return np.arange(len(self) - 1, -1, -1) - def factorize( - self, sort: bool = False, na_sentinel: Optional[int] = -1 - ) -> Tuple[np.ndarray, "RangeIndex"]: - codes = np.arange(len(self), dtype=np.intp) - uniques = self - if sort and self.step < 0: - codes = codes[::-1] - uniques = uniques[::-1] - return codes, uniques - - def equals(self, other: object) -> bool: + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -479,14 +457,34 @@ class RangeIndex(Int64Index): return self._range == other._range return super().equals(other) - # -------------------------------------------------------------------- - # Set Operations + def intersection(self, other, sort=False): + """ + Form the intersection of two Index objects. - def _intersection(self, other, sort=False): + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Sort the resulting index if possible + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + Returns + ------- + intersection : Index + """ + self._validate_sort_keyword(sort) + + if self.equals(other): + return self._get_reconciled_name_object(other) if not isinstance(other, RangeIndex): - # Int64Index - return super()._intersection(other, sort=sort) + return super().intersection(other, sort=sort) if not len(self) or not len(other): return self._simple_new(_empty_range) @@ -527,7 +525,6 @@ class RangeIndex(Int64Index): new_index = new_index[::-1] if sort is None: new_index = new_index.sort_values() - return new_index def _min_fitting_element(self, lower_limit: int) -> int: @@ -626,63 +623,6 @@ class RangeIndex(Int64Index): return type(self)(start_r, end_r + step_o, step_o) return self._int64index._union(other, sort=sort) - def difference(self, other, sort=None): - # optimized set operation if we have another RangeIndex - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - other, result_name = self._convert_can_do_setop(other) - - if not isinstance(other, RangeIndex): - return super().difference(other, sort=sort) - - res_name = ops.get_op_result_name(self, other) - - first = self._range[::-1] if self.step < 0 else self._range - overlap = self.intersection(other) - if overlap.step < 0: - overlap = overlap[::-1] - - if len(overlap) == 0: - return self._shallow_copy(name=res_name) - if len(overlap) == len(self): - return self[:0].rename(res_name) - if not isinstance(overlap, RangeIndex): - # We wont end up with RangeIndex, so fall back - return super().difference(other, sort=sort) - if overlap.step != first.step: - # In some cases we might be able to get a RangeIndex back, - # but not worth the effort. 
- return super().difference(other, sort=sort) - - if overlap[0] == first.start: - # The difference is everything after the intersection - new_rng = range(overlap[-1] + first.step, first.stop, first.step) - elif overlap[-1] == first[-1]: - # The difference is everything before the intersection - new_rng = range(first.start, overlap[0], first.step) - else: - # The difference is not range-like - return super().difference(other, sort=sort) - - new_index = type(self)._simple_new(new_rng, name=res_name) - if first is not self._range: - new_index = new_index[::-1] - return new_index - - def symmetric_difference(self, other, result_name=None, sort=None): - if not isinstance(other, RangeIndex) or sort is not None: - return super().symmetric_difference(other, result_name, sort) - - left = self.difference(other) - right = other.difference(self) - result = left.union(right) - - if result_name is not None: - result = result.rename(result_name) - return result - - # -------------------------------------------------------------------- - @doc(Int64Index.join) def join(self, other, how="left", level=None, return_indexers=False, sort=False): if how == "outer" and self is not other: @@ -795,92 +735,89 @@ class RangeIndex(Int64Index): return self._simple_new(new_range, name=self.name) return self._int64index // other - # -------------------------------------------------------------------- - # Reductions - - def all(self, *args, **kwargs) -> bool: + def all(self) -> bool: return 0 not in self._range - def any(self, *args, **kwargs) -> bool: + def any(self) -> bool: return any(self._range) - # -------------------------------------------------------------------- + @classmethod + def _add_numeric_methods_binary(cls): + """ add in numeric methods, specialized to RangeIndex """ - def _cmp_method(self, other, op): - if isinstance(other, RangeIndex) and self._range == other._range: - # Both are immutable so if ._range attr. 
are equal, shortcut is possible - return super()._cmp_method(self, op) - return super()._cmp_method(other, op) + def _make_evaluate_binop(op, step=False): + """ + Parameters + ---------- + op : callable that accepts 2 params + perform the binary op + step : callable, optional, default to False + op to apply to the step parm if not None + if False, use the existing step + """ - def _arith_method(self, other, op): - """ - Parameters - ---------- - other : Any - op : callable that accepts 2 params - perform the binary op - """ + @unpack_zerodim_and_defer(op.__name__) + def _evaluate_numeric_binop(self, other): + if isinstance(other, ABCTimedeltaIndex): + # Defer to TimedeltaIndex implementation + return NotImplemented + elif isinstance(other, (timedelta, np.timedelta64)): + # GH#19333 is_integer evaluated True on timedelta64, + # so we need to catch these explicitly + return op(self._int64index, other) + elif is_timedelta64_dtype(other): + # Must be an np.ndarray; GH#22390 + return op(self._int64index, other) - if isinstance(other, ABCTimedeltaIndex): - # Defer to TimedeltaIndex implementation - return NotImplemented - elif isinstance(other, (timedelta, np.timedelta64)): - # GH#19333 is_integer evaluated True on timedelta64, - # so we need to catch these explicitly - return op(self._int64index, other) - elif is_timedelta64_dtype(other): - # Must be an np.ndarray; GH#22390 - return op(self._int64index, other) + other = extract_array(other, extract_numpy=True) + attrs = self._get_attributes_dict() - if op in [ - operator.pow, - ops.rpow, - operator.mod, - ops.rmod, - ops.rfloordiv, - divmod, - ops.rdivmod, - ]: - return op(self._int64index, other) + left, right = self, other - step = False - if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]: - step = op + try: + # apply if we have an override + if step: + with np.errstate(all="ignore"): + rstep = step(left.step, right) - other = extract_array(other, extract_numpy=True) - attrs = self._get_attributes_dict() + # we don't have a representable op + # so return a base index + if not is_integer(rstep) or not rstep: + raise ValueError - left, right = self, other + else: + rstep = left.step - try: - # apply if we have an override - if step: - with np.errstate(all="ignore"): - rstep = step(left.step, right) + with np.errstate(all="ignore"): + rstart = op(left.start, right) + rstop = op(left.stop, right) - # we don't have a representable op - # so return a base index - if not is_integer(rstep) or not rstep: - raise ValueError + result = type(self)(rstart, rstop, rstep, **attrs) - else: - rstep = left.step + # for compat with numpy / Int64Index + # even if we can represent as a RangeIndex, return + # as a Float64Index if we have float-like descriptors + if not all(is_integer(x) for x in [rstart, rstop, rstep]): + result = result.astype("float64") - with np.errstate(all="ignore"): - rstart = op(left.start, right) - rstop = op(left.stop, right) + return result - result = type(self)(rstart, rstop, rstep, **attrs) + except (ValueError, TypeError, ZeroDivisionError): + # Defer to Int64Index implementation + return op(self._int64index, other) + # TODO: Do attrs get handled reliably? 
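# Illustration only, not part of the patch: a rough sketch (with made-up values)
# of the RangeIndex arithmetic fast path that both sides of this hunk implement.
# Integer-preserving ops keep the lazy range representation; float-like results
# fall back to a materialised index.
import pandas as pd

idx = pd.RangeIndex(start=0, stop=5)   # RangeIndex(start=0, stop=5, step=1)

# multiplying by an integer can be expressed as a new range, so no ndarray
# needs to be allocated
print(idx * 2)                         # RangeIndex(start=0, stop=10, step=2)

# a float operand cannot be represented as a range of ints, so the result is
# materialised as Float64Index
print(idx + 0.5)                       # Float64Index([0.5, 1.5, 2.5, 3.5, 4.5], dtype='float64')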
- # for compat with numpy / Int64Index - # even if we can represent as a RangeIndex, return - # as a Float64Index if we have float-like descriptors - if not all(is_integer(x) for x in [rstart, rstop, rstep]): - result = result.astype("float64") + name = f"__{op.__name__}__" + return compat.set_function_name(_evaluate_numeric_binop, name, cls) - return result + cls.__add__ = _make_evaluate_binop(operator.add) + cls.__radd__ = _make_evaluate_binop(ops.radd) + cls.__sub__ = _make_evaluate_binop(operator.sub) + cls.__rsub__ = _make_evaluate_binop(ops.rsub) + cls.__mul__ = _make_evaluate_binop(operator.mul, step=operator.mul) + cls.__rmul__ = _make_evaluate_binop(ops.rmul, step=ops.rmul) + cls.__truediv__ = _make_evaluate_binop(operator.truediv, step=operator.truediv) + cls.__rtruediv__ = _make_evaluate_binop(ops.rtruediv, step=ops.rtruediv) - except (ValueError, TypeError, ZeroDivisionError): - # Defer to Int64Index implementation - return op(self._int64index, other) - # TODO: Do attrs get handled reliably? + +RangeIndex._add_numeric_methods() diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexes/timedeltas.py b/venv/lib/python3.8/site-packages/pandas/core/indexes/timedeltas.py index fcab3e1..af3b2d1 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexes/timedeltas.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexes/timedeltas.py @@ -2,12 +2,14 @@ from pandas._libs import index as libindex, lib from pandas._libs.tslibs import Timedelta, to_offset -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import doc from pandas.core.dtypes.common import ( TD64NS_DTYPE, + is_float, + is_integer, is_scalar, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -103,7 +105,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _typ = "timedeltaindex" - _data_cls = TimedeltaArray _engine_type = libindex.TimedeltaEngine _comparables = ["name", "freq"] @@ -152,15 +153,38 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): # - Cases checked above all return/raise before reaching here - # - tdarr = TimedeltaArray._from_sequence_not_strict( + tdarr = TimedeltaArray._from_sequence( data, freq=freq, unit=unit, dtype=dtype, copy=copy ) return cls._simple_new(tdarr, name=name) + @classmethod + def _simple_new(cls, values: TimedeltaArray, name: Label = None): + assert isinstance(values, TimedeltaArray) + + result = object.__new__(cls) + result._data = values + result._name = name + result._cache = {} + # For groupby perf. 
See note in indexes/base about _index_data + result._index_data = values._data + + result._reset_identity() + return result + + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_timedelta64 + + return _get_format_timedelta64(self, box=True) + # ------------------------------------------------------------------- @doc(Index.astype) - def astype(self, dtype, copy: bool = True): + def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): # Have to repeat the check for 'timedelta64' (not ns) dtype @@ -178,9 +202,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): """ return is_timedelta64_dtype(dtype) - # ------------------------------------------------------------------- - # Indexing Methods - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -193,7 +214,7 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): raise InvalidIndexError(key) try: - key = self._data._validate_scalar(key, unbox=False) + key = self._data._validate_scalar(key, cast_str=True) except TypeError as err: raise KeyError(key) from err @@ -222,18 +243,22 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): return lbound else: return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") - elif not isinstance(label, self._data._recognized_scalars): - raise self._invalid_indexer("slice", label) + elif is_integer(label) or is_float(label): + self._invalid_indexer("slice", label) return label - # ------------------------------------------------------------------- + def is_type_compatible(self, typ) -> bool: + return typ == self.inferred_type or typ == "timedelta" @property def inferred_type(self) -> str: return "timedelta64" +TimedeltaIndex._add_logical_methods_disabled() + + def timedelta_range( start=None, end=None, periods=None, freq=None, name=None, closed=None ) -> TimedeltaIndex: diff --git a/venv/lib/python3.8/site-packages/pandas/core/indexing.py b/venv/lib/python3.8/site-packages/pandas/core/indexing.py index e7cf8ca..c33cb39 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/indexing.py +++ b/venv/lib/python3.8/site-packages/pandas/core/indexing.py @@ -1,12 +1,10 @@ -from contextlib import suppress -from typing import TYPE_CHECKING, Any, Hashable, List, Sequence, Tuple, Union -import warnings +from typing import TYPE_CHECKING, Hashable, List, Tuple, Union import numpy as np from pandas._config.config import option_context -from pandas._libs.indexing import NDFrameIndexerBase +from pandas._libs.indexing import _NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc @@ -24,7 +22,7 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries -from pandas.core.dtypes.missing import infer_fill_value, isna +from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com from pandas.core.construction import array as pd_array @@ -36,7 +34,7 @@ from pandas.core.indexers import ( from pandas.core.indexes.api import Index if TYPE_CHECKING: - from pandas import DataFrame, Series + from pandas import DataFrame # noqa:F401 # "null slice" _NS = slice(None, None) @@ -61,7 +59,7 @@ class _IndexSlice: >>> midx = 
pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) >>> columns = ['foo', 'bar'] >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), - ... index=midx, columns=columns) + index=midx, columns=columns) Using the default slice command: @@ -257,20 +255,15 @@ class IndexingMixin: - A boolean array of the same length as the axis being sliced, e.g. ``[True, False, True]``. - - An alignable boolean Series. The index of the key will be aligned before - masking. - - An alignable Index. The Index of the returned selection will be the input. - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) - See more at :ref:`Selection by Label `. + See more at :ref:`Selection by Label ` Raises ------ KeyError If any items are not found. - IndexingError - If an indexed key is passed and its index is unalignable to the frame index. See Also -------- @@ -326,21 +319,6 @@ class IndexingMixin: max_speed shield sidewinder 7 8 - Alignable boolean Series: - - >>> df.loc[pd.Series([False, True, False], - ... index=['viper', 'sidewinder', 'cobra'])] - max_speed shield - sidewinder 7 8 - - Index (same behavior as ``df.reindex``) - - >>> df.loc[pd.Index(["cobra", "viper"], name="foo")] - max_speed shield - foo - cobra 1 2 - viper 4 5 - Conditional that returns a boolean Series >>> df.loc[df['shield'] > 6] @@ -594,7 +572,7 @@ class IndexingMixin: return _iAtIndexer("iat", self) -class _LocationIndexer(NDFrameIndexerBase): +class _LocationIndexer(_NDFrameIndexerBase): _valid_types: str axis = None @@ -620,13 +598,17 @@ class _LocationIndexer(NDFrameIndexerBase): ax = self.obj._get_axis(0) if isinstance(ax, ABCMultiIndex) and self.name != "iloc": - with suppress(TypeError, KeyError, InvalidIndexError): - # TypeError e.g. passed a bool + try: return ax.get_loc(key) + except (TypeError, KeyError, InvalidIndexError): + # TypeError e.g. passed a bool + pass if isinstance(key, tuple): - with suppress(IndexingError): + try: return self._convert_tuple(key, is_setter=True) + except IndexingError: + pass if isinstance(key, range): return list(key) @@ -642,7 +624,7 @@ class _LocationIndexer(NDFrameIndexerBase): raise raise IndexingError(key) from e - def _ensure_listlike_indexer(self, key, axis=None, value=None): + def _ensure_listlike_indexer(self, key, axis=None): """ Ensure that a list-like of column labels are all present by adding them if they do not already exist. @@ -672,12 +654,9 @@ class _LocationIndexer(NDFrameIndexerBase): and not com.is_bool_indexer(key) and all(is_hashable(k) for k in key) ): - # GH#38148 - keys = self.obj.columns.union(key, sort=False) - - self.obj._mgr = self.obj._mgr.reindex_axis( - keys, axis=0, copy=False, consolidate=False, only_slice=True - ) + for k in key: + if k not in self.obj: + self.obj[k] = np.nan def __setitem__(self, key, value): if isinstance(key, tuple): @@ -688,7 +667,7 @@ class _LocationIndexer(NDFrameIndexerBase): self._has_valid_setitem_indexer(key) iloc = self if self.name == "iloc" else self.obj.iloc - iloc._setitem_with_indexer(indexer, value, self.name) + iloc._setitem_with_indexer(indexer, value) def _validate_key(self, key, axis: int): """ @@ -716,8 +695,9 @@ class _LocationIndexer(NDFrameIndexerBase): """ Check the key for valid keys across my indexer. 
""" - self._validate_key_length(key) for i, k in enumerate(key): + if i >= self.ndim: + raise IndexingError("Too many indexers") try: self._validate_key(k, i) except ValueError as err: @@ -748,17 +728,13 @@ class _LocationIndexer(NDFrameIndexerBase): else: keyidx.append(slice(None)) else: - self._validate_key_length(key) for i, k in enumerate(key): + if i >= self.ndim: + raise IndexingError("Too many indexers") idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter) keyidx.append(idx) - return tuple(keyidx) - def _validate_key_length(self, key: Sequence[Any]) -> None: - if len(key) > self.ndim: - raise IndexingError("Too many indexers") - def _getitem_tuple_same_dim(self, tup: Tuple): """ Index with indexers that should return an object of the same dimension @@ -794,10 +770,14 @@ class _LocationIndexer(NDFrameIndexerBase): # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": - with suppress(IndexingError): - return self._handle_lowerdim_multi_index_axis0(tup) + try: + result = self._handle_lowerdim_multi_index_axis0(tup) + return result + except IndexingError: + pass - self._validate_key_length(tup) + if len(tup) > self.ndim: + raise IndexingError("Too many indexers. handle elsewhere") for i, key in enumerate(tup): if is_label_like(key): @@ -842,8 +822,11 @@ class _LocationIndexer(NDFrameIndexerBase): if self.name != "loc": # This should never be reached, but lets be explicit about it raise ValueError("Too many indices") - with suppress(IndexingError): - return self._handle_lowerdim_multi_index_axis0(tup) + try: + result = self._handle_lowerdim_multi_index_axis0(tup) + return result + except IndexingError: + pass # this is a series with a multi-index specified a tuple of # selectors @@ -882,9 +865,11 @@ class _LocationIndexer(NDFrameIndexerBase): if type(key) is tuple: key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - with suppress(KeyError, IndexError, AttributeError): - # AttributeError for IntervalTree get_value + try: return self.obj._get_value(*key, takeable=self._takeable) + except (KeyError, IndexError, AttributeError): + # AttributeError for IntervalTree get_value + pass return self._getitem_tuple(key) else: # we by definition only have the 0th axis @@ -1025,7 +1010,7 @@ class _LocIndexer(_LocationIndexer): def _getitem_iterable(self, key, axis: int): """ - Index current object with an iterable collection of keys. + Index current object with an an iterable collection of keys. 
Parameters ---------- @@ -1055,8 +1040,10 @@ class _LocIndexer(_LocationIndexer): ) def _getitem_tuple(self, tup: Tuple): - with suppress(IndexingError): + try: return self._getitem_lowerdim(tup) + except IndexingError: + pass # no multi-index, so validate all of the indexers self._has_valid_tuple(tup) @@ -1083,7 +1070,7 @@ class _LocIndexer(_LocationIndexer): except KeyError as ek: # raise KeyError if number of indexers match # else IndexingError will be raised - if self.ndim < len(tup) <= self.obj.index.nlevels: + if len(tup) <= self.obj.index.nlevels and len(tup) > self.ndim: raise ek raise IndexingError("No label returned") @@ -1253,10 +1240,12 @@ class _LocIndexer(_LocationIndexer): indexer, keyarr = ax._convert_listlike_indexer(key) # We only act on all found values: if indexer is not None and (indexer != -1).all(): - # _validate_read_indexer is a no-op if no -1s, so skip + self._validate_read_indexer( + keyarr, indexer, axis, raise_missing=raise_missing + ) return ax[indexer], indexer - if ax._index_as_unique: + if ax.is_unique and not getattr(ax, "is_overlapping", False): indexer = ax.get_indexer_for(keyarr) keyarr = ax.reindex(keyarr)[0] else: @@ -1294,6 +1283,8 @@ class _LocIndexer(_LocationIndexer): If at least one key was requested but none was found, and raise_missing=True. """ + ax = self.obj._get_axis(axis) + if len(key) == 0: return @@ -1306,23 +1297,27 @@ class _LocIndexer(_LocationIndexer): axis_name = self.obj._get_axis_name(axis) raise KeyError(f"None of [{key}] are in the [{axis_name}]") - ax = self.obj._get_axis(axis) - # We (temporarily) allow for some missing keys with .loc, except in # some cases (e.g. setting) in which "raise_missing" will be False if raise_missing: not_found = list(set(key) - set(ax)) raise KeyError(f"{not_found} not in index") - not_found = key[missing_mask] + # we skip the warning on Categorical + # as this check is actually done (check for + # non-missing values), but a bit later in the + # code, so we want to avoid warning & then + # just raising + if not ax.is_categorical(): + not_found = key[missing_mask] - with option_context("display.max_seq_items", 10, "display.width", 80): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported. " - f"The following labels were missing: {not_found}. " - "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. " + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) @@ -1384,22 +1379,21 @@ class _iLocIndexer(_LocationIndexer): """ if isinstance(indexer, dict): raise IndexError("iloc cannot enlarge its target object") - - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) - - for ax, i in zip(self.obj.axes, indexer): - if isinstance(i, slice): - # should check the stop slice? - pass - elif is_list_like_indexer(i): - # should check the elements? - pass - elif is_integer(i): - if i >= len(ax): + else: + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) + for ax, i in zip(self.obj.axes, indexer): + if isinstance(i, slice): + # should check the stop slice? 
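The validation above is what keeps positional setting strict: .iloc refuses to create rows or columns, while .loc may enlarge the object. A short illustration via the public API (behaviour as implied by the check above; the error message is the one raised in this function):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

df.loc[3, "a"] = 4        # .loc setting with enlargement: adds a row labelled 3
try:
    df.iloc[10, 0] = 5    # out-of-bounds positional setter
except IndexError as err:
    print(err)            # iloc cannot enlarge its target object
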
+ pass + elif is_list_like_indexer(i): + # should check the elements? + pass + elif is_integer(i): + if i >= len(ax): + raise IndexError("iloc cannot enlarge its target object") + elif isinstance(i, dict): raise IndexError("iloc cannot enlarge its target object") - elif isinstance(i, dict): - raise IndexError("iloc cannot enlarge its target object") return True @@ -1447,8 +1441,10 @@ class _iLocIndexer(_LocationIndexer): def _getitem_tuple(self, tup: Tuple): self._has_valid_tuple(tup) - with suppress(IndexingError): + try: return self._getitem_lowerdim(tup) + except IndexingError: + pass return self._getitem_tuple_same_dim(tup) @@ -1524,7 +1520,7 @@ class _iLocIndexer(_LocationIndexer): # ------------------------------------------------------------------- - def _setitem_with_indexer(self, indexer, value, name="iloc"): + def _setitem_with_indexer(self, indexer, value): """ _setitem_with_indexer is for setting values on a Series/DataFrame using positional indexers. @@ -1536,18 +1532,21 @@ class _iLocIndexer(_LocationIndexer): since it goes from positional indexers back to labels when calling BlockManager methods, see GH#12991, GH#22046, GH#15686. """ + + # also has the side effect of consolidating in-place + from pandas import Series + info_axis = self.obj._info_axis_number # maybe partial set - take_split_path = not self.obj._mgr.is_single_block + take_split_path = self.obj._is_mixed_type # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value if not take_split_path and self.obj._mgr.blocks: - if self.ndim > 1: - # in case of dict, keys are indices + (blk,) = self.obj._mgr.blocks + if 1 < blk.ndim: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - blk = self.obj._mgr.blocks[0] take_split_path = not blk._can_hold_element(val) # if we have any multi-indexes that have non-trivial slices @@ -1581,7 +1580,10 @@ class _iLocIndexer(_LocationIndexer): # must have all defined axes if we have a scalar # or a list-like on the non-info axes if we have a # list-like - if not len(self.obj): + len_non_info_axes = ( + len(_ax) for _i, _ax in enumerate(self.obj.axes) if _i != i + ) + if any(not l for l in len_non_info_axes): if not is_list_like_indexer(value): raise ValueError( "cannot set a frame with no " @@ -1591,16 +1593,12 @@ class _iLocIndexer(_LocationIndexer): return # add a new item with the dtype setup - if com.is_null_slice(indexer[0]): - # We are setting an entire column - self.obj[key] = value - else: - self.obj[key] = infer_fill_value(value) + self.obj[key] = _infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes ) - self._setitem_with_indexer(new_indexer, value, name) + self._setitem_with_indexer(new_indexer, value) return @@ -1628,237 +1626,179 @@ class _iLocIndexer(_LocationIndexer): self._setitem_with_indexer_missing(indexer, value) return - # align and set the values - if take_split_path: - # We have to operate column-wise - self._setitem_with_indexer_split_path(indexer, value, name) - else: - self._setitem_single_block(indexer, value, name) - - def _setitem_with_indexer_split_path(self, indexer, value, name: str): - """ - Setitem column-wise. 
- """ - # Above we only set take_split_path to True for 2D cases - assert self.ndim == 2 - - if not isinstance(indexer, tuple): - indexer = _tuplify(self.ndim, indexer) - if len(indexer) > self.ndim: - raise IndexError("too many indices for array") - if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2: - raise ValueError(r"Cannot set values with ndim > 2") - - if isinstance(value, ABCSeries) and name != "iloc": - value = self._align_series(indexer, value) - - # Ensure we have something we can iterate over - info_axis = indexer[1] - ilocs = self._ensure_iterable_column_indexer(info_axis) - - pi = indexer[0] - lplane_indexer = length_of_indexer(pi, self.obj.index) - # lplane_indexer gives the expected length of obj[indexer[0]] - - # we need an iterable, with a ndim of at least 1 - # eg. don't pass through np.array(0) - if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: - - if isinstance(value, ABCDataFrame): - self._setitem_with_indexer_frame_value(indexer, value, name) - - elif np.ndim(value) == 2: - self._setitem_with_indexer_2d_value(indexer, value) - - elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): - # We are setting multiple rows in a single column. - self._setitem_single_column(ilocs[0], value, pi) - - elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): - # We are trying to set N values into M entries of a single - # column, which is invalid for N != M - # Exclude zero-len for e.g. boolean masking that is all-false - - if len(value) == 1 and not is_integer(info_axis): - # This is a case like df.iloc[:3, [1]] = [0] - # where we treat as df.iloc[:3, 1] = 0 - return self._setitem_with_indexer((pi, info_axis[0]), value[0]) - - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) - - elif lplane_indexer == 0 and len(value) == len(self.obj.index): - # We get here in one case via .loc with a all-False mask - pass - - elif len(ilocs) == len(value): - # We are setting multiple columns in a single row. - for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) - - elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: - # This is a setitem-with-expansion, see - # test_loc_setitem_empty_append_expands_rows_mixed_dtype - # e.g. 
df = DataFrame(columns=["x", "y"]) - # df["x"] = df["x"].astype(np.int64) - # df.loc[:, "x"] = [1, 2, 3] - self._setitem_single_column(ilocs[0], value, pi) - - else: - raise ValueError( - "Must have equal len keys and value " - "when setting with an iterable" - ) - - else: - - # scalar value - for loc in ilocs: - self._setitem_single_column(loc, value, pi) - - def _setitem_with_indexer_2d_value(self, indexer, value): - # We get here with np.ndim(value) == 2, excluding DataFrame, - # which goes through _setitem_with_indexer_frame_value - pi = indexer[0] - - ilocs = self._ensure_iterable_column_indexer(indexer[1]) - - # GH#7551 Note that this coerces the dtype if we are mixed - value = np.array(value, dtype=object) - if len(ilocs) != value.shape[1]: - raise ValueError( - "Must have equal len keys and value when setting with an ndarray" - ) - - for i, loc in enumerate(ilocs): - # setting with a list, re-coerces - self._setitem_single_column(loc, value[:, i].tolist(), pi) - - def _setitem_with_indexer_frame_value(self, indexer, value: "DataFrame", name: str): - ilocs = self._ensure_iterable_column_indexer(indexer[1]) - - sub_indexer = list(indexer) - pi = indexer[0] - - multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) - - unique_cols = value.columns.is_unique - - # We do not want to align the value in case of iloc GH#37728 - if name == "iloc": - for i, loc in enumerate(ilocs): - val = value.iloc[:, i] - self._setitem_single_column(loc, val, pi) - - elif not unique_cols and value.columns.equals(self.obj.columns): - # We assume we are already aligned, see - # test_iloc_setitem_frame_duplicate_columns_multiple_blocks - for loc in ilocs: - item = self.obj.columns[loc] - if item in value: - sub_indexer[1] = item - val = self._align_series( - tuple(sub_indexer), - value.iloc[:, loc], - multiindex_indexer, - ) - else: - val = np.nan - - self._setitem_single_column(loc, val, pi) - - elif not unique_cols: - raise ValueError("Setting with non-unique columns is not allowed.") - - else: - for loc in ilocs: - item = self.obj.columns[loc] - if item in value: - sub_indexer[1] = item - val = self._align_series( - tuple(sub_indexer), value[item], multiindex_indexer - ) - else: - val = np.nan - - self._setitem_single_column(loc, val, pi) - - def _setitem_single_column(self, loc: int, value, plane_indexer): - """ - - Parameters - ---------- - loc : int - Indexer for column position - plane_indexer : int, slice, listlike[int] - The indexer we use for setitem along axis=0. - """ - pi = plane_indexer - - ser = self.obj._ixs(loc, axis=1) - - # perform the equivalent of a setitem on the info axis - # as we have a null slice or a slice with full bounds - # which means essentially reassign to the columns of a - # multi-dim object - # GH#6149 (null slice), GH#10408 (full bounds) - if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): - ser = value - else: - # set the item, possibly having a dtype change - ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value) - ser._maybe_update_cacher(clear=True) - - # reset the sliced object if unique - self.obj._iset_item(loc, ser) - - def _setitem_single_block(self, indexer, value, name: str): - """ - _setitem_with_indexer for the case when we have a single Block. 
- """ - from pandas import Series - - info_axis = self.obj._info_axis_number + # set item_labels = self.obj._get_axis(info_axis) - if isinstance(indexer, tuple): + # align and set the values + if take_split_path: + # Above we only set take_split_path to True for 2D cases + assert self.ndim == 2 + assert info_axis == 1 - # if we are setting on the info axis ONLY - # set using those methods to avoid block-splitting - # logic here - if ( - len(indexer) > info_axis - and is_integer(indexer[info_axis]) - and all( - com.is_null_slice(idx) - for i, idx in enumerate(indexer) - if i != info_axis - ) - and item_labels.is_unique - ): - self.obj[item_labels[indexer[info_axis]]] = value - return + if not isinstance(indexer, tuple): + indexer = _tuplify(self.ndim, indexer) - indexer = maybe_convert_ix(*indexer) - if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict): - # TODO(EA): ExtensionBlock.setitem this causes issues with - # setting for extensionarrays that store dicts. Need to decide - # if it's worth supporting that. - value = self._align_series(indexer, Series(value)) + if isinstance(value, ABCSeries): + value = self._align_series(indexer, value) - elif isinstance(value, ABCDataFrame) and name != "iloc": - value = self._align_frame(indexer, value) + info_idx = indexer[info_axis] + if is_integer(info_idx): + info_idx = [info_idx] + labels = item_labels[info_idx] - # check for chained assignment - self.obj._check_is_chained_assignment_possible() + # Ensure we have something we can iterate over + ilocs = info_idx + if isinstance(info_idx, slice): + ri = Index(range(len(self.obj.columns))) + ilocs = ri[info_idx] - # actually do the set - self.obj._consolidate_inplace() - self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) - self.obj._maybe_update_cacher(clear=True) + plane_indexer = indexer[:1] + lplane_indexer = length_of_indexer(plane_indexer[0], self.obj.index) + # lplane_indexer gives the expected length of obj[indexer[0]] + + if len(labels) == 1: + # We can operate on a single column + + # require that we are setting the right number of values that + # we are indexing + if is_list_like_indexer(value) and 0 != lplane_indexer != len(value): + # Exclude zero-len for e.g. boolean masking that is all-false + raise ValueError( + "cannot set using a multi-index " + "selection indexer with a different " + "length than the value" + ) + + def isetter(loc, v): + # positional setting on column loc + ser = self.obj._ixs(loc, axis=1) + + # perform the equivalent of a setitem on the info axis + # as we have a null slice or a slice with full bounds + # which means essentially reassign to the columns of a + # multi-dim object + # GH6149 (null slice), GH10408 (full bounds) + if isinstance(plane_indexer, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in plane_indexer + ): + ser = v + else: + # set the item, possibly having a dtype change + ser = ser.copy() + ser._mgr = ser._mgr.setitem(indexer=plane_indexer, value=v) + ser._maybe_update_cacher(clear=True) + + # reset the sliced object if unique + self.obj._iset_item(loc, ser) + + # we need an iterable, with a ndim of at least 1 + # eg. 
don't pass through np.array(0) + if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: + + # we have an equal len Frame + if isinstance(value, ABCDataFrame): + sub_indexer = list(indexer) + multiindex_indexer = isinstance(labels, ABCMultiIndex) + # TODO: we are implicitly assuming value.columns is unique + + for loc in ilocs: + item = item_labels[loc] + if item in value: + sub_indexer[info_axis] = item + v = self._align_series( + tuple(sub_indexer), value[item], multiindex_indexer + ) + else: + v = np.nan + + isetter(loc, v) + + # we have an equal len ndarray/convertible to our labels + # hasattr first, to avoid coercing to ndarray without reason. + # But we may be relying on the ndarray coercion to check ndim. + # Why not just convert to an ndarray earlier on if needed? + elif np.ndim(value) == 2: + + # note that this coerces the dtype if we are mixed + # GH 7551 + value = np.array(value, dtype=object) + if len(ilocs) != value.shape[1]: + raise ValueError( + "Must have equal len keys and value " + "when setting with an ndarray" + ) + + for i, loc in enumerate(ilocs): + # setting with a list, re-coerces + isetter(loc, value[:, i].tolist()) + + elif ( + len(labels) == 1 + and lplane_indexer == len(value) + and not is_scalar(plane_indexer[0]) + ): + # we have an equal len list/ndarray + # We only get here with len(labels) == len(ilocs) == 1 + isetter(ilocs[0], value) + + elif lplane_indexer == 0 and len(value) == len(self.obj.index): + # We get here in one case via .loc with a all-False mask + pass + + else: + # per-label values + if len(ilocs) != len(value): + raise ValueError( + "Must have equal len keys and value " + "when setting with an iterable" + ) + + for loc, v in zip(ilocs, value): + isetter(loc, v) + else: + + # scalar value + for loc in ilocs: + isetter(loc, value) + + else: + if isinstance(indexer, tuple): + + # if we are setting on the info axis ONLY + # set using those methods to avoid block-splitting + # logic here + if ( + len(indexer) > info_axis + and is_integer(indexer[info_axis]) + and all( + com.is_null_slice(idx) + for i, idx in enumerate(indexer) + if i != info_axis + ) + and item_labels.is_unique + ): + self.obj[item_labels[indexer[info_axis]]] = value + return + + indexer = maybe_convert_ix(*indexer) + + if isinstance(value, (ABCSeries, dict)): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. 
+ value = self._align_series(indexer, Series(value)) + + elif isinstance(value, ABCDataFrame): + value = self._align_frame(indexer, value) + + # check for chained assignment + self.obj._check_is_chained_assignment_possible() + + # actually do the set + self.obj._consolidate_inplace() + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True) def _setitem_with_indexer_missing(self, indexer, value): """ @@ -1881,8 +1821,7 @@ class _iLocIndexer(_LocationIndexer): if index.is_unique: new_indexer = index.get_indexer([new_index[-1]]) if (new_indexer != -1).any(): - # We get only here with loc, so can hard code - return self._setitem_with_indexer(new_indexer, value, "loc") + return self._setitem_with_indexer(new_indexer, value) # this preserves dtype of the value new_values = Series([value])._values @@ -1921,21 +1860,7 @@ class _iLocIndexer(_LocationIndexer): self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) - def _ensure_iterable_column_indexer(self, column_indexer): - """ - Ensure that our column indexer is something that can be iterated over. - """ - # Ensure we have something we can iterate over - if is_integer(column_indexer): - ilocs = [column_indexer] - elif isinstance(column_indexer, slice): - ri = Index(range(len(self.obj.columns))) - ilocs = ri[column_indexer] - else: - ilocs = column_indexer - return ilocs - - def _align_series(self, indexer, ser: "Series", multiindex_indexer: bool = False): + def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = False): """ Parameters ---------- @@ -1953,7 +1878,7 @@ class _iLocIndexer(_LocationIndexer): to the locations selected by `indexer` """ if isinstance(indexer, (slice, np.ndarray, list, Index)): - indexer = (indexer,) + indexer = tuple([indexer]) if isinstance(indexer, tuple): @@ -2026,7 +1951,7 @@ class _iLocIndexer(_LocationIndexer): raise ValueError("Incompatible indexer with Series") - def _align_frame(self, indexer, df: "DataFrame"): + def _align_frame(self, indexer, df: ABCDataFrame): is_frame = self.ndim == 2 if isinstance(indexer, tuple): @@ -2079,7 +2004,7 @@ class _iLocIndexer(_LocationIndexer): raise ValueError("Incompatible indexer with DataFrame") -class _ScalarAccessIndexer(NDFrameIndexerBase): +class _ScalarAccessIndexer(_NDFrameIndexerBase): """ Access scalars quickly. """ @@ -2092,7 +2017,7 @@ class _ScalarAccessIndexer(NDFrameIndexerBase): # we could have a convertible item here (e.g. Timestamp) if not is_list_like_indexer(key): - key = (key,) + key = tuple([key]) else: raise ValueError("Invalid call for scalar access (getting)!") @@ -2216,16 +2141,7 @@ def convert_to_index_sliceable(obj: "DataFrame", key): # slice here via partial string indexing if idx._supports_partial_string_indexing: try: - res = idx._get_string_slice(key) - warnings.warn( - "Indexing a DataFrame with a datetimelike index using a single " - "string to slice the rows, like `frame[string]`, is deprecated " - "and will be removed in a future version. Use `frame.loc[string]` " - "instead.", - FutureWarning, - stacklevel=3, - ) - return res + return idx._get_string_slice(key) except (KeyError, ValueError, NotImplementedError): return None @@ -2311,10 +2227,15 @@ def maybe_convert_ix(*args): """ We likely want to take the cross-product. 
""" + ixify = True for arg in args: if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)): - return args - return np.ix_(*args) + ixify = False + + if ixify: + return np.ix_(*args) + else: + return args def is_nested_tuple(tup, labels) -> bool: @@ -2357,7 +2278,7 @@ def need_slice(obj) -> bool: ) -def non_reducing_slice(slice_): +def _non_reducing_slice(slice_): """ Ensure that a slice doesn't reduce to a Series or Scalar. @@ -2396,7 +2317,7 @@ def non_reducing_slice(slice_): return tuple(slice_) -def maybe_numeric_slice(df, slice_, include_bool: bool = False): +def _maybe_numeric_slice(df, slice_, include_bool=False): """ Want nice defaults for background_gradient that don't break with non-numeric data. But if slice_ is passed go with that. diff --git a/venv/lib/python3.8/site-packages/pandas/core/internals/__init__.py b/venv/lib/python3.8/site-packages/pandas/core/internals/__init__.py index fbccac1..e12e0d7 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/internals/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/core/internals/__init__.py @@ -10,8 +10,8 @@ from pandas.core.internals.blocks import ( # io.pytables, io.packers IntBlock, ObjectBlock, TimeDeltaBlock, + _safe_reshape, make_block, - safe_reshape, ) from pandas.core.internals.concat import concatenate_block_managers from pandas.core.internals.managers import ( @@ -33,7 +33,7 @@ __all__ = [ "IntBlock", "ObjectBlock", "TimeDeltaBlock", - "safe_reshape", + "_safe_reshape", "make_block", "BlockManager", "SingleBlockManager", diff --git a/venv/lib/python3.8/site-packages/pandas/core/internals/blocks.py b/venv/lib/python3.8/site-packages/pandas/core/internals/blocks.py index fe07823..9446529 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/internals/blocks.py +++ b/venv/lib/python3.8/site-packages/pandas/core/internals/blocks.py @@ -1,16 +1,17 @@ from datetime import datetime, timedelta import inspect import re -from typing import TYPE_CHECKING, Any, List, Optional, Type, Union, cast +from typing import TYPE_CHECKING, Any, List, Optional import warnings import numpy as np -from pandas._libs import NaT, algos as libalgos, internals as libinternals, lib, writers +from pandas._libs import NaT, algos as libalgos, lib, writers +import pandas._libs.internals as libinternals from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike, Scalar, Shape +from pandas._typing import ArrayLike from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -19,7 +20,6 @@ from pandas.core.dtypes.cast import ( find_common_type, infer_dtype_from, infer_dtype_from_scalar, - maybe_box_datetimelike, maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, @@ -32,7 +32,6 @@ from pandas.core.dtypes.common import ( TD64NS_DTYPE, is_bool_dtype, is_categorical_dtype, - is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, @@ -58,10 +57,9 @@ from pandas.core.dtypes.generic import ( ABCPandasArray, ABCSeries, ) -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat +from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna import pandas.core.algorithms as algos -from pandas.core.array_algos.replace import compare_or_regex_search, replace_regex from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -94,8 +92,6 @@ class 
Block(PandasObject): Index-ignorant; let the container take care of that """ - values: Union[np.ndarray, ExtensionArray] - __slots__ = ["_mgr_locs", "values", "ndim"] is_numeric = False is_float = False @@ -106,6 +102,7 @@ class Block(PandasObject): is_timedelta = False is_bool = False is_object = False + is_categorical = False is_extension = False _can_hold_na = False _can_consolidate = True @@ -124,19 +121,10 @@ class Block(PandasObject): obj._mgr_locs = placement return obj - def __init__(self, values, placement, ndim: int): - """ - Parameters - ---------- - values : np.ndarray or ExtensionArray - placement : BlockPlacement (or castable) - ndim : int - 1 for SingleBlockManager/Series, 2 for BlockManager/DataFrame - """ - # TODO(EA2D): ndim will be unnecessary with 2D EAs + def __init__(self, values, placement, ndim=None): self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement - self.values = self._maybe_coerce_values(values) + self.values = values if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): raise ValueError( @@ -144,20 +132,6 @@ class Block(PandasObject): f"placement implies {len(self.mgr_locs)}" ) - def _maybe_coerce_values(self, values): - """ - Ensure we have correctly-typed values. - - Parameters - ---------- - values : np.ndarray, ExtensionArray, Index - - Returns - ------- - np.ndarray or ExtensionArray - """ - return values - def _check_ndim(self, values, ndim): """ ndim inference and validation. @@ -201,18 +175,12 @@ class Block(PandasObject): @property def _consolidate_key(self): - return self._can_consolidate, self.dtype.name + return (self._can_consolidate, self.dtype.name) @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ - values = self.values - values = cast(np.ndarray, values) - return values.base is not None - - @property - def is_categorical(self) -> bool: - return self._holder is Categorical + return self.values.base is not None @property def is_datelike(self) -> bool: @@ -351,7 +319,7 @@ class Block(PandasObject): def iget(self, i): return self.values[i] - def set_inplace(self, locs, values): + def set(self, locs, values): """ Modify block values in-place with new item value. 
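The is_view property above relies on NumPy's .base attribute: an array that shares memory with another keeps a reference to it, while an owning array reports None. A quick standalone check:

import numpy as np

owner = np.arange(6)
view = owner[:3]            # slicing returns a view, not a copy
copied = owner[:3].copy()   # an independent copy

print(owner.base is None)   # True  -> owns its memory
print(view.base is owner)   # True  -> a view over `owner`
print(copied.base is None)  # True  -> owns its memory again
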
@@ -379,27 +347,6 @@ class Block(PandasObject): return self._split_op_result(result) - def reduce(self, func, ignore_failures: bool = False) -> List["Block"]: - # We will apply the function and reshape the result into a single-row - # Block with the same mgr_locs; squeezing will be done at a higher level - assert self.ndim == 2 - - try: - result = func(self.values) - except (TypeError, NotImplementedError): - if ignore_failures: - return [] - raise - - if np.ndim(result) == 0: - # TODO(EA2D): special case not needed with 2D EAs - res_values = np.array([[result]]) - else: - res_values = result.reshape(-1, 1) - - nb = self.make_block(res_values) - return [nb] - def _split_op_result(self, result) -> List["Block"]: # See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: @@ -427,9 +374,8 @@ class Block(PandasObject): inplace = validate_bool_kwarg(inplace, "inplace") mask = isna(self.values) - mask = _extract_bool_array(mask) if limit is not None: - limit = libalgos.validate_limit(None, limit=limit) + limit = libalgos._validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: @@ -439,10 +385,9 @@ class Block(PandasObject): return [self.copy()] if self._can_hold_element(value): - nb = self if inplace else self.copy() - nb._putmask_simple(mask, value) - # TODO: should be nb._maybe_downcast? - return self._maybe_downcast([nb], downcast) + # equivalent: _try_coerce_args(value) would not raise + blocks = self.putmask(mask, value, inplace=inplace) + return self._maybe_downcast(blocks, downcast) # we can't process the value, but nothing to do if not mask.any(): @@ -460,23 +405,7 @@ class Block(PandasObject): return self.split_and_operate(None, f, inplace) - def _split(self) -> List["Block"]: - """ - Split a block into a list of single-column blocks. - """ - assert self.ndim == 2 - - new_blocks = [] - for i, ref_loc in enumerate(self.mgr_locs): - vals = self.values[slice(i, i + 1)] - - nb = self.make_block(vals, [ref_loc]) - new_blocks.append(nb) - return new_blocks - - def split_and_operate( - self, mask, f, inplace: bool, ignore_failures: bool = False - ) -> List["Block"]: + def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]: """ split the block per-column, and apply the callable f per-column, return a new block for each. 
Handle @@ -486,8 +415,7 @@ class Block(PandasObject): ---------- mask : 2-d boolean mask f : callable accepting (1d-mask, 1d values, indexer) - inplace : bool - ignore_failures : bool, default False + inplace : boolean Returns ------- @@ -526,16 +454,8 @@ class Block(PandasObject): v = new_values[i] # need a new block - if m.any() or m.size == 0: - # Apply our function; we may ignore_failures if this is a - # reduction that is dropping nuisance columns GH#37827 - try: - nv = f(m, v, i) - except TypeError: - if ignore_failures: - continue - else: - raise + if m.any(): + nv = f(m, v, i) else: nv = v if inplace else v.copy() @@ -548,16 +468,18 @@ class Block(PandasObject): # no need to downcast our float # unless indicated - if downcast is None and (self.is_float or self.is_datelike): + if downcast is None and ( + self.is_float or self.is_timedelta or self.is_datetime + ): return blocks - return extend_blocks([b.downcast(downcast) for b in blocks]) + return _extend_blocks([b.downcast(downcast) for b in blocks]) - def downcast(self, dtypes=None) -> List["Block"]: + def downcast(self, dtypes=None): """ try to downcast each item to the dict of dtypes if present """ # turn it off completely if dtypes is False: - return [self] + return self values = self.values @@ -568,11 +490,11 @@ class Block(PandasObject): dtypes = "infer" nv = maybe_downcast_to_dtype(values, dtypes) - return [self.make_block(nv)] + return self.make_block(nv) # ndim > 1 if dtypes is None: - return [self] + return self if not (dtypes == "infer" or isinstance(dtypes, dict)): raise ValueError( @@ -656,7 +578,7 @@ class Block(PandasObject): # use native type formatting for datetime/tz/timedelta if self.is_datelike: - values = self.to_native_types().values + values = self.to_native_types() # astype formatting else: @@ -683,7 +605,7 @@ class Block(PandasObject): if isinstance(values, np.ndarray): values = values.reshape(self.shape) - newb = self.make_block(values) + newb = make_block(values, placement=self.mgr_locs, ndim=self.ndim) if newb.is_numeric and self.is_numeric: if newb.shape != self.shape: @@ -700,13 +622,14 @@ class Block(PandasObject): datetime: bool = True, numeric: bool = True, timedelta: bool = True, - ) -> List["Block"]: + coerce: bool = False, + ): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we are not an ObjectBlock here! """ - return [self.copy()] if copy else [self] + return self.copy() if copy else self def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ @@ -746,7 +669,7 @@ class Block(PandasObject): values = np.array(values, dtype="object") values[mask] = na_rep - return self.make_block(values) + return values # block actions # def copy(self, deep: bool = True): @@ -762,7 +685,8 @@ class Block(PandasObject): value, inplace: bool = False, regex: bool = False, - ) -> List["Block"]: + convert: bool = True, + ): """ replace the to_replace value with value, possible to create new blocks here this is just a call to putmask. regex is not used here. @@ -771,12 +695,43 @@ class Block(PandasObject): inplace = validate_bool_kwarg(inplace, "inplace") original_to_replace = to_replace + # If we cannot replace with own dtype, convert to ObjectBlock and + # retry if not self._can_hold_element(to_replace): - # We cannot hold `to_replace`, so we know immediately that - # replacing it is a no-op. - # Note: If to_replace were a list, NDFrame.replace would call - # replace_list instead of replace. 
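The _can_hold_element pre-check referenced above turns many replace calls into cheap no-ops: if a block's dtype can never contain the target value, there is nothing to scan or mask. Seen through the public API (a sketch, not the internal code path):

import pandas as pd

s = pd.Series([1, 2, 3], dtype="int64")

# "a" can never occur in an int64 block, so the data comes back unchanged.
print(s.replace("a", 99).tolist())   # [1, 2, 3]

# 2 can be held by the block, so the usual mask-and-putmask path runs.
print(s.replace(2, 99).tolist())     # [1, 99, 3]
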
- return [self] if inplace else [self.copy()] + if not isinstance(to_replace, list): + if inplace: + return [self] + return [self.copy()] + + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace( + to_replace[0], value, inplace=inplace, regex=regex, convert=convert, + ) + + # GH 22083, TypeError or ValueError occurred within error handling + # causes infinite loop. Cast and retry only if not objectblock. + if is_object_dtype(self): + raise AssertionError + + # try again with a compatible block + block = self.astype(object) + return block.replace( + to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex, + convert=convert, + ) values = self.values if lib.is_scalar(to_replace) and isinstance(values, np.ndarray): @@ -786,132 +741,43 @@ class Block(PandasObject): to_replace = convert_scalar_for_putitemlike(to_replace, values.dtype) mask = missing.mask_missing(values, to_replace) - if not mask.any(): - # Note: we get here with test_replace_extension_other incorrectly - # bc _can_hold_element is incorrect. - return [self] if inplace else [self.copy()] - if not self._can_hold_element(value): - blk = self.astype(object) - return blk.replace( + try: + blocks = self.putmask(mask, value, inplace=inplace) + # Note: it is _not_ the case that self._can_hold_element(value) + # is always true at this point. In particular, that can fail + # for: + # "2u" with bool-dtype, float-dtype + # 0.5 with int64-dtype + # np.nan with int64-dtype + except (TypeError, ValueError): + # GH 22083, TypeError or ValueError occurred within error handling + # causes infinite loop. Cast and retry only if not objectblock. + if is_object_dtype(self): + raise + + if not self.is_extension: + # TODO: https://github.com/pandas-dev/pandas/issues/32586 + # Need an ExtensionArray._can_hold_element to indicate whether + # a scalar value can be placed in the array. + assert not self._can_hold_element(value), value + + # try again with a compatible block + block = self.astype(object) + return block.replace( to_replace=original_to_replace, value=value, - inplace=True, + inplace=inplace, regex=regex, + convert=convert, ) - - blk = self if inplace else self.copy() - blk._putmask_simple(mask, value) - blocks = blk.convert(numeric=False, copy=not inplace) + if convert: + blocks = [b.convert(numeric=False, copy=not inplace) for b in blocks] return blocks - def _replace_regex( - self, - to_replace, - value, - inplace: bool = False, - convert: bool = True, - mask=None, - ) -> List["Block"]: - """ - Replace elements by the given value. - - Parameters - ---------- - to_replace : object or pattern - Scalar to replace or regular expression to match. - value : object - Replacement object. - inplace : bool, default False - Perform inplace modification. - convert : bool, default True - If true, try to coerce any object types to better types. - mask : array-like of bool, optional - True indicate corresponding element is ignored. - - Returns - ------- - List[Block] - """ - if not self._can_hold_element(to_replace): - # i.e. 
only ObjectBlock, but could in principle include a - # String ExtensionBlock - return [self] if inplace else [self.copy()] - - rx = re.compile(to_replace) - - new_values = self.values if inplace else self.values.copy() - replace_regex(new_values, rx, value, mask) - - block = self.make_block(new_values) - if convert: - nbs = block.convert(numeric=False) - else: - nbs = [block] - return nbs - - def _replace_list( - self, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> List["Block"]: - """ - See BlockManager._replace_list docstring. - """ - # Exclude anything that we know we won't contain - pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) - ] - if not len(pairs): - # shortcut, nothing to replace - return [self] if inplace else [self.copy()] - - src_len = len(pairs) - 1 - - def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return ~mask - - s = maybe_box_datetimelike(s) - return compare_or_regex_search(self.values, s, regex, mask) - - if self.is_object: - # Calculate the mask once, prior to the call of comp - # in order to avoid repeating the same computations - mask = ~isna(self.values) - masks = [comp(s[0], mask, regex) for s in pairs] - else: - # GH#38086 faster if we know we dont need to check for regex - masks = [missing.mask_missing(self.values, s[0]) for s in pairs] - - masks = [_extract_bool_array(x) for x in masks] - - rb = [self if inplace else self.copy()] - for i, (src, dest) in enumerate(pairs): - new_rb: List["Block"] = [] - for blk in rb: - m = masks[i] - convert = i == src_len # only convert once at the end - result = blk._replace_coerce( - to_replace=src, - value=dest, - mask=m, - inplace=inplace, - regex=regex, - ) - if convert and blk.is_object: - result = extend_blocks( - [b.convert(numeric=False, copy=True) for b in result] - ) - new_rb.extend(result) - rb = new_rb - return rb + def _replace_single(self, *args, **kwargs): + """ no-op on a non-ObjectBlock """ + return self if kwargs["inplace"] else self.copy() def setitem(self, indexer, value): """ @@ -1029,37 +895,8 @@ class Block(PandasObject): block = self.make_block(values) return block - def _putmask_simple(self, mask: np.ndarray, value: Any): - """ - Like putmask but - - a) we do not cast on failure - b) we do not handle repeating or truncating like numpy. - - Parameters - ---------- - mask : np.ndarray[bool] - We assume _extract_bool_array has already been called. 
- value : Any - We assume self._can_hold_element(value) - """ - values = self.values - - if lib.is_scalar(value) and isinstance(values, np.ndarray): - value = convert_scalar_for_putitemlike(value, values.dtype) - - if self.is_extension or (self.is_object and not lib.is_scalar(value)): - # GH#19266 using np.putmask gives unexpected results with listlike value - if is_list_like(value) and len(value) == len(values): - values[mask] = value[mask] - else: - values[mask] = value - else: - # GH#37833 np.putmask is more performant than __setitem__ - np.putmask(values, mask, value) - def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, ) -> List["Block"]: """ putmask the data to the block; it is possible that we may create a @@ -1209,15 +1046,39 @@ class Block(PandasObject): # don't coerce float/complex to int return self - elif self.is_datetime or is_datetime64_any_dtype(dtype): - # The is_dtype_equal check above ensures that at most one of - # these two conditions hold, so we must cast to object. - return self.astype(object) + elif ( + self.is_datetime + or is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + ): + + # not a datetime + if not ( + (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)) + and self.is_datetime + ): + return self.astype(object) + + # don't upcast timezone with different timezone or no timezone + mytz = getattr(self.dtype, "tz", None) + othertz = getattr(dtype, "tz", None) + + if not tz_compare(mytz, othertz): + return self.astype(object) + + raise AssertionError( + f"possible recursion in coerce_to_target_dtype: {self} {other}" + ) elif self.is_timedelta or is_timedelta64_dtype(dtype): - # The is_dtype_equal check above ensures that at most one of - # these two conditions hold, so we must cast to object. - return self.astype(object) + + # not a timedelta + if not (is_timedelta64_dtype(dtype) and self.is_timedelta): + return self.astype(object) + + raise AssertionError( + f"possible recursion in coerce_to_target_dtype: {self} {other}" + ) try: return self.astype(dtype) @@ -1241,8 +1102,8 @@ class Block(PandasObject): inplace = validate_bool_kwarg(inplace, "inplace") - if not self._can_hold_na: - # If there are no NAs, then interpolate is a no-op + # Only FloatBlocks will contain NaNs. 
timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: return self if inplace else self.copy() # a fill na type method @@ -1252,16 +1113,13 @@ class Block(PandasObject): m = None if m is not None: - if fill_value is not None: - # similar to validate_fillna_kwargs - raise ValueError("Cannot pass both fill_value and method") - return self._interpolate_with_fill( method=m, axis=axis, inplace=inplace, limit=limit, - limit_area=limit_area, + fill_value=fill_value, + coerce=coerce, downcast=downcast, ) # validate the interp method @@ -1288,22 +1146,34 @@ class Block(PandasObject): axis: int = 0, inplace: bool = False, limit: Optional[int] = None, - limit_area: Optional[str] = None, + fill_value: Optional[Any] = None, + coerce: bool = False, downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") - assert self._can_hold_na # checked by caller + # if we are coercing, then don't force the conversion + # if the block can't hold the type + if coerce: + if not self._can_hold_na: + if inplace: + return [self] + else: + return [self.copy()] values = self.values if inplace else self.values.copy() + # We only get here for non-ExtensionBlock + fill_value = convert_scalar_for_putitemlike(fill_value, self.values.dtype) + values = missing.interpolate_2d( values, method=method, axis=axis, limit=limit, - limit_area=limit_area, + fill_value=fill_value, + dtype=self.dtype, ) blocks = [self.make_block_same_class(values, ndim=self.ndim)] @@ -1413,7 +1283,7 @@ class Block(PandasObject): return [self.make_block(new_values)] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1425,7 +1295,6 @@ class Block(PandasObject): errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object - try_cast: bool, default False axis : int, default 0 Returns @@ -1451,46 +1320,46 @@ class Block(PandasObject): if values.ndim - 1 == other.ndim and axis == 1: other = other.reshape(tuple(other.shape + (1,))) elif transpose and values.ndim == self.ndim - 1: - # TODO(EA2D): not neceesssary with 2D EAs cond = cond.T if not hasattr(cond, "shape"): raise ValueError("where must have a condition that is ndarray like") - if cond.ravel("K").all(): - result = values - else: - # see if we can operate on the entire block, or need item-by-item - # or if we are a single block (ndim == 1) - if ( - (self.is_integer or self.is_bool) - and lib.is_float(other) - and np.isnan(other) - ): - # GH#3733 special case to avoid object-dtype casting - # and go through numexpr path instead. 
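where() ultimately performs an element-wise choice between the block's own values and `other`, upcasting first when `other` cannot be held by the current dtype. The user-facing behaviour, sketched with the public API:

import pandas as pd

s = pd.Series([1, 2, 3, 4], dtype="int64")
cond = s > 2

# Values are kept where cond is True and replaced elsewhere.
print(s.where(cond, 0).tolist())   # [0, 0, 3, 4]

# A replacement an int64 block cannot hold forces a dtype upcast first.
print(s.where(cond, 0.5).dtype)    # float64
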
- # In integer case, np.where will cast to floats - pass - elif not self._can_hold_element(other): - # we cannot coerce, return a compat dtype - # we are explicitly ignoring errors - block = self.coerce_to_target_dtype(other) - blocks = block.where( - orig_other, cond, errors=errors, try_cast=try_cast, axis=axis - ) - return self._maybe_downcast(blocks, "infer") + def where_func(cond, values, other): if not ( (self.is_integer or self.is_bool) and lib.is_float(other) and np.isnan(other) ): - # convert datetime to datetime64, timedelta to timedelta64 - other = convert_scalar_for_putitemlike(other, values.dtype) + # np.where will cast integer array to floats in this case + if not self._can_hold_element(other): + raise TypeError + if lib.is_scalar(other) and isinstance(values, np.ndarray): + # convert datetime to datetime64, timedelta to timedelta64 + other = convert_scalar_for_putitemlike(other, values.dtype) # By the time we get here, we should have all Series/Index - # args extracted to ndarray - result = expressions.where(cond, values, other) + # args extracted to ndarray + fastres = expressions.where(cond, values, other) + return fastres + + if cond.ravel("K").all(): + result = values + else: + # see if we can operate on the entire block, or need item-by-item + # or if we are a single block (ndim == 1) + try: + result = where_func(cond, values, other) + except TypeError: + + # we cannot coerce, return a compat dtype + # we are explicitly ignoring errors + block = self.coerce_to_target_dtype(other) + blocks = block.where( + orig_other, cond, errors=errors, try_cast=try_cast, axis=axis, + ) + return self._maybe_downcast(blocks, "infer") if self._can_hold_na or self.ndim == 1: @@ -1504,10 +1373,9 @@ class Block(PandasObject): cond = cond.swapaxes(axis, 0) mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) - result_blocks: List["Block"] = [] + result_blocks = [] for m in [mask, ~mask]: if m.any(): - result = cast(np.ndarray, result) # EABlock overrides where taken = result.take(m.nonzero()[0], axis=axis) r = maybe_downcast_numeric(taken, self.dtype) nb = self.make_block(r.T, placement=self.mgr_locs[m]) @@ -1605,10 +1473,11 @@ class Block(PandasObject): self, to_replace, value, - mask: np.ndarray, inplace: bool = True, regex: bool = False, - ) -> List["Block"]: + convert: bool = False, + mask=None, + ): """ Replace value corresponding to the given boolean array with another value. @@ -1619,36 +1488,33 @@ class Block(PandasObject): Scalar to replace or regular expression to match. value : object Replacement object. - mask : np.ndarray[bool] - True indicate corresponding element is ignored. inplace : bool, default True Perform inplace modification. regex : bool, default False If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. Returns ------- - List[Block] + A new block if there is anything to replace or the original block. 
""" if mask.any(): if not regex: - nb = self.coerce_to_target_dtype(value) - if nb is self and not inplace: - nb = nb.copy() - nb._putmask_simple(mask, value) - return [nb] + self = self.coerce_to_target_dtype(value) + return self.putmask(mask, value, inplace=inplace) else: - regex = _should_use_regex(regex, to_replace) - if regex: - return self._replace_regex( - to_replace, - value, - inplace=inplace, - convert=False, - mask=mask, - ) - return self.replace(to_replace, value, inplace=inplace, regex=False) - return [self] + return self._replace_single( + to_replace, + value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) + return self class ExtensionBlock(Block): @@ -1667,9 +1533,7 @@ class ExtensionBlock(Block): _validate_ndim = False is_extension = True - values: ExtensionArray - - def __init__(self, values, placement, ndim: int): + def __init__(self, values, placement, ndim=None): """ Initialize a non-consolidatable block. @@ -1678,6 +1542,7 @@ class ExtensionBlock(Block): This will call continue to call __init__ for the other base classes mixed in with this Mixin. """ + values = self._maybe_coerce_values(values) # Placement must be converted to BlockPlacement so that we can check # its length @@ -1700,8 +1565,8 @@ class ExtensionBlock(Block): def shape(self): # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: - return (len(self.values),) - return len(self.mgr_locs), len(self.values) + return ((len(self.values)),) + return (len(self.mgr_locs), len(self.values)) def iget(self, col): @@ -1720,14 +1585,18 @@ class ExtensionBlock(Block): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values): - # NB: This is a misnomer, is supposed to be inplace but is not, - # see GH#33457 + def should_store(self, value: ArrayLike) -> bool: + """ + Can we set the given array-like value inplace? + """ + return isinstance(value, self._holder) + + def set(self, locs, values): assert locs.tolist() == [0] self.values = values def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, ) -> List["Block"]: """ See Block.putmask.__doc__ @@ -1741,7 +1610,7 @@ class ExtensionBlock(Block): if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): new = new[mask] - mask = safe_reshape(mask, new_values.shape) + mask = _safe_reshape(mask, new_values.shape) new_values[mask] = new return [self.make_block(values=new_values)] @@ -1810,14 +1679,6 @@ class ExtensionBlock(Block): `indexer` is a direct slice/positional indexer. `value` must be a compatible shape. """ - if not self._can_hold_element(value): - # This is only relevant for DatetimeTZBlock, which has a - # non-trivial `_can_hold_element`. - # https://github.com/pandas-dev/pandas/issues/24020 - # Need a dedicated setitem until GH#24020 (type promotion in setitem - # for extension arrays) is designed and implemented. 
- return self.astype(object).setitem(indexer, value) - if isinstance(indexer, tuple): # TODO(EA2D): not needed with 2D EAs # we are always 1-D @@ -1845,7 +1706,7 @@ class ExtensionBlock(Block): # TODO(EA2D): reshape not needed with 2D EAs # we are expected to return a 2-d ndarray - return self.make_block(values) + return values.reshape(1, len(values)) def take_nd( self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default @@ -1946,7 +1807,7 @@ class ExtensionBlock(Block): return super().diff(n, axis) def shift( - self, periods: int, axis: int = 0, fill_value: Any = None + self, periods: int, axis: int = 0, fill_value: Any = None, ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. @@ -1963,7 +1824,7 @@ class ExtensionBlock(Block): ] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, ) -> List["Block"]: cond = _extract_bool_array(cond) @@ -2046,16 +1907,6 @@ class ObjectValuesExtensionBlock(ExtensionBlock): def external_values(self): return self.values.astype(object) - def _can_hold_element(self, element: Any) -> bool: - if is_valid_nat_for_dtype(element, self.dtype): - return True - if isinstance(element, list) and len(element) == 0: - return True - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, self.dtype.type) - return isinstance(element, self.dtype.type) - class NumericBlock(Block): __slots__ = () @@ -2063,7 +1914,11 @@ class NumericBlock(Block): _can_hold_na = True -class FloatBlock(NumericBlock): +class FloatOrComplexBlock(NumericBlock): + __slots__ = () + + +class FloatBlock(FloatOrComplexBlock): __slots__ = () is_float = True @@ -2071,17 +1926,17 @@ class FloatBlock(NumericBlock): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( - tipo.type, np.timedelta64 + tipo.type, (np.datetime64, np.timedelta64) ) return isinstance( element, (float, int, np.floating, np.int_) ) and not isinstance( element, - (bool, np.bool_, np.timedelta64), + (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64), ) def to_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs, ): """ convert to our native types format """ values = self.values @@ -2098,7 +1953,7 @@ class FloatBlock(NumericBlock): values = np.array(values, dtype="object") values[mask] = na_rep - return self.make_block(values) + return values from pandas.io.formats.format import FloatArrayFormatter @@ -2110,11 +1965,10 @@ class FloatBlock(NumericBlock): quoting=quoting, fixed_width=False, ) - res = formatter.get_result_as_array() - return self.make_block(res) + return formatter.get_result_as_array() -class ComplexBlock(NumericBlock): +class ComplexBlock(FloatOrComplexBlock): __slots__ = () is_complex = True @@ -2126,6 +1980,9 @@ class ComplexBlock(NumericBlock): element, (float, int, complex, np.float_, np.int_) ) and not isinstance(element, (bool, np.bool_)) + def should_store(self, value: ArrayLike) -> bool: + return issubclass(value.dtype.type, np.complexfloating) + class IntBlock(NumericBlock): __slots__ = () @@ -2137,7 +1994,7 @@ class IntBlock(NumericBlock): if tipo is not None: return ( issubclass(tipo.type, np.integer) - and not issubclass(tipo.type, np.timedelta64) + and not issubclass(tipo.type, (np.datetime64, np.timedelta64)) and self.dtype.itemsize 
>= tipo.itemsize ) # We have not inferred an integer from the dtype @@ -2145,10 +2002,16 @@ class IntBlock(NumericBlock): return is_integer(element) or (is_float(element) and element.is_integer()) -class DatetimeLikeBlockMixin(Block): +class DatetimeLikeBlockMixin: """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" - _can_hold_na = True + @property + def _holder(self): + return DatetimeArray + + @property + def fill_value(self): + return np.datetime64("NaT", "ns") def get_values(self, dtype=None): """ @@ -2171,53 +2034,24 @@ class DatetimeLikeBlockMixin(Block): # TODO(EA2D): this can be removed if we ever have 2D EA return self.array_values().reshape(self.shape)[key] - def diff(self, n: int, axis: int = 0) -> List["Block"]: - """ - 1st discrete difference. - - Parameters - ---------- - n : int - Number of periods to diff. - axis : int, default 0 - Axis to diff upon. - - Returns - ------- - A list with a new TimeDeltaBlock. - - Notes - ----- - The arguments here are mimicking shift so they are called correctly - by apply. - """ - # TODO(EA2D): reshape not necessary with 2D EAs - values = self.array_values().reshape(self.shape) - - new_values = values - values.shift(n, axis=axis) - return [ - TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer, ndim=self.ndim) - ] - def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs values = self.array_values() new_values = values.shift(periods, fill_value=fill_value, axis=axis) return self.make_block_same_class(new_values) - def to_native_types(self, na_rep="NaT", **kwargs): - """ convert to our native types format """ - arr = self.array_values() - result = arr._format_native_types(na_rep=na_rep, **kwargs) - return self.make_block(result) - - -class DatetimeBlock(DatetimeLikeBlockMixin): +class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () is_datetime = True - _holder = DatetimeArray - fill_value = np.datetime64("NaT", "ns") + + def __init__(self, values, placement, ndim=None): + values = self._maybe_coerce_values(values) + super().__init__(values, placement=placement, ndim=ndim) + + @property + def _can_hold_na(self): + return True def _maybe_coerce_values(self, values): """ @@ -2257,7 +2091,9 @@ class DatetimeBlock(DatetimeLikeBlockMixin): if copy: # this should be the only copy values = values.copy() - values = DatetimeArray._simple_new(values.view("i8"), dtype=dtype) + if getattr(values, "tz", None) is None: + values = DatetimeArray(values).tz_localize("UTC") + values = values.tz_convert(dtype.tz) return self.make_block(values) # delegate @@ -2266,13 +2102,7 @@ class DatetimeBlock(DatetimeLikeBlockMixin): def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: - if isinstance(element, list) and len(element) == 0: - # Following DatetimeArray._validate_setitem_value - # convention, we treat this as object-dtype - # (even though tipo is float64) - return True - - elif self.is_datetimetz: + if self.is_datetimetz: # require exact match, since non-nano does not exist return is_dtype_equal(tipo, self.dtype) or is_valid_nat_for_dtype( element, self.dtype @@ -2289,7 +2119,16 @@ class DatetimeBlock(DatetimeLikeBlockMixin): return is_valid_nat_for_dtype(element, self.dtype) - def set_inplace(self, locs, values): + def to_native_types(self, na_rep="NaT", date_format=None, **kwargs): + """ convert to our native types format """ + dta = self.array_values() + + result = dta._format_native_types( + 
na_rep=na_rep, date_format=date_format, **kwargs + ) + return np.atleast_2d(result) + + def set(self, locs, values): """ See Block.set.__doc__ """ @@ -2301,24 +2140,21 @@ class DatetimeBlock(DatetimeLikeBlockMixin): class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ - values: DatetimeArray - __slots__ = () is_datetimetz = True is_extension = True internal_values = Block.internal_values - - _holder = DatetimeBlock._holder _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types - diff = DatetimeBlock.diff - fillna = DatetimeBlock.fillna # i.e. Block.fillna - fill_value = DatetimeBlock.fill_value - _can_hold_na = DatetimeBlock._can_hold_na - + fill_value = np.datetime64("NaT", "ns") + should_store = Block.should_store array_values = ExtensionBlock.array_values + @property + def _holder(self): + return DatetimeArray + def _maybe_coerce_values(self, values): """ Input validation for values passed to __init__. Ensure that @@ -2383,6 +2219,69 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. + + Parameters + ---------- + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. + + Returns + ------- + A list with a new TimeDeltaBlock. + + Notes + ----- + The arguments here are mimicking shift so they are called correctly + by apply. + """ + if axis == 0: + # TODO(EA2D): special case not needed with 2D EAs + # Cannot currently calculate diff across multiple blocks since this + # function is invoked via apply + raise NotImplementedError + + if n == 0: + # Fastpath avoids making a copy in `shift` + new_values = np.zeros(self.values.shape, dtype=np.int64) + else: + new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 + + # Reshape the new_values like how algos.diff does for timedelta data + new_values = new_values.reshape(1, len(new_values)) + new_values = new_values.astype("timedelta64[ns]") + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + + def fillna(self, value, limit=None, inplace=False, downcast=None): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + if self._can_hold_element(value): + return super().fillna(value, limit, inplace, downcast) + + # different timezones, or a non-tz + return self.astype(object).fillna( + value, limit=limit, inplace=inplace, downcast=downcast + ) + + def setitem(self, indexer, value): + # https://github.com/pandas-dev/pandas/issues/24020 + # Need a dedicated setitem until #24020 (type promotion in setitem + # for extension arrays) is designed and implemented. 
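# ---------------------------------------------------------------------------
# Illustrative aside (not part of the diff): the DatetimeTZBlock.fillna/setitem
# fallbacks above coerce to object when the value cannot be held, e.g. a fill
# value in a different timezone. Sketch of that behaviour in the pandas version
# this diff targets (example data is mine):
import pandas as pd

s = pd.Series(pd.to_datetime(["2020-01-01", None]).tz_localize("UTC"))
print(s.fillna(pd.Timestamp("2020-01-02", tz="US/Eastern")).dtype)   # object
# ---------------------------------------------------------------------------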
+ if self._can_hold_element(value) or ( + isinstance(indexer, np.ndarray) and indexer.size == 0 + ): + return super().setitem(indexer, value) + + obj_vals = self.values.astype(object) + newb = make_block( + obj_vals, placement=self.mgr_locs, klass=ObjectBlock, ndim=self.ndim + ) + return newb.setitem(indexer, value) + def quantile(self, qs, interpolation="linear", axis=0): naive = self.values.view("M8[ns]") @@ -2419,23 +2318,21 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): return ndim -class TimeDeltaBlock(DatetimeLikeBlockMixin): +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () is_timedelta = True + _can_hold_na = True + is_numeric = False fill_value = np.timedelta64("NaT", "ns") - def _maybe_coerce_values(self, values): + def __init__(self, values, placement, ndim=None): if values.dtype != TD64NS_DTYPE: - # non-nano we will convert to nano - if values.dtype.kind != "m": - # caller is responsible for ensuring timedelta64 dtype - raise TypeError(values.dtype) # pragma: no cover - + # e.g. non-nano or int64 values = TimedeltaArray._from_sequence(values)._data if isinstance(values, TimedeltaArray): values = values._data assert isinstance(values, np.ndarray), type(values) - return values + super().__init__(values, placement=placement, ndim=ndim) @property def _holder(self): @@ -2452,8 +2349,9 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin): return is_valid_nat_for_dtype(element, self.dtype) def fillna(self, value, **kwargs): - # TODO(EA2D): if we operated on array_values, TDA.fillna would handle - # raising here. + + # allow filling with integers to be + # interpreted as nanoseconds if is_integer(value): # Deprecation GH#24694, GH#19233 raise TypeError( @@ -2463,6 +2361,11 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin): ) return super().fillna(value, **kwargs) + def to_native_types(self, na_rep="NaT", **kwargs): + """ convert to our native types format """ + tda = self.array_values() + return tda._format_native_types(na_rep, **kwargs) + class BoolBlock(NumericBlock): __slots__ = () @@ -2475,16 +2378,26 @@ class BoolBlock(NumericBlock): return issubclass(tipo.type, np.bool_) return isinstance(element, (bool, np.bool_)) + def replace(self, to_replace, value, inplace=False, regex=False, convert=True): + inplace = validate_bool_kwarg(inplace, "inplace") + to_replace_values = np.atleast_1d(to_replace) + if not np.can_cast(to_replace_values, bool): + return self + return super().replace( + to_replace, value, inplace=inplace, regex=regex, convert=convert, + ) + class ObjectBlock(Block): __slots__ = () is_object = True _can_hold_na = True - def _maybe_coerce_values(self, values): + def __init__(self, values, placement=None, ndim=2): if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) - return values + + super().__init__(values, ndim=ndim, placement=placement) @property def is_bool(self): @@ -2494,48 +2407,20 @@ class ObjectBlock(Block): """ return lib.is_bool_array(self.values.ravel("K")) - def reduce(self, func, ignore_failures: bool = False) -> List[Block]: - """ - For object-dtype, we operate column-wise. 
- """ - assert self.ndim == 2 - - values = self.values - if len(values) > 1: - # split_and_operate expects func with signature (mask, values, inplace) - def mask_func(mask, values, inplace): - if values.ndim == 1: - values = values.reshape(1, -1) - return func(values) - - return self.split_and_operate( - None, mask_func, False, ignore_failures=ignore_failures - ) - - try: - res = func(values) - except TypeError: - if not ignore_failures: - raise - return [] - - assert isinstance(res, np.ndarray) - assert res.ndim == 1 - res = res.reshape(1, -1) - return [self.make_block_same_class(res)] - def convert( self, copy: bool = True, datetime: bool = True, numeric: bool = True, timedelta: bool = True, - ) -> List["Block"]: + coerce: bool = False, + ): """ - attempt to cast any object types to better types return a copy of + attempt to coerce any object types to better types return a copy of the block (if copy = True) by definition we ARE an ObjectBlock!!!!! - """ + can return multiple blocks! + """ # operate column-by-column def f(mask, val, idx): shape = val.shape @@ -2544,6 +2429,7 @@ class ObjectBlock(Block): datetime=datetime, numeric=numeric, timedelta=timedelta, + coerce=coerce, copy=copy, ) if isinstance(values, np.ndarray): @@ -2556,7 +2442,7 @@ class ObjectBlock(Block): blocks = self.split_and_operate(None, f, False) else: values = f(None, self.values.ravel(), None) - blocks = [self.make_block(values)] + blocks = [make_block(values, ndim=self.ndim, placement=self.mgr_locs)] return blocks @@ -2566,59 +2452,208 @@ class ObjectBlock(Block): return blocks # split and convert the blocks - return extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) + return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) def _can_hold_element(self, element: Any) -> bool: return True - def replace( - self, - to_replace, - value, - inplace: bool = False, - regex: bool = False, - ) -> List["Block"]: - # Note: the checks we do in NDFrame.replace ensure we never get - # here with listlike to_replace or value, as those cases - # go through _replace_list + def replace(self, to_replace, value, inplace=False, regex=False, convert=True): + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) + both_lists = to_rep_is_list and value_is_list + either_list = to_rep_is_list or value_is_list - regex = _should_use_regex(regex, to_replace) + result_blocks = [] + blocks = [self] - if regex: - return self._replace_regex(to_replace, value, inplace=inplace) + if not either_list and is_re(to_replace): + return self._replace_single( + to_replace, value, inplace=inplace, regex=True, convert=convert, + ) + elif not (either_list or regex): + return super().replace( + to_replace, value, inplace=inplace, regex=regex, convert=convert, + ) + elif both_lists: + for to_rep, v in zip(to_replace, value): + result_blocks = [] + for b in blocks: + result = b._replace_single( + to_rep, v, inplace=inplace, regex=regex, convert=convert, + ) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + elif to_rep_is_list and regex: + for to_rep in to_replace: + result_blocks = [] + for b in blocks: + result = b._replace_single( + to_rep, value, inplace=inplace, regex=regex, convert=convert, + ) + result_blocks = _extend_blocks(result, result_blocks) + blocks = result_blocks + return result_blocks + + return self._replace_single( + to_replace, value, inplace=inplace, convert=convert, regex=regex, + ) + + def _replace_single( + self, 
to_replace, value, inplace=False, regex=False, convert=True, mask=None, + ): + """ + Replace elements by the given value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. + convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + a new block, the result after replacing + """ + inplace = validate_bool_kwarg(inplace, "inplace") + + # to_replace is regex compilable + to_rep_re = regex and is_re_compilable(to_replace) + + # regex is regex compilable + regex_re = is_re_compilable(regex) + + # only one will survive + if to_rep_re and regex_re: + raise AssertionError( + "only one of to_replace and regex can be regex compilable" + ) + + # if regex was passed as something that can be a regex (rather than a + # boolean) + if regex_re: + to_replace = regex + + regex = regex_re or to_rep_re + + # try to get the pattern attribute (compiled re) or it's a string + if is_re(to_replace): + pattern = to_replace.pattern else: - return super().replace(to_replace, value, inplace=inplace, regex=False) + pattern = to_replace + # if the pattern is not empty and to_replace is either a string or a + # regex + if regex and pattern: + rx = re.compile(to_replace) + else: + # if the thing to replace is not a string or compiled regex call + # the superclass method -> to_replace is some kind of object + return super().replace(to_replace, value, inplace=inplace, regex=regex) -def _should_use_regex(regex: bool, to_replace: Any) -> bool: - """ - Decide whether to treat `to_replace` as a regular expression. - """ - if is_re(to_replace): - regex = True + new_values = self.values if inplace else self.values.copy() - regex = regex and is_re_compilable(to_replace) + # deal with replacing values with objects (strings) that match but + # whose replacement is not a string (numeric, nan, object) + if isna(value) or not isinstance(value, str): - # Don't use regex if the pattern is empty. - regex = regex and re.compile(to_replace).pattern != "" - return regex + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return value if rx.search(s) is not None else s + else: + return s + + else: + # value is guaranteed to be a string here, s can be either a string + # or null if it's null it gets returned + def re_replacer(s): + if is_re(rx) and isinstance(s, str): + return rx.sub(value, s) + else: + return s + + f = np.vectorize(re_replacer, otypes=[self.dtype]) + + if mask is None: + new_values[:] = f(new_values) + else: + new_values[mask] = f(new_values[mask]) + + # convert + block = self.make_block(new_values) + if convert: + block = block.convert(numeric=False) + return block + + def _replace_coerce( + self, to_replace, value, inplace=True, regex=False, convert=False, mask=None + ): + """ + Replace value corresponding to the given boolean array with another + value. + + Parameters + ---------- + to_replace : object or pattern + Scalar to replace or regular expression to match. + value : object + Replacement object. + inplace : bool, default False + Perform inplace modification. + regex : bool, default False + If true, perform regular expression substitution. 
+ convert : bool, default True + If true, try to coerce any object types to better types. + mask : array-like of bool, optional + True indicate corresponding element is ignored. + + Returns + ------- + A new block if there is anything to replace or the original block. + """ + if mask.any(): + block = super()._replace_coerce( + to_replace=to_replace, + value=value, + inplace=inplace, + regex=regex, + convert=convert, + mask=mask, + ) + if convert: + block = [b.convert(numeric=False, copy=True) for b in block] + return block + if convert: + return [self.convert(numeric=False, copy=True)] + return self class CategoricalBlock(ExtensionBlock): __slots__ = () + is_categorical = True + _can_hold_na = True - def _replace_list( - self, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> List["Block"]: - if len(algos.unique(dest_list)) == 1: - # We likely got here by tiling value inside NDFrame.replace, - # so un-tile here - return self.replace(src_list, dest_list[0], inplace, regex) - return super()._replace_list(src_list, dest_list, inplace, regex) + should_store = Block.should_store + + def __init__(self, values, placement, ndim=None): + # coerce to categorical if we can + values = extract_array(values) + assert isinstance(values, Categorical), type(values) + super().__init__(values, placement=placement, ndim=ndim) + + @property + def _holder(self): + return Categorical def replace( self, @@ -2626,12 +2661,13 @@ class CategoricalBlock(ExtensionBlock): value, inplace: bool = False, regex: bool = False, - ) -> List["Block"]: + convert: bool = True, + ): inplace = validate_bool_kwarg(inplace, "inplace") result = self if inplace else self.copy() result.values.replace(to_replace, value, inplace=True) - return [result] + return result # ----------------------------------------------------------------- @@ -2654,8 +2690,6 @@ def get_block_type(values, dtype=None): dtype = dtype or values.dtype vtype = dtype.type - cls: Type[Block] - if is_sparse(dtype): # Need this first(ish) so that Sparse[datetime] is sparse cls = ExtensionBlock @@ -2669,7 +2703,6 @@ def get_block_type(values, dtype=None): elif is_interval_dtype(dtype) or is_period_dtype(dtype): cls = ObjectValuesExtensionBlock elif is_extension_array_dtype(values.dtype): - # Note: need to be sure PandasArray is unwrapped before we get here cls = ExtensionBlock elif issubclass(vtype, np.floating): cls = FloatBlock @@ -2714,7 +2747,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None): # ----------------------------------------------------------------- -def extend_blocks(result, blocks=None): +def _extend_blocks(result, blocks=None): """ return a new extended blocks, given the result """ if blocks is None: blocks = [] @@ -2738,12 +2771,11 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. 
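# ---------------------------------------------------------------------------
# Illustrative aside (not part of the diff): ObjectBlock._replace_single above
# vectorises re.search / re.sub over object values; a hedged usage example with
# made-up data:
import pandas as pd

s = pd.Series(["foo-1", "bar-2", "baz-3"], dtype=object)
print(s.replace(r"-\d+", "", regex=True).tolist())    # ['foo', 'bar', 'baz']
# ---------------------------------------------------------------------------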
- # error: "ExtensionArray" has no attribute "reshape" - values = values.reshape(tuple((1,) + shape)) # type: ignore[attr-defined] + values = values.reshape(tuple((1,) + shape)) # type: ignore return values -def safe_reshape(arr, new_shape: Shape): +def _safe_reshape(arr, new_shape): """ If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): @@ -2805,7 +2837,7 @@ def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray: else: # make sure that we have a nullable type # if we have nulls - if not isna_compat(v, nn[0]): + if not _isna_compat(v, nn[0]): pass elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): # only compare integers/floats @@ -2853,9 +2885,7 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... - # Except for BooleanArray, this is equivalent to just - # np.asarray(mask, dtype=bool) - mask = mask.to_numpy(dtype=bool, na_value=False) + mask = np.asarray(mask, dtype=np.bool_) assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype diff --git a/venv/lib/python3.8/site-packages/pandas/core/internals/concat.py b/venv/lib/python3.8/site-packages/pandas/core/internals/concat.py index 06de197..5d06cb4 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/internals/concat.py +++ b/venv/lib/python3.8/site-packages/pandas/core/internals/concat.py @@ -1,16 +1,15 @@ from collections import defaultdict import copy -from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast +from typing import List import numpy as np from pandas._libs import NaT, internals as libinternals -from pandas._typing import DtypeObj, Shape from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - get_dtype, + _get_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, @@ -21,19 +20,16 @@ from pandas.core.dtypes.common import ( is_timedelta64_dtype, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import isna_all +from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager -if TYPE_CHECKING: - from pandas.core.arrays.sparse.dtype import SparseDtype - def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool + mgrs_indexers, axes, concat_axis: int, copy: bool, ) -> BlockManager: """ Concatenate block managers into one. @@ -80,9 +76,8 @@ def concatenate_block_managers( b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy), + _concatenate_join_units(join_units, concat_axis, copy=copy,), placement=placement, - ndim=len(axes), ) blocks.append(b) @@ -105,10 +100,10 @@ def _get_mgr_concatenation_plan(mgr, indexers): """ # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. 
- mgr_shape_list = list(mgr.shape) + mgr_shape = list(mgr.shape) for ax, indexer in indexers.items(): - mgr_shape_list[ax] = len(indexer) - mgr_shape = tuple(mgr_shape_list) + mgr_shape[ax] = len(indexer) + mgr_shape = tuple(mgr_shape) if 0 in indexers: ax0_indexer = indexers.pop(0) @@ -116,7 +111,7 @@ def _get_mgr_concatenation_plan(mgr, indexers): blklocs = algos.take_1d(mgr.blklocs, ax0_indexer, fill_value=-1) else: - if mgr.is_single_block: + if mgr._is_single_block: blk = mgr.blocks[0] return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] @@ -131,9 +126,9 @@ def _get_mgr_concatenation_plan(mgr, indexers): join_unit_indexers = indexers.copy() - shape_list = list(mgr_shape) - shape_list[0] = len(placements) - shape = tuple(shape_list) + shape = list(mgr_shape) + shape[0] = len(placements) + shape = tuple(shape) if blkno == -1: unit = JoinUnit(None, shape) @@ -176,7 +171,7 @@ def _get_mgr_concatenation_plan(mgr, indexers): class JoinUnit: - def __init__(self, block, shape: Shape, indexers=None): + def __init__(self, block, shape, indexers=None): # Passing shape explicitly is required for cases when block is None. if indexers is None: indexers = {} @@ -188,7 +183,7 @@ class JoinUnit: return f"{type(self).__name__}({repr(self.block)}, {self.indexers})" @cache_readonly - def needs_filling(self) -> bool: + def needs_filling(self): for indexer in self.indexers.values(): # FIXME: cache results of indexer == -1 checks. if (indexer == -1).any(): @@ -204,10 +199,10 @@ class JoinUnit: if not self.needs_filling: return self.block.dtype else: - return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) + return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) @cache_readonly - def is_na(self) -> bool: + def is_na(self): if self.block is None: return True @@ -218,17 +213,24 @@ class JoinUnit: # a block is NOT null, chunks should help in such cases. 1000 value # was chosen rather arbitrarily. 
values = self.block.values - if is_sparse(self.block.values.dtype): + if self.block.is_categorical: + values_flat = values.categories + elif is_sparse(self.block.values.dtype): return False elif self.block.is_extension: # TODO(EA2D): no need for special case with 2D EAs values_flat = values else: values_flat = values.ravel(order="K") + total_len = values_flat.shape[0] + chunk_len = max(total_len // 40, 1000) + for i in range(0, total_len, chunk_len): + if not isna(values_flat[i : i + chunk_len]).all(): + return False - return isna_all(values_flat) + return True - def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na): + def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value @@ -249,8 +251,9 @@ class JoinUnit: empty_dtype ): if self.block is None: + array = empty_dtype.construct_array_type() # TODO(EA2D): special case unneeded with 2D EAs - return DatetimeArray( + return array( np.full(self.shape[1], fill_value.value), dtype=empty_dtype ) elif getattr(self.block, "is_categorical", False): @@ -340,12 +343,12 @@ def _concatenate_join_units(join_units, concat_axis, copy): # consolidated 2D block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat(to_concat, axis=concat_axis) + concat_values = concat_compat(to_concat, axis=concat_axis,) return concat_values -def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: +def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. @@ -375,8 +378,45 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A else: dtypes[i] = unit.dtype - upcast_classes = _get_upcast_classes(join_units, dtypes) + upcast_classes = defaultdict(list) + null_upcast_classes = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + if is_categorical_dtype(dtype): + upcast_cls = "category" + elif is_datetime64tz_dtype(dtype): + upcast_cls = "datetimetz" + + elif is_extension_array_dtype(dtype): + upcast_cls = "extension" + + elif issubclass(dtype.type, np.bool_): + upcast_cls = "bool" + elif issubclass(dtype.type, np.object_): + upcast_cls = "object" + elif is_datetime64_dtype(dtype): + upcast_cls = "datetime" + elif is_timedelta64_dtype(dtype): + upcast_cls = "timedelta" + elif is_sparse(dtype): + upcast_cls = dtype.subtype.name + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + upcast_cls = dtype.name + else: + upcast_cls = "float" + + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. + if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? 
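# ---------------------------------------------------------------------------
# Illustrative aside (not part of the diff): _get_empty_dtype_and_na above picks
# the dtype/NA pair used when a concat must fill in blocks that are missing on
# one side; a hedged, user-level example of the effect (data is mine):
import pandas as pd

left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"b": [3, 4]})
print(pd.concat([left, right]).dtypes)   # both columns end up float64 to hold NaN
# ---------------------------------------------------------------------------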
# create the result if "extension" in upcast_classes: @@ -405,74 +445,23 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: - common_dtype = np.find_common_type(upcast_classes, []) + g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: - if is_float_dtype(common_dtype): - return common_dtype, common_dtype.type(np.nan) - elif is_numeric_dtype(common_dtype): + if is_float_dtype(g): + return g, g.type(np.nan) + elif is_numeric_dtype(g): if has_none_blocks: return np.dtype(np.float64), np.nan else: - return common_dtype, None + return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) -def _get_upcast_classes( - join_units: Sequence[JoinUnit], - dtypes: Sequence[DtypeObj], -) -> Dict[str, List[DtypeObj]]: - """Create mapping between upcast class names and lists of dtypes.""" - upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - upcast_cls = _select_upcast_cls_from_dtype(dtype) - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. - if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes - - return upcast_classes - - -def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: - """Select upcast class name based on dtype.""" - if is_categorical_dtype(dtype): - return "category" - elif is_datetime64tz_dtype(dtype): - return "datetimetz" - elif is_extension_array_dtype(dtype): - return "extension" - elif issubclass(dtype.type, np.bool_): - return "bool" - elif issubclass(dtype.type, np.object_): - return "object" - elif is_datetime64_dtype(dtype): - return "datetime" - elif is_timedelta64_dtype(dtype): - return "timedelta" - elif is_sparse(dtype): - dtype = cast("SparseDtype", dtype) - return dtype.subtype.name - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - return dtype.name - else: - return "float" - - def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can @@ -484,8 +473,8 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: # cannot necessarily join return ( # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa - and + all(type(ju.block) is type(join_units[0].block) for ju in join_units) + and # noqa # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. 
all(not ju.is_na or ju.block.is_extension for ju in join_units) diff --git a/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py b/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py index 9c2d08b..2d4163e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py +++ b/venv/lib/python3.8/site-packages/pandas/core/internals/construction.py @@ -9,12 +9,10 @@ import numpy as np import numpy.ma as ma from pandas._libs import lib -from pandas._typing import Axis, DtypeObj, Label, Scalar +from pandas._typing import Axis, DtypeObj, Scalar from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, - construct_1d_ndarray_preserving_na, - dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -53,7 +51,7 @@ from pandas.core.internals.managers import ( ) if TYPE_CHECKING: - from pandas import Series + from pandas import Series # noqa:F401 # --------------------------------------------------------------------- # BlockManager Interface @@ -191,16 +189,15 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) - if dtype is not None and not is_dtype_equal(values.dtype, dtype): - try: - values = construct_1d_ndarray_preserving_na( - values.ravel(), dtype=dtype, copy=False - ).reshape(values.shape) - except Exception as orig: - # e.g. ValueError when trying to cast object dtype to float64 - raise ValueError( - f"failed to cast to '{dtype}' (Exception was: {orig})" - ) from orig + if dtype is not None: + if not is_dtype_equal(values.dtype, dtype): + try: + values = values.astype(dtype) + except Exception as orig: + # e.g. ValueError when trying to cast object dtype to float64 + raise ValueError( + f"failed to cast to '{dtype}' (Exception was: {orig})" + ) from orig # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes( @@ -225,8 +222,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # TODO: What about re-joining object columns? 
block_values = [ - make_block(dvals_list[n], placement=[n], ndim=2) - for n in range(len(dvals_list)) + make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) ] else: @@ -246,7 +242,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays: Union[Sequence[Any], "Series"] if columns is not None: - from pandas.core.series import Series + from pandas.core.series import Series # noqa:F811 arrays = Series(data, index=columns, dtype=object) data_names = arrays.index @@ -348,7 +344,7 @@ def _homogenize(data, index, dtype: Optional[DtypeObj]): oindex = index.astype("O") if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): - val = dict_compat(val) + val = com.dict_compat(val) else: val = dict(val) val = lib.fast_multiget(val, oindex._values, default=np.nan) @@ -370,7 +366,7 @@ def extract_index(data) -> Index: index = Index([]) elif len(data) > 0: raw_lengths = [] - indexes: List[Union[List[Label], Index]] = [] + indexes = [] have_raw_arrays = False have_series = False @@ -438,7 +434,7 @@ def get_names_from_index(data): if not has_some_name: return ibase.default_index(len(data)) - index: List[Label] = list(range(len(data))) + index = list(range(len(data))) count = 0 for i, s in enumerate(data): n = getattr(s, "name", None) @@ -611,7 +607,7 @@ def _list_of_series_to_arrays( def _list_of_dict_to_arrays( - data: List[Dict], + data: List, columns: Union[Index, List], coerce_float: bool = False, dtype: Optional[DtypeObj] = None, diff --git a/venv/lib/python3.8/site-packages/pandas/core/internals/managers.py b/venv/lib/python3.8/site-packages/pandas/core/internals/managers.py index 93ab207..67bf258 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/internals/managers.py +++ b/venv/lib/python3.8/site-packages/pandas/core/internals/managers.py @@ -1,12 +1,13 @@ from collections import defaultdict import itertools +import operator +import re from typing import ( - Any, - Callable, DefaultDict, Dict, List, Optional, + Pattern, Sequence, Tuple, TypeVar, @@ -17,7 +18,7 @@ import warnings import numpy as np from pandas._libs import internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label, Shape +from pandas._typing import ArrayLike, DtypeObj, Label, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -27,18 +28,23 @@ from pandas.core.dtypes.cast import ( ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, + is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_list_like, + is_numeric_v_string_like, + is_scalar, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries -from pandas.core.dtypes.missing import array_equals, isna +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import array_equivalent, isna import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject +import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index @@ -48,12 +54,12 @@ from pandas.core.internals.blocks import ( DatetimeTZBlock, ExtensionBlock, ObjectValuesExtensionBlock, - extend_blocks, + _extend_blocks, + _safe_reshape, get_block_type, make_block, - 
safe_reshape, ) -from pandas.core.internals.ops import blockwise_all, operate_blockwise +from pandas.core.internals.ops import operate_blockwise # TODO: flexible with index=None and/or items=None @@ -204,7 +210,7 @@ class BlockManager(PandasObject): __bool__ = __nonzero__ @property - def shape(self) -> Shape: + def shape(self) -> Tuple[int, ...]: return tuple(len(ax) for ax in self.axes) @property @@ -225,8 +231,8 @@ class BlockManager(PandasObject): self.axes[axis] = new_labels @property - def is_single_block(self) -> bool: - # Assumes we are 2D; overridden by SingleBlockManager + def _is_single_block(self) -> bool: + # Assumes we are 2D; overriden by SingleBlockManager return len(self.blocks) == 1 def _rebuild_blknos_and_blklocs(self) -> None: @@ -267,7 +273,7 @@ class BlockManager(PandasObject): "0.14.1": { "axes": axes_array, "blocks": [ - {"values": b.values, "mgr_locs": b.mgr_locs.indexer} + dict(values=b.values, mgr_locs=b.mgr_locs.indexer) for b in self.blocks ], } @@ -328,44 +334,31 @@ class BlockManager(PandasObject): f"tot_items: {tot_items}" ) - def reduce( - self: T, func: Callable, ignore_failures: bool = False - ) -> Tuple[T, np.ndarray]: - """ - Apply reduction function blockwise, returning a single-row BlockManager. - - Parameters - ---------- - func : reduction function - ignore_failures : bool, default False - Whether to drop blocks where func raises TypeError. - - Returns - ------- - BlockManager - np.ndarray - Indexer of mgr_locs that are retained. - """ + def reduce(self, func): # If 2D, we assume that we're operating column-wise - assert self.ndim == 2 + if self.ndim == 1: + # we'll be returning a scalar + blk = self.blocks[0] + return func(blk.values) - res_blocks: List[Block] = [] + res = {} for blk in self.blocks: - nbs = blk.reduce(func, ignore_failures) - res_blocks.extend(nbs) + bres = func(blk.values) - index = Index([None]) # placeholder - if ignore_failures: - if res_blocks: - indexer = np.concatenate([blk.mgr_locs.as_array for blk in res_blocks]) - new_mgr = self._combine(res_blocks, copy=False, index=index) + if np.ndim(bres) == 0: + # EA + assert blk.shape[0] == 1 + new_res = zip(blk.mgr_locs.as_array, [bres]) else: - indexer = [] - new_mgr = type(self).from_blocks([], [Index([]), index]) - else: - indexer = np.arange(self.shape[0]) - new_mgr = type(self).from_blocks(res_blocks, [self.items, index]) - return new_mgr, indexer + assert bres.ndim == 1, bres.shape + assert blk.shape[0] == len(bres), (blk.shape, bres.shape) + new_res = zip(blk.mgr_locs.as_array, bres) + + nr = dict(new_res) + assert not any(key in res for key in nr) + res.update(nr) + + return res def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ @@ -373,13 +366,7 @@ class BlockManager(PandasObject): """ return operate_blockwise(self, other, array_op) - def apply( - self: T, - f, - align_keys: Optional[List[str]] = None, - ignore_failures: bool = False, - **kwargs, - ) -> T: + def apply(self: T, f, align_keys=None, **kwargs) -> T: """ Iterate over the blocks, collect and create a new BlockManager. @@ -387,10 +374,6 @@ class BlockManager(PandasObject): ---------- f : str or callable Name of the Block method to apply. 
- align_keys: List[str] or None, default None - ignore_failures: bool, default False - **kwargs - Keywords to pass to `f` Returns ------- @@ -420,19 +403,11 @@ class BlockManager(PandasObject): # otherwise we have an ndarray kwargs[k] = obj[b.mgr_locs.indexer] - try: - if callable(f): - applied = b.apply(f, **kwargs) - else: - applied = getattr(b, f)(**kwargs) - except (TypeError, NotImplementedError): - if not ignore_failures: - raise - continue - result_blocks = extend_blocks(applied, result_blocks) - - if ignore_failures: - return self._combine(result_blocks) + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + result_blocks = _extend_blocks(applied, result_blocks) if len(result_blocks) == 0: return self.make_empty(self.axes) @@ -539,7 +514,7 @@ class BlockManager(PandasObject): values = values.take(indexer) return SingleBlockManager( - make_block(values, ndim=1, placement=np.arange(len(values))), axes[0] + make_block(values, ndim=1, placement=np.arange(len(values))), axes[0], ) def isna(self, func) -> "BlockManager": @@ -567,7 +542,9 @@ class BlockManager(PandasObject): def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) - def putmask(self, mask, new, align: bool = True, axis: int = 0): + def putmask( + self, mask, new, align: bool = True, axis: int = 0, + ): transpose = self.ndim == 2 if align: @@ -593,12 +570,8 @@ class BlockManager(PandasObject): return self.apply("interpolate", **kwargs) def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": - if fill_value is lib.no_default: - fill_value = None - if axis == 0 and self.ndim == 2 and self.nblocks > 1: # GH#35488 we need to watch out for multi-block cases - # We only get here with fill_value not-lib.no_default ncols = self.shape[0] if periods > 0: indexer = [-1] * periods + list(range(ncols - periods)) @@ -636,6 +609,7 @@ class BlockManager(PandasObject): datetime: bool = True, numeric: bool = True, timedelta: bool = True, + coerce: bool = False, ) -> "BlockManager": return self.apply( "convert", @@ -643,41 +617,70 @@ class BlockManager(PandasObject): datetime=datetime, numeric=numeric, timedelta=timedelta, + coerce=coerce, ) - def replace(self, to_replace, value, inplace: bool, regex: bool) -> "BlockManager": + def replace(self, value, **kwargs) -> "BlockManager": assert np.ndim(value) == 0, value - return self.apply( - "replace", to_replace=to_replace, value=value, inplace=inplace, regex=regex - ) + return self.apply("replace", value=value, **kwargs) def replace_list( - self: T, - src_list: List[Any], - dest_list: List[Any], - inplace: bool = False, - regex: bool = False, - ) -> T: + self, src_list, dest_list, inplace: bool = False, regex: bool = False + ) -> "BlockManager": """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") - bm = self.apply( - "_replace_list", - src_list=src_list, - dest_list=dest_list, - inplace=inplace, - regex=regex, - ) + # figure out our mask apriori to avoid repeated replacements + values = self.as_array() + + def comp(s: Scalar, mask: np.ndarray, regex: bool = False): + """ + Generate a bool array by perform an equality check, or perform + an element-wise regular expression matching + """ + if isna(s): + return ~mask + + s = com.maybe_box_datetimelike(s) + return _compare_or_regex_search(values, s, regex, mask) + + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(values) + + masks = [comp(s, 
mask, regex) for s in src_list] + + result_blocks = [] + src_len = len(src_list) - 1 + for blk in self.blocks: + + # its possible to get multiple result blocks here + # replace ALWAYS will return a list + rb = [blk if inplace else blk.copy()] + for i, (s, d) in enumerate(zip(src_list, dest_list)): + new_rb: List[Block] = [] + for b in rb: + m = masks[i][b.mgr_locs.indexer] + convert = i == src_len # only convert once at the end + result = b._replace_coerce( + mask=m, + to_replace=s, + value=d, + inplace=inplace, + convert=convert, + regex=regex, + ) + if m.any() or convert: + new_rb = _extend_blocks(result, new_rb) + else: + new_rb.append(b) + rb = new_rb + result_blocks.extend(rb) + + bm = type(self).from_blocks(result_blocks, self.axes) bm._consolidate_inplace() return bm - def to_native_types(self, **kwargs) -> "BlockManager": - """ - Convert values to native types (strings / python objects) that are used - in formatting (repr / csv). - """ - return self.apply("to_native_types", **kwargs) - def is_consolidated(self) -> bool: """ Return True if more than one block with the same dtype @@ -691,6 +694,12 @@ class BlockManager(PandasObject): self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True + @property + def is_mixed_type(self) -> bool: + # Warning, consolidation needs to get checked upstairs + self._consolidate_inplace() + return len(self.blocks) > 1 + @property def is_numeric_mixed_type(self) -> bool: return all(block.is_numeric for block in self.blocks) @@ -717,28 +726,13 @@ class BlockManager(PandasObject): def get_bool_data(self, copy: bool = False) -> "BlockManager": """ - Select blocks that are bool-dtype and columns from object-dtype blocks - that are all-bool. - Parameters ---------- copy : bool, default False Whether to copy the blocks """ - - new_blocks = [] - - for blk in self.blocks: - if blk.dtype == bool: - new_blocks.append(blk) - - elif blk.is_object: - nbs = blk._split() - for nb in nbs: - if nb.is_bool: - new_blocks.append(nb) - - return self._combine(new_blocks, copy) + self._consolidate_inplace() + return self._combine([b for b in self.blocks if b.is_bool], copy) def get_numeric_data(self, copy: bool = False) -> "BlockManager": """ @@ -749,9 +743,7 @@ class BlockManager(PandasObject): """ return self._combine([b for b in self.blocks if b.is_numeric], copy) - def _combine( - self: T, blocks: List[Block], copy: bool = True, index: Optional[Index] = None - ) -> T: + def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": """ return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() @@ -760,15 +752,13 @@ class BlockManager(PandasObject): indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) - new_blocks: List[Block] = [] + new_blocks = [] for b in blocks: b = b.copy(deep=copy) b.mgr_locs = inv_indexer[b.mgr_locs.indexer] new_blocks.append(b) axes = list(self.axes) - if index is not None: - axes[-1] = index axes[0] = self.items.take(indexer) return type(self).from_blocks(new_blocks, axes) @@ -857,7 +847,7 @@ class BlockManager(PandasObject): # mutating the original object copy = copy or na_value is not lib.no_default - if self.is_single_block: + if self._is_single_block: blk = self.blocks[0] if blk.is_extension: # Avoid implicit conversion of extension blocks to object @@ -928,7 +918,12 @@ class BlockManager(PandasObject): Returns ------- values : a dict of dtype -> BlockManager + + Notes + ----- + This 
consolidates based on str(dtype) """ + self._consolidate_inplace() bd: Dict[str, List[Block]] = {} for b in self.blocks: @@ -1075,7 +1070,7 @@ class BlockManager(PandasObject): else: if value.ndim == self.ndim - 1: - value = safe_reshape(value, (1,) + value.shape) + value = _safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -1107,7 +1102,7 @@ class BlockManager(PandasObject): blk = self.blocks[blkno] blk_locs = blklocs[val_locs.indexer] if blk.should_store(value): - blk.set_inplace(blk_locs, value_getitem(val_locs)) + blk.set(blk_locs, value_getitem(val_locs)) else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1198,7 +1193,7 @@ class BlockManager(PandasObject): if value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): # TODO(EA2D): special case not needed with 2D EAs - value = safe_reshape(value, (1,) + value.shape) + value = _safe_reshape(value, (1,) + value.shape) block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) @@ -1236,8 +1231,6 @@ class BlockManager(PandasObject): limit=None, fill_value=None, copy: bool = True, - consolidate: bool = True, - only_slice: bool = False, ): """ Conform block manager to new index. @@ -1248,13 +1241,7 @@ class BlockManager(PandasObject): ) return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - copy=copy, - consolidate=consolidate, - only_slice=only_slice, + new_index, indexer, axis=axis, fill_value=fill_value, copy=copy ) def reindex_indexer( @@ -1266,7 +1253,6 @@ class BlockManager(PandasObject): allow_dups: bool = False, copy: bool = True, consolidate: bool = True, - only_slice: bool = False, ) -> T: """ Parameters @@ -1279,8 +1265,6 @@ class BlockManager(PandasObject): copy : bool, default True consolidate: bool, default True Whether to consolidate inplace before reindexing. - only_slice : bool, default False - Whether to take views, not copies, along columns. pandas-indexer with -1's only. 
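# ---------------------------------------------------------------------------
# Illustrative aside (not part of the diff): the BlockManager.replace_list hunk a
# few lines above precomputes one mask per source value and then walks the blocks;
# a hedged, user-level example of the list-replace it serves (data is mine):
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
print(df.replace([1, "y"], [10, "YY"]))
# ---------------------------------------------------------------------------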
""" @@ -1304,9 +1288,7 @@ class BlockManager(PandasObject): raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( - indexer, fill_value=fill_value, only_slice=only_slice - ) + new_blocks = self._slice_take_blocks_ax0(indexer, fill_value=fill_value) else: new_blocks = [ blk.take_nd( @@ -1350,7 +1332,7 @@ class BlockManager(PandasObject): slice_or_indexer, self.shape[0], allow_fill=allow_fill ) - if self.is_single_block: + if self._is_single_block: blk = self.blocks[0] if sl_type in ("slice", "mask"): @@ -1449,7 +1431,7 @@ class BlockManager(PandasObject): dtype, fill_value = infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, placement=placement, ndim=block_values.ndim) + return make_block(block_values, placement=placement) def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): """ @@ -1475,10 +1457,7 @@ class BlockManager(PandasObject): new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True ) - def equals(self, other: object) -> bool: - if not isinstance(other, BlockManager): - return False - + def equals(self, other: "BlockManager") -> bool: self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False @@ -1491,9 +1470,26 @@ class BlockManager(PandasObject): return False left = self.blocks[0].values right = other.blocks[0].values - return array_equals(left, right) + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ExtensionArray): + return left.equals(right) + else: + return array_equivalent(left, right) - return blockwise_all(self, other, array_equals) + for i in range(len(self.items)): + # Check column-wise, return False if any column doesn't match + left = self.iget_values(i) + right = other.iget_values(i) + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ExtensionArray): + if not left.equals(right): + return False + else: + if not array_equivalent(left, right, dtype_equal=True): + return False + return True def unstack(self, unstacker, fill_value) -> "BlockManager": """ @@ -1540,7 +1536,7 @@ class SingleBlockManager(BlockManager): _is_consolidated = True _known_consolidated = True __slots__ = () - is_single_block = True + _is_single_block = True def __init__( self, @@ -1561,7 +1557,7 @@ class SingleBlockManager(BlockManager): ) self.axes = [axis] - self.blocks = (block,) + self.blocks = tuple([block]) @classmethod def from_blocks( @@ -1672,9 +1668,7 @@ def create_block_manager_from_blocks(blocks, axes: List[Index]) -> BlockManager: # is basically "all items", but if there're many, don't bother # converting, it's an error anyway. blocks = [ - make_block( - values=blocks[0], placement=slice(0, len(axes[0])), ndim=2 - ) + make_block(values=blocks[0], placement=slice(0, len(axes[0]))) ] mgr = BlockManager(blocks, axes) @@ -1694,11 +1688,8 @@ def create_block_manager_from_arrays( assert isinstance(axes, list) assert all(isinstance(x, Index) for x in axes) - # ensure we dont have any PandasArrays when we call get_block_type - # Note: just calling extract_array breaks tests that patch PandasArray._typ. 
- arrays = [x if not isinstance(x, ABCPandasArray) else x.to_numpy() for x in arrays] try: - blocks = _form_blocks(arrays, names, axes) + blocks = form_blocks(arrays, names, axes) mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() return mgr @@ -1730,7 +1721,7 @@ def construction_error(tot_items, block_shape, axes, e=None): # ----------------------------------------------------------------------- -def _form_blocks(arrays, names: Index, axes) -> List[Block]: +def form_blocks(arrays, names: Index, axes) -> List[Block]: # put "leftover" items in float bucket, where else? # generalize? items_dict: DefaultDict[str, List] = defaultdict(list) @@ -1777,7 +1768,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) + make_block(array, klass=DatetimeTZBlock, placement=i) for i, _, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1792,14 +1783,15 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["CategoricalBlock"]) > 0: cat_blocks = [ - make_block(array, klass=CategoricalBlock, placement=i, ndim=2) + make_block(array, klass=CategoricalBlock, placement=i) for i, _, array in items_dict["CategoricalBlock"] ] blocks.extend(cat_blocks) if len(items_dict["ExtensionBlock"]): + external_blocks = [ - make_block(array, klass=ExtensionBlock, placement=i, ndim=2) + make_block(array, klass=ExtensionBlock, placement=i) for i, _, array in items_dict["ExtensionBlock"] ] @@ -1807,7 +1799,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: if len(items_dict["ObjectValuesExtensionBlock"]): external_blocks = [ - make_block(array, klass=ObjectValuesExtensionBlock, placement=i, ndim=2) + make_block(array, klass=ObjectValuesExtensionBlock, placement=i) for i, _, array in items_dict["ObjectValuesExtensionBlock"] ] @@ -1820,7 +1812,7 @@ def _form_blocks(arrays, names: Index, axes) -> List[Block]: block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - na_block = make_block(block_values, placement=extra_locs, ndim=2) + na_block = make_block(block_values, placement=extra_locs) blocks.append(na_block) return blocks @@ -1837,7 +1829,7 @@ def _simple_blockify(tuples, dtype) -> List[Block]: if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - block = make_block(values, placement=placement, ndim=2) + block = make_block(values, placement=placement) return [block] @@ -1851,7 +1843,7 @@ def _multi_blockify(tuples, dtype=None): values, placement = _stack_arrays(list(tup_block), dtype) - block = make_block(values, placement=placement, ndim=2) + block = make_block(values, placement=placement) new_blocks.append(block) return new_blocks @@ -1866,7 +1858,7 @@ def _stack_arrays(tuples, dtype): else: return np.asarray(x) - def _shape_compat(x) -> Shape: + def _shape_compat(x): if isinstance(x, ABCSeries): return (len(x),) else: @@ -1911,12 +1903,12 @@ def _consolidate(blocks): gkey = lambda x: x._consolidate_key grouper = itertools.groupby(sorted(blocks, key=gkey), gkey) - new_blocks: List[Block] = [] + new_blocks = [] for (_can_consolidate, dtype), group_blocks in grouper: merged_blocks = _merge_blocks( list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) - new_blocks.extend(merged_blocks) + new_blocks = _extend_blocks(merged_blocks, new_blocks) return new_blocks @@ -1942,12 +1934,86 @@ def _merge_blocks( new_values = new_values[argsort] new_mgr_locs = 
new_mgr_locs[argsort] - return [make_block(new_values, placement=new_mgr_locs, ndim=2)] + return [make_block(new_values, placement=new_mgr_locs)] # can't consolidate --> no merge return blocks +def _compare_or_regex_search( + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, +) -> Union[ArrayLike, bool]: + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. + + Parameters + ---------- + a : array_like + b : scalar or regex pattern + regex : bool, default False + mask : array_like or None (default) + + Returns + ------- + mask : array_like of bool + """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern], + ): + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. + """ + if is_scalar(result) and isinstance(a, np.ndarray): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, (str, Pattern)) + else False + ) + + # GH#32621 use mask to avoid comparing to NAs + if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + mask = np.reshape(~(isna(a)), a.shape) + if isinstance(a, np.ndarray): + a = a[mask] + + if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + _check_comparison_types(False, a, b) + return False + + result = op(a) + + if isinstance(result, np.ndarray) and mask is not None: + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + tmp[mask] = result + result = tmp + + _check_comparison_types(result, a, b) + return result + + def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) diff --git a/venv/lib/python3.8/site-packages/pandas/core/internals/ops.py b/venv/lib/python3.8/site-packages/pandas/core/internals/ops.py index d7ea5d6..6eedf72 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/internals/ops.py +++ b/venv/lib/python3.8/site-packages/pandas/core/internals/ops.py @@ -1,26 +1,21 @@ -from collections import namedtuple -from typing import TYPE_CHECKING, Iterator, List, Tuple +from typing import TYPE_CHECKING, List, Tuple import numpy as np from pandas._typing import ArrayLike if TYPE_CHECKING: - from pandas.core.internals.blocks import Block - from pandas.core.internals.managers import BlockManager + from pandas.core.internals.blocks import Block # noqa:F401 + from pandas.core.internals.managers import BlockManager # noqa:F401 -BlockPairInfo = namedtuple( - "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"] -) - - -def _iter_block_pairs( - left: "BlockManager", right: "BlockManager" -) -> Iterator[BlockPairInfo]: +def operate_blockwise( + left: "BlockManager", right: "BlockManager", array_op +) -> "BlockManager": # At this point we have already checked the parent DataFrames for # assert 
rframe._indexed_same(lframe) + res_blks: List["Block"] = [] for n, blk in enumerate(left.blocks): locs = blk.mgr_locs blk_vals = blk.values @@ -39,32 +34,21 @@ def _iter_block_pairs( right_ea = not isinstance(rblk.values, np.ndarray) lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) - info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk) - yield info + res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) -def operate_blockwise( - left: "BlockManager", right: "BlockManager", array_op -) -> "BlockManager": - # At this point we have already checked the parent DataFrames for - # assert rframe._indexed_same(lframe) + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - res_blks: List["Block"] = [] - for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right): - res_values = array_op(lvals, rvals) - if left_ea and not right_ea and hasattr(res_values, "reshape"): - res_values = res_values.reshape(1, -1) - nbs = rblk._split_op_result(res_values) + _reset_block_mgr_locs(nbs, locs) - # Assertions are disabled for performance, but should hold: - # if right_ea or left_ea: - # assert len(nbs) == 1 - # else: - # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - - _reset_block_mgr_locs(nbs, locs) - - res_blks.extend(nbs) + res_blks.extend(nbs) # Assertions are disabled for performance, but should hold: # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} @@ -101,7 +85,7 @@ def _get_same_shape_values( # Require that the indexing into lvals be slice-like assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs - # TODO(EA2D): with 2D EAs only this first clause would be needed + # TODO(EA2D): with 2D EAs pnly this first clause would be needed if not (left_ea or right_ea): lvals = lvals[rblk.mgr_locs.indexer, :] assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) @@ -118,14 +102,3 @@ def _get_same_shape_values( rvals = rvals[0, :] return lvals, rvals - - -def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: - """ - Blockwise `all` reduction. - """ - for info in _iter_block_pairs(left, right): - res = op(info.lvals, info.rvals) - if not res: - return False - return True diff --git a/venv/lib/python3.8/site-packages/pandas/core/missing.py b/venv/lib/python3.8/site-packages/pandas/core/missing.py index 445c1ef..7802c5c 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/missing.py +++ b/venv/lib/python3.8/site-packages/pandas/core/missing.py @@ -1,67 +1,78 @@ """ Routines for filling missing data. 
""" -from functools import partial -from typing import TYPE_CHECKING, Any, List, Optional, Set, Union + +from typing import Any, List, Optional, Set, Union import numpy as np from pandas._libs import algos, lib -from pandas._typing import ArrayLike, Axis, DtypeObj from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_float64, + is_datetime64_dtype, + is_datetime64tz_dtype, is_integer_dtype, is_numeric_v_string_like, + is_scalar, + is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna -if TYPE_CHECKING: - from pandas import Index - -def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray: +def mask_missing(arr, values_to_mask): """ Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True - - Parameters - ---------- - arr : ArrayLike - values_to_mask: list, tuple, or scalar - - Returns - ------- - np.ndarray[bool] """ - # When called from Block.replace/replace_list, values_to_mask is a scalar - # known to be holdable by arr. - # When called from Series._single_replace, values_to_mask is tuple or list dtype, values_to_mask = infer_dtype_from_array(values_to_mask) - values_to_mask = np.array(values_to_mask, dtype=dtype) + + try: + values_to_mask = np.array(values_to_mask, dtype=dtype) + + except Exception: + values_to_mask = np.array(values_to_mask, dtype=object) na_mask = isna(values_to_mask) nonna = values_to_mask[~na_mask] - # GH 21977 - mask = np.zeros(arr.shape, dtype=bool) + mask = None for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass + if mask is None: + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + mask = False + else: + mask = arr == x + + # if x is a string and arr is not, then we get False and we must + # expand the mask to size arr.shape + if is_scalar(mask): + mask = np.zeros(arr.shape, dtype=bool) else: - mask |= arr == x + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + mask |= False + else: + mask |= arr == x if na_mask.any(): - mask |= isna(arr) + if mask is None: + mask = isna(arr) + else: + mask |= isna(arr) + + # GH 21977 + if mask is None: + mask = np.zeros(arr.shape, dtype=bool) return mask -def clean_fill_method(method, allow_nearest: bool = False): +def clean_fill_method(method, allow_nearest=False): # asfreq is compat for resampling if method in [None, "asfreq"]: return None @@ -158,7 +169,7 @@ def find_valid_index(values, how: str): def interpolate_1d( - xvalues: "Index", + xvalues: np.ndarray, yvalues: np.ndarray, method: Optional[str] = "linear", limit: Optional[int] = None, @@ -180,7 +191,9 @@ def interpolate_1d( valid = ~invalid if not valid.any(): - result = np.empty(xvalues.shape, dtype=np.float64) + # have to call np.asarray(xvalues) since xvalues could be an Index + # which can't be mutated + result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result @@ -188,7 +201,8 @@ def interpolate_1d( return yvalues if method == "time": - if not needs_i8_conversion(xvalues.dtype): + if not getattr(xvalues, "is_all_dates", None): + # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError( "time-weighted interpolation only works " "on Series or DataFrames with a " @@ -214,7 +228,7 @@ def interpolate_1d( ) # default limit is unlimited GH #16282 - limit = 
algos.validate_limit(nobs=None, limit=limit) + limit = algos._validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) @@ -252,18 +266,20 @@ def interpolate_1d( # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) + yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() - # xarr to pass to NumPy/SciPy - xarr = xvalues._values - if needs_i8_conversion(xarr.dtype): - # GH#1646 for dt64tz - xarr = xarr.view("i8") + # xvalues to pass to NumPy/SciPy + xvalues = getattr(xvalues, "values", xvalues) if method == "linear": - inds = xarr + inds = xvalues else: - inds = np.asarray(xarr) + inds = np.asarray(xvalues) + + # hack for DatetimeIndex, #1646 + if needs_i8_conversion(inds.dtype): + inds = inds.view(np.int64) if method in ("values", "index"): if inds.dtype == np.object_: @@ -313,7 +329,7 @@ def _interpolate_scipy_wrapper( "piecewise_polynomial": _from_derivatives, } - if getattr(x, "_is_all_dates", False): + if getattr(x, "is_all_dates", False): # GH 5975, scipy.interp1d can't handle datetime64s x, new_x = x._values.astype("i8"), new_x.astype("i8") @@ -526,92 +542,13 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat return P(x) -def _interpolate_with_limit_area( - values: ArrayLike, method: str, limit: Optional[int], limit_area: Optional[str] -) -> ArrayLike: - """ - Apply interpolation and limit_area logic to values along a to-be-specified axis. - - Parameters - ---------- - values: array-like - Input array. - method: str - Interpolation method. Could be "bfill" or "pad" - limit: int, optional - Index limit on interpolation. - limit_area: str - Limit area for interpolation. Can be "inside" or "outside" - - Returns - ------- - values: array-like - Interpolated array. - """ - - invalid = isna(values) - - if not invalid.all(): - first = find_valid_index(values, "first") - last = find_valid_index(values, "last") - - values = interpolate_2d( - values, - method=method, - limit=limit, - ) - - if limit_area == "inside": - invalid[first : last + 1] = False - elif limit_area == "outside": - invalid[:first] = invalid[last + 1 :] = False - - values[invalid] = np.nan - - return values - - def interpolate_2d( - values, - method: str = "pad", - axis: Axis = 0, - limit: Optional[int] = None, - limit_area: Optional[str] = None, + values, method="pad", axis=0, limit=None, fill_value=None, dtype=None ): """ Perform an actual interpolation of values, values will be make 2-d if needed fills inplace, returns the result. - - Parameters - ---------- - values: array-like - Input array. - method: str, default "pad" - Interpolation method. Could be "bfill" or "pad" - axis: 0 or 1 - Interpolation axis - limit: int, optional - Index limit on interpolation. - limit_area: str, optional - Limit area for interpolation. Can be "inside" or "outside" - - Returns - ------- - values: array-like - Interpolated array. 
""" - if limit_area is not None: - return np.apply_along_axis( - partial( - _interpolate_with_limit_area, - method=method, - limit=limit, - limit_area=limit_area, - ), - axis, - values, - ) - orig_values = values transf = (lambda x: x) if axis == 0 else (lambda x: x.T) @@ -623,73 +560,80 @@ def interpolate_2d( raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") values = values.reshape(tuple((1,) + values.shape)) - method = clean_fill_method(method) - tvalues = transf(values) - if method == "pad": - result = _pad_2d(tvalues, limit=limit) - else: - result = _backfill_2d(tvalues, limit=limit) + if fill_value is None: + mask = None + else: # todo create faster fill func without masking + mask = mask_missing(transf(values), fill_value) + + method = clean_fill_method(method) + if method == "pad": + values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + else: + values = transf( + backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) + ) - result = transf(result) # reshape back if ndim == 1: - result = result[0] + values = values[0] - if orig_values.dtype.kind in ["m", "M"]: - # convert float back to datetime64/timedelta64 - result = result.view(orig_values.dtype) + if orig_values.dtype.kind == "M": + # convert float back to datetime64 + values = values.astype(orig_values.dtype) - return result + return values -def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): +def _cast_values_for_fillna(values, dtype): """ Cast values to a dtype that algos.pad and algos.backfill can handle. """ # TODO: for int-dtypes we make a copy, but for everything else this # alters the values in-place. Is this intentional? - if needs_i8_conversion(dtype): + if ( + is_datetime64_dtype(dtype) + or is_datetime64tz_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): values = values.view(np.int64) - elif is_integer_dtype(values) and not has_mask: + elif is_integer_dtype(values): # NB: this check needs to come after the datetime64 check above - # has_mask check to avoid casting i8 values that have already - # been cast from PeriodDtype values = ensure_float64(values) return values -def _fillna_prep(values, mask=None): - # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d - dtype = values.dtype +def _fillna_prep(values, mask=None, dtype=None): + # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d + if dtype is None: + dtype = values.dtype - has_mask = mask is not None - if not has_mask: + if mask is None: # This needs to occur before datetime/timedeltas are cast to int64 mask = isna(values) - values = _cast_values_for_fillna(values, dtype, has_mask) + values = _cast_values_for_fillna(values, dtype) mask = mask.view(np.uint8) return values, mask -def _pad_1d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) +def pad_1d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) algos.pad_inplace(values, mask, limit=limit) return values -def _backfill_1d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) +def backfill_1d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) algos.backfill_inplace(values, mask, limit=limit) return values -def _pad_2d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) +def pad_2d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) if np.all(values.shape): algos.pad_2d_inplace(values, mask, limit=limit) @@ -699,8 +643,8 
@@ def _pad_2d(values, limit=None, mask=None): return values -def _backfill_2d(values, limit=None, mask=None): - values, mask = _fillna_prep(values, mask) +def backfill_2d(values, limit=None, mask=None, dtype=None): + values, mask = _fillna_prep(values, mask, dtype) if np.all(values.shape): algos.backfill_2d_inplace(values, mask, limit=limit) @@ -710,7 +654,7 @@ def _backfill_2d(values, limit=None, mask=None): return values -_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} +_fill_methods = {"pad": pad_1d, "backfill": backfill_1d} def get_fill_func(method): @@ -779,15 +723,15 @@ def _interp_limit(invalid, fw_limit, bw_limit): # just use forwards return f_idx else: - b_idx_inv = list(inner(invalid[::-1], bw_limit)) - b_idx = set(N - 1 - np.asarray(b_idx_inv)) + b_idx = list(inner(invalid[::-1], bw_limit)) + b_idx = set(N - 1 - np.asarray(b_idx)) if fw_limit == 0: return b_idx return f_idx & b_idx -def _rolling_window(a: np.ndarray, window: int): +def _rolling_window(a, window): """ [True, True, False, True, False], 2 -> diff --git a/venv/lib/python3.8/site-packages/pandas/core/nanops.py b/venv/lib/python3.8/site-packages/pandas/core/nanops.py index 88662a4..e7e2879 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/nanops.py +++ b/venv/lib/python3.8/site-packages/pandas/core/nanops.py @@ -2,18 +2,18 @@ import functools import itertools import operator from typing import Any, Optional, Tuple, Union, cast -import warnings import numpy as np from pandas._config import get_option -from pandas._libs import NaT, Timedelta, iNaT, lib +from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( - get_dtype, + _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, @@ -96,11 +96,7 @@ class bottleneck_switch: @functools.wraps(alt) def f( - values: np.ndarray, - *, - axis: Optional[int] = None, - skipna: bool = True, - **kwds, + values: np.ndarray, axis: Optional[int] = None, skipna: bool = True, **kwds ): if len(self.kwargs) > 0: for k, v in self.kwargs.items(): @@ -189,7 +185,7 @@ def _get_fill_value( else: if fill_value_typ == "+inf": # need the max int here - return np.iinfo(np.int64).max + return _int64_max else: return iNaT @@ -232,7 +228,7 @@ def _maybe_get_mask( # Boolean data cannot contain nulls, so signal via mask being None return None - if skipna or needs_i8_conversion(values.dtype): + if skipna: mask = isna(values) return mask @@ -283,7 +279,7 @@ def _get_values( """ # In _get_values is only called from within nanops, and in all cases # with scalar fill_value. 
This guarantee is important for the - # np.where call below + # maybe_upcast_putmask call below assert is_scalar(fill_value) values = extract_array(values, extract_numpy=True) @@ -291,12 +287,10 @@ def _get_values( dtype = values.dtype - datetimelike = False if needs_i8_conversion(values.dtype): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = np.asarray(values.view("i8")) - datetimelike = True dtype_ok = _na_ok_dtype(dtype) @@ -307,13 +301,13 @@ def _get_values( ) if skipna and (mask is not None) and (fill_value is not None): - if mask.any(): - if dtype_ok or datetimelike: - values = values.copy() - np.putmask(values, mask, fill_value) - else: - # np.where will promote if needed - values = np.where(~mask, values, fill_value) + values = values.copy() + if dtype_ok and mask.any(): + np.putmask(values, mask, fill_value) + + # promote if needed + else: + values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype @@ -331,24 +325,18 @@ def _na_ok_dtype(dtype: DtypeObj) -> bool: return not issubclass(dtype.type, np.integer) -def _wrap_results(result, dtype: np.dtype, fill_value=None): +def _wrap_results(result, dtype: DtypeObj, fill_value=None): """ wrap our results if needed """ - if result is NaT: - pass - - elif is_datetime64_any_dtype(dtype): + if is_datetime64_any_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): + tz = getattr(dtype, "tz", None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan - - if isna(result): - result = np.datetime64("NaT", "ns") - else: - result = np.int64(result).view("datetime64[ns]") + result = Timestamp(result, tz=tz) else: # If we have float dtype, taking a view will give the wrong result result = result.astype(dtype) @@ -358,7 +346,7 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None): result = np.nan # raise if we have a timedelta64[ns] which is too large - if np.fabs(result) > np.iinfo(np.int64).max: + if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = Timedelta(result, unit="ns") @@ -368,39 +356,6 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None): return result -def _datetimelike_compat(func: F) -> F: - """ - If we have datetime64 or timedelta64 values, ensure we have a correct - mask before calling the wrapped function, then cast back afterwards. 
- """ - - @functools.wraps(func) - def new_func( - values: np.ndarray, - *, - axis: Optional[int] = None, - skipna: bool = True, - mask: Optional[np.ndarray] = None, - **kwargs, - ): - orig_values = values - - datetimelike = values.dtype.kind in ["m", "M"] - if datetimelike and mask is None: - mask = isna(values) - - result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs) - - if datetimelike: - result = _wrap_results(result, orig_values.dtype, fill_value=iNaT) - if not skipna: - result = _mask_datetimelike_result(result, axis, mask, orig_values) - - return result - - return cast(F, new_func) - - def _na_for_min_count( values: np.ndarray, axis: Optional[int] ) -> Union[Scalar, np.ndarray]: @@ -423,23 +378,23 @@ def _na_for_min_count( if is_numeric_dtype(values): values = values.astype("float64") fill_value = na_value_for_dtype(values.dtype) - if fill_value is NaT: - fill_value = values.dtype.type("NaT", "ns") if values.ndim == 1: return fill_value - elif axis is None: - return fill_value else: + assert axis is not None # assertion to make mypy happy result_shape = values.shape[:axis] + values.shape[axis + 1 :] - - result = np.full(result_shape, fill_value, dtype=values.dtype) + # calling np.full with dtype parameter throws an ValueError when called + # with dtype=np.datetime64 and and fill_value=pd.NaT + try: + result = np.full(result_shape, fill_value, dtype=values.dtype) + except ValueError: + result = np.full(result_shape, fill_value) return result def nanany( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -477,7 +432,6 @@ def nanany( def nanall( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -514,10 +468,8 @@ def nanall( @disallow("M8") -@_datetimelike_compat def nansum( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, min_count: int = 0, @@ -554,36 +506,16 @@ def nansum( dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 - the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count) - return the_sum - - -def _mask_datetimelike_result( - result: Union[np.ndarray, np.datetime64, np.timedelta64], - axis: Optional[int], - mask: np.ndarray, - orig_values: np.ndarray, -): - if isinstance(result, np.ndarray): - # we need to apply the mask - result = result.astype("i8").view(orig_values.dtype) - axis_mask = mask.any(axis=axis) - result[axis_mask] = iNaT - else: - if mask.any(): - result = NaT - return result + return _wrap_results(the_sum, dtype) @disallow(PeriodDtype) @bottleneck_switch() -@_datetimelike_compat def nanmean( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -617,16 +549,16 @@ def nanmean( ) dtype_sum = dtype_max dtype_count = np.float64 - # not using needs_i8_conversion because that includes period - if dtype.kind in ["m", "M"]: - dtype_sum = np.float64 - elif is_integer_dtype(dtype): + if ( + is_integer_dtype(dtype) + or is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) + ): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype - count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) @@ -641,11 +573,11 @@ def nanmean( else: the_mean = the_sum / count if count > 0 else np.nan - return the_mean + return _wrap_results(the_mean, dtype) 
@bottleneck_switch() -def nanmedian(values, *, axis=None, skipna=True, mask=None): +def nanmedian(values, axis=None, skipna=True, mask=None): """ Parameters ---------- @@ -673,11 +605,7 @@ def nanmedian(values, *, axis=None, skipna=True, mask=None): mask = notna(x) if not skipna and not mask.all(): return np.nan - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings("ignore", "All-NaN slice encountered") - res = np.nanmedian(x[mask]) - return res + return np.nanmedian(x[mask]) values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask) if not is_float_dtype(values.dtype): @@ -700,50 +628,25 @@ def nanmedian(values, *, axis=None, skipna=True, mask=None): # there's a non-empty array to apply over otherwise numpy raises if notempty: if not skipna: - res = np.apply_along_axis(get_median, axis, values) + return _wrap_results( + np.apply_along_axis(get_median, axis, values), dtype + ) - else: - # fastpath for the skipna case - with warnings.catch_warnings(): - # Suppress RuntimeWarning about All-NaN slice - warnings.filterwarnings("ignore", "All-NaN slice encountered") - res = np.nanmedian(values, axis) + # fastpath for the skipna case + return _wrap_results(np.nanmedian(values, axis), dtype) - else: - # must return the correct shape, but median is not defined for the - # empty set so return nans of shape "everything but the passed axis" - # since "axis" is where the reduction would occur if we had a nonempty - # array - res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan) + # must return the correct shape, but median is not defined for the + # empty set so return nans of shape "everything but the passed axis" + # since "axis" is where the reduction would occur if we had a nonempty + # array + shp = np.array(values.shape) + dims = np.arange(values.ndim) + ret = np.empty(shp[dims != axis]) + ret.fill(np.nan) + return _wrap_results(ret, dtype) - else: - # otherwise return a scalar value - res = get_median(values) if notempty else np.nan - return _wrap_results(res, dtype) - - -def get_empty_reduction_result( - shape: Tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any -) -> np.ndarray: - """ - The result from a reduction on an empty ndarray. 
- - Parameters - ---------- - shape : Tuple[int] - axis : int - dtype : np.dtype - fill_value : Any - - Returns - ------- - np.ndarray - """ - shp = np.array(shape) - dims = np.arange(len(shape)) - ret = np.empty(shp[dims != axis], dtype=dtype) - ret.fill(fill_value) - return ret + # otherwise return a scalar value + return _wrap_results(get_median(values) if notempty else np.nan, dtype) def _get_counts_nanvar( @@ -775,7 +678,7 @@ def _get_counts_nanvar( count : scalar or array d : scalar or array """ - dtype = get_dtype(dtype) + dtype = _get_dtype(dtype) count = _get_counts(value_counts, mask, axis, dtype=dtype) d = count - dtype.type(ddof) @@ -792,8 +695,9 @@ def _get_counts_nanvar( return count, d +@disallow("M8") @bottleneck_switch(ddof=1) -def nanstd(values, *, axis=None, skipna=True, ddof=1, mask=None): +def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): """ Compute the standard deviation along given axis while ignoring NaNs @@ -821,9 +725,6 @@ def nanstd(values, *, axis=None, skipna=True, ddof=1, mask=None): >>> nanops.nanstd(s) 1.0 """ - if values.dtype == "M8[ns]": - values = values.view("m8[ns]") - orig_dtype = values.dtype values, mask, _, _, _ = _get_values(values, skipna, mask=mask) @@ -833,7 +734,7 @@ def nanstd(values, *, axis=None, skipna=True, ddof=1, mask=None): @disallow("M8", "m8") @bottleneck_switch(ddof=1) -def nanvar(values, *, axis=None, skipna=True, ddof=1, mask=None): +def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): """ Compute the variance along given axis while ignoring NaNs @@ -897,13 +798,12 @@ def nanvar(values, *, axis=None, skipna=True, ddof=1, mask=None): # precision as the original values array. if is_float_dtype(dtype): result = result.astype(dtype) - return result + return _wrap_results(result, values.dtype) @disallow("M8", "m8") def nansem( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, ddof: int = 1, @@ -938,24 +838,22 @@ def nansem( """ # This checks if non-numeric-like data is passed with numeric_only=False # and raises a TypeError otherwise - nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask) + nanvar(values, axis, skipna, ddof=ddof, mask=mask) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype) - var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof) + var = nanvar(values, axis, skipna, ddof=ddof) return np.sqrt(var) / np.sqrt(count) def _nanminmax(meth, fill_value_typ): @bottleneck_switch(name="nan" + meth) - @_datetimelike_compat def reduction( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -974,8 +872,8 @@ def _nanminmax(meth, fill_value_typ): else: result = getattr(values, meth)(axis) - result = _maybe_null_out(result, axis, mask, values.shape) - return result + result = _wrap_results(result, dtype, fill_value) + return _maybe_null_out(result, axis, mask, values.shape) return reduction @@ -987,7 +885,6 @@ nanmax = _nanminmax("max", fill_value_typ="-inf") @disallow("O") def nanargmax( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -1032,7 +929,6 @@ def nanargmax( @disallow("O") def nanargmin( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -1077,7 +973,6 @@ def nanargmin( @disallow("M8", "m8") def nanskew( values: np.ndarray, - *, axis: Optional[int] = 
None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -1162,7 +1057,6 @@ def nanskew( @disallow("M8", "m8") def nankurt( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, @@ -1256,7 +1150,6 @@ def nankurt( @disallow("M8", "m8") def nanprod( values: np.ndarray, - *, axis: Optional[int] = None, skipna: bool = True, min_count: int = 0, @@ -1341,7 +1234,7 @@ def _get_counts( ------- count : scalar or array """ - dtype = get_dtype(dtype) + dtype = _get_dtype(dtype) if axis is None: if mask is not None: n = mask.size - mask.sum() @@ -1436,7 +1329,7 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") def nancorr( - a: np.ndarray, b: np.ndarray, *, method="pearson", min_periods: Optional[int] = None + a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, ): """ a, b: ndarrays @@ -1493,7 +1386,6 @@ def get_corr_func(method): def nancov( a: np.ndarray, b: np.ndarray, - *, min_periods: Optional[int] = None, ddof: Optional[int] = 1, ): @@ -1609,7 +1501,6 @@ def _nanpercentile_1d( def nanpercentile( values: np.ndarray, q, - *, axis: int, na_value, mask: np.ndarray, @@ -1638,16 +1529,10 @@ def nanpercentile( if values.dtype.kind in ["m", "M"]: # need to cast to integer to avoid rounding errors in numpy result = nanpercentile( - values.view("i8"), - q=q, - axis=axis, - na_value=na_value.view("i8"), - mask=mask, - ndim=ndim, - interpolation=interpolation, + values.view("i8"), q, axis, na_value.view("i8"), mask, ndim, interpolation ) - # Note: we have to do `astype` and not view because in general we + # Note: we have to do do `astype` and not view because in general we # have float result at this point, not i8 return result.astype(values.dtype) @@ -1673,7 +1558,7 @@ def nanpercentile( return np.percentile(values, q, axis=axis, interpolation=interpolation) -def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: +def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: """ Cumulative function with skipna support. @@ -1731,9 +1616,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: result = result.view(orig_dtype) else: # DatetimeArray - result = type(values)._simple_new( # type: ignore[attr-defined] - result, dtype=orig_dtype - ) + result = type(values)._from_sequence(result, dtype=orig_dtype) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() diff --git a/venv/lib/python3.8/site-packages/pandas/core/ops/__init__.py b/venv/lib/python3.8/site-packages/pandas/core/ops/__init__.py index 7b14a5c..5e1b8cd 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/ops/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/core/ops/__init__.py @@ -4,39 +4,41 @@ Arithmetic operations for PandasObjects This is not a public API. 
""" import operator -from typing import TYPE_CHECKING, Optional, Set -import warnings +from typing import TYPE_CHECKING, Optional, Set, Type import numpy as np +from pandas._libs import lib from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas._typing import Level from pandas.util._decorators import Appender -from pandas.core.dtypes.common import is_array_like, is_list_like -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.common import is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import algorithms -from pandas.core.ops.array_ops import ( # noqa:F401 +from pandas.core.construction import extract_array +from pandas.core.ops.array_ops import ( arithmetic_op, - comp_method_OBJECT_ARRAY, comparison_op, get_array_op, logical_op, ) -from pandas.core.ops.common import ( # noqa:F401 - get_op_result_name, - unpack_zerodim_and_defer, -) +from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 +from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.docstrings import ( + _arith_doc_FRAME, _flex_comp_doc_FRAME, + _make_flex_doc, _op_descriptions, - make_flex_doc, ) from pandas.core.ops.invalid import invalid_comparison # noqa:F401 from pandas.core.ops.mask_ops import kleene_and, kleene_or, kleene_xor # noqa: F401 -from pandas.core.ops.methods import add_flex_arithmetic_methods # noqa:F401 +from pandas.core.ops.methods import ( # noqa:F401 + add_flex_arithmetic_methods, + add_special_arithmetic_methods, +) from pandas.core.ops.roperator import ( # noqa:F401 radd, rand_, @@ -53,7 +55,7 @@ from pandas.core.ops.roperator import ( # noqa:F401 ) if TYPE_CHECKING: - from pandas import DataFrame, Series + from pandas import DataFrame, Series # noqa:F401 # ----------------------------------------------------------------------------- # constants @@ -79,6 +81,115 @@ ARITHMETIC_BINOPS: Set[str] = { COMPARISON_BINOPS: Set[str] = {"eq", "ne", "lt", "gt", "le", "ge"} +# ----------------------------------------------------------------------------- +# Ops Wrapping Utilities + + +def get_op_result_name(left, right): + """ + Find the appropriate name to pin to an operation result. This result + should always be either an Index or a Series. + + Parameters + ---------- + left : {Series, Index} + right : object + + Returns + ------- + name : object + Usually a string + """ + # `left` is always a Series when called from within ops + if isinstance(right, (ABCSeries, ABCIndexClass)): + name = _maybe_match_name(left, right) + else: + name = left.name + return name + + +def _maybe_match_name(a, b): + """ + Try to find a name to attach to the result of an operation between + a and b. If only one of these has a `name` attribute, return that + name. Otherwise return a consensus name if they match of None if + they have different names. + + Parameters + ---------- + a : object + b : object + + Returns + ------- + name : str or None + + See Also + -------- + pandas.core.common.consensus_name_attr + """ + a_has = hasattr(a, "name") + b_has = hasattr(b, "name") + if a_has and b_has: + if a.name == b.name: + return a.name + else: + # TODO: what if they both have np.nan for their names? 
+ return None + elif a_has: + return a.name + elif b_has: + return b.name + return None + + +# ----------------------------------------------------------------------------- + + +def _get_frame_op_default_axis(name: str) -> Optional[str]: + """ + Only DataFrame cares about default_axis, specifically: + special methods have default_axis=None and flex methods + have default_axis='columns'. + + Parameters + ---------- + name : str + + Returns + ------- + default_axis: str or None + """ + if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]: + # bool methods + return "columns" + elif name.startswith("__"): + # __add__, __mul__, ... + return None + else: + # add, mul, ... + return "columns" + + +def _get_op_name(op, special: bool) -> str: + """ + Find the name to attach to this method according to conventions + for special and non-special methods. + + Parameters + ---------- + op : binary operator + special : bool + + Returns + ------- + op_name : str + """ + opname = op.__name__.strip("_") + if special: + opname = f"__{opname}__" + return opname + # ----------------------------------------------------------------------------- # Masking NA values and fallbacks for operations numpy does not support @@ -125,13 +236,77 @@ def fill_binop(left, right, fill_value): return left, right +# ----------------------------------------------------------------------------- +# Dispatch logic + + +def dispatch_to_series(left, right, func, axis: Optional[int] = None): + """ + Evaluate the frame operation func(left, right) by evaluating + column-by-column, dispatching to the Series implementation. + + Parameters + ---------- + left : DataFrame + right : scalar, Series, or DataFrame + func : arithmetic or comparison operator + axis : {None, 0, 1} + + Returns + ------- + DataFrame + """ + # Get the appropriate array-op to apply to each column/block's values. + array_op = get_array_op(func) + + right = lib.item_from_zerodim(right) + if not is_list_like(right): + # i.e. 
scalar, faster than checking np.ndim(right) == 0 + bm = left._mgr.apply(array_op, right=right) + return type(left)(bm) + + elif isinstance(right, ABCDataFrame): + assert left.index.equals(right.index) + assert left.columns.equals(right.columns) + # TODO: The previous assertion `assert right._indexed_same(left)` + # fails in cases with empty columns reached via + # _frame_arith_method_with_reindex + + bm = left._mgr.operate_blockwise(right._mgr, array_op) + return type(left)(bm) + + elif isinstance(right, ABCSeries) and axis == 1: + # axis=1 means we want to operate row-by-row + assert right.index.equals(left.columns) + + right = right._values + # maybe_align_as_frame ensures we do not have an ndarray here + assert not isinstance(right, np.ndarray) + + arrays = [array_op(l, r) for l, r in zip(left._iter_column_arrays(), right)] + + elif isinstance(right, ABCSeries): + assert right.index.equals(left.index) # Handle other cases later + right = right._values + + arrays = [array_op(l, right) for l in left._iter_column_arrays()] + + else: + # Remaining cases have less-obvious dispatch rules + raise NotImplementedError(right) + + return type(left)._from_arrays( + arrays, left.columns, left.index, verify_integrity=False + ) + + # ----------------------------------------------------------------------------- # Series -def align_method_SERIES(left: "Series", right, align_asobject: bool = False): +def _align_method_SERIES(left: "Series", right, align_asobject: bool = False): """ align lhs and rhs Series """ - # ToDo: Different from align_method_FRAME, list, tuple and ndarray + # ToDo: Different from _align_method_FRAME, list, tuple and ndarray # are not coerced here # because Series has inconsistencies described in #13637 @@ -149,9 +324,84 @@ def align_method_SERIES(left: "Series", right, align_asobject: bool = False): return left, right -def flex_method_SERIES(op): - name = op.__name__.strip("_") - doc = make_flex_doc(name, "series") +def _arith_method_SERIES(cls, op, special): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + assert special # non-special uses _flex_method_SERIES + op_name = _get_op_name(op, special) + + @unpack_zerodim_and_defer(op_name) + def wrapper(left, right): + + left, right = _align_method_SERIES(left, right) + res_name = get_op_result_name(left, right) + + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) + result = arithmetic_op(lvalues, rvalues, op) + + return left._construct_result(result, name=res_name) + + wrapper.__name__ = op_name + return wrapper + + +def _comp_method_SERIES(cls, op, special): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + assert special # non-special uses _flex_method_SERIES + op_name = _get_op_name(op, special) + + @unpack_zerodim_and_defer(op_name) + def wrapper(self, other): + + res_name = get_op_result_name(self, other) + + if isinstance(other, ABCSeries) and not self._indexed_same(other): + raise ValueError("Can only compare identically-labeled Series objects") + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + res_values = comparison_op(lvalues, rvalues, op) + + return self._construct_result(res_values, name=res_name) + + wrapper.__name__ = op_name + return wrapper + + +def _bool_method_SERIES(cls, op, special): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. 
+ """ + assert special # non-special uses _flex_method_SERIES + op_name = _get_op_name(op, special) + + @unpack_zerodim_and_defer(op_name) + def wrapper(self, other): + self, other = _align_method_SERIES(self, other, align_asobject=True) + res_name = get_op_result_name(self, other) + + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + res_values = logical_op(lvalues, rvalues, op) + return self._construct_result(res_values, name=res_name) + + wrapper.__name__ = op_name + return wrapper + + +def _flex_method_SERIES(cls, op, special): + assert not special # "special" also means "not flex" + name = _get_op_name(op, special) + doc = _make_flex_doc(name, "series") @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): @@ -159,17 +409,13 @@ def flex_method_SERIES(op): if axis is not None: self._get_axis_number(axis) - res_name = get_op_result_name(self, other) - if isinstance(other, ABCSeries): return self._binop(other, op, level=level, fill_value=fill_value) elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError("Lengths must be equal") other = self._constructor(other, self.index) - result = self._binop(other, op, level=level, fill_value=fill_value) - result.name = res_name - return result + return self._binop(other, op, level=level, fill_value=fill_value) else: if fill_value is not None: self = self.fillna(fill_value) @@ -184,7 +430,7 @@ def flex_method_SERIES(op): # DataFrame -def align_method_FRAME( +def _align_method_FRAME( left, right, axis, flex: Optional[bool] = False, level: Level = None ): """ @@ -253,11 +499,6 @@ def align_method_FRAME( ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): - # GH 36702. Raise when attempting arithmetic with list of array-like. - if any(is_array_like(el) for el in right): - raise ValueError( - f"Unable to coerce list of {type(right[0])} to Series/DataFrame" - ) # GH17901 right = to_series(right) @@ -272,18 +513,6 @@ def align_method_FRAME( elif isinstance(right, ABCSeries): # axis=1 is default for DataFrame-with-Series op axis = left._get_axis_number(axis) if axis is not None else 1 - - if not flex: - if not left.axes[axis].equals(right.index): - warnings.warn( - "Automatic reindexing on DataFrame vs Series comparisons " - "is deprecated and will raise ValueError in a future version. " - "Do `left, right = left.align(right, axis=1, copy=False)` " - "before e.g. `left == right`", - FutureWarning, - stacklevel=5, - ) - left, right = left.align( right, join="outer", axis=axis, level=level, copy=False ) @@ -292,7 +521,7 @@ def align_method_FRAME( return left, right -def should_reindex_frame_op( +def _should_reindex_frame_op( left: "DataFrame", right, op, axis, default_axis, fill_value, level ) -> bool: """ @@ -309,19 +538,18 @@ def should_reindex_frame_op( if fill_value is None and level is None and axis is default_axis: # TODO: any other cases we should handle here? + cols = left.columns.intersection(right.columns) # Intersection is always unique so we have to check the unique columns left_uniques = left.columns.unique() right_uniques = right.columns.unique() - cols = left_uniques.intersection(right_uniques) - if len(cols) and not (cols.equals(left_uniques) and cols.equals(right_uniques)): - # TODO: is there a shortcut available when len(cols) == 0? 
+ if not (cols.equals(left_uniques) and cols.equals(right_uniques)): return True return False -def frame_arith_method_with_reindex( +def _frame_arith_method_with_reindex( left: "DataFrame", right: "DataFrame", op ) -> "DataFrame": """ @@ -347,7 +575,7 @@ def frame_arith_method_with_reindex( new_right = right.iloc[:, rcols] result = op(new_left, new_right) - # Do the join on the columns instead of using align_method_FRAME + # Do the join on the columns instead of using _align_method_FRAME # to avoid constructing two potentially large/sparse DataFrames join_columns, _, _ = left.columns.join( right.columns, how="outer", level=None, return_indexers=True @@ -390,20 +618,26 @@ def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int return type(frame)(rvalues, index=frame.index, columns=frame.columns) -def flex_arith_method_FRAME(op): - op_name = op.__name__.strip("_") - default_axis = "columns" +def _arith_method_FRAME(cls: Type["DataFrame"], op, special: bool): + # This is the only function where `special` can be either True or False + op_name = _get_op_name(op, special) + default_axis = _get_frame_op_default_axis(op_name) na_op = get_array_op(op) - doc = make_flex_doc(op_name, "dataframe") + + if op_name in _op_descriptions: + # i.e. include "add" but not "__add__" + doc = _make_flex_doc(op_name, "dataframe") + else: + doc = _arith_doc_FRAME % op_name @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): - if should_reindex_frame_op( + if _should_reindex_frame_op( self, other, op, axis, default_axis, fill_value, level ): - return frame_arith_method_with_reindex(self, other, op) + return _frame_arith_method_with_reindex(self, other, op) if isinstance(other, ABCSeries) and fill_value is not None: # TODO: We could allow this in cases where we end up going @@ -412,20 +646,22 @@ def flex_arith_method_FRAME(op): axis = self._get_axis_number(axis) if axis is not None else 1 - self, other = align_method_FRAME(self, other, axis, flex=True, level=level) + # TODO: why are we passing flex=True instead of flex=not special? 
+ # 15 tests fail if we pass flex=not special instead + self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) if isinstance(other, ABCDataFrame): # Another DataFrame new_data = self._combine_frame(other, na_op, fill_value) elif isinstance(other, ABCSeries): - new_data = self._dispatch_frame_op(other, op, axis=axis) + new_data = dispatch_to_series(self, other, op, axis=axis) else: # in this case we always have `np.ndim(other) == 0` if fill_value is not None: self = self.fillna(fill_value) - new_data = self._dispatch_frame_op(other, op) + new_data = dispatch_to_series(self, other, op) return self._construct_result(new_data) @@ -434,9 +670,11 @@ def flex_arith_method_FRAME(op): return f -def flex_comp_method_FRAME(op): - op_name = op.__name__.strip("_") - default_axis = "columns" # because we are "flex" +def _flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): + assert not special # "special" also means "not flex" + op_name = _get_op_name(op, special) + default_axis = _get_frame_op_default_axis(op_name) + assert default_axis == "columns", default_axis # because we are not "special" doc = _flex_comp_doc_FRAME.format( op_name=op_name, desc=_op_descriptions[op_name]["desc"] @@ -446,9 +684,28 @@ def flex_comp_method_FRAME(op): def f(self, other, axis=default_axis, level=None): axis = self._get_axis_number(axis) if axis is not None else 1 - self, other = align_method_FRAME(self, other, axis, flex=True, level=level) + self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) - new_data = self._dispatch_frame_op(other, op, axis=axis) + new_data = dispatch_to_series(self, other, op, axis=axis) + return self._construct_result(new_data) + + f.__name__ = op_name + + return f + + +def _comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): + assert special # "special" also means "not flex" + op_name = _get_op_name(op, special) + + @Appender(f"Wrapper for comparison method {op_name}") + def f(self, other): + axis = 1 # only relevant for Series other case + + self, other = _align_method_FRAME(self, other, axis, level=None, flex=False) + + # See GH#4537 for discussion of scalar op behavior + new_data = dispatch_to_series(self, other, op, axis=axis) return self._construct_result(new_data) f.__name__ = op_name diff --git a/venv/lib/python3.8/site-packages/pandas/core/ops/array_ops.py b/venv/lib/python3.8/site-packages/pandas/core/ops/array_ops.py index 41d5395..31e8d00 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/ops/array_ops.py +++ b/venv/lib/python3.8/site-packages/pandas/core/ops/array_ops.py @@ -5,13 +5,13 @@ ExtensionArrays. 
from datetime import timedelta from functools import partial import operator -from typing import Any +from typing import Any, Tuple import warnings import numpy as np from pandas._libs import Timedelta, Timestamp, lib, ops as libops -from pandas._typing import ArrayLike, Shape +from pandas._typing import ArrayLike from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, @@ -27,10 +27,9 @@ from pandas.core.dtypes.common import ( is_object_dtype, is_scalar, ) -from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCExtensionArray, ABCIndex, ABCSeries from pandas.core.dtypes.missing import isna, notna -from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.ops import missing from pandas.core.ops.dispatch import should_extension_dispatch from pandas.core.ops.invalid import invalid_comparison @@ -41,11 +40,13 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, list): y = construct_1d_object_array_from_listlike(y) - if isinstance(y, (np.ndarray, ABCSeries, ABCIndexClass)): + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + # Note: these checks can be for ABCIndex and not ABCIndexClass + # because that is the only object-dtype class. if not is_object_dtype(y.dtype): y = y.astype(np.object_) - if isinstance(y, (ABCSeries, ABCIndexClass)): + if isinstance(y, (ABCSeries, ABCIndex)): y = y._values if x.shape != y.shape: @@ -56,7 +57,7 @@ def comp_method_OBJECT_ARRAY(op, x, y): return result.reshape(x.shape) -def _masked_arith_op(x: np.ndarray, y, op): +def masked_arith_op(x: np.ndarray, y, op): """ If the given arithmetic operation fails, attempt it again on only the non-null elements of the input array(s). @@ -115,7 +116,7 @@ def _masked_arith_op(x: np.ndarray, y, op): return result -def _na_arithmetic_op(left, right, op, is_cmp: bool = False): +def na_arithmetic_op(left, right, op, is_cmp: bool = False): """ Return the result of evaluating op on the passed in values. @@ -146,7 +147,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): # In this case we do not fall back to the masked op, as that # will handle complex numbers incorrectly, see GH#32047 raise - result = _masked_arith_op(left, right, op) + result = masked_arith_op(left, right, op) if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise @@ -176,9 +177,9 @@ def arithmetic_op(left: ArrayLike, right: Any, op): # NB: We assume that extract_array has already been called # on `left` and `right`. 
- lvalues = ensure_wrapped_if_datetimelike(left) - rvalues = ensure_wrapped_if_datetimelike(right) - rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape) + lvalues = maybe_upcast_datetimelike_array(left) + rvalues = maybe_upcast_datetimelike_array(right) + rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): # Timedelta is included because numexpr will fail on it, see GH#31457 @@ -186,7 +187,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op): else: with np.errstate(all="ignore"): - res_values = _na_arithmetic_op(lvalues, rvalues, op) + res_values = na_arithmetic_op(lvalues, rvalues, op) return res_values @@ -207,7 +208,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: ndarray or ExtensionArray """ # NB: We assume extract_array has already been called on left and right - lvalues = ensure_wrapped_if_datetimelike(left) + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right rvalues = lib.item_from_zerodim(rvalues) @@ -247,7 +248,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # suppress warnings from numpy about element-wise comparison warnings.simplefilter("ignore", DeprecationWarning) with np.errstate(all="ignore"): - res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) + res_values = na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) return res_values @@ -332,7 +333,7 @@ def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike: right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right - lvalues = ensure_wrapped_if_datetimelike(left) + lvalues = maybe_upcast_datetimelike_array(left) rvalues = right if should_extension_dispatch(lvalues, rvalues): @@ -353,8 +354,7 @@ def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike: filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool res_values = na_logical_op(lvalues, rvalues, op) - # error: Cannot call function of unknown type - res_values = filler(res_values) # type: ignore[operator] + res_values = filler(res_values) # type: ignore return res_values @@ -401,7 +401,32 @@ def get_array_op(op): raise NotImplementedError(op_name) -def _maybe_upcast_for_op(obj, shape: Shape): +def maybe_upcast_datetimelike_array(obj: ArrayLike) -> ArrayLike: + """ + If we have an ndarray that is either datetime64 or timedelta64, wrap in EA. + + Parameters + ---------- + obj : ndarray or ExtensionArray + + Returns + ------- + ndarray or ExtensionArray + """ + if isinstance(obj, np.ndarray): + if obj.dtype.kind == "m": + from pandas.core.arrays import TimedeltaArray + + return TimedeltaArray._from_sequence(obj) + if obj.dtype.kind == "M": + from pandas.core.arrays import DatetimeArray + + return DatetimeArray._from_sequence(obj) + + return obj + + +def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): """ Cast non-pandas objects to pandas types to unify behavior of arithmetic and comparison operations. diff --git a/venv/lib/python3.8/site-packages/pandas/core/ops/common.py b/venv/lib/python3.8/site-packages/pandas/core/ops/common.py index a6bcab4..515a0a5 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/ops/common.py +++ b/venv/lib/python3.8/site-packages/pandas/core/ops/common.py @@ -65,60 +65,3 @@ def _unpack_zerodim_and_defer(method, name: str): return method(self, other) return new_method - - -def get_op_result_name(left, right): - """ - Find the appropriate name to pin to an operation result. 
This result - should always be either an Index or a Series. - - Parameters - ---------- - left : {Series, Index} - right : object - - Returns - ------- - name : object - Usually a string - """ - if isinstance(right, (ABCSeries, ABCIndexClass)): - name = _maybe_match_name(left, right) - else: - name = left.name - return name - - -def _maybe_match_name(a, b): - """ - Try to find a name to attach to the result of an operation between - a and b. If only one of these has a `name` attribute, return that - name. Otherwise return a consensus name if they match of None if - they have different names. - - Parameters - ---------- - a : object - b : object - - Returns - ------- - name : str or None - - See Also - -------- - pandas.core.common.consensus_name_attr - """ - a_has = hasattr(a, "name") - b_has = hasattr(b, "name") - if a_has and b_has: - if a.name == b.name: - return a.name - else: - # TODO: what if they both have np.nan for their names? - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None diff --git a/venv/lib/python3.8/site-packages/pandas/core/ops/docstrings.py b/venv/lib/python3.8/site-packages/pandas/core/ops/docstrings.py index 06ed321..4ace873 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/ops/docstrings.py +++ b/venv/lib/python3.8/site-packages/pandas/core/ops/docstrings.py @@ -4,7 +4,7 @@ Templating for ops docstrings from typing import Dict, Optional -def make_flex_doc(op_name: str, typ: str) -> str: +def _make_flex_doc(op_name, typ): """ Make the appropriate substitutions for the given operation and class-typ into either _flex_doc_SERIES or _flex_doc_FRAME to return the docstring @@ -22,20 +22,16 @@ def make_flex_doc(op_name: str, typ: str) -> str: op_name = op_name.replace("__", "") op_desc = _op_descriptions[op_name] - op_desc_op = op_desc["op"] - assert op_desc_op is not None # for mypy if op_name.startswith("r"): - equiv = "other " + op_desc_op + " " + typ - elif op_name == "divmod": - equiv = f"{op_name}({typ}, other)" + equiv = "other " + op_desc["op"] + " " + typ else: - equiv = typ + " " + op_desc_op + " other" + equiv = typ + " " + op_desc["op"] + " other" if typ == "series": base_doc = _flex_doc_SERIES if op_desc["reverse"]: base_doc += _see_also_reverse_SERIES.format( - reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"] + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"], ) doc_no_examples = base_doc.format( desc=op_desc["desc"], @@ -43,9 +39,8 @@ def make_flex_doc(op_name: str, typ: str) -> str: equiv=equiv, series_returns=op_desc["series_returns"], ) - ser_example = op_desc["series_examples"] - if ser_example: - doc = doc_no_examples + ser_example + if op_desc["series_examples"]: + doc = doc_no_examples + op_desc["series_examples"] else: doc = doc_no_examples elif typ == "dataframe": @@ -164,25 +159,6 @@ dtype: float64 """ ) -_divmod_example_SERIES = ( - _common_examples_algebra_SERIES - + """ ->>> a.divmod(b, fill_value=0) -(a 1.0 - b NaN - c NaN - d 0.0 - e NaN - dtype: float64, - a 0.0 - b NaN - c NaN - d 0.0 - e NaN - dtype: float64) -""" -) - _mod_example_SERIES = ( _common_examples_algebra_SERIES + """ @@ -353,7 +329,7 @@ _op_descriptions: Dict[str, Dict[str, Optional[str]]] = { "op": "divmod", "desc": "Integer division and modulo", "reverse": "rdivmod", - "series_examples": _divmod_example_SERIES, + "series_examples": None, "series_returns": _returns_tuple, "df_examples": None, }, @@ -448,6 +424,33 @@ See Also Series.{reverse} : {see_also_desc}. 
""" +_arith_doc_FRAME = """ +Binary operator %s with support to substitute a fill_value for missing data in +one of the inputs + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill existing missing (NaN) values, and any new element needed for + successful DataFrame alignment, with this value before computation. + If data in both corresponding DataFrame locations is missing + the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Returns +------- +result : DataFrame + +Notes +----- +Mismatched indices will be unioned together +""" + _flex_doc_FRAME = """ Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). @@ -608,7 +611,7 @@ Get {desc} of dataframe and other, element-wise (binary operator `{op_name}`). Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. -Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis +Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. Parameters diff --git a/venv/lib/python3.8/site-packages/pandas/core/ops/methods.py b/venv/lib/python3.8/site-packages/pandas/core/ops/methods.py index 4866905..17223d6 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/ops/methods.py +++ b/venv/lib/python3.8/site-packages/pandas/core/ops/methods.py @@ -3,17 +3,21 @@ Functions to generate methods and pin them to the appropriate classes. """ import operator +from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.ops.roperator import ( radd, + rand_, rdivmod, rfloordiv, rmod, rmul, + ror_, rpow, rsub, rtruediv, + rxor, ) @@ -30,23 +34,112 @@ def _get_method_wrappers(cls): ------- arith_flex : function or None comp_flex : function or None + arith_special : function + comp_special : function + bool_special : function + + Notes + ----- + None is only returned for SparseArray """ # TODO: make these non-runtime imports once the relevant functions # are no longer in __init__ from pandas.core.ops import ( - flex_arith_method_FRAME, - flex_comp_method_FRAME, - flex_method_SERIES, + _arith_method_FRAME, + _arith_method_SERIES, + _bool_method_SERIES, + _comp_method_FRAME, + _comp_method_SERIES, + _flex_comp_method_FRAME, + _flex_method_SERIES, ) if issubclass(cls, ABCSeries): # Just Series - arith_flex = flex_method_SERIES - comp_flex = flex_method_SERIES + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SERIES + comp_special = _comp_method_SERIES + bool_special = _bool_method_SERIES elif issubclass(cls, ABCDataFrame): - arith_flex = flex_arith_method_FRAME - comp_flex = flex_comp_method_FRAME - return arith_flex, comp_flex + arith_flex = _arith_method_FRAME + comp_flex = _flex_comp_method_FRAME + arith_special = _arith_method_FRAME + comp_special = _comp_method_FRAME + bool_special = _arith_method_FRAME + return arith_flex, comp_flex, arith_special, comp_special, bool_special + + +def add_special_arithmetic_methods(cls): + """ + Adds the full suite of special arithmetic methods (``__add__``, + ``__sub__``, etc.) to the class. 
+ + Parameters + ---------- + cls : class + special methods will be defined and pinned to this class + """ + _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, arith_method, comp_method, bool_method, special=True + ) + # inplace operators (I feel like these should get passed an `inplace=True` + # or just be removed + + def _wrap_inplace_method(method): + """ + return an inplace wrapper for this method + """ + + def f(self, other): + result = method(self, other) + + if ( + self.ndim == 1 + and result._indexed_same(self) + and is_dtype_equal(result.dtype, self.dtype) + ): + # GH#36498 this inplace op can _actually_ be inplace. + self._values[:] = result._values + return self + + # Delete cacher + self._reset_cacher() + + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace( + result.reindex_like(self, copy=False), verify_is_copy=False + ) + + return self + + name = method.__name__.strip("__") + f.__name__ = f"__i{name}__" + return f + + new_methods.update( + dict( + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), + ) + ) + + new_methods.update( + dict( + __iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]), + ) + ) + + _add_methods(cls, new_methods=new_methods) def add_flex_arithmetic_methods(cls): @@ -59,14 +152,16 @@ def add_flex_arithmetic_methods(cls): cls : class flex methods will be defined and pinned to this class """ - flex_arith_method, flex_comp_method = _get_method_wrappers(cls) - new_methods = _create_methods(cls, flex_arith_method, flex_comp_method) + flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, flex_arith_method, flex_comp_method, bool_method=None, special=False + ) new_methods.update( - { - "multiply": new_methods["mul"], - "subtract": new_methods["sub"], - "divide": new_methods["div"], - } + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) ) # opt out of bool flex methods for now assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) @@ -74,52 +169,66 @@ def add_flex_arithmetic_methods(cls): _add_methods(cls, new_methods=new_methods) -def _create_methods(cls, arith_method, comp_method): - # creates actual flex methods based upon arithmetic, and comp method +def _create_methods(cls, arith_method, comp_method, bool_method, special): + # creates actual methods based upon arithmetic, comp and bool method # constructors. 
have_divmod = issubclass(cls, ABCSeries) # divmod is available for Series - new_methods = {} - - new_methods.update( - { - "add": arith_method(operator.add), - "radd": arith_method(radd), - "sub": arith_method(operator.sub), - "mul": arith_method(operator.mul), - "truediv": arith_method(operator.truediv), - "floordiv": arith_method(operator.floordiv), - "mod": arith_method(operator.mod), - "pow": arith_method(operator.pow), - "rmul": arith_method(rmul), - "rsub": arith_method(rsub), - "rtruediv": arith_method(rtruediv), - "rfloordiv": arith_method(rfloordiv), - "rpow": arith_method(rpow), - "rmod": arith_method(rmod), - } + new_methods = dict( + add=arith_method(cls, operator.add, special), + radd=arith_method(cls, radd, special), + sub=arith_method(cls, operator.sub, special), + mul=arith_method(cls, operator.mul, special), + truediv=arith_method(cls, operator.truediv, special), + floordiv=arith_method(cls, operator.floordiv, special), + mod=arith_method(cls, operator.mod, special), + pow=arith_method(cls, operator.pow, special), + # not entirely sure why this is necessary, but previously was included + # so it's here to maintain compatibility + rmul=arith_method(cls, rmul, special), + rsub=arith_method(cls, rsub, special), + rtruediv=arith_method(cls, rtruediv, special), + rfloordiv=arith_method(cls, rfloordiv, special), + rpow=arith_method(cls, rpow, special), + rmod=arith_method(cls, rmod, special), ) new_methods["div"] = new_methods["truediv"] new_methods["rdiv"] = new_methods["rtruediv"] if have_divmod: # divmod doesn't have an op that is supported by numexpr - new_methods["divmod"] = arith_method(divmod) - new_methods["rdivmod"] = arith_method(rdivmod) + new_methods["divmod"] = arith_method(cls, divmod, special) + new_methods["rdivmod"] = arith_method(cls, rdivmod, special) new_methods.update( - { - "eq": comp_method(operator.eq), - "ne": comp_method(operator.ne), - "lt": comp_method(operator.lt), - "gt": comp_method(operator.gt), - "le": comp_method(operator.le), - "ge": comp_method(operator.ge), - } + dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special), + ) ) - new_methods = {k.strip("_"): v for k, v in new_methods.items()} + if bool_method: + new_methods.update( + dict( + and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), + xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special), + ) + ) + + if special: + dunderize = lambda x: f"__{x.strip('_')}__" + else: + dunderize = lambda x: x + new_methods = {dunderize(k): v for k, v in new_methods.items()} return new_methods diff --git a/venv/lib/python3.8/site-packages/pandas/core/resample.py b/venv/lib/python3.8/site-packages/pandas/core/resample.py index afd189a..0dfbf96 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/resample.py +++ b/venv/lib/python3.8/site-packages/pandas/core/resample.py @@ -21,18 +21,12 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.aggregation import aggregate import pandas.core.algorithms as algos -from pandas.core.base import DataError +from pandas.core.base import DataError, ShallowMixin from pandas.core.generic import NDFrame, 
_shared_docs -from pandas.core.groupby.base import GotItemMixin, ShallowMixin +from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import ( - BaseGroupBy, - GroupBy, - _pipe_template, - get_groupby, -) +from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import Index @@ -43,10 +37,10 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.tseries.frequencies import is_subperiod, is_superperiod from pandas.tseries.offsets import DateOffset, Day, Nano, Tick -_shared_docs_kwargs: Dict[str, str] = {} +_shared_docs_kwargs: Dict[str, str] = dict() -class Resampler(BaseGroupBy, ShallowMixin): +class Resampler(_GroupBy, ShallowMixin): """ Class for resampling datetimelike data, a groupby-like operation. See aggregate, transform, and apply functions on this object. @@ -94,10 +88,7 @@ class Resampler(BaseGroupBy, ShallowMixin): self.as_index = True self.exclusions = set() self.binner = None - # pandas\core\resample.py:96: error: Incompatible types in assignment - # (expression has type "None", variable has type "BaseGrouper") - # [assignment] - self.grouper = None # type: ignore[assignment] + self.grouper = None if self.groupby is not None: self.groupby._set_grouper(self._convert_obj(obj), sort=True) @@ -134,7 +125,7 @@ class Resampler(BaseGroupBy, ShallowMixin): See Also -------- - GroupBy.__iter__ : Generator yielding sequence for each group. + GroupBy.__iter__ """ self._set_binner() return super().__iter__() @@ -212,6 +203,7 @@ class Resampler(BaseGroupBy, ShallowMixin): @Substitution( klass="Resampler", + versionadded=".. versionadded:: 0.23.0", examples=""" >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, ... index=pd.date_range('2012-08-02', periods=4)) @@ -238,12 +230,9 @@ class Resampler(BaseGroupBy, ShallowMixin): """ See Also -------- - DataFrame.groupby.aggregate : Aggregate using callable, string, dict, - or list of string/callables. - DataFrame.resample.transform : Transforms the Series on each group - based on the given function. - DataFrame.aggregate: Aggregate using one or more - operations over the specified axis. 
+ DataFrame.groupby.aggregate + DataFrame.resample.transform + DataFrame.aggregate """ ) @@ -289,13 +278,14 @@ class Resampler(BaseGroupBy, ShallowMixin): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, + versionadded="", klass="DataFrame", axis="", ) def aggregate(self, func, *args, **kwargs): self._set_binner() - result, how = aggregate(self, func, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if result is None: how = func grouper = None @@ -413,21 +403,14 @@ class Resampler(BaseGroupBy, ShallowMixin): result : Series or DataFrame the result of resample """ - # pandas\core\resample.py:409: error: Cannot determine type of - # 'loffset' [has-type] needs_offset = ( - isinstance( - self.loffset, # type: ignore[has-type] - (DateOffset, timedelta, np.timedelta64), - ) + isinstance(self.loffset, (DateOffset, timedelta, np.timedelta64)) and isinstance(result.index, DatetimeIndex) and len(result.index) > 0 ) if needs_offset: - # pandas\core\resample.py:415: error: Cannot determine type of - # 'loffset' [has-type] - result.index = result.index + self.loffset # type: ignore[has-type] + result.index = result.index + self.loffset self.loffset = None return result @@ -467,8 +450,8 @@ class Resampler(BaseGroupBy, ShallowMixin): See Also -------- - Series.fillna: Fill NA/NaN values using the specified method. - DataFrame.fillna: Fill NA/NaN values using the specified method. + Series.fillna + DataFrame.fillna """ return self._upsample("pad", limit=limit) @@ -813,7 +796,7 @@ class Resampler(BaseGroupBy, ShallowMixin): """ Interpolate values according to different methods. """ - result = self._upsample("asfreq") + result = self._upsample(None) return result.interpolate( method=method, axis=axis, @@ -842,8 +825,8 @@ class Resampler(BaseGroupBy, ShallowMixin): See Also -------- - Series.asfreq: Convert TimeSeries to specified frequency. - DataFrame.asfreq: Convert TimeSeries to specified frequency. + Series.asfreq + DataFrame.asfreq """ return self._upsample("asfreq", fill_value=fill_value) @@ -862,9 +845,7 @@ class Resampler(BaseGroupBy, ShallowMixin): Standard deviation of values within each group. """ nv.validate_resampler_func("std", args, kwargs) - # pandas\core\resample.py:850: error: Unexpected keyword argument - # "ddof" for "_downsample" [call-arg] - return self._downsample("std", ddof=ddof) # type: ignore[call-arg] + return self._downsample("std", ddof=ddof) def var(self, ddof=1, *args, **kwargs): """ @@ -881,9 +862,7 @@ class Resampler(BaseGroupBy, ShallowMixin): Variance of values within each group. """ nv.validate_resampler_func("var", args, kwargs) - # pandas\core\resample.py:867: error: Unexpected keyword argument - # "ddof" for "_downsample" [call-arg] - return self._downsample("var", ddof=ddof) # type: ignore[call-arg] + return self._downsample("var", ddof=ddof) @doc(GroupBy.size) def size(self): @@ -933,24 +912,14 @@ class Resampler(BaseGroupBy, ShallowMixin): See Also -------- Series.quantile - Return a series, where the index is q and the values are the quantiles. DataFrame.quantile - Return a DataFrame, where the columns are the columns of self, - and the values are the quantiles. DataFrameGroupBy.quantile - Return a DataFrame, where the coulmns are groupby columns, - and the values are its quantiles. 
""" - # pandas\core\resample.py:920: error: Unexpected keyword argument "q" - # for "_downsample" [call-arg] - - # pandas\core\resample.py:920: error: Too many arguments for - # "_downsample" [call-arg] - return self._downsample("quantile", q=q, **kwargs) # type: ignore[call-arg] + return self._downsample("quantile", q=q, **kwargs) # downsample methods -for method in ["sum", "prod", "min", "max", "first", "last"]: +for method in ["sum", "prod"]: def f(self, _method=method, min_count=0, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) @@ -961,7 +930,7 @@ for method in ["sum", "prod", "min", "max", "first", "last"]: # downsample methods -for method in ["mean", "sem", "median", "ohlc"]: +for method in ["min", "max", "first", "last", "mean", "sem", "median", "ohlc"]: def g(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) @@ -981,7 +950,7 @@ for method in ["nunique"]: setattr(Resampler, method, h) -class _GroupByMixin(GotItemMixin): +class _GroupByMixin(GroupByMixin): """ Provide the groupby facilities. """ @@ -998,9 +967,8 @@ class _GroupByMixin(GotItemMixin): for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) - # pandas\core\resample.py:972: error: Too many arguments for "__init__" - # of "object" [call-arg] - super().__init__(None) # type: ignore[call-arg] + # error: Too many arguments for "__init__" of "object" + super().__init__(None) # type: ignore self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True @@ -1065,12 +1033,7 @@ class DatetimeIndexResampler(Resampler): # do we have a regular frequency if ax.freq is not None or ax.inferred_freq is not None: - # pandas\core\resample.py:1037: error: "BaseGrouper" has no - # attribute "binlabels" [attr-defined] - if ( - len(self.grouper.binlabels) > len(ax) # type: ignore[attr-defined] - and how is None - ): + if len(self.grouper.binlabels) > len(ax) and how is None: # let's do an asfreq return self.asfreq() @@ -1107,7 +1070,7 @@ class DatetimeIndexResampler(Resampler): See Also -------- - .fillna: Fill NA/NaN values using the specified method. + .fillna """ self._set_binner() @@ -1126,11 +1089,7 @@ class DatetimeIndexResampler(Resampler): res_index = self._adjust_binner_for_upsample(binner) # if we have the same frequency as our axis, then we are equal sampling - if ( - limit is None - and to_offset(ax.inferred_freq) == self.freq - and len(obj) == len(res_index) - ): + if limit is None and to_offset(ax.inferred_freq) == self.freq: result = obj.copy() result.index = res_index else: @@ -1243,7 +1202,7 @@ class PeriodIndexResampler(DatetimeIndexResampler): See Also -------- - .fillna: Fill NA/NaN values using the specified method. + .fillna """ # we may need to actually resample as if we are timestamps diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/concat.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/concat.py index 70668ac..9e8fb64 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/concat.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/concat.py @@ -3,25 +3,14 @@ Concat routines. 
""" from collections import abc -from typing import ( - TYPE_CHECKING, - Iterable, - List, - Mapping, - Optional, - Type, - Union, - cast, - overload, -) +from typing import TYPE_CHECKING, Iterable, List, Mapping, Union, overload import numpy as np -from pandas._typing import FrameOrSeriesUnion, Label +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import isna from pandas.core.arrays.categorical import ( factorize_from_iterable, @@ -33,15 +22,14 @@ from pandas.core.indexes.api import ( MultiIndex, all_indexes_same, ensure_index, + get_consensus_names, get_objs_combined_axis, - get_unanimous_names, ) import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers if TYPE_CHECKING: - from pandas import DataFrame, Series - from pandas.core.generic import NDFrame + from pandas import DataFrame # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -65,7 +53,7 @@ def concat( @overload def concat( - objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join: str = "outer", ignore_index: bool = False, @@ -80,7 +68,7 @@ def concat( def concat( - objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join="outer", ignore_index: bool = False, @@ -133,6 +121,7 @@ def concat( This has no effect when ``join='inner'``, which already preserves the order of the non-concatenation axis. + .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed to not sort by default. 
@@ -305,7 +294,7 @@ class _Concatenator: def __init__( self, - objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join: str = "outer", keys=None, @@ -370,13 +359,13 @@ class _Concatenator: raise TypeError(msg) # consolidate - obj._consolidate_inplace() + obj._consolidate(inplace=True) ndims.add(obj.ndim) # get the sample # want the highest ndim that we have, and must be non-empty # unless all objs are empty - sample: Optional["NDFrame"] = None + sample = None if len(ndims) > 1: max_ndim = max(ndims) for obj in objs: @@ -446,8 +435,6 @@ class _Concatenator: # to line up if self._is_frame and axis == 1: name = 0 - # mypy needs to know sample is not an NDFrame - sample = cast("FrameOrSeriesUnion", sample) obj = sample._constructor({name: obj}) self.objs.append(obj) @@ -467,17 +454,14 @@ class _Concatenator: self.new_axes = self._get_new_axes() def get_result(self): - cons: Type[FrameOrSeriesUnion] - sample: FrameOrSeriesUnion # series only if self._is_series: - sample = cast("Series", self.objs[0]) # stack blocks if self.bm_axis == 0: name = com.consensus_name_attr(self.objs) - cons = sample._constructor + cons = self.objs[0]._constructor arrs = [ser._values for ser in self.objs] @@ -490,7 +474,7 @@ class _Concatenator: data = dict(zip(range(len(self.objs)), self.objs)) # GH28330 Preserves subclassed objects through concat - cons = sample._constructor_expanddim + cons = self.objs[0]._constructor_expanddim index, columns = self.new_axes df = cons(data, index=index) @@ -499,8 +483,6 @@ class _Concatenator: # combine block managers else: - sample = cast("DataFrame", self.objs[0]) - mgrs_indexers = [] for obj in self.objs: indexers = {} @@ -513,17 +495,17 @@ class _Concatenator: # 1-ax to convert BlockManager axis to DataFrame axis obj_labels = obj.axes[1 - ax] if not new_labels.equals(obj_labels): - indexers[ax] = obj_labels.get_indexer(new_labels) + indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy, ) if not self.copy: new_data._consolidate_inplace() - cons = sample._constructor + cons = self.objs[0]._constructor return cons(new_data).__finalize__(self, method="concat") def _get_result_dim(self) -> int: @@ -642,11 +624,10 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): - # Find matching codes, include matching nan values as equal. 
- mask = (isna(level) & isna(key)) | (level == key) + mask = level == key if not mask.any(): raise ValueError(f"Key {key} not in level {level}") - i = np.nonzero(mask)[0][0] + i = np.nonzero(level == key)[0][0] to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) @@ -672,7 +653,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde ) # also copies - names = list(names) + list(get_unanimous_names(*indexes)) + names = names + get_consensus_names(indexes) return MultiIndex( levels=levels, codes=codes_list, names=names, verify_integrity=False diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/melt.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/melt.py index f49aaee..1ba6854 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/melt.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/melt.py @@ -14,15 +14,18 @@ from pandas.core.arrays import Categorical import pandas.core.common as com from pandas.core.indexes.api import Index, MultiIndex from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import tile_compat +from pandas.core.reshape.util import _tile_compat from pandas.core.shared_docs import _shared_docs from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: - from pandas import DataFrame, Series + from pandas import DataFrame, Series # noqa: F401 -@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"}) +@Appender( + _shared_docs["melt"] + % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") +) def melt( frame: "DataFrame", id_vars=None, @@ -42,7 +45,7 @@ def melt( if value_name in frame.columns: warnings.warn( "This dataframe has a column name that matches the 'value_name' column " - "name of the resulting Dataframe. " + "name of the resultiing Dataframe. " "In the future this will raise an error, please set the 'value_name' " "parameter of DataFrame.melt to a unique name.", FutureWarning, @@ -133,7 +136,7 @@ def melt( result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = tile_compat(frame.index, K) + result.index = _tile_compat(frame.index, K) return result @@ -141,43 +144,14 @@ def melt( @deprecate_kwarg(old_arg_name="label", new_arg_name=None) def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "DataFrame": """ - Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. - - Accepts a dictionary, ``groups``, in which each key is a new column name - and each value is a list of old column names that will be "melted" under - the new column name as part of the reshape. + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot Parameters ---------- data : DataFrame - The wide-format DataFrame. groups : dict - {new_name : list_of_columns}. - dropna : bool, default True - Do not include columns whose entries are all NaN. - label : None - Not used. - - .. deprecated:: 1.0.0 - - Returns - ------- - DataFrame - Reshaped DataFrame. - - See Also - -------- - melt : Unpivot a DataFrame from wide to long format, optionally leaving - identifiers set. - pivot : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Pivot without aggregation that can handle - non-numeric data. - DataFrame.pivot_table : Generalization of pivot that can handle - duplicate values for one index/column pair. - DataFrame.unstack : Pivot based on the index values instead of a - column. 
- wide_to_long : Wide panel to long format. Less flexible but more - user-friendly than melt. + {new_name : list_of_columns} + dropna : boolean, default True Examples -------- @@ -195,6 +169,10 @@ def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "Dat 1 Yankees 2007 573 2 Red Sox 2008 545 3 Yankees 2008 526 + + Returns + ------- + reshaped : DataFrame """ if isinstance(groups, dict): keys = list(groups.keys()) @@ -271,10 +249,12 @@ def wide_to_long( A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate - suffixes, for example, if your wide variables are of the form A-one, - B-two,.., and you have an unrelated column A-rating, you can ignore the - last one by specifying `suffix='(!?one|two)'`. When all suffixes are - numeric, they are cast to int64/float64. + suffixes, for example, if your wide variables are of the form + A-one, B-two,.., and you have an unrelated column A-rating, you can + ignore the last one by specifying `suffix='(!?one|two)'`. + + .. versionchanged:: 0.23.0 + When all suffixes are numeric, they are cast to int64/float64. Returns ------- @@ -282,18 +262,6 @@ def wide_to_long( A DataFrame that contains each stub name as a variable, with new index (i, j). - See Also - -------- - melt : Unpivot a DataFrame from wide to long format, optionally leaving - identifiers set. - pivot : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Pivot without aggregation that can handle - non-numeric data. - DataFrame.pivot_table : Generalization of pivot that can handle - duplicate values for one index/column pair. - DataFrame.unstack : Pivot based on the index values instead of a - column. - Notes ----- All extra variables are left untouched. This simply uses @@ -444,7 +412,7 @@ def wide_to_long( 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age', - ... sep='_', suffix=r'\w+') + ... sep='_', suffix='\w+') >>> l ... 
# doctest: +NORMALIZE_WHITESPACE ht @@ -483,7 +451,7 @@ def wide_to_long( var_name=j, ) newdf[j] = Categorical(newdf[j]) - newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True) + newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") # GH17627 Cast numerics suffixes to int/float newdf[j] = to_numeric(newdf[j], errors="ignore") diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/merge.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/merge.py index 95fdb21..f43a33d 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/merge.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/merge.py @@ -5,15 +5,15 @@ SQL-style merge routines import copy import datetime from functools import partial -import hashlib import string -from typing import TYPE_CHECKING, Optional, Tuple, cast +from typing import TYPE_CHECKING, Optional, Tuple, Union import warnings import numpy as np -from pandas._libs import Timedelta, hashtable as libhashtable, join as libjoin, lib -from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion +from pandas._libs import Timedelta, hashtable as libhashtable, lib +import pandas._libs.join as libjoin +from pandas._typing import ArrayLike, FrameOrSeries from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -43,6 +43,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex from pandas.core import groupby import pandas.core.algorithms as algos +from pandas.core.arrays.categorical import recode_for_categories import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc @@ -50,8 +51,7 @@ from pandas.core.internals import concatenate_block_managers from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import DataFrame - from pandas.core.arrays import DatetimeArray + from pandas import DataFrame, Series # noqa:F401 @Substitution("\nleft : DataFrame") @@ -114,8 +114,11 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # if we can groupby the rhs # then we can get vastly better perf - if all(item in right.columns for item in by): + + try: rby = right.groupby(by, sort=False) + except KeyError: + pass for key, lhs in lby: @@ -137,7 +140,9 @@ def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_piec # make sure join keys are in the merged # TODO, should merge_pieces do this? 
- merged[by] = key + for k in by: + if k in merged: + merged[k] = key pieces.append(merged) @@ -271,20 +276,10 @@ def merge_ordered( if left_by is not None and right_by is not None: raise ValueError("Can only group either left or right frames") elif left_by is not None: - if isinstance(left_by, str): - left_by = [left_by] - check = set(left_by).difference(left.columns) - if len(check) != 0: - raise KeyError(f"{check} not found in left columns") result, _ = _groupby_and_merge( left_by, on, left, right, lambda x, y: _merger(x, y) ) elif right_by is not None: - if isinstance(right_by, str): - right_by = [right_by] - check = set(right_by).difference(right.columns) - if len(check) != 0: - raise KeyError(f"{check} not found in right columns") result, _ = _groupby_and_merge( right_by, on, right, left, lambda x, y: _merger(y, x) ) @@ -580,8 +575,8 @@ class _MergeOperation: def __init__( self, - left: FrameOrSeriesUnion, - right: FrameOrSeriesUnion, + left: Union["Series", "DataFrame"], + right: Union["Series", "DataFrame"], how: str = "inner", on=None, left_on=None, @@ -649,17 +644,6 @@ class _MergeOperation: self._validate_specification() - cross_col = None - if self.how == "cross": - ( - self.left, - self.right, - self.how, - cross_col, - ) = self._create_cross_configuration(self.left, self.right) - self.left_on = self.right_on = [cross_col] - self._cross = cross_col - # note this function has side effects ( self.left_join_keys, @@ -707,13 +691,7 @@ class _MergeOperation: self._maybe_restore_index_levels(result) - self._maybe_drop_cross_column(result, self._cross) - - return result.__finalize__(self, method="merge") - - def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): - if cross_col is not None: - result.drop(columns=cross_col, inplace=True) + return result def _indicator_pre_merge( self, left: "DataFrame", right: "DataFrame" @@ -854,15 +832,12 @@ class _MergeOperation: rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) # if we have an all missing left_indexer - # make sure to just use the right values or vice-versa - mask_left = left_indexer == -1 - mask_right = right_indexer == -1 - if mask_left.all(): + # make sure to just use the right values + mask = left_indexer == -1 + if mask.all(): key_col = rvals - elif right_indexer is not None and mask_right.all(): - key_col = lvals else: - key_col = Index(lvals).where(~mask_left, rvals) + key_col = Index(lvals).where(~mask, rvals) if result._is_label_reference(name): result[name] = key_col @@ -884,7 +859,7 @@ class _MergeOperation: def _get_join_indexers(self): """ return the join indexers """ - return get_join_indexers( + return _get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) @@ -989,10 +964,7 @@ class _MergeOperation: """ left_keys = [] right_keys = [] - # pandas\core\reshape\merge.py:966: error: Need type annotation for - # 'join_names' (hint: "join_names: List[] = ...") - # [var-annotated] - join_names = [] # type: ignore[var-annotated] + join_names = [] right_drop = [] left_drop = [] @@ -1113,7 +1085,7 @@ class _MergeOperation: # if either left or right is a categorical # then the must match exactly in categories & ordered if lk_is_cat and rk_is_cat: - if lk._categories_match_up_to_permutation(rk): + if lk.is_dtype_equal(rk): continue elif lk_is_cat or rk_is_cat: @@ -1223,57 +1195,18 @@ class _MergeOperation: typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign(**{name: self.right[name].astype(typ)}) - def 
_create_cross_configuration( - self, left, right - ) -> Tuple["DataFrame", "DataFrame", str, str]: - """ - Creates the configuration to dispatch the cross operation to inner join, - e.g. adding a join column and resetting parameters. Join column is added - to a new object, no inplace modification - - Parameters - ---------- - left: DataFrame - right DataFrame - - Returns - ------- - a tuple (left, right, how, cross_col) representing the adjusted - DataFrames with cross_col, the merge operation set to inner and the column - to join over. - """ - cross_col = f"_cross_{hashlib.md5().hexdigest()}" - how = "inner" - return ( - left.assign(**{cross_col: 1}), - right.assign(**{cross_col: 1}), - how, - cross_col, - ) - def _validate_specification(self): - if self.how == "cross": - if ( - self.left_index - or self.right_index - or self.right_on is not None - or self.left_on is not None - or self.on is not None - ): - raise MergeError( - "Can not pass on, right_on, left_on or set right_index=True or " - "left_index=True" - ) - return # Hm, any way to make this logic less complicated?? - elif self.on is None and self.left_on is None and self.right_on is None: + if self.on is None and self.left_on is None and self.right_on is None: if self.left_index and self.right_index: self.left_on, self.right_on = (), () elif self.left_index: - raise MergeError("Must pass right_on or right_index=True") + if self.right_on is None: + raise MergeError("Must pass right_on or right_index=True") elif self.right_index: - raise MergeError("Must pass left_on or left_index=True") + if self.left_on is None: + raise MergeError("Must pass left_on or left_index=True") else: # use the common columns left_cols = self.left.columns @@ -1299,19 +1232,8 @@ class _MergeOperation: 'Can only pass argument "on" OR "left_on" ' 'and "right_on", not a combination of both.' ) - if self.left_index or self.right_index: - raise MergeError( - 'Can only pass argument "on" OR "left_index" ' - 'and "right_index", not a combination of both.' - ) self.left_on = self.right_on = self.on elif self.left_on is not None: - if self.left_index: - raise MergeError( - 'Can only pass argument "left_on" OR "left_index" not both.' - ) - if not self.right_index and self.right_on is None: - raise MergeError('Must pass "right_on" OR "right_index".') n = len(self.left_on) if self.right_index: if len(self.left_on) != self.right.index.nlevels: @@ -1321,12 +1243,6 @@ class _MergeOperation: ) self.right_on = [None] * n elif self.right_on is not None: - if self.right_index: - raise MergeError( - 'Can only pass argument "right_on" OR "right_index" not both.' 
- ) - if not self.left_index and self.left_on is None: - raise MergeError('Must pass "left_on" OR "left_index".') n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: @@ -1335,7 +1251,7 @@ class _MergeOperation: 'of levels in the index of "left"' ) self.left_on = [None] * n - if self.how != "cross" and len(self.right_on) != len(self.left_on): + if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") def _validate(self, validate: str): @@ -1387,7 +1303,7 @@ class _MergeOperation: raise ValueError("Not a valid argument for validate") -def get_join_indexers( +def _get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ): """ @@ -1427,21 +1343,19 @@ def get_join_indexers( lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how) # preserve left frame order if how == 'left' and sort == False kwargs = copy.copy(kwargs) - if how in ("left", "right"): + if how == "left": kwargs["sort"] = sort join_func = { "inner": libjoin.inner_join, "left": libjoin.left_outer_join, - "right": lambda x, y, count, **kwargs: libjoin.left_outer_join( - y, x, count, **kwargs - )[::-1], + "right": _right_outer_join, "outer": libjoin.full_outer_join, }[how] return join_func(lkey, rkey, count, **kwargs) -def restore_dropped_levels_multijoin( +def _restore_dropped_levels_multijoin( left: MultiIndex, right: MultiIndex, dropped_level_names, @@ -1595,7 +1509,7 @@ class _OrderedMerge(_MergeOperation): ) typ = self.left._constructor - result = typ(result_data) + result = typ(result_data).__finalize__(self, method=self._merge_type) self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -1929,7 +1843,7 @@ def _get_single_indexer(join_key, index, sort: bool = False): def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): if len(join_keys) > 1: if not ( - isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels + (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) ): raise AssertionError( "If more than one join key is given then " @@ -1954,9 +1868,14 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = return left_ax, None, right_indexer +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = libjoin.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + + def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" -) -> Tuple[np.ndarray, np.ndarray, int]: +) -> Tuple[np.array, np.array, int]: """ Encode left and right keys as enumerated types. 
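[Editor's note: illustration only, not part of the patch.] The merge.py hunks above restore `_right_outer_join` as a thin wrapper that calls `libjoin.left_outer_join` with the arguments swapped and then swaps the returned indexers back, and they touch `_factorize_keys`, which encodes both join-key arrays into one shared integer code space before the join kernels run. A rough sketch of that encoding idea using the public `pd.factorize` (the internal helper uses hashtable factorizers instead; the variable names below are invented for the example):

# --- illustrative example, not part of the patch ---
import numpy as np
import pandas as pd

left_keys = np.array(["a", "b", "c", "a"], dtype=object)
right_keys = np.array(["b", "d", "a"], dtype=object)

# Factorize both sides together so they share one code space.
codes, uniques = pd.factorize(np.concatenate([left_keys, right_keys]))
lcodes, rcodes = codes[: len(left_keys)], codes[len(left_keys):]
print(lcodes.tolist(), rcodes.tolist(), len(uniques))  # [0, 1, 2, 0] [1, 3, 0] 4
# --- end example ---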
@@ -2014,27 +1933,29 @@ def _factorize_keys( if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype): # Extract the ndarray (UTC-localized) values # Note: we dont need the dtypes to match, as these can still be compared - lk = cast("DatetimeArray", lk)._ndarray - rk = cast("DatetimeArray", rk)._ndarray + lk, _ = lk._values_for_factorize() + rk, _ = rk._values_for_factorize() elif ( - is_categorical_dtype(lk.dtype) - and is_categorical_dtype(rk.dtype) - and is_dtype_equal(lk.dtype, rk.dtype) + is_categorical_dtype(lk) and is_categorical_dtype(rk) and is_dtype_equal(lk, rk) ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) - # Cast rk to encoding so we can compare codes with lk - rk = lk._encode_with_my_categories(rk) + if lk.categories.equals(rk.categories): + # if we exactly match in categories, allow us to factorize on codes + rk = rk.codes + else: + # Same categories in different orders -> recode + rk = recode_for_categories(rk.codes, rk.categories, lk.categories) lk = ensure_int64(lk.codes) - rk = ensure_int64(rk.codes) + rk = ensure_int64(rk) elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype): lk, _ = lk._values_for_factorize() rk, _ = rk._values_for_factorize() - if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype): + if is_integer_dtype(lk) and is_integer_dtype(rk): # GH#23917 TODO: needs tests for case where lk is integer-dtype # and rk is datetime-dtype klass = libhashtable.Int64Factorizer @@ -2096,11 +2017,8 @@ def _sort_labels(uniques: np.ndarray, left, right): def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow - nlev = next( - lev - for lev in range(len(shape), 0, -1) - if not is_int64_overflow_possible(shape[:lev]) - ) + pred = lambda i: not is_int64_overflow_possible(shape[:i]) + nlev = next(filter(pred, range(len(shape), 0, -1))) # get keys for the first `nlev` levels stride = np.prod(shape[1:nlev], dtype="i8") diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/pivot.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/pivot.py index 40496a5..ea5916e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/pivot.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/pivot.py @@ -5,7 +5,6 @@ from typing import ( List, Optional, Sequence, - Set, Tuple, Union, cast, @@ -13,7 +12,7 @@ from typing import ( import numpy as np -from pandas._typing import FrameOrSeriesUnion, Label +from pandas._typing import Label from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -201,7 +200,7 @@ def pivot_table( def _add_margins( - table: FrameOrSeriesUnion, + table: Union["Series", "DataFrame"], data, values, rows, @@ -240,7 +239,7 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name + table, data, values, rows, cols, aggfunc, observed, margins_name, ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -268,13 +267,19 @@ def _add_margins( margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names - # check the result column and leave floats - for dtype in set(result.dtypes): - cols = result.select_dtypes([dtype]).columns - margin_dummy[cols] = margin_dummy[cols].apply( - maybe_downcast_to_dtype, args=(dtype,) - ) - result = result.append(margin_dummy) + try: + # check the result column and leave floats + for dtype in 
set(result.dtypes): + cols = result.select_dtypes([dtype]).columns + margin_dummy[cols] = margin_dummy[cols].apply( + maybe_downcast_to_dtype, args=(dtype,) + ) + result = result.append(margin_dummy) + except TypeError: + + # we cannot reshape, so coerce the axis + result.index = result.index._to_safe_for_reshape() + result = result.append(margin_dummy) result.index.names = row_names return result @@ -303,7 +308,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All" + table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", ): if len(cols) > 0: # need to "interleave" the margins @@ -322,7 +327,17 @@ def _generate_marginal_results( # we are going to mutate this, so need to copy! piece = piece.copy() - piece[all_key] = margin[key] + try: + piece[all_key] = margin[key] + except TypeError: + + # we cannot reshape, so coerce the axis + piece.set_axis( + piece._get_axis(cat_axis)._to_safe_for_reshape(), + axis=cat_axis, + inplace=True, + ) + piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) @@ -436,9 +451,10 @@ def pivot( cols = com.convert_to_list_like(index) else: cols = [] + cols.extend(columns) append = index is None - indexed = data.set_index(cols + columns, append=append) + indexed = data.set_index(cols, append=append) else: if index is None: index = [Series(data.index, name=data.index.name)] @@ -564,37 +580,29 @@ def crosstab( b 0 1 0 c 0 0 0 """ - if values is None and aggfunc is not None: - raise ValueError("aggfunc cannot be used without values.") - - if values is not None and aggfunc is None: - raise ValueError("values cannot be used without an aggfunc.") - index = com.maybe_make_list(index) columns = com.maybe_make_list(columns) + rownames = _get_names(index, rownames, prefix="row") + colnames = _get_names(columns, colnames, prefix="col") + common_idx = None pass_objs = [x for x in index + columns if isinstance(x, (ABCSeries, ABCDataFrame))] if pass_objs: common_idx = get_objs_combined_axis(pass_objs, intersect=True, sort=False) - rownames = _get_names(index, rownames, prefix="row") - colnames = _get_names(columns, colnames, prefix="col") + data: Dict = {} + data.update(zip(rownames, index)) + data.update(zip(colnames, columns)) - # duplicate names mapped to unique names for pivot op - ( - rownames_mapper, - unique_rownames, - colnames_mapper, - unique_colnames, - ) = _build_names_mapper(rownames, colnames) + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") from pandas import DataFrame - data = { - **dict(zip(unique_rownames, index)), - **dict(zip(unique_colnames, columns)), - } df = DataFrame(data, index=common_idx) original_df_cols = df.columns @@ -607,8 +615,8 @@ def crosstab( table = df.pivot_table( ["__dummy__"], - index=unique_rownames, - columns=unique_colnames, + index=rownames, + columns=colnames, margins=margins, margins_name=margins_name, dropna=dropna, @@ -627,9 +635,6 @@ def crosstab( table, normalize=normalize, margins=margins, margins_name=margins_name ) - table = table.rename_axis(index=rownames_mapper, axis=0) - table = table.rename_axis(columns=colnames_mapper, axis=1) - return table @@ -665,11 +670,12 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # keep index and column of pivoted 
table table_index = table.index table_columns = table.columns - last_ind_or_col = table.iloc[-1, :].name - # check if margin name is not in (for MI cases) and not equal to last + # check if margin name is in (for MI cases) or equal to last # index/column and save the column and index margin - if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col): + if (margins_name not in table.iloc[-1, :].name) | ( + margins_name != table.iloc[:, -1].name + ): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] @@ -728,57 +734,3 @@ def _get_names(arrs, names, prefix: str = "row"): names = list(names) return names - - -def _build_names_mapper( - rownames: List[str], colnames: List[str] -) -> Tuple[Dict[str, str], List[str], Dict[str, str], List[str]]: - """ - Given the names of a DataFrame's rows and columns, returns a set of unique row - and column names and mappers that convert to original names. - - A row or column name is replaced if it is duplicate among the rows of the inputs, - among the columns of the inputs or between the rows and the columns. - - Paramters - --------- - rownames: list[str] - colnames: list[str] - - Returns - ------- - Tuple(Dict[str, str], List[str], Dict[str, str], List[str]) - - rownames_mapper: dict[str, str] - a dictionary with new row names as keys and original rownames as values - unique_rownames: list[str] - a list of rownames with duplicate names replaced by dummy names - colnames_mapper: dict[str, str] - a dictionary with new column names as keys and original column names as values - unique_colnames: list[str] - a list of column names with duplicate names replaced by dummy names - - """ - - def get_duplicates(names): - seen: Set = set() - return {name for name in names if name not in seen} - - shared_names = set(rownames).intersection(set(colnames)) - dup_names = get_duplicates(rownames) | get_duplicates(colnames) | shared_names - - rownames_mapper = { - f"row_{i}": name for i, name in enumerate(rownames) if name in dup_names - } - unique_rownames = [ - f"row_{i}" if name in dup_names else name for i, name in enumerate(rownames) - ] - - colnames_mapper = { - f"col_{i}": name for i, name in enumerate(colnames) if name in dup_names - } - unique_colnames = [ - f"col_{i}" if name in dup_names else name for i, name in enumerate(colnames) - ] - - return rownames_mapper, unique_rownames, colnames_mapper, unique_colnames diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/reshape.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/reshape.py index c197e14..1d4c9a7 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/reshape.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/reshape.py @@ -81,7 +81,9 @@ class _Unstacker: unstacked : DataFrame """ - def __init__(self, index: MultiIndex, level=-1, constructor=None): + def __init__( + self, index: MultiIndex, level=-1, constructor=None, + ): if constructor is None: constructor = DataFrame @@ -137,7 +139,7 @@ class _Unstacker: @cache_readonly def sorted_labels(self): indexer, to_sort = self._indexer_and_to_sort - return [line.take(indexer) for line in to_sort] + return [l.take(indexer) for l in to_sort] def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: indexer, _ = self._indexer_and_to_sort @@ -399,7 +401,6 @@ def _unstack_multiple(data, clocs, fill_value=None): def unstack(obj, level, fill_value=None): - if isinstance(level, (tuple, list)): if len(level) != 1: # 
_unstack_multiple only handles MultiIndexes, @@ -417,18 +418,11 @@ def unstack(obj, level, fill_value=None): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) - elif not isinstance(obj.index, MultiIndex): - # GH 36113 - # Give nicer error messages when unstack a Series whose - # Index is not a MultiIndex. - raise ValueError( - f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" - ) else: if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( - obj.index, level=level, constructor=obj._constructor_expanddim + obj.index, level=level, constructor=obj._constructor_expanddim, ) return unstacker.get_result( obj.values, value_columns=None, fill_value=fill_value @@ -442,7 +436,7 @@ def _unstack_frame(obj, level, fill_value=None): return obj._constructor(mgr) else: return _Unstacker( - obj.index, level=level, constructor=obj._constructor + obj.index, level=level, constructor=obj._constructor, ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value) @@ -521,7 +515,7 @@ def stack(frame, level=-1, dropna=True): verify_integrity=False, ) - if not frame.empty and frame._is_homogeneous_type: + if frame._is_homogeneous_type: # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes._values) @@ -768,6 +762,8 @@ def get_dummies( dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. + .. versionadded:: 0.23.0 + Returns ------- DataFrame diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/tile.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/tile.py index 4c5347b..aefc6eb 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/tile.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/tile.py @@ -84,6 +84,8 @@ def cut( Whether the first interval should be left-inclusive or not. duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. + + .. versionadded:: 0.23.0 ordered : bool, default True Whether the labels are ordered or not. Applies to returned types Categorical and Series (with Categorical dtype). If True, diff --git a/venv/lib/python3.8/site-packages/pandas/core/reshape/util.py b/venv/lib/python3.8/site-packages/pandas/core/reshape/util.py index d2c0871..6949270 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/reshape/util.py +++ b/venv/lib/python3.8/site-packages/pandas/core/reshape/util.py @@ -39,9 +39,6 @@ def cartesian_product(X): lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) - if np.any(cumprodX < 0): - raise ValueError("Product space too large to allocate arrays!") - a = np.roll(cumprodX, 1) a[0] = 1 @@ -51,10 +48,10 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) - return [tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] + return [_tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] -def tile_compat(arr, num: int): +def _tile_compat(arr, num: int): """ Index compat for np.tile. 
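The `cartesian_product` hunk above drops the overflow guard and restores the private `_tile_compat` helper name. For readers skimming the diff, the repeat/tile expansion that function relies on can be sketched roughly as follows; this is a standalone illustration with a hypothetical `cartesian_product_sketch` name, not the pandas implementation itself:

import numpy as np

def cartesian_product_sketch(arrays):
    # Rough illustration of the repeat/tile expansion used by
    # pandas.core.reshape.util.cartesian_product; names and structure
    # here are illustrative only, under the assumption of 1-D inputs.
    lengths = [len(a) for a in arrays]
    total = int(np.prod(lengths)) if arrays else 0
    out = []
    repeat = total
    for length, arr in zip(lengths, arrays):
        if length == 0 or total == 0:
            # any empty factor makes the whole product empty
            return [np.array([]) for _ in arrays]
        repeat //= length                  # how many times each element repeats
        tile = total // (repeat * length)  # how many times the repeated block tiles
        out.append(np.tile(np.repeat(np.asarray(arr), repeat), tile))
    return out

# Example:
#   cartesian_product_sketch([[1, 2], ["a", "b", "c"]])
#   -> [array([1, 1, 1, 2, 2, 2]), array(['a', 'b', 'c', 'a', 'b', 'c'])]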
diff --git a/venv/lib/python3.8/site-packages/pandas/core/series.py b/venv/lib/python3.8/site-packages/pandas/core/series.py index 1449b78..00fcd44 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/series.py +++ b/venv/lib/python3.8/site-packages/pandas/core/series.py @@ -25,14 +25,12 @@ from pandas._config import get_option from pandas._libs import lib, properties, reshape, tslibs from pandas._libs.lib import no_default from pandas._typing import ( - AggFuncType, ArrayLike, Axis, DtypeObj, FrameOrSeriesUnion, IndexKeyFunc, Label, - StorageOptions, ValueKeyFunc, ) from pandas.compat.numpy import function as nv @@ -56,7 +54,6 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, is_scalar, - validate_all_hashable, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -67,15 +64,14 @@ from pandas.core.dtypes.missing import ( remove_na_arraylike, ) -from pandas.core import algorithms, base, generic, missing, nanops, ops +import pandas as pd +from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import aggregate, transform from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor import pandas.core.common as com from pandas.core.construction import ( - array as pd_array, create_series_with_explicit_dtype, extract_array, is_empty_data, @@ -84,21 +80,14 @@ from pandas.core.construction import ( from pandas.core.generic import NDFrame from pandas.core.indexers import deprecate_ndim_indexing, unpack_1tuple from pandas.core.indexes.accessors import CombinedDatetimelikeProperties -from pandas.core.indexes.api import ( - CategoricalIndex, - Float64Index, - Index, - MultiIndex, - ensure_index, -) +from pandas.core.indexes.api import Float64Index, Index, MultiIndex, ensure_index import pandas.core.indexes.base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager -from pandas.core.shared_docs import _shared_docs -from pandas.core.sorting import ensure_key_mapped, nargsort +from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -111,21 +100,22 @@ if TYPE_CHECKING: __all__ = ["Series"] -_shared_doc_kwargs = { - "axes": "index", - "klass": "Series", - "axes_single_arg": "{0 or 'index'}", - "axis": """axis : {0 or 'index'} +_shared_doc_kwargs = dict( + axes="index", + klass="Series", + axes_single_arg="{0 or 'index'}", + axis="""axis : {0 or 'index'} Parameter needed for compatibility with DataFrame.""", - "inplace": """inplace : boolean, default False + inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - "unique": "np.ndarray", - "duplicated": "Series", - "optional_by": "", - "optional_mapper": "", - "optional_labels": "", - "optional_axis": "", -} + unique="np.ndarray", + duplicated="Series", + optional_by="", + optional_mapper="", + optional_labels="", + optional_axis="", + versionadded_to_excel="\n .. 
versionadded:: 0.20.0\n", +) def _coerce_method(converter): @@ -163,14 +153,18 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Parameters ---------- data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. If data is a dict, argument order is - maintained. + Contains data stored in Series. + + .. versionchanged:: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to - RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like - and index is None, then the values in the index are used to - reindex the Series after it is created using the keys in the data. + RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index + sequence are used, the index will override the keys found in the + dict. dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. @@ -182,15 +176,14 @@ class Series(base.IndexOpsMixin, generic.NDFrame): """ _typ = "series" - _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) _name: Label _metadata: List[str] = ["name"] _internal_names_set = {"index"} | generic.NDFrame._internal_names_set _accessors = {"dt", "cat", "str", "sparse"} - _hidden_attrs = ( - base.IndexOpsMixin._hidden_attrs - | generic.NDFrame._hidden_attrs + _deprecations = ( + base.IndexOpsMixin._deprecations + | generic.NDFrame._deprecations | frozenset(["compress", "ptp"]) ) @@ -198,7 +191,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): hasnans = property( base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) - __hash__ = generic.NDFrame.__hash__ _mgr: SingleBlockManager div: Callable[["Series", Any], "Series"] rdiv: Callable[["Series", Any], "Series"] @@ -362,19 +354,15 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: - # GH:34717, issue was using zip to extract key and values from data. - # using generators in effects the performance. - # Below is the new way of extracting the keys and values - - keys = tuple(data.keys()) - values = list(data.values()) # Generating list of values- faster way + keys, values = zip(*data.items()) + values = list(values) elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar # instead of reindexing. 
values = na_value_for_dtype(dtype) keys = index else: - keys, values = (), [] + keys, values = [], [] # Input is now list-like, so rely on "standard" construction: @@ -402,7 +390,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # types @property - def _can_hold_na(self) -> bool: + def _can_hold_na(self): return self._mgr._can_hold_na _index = None @@ -417,14 +405,9 @@ class Series(base.IndexOpsMixin, generic.NDFrame): if not fastpath: labels = ensure_index(labels) - if labels._is_all_dates: - deep_labels = labels - if isinstance(labels, CategoricalIndex): - deep_labels = labels.categories - - if not isinstance( - deep_labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex) - ): + is_all_dates = labels.is_all_dates + if is_all_dates: + if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): try: labels = DatetimeIndex(labels) # need to set here because we changed the index @@ -508,7 +491,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): @name.setter def name(self, value: Label) -> None: - validate_all_hashable(value, error_name=f"{type(self).__name__}.name") + if not is_hashable(value): + raise TypeError("Series.name must be a hashable type") object.__setattr__(self, "_name", value) @property @@ -587,8 +571,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): """ return self._mgr.internal_values() - # error: Decorated property not supported - @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore @property def array(self) -> ExtensionArray: return self._mgr._block.array_values() @@ -689,6 +672,81 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # NDArray Compat _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray) + def __array_ufunc__( + self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any + ): + # TODO: handle DataFrame + cls = type(self) + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + # Determine if we should defer. + no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + + for item in inputs: + higher_priority = ( + hasattr(item, "__array_priority__") + and item.__array_priority__ > self.__array_priority__ + ) + has_array_ufunc = ( + hasattr(item, "__array_ufunc__") + and type(item).__array_ufunc__ not in no_defer + and not isinstance(item, self._HANDLED_TYPES) + ) + if higher_priority or has_array_ufunc: + return NotImplemented + + # align all the inputs. + names = [getattr(x, "name") for x in inputs if hasattr(x, "name")] + types = tuple(type(x) for x in inputs) + # TODO: dataframe + alignable = [x for x, t in zip(inputs, types) if issubclass(t, Series)] + + if len(alignable) > 1: + # This triggers alignment. + # At the moment, there aren't any ufuncs with more than two inputs + # so this ends up just being x1.index | x2.index, but we write + # it to handle *args. + index = alignable[0].index + for s in alignable[1:]: + index |= s.index + inputs = tuple( + x.reindex(index) if issubclass(t, Series) else x + for x, t in zip(inputs, types) + ) + else: + index = self.index + + inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs) + result = getattr(ufunc, method)(*inputs, **kwargs) + + name = names[0] if len(set(names)) == 1 else None + + def construct_return(result): + if lib.is_scalar(result): + return result + elif result.ndim > 1: + # e.g. 
np.subtract.outer + if method == "outer": + # GH#27198 + raise NotImplementedError + return result + return self._constructor(result, index=index, name=name, copy=False) + + if type(result) is tuple: + # multiple return values + return tuple(construct_return(x) for x in result) + elif method == "at": + # no return value + return None + else: + return construct_return(result) + def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. @@ -768,7 +826,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): FutureWarning, stacklevel=2, ) - nv.validate_take((), kwargs) + nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) new_index = self.index.take(indices) @@ -900,17 +958,17 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) return self._constructor(self._values[indexer], index=new_index).__finalize__( - self + self, ) def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) + return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self,) except ValueError: # mpl compat if we look up e.g. ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack # the asarray is needed to avoid returning a 2D DatetimeArray - return np.asarray(self._values[indexer]) + return np.asarray(self._values)[indexer] def _get_value(self, label, takeable: bool = False): """ @@ -947,7 +1005,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # positional setter values[key] = value else: - # GH#12862 adding a new key to the Series + # GH#12862 adding an new key to the Series self.loc[key] = value except TypeError as err: @@ -1016,9 +1074,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def _set_values(self, key, value): if isinstance(key, Series): key = key._values - self._mgr = self._mgr.setitem( # type: ignore[assignment] - indexer=key, value=value - ) + self._mgr = self._mgr.setitem(indexer=key, value=value) self._maybe_update_cacher() def _set_value(self, label, value, takeable: bool = False): @@ -1107,7 +1163,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): 2 c dtype: object """ - nv.validate_repeat((), {"axis": axis}) + nv.validate_repeat(tuple(), dict(axis=axis)) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) return self._constructor(new_values, index=new_index).__finalize__( @@ -1139,7 +1195,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Returns ------- - Series or DataFrame or None + Series or DataFrame When `drop` is False (the default), a DataFrame is returned. The newly created columns will come first in the DataFrame, followed by the original Series values. @@ -1360,7 +1416,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): @doc( klass=_shared_doc_kwargs["klass"], - storage_options=generic._shared_docs["storage_options"], examples=dedent( """ Examples @@ -1379,9 +1434,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def to_markdown( self, buf: Optional[IO[str]] = None, - mode: str = "wt", + mode: Optional[str] = None, index: bool = True, - storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: """ @@ -1394,14 +1448,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame): buf : str, Path or StringIO-like, optional, default None Buffer to write to. If None, the output is returned as a string. mode : str, optional - Mode in which file is opened, "wt" by default. + Mode in which file is opened. 
index : bool, optional, default True Add index (row) labels. .. versionadded:: 1.1.0 - {storage_options} - - .. versionadded:: 1.2.0 **kwargs These parameters will be passed to `tabulate \ @@ -1412,10 +1463,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): str {klass} in Markdown-friendly format. - Notes - ----- - Requires the `tabulate `_ package. - Examples -------- >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") @@ -1442,9 +1489,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): | 3 | quetzal | +----+----------+ """ - return self.to_frame().to_markdown( - buf, mode, index, storage_options=storage_options, **kwargs - ) + return self.to_frame().to_markdown(buf, mode, index, **kwargs) # ---------------------------------------------------------------------- @@ -1733,17 +1778,12 @@ Name: Max Speed, dtype: float64 """ if level is None: return notna(self.array).sum() - elif not isinstance(self.index, MultiIndex): - raise ValueError("Series.count level is only valid with a MultiIndex") - - index = self.index - assert isinstance(index, MultiIndex) # for mypy if isinstance(level, str): - level = index._get_level_number(level) + level = self.index._get_level_number(level) - lev = index.levels[level] - level_codes = np.array(index.codes[level], subok=False, copy=True) + lev = self.index.levels[level] + level_codes = np.array(self.index.codes[level], subok=False, copy=True) mask = level_codes == -1 if mask.any(): @@ -1758,9 +1798,7 @@ Name: Max Speed, dtype: float64 def mode(self, dropna=True) -> "Series": """ - Return the mode(s) of the Series. - - The mode is the value that appears most often. There can be multiple modes. + Return the mode(s) of the dataset. Always returns Series even if only one value is returned. @@ -1861,8 +1899,8 @@ Name: Max Speed, dtype: float64 Returns ------- - Series or None - Series with duplicates dropped or None if ``inplace=True``. + Series + Series with duplicates dropped. See Also -------- @@ -2002,9 +2040,7 @@ Name: Max Speed, dtype: float64 4 True dtype: bool """ - res = base.IndexOpsMixin.duplicated(self, keep=keep) - result = self._constructor(res, index=self.index) - return result.__finalize__(self, method="duplicated") + return super().duplicated(keep=keep) def idxmin(self, axis=0, skipna=True, *args, **kwargs): """ @@ -2739,8 +2775,7 @@ Name: Max Speed, dtype: float64 out.name = name return out - @doc( - generic._shared_docs["compare"], + @Appender( """ Returns ------- @@ -2800,9 +2835,9 @@ Keep all original rows and also all original values 2 c c 3 d b 4 e e -""", - klass=_shared_doc_kwargs["klass"], +""" ) + @Appender(generic._shared_docs["compare"] % _shared_doc_kwargs) def compare( self, other: "Series", @@ -3082,8 +3117,8 @@ Keep all original rows and also all original values Returns ------- - Series or None - Series ordered by values or None if ``inplace=True``. + Series + Series ordered by values. 
See Also -------- @@ -3215,6 +3250,29 @@ Keep all original rows and also all original values "sort in-place you must create a copy" ) + def _try_kind_sort(arr): + arr = ensure_key_mapped(arr, key) + arr = getattr(arr, "_values", arr) + + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind="quicksort") + + arr = self._values + sorted_index = np.empty(len(self), dtype=np.int32) + + bad = isna(arr) + + good = ~bad + idx = ibase.default_index(len(self)) + + argsorted = _try_kind_sort(self[good]) + if is_list_like(ascending): if len(ascending) != 1: raise ValueError( @@ -3225,16 +3283,21 @@ Keep all original rows and also all original values if not is_bool(ascending): raise ValueError("ascending must be boolean") - if na_position not in ["first", "last"]: + if not ascending: + argsorted = argsorted[::-1] + + if na_position == "last": + n = good.sum() + sorted_index[:n] = idx[good][argsorted] + sorted_index[n:] = idx[bad] + elif na_position == "first": + n = bad.sum() + sorted_index[n:] = idx[good][argsorted] + sorted_index[:n] = idx[bad] + else: raise ValueError(f"invalid na_position: {na_position}") - # GH 35922. Make sorting stable by leveraging nargsort - values_to_sort = ensure_key_mapped(self, key)._values if key else self._values - sorted_index = nargsort(values_to_sort, kind, ascending, na_position) - - result = self._constructor( - self._values[sorted_index], index=self.index[sorted_index] - ) + result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) if ignore_index: result.index = ibase.default_index(len(sorted_index)) @@ -3300,8 +3363,8 @@ Keep all original rows and also all original values Returns ------- - Series or None - The original Series sorted by the labels or None if ``inplace=True``. + Series + The original Series sorted by the labels. 
See Also -------- @@ -3391,17 +3454,59 @@ Keep all original rows and also all original values dtype: int64 """ - return super().sort_index( - axis, - level, - ascending, - inplace, - kind, - na_position, - sort_remaining, - ignore_index, - key, - ) + # TODO: this can be combined with DataFrame.sort_index impl as + # almost identical + inplace = validate_bool_kwarg(inplace, "inplace") + # Validate the axis parameter + self._get_axis_number(axis) + index = ensure_key_mapped(self.index, key, levels=level) + + if level is not None: + new_index, indexer = index.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) + + elif isinstance(index, MultiIndex): + from pandas.core.sorting import lexsort_indexer + + labels = index._sort_levels_monotonic() + + indexer = lexsort_indexer( + labels._get_codes_for_sorting(), + orders=ascending, + na_position=na_position, + ) + else: + from pandas.core.sorting import nargsort + + # Check monotonic-ness before sort an index + # GH11080 + if (ascending and index.is_monotonic_increasing) or ( + not ascending and index.is_monotonic_decreasing + ): + if inplace: + return + else: + return self.copy() + + indexer = nargsort( + index, kind=kind, ascending=ascending, na_position=na_position + ) + + indexer = ensure_platform_int(indexer) + new_index = self.index.take(indexer) + new_index = new_index._sort_levels_monotonic() + + new_values = self._values.take(indexer) + result = self._constructor(new_values, index=new_index) + + if ignore_index: + result.index = ibase.default_index(len(result)) + + if inplace: + self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_index") def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ @@ -3716,11 +3821,10 @@ Keep all original rows and also all original values Notes ----- - This routine will explode list-likes including lists, tuples, sets, + This routine will explode list-likes including lists, tuples, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged, and empty list-likes will - result in a np.nan for that row. In addition, the ordering of elements in - the output will be non-deterministic when exploding sets. + be object. Scalars will be returned unchanged. Empty list-likes will + result in a np.nan for that row. Examples -------- @@ -3932,6 +4036,7 @@ Keep all original rows and also all original values axis=_shared_doc_kwargs["axis"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, + versionadded="\n.. 
versionadded:: 0.20.0\n", ) def aggregate(self, func=None, axis=0, *args, **kwargs): # Validate the axis parameter @@ -3941,7 +4046,7 @@ Keep all original rows and also all original values if func is None: func = dict(kwargs.items()) - result, how = aggregate(self, func, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if result is None: # we can be called from an inner function which @@ -3967,14 +4072,14 @@ Keep all original rows and also all original values agg = aggregate @doc( - _shared_docs["transform"], + NDFrame.transform, klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform( - self, func: AggFuncType, axis: Axis = 0, *args, **kwargs - ) -> FrameOrSeriesUnion: - return transform(self, func, axis, *args, **kwargs) + def transform(self, func, axis=0, *args, **kwargs): + # Validate the axis parameter + self._get_axis_number(axis) + return super().transform(func, *args, **kwargs) def apply(self, func, convert_dtype=True, args=(), **kwds): """ @@ -4110,22 +4215,14 @@ Keep all original rows and also all original values if len(mapped) and isinstance(mapped[0], Series): # GH 25959 use pd.array instead of tolist # so extension arrays can be used - return self._constructor_expanddim(pd_array(mapped), index=self.index) + return self._constructor_expanddim(pd.array(mapped), index=self.index) else: return self._constructor(mapped, index=self.index).__finalize__( self, method="apply" ) def _reduce( - self, - op, - name: str, - *, - axis=0, - skipna=True, - numeric_only=None, - filter_type=None, - **kwds, + self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds ): """ Perform a reduction operation. @@ -4237,8 +4334,8 @@ Keep all original rows and also all original values Returns ------- - Series or None - Series with index labels or name altered or None if ``inplace=True``. + Series + Series with index labels or name altered. See Also -------- @@ -4351,8 +4448,8 @@ Keep all original rows and also all original values Returns ------- - Series or None - Series with specified index labels removed or None if ``inplace=True``. + Series + Series with specified index labels removed. 
Raises ------ @@ -4485,31 +4582,6 @@ Keep all original rows and also all original values method=method, ) - def _replace_single(self, to_replace, method, inplace, limit): - """ - Replaces values in a Series using the fill method specified when no - replacement value is given in the replace method - """ - - orig_dtype = self.dtype - result = self if inplace else self.copy() - fill_f = missing.get_fill_func(method) - - mask = missing.mask_missing(result.values, to_replace) - values = fill_f(result.values, limit=limit, mask=mask) - - if values.dtype == orig_dtype and inplace: - return - - result = self._constructor(values, index=self.index, dtype=self.dtype) - result = result.__finalize__(self) - - if inplace: - self._update_inplace(result) - return - - return result - @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( @@ -4563,7 +4635,7 @@ Keep all original rows and also all original values >>> s.memory_usage() 144 >>> s.memory_usage(deep=True) - 244 + 260 """ v = super().memory_usage(deep=deep) if index: @@ -4623,7 +4695,7 @@ Keep all original rows and also all original values 5 False Name: animal, dtype: bool """ - result = algorithms.isin(self._values, values) + result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__( self, method="isin" ) @@ -4712,7 +4784,6 @@ Keep all original rows and also all original values convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, - convert_floating: bool = True, ) -> "Series": input_series = self if infer_objects: @@ -4720,13 +4791,9 @@ Keep all original rows and also all original values if is_object_dtype(input_series): input_series = input_series.copy() - if convert_string or convert_integer or convert_boolean or convert_floating: + if convert_string or convert_integer or convert_boolean: inferred_dtype = convert_dtypes( - input_series._values, - convert_string, - convert_integer, - convert_boolean, - convert_floating, + input_series._values, convert_string, convert_integer, convert_boolean ) try: result = input_series.astype(inferred_dtype) @@ -4738,7 +4805,7 @@ Keep all original rows and also all original values @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> "Series": - return generic.NDFrame.isna(self) + return super().isna() @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isnull(self) -> "Series": @@ -4770,8 +4837,8 @@ Keep all original rows and also all original values Returns ------- - Series or None - Series with NA entries dropped from it or None if ``inplace=True``. + Series + Series with NA entries dropped from it. 
See Also -------- @@ -4867,7 +4934,7 @@ Keep all original rows and also all original values if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_index = self.index.to_timestamp(freq=freq, how=how) + new_index = self.index.to_timestamp(freq=freq, how=how) # type: ignore return self._constructor(new_values, index=new_index).__finalize__( self, method="to_timestamp" ) @@ -4924,44 +4991,10 @@ Keep all original rows and also all original values # Add plotting methods to Series hist = pandas.plotting.hist_series - # ---------------------------------------------------------------------- - # Template-Based Arithmetic/Comparison Methods - - def _cmp_method(self, other, op): - res_name = ops.get_op_result_name(self, other) - - if isinstance(other, Series) and not self._indexed_same(other): - raise ValueError("Can only compare identically-labeled Series objects") - - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - - res_values = ops.comparison_op(lvalues, rvalues, op) - - return self._construct_result(res_values, name=res_name) - - def _logical_method(self, other, op): - res_name = ops.get_op_result_name(self, other) - self, other = ops.align_method_SERIES(self, other, align_asobject=True) - - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - - res_values = ops.logical_op(lvalues, rvalues, op) - return self._construct_result(res_values, name=res_name) - - def _arith_method(self, other, op): - res_name = ops.get_op_result_name(self, other) - self, other = ops.align_method_SERIES(self, other) - - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - result = ops.arithmetic_op(lvalues, rvalues, op) - - return self._construct_result(result, name=res_name) - Series._add_numeric_operations() +Series._add_series_or_dataframe_operations() # Add arithmetic! ops.add_flex_arithmetic_methods(Series) +ops.add_special_arithmetic_methods(Series) diff --git a/venv/lib/python3.8/site-packages/pandas/core/shared_docs.py b/venv/lib/python3.8/site-packages/pandas/core/shared_docs.py index 3aeb3b6..b81942f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/shared_docs.py +++ b/venv/lib/python3.8/site-packages/pandas/core/shared_docs.py @@ -1,390 +1,118 @@ from typing import Dict -_shared_docs: Dict[str, str] = {} +_shared_docs: Dict[str, str] = dict() -_shared_docs[ - "aggregate" -] = """ -Aggregate using one or more operations over the specified axis. - -Parameters ----------- -func : function, str, list or dict - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. -{axis} -*args - Positional arguments to pass to `func`. -**kwargs - Keyword arguments to pass to `func`. - -Returns -------- -scalar, Series or DataFrame - - The return can be: - - * scalar : when Series.agg is called with single function - * Series : when DataFrame.agg is called with a single function - * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. -{see_also} -Notes ------ -`agg` is an alias for `aggregate`. Use the alias. 
- -A passed user-defined-function will be passed a Series for evaluation. -{examples}""" - -_shared_docs[ - "compare" -] = """ -Compare to another {klass} and show the differences. - -.. versionadded:: 1.1.0 - -Parameters ----------- -other : {klass} - Object to compare with. - -align_axis : {{0 or 'index', 1 or 'columns'}}, default 1 - Determine which axis to align the comparison on. - - * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. - * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. - -keep_shape : bool, default False - If true, all rows and columns are kept. - Otherwise, only the ones with different values are kept. - -keep_equal : bool, default False - If true, the result keeps values that are equal. - Otherwise, equal values are shown as NaNs. -""" - -_shared_docs[ - "groupby" -] = """ -Group %(klass)s using a mapper or by a Series of columns. - -A groupby operation involves some combination of splitting the -object, applying a function, and combining the results. This can be -used to group large amounts of data and compute operations on these -groups. - -Parameters ----------- -by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. - If ``by`` is a function, it's called on each value of the object's - index. If a dict or Series is passed, the Series or dict VALUES - will be used to determine the groups (the Series' values are first - aligned; see ``.align()`` method). If an ndarray is passed, the - values are used as-is to determine the groups. A label or list of - labels may be passed to group by the columns in ``self``. Notice - that a tuple is interpreted as a (single) key. -axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). -level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. -as_index : bool, default True - For aggregated output, return object with group labels as the - index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output. -sort : bool, default True - Sort group keys. Get better performance by turning this off. - Note this does not influence the order of observations within each - group. Groupby preserves the order of rows within each group. -group_keys : bool, default True - When calling apply, add group keys to index to identify pieces. -squeeze : bool, default False - Reduce the dimensionality of the return type if possible, - otherwise return a consistent type. - - .. deprecated:: 1.1.0 - -observed : bool, default False - This only applies if any of the groupers are Categoricals. - If True: only show observed values for categorical groupers. - If False: show all values for categorical groupers. -dropna : bool, default True - If True, and if group keys contain NA values, NA values together - with row/column will be dropped. - If False, NA values will also be treated as the key in groups - - .. versionadded:: 1.1.0 - -Returns -------- -%(klass)sGroupBy - Returns a groupby object that contains information about the groups. - -See Also --------- -resample : Convenience method for frequency conversion and resampling - of time series. - -Notes ------ -See the `user guide -`_ for more. -""" _shared_docs[ "melt" ] = """ -Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. 
+ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. -This function is useful to massage a DataFrame into a format where one -or more columns are identifier variables (`id_vars`), while all other -columns, considered measured variables (`value_vars`), are "unpivoted" to -the row axis, leaving just two non-identifier columns, 'variable' and -'value'. + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + %(versionadded)s + Parameters + ---------- + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. + ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. -Parameters ----------- -id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. -value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. -var_name : scalar - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. -value_name : scalar, default 'value' - Name to use for the 'value' column. -col_level : int or str, optional - If columns are a MultiIndex then use this level to melt. -ignore_index : bool, default True - If True, original index is ignored. If False, the original index is retained. - Index labels will be repeated as necessary. + .. versionadded:: 1.1.0 - .. versionadded:: 1.1.0 + Returns + ------- + DataFrame + Unpivoted DataFrame. -Returns -------- -DataFrame - Unpivoted DataFrame. + See Also + -------- + %(other)s : Identical method. + pivot_table : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. + DataFrame.explode : Explode a DataFrame from list-like + columns to long format. -See Also --------- -%(other)s : Identical method. -pivot_table : Create a spreadsheet-style pivot table as a DataFrame. -DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. -DataFrame.explode : Explode a DataFrame from list-like - columns to long format. + Examples + -------- + >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, + ... 'B': {0: 1, 1: 3, 2: 5}, + ... 'C': {0: 2, 1: 4, 2: 6}}) + >>> df + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 -Examples --------- ->>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, -... 'B': {0: 1, 1: 3, 2: 5}, -... 
'C': {0: 2, 1: 4, 2: 6}}) ->>> df - A B C -0 a 1 2 -1 b 3 4 -2 c 5 6 + >>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 ->>> %(caller)sid_vars=['A'], value_vars=['B']) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 + >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 ->>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 -3 a C 2 -4 b C 4 -5 c C 6 + The names of 'variable' and 'value' columns can be customized: -The names of 'variable' and 'value' columns can be customized: + >>> %(caller)sid_vars=['A'], value_vars=['B'], + ... var_name='myVarname', value_name='myValname') + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 ->>> %(caller)sid_vars=['A'], value_vars=['B'], -... var_name='myVarname', value_name='myValname') - A myVarname myValname -0 a B 1 -1 b B 3 -2 c B 5 + Original index values can be kept around: -Original index values can be kept around: + >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 0 a C 2 + 1 b C 4 + 2 c C 6 ->>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 -0 a C 2 -1 b C 4 -2 c C 6 + If you have multi-index columns: -If you have multi-index columns: + >>> df.columns = [list('ABC'), list('DEF')] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 ->>> df.columns = [list('ABC'), list('DEF')] ->>> df - A B C - D E F -0 a 1 2 -1 b 3 4 -2 c 5 6 + >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 ->>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) - A variable value -0 a B 1 -1 b B 3 -2 c B 5 - ->>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) - (A, D) variable_0 variable_1 value -0 a B E 1 -1 b B E 3 -2 c B E 5 -""" - -_shared_docs[ - "transform" -] = """ -Call ``func`` on self producing a {klass} with transformed values. - -Produced {klass} will have same axis length as self. - -Parameters ----------- -func : function, str, list-like or dict-like - Function to use for transforming the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. If func - is both list-like and dict-like, dict-like behavior takes precedence. - - Accepted combinations are: - - - function - - string function name - - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` - - dict-like of axis labels -> functions, function names or list-like of such. -{axis} -*args - Positional arguments to pass to `func`. -**kwargs - Keyword arguments to pass to `func`. - -Returns -------- -{klass} - A {klass} that must have the same length as self. - -Raises ------- -ValueError : If the returned {klass} has a different length than self. - -See Also --------- -{klass}.agg : Only perform aggregating type operations. -{klass}.apply : Invoke function on a {klass}. 
- -Examples --------- ->>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) ->>> df - A B -0 0 1 -1 1 2 -2 2 3 ->>> df.transform(lambda x: x + 1) - A B -0 1 2 -1 2 3 -2 3 4 - -Even though the resulting {klass} must have the same length as the -input {klass}, it is possible to provide several input functions: - ->>> s = pd.Series(range(3)) ->>> s -0 0 -1 1 -2 2 -dtype: int64 ->>> s.transform([np.sqrt, np.exp]) - sqrt exp -0 0.000000 1.000000 -1 1.000000 2.718282 -2 1.414214 7.389056 - -You can call transform on a GroupBy object: - ->>> df = pd.DataFrame({{ -... "Date": [ -... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05", -... "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"], -... "Data": [5, 8, 6, 1, 50, 100, 60, 120], -... }}) ->>> df - Date Data -0 2015-05-08 5 -1 2015-05-07 8 -2 2015-05-06 6 -3 2015-05-05 1 -4 2015-05-08 50 -5 2015-05-07 100 -6 2015-05-06 60 -7 2015-05-05 120 ->>> df.groupby('Date')['Data'].transform('sum') -0 55 -1 108 -2 66 -3 121 -4 55 -5 108 -6 66 -7 121 -Name: Data, dtype: int64 - ->>> df = pd.DataFrame({{ -... "c": [1, 1, 1, 2, 2, 2, 2], -... "type": ["m", "n", "o", "m", "m", "n", "n"] -... }}) ->>> df - c type -0 1 m -1 1 n -2 1 o -3 2 m -4 2 m -5 2 n -6 2 n ->>> df['size'] = df.groupby('c')['type'].transform(len) ->>> df - c type size -0 1 m 3 -1 1 n 3 -2 1 o 3 -3 2 m 4 -4 2 m 4 -5 2 n 4 -6 2 n 4 -""" - -_shared_docs[ - "storage_options" -] = """storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a non-fsspec URL. - See the fsspec and backend storage implementation docs for the set of - allowed keys and values.""" + >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ diff --git a/venv/lib/python3.8/site-packages/pandas/core/sorting.py b/venv/lib/python3.8/site-packages/pandas/core/sorting.py index 0a1cbc6..c090531 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/sorting.py +++ b/venv/lib/python3.8/site-packages/pandas/core/sorting.py @@ -1,22 +1,10 @@ """ miscellaneous sorting / groupby utilities """ -from collections import defaultdict -from typing import ( - TYPE_CHECKING, - Callable, - DefaultDict, - Dict, - Iterable, - List, - Optional, - Tuple, - Union, -) +from typing import Callable, Optional import numpy as np from pandas._libs import algos, hashtable, lib from pandas._libs.hashtable import unique_label_indices -from pandas._typing import IndexKeyFunc from pandas.core.dtypes.common import ( ensure_int64, @@ -29,66 +17,9 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algorithms from pandas.core.construction import extract_array -if TYPE_CHECKING: - from pandas import MultiIndex - from pandas.core.indexes.base import Index - _INT64_MAX = np.iinfo(np.int64).max -def get_indexer_indexer( - target: "Index", - level: Union[str, int, List[str], List[int]], - ascending: bool, - kind: str, - na_position: str, - sort_remaining: bool, - key: IndexKeyFunc, -) -> Optional[np.array]: - """ - Helper method that return the indexer according to input parameters for - the sort_index method of DataFrame and Series. 
- - Parameters - ---------- - target : Index - level : int or level name or list of ints or list of level names - ascending : bool or list of bools, default True - kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' - na_position : {'first', 'last'}, default 'last' - sort_remaining : bool, default True - key : callable, optional - - Returns - ------- - Optional[ndarray] - The indexer for the new index. - """ - - target = ensure_key_mapped(target, key, levels=level) - target = target._sort_levels_monotonic() - - if level is not None: - _, indexer = target.sortlevel( - level, ascending=ascending, sort_remaining=sort_remaining - ) - elif isinstance(target, ABCMultiIndex): - indexer = lexsort_indexer( - target._get_codes_for_sorting(), orders=ascending, na_position=na_position - ) - else: - # Check monotonic-ness before sort an index (GH 11080) - if (ascending and target.is_monotonic_increasing) or ( - not ascending and target.is_monotonic_decreasing - ): - return None - - indexer = nargsort( - target, kind=kind, ascending=ascending, na_position=na_position - ) - return indexer - - def get_group_index(labels, shape, sort: bool, xnull: bool): """ For the particular label_list, gets the offsets into the hypothetical list @@ -329,7 +260,6 @@ def nargsort( ascending: bool = True, na_position: str = "last", key: Optional[Callable] = None, - mask: Optional[np.ndarray] = None, ): """ Intended to be a drop-in replacement for np.argsort which handles NaNs. @@ -344,27 +274,19 @@ def nargsort( ascending : bool, default True na_position : {'first', 'last'}, default 'last' key : Optional[Callable], default None - mask : Optional[np.ndarray], default None - Passed when called by ExtensionArray.argsort. """ if key is not None: items = ensure_key_mapped(items, key) return nargsort( - items, - kind=kind, - ascending=ascending, - na_position=na_position, - key=None, - mask=mask, + items, kind=kind, ascending=ascending, na_position=na_position, key=None ) items = extract_array(items) - if mask is None: - mask = np.asarray(isna(items)) + mask = np.asarray(isna(items)) if is_extension_array_dtype(items): - return items.argsort(ascending=ascending, kind=kind, na_position=na_position) + items = items._values_for_argsort() else: items = np.asanyarray(items) @@ -417,9 +339,7 @@ def nargminmax(values, method: str): return non_nan_idx[func(non_nans)] -def _ensure_key_mapped_multiindex( - index: "MultiIndex", key: Callable, level=None -) -> "MultiIndex": +def ensure_key_mapped_multiindex(index, key: Callable, level=None): """ Returns a new MultiIndex in which key has been applied to all levels specified in level (or all levels if level @@ -445,6 +365,7 @@ def _ensure_key_mapped_multiindex( labels : MultiIndex Resulting MultiIndex with modified levels. 
""" + from pandas.core.indexes.api import MultiIndex if level is not None: if isinstance(level, (str, int)): @@ -463,7 +384,7 @@ def _ensure_key_mapped_multiindex( for level in range(index.nlevels) ] - labels = type(index).from_arrays(mapped) + labels = MultiIndex.from_arrays(mapped) return labels @@ -487,7 +408,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return values if isinstance(values, ABCMultiIndex): - return _ensure_key_mapped_multiindex(values, key, level=levels) + return ensure_key_mapped_multiindex(values, key, level=levels) result = key(values.copy()) if len(result) != len(values): @@ -512,39 +433,48 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -def get_flattened_list( - comp_ids: np.ndarray, - ngroups: int, - levels: Iterable["Index"], - labels: Iterable[np.ndarray], -) -> List[Tuple]: - """Map compressed group id -> key tuple.""" - comp_ids = comp_ids.astype(np.int64, copy=False) - arrays: DefaultDict[int, List[int]] = defaultdict(list) - for labs, level in zip(labels, levels): - table = hashtable.Int64HashTable(ngroups) - table.map(comp_ids, labs.astype(np.int64, copy=False)) - for i in range(ngroups): - arrays[i].append(level[table.get_item(i)]) - return [tuple(array) for array in arrays.values()] +class _KeyMapper: + """ + Map compressed group id -> key tuple. + """ + + def __init__(self, comp_ids, ngroups: int, levels, labels): + self.levels = levels + self.labels = labels + self.comp_ids = comp_ids.astype(np.int64) + + self.k = len(labels) + self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] + + self._populate_tables() + + def _populate_tables(self): + for labs, table in zip(self.labels, self.tables): + table.map(self.comp_ids, labs.astype(np.int64)) + + def get_key(self, comp_id): + return tuple( + level[table.get_item(comp_id)] + for table, level in zip(self.tables, self.levels) + ) -def get_indexer_dict( - label_list: List[np.ndarray], keys: List["Index"] -) -> Dict[Union[str, Tuple], np.ndarray]: +def get_flattened_iterator(comp_ids, ngroups, levels, labels): + # provide "flattened" iterator for multi-group setting + mapper = _KeyMapper(comp_ids, ngroups, levels, labels) + return [mapper.get_key(i) for i in range(ngroups)] + + +def get_indexer_dict(label_list, keys): """ Returns ------- - dict: + dict Labels mapped to indexers. """ shape = [len(x) for x in keys] group_index = get_group_index(label_list, shape, sort=True, xnull=True) - if np.all(group_index == -1): - # When all keys are nan and dropna=True, indices_fast can't handle this - # and the return is empty anyway - return {} ngroups = ( ((group_index.size and group_index.max()) + 1) if is_int64_overflow_possible(shape) @@ -594,7 +524,7 @@ def compress_group_index(group_index, sort: bool = True): space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). 
""" - size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT) + size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) table = hashtable.Int64HashTable(size_hint) group_index = ensure_int64(group_index) @@ -605,7 +535,7 @@ def compress_group_index(group_index, sort: bool = True): if sort and len(obs_group_ids) > 0: obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) - return ensure_int64(comp_ids), ensure_int64(obs_group_ids) + return comp_ids, obs_group_ids def _reorder_by_uniques(uniques, labels): diff --git a/venv/lib/python3.8/site-packages/pandas/core/strings/__init__.py b/venv/lib/python3.8/site-packages/pandas/core/strings/__init__.py deleted file mode 100644 index 243250f..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/strings/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -""" -Implementation of pandas.Series.str and its interface. - -* strings.accessor.StringMethods : Accessor for Series.str -* strings.base.BaseStringArrayMethods: Mixin ABC for EAs to implement str methods - -Most methods on the StringMethods accessor follow the pattern: - - 1. extract the array from the series (or index) - 2. Call that array's implementation of the string method - 3. Wrap the result (in a Series, index, or DataFrame) - -Pandas extension arrays implementing string methods should inherit from -pandas.core.strings.base.BaseStringArrayMethods. This is an ABC defining -the various string methods. To avoid namespace clashes and pollution, -these are prefixed with `_str_`. So ``Series.str.upper()`` calls -``Series.array._str_upper()``. The interface isn't currently public -to other string extension arrays. -""" -# Pandas current implementation is in ObjectStringArrayMixin. This is designed -# to work on object-dtype ndarrays. -# -# BaseStringArrayMethods -# - ObjectStringArrayMixin -# - StringArray -# - PandasArray -# - Categorical - -from .accessor import StringMethods -from .base import BaseStringArrayMethods - -__all__ = ["StringMethods", "BaseStringArrayMethods"] diff --git a/venv/lib/python3.8/site-packages/pandas/core/strings/accessor.py b/venv/lib/python3.8/site-packages/pandas/core/strings/accessor.py deleted file mode 100644 index 2713b76..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/strings/accessor.py +++ /dev/null @@ -1,3110 +0,0 @@ -import codecs -from functools import wraps -import re -from typing import Dict, List, Optional -import warnings - -import numpy as np - -import pandas._libs.lib as lib -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_integer, - is_list_like, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.base import NoNewAttributesMixin - -_shared_docs: Dict[str, str] = {} -_cpython_optimized_encoders = ( - "utf-8", - "utf8", - "latin-1", - "latin1", - "iso-8859-1", - "mbcs", - "ascii", -) -_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. 
- However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. - - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _map_and_wrap(name, docstring): - @forbid_nonstring_types(["bytes"], name=name) - def wrapper(self): - result = getattr(self._array, f"_str_{name}")() - return self._wrap_result(result) - - wrapper.__doc__ = docstring - return wrapper - - -class StringMethods(NoNewAttributesMixin): - """ - Vectorized string functions for Series and Index. - - NAs stay NA unless handled otherwise by a particular method. - Patterned after Python's string methods, with some inspiration from - R's stringr package. - - Examples - -------- - >>> s = pd.Series(["A_Str_Series"]) - >>> s - 0 A_Str_Series - dtype: object - - >>> s.str.split("_") - 0 [A, Str, Series] - dtype: object - - >>> s.str.replace("_", "") - 0 AStrSeries - dtype: object - """ - - # Note: see the docstring in pandas.core.strings.__init__ - # for an explanation of the implementation. 
- # TODO: Dispatch all the methods - # Currently the following are not dispatched to the array - # * cat - # * extract - # * extractall - - def __init__(self, data): - from pandas.core.arrays.string_ import StringDtype - - self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = isinstance(data.dtype, StringDtype) - array = data.array - self._array = array - - self._index = self._name = None - if isinstance(data, ABCSeries): - self._index = data.index - self._name = data.name - - # ._values.categories works for both Series/Index - self._parent = data._values.categories if self._is_categorical else data - # save orig to blow up categoricals to the right type - self._orig = data - self._freeze() - - @staticmethod - def _validate(data): - """ - Auxiliary function for StringMethods, infers and checks dtype of data. - - This is a "first line of defence" at the creation of the StringMethods- - object, and just checks that the dtype is in the - *union* of the allowed types over all string methods below; this - restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types (more info in the corresponding docstring). - - This really should exclude all series/index with any non-string values, - but that isn't practical for performance reasons until we have a str - dtype (GH 9343 / 13877) - - Parameters - ---------- - data : The content of the Series - - Returns - ------- - dtype : inferred dtype of data - """ - from pandas import StringDtype - - if isinstance(data, ABCMultiIndex): - raise AttributeError( - "Can only use .str accessor with Index, not MultiIndex" - ) - - # see _libs/lib.pyx for list of inferred types - allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal - - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None - - if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") - return inferred_dtype - - def __getitem__(self, key): - result = self._array._str_getitem(key) - return self._wrap_result(result) - - def __iter__(self): - warnings.warn( - "Columnar iteration over characters will be deprecated in future releases.", - FutureWarning, - stacklevel=2, - ) - i = 0 - g = self.get(i) - while g.notna().any(): - yield g - i += 1 - g = self.get(i) - - def _wrap_result( - self, - result, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, - ): - from pandas import Index, MultiIndex - - if not hasattr(result, "ndim") or not hasattr(result, "dtype"): - if isinstance(result, ABCDataFrame): - result = result.__finalize__(self._orig, name="str") - return result - assert result.ndim < 3 - - # We can be wrapping a string / object / categorical result, in which - # case we'll want to return the same dtype as the input. - # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. - # Ideally the array method returns the right array type. 
- if expand is None: - # infer from ndim if expand is not specified - expand = result.ndim != 1 - - elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] - - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - - if expand is False: - # if expand is False, result should have the same name - # as the original otherwise specified - if name is None: - name = getattr(result, "name", None) - if name is None: - # do not use logical or, _orig may be a DataFrame - # which has "name" column - name = self._orig.name - - # Wait until we are sure result is a Series or Index before - # checking attributes (GH 12180) - if isinstance(self._orig, ABCIndexClass): - # if result is a boolean np.array, return the np.array - # instead of wrapping it into a boolean Index (GH 8875) - if is_bool_dtype(result): - return result - - if expand: - result = list(result) - out = MultiIndex.from_tuples(result, names=name) - if out.nlevels == 1: - # We had all tuples of length-one, which are - # better represented as a regular Index. - out = out.get_level_values(0) - return out - else: - return Index(result, name=name) - else: - index = self._orig.index - # This is a mess. - dtype: Optional[str] - if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - - if expand: - cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) - else: - # Must be a Series - cons = self._orig._constructor - result = cons(result, name=name, index=index) - result = result.__finalize__(self._orig, method="str") - if name is not None and result.ndim == 1: - # __finalize__ might copy over the original name, but we may - # want the new name (e.g. str.extract). - result.name = name - return result - - def _get_series_list(self, others): - """ - Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into a list of Series (elements without an index must match the length - of the calling Series/Index). - - Parameters - ---------- - others : Series, DataFrame, np.ndarray, list-like or list-like of - Objects that are either Series, Index or np.ndarray (1-dim). - - Returns - ------- - list of Series - Others transformed into list of Series. - """ - from pandas import DataFrame, Series - - # self._orig is either Series or Index - idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - - # Generally speaking, all objects without an index inherit the index - # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 
- if isinstance(others, ABCSeries): - return [others] - elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=idx)] - elif isinstance(others, ABCDataFrame): - return [others[x] for x in others] - elif isinstance(others, np.ndarray) and others.ndim == 2: - others = DataFrame(others, index=idx) - return [others[x] for x in others] - elif is_list_like(others, allow_sets=False): - others = list(others) # ensure iterators do not get read twice etc - - # in case of list-like `others`, all elements must be - # either Series/Index/np.ndarray (1-dim)... - if all( - isinstance(x, (ABCSeries, ABCIndexClass)) - or (isinstance(x, np.ndarray) and x.ndim == 1) - for x in others - ): - los: List[Series] = [] - while others: # iterate through list and append each element - los = los + self._get_series_list(others.pop(0)) - return los - # ... or just strings - elif all(not is_list_like(x) for x in others): - return [Series(others, index=idx)] - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarray " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) - - @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join="left"): - """ - Concatenate strings in the Series/Index with given separator. - - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not passed, then all values in the Series/Index are - concatenated into a single string with a given `sep`. - - Parameters - ---------- - others : Series, Index, DataFrame, np.ndarray or list-like - Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and - other list-likes of strings must have the same length as the - calling Series/Index, with the exception of indexed objects (i.e. - Series/Index/DataFrame) if `join` is not None. - - If others is a list-like that contains a combination of Series, - Index or np.ndarray (1-dim), then all elements will be unpacked and - must satisfy the above criteria individually. - - If others is None, the method returns the concatenation of all - strings in the calling Series/Index. - sep : str, default '' - The separator between the different elements/columns. By default - the empty string `''` is used. - na_rep : str or None, default None - Representation that is inserted for all missing values: - - - If `na_rep` is None, and `others` is None, missing values in the - Series/Index are omitted from the result. - - If `na_rep` is None, and `others` is not None, a row containing a - missing value in any of the columns (before concatenation) will - have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - Determines the join-style between the calling Series/Index and any - Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). To disable - alignment, use `.values` on any Series/Index/DataFrame in `others`. - - .. versionadded:: 0.23.0 - .. versionchanged:: 1.0.0 - Changed default of `join` from None to `'left'`. - - Returns - ------- - str, Series or Index - If `others` is None, `str` is returned, otherwise a `Series/Index` - (same type as caller) of objects is returned. - - See Also - -------- - split : Split each string in the Series/Index. - join : Join lists contained as elements in the Series/Index. 
- - Examples - -------- - When not passing `others`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If `others` is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 NaN - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using `na_rep` - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If `sep` is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - - Series with different indexes can be aligned before concatenation. The - `join`-keyword works as in other methods. - - >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='outer', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - 4 -e - dtype: object - >>> - >>> s.str.cat(t, join='inner', na_rep='-') - 0 aa - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='right', na_rep='-') - 3 dd - 0 aa - 4 -e - 2 -c - dtype: object - - For more examples, see :ref:`here `. - """ - # TODO: dispatch - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) - - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." 
- ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. - dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) - result = result.__finalize__(self._orig, method="str_cat") - return result - - _shared_docs[ - "str_split" - ] = r""" - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. - n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, - starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. - str.split : Standard library version for split. - str.rsplit : Standard library version for rsplit. - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - Examples - -------- - >>> s = pd.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan - ... ] - ... 
) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 NaN - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - Without the `n` parameter, the outputs of `rsplit` and `split` - are identical. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `n` parameter can be used to limit the number of splits on the - delimiter. The outputs of `split` and `rsplit` are different. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, 3, tutorial, index... - 2 NaN - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout - the columns during the split. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN - - For slightly more complex use cases like splitting the html document name - from a url, a combination of parameter settings can be used. - - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence None - 1 https://docs.python.org/3/tutorial index.html - 2 NaN NaN - - Remember to escape special characters when explicitly using regular - expressions. - - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 - """ - - @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) - @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = self._array._str_split(pat, n, expand) - return self._wrap_result(result, returns_string=expand, expand=expand) - - @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) - @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): - result = self._array._str_rsplit(pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - _shared_docs[ - "str_partition" - ] = """ - Split the string at the %(side)s occurrence of `sep`. - - This method splits the string at the %(side)s occurrence of `sep`, - and returns 3 elements containing the part before the separator, - the separator itself, and the part after the separator. - If the separator is not found, return %(return)s. - - Parameters - ---------- - sep : str, default whitespace - String to split on. - expand : bool, default True - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - DataFrame/MultiIndex or Series/Index of objects - - See Also - -------- - %(also)s - Series.str.split : Split strings around given separators. - str.partition : Standard library version. 
- - Examples - -------- - - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by the last space instead of the first one: - - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - To return a Series containing tuples instead of a DataFrame: - - >>> s.str.partition('-', expand=False) - 0 (Linda van der Berg, , ) - 1 (George Pitt, -, Rivers) - dtype: object - - Also available on indices: - - >>> idx = pd.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - Or an index with tuples with ``expand=False``: - - >>> idx.str.partition(expand=False) - Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """ - - @Appender( - _shared_docs["str_partition"] - % { - "side": "first", - "return": "3 elements containing the string itself, followed by two " - "empty strings", - "also": "rpartition : Split the string at the last occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def partition(self, sep=" ", expand=True): - result = self._array._str_partition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender( - _shared_docs["str_partition"] - % { - "side": "last", - "return": "3 elements containing two empty strings, followed by the " - "string itself", - "also": "partition : Split the string at the first occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rpartition(self, sep=" ", expand=True): - result = self._array._str_rpartition(sep, expand) - return self._wrap_result(result, expand=expand, returns_string=expand) - - def get(self, i): - """ - Extract element from each component at specified position. - - Extract element from lists, tuples, or strings in each element in the - Series/Index. - - Parameters - ---------- - i : int - Position of element to extract. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) - >>> s - 0 String - 1 (1, 2, 3) - 2 [a, b, c] - 3 123 - 4 -456 - 5 {1: 'Hello', '2': 'World'} - dtype: object - - >>> s.str.get(1) - 0 t - 1 2 - 2 b - 3 NaN - 4 NaN - 5 Hello - dtype: object - - >>> s.str.get(-1) - 0 g - 1 3 - 2 c - 3 NaN - 4 NaN - 5 None - dtype: object - """ - result = self._array._str_get(i) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def join(self, sep): - """ - Join lists contained as elements in the Series/Index with passed delimiter. - - If the elements of a Series are lists themselves, join the content of these - lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - - Parameters - ---------- - sep : str - Delimiter to use between list entries. - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of the - delimiter. - - Raises - ------ - AttributeError - If the supplied Series contains neither strings nor lists. - - See Also - -------- - str.join : Standard library version of this method. 
- Series.str.split : Split strings around given separator/delimiter. - - Notes - ----- - If any of the list items is not a string object, the result of the join - will be `NaN`. - - Examples - -------- - Example with a list that contains non-string elements. - - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... ['duck', ['swan', 'fish'], 'guppy']]) - >>> s - 0 [lion, elephant, zebra] - 1 [1.1, 2.2, 3.3] - 2 [cat, nan, dog] - 3 [cow, 4.5, goat] - 4 [duck, [swan, fish], guppy] - dtype: object - - Join all lists using a '-'. The lists containing object(s) of types other - than str will produce a NaN. - - >>> s.str.join('-') - 0 lion-elephant-zebra - 1 NaN - 2 NaN - 3 NaN - 4 NaN - dtype: object - """ - result = self._array._str_join(sep) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=None, regex=True): - r""" - Test if pattern or regex is contained within a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern or regex is - contained within a string of a Series or Index. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Flags to pass through to the re module, e.g. re.IGNORECASE. - na : scalar, optional - Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. - regex : bool, default True - If True, assumes the pat is a regular expression. - - If False, treats the pat as a literal string. - - Returns - ------- - Series or Index of boolean values - A Series or Index of boolean values indicating whether the - given pattern is contained within the string of each element - of the Series or Index. - - See Also - -------- - match : Analogous, but stricter, relying on re.match instead of re.search. - Series.str.startswith : Test if the start of each string element matches a - pattern. - Series.str.endswith : Same as startswith, but tests the end of string. - - Examples - -------- - Returning a Series of booleans using only a literal pattern. - - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 NaN - dtype: object - - Returning an Index of booleans using only a literal pattern. - - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) - >>> ind.str.contains('23', regex=False) - Index([False, False, False, True, nan], dtype='object') - - Specifying case sensitivity using `case`. - - >>> s1.str.contains('oG', case=True, regex=True) - 0 False - 1 False - 2 False - 3 False - 4 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN` replaces NaN values - with `False`. If Series or Index does not contain NaN values - the resultant dtype will be `bool`, otherwise, an `object` dtype. - - >>> s1.str.contains('og', na=False, regex=True) - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 NaN - dtype: object - - Ignoring case sensitivity using `flags` with regex. 
- - >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) - 0 False - 1 False - 2 True - 3 False - 4 NaN - dtype: object - - Returning any digit using regular expression. - - >>> s1.str.contains('\\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 NaN - dtype: object - - Ensure `pat` is a not a literal pattern when `regex` is set to True. - Note in the following example one might expect only `s2[1]` and `s2[3]` to - return `True`. However, '.0' as a regex matches any character - followed by a 0. - - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - """ - result = self._array._str_contains(pat, case, flags, na, regex) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=None): - """ - Determine if each string starts with a match of a regular expression. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : scalar, optional - Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - fullmatch : Stricter matching that requires the entire string to match. - contains : Analogous, but less strict, relying on re.search instead of - re.match. - extract : Extract matched groups. - """ - result = self._array._str_match(pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=None): - """ - Determine if each string entirely matches a regular expression. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : scalar, optional. - Fill value for missing values. The default depends on dtype of the - array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, - ``pandas.NA`` is used. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - match : Similar, but also returns `True` when only a *prefix* of the string - matches the regular expression. - extract : Extract matched groups. - """ - result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=None): - r""" - Replace each occurrence of pattern/regex in the Series/Index. - - Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on - the regex value. - - Parameters - ---------- - pat : str or compiled regex - String can be a character sequence or regular expression. - repl : str or callable - Replacement string or a callable. The callable is passed the regex - match object and must return a replacement string to be used. - See :func:`re.sub`. - n : int, default -1 (all) - Number of replacements to make from start. 
- case : bool, default None - Determines if replace is case sensitive: - - - If True, case sensitive (the default if `pat` is a string) - - Set to False for case insensitive - - Cannot be set if `pat` is a compiled regex. - - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled - regex. - regex : bool, default True - Determines if assumes the passed-in pattern is a regular expression: - - - If True, assumes the passed-in pattern is a regular expression. - - If False, treats the pattern as a literal string - - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. - - .. versionadded:: 0.23.0 - - Returns - ------- - Series or Index of object - A copy of the object with all matching occurrences of `pat` replaced by - `repl`. - - Raises - ------ - ValueError - * if `regex` is False and `repl` is a callable or `pat` is a compiled - regex - * if `pat` is a compiled regex and `case` or `flags` is set - - Notes - ----- - When `pat` is a compiled regex, all flags should be included in the - compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled - regex will raise an error. - - Examples - -------- - When `pat` is a string and `regex` is True (the default), the given `pat` - is compiled as a regex. When `repl` is a string, it replaces matching - regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are - left as is: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) - 0 bao - 1 baz - 2 NaN - dtype: object - - When `pat` is a string and `regex` is False, every `pat` is replaced with - `repl` as with :meth:`str.replace`: - - >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) - 0 bao - 1 fuz - 2 NaN - dtype: object - - When `repl` is a callable, it is called on every `pat` using - :func:`re.sub`. The callable should expect one positional argument - (a regex object) and return a string. - - To get the idea: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) - 0 oo - 1 uz - 2 NaN - dtype: object - - Reverse every lowercase alphabetic word: - - >>> repl = lambda m: m.group(0)[::-1] - >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) - 0 oof 123 - 1 rab zab - 2 NaN - dtype: object - - Using regex groups (extract second group and swap case): - - >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" - >>> repl = lambda m: m.group('two').swapcase() - >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) - 0 tWO - 1 bAR - dtype: object - - Using a compiled regex with flags - - >>> import re - >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') - 0 foo - 1 bar - 2 NaN - dtype: object - """ - if regex is None: - if isinstance(pat, str) and any(c in pat for c in ".+*|^$?[](){}\\"): - # warn only in cases where regex behavior would differ from literal - msg = ( - "The default value of regex will change from True to False " - "in a future version." - ) - if len(pat) == 1: - msg += ( - " In addition, single character regular expressions will" - "*not* be treated as literal strings when regex=True." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - regex = True - result = self._array._str_replace( - pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def repeat(self, repeats): - """ - Duplicate each string in the Series or Index. 
- - Parameters - ---------- - repeats : int or sequence of int - Same value for all (int) or different value per (sequence). - - Returns - ------- - Series or Index of object - Series or Index of repeated string objects specified by - input parameter repeats. - - Examples - -------- - >>> s = pd.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - - Single int repeats string in Series - - >>> s.str.repeat(repeats=2) - 0 aa - 1 bb - 2 cc - dtype: object - - Sequence of int repeats corresponding string in Series - - >>> s.str.repeat(repeats=[1, 2, 3]) - 0 a - 1 bb - 2 ccc - dtype: object - """ - result = self._array._str_repeat(repeats) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def pad(self, width, side="left", fillchar=" "): - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with character defined in `fillchar`. - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - fillchar : str, default ' ' - Additional character for filling, default is whitespace. - - Returns - ------- - Series or Index of object - Returns Series or Index with minimum number of char in object. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='left')``. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='right')``. - Series.str.center : Fills both sides of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='both')``. - Series.str.zfill : Pad strings in the Series/Index by prepending '0' - character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. - - Examples - -------- - >>> s = pd.Series(["caribou", "tiger"]) - >>> s - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - result = self._array._str_pad(width, side=side, fillchar=fillchar) - return self._wrap_result(result) - - _shared_docs[ - "str_pad" - ] = """ - Pad %(side)s side of strings in the Series/Index. - - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with ``fillchar``. - fillchar : str - Additional character for filling, default is whitespace. - - Returns - ------- - filled : Series/Index of objects. 
- """ - - @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"}) - @forbid_nonstring_types(["bytes"]) - def center(self, width, fillchar=" "): - return self.pad(width, side="both", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"}) - @forbid_nonstring_types(["bytes"]) - def ljust(self, width, fillchar=" "): - return self.pad(width, side="right", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"}) - @forbid_nonstring_types(["bytes"]) - def rjust(self, width, fillchar=" "): - return self.pad(width, side="left", fillchar=fillchar) - - @forbid_nonstring_types(["bytes"]) - def zfill(self, width): - """ - Pad strings in the Series/Index by prepending '0' characters. - - Strings in the Series/Index are padded with '0' characters on the - left of the string to reach a total string length `width`. Strings - in the Series/Index with length greater or equal to `width` are - unchanged. - - Parameters - ---------- - width : int - Minimum length of resulting string; strings with length less - than `width` be prepended with '0' characters. - - Returns - ------- - Series/Index of objects. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. - Series.str.pad : Fills the specified sides of strings with an arbitrary - character. - Series.str.center : Fills both sides of strings with an arbitrary - character. - - Notes - ----- - Differs from :meth:`str.zfill` which has special handling - for '+'/'-' in the string. - - Examples - -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) - >>> s - 0 -1 - 1 1 - 2 1000 - 3 10 - 4 NaN - dtype: object - - Note that ``10`` and ``NaN`` are not strings, therefore they are - converted to ``NaN``. The minus sign in ``'-1'`` is treated as a - regular character and the zero is added to the left of it - (:meth:`str.zfill` would have moved it to the left). ``1000`` - remains unchanged as it is longer than `width`. - - >>> s.str.zfill(3) - 0 0-1 - 1 001 - 2 1000 - 3 NaN - 4 NaN - dtype: object - """ - result = self.pad(width, side="left", fillchar="0") - return self._wrap_result(result) - - def slice(self, start=None, stop=None, step=None): - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. - step : int, optional - Step size for slice operation. - - Returns - ------- - Series or Index of object - Series or Index from sliced substring from original string object. - - See Also - -------- - Series.str.slice_replace : Replace a slice with a string. - Series.str.get : Return element at position. - Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` - being the position. 
- - Examples - -------- - >>> s = pd.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - - Equivalent behaviour to: - - >>> s.str[0:5:3] - 0 kl - 1 f - 2 cm - dtype: object - """ - result = self._array._str_slice(start, stop, step) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def slice_replace(self, start=None, stop=None, repl=None): - """ - Replace a positional slice of a string with another value. - - Parameters - ---------- - start : int, optional - Left index position to use for the slice. If not specified (None), - the slice is unbounded on the left, i.e. slice from the start - of the string. - stop : int, optional - Right index position to use for the slice. If not specified (None), - the slice is unbounded on the right, i.e. slice until the - end of the string. - repl : str, optional - String for replacement. If not specified (None), the sliced region - is replaced with an empty string. - - Returns - ------- - Series or Index - Same type as the original object. - - See Also - -------- - Series.str.slice : Just slicing without replacement. - - Examples - -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the end of the - string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the start of the string to `stop` is replaced - with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` to `stop` is - replaced with `repl`. Everything before or after `start` and `stop` is - included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - result = self._array._str_slice_replace(start, stop, repl) - return self._wrap_result(result) - - def decode(self, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - # TODO: Add a similar _bytes interface. - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - arr = self._array - # assert isinstance(arr, (StringArray,)) - result = arr._str_map(f) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def encode(self, encoding, errors="strict"): - """ - Encode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.encode`. 
- - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - encoded : Series/Index of objects - """ - result = self._array._str_encode(encoding, errors) - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "str_strip" - ] = r""" - Remove %(position)s characters. - - Strip whitespaces (including newlines) or a set of specified characters - from each string in the Series/Index from %(side)s. - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will be stripped. - If None then whitespaces are removed. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.strip : Remove leading and trailing characters in Series/Index. - Series.str.lstrip : Remove leading characters in Series/Index. - Series.str.rstrip : Remove trailing characters in Series/Index. - - Examples - -------- - >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 NaN - dtype: object - - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 NaN - dtype: object - - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 NaN - dtype: object - - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 NaN - dtype: object - - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 NaN - dtype: object - """ - - @Appender( - _shared_docs["str_strip"] - % { - "side": "left and right sides", - "method": "strip", - "position": "leading and trailing", - } - ) - @forbid_nonstring_types(["bytes"]) - def strip(self, to_strip=None): - result = self._array._str_strip(to_strip) - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % {"side": "left side", "method": "lstrip", "position": "leading"} - ) - @forbid_nonstring_types(["bytes"]) - def lstrip(self, to_strip=None): - result = self._array._str_lstrip(to_strip) - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % {"side": "right side", "method": "rstrip", "position": "trailing"} - ) - @forbid_nonstring_types(["bytes"]) - def rstrip(self, to_strip=None): - result = self._array._str_rstrip(to_strip) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def wrap(self, width, **kwargs): - r""" - Wrap strings in Series/Index at specified line width. - - This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. - - Parameters - ---------- - width : int - Maximum line width. - expand_tabs : bool, optional - If True, tab characters will be expanded to spaces (default: True). - replace_whitespace : bool, optional - If True, each whitespace character (as defined by string.whitespace) - remaining after tab expansion will be replaced by a single space - (default: True). - drop_whitespace : bool, optional - If True, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True). - break_long_words : bool, optional - If True, then words longer than width will be broken in order to ensure - that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width (default: True). - break_on_hyphens : bool, optional - If True, wrapping will occur preferably on whitespace and right after - hyphens in compound words, as it is customary in English. 
If false, - only whitespaces will be considered as potentially good places for line - breaks, but you need to set break_long_words to false if you want truly - insecable words (default: True). - - Returns - ------- - Series or Index - - Notes - ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with - default settings. To achieve behavior matching R's stringr library str_wrap - function, use the arguments: - - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - result = self._array._str_wrap(width, **kwargs) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep="|"): - """ - Return DataFrame of dummy/indicator variables for Series. - - Each string in Series is split by sep and returned as a DataFrame - of dummy/indicator variables. - - Parameters - ---------- - sep : str, default "|" - String to split on. - - Returns - ------- - DataFrame - Dummy variables corresponding to values of the Series. - - See Also - -------- - get_dummies : Convert categorical variable into dummy/indicator - variables. - - Examples - -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 1 0 0 - 2 1 0 1 - - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - """ - # we need to cast to Series of strings as only that has all - # methods available for making the dummies... - result, name = self._array._str_get_dummies(sep) - return self._wrap_result( - result, - name=name, - expand=True, - returns_string=False, - ) - - @forbid_nonstring_types(["bytes"]) - def translate(self, table): - """ - Map all characters in the string through the given mapping table. - - Equivalent to standard :meth:`str.translate`. - - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or - None. Unmapped characters are left untouched. - Characters mapped to None are deleted. :meth:`str.maketrans` is a - helper function for making translation tables. - - Returns - ------- - Series or Index - """ - result = self._array._str_translate(table) - return self._wrap_result(result) - - @forbid_nonstring_types(["bytes"]) - def count(self, pat, flags=0): - r""" - Count occurrences of pattern in each string of the Series/Index. - - This function is used to count the number of times a particular regex - pattern is repeated in each of the string elements of the - :class:`~pandas.Series`. - - Parameters - ---------- - pat : str - Valid regular expression. - flags : int, default 0, meaning no flags - Flags for the `re` module. For a complete list, `see here - `_. - **kwargs - For compatibility with other string methods. Not used. - - Returns - ------- - Series or Index - Same type as the calling object containing the integer counts. - - See Also - -------- - re : Standard library module for regular expressions. - str.count : Standard library version, without regular expression support. - - Notes - ----- - Some characters need to be escaped when passing in `pat`. - eg. ``'$'`` has a special meaning in regex and must be escaped when - finding this literal character. 
- - Examples - -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0.0 - 1 0.0 - 2 2.0 - 3 2.0 - 4 NaN - 5 0.0 - 6 1.0 - dtype: float64 - - Escape ``'$'`` to find the literal dollar sign. - - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int64 - - This is also available on Index - - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') - """ - result = self._array._str_count(pat, flags) - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def startswith(self, pat, na=None): - """ - Test if the start of each string element matches a pattern. - - Equivalent to :meth:`str.startswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. The default depends - on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the start of each string element. - - See Also - -------- - str.startswith : Python standard library string method. - Series.str.endswith : Same as startswith, but tests the end of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 NaN - dtype: object - - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.startswith('b', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - result = self._array._str_startswith(pat, na=na) - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def endswith(self, pat, na=None): - """ - Test if the end of each string element matches a pattern. - - Equivalent to :meth:`str.endswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. The default depends - on dtype of the array. For object-dtype, ``numpy.nan`` is used. - For ``StringDtype``, ``pandas.NA`` is used. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the end of each string element. - - See Also - -------- - str.endswith : Python standard library string method. - Series.str.startswith : Same as endswith, but tests the start of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) - >>> s - 0 bat - 1 bear - 2 caT - 3 NaN - dtype: object - - >>> s.str.endswith('t') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.endswith('t', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - result = self._array._str_endswith(pat, na=na) - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def findall(self, pat, flags=0): - """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. 
- - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 - Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which - means no flags). - - Returns - ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. - - See Also - -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. - - Examples - -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - """ - result = self._array._str_findall(pat, flags) - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): - r""" - Extract capture groups in the regex `pat` as columns in a DataFrame. - - For each subject string in the Series, extract groups from the - first match of regular expression `pat`. - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that - modify regular expression matching for things like case, - spaces, etc. For more details, see :mod:`re`. - expand : bool, default True - If True, return DataFrame with one column per capture group. - If False, return a Series/Index if there is one capture group - or DataFrame if there are multiple capture groups. - - Returns - ------- - DataFrame or Series or Index - A DataFrame with one row for each subject string, and one - column for each group. Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. The dtype of each result - column is always object, even when no match is found. If - ``expand=False`` and pat has only one capture group, then - return a Series (if subject is a Series) or Index (if subject - is an Index). - - See Also - -------- - extractall : Returns all matches (not just the first match). - - Examples - -------- - A pattern with two groups will return a DataFrame with two columns. - Non-matches will be NaN. 
- - >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern may contain optional groups. - - >>> s.str.extract(r'([ab])?(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN 3 - - Named groups will become column names in the result. - - >>> s.str.extract(r'(?P[ab])(?P\d)') - letter digit - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern with one group will return a DataFrame with one column - if expand=True. - - >>> s.str.extract(r'[ab](\d)', expand=True) - 0 - 0 1 - 1 2 - 2 NaN - - A pattern with one group will return a Series if expand=False. - - >>> s.str.extract(r'[ab](\d)', expand=False) - 0 1 - 1 2 - 2 NaN - dtype: object - """ - # TODO: dispatch - return str_extract(self, pat, flags, expand=expand) - - @forbid_nonstring_types(["bytes"]) - def extractall(self, pat, flags=0): - r""" - Extract capture groups in the regex `pat` as columns in DataFrame. - - For each subject string in the Series, extract groups from all - matches of regular expression pat. When each subject string in the - Series has exactly one match, extractall(pat).xs(0, level='match') - is the same as extract(pat). - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - A ``re`` module flag, for example ``re.IGNORECASE``. These allow - to modify regular expression matching for things like case, spaces, - etc. Multiple flags can be combined with the bitwise OR operator, - for example ``re.IGNORECASE | re.MULTILINE``. - - Returns - ------- - DataFrame - A ``DataFrame`` with one row for each match, and one column for each - group. Its rows have a ``MultiIndex`` with first levels that come from - the subject ``Series``. The last level is named 'match' and indexes the - matches in each item of the ``Series``. Any capture group names in - regular expression pat will be used for column names; otherwise capture - group numbers will be used. - - See Also - -------- - extract : Returns first match only (not all matches). - - Examples - -------- - A pattern with one group will return a DataFrame with one column. - Indices with no matches will not appear in the result. - - >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) - >>> s.str.extractall(r"[ab](\d)") - 0 - match - A 0 1 - 1 2 - B 0 1 - - Capture group names are used for column names of the result. - - >>> s.str.extractall(r"[ab](?P\d)") - digit - match - A 0 1 - 1 2 - B 0 1 - - A pattern with two groups will return a DataFrame with two columns. - - >>> s.str.extractall(r"(?P[ab])(?P\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - - Optional groups that do not match are NaN in the result. - - >>> s.str.extractall(r"(?P[ab])?(?P\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - C 0 NaN 1 - """ - # TODO: dispatch - return str_extractall(self._orig, pat, flags) - - _shared_docs[ - "find" - ] = """ - Return %(side)s indexes in each strings in the Series/Index. - - Each of returned indexes corresponds to the position where the - substring is fully contained between [start:end]. Return -1 on - failure. Equivalent to standard :meth:`str.%(method)s`. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - - Returns - ------- - Series or Index of int. 
- - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["find"] - % { - "side": "lowest", - "method": "find", - "also": "rfind : Return highest indexes in each strings.", - } - ) - @forbid_nonstring_types(["bytes"]) - def find(self, sub, start=0, end=None): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - result = self._array._str_find(sub, start, end) - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["find"] - % { - "side": "highest", - "method": "rfind", - "also": "find : Return lowest indexes in each strings.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rfind(self, sub, start=0, end=None): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - result = self._array._str_rfind(sub, start=start, end=end) - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def normalize(self, form): - """ - Return the Unicode normal form for the strings in the Series/Index. - - For more information on the forms, see the - :func:`unicodedata.normalize`. - - Parameters - ---------- - form : {'NFC', 'NFKC', 'NFD', 'NFKD'} - Unicode form. - - Returns - ------- - normalized : Series/Index of objects - """ - result = self._array._str_normalize(form) - return self._wrap_result(result) - - _shared_docs[ - "index" - ] = """ - Return %(side)s indexes in each string in Series/Index. - - Each of the returned indexes corresponds to the position where the - substring is fully contained between [start:end]. This is the same - as ``str.%(similar)s`` except instead of returning -1, it raises a - ValueError when the substring is not found. Equivalent to standard - ``str.%(method)s``. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - - Returns - ------- - Series or Index of object - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["index"] - % { - "side": "lowest", - "similar": "find", - "method": "index", - "also": "rindex : Return highest indexes in each strings.", - } - ) - @forbid_nonstring_types(["bytes"]) - def index(self, sub, start=0, end=None): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - result = self._array._str_index(sub, start=start, end=end) - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["index"] - % { - "side": "highest", - "similar": "rfind", - "method": "rindex", - "also": "index : Return lowest indexes in each strings.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rindex(self, sub, start=0, end=None): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - result = self._array._str_rindex(sub, start=start, end=end) - return self._wrap_result(result, returns_string=False) - - def len(self): - """ - Compute the length of each element in the Series/Index. - - The element may be a sequence (such as a string, tuple or list) or a collection - (such as a dictionary). - - Returns - ------- - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. - - See Also - -------- - str.len : Python built-in function returning the length of an object. - Series.size : Returns the length of the Series. 
- - Examples - -------- - Returns the length (number of characters) in a string. Returns the - number of entries for dictionaries, lists or tuples. - - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... ('one', 'two', 'three')]) - >>> s - 0 dog - 1 - 2 5 - 3 {'foo': 'bar'} - 4 [2, 3, 5, 7] - 5 (one, two, three) - dtype: object - >>> s.str.len() - 0 3.0 - 1 0.0 - 2 NaN - 3 1.0 - 4 4.0 - 5 3.0 - dtype: float64 - """ - result = self._array._str_len() - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "casemethods" - ] = """ - Convert strings in the Series/Index to %(type)s. - %(version)s - Equivalent to :meth:`str.%(method)s`. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.lower : Converts all characters to lowercase. - Series.str.upper : Converts all characters to uppercase. - Series.str.title : Converts first character of each word to uppercase and - remaining to lowercase. - Series.str.capitalize : Converts first character to uppercase and - remaining to lowercase. - Series.str.swapcase : Converts uppercase to lowercase and lowercase to - uppercase. - Series.str.casefold: Removes all case distinctions in the string. - - Examples - -------- - >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - # Types: - # cases: - # upper, lower, title, capitalize, swapcase, casefold - # boolean: - # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle - # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} - _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} - _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} - _doc_args["capitalize"] = { - "type": "be capitalized", - "method": "capitalize", - "version": "", - } - _doc_args["swapcase"] = { - "type": "be swapcased", - "method": "swapcase", - "version": "", - } - _doc_args["casefold"] = { - "type": "be casefolded", - "method": "casefold", - "version": "\n .. 
versionadded:: 0.25.0\n", - } - - @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) - @forbid_nonstring_types(["bytes"]) - def lower(self): - result = self._array._str_lower() - return self._wrap_result(result) - - @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) - @forbid_nonstring_types(["bytes"]) - def upper(self): - result = self._array._str_upper() - return self._wrap_result(result) - - @Appender(_shared_docs["casemethods"] % _doc_args["title"]) - @forbid_nonstring_types(["bytes"]) - def title(self): - result = self._array._str_title() - return self._wrap_result(result) - - @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) - @forbid_nonstring_types(["bytes"]) - def capitalize(self): - result = self._array._str_capitalize() - return self._wrap_result(result) - - @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) - @forbid_nonstring_types(["bytes"]) - def swapcase(self): - result = self._array._str_swapcase() - return self._wrap_result(result) - - @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) - @forbid_nonstring_types(["bytes"]) - def casefold(self): - result = self._array._str_casefold() - return self._wrap_result(result) - - _shared_docs[ - "ismethods" - ] = """ - Check whether all characters in each string are %(type)s. - - This is equivalent to running the Python string method - :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. - - See Also - -------- - Series.str.isalpha : Check whether all characters are alphabetic. - Series.str.isnumeric : Check whether all characters are numeric. - Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. - Series.str.isdecimal : Check whether all characters are decimal. - Series.str.isspace : Check whether all characters are whitespace. - Series.str.islower : Check whether all characters are lowercase. - Series.str.isupper : Check whether all characters are uppercase. - Series.str.istitle : Check whether all characters are titlecase. - - Examples - -------- - **Checks for Alphabetic and Numeric Characters** - - >>> s1 = pd.Series(['one', 'one1', '1', '']) - - >>> s1.str.isalpha() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s1.str.isnumeric() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - >>> s1.str.isalnum() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - Note that checks against characters mixed with any additional punctuation - or whitespace will evaluate to false for an alphanumeric check. - - >>> s2 = pd.Series(['A B', '1.5', '3,000']) - >>> s2.str.isalnum() - 0 False - 1 False - 2 False - dtype: bool - - **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. - - >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - - >>> s3.str.isdecimal() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. 
- - >>> s3.str.isdigit() - 0 True - 1 True - 2 False - 3 False - dtype: bool - - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - **Checks for Whitespace** - - >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) - >>> s4.str.isspace() - 0 True - 1 True - 2 False - dtype: bool - - **Checks for Character Case** - - >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - - >>> s5.str.islower() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s5.str.isupper() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - The ``s5.str.istitle`` method checks for whether all words are in title - case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters separated by - whitespace characters. - - >>> s5.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"} - _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"} - _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"} - _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"} - _doc_args["islower"] = {"type": "lowercase", "method": "islower"} - _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"} - _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"} - _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"} - _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"} - # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) - - isalnum = _map_and_wrap( - "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] - ) - isalpha = _map_and_wrap( - "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] - ) - isdigit = _map_and_wrap( - "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] - ) - isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] - ) - islower = _map_and_wrap( - "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] - ) - isupper = _map_and_wrap( - "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] - ) - istitle = _map_and_wrap( - "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] - ) - isnumeric = _map_and_wrap( - "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] - ) - isdecimal = _map_and_wrap( - "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] - ) - - -def cat_safe(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat`. - - Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the arrays in list_of columns have the wrong dtypes or content. - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. 
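The nesting of the three numeric checks described above (decimal within digit within numeric) in a short, self-contained snippet; the sample values repeat the ``s3`` docstring example (illustrative aside, not part of the patch):

import pandas as pd

s3 = pd.Series(["23", "³", "⅕", ""])
checks = pd.DataFrame({
    "isdecimal": s3.str.isdecimal(),  # base-10 digits only
    "isdigit": s3.str.isdigit(),      # also super-/subscripted digits
    "isnumeric": s3.str.isnumeric(),  # also fractions and other numeric forms
})
# Each check implies the next one; the empty string is False for all three.
print(checks)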
- """ - try: - result = cat_core(list_of_columns, sep) - except TypeError: - # if there are any non-string values (wrong dtype or hidden behind - # object dtype), np.sum will fail; catch and return with better message - for column in list_of_columns: - dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ["string", "empty"]: - raise TypeError( - "Concatenation requires list-likes containing only " - "strings (or missing values). Offending values found in " - f"column {dtype}" - ) from None - return result - - -def cat_core(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat` - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - if sep == "": - # no need to interleave sep if it is empty - arr_of_cols = np.asarray(list_of_columns, dtype=object) - return np.sum(arr_of_cols, axis=0) - list_with_sep = [sep] * (2 * len(list_of_columns) - 1) - list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep, dtype=object) - return np.sum(arr_with_sep, axis=0) - - -def _groups_or_na_fun(regex): - """Used in both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - return f - - -def _result_dtype(arr): - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. - from pandas.core.arrays.string_ import StringDtype - - if isinstance(arr.dtype, StringDtype): - return arr.dtype.name - else: - return object - - -def _get_single_group_name(rx): - try: - return list(rx.groupindex.keys()).pop() - except IndexError: - return None - - -def _str_extract_noexpand(arr, pat, flags=0): - """ - Find groups in each string in the Series using passed regular - expression. This function is called from - str_extract(expand=False), and can return Series, DataFrame, or - Index. - - """ - from pandas import DataFrame, array - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - result_dtype = _result_dtype(arr) - - if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) - name = _get_single_group_name(regex) - # not dispatching, so we have to reconstruct here. - result = array(result, dtype=result_dtype) - else: - if isinstance(arr, ABCIndexClass): - raise ValueError("only one regex group is supported with Index") - name = None - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - if arr.size == 0: - result = DataFrame(columns=columns, dtype=object) - else: - dtype = _result_dtype(arr) - result = DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=arr.index, - dtype=dtype, - ) - return result, name - - -def _str_extract_frame(arr, pat, flags=0): - """ - For each subject string in the Series, extract groups from the - first match of regular expression pat. This function is called from - str_extract(expand=True), and always returns a DataFrame. 
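The separator handling in ``cat_core`` above is a slicing trick: odd slots of a scratch list hold the separator, even slots hold the columns, and ``np.sum`` over the object array concatenates element-wise. A standalone sketch of the same idea with hypothetical column data (not part of the diff):

import numpy as np

list_of_columns = [np.array(["a", "b"], dtype=object),
                   np.array(["1", "2"], dtype=object)]
sep = "-"

list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
list_with_sep[::2] = list_of_columns          # becomes [col0, sep, col1]

arr_with_sep = np.asarray(list_with_sep, dtype=object)
print(np.sum(arr_with_sep, axis=0))           # expected: ['a-1' 'b-2']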
- - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - - if len(arr) == 0: - return DataFrame(columns=columns, dtype=object) - try: - result_index = arr.index - except AttributeError: - result_index = None - dtype = _result_dtype(arr) - return DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=result_index, - dtype=dtype, - ) - - -def str_extract(arr, pat, flags=0, expand=True): - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - if expand: - result = _str_extract_frame(arr._orig, pat, flags=flags) - return result.__finalize__(arr._orig, method="str_extract") - else: - result, name = _str_extract_noexpand(arr._orig, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand) - - -def str_extractall(arr, pat, flags=0): - regex = re.compile(pat, flags=flags) - # the regex must contain capture groups. - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - - if isinstance(arr, ABCIndexClass): - arr = arr.to_series().reset_index(drop=True) - - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - match_list = [] - index_list = [] - is_mi = arr.index.nlevels > 1 - - for subject_key, subject in arr.items(): - if isinstance(subject, str): - - if not is_mi: - subject_key = (subject_key,) - - for match_i, match_tuple in enumerate(regex.findall(subject)): - if isinstance(match_tuple, str): - match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] - match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i,)) - index_list.append(result_key) - - from pandas import MultiIndex - - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - dtype = _result_dtype(arr) - - result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=dtype - ) - return result diff --git a/venv/lib/python3.8/site-packages/pandas/core/strings/base.py b/venv/lib/python3.8/site-packages/pandas/core/strings/base.py deleted file mode 100644 index 0806424..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/strings/base.py +++ /dev/null @@ -1,225 +0,0 @@ -import abc -from typing import Pattern, Union - -import numpy as np - -from pandas._typing import Scalar - - -class BaseStringArrayMethods(abc.ABC): - """ - Base class for extension arrays implementing string methods. - - This is where our ExtensionArrays can override the implementation of - Series.str.. We don't expect this to work with - 3rd-party extension arrays. - - * User calls Series.str. - * pandas extracts the extension array from the Series - * pandas calls ``extension_array._str_(*args, **kwargs)`` - * pandas wraps the result, to return to the user. - - See :ref:`Series.str` for the docstring of each method. 
- """ - - def _str_getitem(self, key): - if isinstance(key, slice): - return self._str_slice(start=key.start, stop=key.stop, step=key.step) - else: - return self._str_get(key) - - @abc.abstractmethod - def _str_count(self, pat, flags=0): - pass - - @abc.abstractmethod - def _str_pad(self, width, side="left", fillchar=" "): - pass - - @abc.abstractmethod - def _str_contains(self, pat, case=True, flags=0, na=None, regex=True): - pass - - @abc.abstractmethod - def _str_startswith(self, pat, na=None): - pass - - @abc.abstractmethod - def _str_endswith(self, pat, na=None): - pass - - @abc.abstractmethod - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - pass - - @abc.abstractmethod - def _str_repeat(self, repeats): - pass - - @abc.abstractmethod - def _str_match( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, - ): - pass - - @abc.abstractmethod - def _str_fullmatch( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, - ): - pass - - @abc.abstractmethod - def _str_encode(self, encoding, errors="strict"): - pass - - @abc.abstractmethod - def _str_find(self, sub, start=0, end=None): - pass - - @abc.abstractmethod - def _str_rfind(self, sub, start=0, end=None): - pass - - @abc.abstractmethod - def _str_findall(self, pat, flags=0): - pass - - @abc.abstractmethod - def _str_get(self, i): - pass - - @abc.abstractmethod - def _str_index(self, sub, start=0, end=None): - pass - - @abc.abstractmethod - def _str_rindex(self, sub, start=0, end=None): - pass - - @abc.abstractmethod - def _str_join(self, sep): - pass - - @abc.abstractmethod - def _str_partition(self, sep, expand): - pass - - @abc.abstractmethod - def _str_rpartition(self, sep, expand): - pass - - @abc.abstractmethod - def _str_len(self): - pass - - @abc.abstractmethod - def _str_slice(self, start=None, stop=None, step=None): - pass - - @abc.abstractmethod - def _str_slice_replace(self, start=None, stop=None, repl=None): - pass - - @abc.abstractmethod - def _str_translate(self, table): - pass - - @abc.abstractmethod - def _str_wrap(self, width, **kwargs): - pass - - @abc.abstractmethod - def _str_get_dummies(self, sep="|"): - pass - - @abc.abstractmethod - def _str_isalnum(self): - pass - - @abc.abstractmethod - def _str_isalpha(self): - pass - - @abc.abstractmethod - def _str_isdecimal(self): - pass - - @abc.abstractmethod - def _str_isdigit(self): - pass - - @abc.abstractmethod - def _str_islower(self): - pass - - @abc.abstractmethod - def _str_isnumeric(self): - pass - - @abc.abstractmethod - def _str_isspace(self): - pass - - @abc.abstractmethod - def _str_istitle(self): - pass - - @abc.abstractmethod - def _str_isupper(self): - pass - - @abc.abstractmethod - def _str_capitalize(self): - pass - - @abc.abstractmethod - def _str_casefold(self): - pass - - @abc.abstractmethod - def _str_title(self): - pass - - @abc.abstractmethod - def _str_swapcase(self): - pass - - @abc.abstractmethod - def _str_lower(self): - pass - - @abc.abstractmethod - def _str_upper(self): - pass - - @abc.abstractmethod - def _str_normalize(self, form): - pass - - @abc.abstractmethod - def _str_strip(self, to_strip=None): - pass - - @abc.abstractmethod - def _str_lstrip(self, to_strip=None): - pass - - @abc.abstractmethod - def _str_rstrip(self, to_strip=None): - pass - - @abc.abstractmethod - def _str_split(self, pat=None, n=-1, expand=False): - pass - - @abc.abstractmethod - def _str_rsplit(self, pat=None, n=-1): - pass diff --git 
a/venv/lib/python3.8/site-packages/pandas/core/strings/object_array.py b/venv/lib/python3.8/site-packages/pandas/core/strings/object_array.py deleted file mode 100644 index a29d84e..0000000 --- a/venv/lib/python3.8/site-packages/pandas/core/strings/object_array.py +++ /dev/null @@ -1,432 +0,0 @@ -import re -import textwrap -from typing import Pattern, Set, Union, cast -import unicodedata -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import Scalar - -from pandas.core.dtypes.common import is_re, is_scalar -from pandas.core.dtypes.missing import isna - -from pandas.core.strings.base import BaseStringArrayMethods - - -class ObjectStringArrayMixin(BaseStringArrayMethods): - """ - String Methods operating on object-dtype ndarrays. - """ - - _str_na_value = np.nan - - def __len__(self): - # For typing, _str_map relies on the object being sized. - raise NotImplementedError - - def _str_map(self, f, na_value=None, dtype=None): - """ - Map a callable over valid element of the array. - - Parameters - ---------- - f : Callable - A function to call on each non-NA element. - na_value : Scalar, optional - The value to set for NA values. Might also be used for the - fill value if the callable `f` raises an exception. - This defaults to ``self._str_na_value`` which is ``np.nan`` - for object-dtype and Categorical and ``pd.NA`` for StringArray. - dtype : Dtype, optional - The dtype of the result array. - """ - arr = self - if dtype is None: - dtype = np.dtype("object") - if na_value is None: - na_value = self._str_na_value - - if not len(arr): - return np.ndarray(0, dtype=dtype) - - if not isinstance(arr, np.ndarray): - arr = np.asarray(arr, dtype=object) - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" - ) - - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - # This type of fallback behavior can be removed once - # we remove object-dtype .str accessor. - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return self._str_map(g, na_value=na_value, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - - def _str_count(self, pat, flags=0): - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return self._str_map(f, dtype="int64") - - def _str_pad(self, width, side="left", fillchar=" "): - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - return self._str_map(f) - - def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. 
To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x.upper() - return self._str_map(f, na, dtype=np.dtype("bool")) - - def _str_startswith(self, pat, na=None): - f = lambda x: x.startswith(pat) - return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - - def _str_endswith(self, pat, na=None): - f = lambda x: x.endswith(pat) - return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - - def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - - is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) - else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with " - "regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") - f = lambda x: x.replace(pat, repl, n) - - return self._str_map(f, dtype=str) - - def _str_repeat(self, repeats): - if is_scalar(repeats): - - def scalar_rep(x): - try: - return bytes.__mul__(x, repeats) - except TypeError: - return str.__mul__(x, repeats) - - return self._str_map(scalar_rep, dtype=str) - else: - from pandas.core.arrays.string_ import StringArray - - def rep(x, r): - if x is libmissing.NA: - return x - try: - return bytes.__mul__(x, r) - except TypeError: - return str.__mul__(x, r) - - repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(self), repeats, rep) - if isinstance(self, StringArray): - # Not going through map, so we have to do this here. 
- result = StringArray._from_sequence(result) - return result - - def _str_match( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = None, - ): - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.match(x) is not None - return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - - def _str_fullmatch( - self, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = None, - ): - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.fullmatch(x) is not None - return self._str_map(f, na_value=na, dtype=np.dtype(bool)) - - def _str_encode(self, encoding, errors="strict"): - f = lambda x: x.encode(encoding, errors=errors) - return self._str_map(f, dtype=object) - - def _str_find(self, sub, start=0, end=None): - return self._str_find_(sub, start, end, side="left") - - def _str_rfind(self, sub, start=0, end=None): - return self._str_find_(sub, start, end, side="right") - - def _str_find_(self, sub, start, end, side): - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - return self._str_map(f, dtype="int64") - - def _str_findall(self, pat, flags=0): - regex = re.compile(pat, flags=flags) - return self._str_map(regex.findall, dtype="object") - - def _str_get(self, i): - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return self._str_na_value - - return self._str_map(f) - - def _str_index(self, sub, start=0, end=None): - if end: - f = lambda x: x.index(sub, start, end) - else: - f = lambda x: x.index(sub, start, end) - return self._str_map(f, dtype="int64") - - def _str_rindex(self, sub, start=0, end=None): - if end: - f = lambda x: x.rindex(sub, start, end) - else: - f = lambda x: x.rindex(sub, start, end) - return self._str_map(f, dtype="int64") - - def _str_join(self, sep): - return self._str_map(sep.join) - - def _str_partition(self, sep, expand): - result = self._str_map(lambda x: x.partition(sep), dtype="object") - return result - - def _str_rpartition(self, sep, expand): - return self._str_map(lambda x: x.rpartition(sep), dtype="object") - - def _str_len(self): - return self._str_map(len, dtype="int64") - - def _str_slice(self, start=None, stop=None, step=None): - obj = slice(start, stop, step) - return self._str_map(lambda x: x[obj]) - - def _str_slice_replace(self, start=None, stop=None, repl=None): - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return self._str_map(f) - - def _str_split(self, pat=None, n=-1, expand=False): - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - return self._str_map(f, dtype=object) - - def _str_rsplit(self, pat=None, n=-1): - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - return self._str_map(f, dtype="object") - - def _str_translate(self, table): - return self._str_map(lambda x: 
x.translate(table)) - - def _str_wrap(self, width, **kwargs): - kwargs["width"] = width - tw = textwrap.TextWrapper(**kwargs) - return self._str_map(lambda s: "\n".join(tw.wrap(s))) - - def _str_get_dummies(self, sep="|"): - from pandas import Series - - arr = Series(self).fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = cast(Series, arr) - arr = sep + arr.astype(str) + sep - arr = cast(Series, arr) - - tags: Set[str] = set() - for ts in Series(arr).str.split(sep): - tags.update(ts) - tags2 = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) - - for i, t in enumerate(tags2): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags2 - - def _str_upper(self): - return self._str_map(lambda x: x.upper()) - - def _str_isalnum(self): - return self._str_map(str.isalnum, dtype="bool") - - def _str_isalpha(self): - return self._str_map(str.isalpha, dtype="bool") - - def _str_isdecimal(self): - return self._str_map(str.isdecimal, dtype="bool") - - def _str_isdigit(self): - return self._str_map(str.isdigit, dtype="bool") - - def _str_islower(self): - return self._str_map(str.islower, dtype="bool") - - def _str_isnumeric(self): - return self._str_map(str.isnumeric, dtype="bool") - - def _str_isspace(self): - return self._str_map(str.isspace, dtype="bool") - - def _str_istitle(self): - return self._str_map(str.istitle, dtype="bool") - - def _str_isupper(self): - return self._str_map(str.isupper, dtype="bool") - - def _str_capitalize(self): - return self._str_map(str.capitalize) - - def _str_casefold(self): - return self._str_map(str.casefold) - - def _str_title(self): - return self._str_map(str.title) - - def _str_swapcase(self): - return self._str_map(str.swapcase) - - def _str_lower(self): - return self._str_map(str.lower) - - def _str_normalize(self, form): - f = lambda x: unicodedata.normalize(form, x) - return self._str_map(f) - - def _str_strip(self, to_strip=None): - return self._str_map(lambda x: x.strip(to_strip)) - - def _str_lstrip(self, to_strip=None): - return self._str_map(lambda x: x.lstrip(to_strip)) - - def _str_rstrip(self, to_strip=None): - return self._str_map(lambda x: x.rstrip(to_strip)) diff --git a/venv/lib/python3.8/site-packages/pandas/core/tools/datetimes.py b/venv/lib/python3.8/site-packages/pandas/core/tools/datetimes.py index 1553dee..7aac2f7 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/tools/datetimes.py +++ b/venv/lib/python3.8/site-packages/pandas/core/tools/datetimes.py @@ -16,20 +16,12 @@ import warnings import numpy as np -from pandas._libs import tslib -from pandas._libs.tslibs import ( - OutOfBoundsDatetime, - Timedelta, - Timestamp, - conversion, - iNaT, - nat_strings, - parsing, -) +from pandas._libs import tslib, tslibs +from pandas._libs.tslibs import Timestamp, conversion, parsing from pandas._libs.tslibs.parsing import ( # noqa DateParseError, - format_is_iso, - guess_datetime_format, + _format_is_iso, + _guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime from pandas._typing import ArrayLike, Label, Timezone @@ -61,9 +53,9 @@ from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas._libs.tslibs.nattype import NaTType + from pandas._libs.tslibs.nattype import NaTType # noqa:F401 - from pandas import Series + from pandas import Series # noqa:F401 # --------------------------------------------------------------------- # types used in 
annotations @@ -81,7 +73,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = notna(arr).nonzero()[0] if len(non_nan_elements): - return guess_datetime_format(arr[non_nan_elements[0]], **kwargs) + return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) def should_cache( @@ -315,7 +307,9 @@ def _convert_listlike_datetimes( if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == "utc": - arg = arg.tz_convert(None).tz_localize(tz) + # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has + # no attribute "tz_convert" + arg = arg.tz_convert(None).tz_localize(tz) # type: ignore return arg elif is_datetime64_ns_dtype(arg_dtype): @@ -395,7 +389,7 @@ def _convert_listlike_datetimes( # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case - format_is_iso8601 = format_is_iso(format) + format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None @@ -412,7 +406,7 @@ def _convert_listlike_datetimes( # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) - except (ValueError, TypeError, OutOfBoundsDatetime) as err: + except (ValueError, TypeError, tslibs.OutOfBoundsDatetime) as err: raise ValueError( "cannot convert the input to '%Y%m%d' date format" ) from err @@ -427,13 +421,13 @@ def _convert_listlike_datetimes( return _return_parsed_timezone_results( result, timezones, tz, name ) - except OutOfBoundsDatetime: + except tslibs.OutOfBoundsDatetime: if errors == "raise": raise elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult.fill(iNaT) + iresult.fill(tslibs.iNaT) else: result = arg except ValueError: @@ -446,7 +440,7 @@ def _convert_listlike_datetimes( elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult.fill(iNaT) + iresult.fill(tslibs.iNaT) else: result = arg except ValueError as e: @@ -516,7 +510,7 @@ def _adjust_to_origin(arg, origin, unit): j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): - raise OutOfBoundsDatetime( + raise tslibs.OutOfBoundsDatetime( f"{original} is Out of Bounds for origin='julian'" ) else: @@ -533,8 +527,10 @@ def _adjust_to_origin(arg, origin, unit): # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) - except OutOfBoundsDatetime as err: - raise OutOfBoundsDatetime(f"origin {origin} is Out of Bounds") from err + except tslibs.OutOfBoundsDatetime as err: + raise tslibs.OutOfBoundsDatetime( + f"origin {origin} is Out of Bounds" + ) from err except ValueError as err: raise ValueError( f"origin {origin} cannot be converted to a Timestamp" @@ -546,7 +542,7 @@ def _adjust_to_origin(arg, origin, unit): # convert the offset to the unit of the arg # this should be lossless in terms of precision - offset = offset // Timedelta(1, unit=unit) + offset = offset // tslibs.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance(arg, (ABCSeries, Index, np.ndarray)): @@ -686,6 +682,8 @@ def to_datetime( used when there are at least 50 values. The presence of out-of-bounds values will render the cache unusable and may slow down parsing. + .. 
versionadded:: 0.23.0 + .. versionchanged:: 0.25.0 - changed default value from False to True. @@ -815,7 +813,7 @@ dtype='datetime64[ns]', freq=None) elif is_list_like(arg): try: cache_array = _maybe_cache(arg, format, cache, convert_listlike) - except OutOfBoundsDatetime: + except tslibs.OutOfBoundsDatetime: # caching attempts to create a DatetimeIndex, which may raise # an OOB. If that's the desired behavior, then just reraise... if errors == "raise": @@ -971,7 +969,7 @@ def _attempt_YYYYMMDD(arg, errors): def calc_with_mask(carg, mask): result = np.empty(carg.shape, dtype="M8[ns]") iresult = result.view("i8") - iresult[~mask] = iNaT + iresult[~mask] = tslibs.iNaT masked_result = calc(carg[mask].astype(np.float64).astype(np.int64)) result[mask] = masked_result.astype("M8[ns]") @@ -992,7 +990,7 @@ def _attempt_YYYYMMDD(arg, errors): # string with NaN-like try: - mask = ~algorithms.isin(arg, list(nat_strings)) + mask = ~algorithms.isin(arg, list(tslibs.nat_strings)) return calc_with_mask(arg, mask) except (ValueError, OverflowError, TypeError): pass diff --git a/venv/lib/python3.8/site-packages/pandas/core/tools/numeric.py b/venv/lib/python3.8/site-packages/pandas/core/tools/numeric.py index 4af32b2..cff4695 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/tools/numeric.py +++ b/venv/lib/python3.8/site-packages/pandas/core/tools/numeric.py @@ -10,7 +10,6 @@ from pandas.core.dtypes.common import ( is_number, is_numeric_dtype, is_scalar, - needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -124,9 +123,8 @@ def to_numeric(arg, errors="raise", downcast=None): values = arg.values elif isinstance(arg, ABCIndexClass): is_index = True - if needs_i8_conversion(arg.dtype): - values = arg.asi8 - else: + values = arg.asi8 + if values is None: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype="O") @@ -188,7 +186,7 @@ def to_numeric(arg, errors="raise", downcast=None): break if is_series: - return arg._constructor(values, index=arg.index, name=arg.name) + return pd.Series(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy diff --git a/venv/lib/python3.8/site-packages/pandas/core/tools/timedeltas.py b/venv/lib/python3.8/site-packages/pandas/core/tools/timedeltas.py index 6a9fd7a..e457a88 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/tools/timedeltas.py +++ b/venv/lib/python3.8/site-packages/pandas/core/tools/timedeltas.py @@ -26,11 +26,6 @@ def to_timedelta(arg, unit=None, errors="raise"): ---------- arg : str, timedelta, list-like or Series The data to be converted to timedelta. - - .. deprecated:: 1.2 - Strings with units 'M', 'Y' and 'y' do not represent - unambiguous timedelta values and will be removed in a future version - unit : str, optional Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``. @@ -66,11 +61,6 @@ def to_timedelta(arg, unit=None, errors="raise"): to_datetime : Convert argument to datetime. convert_dtypes : Convert dtypes. - Notes - ----- - If the precision is higher than nanoseconds, the precision of the duration is - truncated to nanoseconds for string inputs. 
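The origin handling in the ``to_datetime`` hunk above shifts numeric input back to the requested epoch before conversion; a minimal sketch against the public signature (values and origin chosen only for illustration):

import pandas as pd

# Interpret the numbers as day offsets from a custom origin.
out = pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01"))
print(out)
# DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)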
- Examples -------- Parsing a single string to a Timedelta: @@ -100,7 +90,7 @@ def to_timedelta(arg, unit=None, errors="raise"): unit = parse_timedelta_unit(unit) if errors not in ("ignore", "raise", "coerce"): - raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.") + raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'}") if unit in {"Y", "y", "M"}: raise ValueError( diff --git a/venv/lib/python3.8/site-packages/pandas/core/tools/times.py b/venv/lib/python3.8/site-packages/pandas/core/tools/times.py index 643c116..3bac4cf 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/tools/times.py +++ b/venv/lib/python3.8/site-packages/pandas/core/tools/times.py @@ -5,9 +5,11 @@ import numpy as np from pandas._libs.lib import is_list_like -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import notna +from pandas.core.indexes.base import Index + def to_time(arg, format=None, infer_time_format=False, errors="raise"): """ @@ -103,7 +105,7 @@ def to_time(arg, format=None, infer_time_format=False, errors="raise"): elif isinstance(arg, ABCSeries): values = _convert_listlike(arg._values, format) return arg._constructor(values, index=arg.index, name=arg.name) - elif isinstance(arg, ABCIndexClass): + elif isinstance(arg, Index): return _convert_listlike(arg, format) elif is_list_like(arg): return _convert_listlike(arg, format) diff --git a/venv/lib/python3.8/site-packages/pandas/core/util/hashing.py b/venv/lib/python3.8/site-packages/pandas/core/util/hashing.py index df082c7..d79b9f4 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/util/hashing.py +++ b/venv/lib/python3.8/site-packages/pandas/core/util/hashing.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.generic import ( _default_hash_key = "0123456789123456" -def combine_hash_arrays(arrays, num_items: int): +def _combine_hash_arrays(arrays, num_items: int): """ Parameters ---------- @@ -108,7 +108,7 @@ def hash_pandas_object( for _ in [None] ) arrays = itertools.chain([h], index_iter) - h = combine_hash_arrays(arrays, 2) + h = _combine_hash_arrays(arrays, 2) h = Series(h, index=obj.index, dtype="uint64", copy=False) @@ -131,7 +131,7 @@ def hash_pandas_object( # keep `hashes` specifically a generator to keep mypy happy _hashes = itertools.chain(hashes, index_hash_generator) hashes = (x for x in _hashes) - h = combine_hash_arrays(hashes, num_items) + h = _combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype="uint64", copy=False) else: @@ -175,7 +175,7 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): hashes = ( _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals ) - h = combine_hash_arrays(hashes, len(vals)) + h = _combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] diff --git a/venv/lib/python3.8/site-packages/pandas/core/util/numba_.py b/venv/lib/python3.8/site-packages/pandas/core/util/numba_.py index ed920c1..c9b7943 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/util/numba_.py +++ b/venv/lib/python3.8/site-packages/pandas/core/util/numba_.py @@ -1,15 +1,17 @@ """Common utilities for Numba operations""" from distutils.version import LooseVersion +import inspect import types from typing import Callable, Dict, Optional, Tuple import numpy as np +from pandas._typing import FrameOrSeries from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError GLOBAL_USE_NUMBA: bool = 
False -NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = {} +NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = dict() def maybe_use_numba(engine: Optional[str]) -> bool: @@ -24,8 +26,37 @@ def set_use_numba(enable: bool = False) -> None: GLOBAL_USE_NUMBA = enable +def check_kwargs_and_nopython( + kwargs: Optional[Dict] = None, nopython: Optional[bool] = None +) -> None: + """ + Validate that **kwargs and nopython=True was passed + https://github.com/numba/numba/issues/2916 + + Parameters + ---------- + kwargs : dict, default None + user passed keyword arguments to pass into the JITed function + nopython : bool, default None + nopython parameter + + Returns + ------- + None + + Raises + ------ + NumbaUtilError + """ + if kwargs and nopython: + raise NumbaUtilError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + def get_jit_arguments( - engine_kwargs: Optional[Dict[str, bool]] = None, kwargs: Optional[Dict] = None + engine_kwargs: Optional[Dict[str, bool]] = None ) -> Tuple[bool, bool, bool]: """ Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. @@ -34,27 +65,16 @@ def get_jit_arguments( ---------- engine_kwargs : dict, default None user passed keyword arguments for numba.JIT - kwargs : dict, default None - user passed keyword arguments to pass into the JITed function Returns ------- (bool, bool, bool) nopython, nogil, parallel - - Raises - ------ - NumbaUtilError """ if engine_kwargs is None: engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) - if kwargs and nopython: - raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) return nopython, nogil, parallel @@ -109,3 +129,94 @@ def jit_user_function( return impl return numba_func + + +def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: + """ + Split pandas object into its components as numpy arrays for numba functions. + + Parameters + ---------- + arg : Series or DataFrame + + Returns + ------- + (ndarray, ndarray) + values, index + """ + return arg.to_numpy(), arg.index.to_numpy() + + +def validate_udf(func: Callable) -> None: + """ + Validate user defined function for ops when using Numba. + + The first signature arguments should include: + + def f(values, index, ...): + ... + + Parameters + ---------- + func : function, default False + user defined function + + Returns + ------- + None + + Raises + ------ + NumbaUtilError + """ + udf_signature = list(inspect.signature(func).parameters.keys()) + expected_args = ["values", "index"] + min_number_args = len(expected_args) + if ( + len(udf_signature) < min_number_args + or udf_signature[:min_number_args] != expected_args + ): + raise NumbaUtilError( + f"The first {min_number_args} arguments to {func.__name__} must be " + f"{expected_args}" + ) + + +def generate_numba_func( + func: Callable, + engine_kwargs: Optional[Dict[str, bool]], + kwargs: dict, + cache_key_str: str, +) -> Tuple[Callable, Tuple[Callable, str]]: + """ + Return a JITed function and cache key for the NUMBA_FUNC_CACHE + + This _may_ be specific to groupby (as it's only used there currently). 
+ + Parameters + ---------- + func : function + user defined function + engine_kwargs : dict or None + numba.jit arguments + kwargs : dict + kwargs for func + cache_key_str : str + string representing the second part of the cache key tuple + + Returns + ------- + (JITed function, cache key) + + Raises + ------ + NumbaUtilError + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + check_kwargs_and_nopython(kwargs, nopython) + validate_udf(func) + cache_key = (func, cache_key_str) + numba_func = NUMBA_FUNC_CACHE.get( + cache_key, jit_user_function(func, nopython, nogil, parallel) + ) + return numba_func, cache_key diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/__init__.py b/venv/lib/python3.8/site-packages/pandas/core/window/__init__.py index b3d0820..304c61a 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/__init__.py @@ -1,6 +1,3 @@ -from pandas.core.window.ewm import ( # noqa:F401 - ExponentialMovingWindow, - ExponentialMovingWindowGroupby, -) +from pandas.core.window.ewm import ExponentialMovingWindow # noqa:F401 from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/common.py b/venv/lib/python3.8/site-packages/pandas/core/window/common.py index 6ebf610..51a0674 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/common.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/common.py @@ -1,14 +1,15 @@ """Common utility functions for rolling operations""" from collections import defaultdict -from typing import cast +from typing import Callable, Optional import warnings import numpy as np from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.generic import _shared_docs +from pandas.core.groupby.base import GroupByMixin from pandas.core.indexes.api import MultiIndex -from pandas.core.shared_docs import _shared_docs _shared_docs = dict(**_shared_docs) _doc_template = """ @@ -26,7 +27,72 @@ _doc_template = """ """ -def flex_binary_moment(arg1, arg2, f, pairwise=False): +def _dispatch(name: str, *args, **kwargs): + """ + Dispatch to apply. + """ + + def outer(self, *args, **kwargs): + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return getattr(x, name)(*args, **kwargs) + + return self._groupby.apply(f) + + outer.__name__ = name + return outer + + +class WindowGroupByMixin(GroupByMixin): + """ + Provide the groupby facilities. + """ + + def __init__(self, obj, *args, **kwargs): + kwargs.pop("parent", None) + groupby = kwargs.pop("groupby", None) + if groupby is None: + groupby, obj = obj, obj._selected_obj + self._groupby = groupby + self._groupby.mutated = True + self._groupby.grouper.mutated = True + super().__init__(obj, *args, **kwargs) + + count = _dispatch("count") + corr = _dispatch("corr", other=None, pairwise=None) + cov = _dispatch("cov", other=None, pairwise=None) + + def _apply( + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + use_numba_cache: bool = False, + **kwargs, + ): + """ + Dispatch to apply; we are stripping all of the _apply kwargs and + performing the original function call on the grouped object. + """ + kwargs.pop("floor", None) + kwargs.pop("original_func", None) + + # TODO: can we de-duplicate with _dispatch? 
+ def f(x, name=name, *args): + x = self._shallow_copy(x) + + if isinstance(name, str): + return getattr(x, name)(*args, **kwargs) + + return x.apply(name, *args, **kwargs) + + return self._groupby.apply(f) + + +def _flex_binary_moment(arg1, arg2, f, pairwise=False): if not ( isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) @@ -110,9 +176,6 @@ def flex_binary_moment(arg1, arg2, f, pairwise=False): # set the index and reorder if arg2.columns.nlevels > 1: - # mypy needs to know columns is a MultiIndex, Index doesn't - # have levels attribute - arg2.columns = cast(MultiIndex, arg2.columns) result.index = MultiIndex.from_product( arg2.columns.levels + [result_index] ) @@ -159,7 +222,7 @@ def flex_binary_moment(arg1, arg2, f, pairwise=False): return dataframe_from_int_dict(results, arg1) else: - return flex_binary_moment(arg2, arg1, f) + return _flex_binary_moment(arg2, arg1, f) def zsqrt(x): diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/ewm.py b/venv/lib/python3.8/site-packages/pandas/core/window/ewm.py index f8237a4..7a2d8e8 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/ewm.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/ewm.py @@ -1,7 +1,7 @@ import datetime from functools import partial from textwrap import dedent -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import numpy as np @@ -12,26 +12,12 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.common import is_datetime64_ns_dtype +from pandas.core.dtypes.generic import ABCDataFrame +from pandas.core.base import DataError import pandas.core.common as common -from pandas.core.util.numba_ import maybe_use_numba -from pandas.core.window.common import ( - _doc_template, - _shared_docs, - flex_binary_moment, - zsqrt, -) -from pandas.core.window.indexers import ( - BaseIndexer, - ExponentialMovingWindowIndexer, - GroupbyIndexer, -) -from pandas.core.window.numba_ import generate_numba_groupby_ewma_func -from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch - -if TYPE_CHECKING: - from pandas import Series - +from pandas.core.window.common import _doc_template, _shared_docs, zsqrt +from pandas.core.window.rolling import _flex_binary_moment, _Rolling _bias_template = """ Parameters @@ -76,16 +62,7 @@ def get_center_of_mass( return float(comass) -def wrap_result(obj: "Series", result: np.ndarray) -> "Series": - """ - Wrap a single 1D result. - """ - obj = obj._selected_obj - - return obj._constructor(result, obj.index, name=obj.name) - - -class ExponentialMovingWindow(BaseWindow): +class ExponentialMovingWindow(_Rolling): r""" Provide exponential weighted (EW) functions. @@ -184,7 +161,7 @@ class ExponentialMovingWindow(BaseWindow): ----- More details can be found at: - :ref:`Exponentially weighted windows `. + :ref:`Exponentially weighted windows `. 
Examples -------- @@ -231,16 +208,14 @@ class ExponentialMovingWindow(BaseWindow): ignore_na: bool = False, axis: int = 0, times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, - **kwargs, ): + self.com: Optional[float] self.obj = obj self.min_periods = max(int(min_periods), 1) self.adjust = adjust self.ignore_na = ignore_na self.axis = axis self.on = None - self.center = False - self.closed = None if times is not None: if isinstance(times, str): times = self._selected_obj[times] @@ -259,7 +234,7 @@ class ExponentialMovingWindow(BaseWindow): if common.count_not_none(com, span, alpha) > 0: self.com = get_center_of_mass(com, span, None, alpha) else: - self.com = 0.0 + self.com = None else: if halflife is not None and isinstance(halflife, (str, datetime.timedelta)): raise ValueError( @@ -274,12 +249,6 @@ class ExponentialMovingWindow(BaseWindow): def _constructor(self): return ExponentialMovingWindow - def _get_window_indexer(self) -> BaseIndexer: - """ - Return an indexer class that will compute the window start and end bounds - """ - return ExponentialMovingWindowIndexer() - _agg_see_also_doc = dedent( """ See Also @@ -311,6 +280,7 @@ class ExponentialMovingWindow(BaseWindow): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, + versionadded="", klass="Series/Dataframe", axis="", ) @@ -319,6 +289,44 @@ class ExponentialMovingWindow(BaseWindow): agg = aggregate + def _apply(self, func): + """ + Rolling statistical measure using supplied function. Designed to be + used with passed-in Cython array-based functions. + + Parameters + ---------- + func : str/callable to apply + + Returns + ------- + y : same type as input argument + """ + blocks, obj = self._create_blocks(self._selected_obj) + block_list = list(blocks) + + results = [] + exclude = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError) as err: + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError("No numeric types to aggregate") from err + + if values.size == 0: + results.append(values.copy()) + continue + + results.append(np.apply_along_axis(func, self.axis, values)) + + return self._wrap_results(results, block_list, obj, exclude) + @Substitution(name="ewm", func_name="mean") @Appender(_doc_template) def mean(self, *args, **kwargs): @@ -335,6 +343,7 @@ class ExponentialMovingWindow(BaseWindow): window_func = self._get_roll_func("ewma_time") window_func = partial( window_func, + minp=self.min_periods, times=self.times, halflife=self.halflife, ) @@ -345,6 +354,7 @@ class ExponentialMovingWindow(BaseWindow): com=self.com, adjust=self.adjust, ignore_na=self.ignore_na, + minp=self.min_periods, ) return self._apply(window_func) @@ -368,19 +378,13 @@ class ExponentialMovingWindow(BaseWindow): Exponential weighted moving variance. 
""" nv.validate_window_func("var", args, kwargs) - window_func = self._get_roll_func("ewmcov") - window_func = partial( - window_func, - com=self.com, - adjust=self.adjust, - ignore_na=self.ignore_na, - bias=bias, - ) - def var_func(values, begin, end, min_periods): - return window_func(values, begin, end, min_periods, values) + def f(arg): + return window_aggregations.ewmcov( + arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias, + ) - return self._apply(var_func) + return self._apply(f) @Substitution(name="ewm", func_name="cov") @Appender(_doc_template) @@ -422,18 +426,16 @@ class ExponentialMovingWindow(BaseWindow): Y = self._shallow_copy(Y) cov = window_aggregations.ewmcov( X._prep_values(), - np.array([0], dtype=np.int64), - np.array([0], dtype=np.int64), - self.min_periods, Y._prep_values(), self.com, self.adjust, self.ignore_na, + self.min_periods, bias, ) - return wrap_result(X, cov) + return X._wrap_result(cov) - return flex_binary_moment( + return _flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -475,15 +477,7 @@ class ExponentialMovingWindow(BaseWindow): def _cov(x, y): return window_aggregations.ewmcov( - x, - np.array([0], dtype=np.int64), - np.array([0], dtype=np.int64), - self.min_periods, - y, - self.com, - self.adjust, - self.ignore_na, - 1, + x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1, ) x_values = X._prep_values() @@ -493,83 +487,8 @@ class ExponentialMovingWindow(BaseWindow): x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) corr = cov / zsqrt(x_var * y_var) - return wrap_result(X, corr) + return X._wrap_result(corr) - return flex_binary_moment( + return _flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) - - -class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): - """ - Provide an exponential moving window groupby implementation. - """ - - def _get_window_indexer(self) -> GroupbyIndexer: - """ - Return an indexer class that will compute the window start and end bounds - - Returns - ------- - GroupbyIndexer - """ - window_indexer = GroupbyIndexer( - groupby_indicies=self._groupby.indices, - window_indexer=ExponentialMovingWindowIndexer, - ) - return window_indexer - - var = dispatch("var", bias=False) - std = dispatch("std", bias=False) - cov = dispatch("cov", other=None, pairwise=None, bias=False) - corr = dispatch("corr", other=None, pairwise=None) - - def mean(self, engine=None, engine_kwargs=None): - """ - Parameters - ---------- - engine : str, default None - * ``'cython'`` : Runs mean through C-extensions from cython. - * ``'numba'`` : Runs mean through JIT compiled code from numba. - Only available when ``raw`` is set to ``True``. - * ``None`` : Defaults to ``'cython'`` or globally setting - ``compute.use_numba`` - - .. versionadded:: 1.2.0 - - engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}``. - - .. versionadded:: 1.2.0 - - Returns - ------- - Series or DataFrame - Return type is determined by the caller. 
- """ - if maybe_use_numba(engine): - groupby_ewma_func = generate_numba_groupby_ewma_func( - engine_kwargs, - self.com, - self.adjust, - self.ignore_na, - ) - return self._apply( - groupby_ewma_func, - numba_cache_key=(lambda x: x, "groupby_ewma"), - ) - elif engine in ("cython", None): - if engine_kwargs is not None: - raise ValueError("cython engine does not accept engine_kwargs") - - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return x.mean() - - return self._groupby.apply(f) - else: - raise ValueError("engine must be either 'numba' or 'cython'") diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/expanding.py b/venv/lib/python3.8/site-packages/pandas/core/window/expanding.py index 94875ba..ce4ab2f 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/expanding.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/expanding.py @@ -1,18 +1,14 @@ from textwrap import dedent -from typing import Any, Callable, Dict, Optional, Tuple, Union +from typing import Dict, Optional -import numpy as np - -from pandas._typing import FrameOrSeries from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc -from pandas.core.window.common import _doc_template, _shared_docs -from pandas.core.window.indexers import BaseIndexer, ExpandingIndexer, GroupbyIndexer -from pandas.core.window.rolling import BaseWindowGroupby, RollingAndExpandingMixin +from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs +from pandas.core.window.rolling import _Rolling_and_Expanding -class Expanding(RollingAndExpandingMixin): +class Expanding(_Rolling_and_Expanding): """ Provide expanding transformations. @@ -68,17 +64,9 @@ class Expanding(RollingAndExpandingMixin): def _constructor(self): return Expanding - def _get_window_indexer(self) -> BaseIndexer: + def _get_window(self, other=None, **kwargs): """ - Return an indexer class that will compute the window start and end bounds - """ - return ExpandingIndexer() - - def _get_cov_corr_window( - self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None, **kwargs - ) -> int: - """ - Get the window length over which to perform cov and corr operations. + Get the window length over which to perform some operation. 
Parameters ---------- @@ -129,6 +117,7 @@ class Expanding(RollingAndExpandingMixin): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, + versionadded="", klass="Series/Dataframe", axis="", ) @@ -139,19 +128,19 @@ class Expanding(RollingAndExpandingMixin): @Substitution(name="expanding") @Appender(_shared_docs["count"]) - def count(self): - return super().count() + def count(self, **kwargs): + return super().count(**kwargs) @Substitution(name="expanding") @Appender(_shared_docs["apply"]) def apply( self, - func: Callable[..., Any], + func, raw: bool = False, engine: Optional[str] = None, engine_kwargs: Optional[Dict[str, bool]] = None, - args: Optional[Tuple[Any, ...]] = None, - kwargs: Optional[Dict[str, Any]] = None, + args=None, + kwargs=None, ): return super().apply( func, @@ -194,21 +183,16 @@ class Expanding(RollingAndExpandingMixin): @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["std"]) - def std(self, ddof: int = 1, *args, **kwargs): + def std(self, ddof=1, *args, **kwargs): nv.validate_expanding_func("std", args, kwargs) return super().std(ddof=ddof, **kwargs) @Substitution(name="expanding", versionadded="") @Appender(_shared_docs["var"]) - def var(self, ddof: int = 1, *args, **kwargs): + def var(self, ddof=1, *args, **kwargs): nv.validate_expanding_func("var", args, kwargs) return super().var(ddof=ddof, **kwargs) - @Substitution(name="expanding") - @Appender(_shared_docs["sem"]) - def sem(self, ddof: int = 1, *args, **kwargs): - return super().sem(ddof=ddof, **kwargs) - @Substitution(name="expanding", func_name="skew") @Appender(_doc_template) @Appender(_shared_docs["skew"]) @@ -256,41 +240,20 @@ class Expanding(RollingAndExpandingMixin): @Substitution(name="expanding", func_name="cov") @Appender(_doc_template) @Appender(_shared_docs["cov"]) - def cov( - self, - other: Optional[Union[np.ndarray, FrameOrSeries]] = None, - pairwise: Optional[bool] = None, - ddof: int = 1, - **kwargs, - ): + def cov(self, other=None, pairwise=None, ddof=1, **kwargs): return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) @Substitution(name="expanding") @Appender(_shared_docs["corr"]) - def corr( - self, - other: Optional[Union[np.ndarray, FrameOrSeries]] = None, - pairwise: Optional[bool] = None, - **kwargs, - ): + def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) -class ExpandingGroupby(BaseWindowGroupby, Expanding): +class ExpandingGroupby(WindowGroupByMixin, Expanding): """ Provide a expanding groupby implementation. 
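Editor's note: the removed `sem` override above delegated to the shared implementation, which (as seen later in this diff) is `std / sqrt(count - ddof)`. A short sketch reproducing it by hand; the expected values come from the removed docstring example further down:

```python
import pandas as pd

s = pd.Series([0.0, 1.0, 2.0, 3.0])
ddof = 1
sem_manual = s.expanding().std(ddof=ddof) / (s.expanding().count() - ddof).pow(0.5)
print(sem_manual)
# 0         NaN
# 1    0.707107
# 2    0.707107
# 3    0.745356
# dtype: float64
```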
""" - def _get_window_indexer(self) -> GroupbyIndexer: - """ - Return an indexer class that will compute the window start and end bounds - - Returns - ------- - GroupbyIndexer - """ - window_indexer = GroupbyIndexer( - groupby_indicies=self._groupby.indices, - window_indexer=ExpandingIndexer, - ) - return window_indexer + @property + def _constructor(self): + return Expanding diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/indexers.py b/venv/lib/python3.8/site-packages/pandas/core/window/indexers.py index a3b9695..7c76a8e 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/indexers.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/indexers.py @@ -40,7 +40,7 @@ class BaseIndexer: """Base class for window bounds calculations.""" def __init__( - self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, ): """ Parameters @@ -78,21 +78,17 @@ class FixedWindowIndexer(BaseIndexer): closed: Optional[str] = None, ) -> Tuple[np.ndarray, np.ndarray]: - if center: - offset = (self.window_size - 1) // 2 - else: - offset = 0 - - end = np.arange(1 + offset, num_values + 1 + offset, dtype="int64") - start = end - self.window_size - if closed in ["left", "both"]: - start -= 1 - if closed in ["left", "neither"]: - end -= 1 - - end = np.clip(end, 0, num_values) - start = np.clip(start, 0, num_values) + start_s = np.zeros(self.window_size, dtype="int64") + start_e = ( + np.arange(self.window_size, num_values, dtype="int64") + - self.window_size + + 1 + ) + start = np.concatenate([start_s, start_e])[:num_values] + end_s = np.arange(self.window_size, dtype="int64") + 1 + end_e = start_e + self.window_size + end = np.concatenate([end_s, end_e])[:num_values] return start, end @@ -109,7 +105,7 @@ class VariableWindowIndexer(BaseIndexer): ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array + num_values, self.window_size, min_periods, center, closed, self.index_array, ) @@ -263,38 +259,26 @@ class FixedForwardWindowIndexer(BaseIndexer): return start, end -class GroupbyIndexer(BaseIndexer): +class GroupbyRollingIndexer(BaseIndexer): """Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()""" def __init__( self, - index_array: Optional[np.ndarray] = None, - window_size: int = 0, - groupby_indicies: Optional[Dict] = None, - window_indexer: Type[BaseIndexer] = BaseIndexer, - indexer_kwargs: Optional[Dict] = None, + index_array: Optional[np.ndarray], + window_size: int, + groupby_indicies: Dict, + rolling_indexer: Type[BaseIndexer], + indexer_kwargs: Optional[Dict], **kwargs, ): """ Parameters ---------- - index_array : np.ndarray or None - np.ndarray of the index of the original object that we are performing - a chained groupby operation over. 
This index has been pre-sorted relative to - the groups - window_size : int - window size during the windowing operation - groupby_indicies : dict or None - dict of {group label: [positional index of rows belonging to the group]} - window_indexer : BaseIndexer - BaseIndexer class determining the start and end bounds of each group - indexer_kwargs : dict or None - Custom kwargs to be passed to window_indexer **kwargs : keyword arguments that will be available when get_window_bounds is called """ - self.groupby_indicies = groupby_indicies or {} - self.window_indexer = window_indexer + self.groupby_indicies = groupby_indicies + self.rolling_indexer = rolling_indexer self.indexer_kwargs = indexer_kwargs or {} super().__init__( index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs @@ -319,7 +303,7 @@ class GroupbyIndexer(BaseIndexer): index_array = self.index_array.take(ensure_platform_int(indices)) else: index_array = self.index_array - indexer = self.window_indexer( + indexer = self.rolling_indexer( index_array=index_array, window_size=self.window_size, **self.indexer_kwargs, @@ -332,7 +316,7 @@ class GroupbyIndexer(BaseIndexer): # Cannot use groupby_indicies as they might not be monotonic with the object # we're rolling over window_indicies = np.arange( - window_indicies_start, window_indicies_start + len(indices) + window_indicies_start, window_indicies_start + len(indices), ) window_indicies_start += len(indices) # Extend as we'll be slicing window like [start, end) @@ -343,19 +327,10 @@ class GroupbyIndexer(BaseIndexer): end_arrays.append(window_indicies.take(ensure_platform_int(end))) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) + # GH 35552: Need to adjust start and end based on the nans appended to values + # when center=True + if num_values > len(start): + offset = num_values - len(start) + start = np.concatenate([start, np.array([end[-1]] * offset)]) + end = np.concatenate([end, np.array([end[-1]] * offset)]) return start, end - - -class ExponentialMovingWindowIndexer(BaseIndexer): - """Calculate ewm window bounds (the entire window)""" - - @Appender(get_window_bounds_doc) - def get_window_bounds( - self, - num_values: int = 0, - min_periods: Optional[int] = None, - center: Optional[bool] = None, - closed: Optional[str] = None, - ) -> Tuple[np.ndarray, np.ndarray]: - - return np.array([0], dtype=np.int64), np.array([num_values], dtype=np.int64) diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/numba_.py b/venv/lib/python3.8/site-packages/pandas/core/window/numba_.py index 274586e..5d35ec7 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/numba_.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/numba_.py @@ -6,7 +6,7 @@ from pandas._typing import Scalar from pandas.compat._optional import import_optional_dependency from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, + check_kwargs_and_nopython, get_jit_arguments, jit_user_function, ) @@ -42,14 +42,14 @@ def generate_numba_apply_func( ------- Numba function """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs) + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - cache_key = (func, "rolling_apply") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] + check_kwargs_and_nopython(kwargs, nopython) numba_func = jit_user_function(func, nopython, nogil, parallel) + numba = import_optional_dependency("numba") + if parallel: loop_range = numba.prange else: @@ -57,7 +57,7 @@ def 
generate_numba_apply_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( - values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, ) -> np.ndarray: result = np.empty(len(begin)) for i in loop_range(len(result)): @@ -72,92 +72,3 @@ def generate_numba_apply_func( return result return roll_apply - - -def generate_numba_groupby_ewma_func( - engine_kwargs: Optional[Dict[str, bool]], - com: float, - adjust: bool, - ignore_na: bool, -): - """ - Generate a numba jitted groupby ewma function specified by values - from engine_kwargs. - - Parameters - ---------- - engine_kwargs : dict - dictionary of arguments to be passed into numba.jit - com : float - adjust : bool - ignore_na : bool - - Returns - ------- - Numba function - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - - cache_key = (lambda x: x, "groupby_ewma") - if cache_key in NUMBA_FUNC_CACHE: - return NUMBA_FUNC_CACHE[cache_key] - - numba = import_optional_dependency("numba") - if parallel: - loop_range = numba.prange - else: - loop_range = range - - @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def groupby_ewma( - values: np.ndarray, - begin: np.ndarray, - end: np.ndarray, - minimum_periods: int, - ) -> np.ndarray: - result = np.empty(len(values)) - alpha = 1.0 / (1.0 + com) - for i in loop_range(len(begin)): - start = begin[i] - stop = end[i] - window = values[start:stop] - sub_result = np.empty(len(window)) - - old_wt_factor = 1.0 - alpha - new_wt = 1.0 if adjust else alpha - - weighted_avg = window[0] - nobs = int(not np.isnan(weighted_avg)) - sub_result[0] = weighted_avg if nobs >= minimum_periods else np.nan - old_wt = 1.0 - - for j in range(1, len(window)): - cur = window[j] - is_observation = not np.isnan(cur) - nobs += is_observation - if not np.isnan(weighted_avg): - - if is_observation or not ignore_na: - - old_wt *= old_wt_factor - if is_observation: - - # avoid numerical errors on constant series - if weighted_avg != cur: - weighted_avg = ( - (old_wt * weighted_avg) + (new_wt * cur) - ) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1.0 - elif is_observation: - weighted_avg = cur - - sub_result[j] = weighted_avg if nobs >= minimum_periods else np.nan - - result[start:stop] = sub_result - - return result - - return groupby_ewma diff --git a/venv/lib/python3.8/site-packages/pandas/core/window/rolling.py b/venv/lib/python3.8/site-packages/pandas/core/window/rolling.py index e6185f8..237c29a 100644 --- a/venv/lib/python3.8/site-packages/pandas/core/window/rolling.py +++ b/venv/lib/python3.8/site-packages/pandas/core/window/rolling.py @@ -6,25 +6,13 @@ from datetime import timedelta from functools import partial import inspect from textwrap import dedent -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - List, - Optional, - Set, - Tuple, - Type, - Union, -) -import warnings +from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union import numpy as np from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import Axis, FrameOrSeries, Scalar from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -46,36 +34,104 @@ 
from pandas.core.dtypes.generic import ( ABCSeries, ABCTimedeltaIndex, ) -from pandas.core.dtypes.missing import notna -from pandas.core.aggregation import aggregate -from pandas.core.base import DataError, SelectionMixin +from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin +import pandas.core.common as com from pandas.core.construction import extract_array -from pandas.core.groupby.base import GotItemMixin, ShallowMixin -from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.indexes.api import Index, MultiIndex, ensure_index from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( + WindowGroupByMixin, _doc_template, + _flex_binary_moment, _shared_docs, - flex_binary_moment, zsqrt, ) from pandas.core.window.indexers import ( BaseIndexer, FixedWindowIndexer, - GroupbyIndexer, + GroupbyRollingIndexer, VariableWindowIndexer, ) from pandas.core.window.numba_ import generate_numba_apply_func -if TYPE_CHECKING: - from pandas import DataFrame, Series - from pandas.core.internals import Block # noqa:F401 + +def calculate_center_offset(window) -> int: + """ + Calculate an offset necessary to have the window label to be centered. + + Parameters + ---------- + window: ndarray or int + window weights or window + + Returns + ------- + int + """ + if not is_integer(window): + window = len(window) + return int((window - 1) / 2.0) -class BaseWindow(ShallowMixin, SelectionMixin): - """Provides utilities for performing windowing operations.""" +def calculate_min_periods( + window: int, + min_periods: Optional[int], + num_values: int, + required_min_periods: int, + floor: int, +) -> int: + """ + Calculate final minimum periods value for rolling aggregations. + Parameters + ---------- + window : passed window value + min_periods : passed min periods value + num_values : total number of values + required_min_periods : required min periods per aggregation function + floor : required min periods per aggregation function + + Returns + ------- + min_periods : int + """ + if min_periods is None: + min_periods = window + else: + min_periods = max(required_min_periods, min_periods) + if min_periods > window: + raise ValueError(f"min_periods {min_periods} must be <= window {window}") + elif min_periods > num_values: + min_periods = num_values + 1 + elif min_periods < 0: + raise ValueError("min_periods must be >= 0") + return max(min_periods, floor) + + +def get_weighted_roll_func(cfunc: Callable) -> Callable: + """ + Wrap weighted rolling cython function with min periods argument. 
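Editor's note: a worked example of the `calculate_min_periods` resolution added above; the function is copied standalone here purely for illustration.

```python
from typing import Optional

def resolve_min_periods(window: int, min_periods: Optional[int],
                        num_values: int, required: int, floor: int) -> int:
    """Standalone copy of calculate_min_periods for illustration."""
    if min_periods is None:
        min_periods = window
    else:
        min_periods = max(required, min_periods)
    if min_periods > window:
        raise ValueError(f"min_periods {min_periods} must be <= window {window}")
    elif min_periods > num_values:
        min_periods = num_values + 1
    elif min_periods < 0:
        raise ValueError("min_periods must be >= 0")
    return max(min_periods, floor)

print(resolve_min_periods(5, None, 100, 0, 1))  # 5: defaults to the window size
print(resolve_min_periods(5, 2, 100, 3, 1))     # 3: raised to the required minimum
print(resolve_min_periods(5, 3, 2, 0, 1))       # 3: exceeds num_values, so num_values + 1
```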
+ + Parameters + ---------- + cfunc : function + Cython weighted rolling function + + Returns + ------- + function + """ + + def func(arg, window, min_periods=None): + if min_periods is None: + min_periods = len(window) + return cfunc(arg, window, min_periods) + + return func + + +class _Window(PandasObject, ShallowMixin, SelectionMixin): _attributes: List[str] = [ "window", "min_periods", @@ -112,6 +168,10 @@ class BaseWindow(ShallowMixin, SelectionMixin): self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() + @property + def _constructor(self): + return Window + @property def is_datetimelike(self) -> Optional[bool]: return None @@ -127,15 +187,8 @@ class BaseWindow(ShallowMixin, SelectionMixin): def validate(self) -> None: if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") - if self.min_periods is not None: - if not is_integer(self.min_periods): - raise ValueError("min_periods must be an integer") - elif self.min_periods < 0: - raise ValueError("min_periods must be >= 0") - elif is_integer(self.window) and self.min_periods > self.window: - raise ValueError( - f"min_periods {self.min_periods} must be <= window {self.window}" - ) + if self.min_periods is not None and not is_integer(self.min_periods): + raise ValueError("min_periods must be an integer") if self.closed is not None and self.closed not in [ "right", "both", @@ -146,21 +199,27 @@ class BaseWindow(ShallowMixin, SelectionMixin): if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): raise TypeError(f"invalid type: {type(self)}") if isinstance(self.window, BaseIndexer): - # Validate that the passed BaseIndexer subclass has - # a get_window_bounds with the correct signature. - get_window_bounds_signature = inspect.signature( - self.window.get_window_bounds - ).parameters.keys() - expected_signature = inspect.signature( - BaseIndexer().get_window_bounds - ).parameters.keys() - if get_window_bounds_signature != expected_signature: - raise ValueError( - f"{type(self.window).__name__} does not implement " - f"the correct signature for get_window_bounds" - ) + self._validate_get_window_bounds_signature(self.window) - def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: + @staticmethod + def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: + """ + Validate that the passed BaseIndexer subclass has + a get_window_bounds with the correct signature. + """ + get_window_bounds_signature = inspect.signature( + window.get_window_bounds + ).parameters.keys() + expected_signature = inspect.signature( + BaseIndexer().get_window_bounds + ).parameters.keys() + if get_window_bounds_signature != expected_signature: + raise ValueError( + f"{type(window).__name__} does not implement the correct signature for " + f"get_window_bounds" + ) + + def _create_blocks(self, obj: FrameOrSeries): """ Split data into blocks & return conformed data. """ @@ -168,14 +227,9 @@ class BaseWindow(ShallowMixin, SelectionMixin): if self.on is not None and not isinstance(self.on, Index): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - if self.axis == 1: - # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything - # to float to calculate the complete row at once. We exclude all non-numeric - # dtypes. 
- obj = obj.select_dtypes(include=["integer", "float"], exclude=["timedelta"]) - obj = obj.astype("float64", copy=False) - obj._mgr = obj._mgr.consolidate() - return obj + blocks = obj._to_dict_of_blocks(copy=False).values() + + return blocks, obj def _gotitem(self, key, ndim, subset=None): """ @@ -212,21 +266,38 @@ class BaseWindow(ShallowMixin, SelectionMixin): def _dir_additions(self): return self.obj._dir_additions() - def _get_cov_corr_window( - self, other: Optional[Union[np.ndarray, FrameOrSeries]] = None - ) -> Optional[Union[int, timedelta, BaseOffset, BaseIndexer]]: + def _get_win_type(self, kwargs: Dict): + """ + Exists for compatibility, overridden by subclass Window. + + Parameters + ---------- + kwargs : dict + ignored, exists for compatibility + + Returns + ------- + None + """ + return None + + def _get_window(self, other=None, win_type: Optional[str] = None) -> int: """ Return window length. Parameters ---------- other : - Used in Expanding + ignored, exists for compatibility + win_type : + ignored, exists for compatibility Returns ------- window : int """ + if isinstance(self.window, BaseIndexer): + return self.min_periods or 0 return self.window @property @@ -246,10 +317,11 @@ class BaseWindow(ShallowMixin, SelectionMixin): return f"{self._window_type} [{attrs}]" def __iter__(self): - obj = self._create_data(self._selected_obj) - indexer = self._get_window_indexer() + window = self._get_window(win_type=None) + blocks, obj = self._create_blocks(self._selected_obj) + index = self._get_window_indexer(window=window) - start, end = indexer.get_window_bounds( + start, end = index.get_window_bounds( num_values=len(obj), min_periods=self.min_periods, center=self.center, @@ -291,32 +363,91 @@ class BaseWindow(ShallowMixin, SelectionMixin): return values - def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): - # if we have an 'on' column we want to put it back into - # the results in the same location - from pandas import Series + def _wrap_result(self, result, block=None, obj=None): + """ + Wrap a single result. + """ + if obj is None: + obj = self._selected_obj + index = obj.index + if isinstance(result, np.ndarray): + + if result.ndim == 1: + from pandas import Series + + return Series(result, index, name=obj.name) + + return type(obj)(result, index=index, columns=block.columns) + return result + + def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: + """ + Wrap the results. + + Parameters + ---------- + results : list of ndarrays + blocks : list of blocks + obj : conformed data (may be resampled) + exclude: list of columns to exclude, default to None + """ + from pandas import Series, concat + + final = [] + for result, block in zip(results, blocks): + + result = self._wrap_result(result, block=block, obj=obj) + if result.ndim == 1: + return result + final.append(result) + + # if we have an 'on' column + # we want to put it back into the results + # in the same location + columns = self._selected_obj.columns if self.on is not None and not self._on.equals(obj.index): - name = self._on.name - extra_col = Series(self._on, index=self.obj.index, name=name) - if name in result.columns: - # TODO: sure we want to overwrite results? 
- result[name] = extra_col - elif name in result.index.names: - pass - elif name in self._selected_obj.columns: - # insert in the same location as we had in _selected_obj - old_cols = self._selected_obj.columns - new_cols = result.columns - old_loc = old_cols.get_loc(name) - overlap = new_cols.intersection(old_cols[:old_loc]) - new_loc = len(overlap) - result.insert(new_loc, name, extra_col) - else: - # insert at the end - result[name] = extra_col - def _get_roll_func(self, func_name: str) -> Callable[..., Any]: + name = self._on.name + final.append(Series(self._on, index=self.obj.index, name=name)) + + if self._selection is not None: + + selection = ensure_index(self._selection) + + # need to reorder to include original location of + # the on column (if its not already there) + if name not in selection: + columns = self.obj.columns + indexer = columns.get_indexer(selection.tolist() + [name]) + columns = columns.take(sorted(indexer)) + + # exclude nuisance columns so that they are not reindexed + if exclude is not None and exclude: + columns = [c for c in columns if c not in exclude] + + if not columns: + raise DataError("No numeric types to aggregate") + + if not len(final): + return obj.astype("float64") + return concat(final, axis=1).reindex(columns=columns, copy=False) + + def _center_window(self, result, window) -> np.ndarray: + """ + Center the result in the window. + """ + if self.axis > result.ndim - 1: + raise ValueError("Requested axis is larger then no. of argument dimensions") + + offset = calculate_center_offset(window) + if offset > 0: + lead_indexer = [slice(None)] * result.ndim + lead_indexer[self.axis] = slice(offset, None) + result = np.copy(result[tuple(lead_indexer)]) + return result + + def _get_roll_func(self, func_name: str) -> Callable: """ Wrap rolling function to check values passed. @@ -336,82 +467,35 @@ class BaseWindow(ShallowMixin, SelectionMixin): ) return window_func - @property - def _index_array(self): - # TODO: why do we get here with e.g. MultiIndex? - if needs_i8_conversion(self._on.dtype): - return self._on.asi8 - return None + def _get_cython_func_type(self, func: str) -> Callable: + """ + Return a variable or fixed cython function type. - def _get_window_indexer(self) -> BaseIndexer: + Variable algorithms do not use window while fixed do. 
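Editor's note: the `calculate_center_offset` / `_center_window` pair above implements centering by appending `offset` trailing NaNs to the input and then dropping the first `offset` results. A small NumPy/pandas illustration of that trick (my reading of the code, not part of the diff):

```python
import numpy as np
import pandas as pd

values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
window, offset = 3, (3 - 1) // 2           # offset = 1 for a centred window of 3

padded = np.concatenate([values, [np.nan] * offset])
rolled = pd.Series(padded).rolling(window).sum().to_numpy()
centred = rolled[offset:]                  # drop the leading results

print(centred)                                                           # [nan  6.  9. 12. nan]
print(pd.Series(values).rolling(window, center=True).sum().to_numpy())   # same values
```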
+ """ + if self.is_freq_type or isinstance(self.window, BaseIndexer): + return self._get_roll_func(f"{func}_variable") + return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) + + def _get_window_indexer(self, window: int) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ if isinstance(self.window, BaseIndexer): return self.window if self.is_freq_type: - return VariableWindowIndexer( - index_array=self._index_array, window_size=self.window - ) - return FixedWindowIndexer(window_size=self.window) - - def _apply_series( - self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None - ) -> "Series": - """ - Series version of _apply_blockwise - """ - obj = self._create_data(self._selected_obj) - - try: - # GH 12541: Special case for count where we support date-like types - input = obj.values if name != "count" else notna(obj.values).astype(int) - values = self._prep_values(input) - except (TypeError, NotImplementedError) as err: - raise DataError("No numeric types to aggregate") from err - - result = homogeneous_func(values) - return obj._constructor(result, index=obj.index, name=obj.name) - - def _apply_blockwise( - self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None - ) -> FrameOrSeriesUnion: - """ - Apply the given function to the DataFrame broken down into homogeneous - sub-frames. - """ - if self._selected_obj.ndim == 1: - return self._apply_series(homogeneous_func, name) - - obj = self._create_data(self._selected_obj) - if name == "count": - # GH 12541: Special case for count where we support date-like types - obj = notna(obj).astype(int) - obj._mgr = obj._mgr.consolidate() - mgr = obj._mgr - - def hfunc(bvalues: ArrayLike) -> ArrayLike: - # TODO(EA2D): getattr unnecessary with 2D EAs - values = self._prep_values(getattr(bvalues, "T", bvalues)) - res_values = homogeneous_func(values) - return getattr(res_values, "T", res_values) - - new_mgr = mgr.apply(hfunc, ignore_failures=True) - out = obj._constructor(new_mgr) - - if out.shape[1] == 0 and obj.shape[1] > 0: - raise DataError("No numeric types to aggregate") - elif out.shape[1] == 0: - return obj.astype("float64") - - self._insert_on_column(out, obj) - return out + return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) + return FixedWindowIndexer(window_size=window) def _apply( self, - func: Callable[..., Any], + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, name: Optional[str] = None, - numba_cache_key: Optional[Tuple[Callable, str]] = None, + use_numba_cache: bool = False, **kwargs, ): """ @@ -422,9 +506,15 @@ class BaseWindow(ShallowMixin, SelectionMixin): Parameters ---------- func : callable function to apply + center : bool + require_min_periods : int + floor : int + is_weighted : bool name : str, - numba_cache_key : tuple - caching key to be used to store a compiled numba func + compatibility with groupby.rolling + use_numba_cache : bool + whether to cache a numba compiled function. 
Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for rolling function and window function @@ -432,27 +522,64 @@ class BaseWindow(ShallowMixin, SelectionMixin): ------- y : type of input """ - window_indexer = self._get_window_indexer() - min_periods = ( - self.min_periods - if self.min_periods is not None - else window_indexer.window_size - ) + win_type = self._get_win_type(kwargs) + window = self._get_window(win_type=win_type) - def homogeneous_func(values: np.ndarray): - # calculation function + blocks, obj = self._create_blocks(self._selected_obj) + block_list = list(blocks) + window_indexer = self._get_window_indexer(window) + + results = [] + exclude: List[Scalar] = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError) as err: + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError("No numeric types to aggregate") from err if values.size == 0: - return values.copy() + results.append(values.copy()) + continue - def calc(x): - start, end = window_indexer.get_window_bounds( - num_values=len(x), - min_periods=min_periods, - center=self.center, - closed=self.closed, - ) - return func(x, start, end, min_periods) + # calculation function + offset = calculate_center_offset(window) if center else 0 + additional_nans = np.array([np.nan] * offset) + + if not is_weighted: + + def calc(x): + x = np.concatenate((x, additional_nans)) + if not isinstance(self.window, BaseIndexer): + min_periods = calculate_min_periods( + window, self.min_periods, len(x), require_min_periods, floor + ) + else: + min_periods = calculate_min_periods( + window_indexer.window_size, + self.min_periods, + len(x), + require_min_periods, + floor, + ) + start, end = window_indexer.get_window_bounds( + num_values=len(x), + min_periods=self.min_periods, + center=self.center, + closed=self.closed, + ) + return func(x, start, end, min_periods) + + else: + + def calc(x): + x = np.concatenate((x, additional_nans)) + return func(x, window, self.min_periods) with np.errstate(all="ignore"): if values.ndim > 1: @@ -461,15 +588,18 @@ class BaseWindow(ShallowMixin, SelectionMixin): result = calc(values) result = np.asarray(result) - if numba_cache_key is not None: - NUMBA_FUNC_CACHE[numba_cache_key] = func + if use_numba_cache: + NUMBA_FUNC_CACHE[(kwargs["original_func"], "rolling_apply")] = func - return result + if center: + result = self._center_window(result, window) - return self._apply_blockwise(homogeneous_func, name) + results.append(result) + + return self._wrap_results(results, block_list, obj, exclude) def aggregate(self, func, *args, **kwargs): - result, how = aggregate(self, func, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if result is None: return self.apply(func, raw=False, args=args, kwargs=kwargs) return result @@ -720,133 +850,7 @@ class BaseWindow(ShallowMixin, SelectionMixin): ) -def dispatch(name: str, *args, **kwargs): - """ - Dispatch to groupby apply. - """ - - def outer(self, *args, **kwargs): - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return getattr(x, name)(*args, **kwargs) - - return self._groupby.apply(f) - - outer.__name__ = name - return outer - - -class BaseWindowGroupby(GotItemMixin, BaseWindow): - """ - Provide the groupby windowing facilities. 
- """ - - def __init__(self, obj, *args, **kwargs): - kwargs.pop("parent", None) - groupby = kwargs.pop("groupby", None) - if groupby is None: - groupby, obj = obj, obj._selected_obj - self._groupby = groupby - self._groupby.mutated = True - self._groupby.grouper.mutated = True - super().__init__(obj, *args, **kwargs) - - corr = dispatch("corr", other=None, pairwise=None) - cov = dispatch("cov", other=None, pairwise=None) - - def _apply( - self, - func: Callable[..., Any], - name: Optional[str] = None, - numba_cache_key: Optional[Tuple[Callable, str]] = None, - **kwargs, - ) -> FrameOrSeries: - result = super()._apply( - func, - name, - numba_cache_key, - **kwargs, - ) - # Reconstruct the resulting MultiIndex from tuples - # 1st set of levels = group by labels - # 2nd set of levels = original index - # Ignore 2nd set of levels if a group by label include an index level - result_index_names = [ - grouping.name for grouping in self._groupby.grouper._groupings - ] - grouped_object_index = None - - column_keys = [ - key - for key in result_index_names - if key not in self.obj.index.names or key is None - ] - - if len(column_keys) == len(result_index_names): - grouped_object_index = self.obj.index - grouped_index_name = [*grouped_object_index.names] - result_index_names += grouped_index_name - else: - # Our result will have still kept the column in the result - result = result.drop(columns=column_keys, errors="ignore") - - codes = self._groupby.grouper.codes - levels = self._groupby.grouper.levels - - group_indices = self._groupby.grouper.indices.values() - if group_indices: - indexer = np.concatenate(list(group_indices)) - else: - indexer = np.array([], dtype=np.intp) - codes = [c.take(indexer) for c in codes] - - # if the index of the original dataframe needs to be preserved, append - # this index (but reordered) to the codes/levels from the groupby - if grouped_object_index is not None: - idx = grouped_object_index.take(indexer) - if not isinstance(idx, MultiIndex): - idx = MultiIndex.from_arrays([idx]) - codes.extend(list(idx.codes)) - levels.extend(list(idx.levels)) - - result_index = MultiIndex( - levels, codes, names=result_index_names, verify_integrity=False - ) - - result.index = result_index - return result - - def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: - """ - Split data into blocks & return conformed data. - """ - # Ensure the object we're rolling over is monotonically sorted relative - # to the groups - # GH 36197 - if not obj.empty: - groupby_order = np.concatenate( - list(self._groupby.grouper.indices.values()) - ).astype(np.int64) - obj = obj.take(groupby_order) - return super()._create_data(obj) - - def _gotitem(self, key, ndim, subset=None): - # we are setting the index on the actual object - # here so our index is carried through to the selected obj - # when we do the splitting for the groupby - if self.on is not None: - self.obj = self.obj.set_index(self._on) - self.on = None - return super()._gotitem(key, ndim, subset=subset) - - def _validate_monotonic(self): - """ - Validate that "on" is monotonic; already validated at a higher level. - """ - pass - - -class Window(BaseWindow): +class Window(_Window): """ Provide rolling window calculations. @@ -882,11 +886,10 @@ class Window(BaseWindow): axis : int or str, default 0 closed : str, default None Make the interval closed on the 'right', 'left', 'both' or - 'neither' endpoints. Defaults to 'right'. - - .. versionchanged:: 1.2.0 - - The closed parameter with fixed windows is now supported. 
+ 'neither' endpoints. + For offset-based windows, it defaults to 'right'. + For fixed windows, defaults to 'both'. Remaining cases not implemented + for fixed windows. Returns ------- @@ -905,14 +908,30 @@ class Window(BaseWindow): To learn more about the offsets & frequency strings, please see `this link `__. - If ``win_type=None``, all points are evenly weighted; otherwise, ``win_type`` - can accept a string of any `scipy.signal window function - `__. + The recognized win_types are: - Certain Scipy window types require additional parameters to be passed - in the aggregation function. The additional parameters must match - the keywords specified in the Scipy window type method signature. - Please see the third example below on how to add the additional parameters. + * ``boxcar`` + * ``triang`` + * ``blackman`` + * ``hamming`` + * ``bartlett`` + * ``parzen`` + * ``bohman`` + * ``blackmanharris`` + * ``nuttall`` + * ``barthann`` + * ``kaiser`` (needs parameter: beta) + * ``gaussian`` (needs parameter: std) + * ``general_gaussian`` (needs parameters: power, width) + * ``slepian`` (needs parameter: width) + * ``exponential`` (needs parameter: tau), center is set to None. + + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. + + Certain window types require additional parameters to be passed. Please see + the third example below on how to add the additional parameters. Examples -------- @@ -1009,99 +1028,101 @@ class Window(BaseWindow): 2013-01-01 09:00:06 4.0 """ - @property - def _constructor(self): - return Window - def validate(self): super().validate() - if isinstance(self.window, BaseIndexer): + window = self.window + if isinstance(window, BaseIndexer): raise NotImplementedError( "BaseIndexer subclasses not implemented with win_types." ) - elif is_integer(self.window): - if self.window <= 0: + elif isinstance(window, (list, tuple, np.ndarray)): + pass + elif is_integer(window): + if window <= 0: raise ValueError("window must be > 0 ") - sig = import_optional_dependency( - "scipy.signal", extra="Scipy is required to generate window weight." + import_optional_dependency( + "scipy", extra="Scipy is required to generate window weight." ) + import scipy.signal as sig + if not isinstance(self.win_type, str): raise ValueError(f"Invalid win_type {self.win_type}") if getattr(sig, self.win_type, None) is None: raise ValueError(f"Invalid win_type {self.win_type}") else: - raise ValueError(f"Invalid window {self.window}") + raise ValueError(f"Invalid window {window}") - def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray: + def _get_win_type(self, kwargs: Dict) -> Union[str, Tuple]: """ - Center the result in the window for weighted rolling aggregations. - """ - if self.axis > result.ndim - 1: - raise ValueError("Requested axis is larger then no. of argument dimensions") - - if offset > 0: - lead_indexer = [slice(None)] * result.ndim - lead_indexer[self.axis] = slice(offset, None) - result = np.copy(result[tuple(lead_indexer)]) - return result - - def _apply( - self, - func: Callable[[np.ndarray, int, int], np.ndarray], - name: Optional[str] = None, - numba_cache_key: Optional[Tuple[Callable, str]] = None, - **kwargs, - ): - """ - Rolling with weights statistical measure using supplied function. - - Designed to be used with passed-in Cython array-based functions. + Extract arguments for the window type, provide validation for it + and return the validated window type. 
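Editor's note: usage sketch for the weighted-window path here: `win_type` picks a `scipy.signal` window and extra parameters such as `std` travel through the aggregation call, as the docstring above describes. The manual check assumes `scipy` is installed and that the weighted mean is normalised by the weight sum.

```python
import numpy as np
import pandas as pd
from scipy.signal import get_window

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
result = s.rolling(3, win_type="gaussian").mean(std=1.0)

weights = get_window(("gaussian", 1.0), 3, fftbins=False)        # symmetric window
manual = np.convolve(s, weights / weights.sum(), mode="valid")   # normalised weighted means
print(result.to_numpy())   # [nan nan  2.  3.  4.]
print(manual)              # [2. 3. 4.]
```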
Parameters ---------- - func : callable function to apply - name : str, - use_numba_cache : tuple - unused - **kwargs - additional arguments for scipy windows if necessary + kwargs : dict Returns ------- - y : type of input + win_type : str, or tuple """ - signal = import_optional_dependency( - "scipy.signal", extra="Scipy is required to generate window weight." - ) - assert self.win_type is not None # for mypy - window = getattr(signal, self.win_type)(self.window, **kwargs) - offset = (len(window) - 1) // 2 if self.center else 0 + # the below may pop from kwargs + def _validate_win_type(win_type, kwargs): + arg_map = { + "kaiser": ["beta"], + "gaussian": ["std"], + "general_gaussian": ["power", "width"], + "slepian": ["width"], + "exponential": ["tau"], + } - def homogeneous_func(values: np.ndarray): - # calculation function + if win_type in arg_map: + win_args = _pop_args(win_type, arg_map[win_type], kwargs) + if win_type == "exponential": + # exponential window requires the first arg (center) + # to be set to None (necessary for symmetric window) + win_args.insert(0, None) - if values.size == 0: - return values.copy() + return tuple([win_type] + win_args) - def calc(x): - additional_nans = np.array([np.nan] * offset) - x = np.concatenate((x, additional_nans)) - return func(x, window, self.min_periods or len(window)) + return win_type - with np.errstate(all="ignore"): - if values.ndim > 1: - result = np.apply_along_axis(calc, self.axis, values) - else: - result = calc(values) - result = np.asarray(result) + def _pop_args(win_type, arg_names, kwargs): + all_args = [] + for n in arg_names: + if n not in kwargs: + raise ValueError(f"{win_type} window requires {n}") + all_args.append(kwargs.pop(n)) + return all_args - if self.center: - result = self._center_window(result, offset) + return _validate_win_type(self.win_type, kwargs) - return result + def _get_window( + self, other=None, win_type: Optional[Union[str, Tuple]] = None + ) -> np.ndarray: + """ + Get the window, weights. - return self._apply_blockwise(homogeneous_func, name) + Parameters + ---------- + other : + ignored, exists for compatibility + win_type : str, or tuple + type of window to create + + Returns + ------- + window : ndarray + the window, weights + """ + window = self.window + if isinstance(window, (list, tuple, np.ndarray)): + return com.asarray_tuplesafe(window).astype(float) + elif is_integer(window): + import scipy.signal as sig + + # GH #15662. `False` makes symmetric window, rather than periodic. 
+ return sig.get_window(win_type, window, False).astype(float) _agg_see_also_doc = dedent( """ @@ -1135,11 +1156,12 @@ class Window(BaseWindow): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, + versionadded="", klass="Series/DataFrame", axis="", ) def aggregate(self, func, *args, **kwargs): - result, how = aggregate(self, func, *args, **kwargs) + result, how = self._aggregate(func, *args, **kwargs) if result is None: # these must apply directly @@ -1154,31 +1176,46 @@ class Window(BaseWindow): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_weighted_sum") - return self._apply(window_func, name="sum", **kwargs) + window_func = get_weighted_roll_func(window_func) + return self._apply( + window_func, center=self.center, is_weighted=True, name="sum", **kwargs + ) @Substitution(name="window") @Appender(_shared_docs["mean"]) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = self._get_roll_func("roll_weighted_mean") - return self._apply(window_func, name="mean", **kwargs) + window_func = get_weighted_roll_func(window_func) + return self._apply( + window_func, center=self.center, is_weighted=True, name="mean", **kwargs + ) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["var"]) - def var(self, ddof: int = 1, *args, **kwargs): + def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) + window_func = get_weighted_roll_func(window_func) kwargs.pop("name", None) - return self._apply(window_func, name="var", **kwargs) + return self._apply( + window_func, center=self.center, is_weighted=True, name="var", **kwargs + ) @Substitution(name="window", versionadded="\n.. versionadded:: 1.0.0\n") @Appender(_shared_docs["std"]) - def std(self, ddof: int = 1, *args, **kwargs): + def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) -class RollingAndExpandingMixin(BaseWindow): +class _Rolling(_Window): + @property + def _constructor(self): + return Rolling + + +class _Rolling_and_Expanding(_Rolling): _shared_docs["count"] = dedent( r""" @@ -1221,8 +1258,25 @@ class RollingAndExpandingMixin(BaseWindow): ) def count(self): - window_func = self._get_roll_func("roll_sum") - return self._apply(window_func, name="count") + # GH 32865. Using count with custom BaseIndexer subclass + # implementations shouldn't end up here + assert not isinstance(self.window, BaseIndexer) + + blocks, obj = self._create_blocks(self._selected_obj) + results = [] + for b in blocks: + result = b.notna().astype(int) + result = self._constructor( + result, + window=self._get_window(), + min_periods=self.min_periods or 0, + center=self.center, + axis=self.axis, + closed=self.closed, + ).sum() + results.append(result) + + return self._wrap_results(results, blocks, obj) _shared_docs["apply"] = dedent( r""" @@ -1281,61 +1335,78 @@ class RollingAndExpandingMixin(BaseWindow): Notes ----- - See :ref:`window.numba_engine` for extended documentation and performance + See :ref:`stats.rolling_apply` for extended documentation and performance considerations for the Numba engine. 
""" ) def apply( self, - func: Callable[..., Any], + func, raw: bool = False, engine: Optional[str] = None, - engine_kwargs: Optional[Dict[str, bool]] = None, - args: Optional[Tuple[Any, ...]] = None, - kwargs: Optional[Dict[str, Any]] = None, + engine_kwargs: Optional[Dict] = None, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, ): if args is None: args = () if kwargs is None: kwargs = {} - + kwargs.pop("_level", None) + kwargs.pop("floor", None) if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") - numba_cache_key = None if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") - apply_func = generate_numba_apply_func(args, kwargs, func, engine_kwargs) - numba_cache_key = (func, "rolling_apply") + cache_key = (func, "rolling_apply") + if cache_key in NUMBA_FUNC_CACHE: + # Return an already compiled version of roll_apply if available + apply_func = NUMBA_FUNC_CACHE[cache_key] + else: + apply_func = generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) + center = self.center elif engine in ("cython", None): if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") - apply_func = self._generate_cython_apply_func(args, kwargs, raw, func) + # Cython apply functions handle center, so don't need to use + # _apply's center handling + window = self._get_window() + offset = calculate_center_offset(window) if self.center else 0 + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func + ) + center = False else: raise ValueError("engine must be either 'numba' or 'cython'") + # name=func & raw=raw for WindowGroupByMixin._apply return self._apply( apply_func, - numba_cache_key=numba_cache_key, + center=center, + floor=0, + name=func, + use_numba_cache=engine == "numba", + raw=raw, + original_func=func, + args=args, + kwargs=kwargs, ) - def _generate_cython_apply_func( - self, - args: Tuple[Any, ...], - kwargs: Dict[str, Any], - raw: bool, - function: Callable[..., Any], - ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, int], np.ndarray]: + def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): from pandas import Series window_func = partial( - self._get_roll_func("roll_apply"), + self._get_cython_func_type("roll_generic"), args=args, kwargs=kwargs, raw=raw, - function=function, + offset=offset, + func=func, ) def apply_func(values, begin, end, min_periods, raw=raw): @@ -1347,8 +1418,11 @@ class RollingAndExpandingMixin(BaseWindow): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) - window_func = self._get_roll_func("roll_sum") - return self._apply(window_func, name="sum", **kwargs) + window_func = self._get_cython_func_type("roll_sum") + kwargs.pop("floor", None) + return self._apply( + window_func, center=self.center, floor=0, name="sum", **kwargs + ) _shared_docs["max"] = dedent( """ @@ -1363,8 +1437,8 @@ class RollingAndExpandingMixin(BaseWindow): def max(self, *args, **kwargs): nv.validate_window_func("max", args, kwargs) - window_func = self._get_roll_func("roll_max") - return self._apply(window_func, name="max", **kwargs) + window_func = self._get_cython_func_type("roll_max") + return self._apply(window_func, center=self.center, name="max", **kwargs) _shared_docs["min"] = dedent( """ @@ -1405,13 +1479,13 @@ class RollingAndExpandingMixin(BaseWindow): def min(self, *args, **kwargs): nv.validate_window_func("min", args, kwargs) - window_func = self._get_roll_func("roll_min") - 
return self._apply(window_func, name="min", **kwargs) + window_func = self._get_cython_func_type("roll_min") + return self._apply(window_func, center=self.center, name="min", **kwargs) def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) - window_func = self._get_roll_func("roll_mean") - return self._apply(window_func, name="mean", **kwargs) + window_func = self._get_cython_func_type("roll_mean") + return self._apply(window_func, center=self.center, name="mean", **kwargs) _shared_docs["median"] = dedent( """ @@ -1454,27 +1528,37 @@ class RollingAndExpandingMixin(BaseWindow): window_func = self._get_roll_func("roll_median_c") # GH 32865. Move max window size calculation to # the median function implementation - return self._apply(window_func, name="median", **kwargs) + return self._apply(window_func, center=self.center, name="median", **kwargs) - def std(self, ddof: int = 1, *args, **kwargs): + def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - window_func = self._get_roll_func("roll_var") + kwargs.pop("require_min_periods", None) + window_func = self._get_cython_func_type("roll_var") def zsqrt_func(values, begin, end, min_periods): return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + # ddof passed again for compat with groupby.rolling return self._apply( zsqrt_func, + center=self.center, + require_min_periods=1, name="std", + ddof=ddof, **kwargs, ) - def var(self, ddof: int = 1, *args, **kwargs): + def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) - window_func = partial(self._get_roll_func("roll_var"), ddof=ddof) + kwargs.pop("require_min_periods", None) + window_func = partial(self._get_cython_func_type("roll_var"), ddof=ddof) + # ddof passed again for compat with groupby.rolling return self._apply( window_func, + center=self.center, + require_min_periods=1, name="var", + ddof=ddof, **kwargs, ) @@ -1490,9 +1574,12 @@ class RollingAndExpandingMixin(BaseWindow): """ def skew(self, **kwargs): - window_func = self._get_roll_func("roll_skew") + window_func = self._get_cython_func_type("roll_skew") + kwargs.pop("require_min_periods", None) return self._apply( window_func, + center=self.center, + require_min_periods=3, name="skew", **kwargs, ) @@ -1529,63 +1616,13 @@ class RollingAndExpandingMixin(BaseWindow): """ ) - def sem(self, ddof: int = 1, *args, **kwargs): - return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) - - _shared_docs["sem"] = dedent( - """ - Compute %(name)s standard error of mean. - - Parameters - ---------- - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - - *args, **kwargs - For NumPy compatibility. No additional arguments are used. - - Returns - ------- - Series or DataFrame - Returned object type is determined by the caller of the %(name)s - calculation. - - See Also - -------- - pandas.Series.%(name)s : Calling object with Series data. - pandas.DataFrame.%(name)s : Calling object with DataFrames. - pandas.Series.sem : Equivalent method for Series. - pandas.DataFrame.sem : Equivalent method for DataFrame. - - Notes - ----- - A minimum of one period is required for the rolling calculation. 
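Editor's note: `std` above is implemented as `zsqrt` of the variance kernel; my understanding is that `zsqrt` is a square root that zeroes out small negative variances produced by floating point error. A hedged sketch of that relationship:

```python
import numpy as np
import pandas as pd

def zsqrt_like(x: pd.Series) -> pd.Series:
    """Square root that clips negative inputs to zero (assumed zsqrt behaviour)."""
    result = np.sqrt(x.to_numpy(dtype=float))
    result[x.to_numpy() < 0] = 0.0
    return pd.Series(result, index=x.index)

s = pd.Series([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
var = s.rolling(4).var(ddof=1)
print(np.allclose(zsqrt_like(var), s.rolling(4).std(ddof=1), equal_nan=True))  # True
```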
- - Examples - -------- - >>> s = pd.Series([0, 1, 2, 3]) - >>> s.rolling(2, min_periods=1).sem() - 0 NaN - 1 0.707107 - 2 0.707107 - 3 0.707107 - dtype: float64 - - >>> s.expanding().sem() - 0 NaN - 1 0.707107 - 2 0.707107 - 3 0.745356 - dtype: float64 - """ - ) - def kurt(self, **kwargs): - window_func = self._get_roll_func("roll_kurt") + window_func = self._get_cython_func_type("roll_kurt") + kwargs.pop("require_min_periods", None) return self._apply( window_func, + center=self.center, + require_min_periods=4, name="kurt", **kwargs, ) @@ -1599,6 +1636,8 @@ class RollingAndExpandingMixin(BaseWindow): quantile : float Quantile to compute. 0 <= quantile <= 1. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} + .. versionadded:: 0.23.0 + This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -1644,19 +1683,23 @@ class RollingAndExpandingMixin(BaseWindow): """ ) - def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): + def quantile(self, quantile, interpolation="linear", **kwargs): if quantile == 1.0: - window_func = self._get_roll_func("roll_max") + window_func = self._get_cython_func_type("roll_max") elif quantile == 0.0: - window_func = self._get_roll_func("roll_min") + window_func = self._get_cython_func_type("roll_min") else: window_func = partial( self._get_roll_func("roll_quantile"), + win=self._get_window(), quantile=quantile, interpolation=interpolation, ) - return self._apply(window_func, name="quantile", **kwargs) + # Pass through for groupby.rolling + kwargs["quantile"] = quantile + kwargs["interpolation"] = interpolation + return self._apply(window_func, center=self.center, name="quantile", **kwargs) _shared_docs[ "cov" @@ -1692,10 +1735,14 @@ class RollingAndExpandingMixin(BaseWindow): # GH 32865. We leverage rolling.mean, so we pass # to the rolling constructors the data used when constructing self: # window width, frequency data, or a BaseIndexer subclass - # GH 16058: offset window - window = ( - self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq - ) + if isinstance(self.window, BaseIndexer): + window = self.window + else: + # GH 16058: offset window + if self.is_freq_type: + window = self.win_freq + else: + window = self._get_window(other) def _get_cov(X, Y): # GH #12373 : rolling functions error on float32 data @@ -1713,7 +1760,7 @@ class RollingAndExpandingMixin(BaseWindow): bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - return flex_binary_moment( + return _flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -1837,10 +1884,10 @@ class RollingAndExpandingMixin(BaseWindow): # GH 32865. 
We leverage rolling.cov and rolling.std here, so we pass # to the rolling constructors the data used when constructing self: # window width, frequency data, or a BaseIndexer subclass - # GH 16058: offset window - window = ( - self._get_cov_corr_window(other) if not self.is_freq_type else self.win_freq - ) + if isinstance(self.window, BaseIndexer): + window = self.window + else: + window = self._get_window(other) if not self.is_freq_type else self.win_freq def _get_corr(a, b): a = a.rolling( @@ -1849,17 +1896,15 @@ class RollingAndExpandingMixin(BaseWindow): b = b.rolling( window=window, min_periods=self.min_periods, center=self.center ) - # GH 31286: Through using var instead of std we can avoid numerical - # issues when the result of var is withing floating proint precision - # while std is not. - return a.cov(b, **kwargs) / (a.var(**kwargs) * b.var(**kwargs)) ** 0.5 - return flex_binary_moment( + return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) + + return _flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) -class Rolling(RollingAndExpandingMixin): +class Rolling(_Rolling_and_Expanding): @cache_readonly def is_datetimelike(self) -> bool: return isinstance( @@ -1884,10 +1929,6 @@ class Rolling(RollingAndExpandingMixin): "must be a column (of DataFrame), an Index or None" ) - @property - def _constructor(self): - return Rolling - def validate(self): super().validate() @@ -1897,6 +1938,7 @@ class Rolling(RollingAndExpandingMixin): ): self._validate_monotonic() + freq = self._validate_freq() # we don't allow center if self.center: @@ -1907,7 +1949,7 @@ class Rolling(RollingAndExpandingMixin): # this will raise ValueError on non-fixed freqs self.win_freq = self.window - self.window = self._determine_window_length() + self.window = freq.nanos self.win_type = "freq" # min_periods must be an integer @@ -1922,28 +1964,20 @@ class Rolling(RollingAndExpandingMixin): elif self.window < 0: raise ValueError("window must be non-negative") - def _determine_window_length(self) -> Union[int, float]: - """ - Calculate freq for PeriodIndexes based on Index freq. Can not use - nanos, because asi8 of PeriodIndex is not in nanos - """ - freq = self._validate_freq() - if isinstance(self._on, ABCPeriodIndex): - return freq.nanos / (self._on.freq.nanos / self._on.freq.n) - return freq.nanos + if not self.is_datetimelike and self.closed is not None: + raise ValueError( + "closed only implemented for datetimelike and offset based windows" + ) def _validate_monotonic(self): """ Validate monotonic (increasing or decreasing). """ if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing): - self._raise_monotonic_error() - - def _raise_monotonic_error(self): - formatted = self.on - if self.on is None: - formatted = "index" - raise ValueError(f"{formatted} must be monotonic") + formatted = self.on + if self.on is None: + formatted = "index" + raise ValueError(f"{formatted} must be monotonic") def _validate_freq(self): """ @@ -1995,6 +2029,7 @@ class Rolling(RollingAndExpandingMixin): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, + versionadded="", klass="Series/Dataframe", axis="", ) @@ -2006,22 +2041,20 @@ class Rolling(RollingAndExpandingMixin): @Substitution(name="rolling") @Appender(_shared_docs["count"]) def count(self): - if self.min_periods is None: - warnings.warn( - ( - "min_periods=None will default to the size of window " - "consistent with other methods in a future version. 
" - "Specify min_periods=0 instead." - ), - FutureWarning, - ) - self.min_periods = 0 + + # different impl for freq counting + # GH 32865. Use a custom count function implementation + # when using a BaseIndexer subclass as a window + if self.is_freq_type or isinstance(self.window, BaseIndexer): + window_func = self._get_roll_func("roll_count") + return self._apply(window_func, center=self.center, name="count") + return super().count() @Substitution(name="rolling") @Appender(_shared_docs["apply"]) def apply( - self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None + self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None, ): return super().apply( func, @@ -2080,11 +2113,6 @@ class Rolling(RollingAndExpandingMixin): def skew(self, **kwargs): return super().skew(**kwargs) - @Substitution(name="rolling") - @Appender(_shared_docs["sem"]) - def sem(self, ddof=1, *args, **kwargs): - return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) - _agg_doc = dedent( """ Examples @@ -2138,49 +2166,159 @@ class Rolling(RollingAndExpandingMixin): Rolling.__doc__ = Window.__doc__ -class RollingGroupby(BaseWindowGroupby, Rolling): +class RollingGroupby(WindowGroupByMixin, Rolling): """ Provide a rolling groupby implementation. """ - def _get_window_indexer(self) -> GroupbyIndexer: + def _apply( + self, + func: Callable, + center: bool, + require_min_periods: int = 0, + floor: int = 1, + is_weighted: bool = False, + name: Optional[str] = None, + use_numba_cache: bool = False, + **kwargs, + ): + result = Rolling._apply( + self, + func, + center, + require_min_periods, + floor, + is_weighted, + name, + use_numba_cache, + **kwargs, + ) + # Reconstruct the resulting MultiIndex from tuples + # 1st set of levels = group by labels + # 2nd set of levels = original index + # Ignore 2nd set of levels if a group by label include an index level + result_index_names = [ + grouping.name for grouping in self._groupby.grouper._groupings + ] + grouped_object_index = None + + column_keys = [ + key + for key in result_index_names + if key not in self.obj.index.names or key is None + ] + + if len(column_keys) == len(result_index_names): + grouped_object_index = self.obj.index + grouped_index_name = [*grouped_object_index.names] + result_index_names += grouped_index_name + else: + # Our result will have still kept the column in the result + result = result.drop(columns=column_keys, errors="ignore") + + codes = self._groupby.grouper.codes + levels = self._groupby.grouper.levels + + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + codes = [c.take(indexer) for c in codes] + + # if the index of the original dataframe needs to be preserved, append + # this index (but reordered) to the codes/levels from the groupby + if grouped_object_index is not None: + idx = grouped_object_index.take(indexer) + if not isinstance(idx, MultiIndex): + idx = MultiIndex.from_arrays([idx]) + codes.extend(list(idx.codes)) + levels.extend(list(idx.levels)) + + result_index = MultiIndex( + levels, codes, names=result_index_names, verify_integrity=False + ) + + result.index = result_index + return result + + @property + def _constructor(self): + return Rolling + + def _create_blocks(self, obj: FrameOrSeries): + """ + Split data into blocks & return conformed data. 
+ """ + # Ensure the object we're rolling over is monotonically sorted relative + # to the groups + # GH 36197 + if not obj.empty: + groupby_order = np.concatenate( + list(self._groupby.grouper.indices.values()) + ).astype(np.int64) + obj = obj.take(groupby_order) + return super()._create_blocks(obj) + + def _get_cython_func_type(self, func: str) -> Callable: + """ + Return the cython function type. + + RollingGroupby needs to always use "variable" algorithms since processing + the data in group order may not be monotonic with the data which + "fixed" algorithms assume + """ + return self._get_roll_func(f"{func}_variable") + + def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: """ Return an indexer class that will compute the window start and end bounds + Parameters + ---------- + window : int + window size for FixedWindowIndexer + Returns ------- - GroupbyIndexer + GroupbyRollingIndexer """ rolling_indexer: Type[BaseIndexer] - indexer_kwargs: Optional[Dict[str, Any]] = None - index_array = self._index_array - window = self.window + indexer_kwargs: Optional[Dict] = None + index_array = self._on.asi8 if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) indexer_kwargs = self.window.__dict__ - assert isinstance(indexer_kwargs, dict) # for mypy # We'll be using the index of each group later indexer_kwargs.pop("index_array", None) - window = 0 elif self.is_freq_type: rolling_indexer = VariableWindowIndexer else: rolling_indexer = FixedWindowIndexer index_array = None - window_indexer = GroupbyIndexer( + window_indexer = GroupbyRollingIndexer( index_array=index_array, window_size=window, groupby_indicies=self._groupby.indices, - window_indexer=rolling_indexer, + rolling_indexer=rolling_indexer, indexer_kwargs=indexer_kwargs, ) return window_indexer + def _gotitem(self, key, ndim, subset=None): + # we are setting the index on the actual object + # here so our index is carried thru to the selected obj + # when we do the splitting for the groupby + if self.on is not None: + self.obj = self.obj.set_index(self._on) + self.on = None + return super()._gotitem(key, ndim, subset=subset) + def _validate_monotonic(self): """ Validate that on is monotonic; - in this case we have to check only for nans, because - monotonicy was already validated at a higher level. + we don't care for groupby.rolling + because we have already validated at a higher + level. """ - if self._on.hasnans: - self._raise_monotonic_error() + pass diff --git a/venv/lib/python3.8/site-packages/pandas/errors/__init__.py b/venv/lib/python3.8/site-packages/pandas/errors/__init__.py index ea60ae5..6ac3004 100644 --- a/venv/lib/python3.8/site-packages/pandas/errors/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/errors/__init__.py @@ -202,30 +202,9 @@ class NumbaUtilError(Exception): """ -class DuplicateLabelError(ValueError): - """ - Error raised when an operation would introduce duplicate labels. - - .. versionadded:: 1.2.0 - - Examples - -------- - >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( - ... allows_duplicate_labels=False - ... ) - >>> s.reindex(['a', 'a', 'b']) - Traceback (most recent call last): - ... - DuplicateLabelError: Index has duplicates. - positions - label - a [0, 1] - """ - - class InvalidIndexError(Exception): """ - Exception raised when attempting to use an invalid index key. + Exception raised when attemping to use an invalid index key. .. 
versionadded:: 1.1.0 """ diff --git a/venv/lib/python3.8/site-packages/pandas/io/clipboard/__init__.py b/venv/lib/python3.8/site-packages/pandas/io/clipboard/__init__.py index a8020f4..d16955a 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/clipboard/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/io/clipboard/__init__.py @@ -274,7 +274,7 @@ def init_dev_clipboard_clipboard(): fo.write(text) def paste_dev_clipboard() -> str: - with open("/dev/clipboard") as fo: + with open("/dev/clipboard", "rt") as fo: content = fo.read() return content @@ -521,7 +521,7 @@ def determine_clipboard(): return init_windows_clipboard() if platform.system() == "Linux": - with open("/proc/version") as f: + with open("/proc/version", "r") as f: if "Microsoft" in f.read(): return init_wsl_clipboard() diff --git a/venv/lib/python3.8/site-packages/pandas/io/common.py b/venv/lib/python3.8/site-packages/pandas/io/common.py index 64c5d31..bd77a1e 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/common.py +++ b/venv/lib/python3.8/site-packages/pandas/io/common.py @@ -2,12 +2,24 @@ import bz2 from collections import abc -import dataclasses import gzip -from io import BufferedIOBase, BytesIO, RawIOBase, TextIOWrapper +from io import BufferedIOBase, BytesIO, RawIOBase import mmap import os -from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, cast +import pathlib +from typing import ( + IO, + TYPE_CHECKING, + Any, + AnyStr, + Dict, + List, + Mapping, + Optional, + Tuple, + Type, + Union, +) from urllib.parse import ( urljoin, urlparse as parse_url, @@ -15,93 +27,23 @@ from urllib.parse import ( uses_params, uses_relative, ) -import warnings import zipfile -from pandas._typing import ( - Buffer, - CompressionDict, - CompressionOptions, - FileOrBuffer, - FilePathOrBuffer, - StorageOptions, -) -from pandas.compat import get_lzma_file, import_lzma +from pandas._typing import FilePathOrBuffer +from pandas.compat import _get_lzma_file, _import_lzma from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_file_like -lzma = import_lzma() +lzma = _import_lzma() _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard("") -@dataclasses.dataclass -class IOArgs: - """ - Return value of io/common.py:_get_filepath_or_buffer. - - Note (copy&past from io/parsers): - filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] - though mypy handling of conditional imports is difficult. - See https://github.com/python/mypy/issues/1297 - """ - - filepath_or_buffer: FileOrBuffer - encoding: str - mode: str - compression: CompressionDict - should_close: bool = False - - -@dataclasses.dataclass -class IOHandles: - """ - Return value of io/common.py:get_handle - - Can be used as a context manager. - - This is used to easily close created buffers and to handle corner cases when - TextIOWrapper is inserted. - - handle: The file handle to be used. - created_handles: All file handles that are created by get_handle - is_wrapped: Whether a TextIOWrapper needs to be detached. - """ - - handle: Buffer - compression: CompressionDict - created_handles: List[Buffer] = dataclasses.field(default_factory=list) - is_wrapped: bool = False - is_mmap: bool = False - - def close(self) -> None: - """ - Close all created buffers. - - Note: If a TextIOWrapper was inserted, it is flushed and detached to - avoid closing the potentially user-created buffer. 
- """ - if self.is_wrapped: - assert isinstance(self.handle, TextIOWrapper) - self.handle.flush() - self.handle.detach() - self.created_handles.remove(self.handle) - try: - for handle in self.created_handles: - handle.close() - except (OSError, ValueError): - pass - self.created_handles = [] - self.is_wrapped = False - - def __enter__(self) -> "IOHandles": - return self - - def __exit__(self, *args: Any) -> None: - self.close() +if TYPE_CHECKING: + from io import IOBase # noqa: F401 def is_url(url) -> bool: @@ -122,7 +64,9 @@ def is_url(url) -> bool: return parse_url(url).scheme in _VALID_URLS -def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]: +def _expand_user( + filepath_or_buffer: FilePathOrBuffer[AnyStr], +) -> FilePathOrBuffer[AnyStr]: """ Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -152,8 +96,7 @@ def validate_header_arg(header) -> None: def stringify_path( filepath_or_buffer: FilePathOrBuffer[AnyStr], - convert_file_like: bool = False, -) -> FileOrBuffer[AnyStr]: +) -> FilePathOrBuffer[AnyStr]: """ Attempt to convert a path-like object to a string. @@ -170,17 +113,17 @@ def stringify_path( Objects supporting the fspath protocol (python 3.6+) are coerced according to its __fspath__ method. + For backwards compatibility with older pythons, pathlib.Path and + py.path objects are specially coerced. + Any other object is passed through unchanged, which includes bytes, strings, buffers, or anything else that's not even path-like. """ - if not convert_file_like and is_file_like(filepath_or_buffer): - # GH 38125: some fsspec objects implement os.PathLike but have already opened a - # file. This prevents opening the file a second time. infer_compression calls - # this function with convert_file_like=True to infer the compression. - return cast(FileOrBuffer[AnyStr], filepath_or_buffer) - - if isinstance(filepath_or_buffer, os.PathLike): - filepath_or_buffer = filepath_or_buffer.__fspath__() + if hasattr(filepath_or_buffer, "__fspath__"): + # https://github.com/python/mypy/issues/1424 + return filepath_or_buffer.__fspath__() # type: ignore + elif isinstance(filepath_or_buffer, pathlib.Path): + return str(filepath_or_buffer) return _expand_user(filepath_or_buffer) @@ -206,13 +149,13 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: ) -def _get_filepath_or_buffer( +def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, - encoding: str = "utf-8", - compression: CompressionOptions = None, - mode: str = "r", - storage_options: StorageOptions = None, -) -> IOArgs: + encoding: Optional[str] = None, + compression: Optional[str] = None, + mode: Optional[str] = None, + storage_options: Optional[Dict[str, Any]] = None, +): """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -224,81 +167,27 @@ def _get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional + storage_options: dict, optional + passed on to fsspec, if using it; this is not yet accessed by the public API - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. 
See the fsspec and backend storage implementation - docs for the set of allowed keys and values - - .. versionadded:: 1.2.0 - - ..versionchange:: 1.2.0 - - Returns the dataclass IOArgs. + Returns + ------- + Tuple[FilePathOrBuffer, str, str, bool] + Tuple containing the filepath or buffer, the encoding, the compression + and should_close. """ filepath_or_buffer = stringify_path(filepath_or_buffer) - # handle compression dict - compression_method, compression = get_compression_method(compression) - compression_method = infer_compression(filepath_or_buffer, compression_method) - - # GH21227 internal compression is not used for non-binary handles. - if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode: - warnings.warn( - "compression has no effect when passing a non-binary object as input.", - RuntimeWarning, - stacklevel=2, - ) - compression_method = None - - compression = dict(compression, method=compression_method) - - # uniform encoding names - if encoding is not None: - encoding = encoding.replace("_", "-").lower() - - # bz2 and xz do not write the byte order mark for utf-16 and utf-32 - # print a warning when writing such files - if ( - "w" in mode - and compression_method in ["bz2", "xz"] - and encoding in ["utf-16", "utf-32"] - ): - warnings.warn( - f"{compression} will not write the byte order mark for {encoding}", - UnicodeWarning, - ) - - # Use binary mode when converting path-like objects to file-like objects (fsspec) - # except when text mode is explicitly requested. The original mode is returned if - # fsspec is not used. - fsspec_mode = mode - if "t" not in fsspec_mode and "b" not in fsspec_mode: - fsspec_mode += "b" - if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged - if storage_options: - raise ValueError( - "storage_options passed with file object or non-fsspec file path" - ) req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header - compression = {"method": "gzip"} + compression = "gzip" reader = BytesIO(req.read()) req.close() - return IOArgs( - filepath_or_buffer=reader, - encoding=encoding, - compression=compression, - should_close=True, - mode=fsspec_mode, - ) + return reader, encoding, compression, True if is_fsspec_url(filepath_or_buffer): assert isinstance( @@ -330,7 +219,7 @@ def _get_filepath_or_buffer( try: file_obj = fsspec.open( - filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) ).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True except tuple(err_types_to_retry_with_anon): @@ -341,41 +230,19 @@ def _get_filepath_or_buffer( storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open( - filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) ).open() - return IOArgs( - filepath_or_buffer=file_obj, - encoding=encoding, - compression=compression, - should_close=True, - mode=fsspec_mode, - ) - elif storage_options: - raise ValueError( - "storage_options passed with file object or non-fsspec file path" - ) + return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return IOArgs( - filepath_or_buffer=_expand_user(filepath_or_buffer), - encoding=encoding, 
- compression=compression, - should_close=False, - mode=mode, - ) + return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return IOArgs( - filepath_or_buffer=filepath_or_buffer, - encoding=encoding, - compression=compression, - should_close=False, - mode=mode, - ) + return filepath_or_buffer, None, compression, False def file_path_to_url(path: str) -> str: @@ -400,8 +267,8 @@ _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": def get_compression_method( - compression: CompressionOptions, -) -> Tuple[Optional[str], CompressionDict]: + compression: Optional[Union[str, Mapping[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: """ Simplifies a compression argument to a compression method string and a mapping containing additional arguments. @@ -415,23 +282,21 @@ def get_compression_method( Returns ------- tuple of ({compression method}, Optional[str] - {compression arguments}, Dict[str, Any]) + {compression arguments}, Dict[str, str]) Raises ------ ValueError on mapping missing 'method' key """ - compression_method: Optional[str] if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression_method = compression_args.pop("method") + compression = compression_args.pop("method") except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: compression_args = {} - compression_method = compression - return compression_method, compression_args + return compression, compression_args def infer_compression( @@ -460,13 +325,14 @@ def infer_compression( ------ ValueError on invalid compression specified. """ + # No compression has been explicitly specified if compression is None: return None # Infer compression if compression == "infer": # Convert all path types (e.g. pathlib.Path) to strings - filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True) + filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): # Cannot infer compression of a buffer, assume no compression return None @@ -488,15 +354,14 @@ def infer_compression( def get_handle( - path_or_buf: FilePathOrBuffer, + path_or_buf, mode: str, - encoding: Optional[str] = None, - compression: CompressionOptions = None, + encoding=None, + compression: Optional[Union[str, Mapping[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, - errors: Optional[str] = None, - storage_options: StorageOptions = None, -) -> IOHandles: + errors=None, +): """ Get file handle for given path/buffer and mode. @@ -532,85 +397,77 @@ def get_handle( memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True - Whether the type of the content passed to the file/buffer is string or - bytes. This is not the same as `"b" not in mode`. If a string content is - passed to a binary file/buffer, a wrapper is inserted. + whether file/buffer is in text format (csv, json, etc.), or in binary + mode (pickle, etc.). errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. See the errors argument for :func:`open` for a full list of options. - storage_options: StorageOptions = None - Passed to _get_filepath_or_buffer - .. versionchanged:: 1.2.0 + .. versionadded:: 1.1.0 - Returns the dataclass IOHandles + Returns + ------- + f : file-like + A file-like object. 
+ handles : list of file-like objects + A list of file-like object that were opened in this function. """ - # Windows does not default to utf-8. Set to utf-8 for a consistent behavior - if encoding is None: - encoding = "utf-8" + need_text_wrapping: Tuple[Type["IOBase"], ...] + try: + from s3fs import S3File - # read_csv does not know whether the buffer is opened in binary/text mode - if _is_binary_mode(path_or_buf, mode) and "b" not in mode: - mode += "b" + need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) + except ImportError: + need_text_wrapping = (BufferedIOBase, RawIOBase) - # open URLs - ioargs = _get_filepath_or_buffer( - path_or_buf, - encoding=encoding, - compression=compression, - mode=mode, - storage_options=storage_options, - ) + handles: List[IO] = list() + f = path_or_buf - handle = ioargs.filepath_or_buffer - handles: List[Buffer] + # Convert pathlib.Path/py.path.local or string + path_or_buf = stringify_path(path_or_buf) + is_path = isinstance(path_or_buf, str) - # memory mapping needs to be the first step - handle, memory_map, handles = _maybe_memory_map( - handle, memory_map, ioargs.encoding, ioargs.mode, errors - ) - - is_path = isinstance(handle, str) - compression_args = dict(ioargs.compression) - compression = compression_args.pop("method") + compression, compression_args = get_compression_method(compression) + if is_path: + compression = infer_compression(path_or_buf, compression) if compression: - # compression libraries do not like an explicit text-mode - ioargs.mode = ioargs.mode.replace("t", "") + + # GH33398 the type ignores here seem related to mypy issue #5382; + # it may be possible to remove them once that is resolved. # GZ Compression if compression == "gzip": if is_path: - assert isinstance(handle, str) - handle = gzip.GzipFile( - filename=handle, - mode=ioargs.mode, - **compression_args, + f = gzip.open( + path_or_buf, mode, **compression_args # type: ignore ) else: - handle = gzip.GzipFile( - fileobj=handle, # type: ignore[arg-type] - mode=ioargs.mode, - **compression_args, + f = gzip.GzipFile( + fileobj=path_or_buf, **compression_args # type: ignore ) # BZ Compression elif compression == "bz2": - handle = bz2.BZ2File( - handle, # type: ignore[arg-type] - mode=ioargs.mode, - **compression_args, - ) + if is_path: + f = bz2.BZ2File( + path_or_buf, mode, **compression_args # type: ignore + ) + else: + f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore # ZIP Compression elif compression == "zip": - handle = _BytesZipFile(handle, ioargs.mode, **compression_args) - if handle.mode == "r": - handles.append(handle) - zip_names = handle.namelist() + zf = _BytesZipFile(path_or_buf, mode, **compression_args) + # Ensure the container is closed as well. + handles.append(zf) + if zf.mode == "w": + f = zf + elif zf.mode == "r": + zip_names = zf.namelist() if len(zip_names) == 1: - handle = handle.open(zip_names.pop()) + f = zf.open(zip_names.pop()) elif len(zip_names) == 0: raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: @@ -621,76 +478,52 @@ def get_handle( # XZ Compression elif compression == "xz": - handle = get_lzma_file(lzma)(handle, ioargs.mode) + f = _get_lzma_file(lzma)(path_or_buf, mode) # Unrecognized Compression else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) - assert not isinstance(handle, str) - handles.append(handle) + handles.append(f) - elif isinstance(handle, str): - # Check whether the filename is to be opened in binary mode. 
- # Binary mode does not support 'encoding' and 'newline'. - if ioargs.encoding and "b" not in ioargs.mode: + elif is_path: + if encoding: # Encoding - handle = open( - handle, - ioargs.mode, - encoding=ioargs.encoding, - errors=errors, - newline="", - ) + f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") + elif is_text: + # No explicit encoding + f = open(path_or_buf, mode, errors="replace", newline="") else: # Binary mode - handle = open(handle, ioargs.mode) - handles.append(handle) + f = open(path_or_buf, mode) + handles.append(f) # Convert BytesIO or file objects passed with an encoding - is_wrapped = False - if is_text and (compression or _is_binary_mode(handle, ioargs.mode)): - handle = TextIOWrapper( - handle, # type: ignore[arg-type] - encoding=ioargs.encoding, - errors=errors, - newline="", - ) - handles.append(handle) - # only marked as wrapped when the caller provided a handle - is_wrapped = not ( - isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close - ) + if is_text and (compression or isinstance(f, need_text_wrapping)): + from io import TextIOWrapper - handles.reverse() # close the most recently added buffer first - if ioargs.should_close: - assert not isinstance(ioargs.filepath_or_buffer, str) - handles.append(ioargs.filepath_or_buffer) + g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") + if not isinstance(f, (BufferedIOBase, RawIOBase)): + handles.append(g) + f = g - assert not isinstance(handle, str) - return IOHandles( - handle=handle, - created_handles=handles, - is_wrapped=is_wrapped, - is_mmap=memory_map, - compression=ioargs.compression, - ) + if memory_map and hasattr(f, "fileno"): + try: + wrapped = _MMapWrapper(f) + f.close() + f = wrapped + except Exception: + # we catch any errors that may have occurred + # because that is consistent with the lower-level + # functionality of the C engine (pd.read_csv), so + # leave the file handler as is then + pass + + return f, handles -# error: Definition of "__exit__" in base class "ZipFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "__enter__" in base class "ZipFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "__enter__" in base class "ZipFile" is incompatible with -# definition in base class "BinaryIO" [misc] -# error: Definition of "__enter__" in base class "ZipFile" is incompatible with -# definition in base class "IO" [misc] -# error: Definition of "read" in base class "ZipFile" is incompatible with -# definition in base class "BytesIO" [misc] -# error: Definition of "read" in base class "ZipFile" is incompatible with -# definition in base class "IO" [misc] -class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc] +class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. 
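A small sketch, not part of the patch, of the compression-mapping path that feeds _BytesZipFile: a dict-valued compression argument must carry a 'method' key, and any extra keys (such as archive_name) are forwarded to the zip writer. The file names below are purely illustrative.

import pandas as pd
from pandas.io.common import get_compression_method

df = pd.DataFrame({"a": [1, 2, 3]})
# extra mapping keys end up as **compression_args for _BytesZipFile
df.to_csv("example.zip", compression={"method": "zip", "archive_name": "data.csv"})
print(get_compression_method({"method": "zip", "archive_name": "data.csv"}))
# -> ('zip', {'archive_name': 'data.csv'})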
@@ -710,13 +543,12 @@ class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc] if mode in ["wb", "rb"]: mode = mode.replace("b", "") self.archive_name = archive_name - kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} - kwargs_zip.update(kwargs) - super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) def write(self, data): - # ZipFile needs a non-empty string - archive_name = self.archive_name or self.filename or "zip" + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name super().writestr(archive_name, data) @property @@ -738,16 +570,9 @@ class _MMapWrapper(abc.Iterator): """ def __init__(self, f: IO): - self.attributes = {} - for attribute in ("seekable", "readable", "writeable"): - if not hasattr(f, attribute): - continue - self.attributes[attribute] = getattr(f, attribute)() self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) def __getattr__(self, name: str): - if name in self.attributes: - return lambda: self.attributes[name] return getattr(self.mmap, name) def __iter__(self) -> "_MMapWrapper": @@ -766,66 +591,3 @@ class _MMapWrapper(abc.Iterator): if newline == "": raise StopIteration return newline - - -def _maybe_memory_map( - handle: FileOrBuffer, - memory_map: bool, - encoding: str, - mode: str, - errors: Optional[str], -) -> Tuple[FileOrBuffer, bool, List[Buffer]]: - """Try to memory map file/buffer.""" - handles: List[Buffer] = [] - memory_map &= hasattr(handle, "fileno") or isinstance(handle, str) - if not memory_map: - return handle, memory_map, handles - - # need to open the file first - if isinstance(handle, str): - if encoding and "b" not in mode: - # Encoding - handle = open(handle, mode, encoding=encoding, errors=errors, newline="") - else: - # Binary mode - handle = open(handle, mode) - handles.append(handle) - - try: - wrapped = cast(mmap.mmap, _MMapWrapper(handle)) # type: ignore[arg-type] - handle.close() - handles.remove(handle) - handles.append(wrapped) - handle = wrapped - except Exception: - # we catch any errors that may have occurred - # because that is consistent with the lower-level - # functionality of the C engine (pd.read_csv), so - # leave the file handler as is then - memory_map = False - - return handle, memory_map, handles - - -def file_exists(filepath_or_buffer: FilePathOrBuffer) -> bool: - """Test whether file exists.""" - exists = False - filepath_or_buffer = stringify_path(filepath_or_buffer) - if not isinstance(filepath_or_buffer, str): - return exists - try: - exists = os.path.exists(filepath_or_buffer) - # gh-5874: if the filepath is too long will raise here - except (TypeError, ValueError): - pass - return exists - - -def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: - """Whether the handle is opened in binary mode""" - # classes that expect bytes - binary_classes = [BufferedIOBase, RawIOBase] - - return isinstance(handle, tuple(binary_classes)) or "b" in getattr( - handle, "mode", mode - ) diff --git a/venv/lib/python3.8/site-packages/pandas/io/date_converters.py b/venv/lib/python3.8/site-packages/pandas/io/date_converters.py index f079a25..07919db 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/date_converters.py +++ b/venv/lib/python3.8/site-packages/pandas/io/date_converters.py @@ -1,46 +1,16 @@ """This module is designed for community supported date conversion functions""" -import warnings - import numpy as np from pandas._libs.tslibs import parsing def 
parse_date_time(date_col, time_col): - """ - Parse columns with dates and times into a single datetime column. - - .. deprecated:: 1.2 - """ - warnings.warn( - """ - Use pd.to_datetime(date_col + " " + time_col) instead to get a Pandas Series. - Use pd.to_datetime(date_col + " " + time_col).to_pydatetime() instead to get a Numpy array. -""", # noqa: E501 - FutureWarning, - stacklevel=2, - ) date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) return parsing.try_parse_date_and_time(date_col, time_col) def parse_date_fields(year_col, month_col, day_col): - """ - Parse columns with years, months and days into a single date column. - - .. deprecated:: 1.2 - """ - warnings.warn( - """ - Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) instead to get a Pandas Series. - Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) and - np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. -""", # noqa: E501 - FutureWarning, - stacklevel=2, - ) - year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) @@ -48,24 +18,6 @@ def parse_date_fields(year_col, month_col, day_col): def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col): - """ - Parse columns with datetime information into a single datetime column. - - .. deprecated:: 1.2 - """ - - warnings.warn( - """ - Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col, - "hour": hour_col, "minute": minute_col, second": second_col}) instead to get a Pandas Series. - Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col, - "hour": hour_col, "minute": minute_col, second": second_col}) and - np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. -""", # noqa: E501 - FutureWarning, - stacklevel=2, - ) - year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) @@ -78,20 +30,6 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_ def generic_parser(parse_func, *cols): - """ - Use dateparser to parse columns with data information into a single datetime column. - - .. deprecated:: 1.2 - """ - - warnings.warn( - """ - Use pd.to_datetime instead. 
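The deprecation notices stripped from date_converters in this hunk all point at plain pd.to_datetime equivalents; a short sketch, not part of the patch, with made-up column data:

import numpy as np
import pandas as pd

dates = pd.Series(["2020-01-01", "2020-01-02"])
times = pd.Series(["10:00", "11:30"])
combined = pd.to_datetime(dates + " " + times)          # replaces parse_date_time
fields = pd.to_datetime({"year": [2020, 2020], "month": [1, 1], "day": [1, 2]})  # replaces parse_date_fields
as_pydatetime = np.array([ts.to_pydatetime() for ts in combined])  # if a NumPy array is needed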
-""", - FutureWarning, - stacklevel=2, - ) - N = _check_columns(cols) results = np.empty(N, dtype=object) diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/__init__.py b/venv/lib/python3.8/site-packages/pandas/io/excel/__init__.py index 3bad493..d035223 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/__init__.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/__init__.py @@ -1,9 +1,9 @@ from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel -from pandas.io.excel._odswriter import ODSWriter as _ODSWriter -from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter +from pandas.io.excel._odswriter import _ODSWriter +from pandas.io.excel._openpyxl import _OpenpyxlWriter from pandas.io.excel._util import register_writer -from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter -from pandas.io.excel._xlwt import XlwtWriter as _XlwtWriter +from pandas.io.excel._xlsxwriter import _XlsxWriter +from pandas.io.excel._xlwt import _XlwtWriter __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_base.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_base.py index 221e8b9..b1bbda4 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_base.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_base.py @@ -1,34 +1,33 @@ import abc import datetime -from distutils.version import LooseVersion -import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import IO, Any, Dict, Mapping, Optional, Union, cast -import warnings -import zipfile +from typing import Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import Buffer, FilePathOrBuffer, StorageOptions -from pandas.compat._optional import import_optional_dependency from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments, doc +from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments from pandas.core.dtypes.common import is_bool, is_float, is_integer, is_list_like from pandas.core.frame import DataFrame -from pandas.core.shared_docs import _shared_docs -from pandas.io.common import IOHandles, get_handle, stringify_path, validate_header_arg +from pandas.io.common import ( + get_filepath_or_buffer, + is_url, + stringify_path, + urlopen, + validate_header_arg, +) from pandas.io.excel._util import ( - fill_mi_header, - get_default_writer, + _fill_mi_header, + _get_default_writer, + _maybe_convert_usecols, + _pop_header_name, get_writer, - maybe_convert_usecols, - pop_header_name, ) from pandas.io.parsers import TextParser @@ -50,7 +49,7 @@ io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object If you want to pass in a path object, pandas accepts any ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) + such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. sheet_name : str, int, list, or None, default 0 Strings are used for sheet names. Integers are used in zero-indexed @@ -105,30 +104,12 @@ dtype : Type name or dict of column -> type, default None of dtype conversion. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb", default "xlrd". 
Engine compatibility : - - - "xlrd" supports old-style Excel files (.xls). + - "xlrd" supports most old/new Excel file formats. - "openpyxl" supports newer Excel file formats. - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - "pyxlsb" supports Binary Excel files. - - .. versionchanged:: 1.2.0 - The engine `xlrd `_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This - case will raise a ``ValueError`` in a future version of pandas. - converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -138,14 +119,13 @@ true_values : list, default None Values to consider as True. false_values : list, default None Values to consider as False. -skiprows : list-like, int, or callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (int) at the - start of the file. If callable, the callable function will be evaluated - against the row indices, returning True if the row should be skipped and - False otherwise. An example of a valid callable argument would be ``lambda - x: x in [0, 2]``. +skiprows : list-like + Rows to skip at the beginning (0-indexed). nrows : int, default None Number of rows to parse. + + .. versionadded:: 0.23.0 + na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted @@ -219,15 +199,6 @@ mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. -storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values. - - .. 
versionadded:: 1.2.0 Returns ------- @@ -327,70 +298,61 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - storage_options: StorageOptions = None, ): - should_close = False if not isinstance(io, ExcelFile): - should_close = True - io = ExcelFile(io, storage_options=storage_options, engine=engine) + io = ExcelFile(io, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " "an ExcelFile - ExcelFile already has the engine set" ) - try: - data = io.parse( - sheet_name=sheet_name, - header=header, - names=names, - index_col=index_col, - usecols=usecols, - squeeze=squeeze, - dtype=dtype, - converters=converters, - true_values=true_values, - false_values=false_values, - skiprows=skiprows, - nrows=nrows, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - verbose=verbose, - parse_dates=parse_dates, - date_parser=date_parser, - thousands=thousands, - comment=comment, - skipfooter=skipfooter, - convert_float=convert_float, - mangle_dupe_cols=mangle_dupe_cols, - ) - finally: - # make sure to close opened file handles - if should_close: - io.close() - return data + return io.parse( + sheet_name=sheet_name, + header=header, + names=names, + index_col=index_col, + usecols=usecols, + squeeze=squeeze, + dtype=dtype, + converters=converters, + true_values=true_values, + false_values=false_values, + skiprows=skiprows, + nrows=nrows, + na_values=na_values, + keep_default_na=keep_default_na, + na_filter=na_filter, + verbose=verbose, + parse_dates=parse_dates, + date_parser=date_parser, + thousands=thousands, + comment=comment, + skipfooter=skipfooter, + convert_float=convert_float, + mangle_dupe_cols=mangle_dupe_cols, + ) -class BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): - self.handles = IOHandles( - handle=filepath_or_buffer, compression={"method": None} - ) - if not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - self.handles = get_handle( - filepath_or_buffer, "rb", storage_options=storage_options, is_text=False - ) +class _BaseExcelReader(metaclass=abc.ABCMeta): + def __init__(self, filepath_or_buffer): + # If filepath_or_buffer is a url, load the data into a BytesIO + if is_url(filepath_or_buffer): + filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) + elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) - if isinstance(self.handles.handle, self._workbook_class): - self.book = self.handles.handle - elif hasattr(self.handles.handle, "read"): + if isinstance(filepath_or_buffer, self._workbook_class): + self.book = filepath_or_buffer + elif hasattr(filepath_or_buffer, "read"): # N.B. xlrd.Book has a read attribute too - self.handles.handle.seek(0) - self.book = self.load_workbook(self.handles.handle) - elif isinstance(self.handles.handle, bytes): - self.book = self.load_workbook(BytesIO(self.handles.handle)) + filepath_or_buffer.seek(0) + self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, str): + self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." 
@@ -406,7 +368,7 @@ class BaseExcelReader(metaclass=abc.ABCMeta): pass def close(self): - self.handles.close() + pass @property @abc.abstractmethod @@ -479,7 +441,7 @@ class BaseExcelReader(metaclass=abc.ABCMeta): sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) - usecols = maybe_convert_usecols(usecols) + usecols = _maybe_convert_usecols(usecols) if not data: output[asheetname] = DataFrame() @@ -498,10 +460,10 @@ class BaseExcelReader(metaclass=abc.ABCMeta): if is_integer(skiprows): row += skiprows - data[row], control_row = fill_mi_header(data[row], control_row) + data[row], control_row = _fill_mi_header(data[row], control_row) if index_col is not None: - header_name, _ = pop_header_name(data[row], index_col) + header_name, _ = _pop_header_name(data[row], index_col) header_names.append(header_name) if is_list_like(index_col): @@ -577,39 +539,23 @@ class ExcelWriter(metaclass=abc.ABCMeta): Default is to use xlwt for xls, openpyxl for xlsx, odf for ods. See DataFrame.to_excel for typical usage. - The writer should be used as a context manager. Otherwise, call `close()` to save - and close any opened file handles. - Parameters ---------- - path : str or typing.BinaryIO + path : str Path to xls or xlsx or ods file. engine : str (optional) Engine to use for writing. If None, defaults to ``io.excel..writer``. NOTE: can only be passed as a keyword argument. - - .. deprecated:: 1.2.0 - - As the `xlwt `__ package is no longer - maintained, the ``xlwt`` engine will be removed in a future - version of pandas. - date_format : str, default None Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). datetime_format : str, default None Format string for datetime objects written into Excel files. (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' - File mode to use (write or append). Append does not work with fsspec URLs. + File mode to use (write or append). .. versionadded:: 0.24.0 - storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". - - .. versionadded:: 1.2.0 Attributes ---------- @@ -642,29 +588,14 @@ class ExcelWriter(metaclass=abc.ABCMeta): You can set the date format or datetime format: >>> with ExcelWriter('path_to_file.xlsx', - ... date_format='YYYY-MM-DD', - ... datetime_format='YYYY-MM-DD HH:MM:SS') as writer: + date_format='YYYY-MM-DD', + datetime_format='YYYY-MM-DD HH:MM:SS') as writer: ... df.to_excel(writer) You can also append to an existing Excel file: >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: ... df.to_excel(writer, sheet_name='Sheet3') - - You can store Excel file in RAM: - - >>> import io - >>> buffer = io.BytesIO() - >>> with pd.ExcelWriter(buffer) as writer: - ... df.to_excel(writer) - - You can pack Excel file into zip archive: - - >>> import zipfile - >>> with zipfile.ZipFile('path_to_file.zip', 'w') as zf: - ... with zf.open('filename.xlsx', 'w') as buffer: - ... with pd.ExcelWriter(buffer) as writer: - ... df.to_excel(writer) """ # Defining an ExcelWriter implementation (see abstract methods for more...) 
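Following the engine guidance in the read_excel docstring earlier in this file: the two sides of this hunk document different defaults (auto-detection versus a plain xlrd default), so passing engine explicitly is the unambiguous route either way. A sketch, not part of the patch, with hypothetical file names and the usual optional dependencies assumed installed:

import pandas as pd

new_style = pd.read_excel("report.xlsx", engine="openpyxl")   # newer workbook formats
old_style = pd.read_excel("legacy.xls", engine="xlrd")        # old-style .xls files
spreadsheet = pd.read_excel("budget.ods", engine="odf")       # OpenDocument spreadsheets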
@@ -699,36 +630,17 @@ class ExcelWriter(metaclass=abc.ABCMeta): ext = "xlsx" try: - engine = config.get_option(f"io.excel.{ext}.writer", silent=True) + engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": - engine = get_default_writer(ext) + engine = _get_default_writer(ext) except KeyError as err: raise ValueError(f"No engine for filetype: '{ext}'") from err - - if engine == "xlwt": - xls_config_engine = config.get_option( - "io.excel.xls.writer", silent=True - ) - # Don't warn a 2nd time if user has changed the default engine for xls - if xls_config_engine != "xlwt": - warnings.warn( - "As the xlwt package is no longer maintained, the xlwt " - "engine will be removed in a future version of pandas. " - "This is the only engine in pandas that supports writing " - "in the xls format. Install openpyxl and write to an xlsx " - "file instead. You can set the option io.excel.xls.writer " - "to 'xlwt' to silence this warning. While this option is " - "deprecated and will also raise a warning, it can " - "be globally set and the warning suppressed.", - FutureWarning, - stacklevel=4, - ) - cls = get_writer(engine) return object.__new__(cls) # declare external properties you can count on + book = None curr_sheet = None path = None @@ -773,12 +685,11 @@ class ExcelWriter(metaclass=abc.ABCMeta): def __init__( self, - path: Union[FilePathOrBuffer, "ExcelWriter"], + path, engine=None, date_format=None, datetime_format=None, - mode: str = "w", - storage_options: StorageOptions = None, + mode="w", **engine_kwargs, ): # validate that this engine can handle the extension @@ -786,20 +697,8 @@ class ExcelWriter(metaclass=abc.ABCMeta): ext = os.path.splitext(path)[-1] self.check_extension(ext) - # use mode to open the file - if "b" not in mode: - mode += "b" - # use "a" for the user to append data to excel but internally use "r+" to let - # the excel backend first read the existing file and then write any data to it - mode = mode.replace("a", "r+") - - # cast ExcelWriter to avoid adding 'if self.handles is not None' - self.handles = IOHandles(cast(Buffer, path), compression={"copression": None}) - if not isinstance(path, ExcelWriter): - self.handles = get_handle( - path, mode, storage_options=storage_options, is_text=False - ) - self.sheets: Dict[str, Any] = {} + self.path = path + self.sheets = {} self.cur_sheet = None if date_format is None: @@ -814,7 +713,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): self.mode = mode def __fspath__(self): - return getattr(self.handles.handle, "name", "") + return stringify_path(self.path) def _get_sheet_name(self, sheet_name): if sheet_name is None: @@ -858,19 +757,14 @@ class ExcelWriter(metaclass=abc.ABCMeta): return val, fmt @classmethod - def check_extension(cls, ext: str): + def check_extension(cls, ext): """ checks that path's extension against the Writer's supported extensions. If it isn't supported, raises UnsupportedFiletypeError. 
""" if ext.startswith("."): ext = ext[1:] - # error: "Callable[[ExcelWriter], Any]" has no attribute "__iter__" - # (not iterable) [attr-defined] - if not any( - ext in extension - for extension in cls.supported_extensions # type: ignore[attr-defined] - ): + if not any(ext in extension for extension in cls.supported_extensions): raise ValueError(f"Invalid extension for engine '{cls.engine}': '{ext}'") else: return True @@ -884,97 +778,42 @@ class ExcelWriter(metaclass=abc.ABCMeta): def close(self): """synonym for save, to make it more file-like""" - content = self.save() - self.handles.close() - return content + return self.save() -XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" -ZIP_SIGNATURE = b"PK\x03\x04" -PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE)) - - -@doc(storage_options=_shared_docs["storage_options"]) -def inspect_excel_format( - path: Optional[str] = None, - content: Union[None, BufferedIOBase, RawIOBase, bytes] = None, - storage_options: StorageOptions = None, -) -> str: +def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: """ - Inspect the path or content of an excel file and get its format. + Check if the stream is an OpenDocument Spreadsheet (.ods) file - At least one of path or content must be not None. If both are not None, - content will take precedence. - - Adopted from xlrd: https://github.com/python-excel/xlrd. + It uses magic values inside the stream Parameters ---------- - path : str, optional - Path to file to inspect. May be a URL. - content : file-like object, optional - Content of file to inspect. - {storage_options} + stream : Union[BufferedIOBase, RawIOBase] + IO stream with data which might be an ODS file Returns ------- - str - Format of file. - - Raises - ------ - ValueError - If resulting stream is empty. - BadZipFile - If resulting stream does not have an XLS signature and is not a valid zipfile. + is_ods : bool + Boolean indication that this is indeed an ODS file or not """ - content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]] - if isinstance(content, bytes): - content_or_path = BytesIO(content) - else: - content_or_path = content or path - assert content_or_path is not None - - with get_handle( - content_or_path, "rb", storage_options=storage_options, is_text=False - ) as handle: - stream = handle.handle - stream.seek(0) - buf = stream.read(PEEK_SIZE) - if buf is None: - raise ValueError("stream is empty") - else: - assert isinstance(buf, bytes) - peek = buf - stream.seek(0) - - if peek.startswith(XLS_SIGNATURE): - return "xls" - elif not peek.startswith(ZIP_SIGNATURE): - raise ValueError("File is not a recognized excel file") - - # ZipFile typing is overly-strict - # https://github.com/python/typeshed/issues/4212 - zf = zipfile.ZipFile(stream) # type: ignore[arg-type] - - # Workaround for some third party files that use forward slashes and - # lower case names. - component_names = [name.replace("\\", "/").lower() for name in zf.namelist()] - - if "xl/workbook.xml" in component_names: - return "xlsx" - if "xl/workbook.bin" in component_names: - return "xlsb" - if "content.xml" in component_names: - return "ods" - return "zip" + stream.seek(0) + is_ods = False + if stream.read(4) == b"PK\003\004": + stream.seek(30) + is_ods = ( + stream.read(54) == b"mimetype" + b"application/vnd.oasis.opendocument.spreadsheet" + ) + stream.seek(0) + return is_ods class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. - See read_excel for more documentation. + Uses xlrd engine by default. 
See read_excel for more documentation Parameters ---------- @@ -984,134 +823,48 @@ class ExcelFile: .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. engine : str, default None If io is not a buffer or path, this must be set to identify io. - Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb`` + Supported engines: ``xlrd``, ``openpyxl``, ``odf``, ``pyxlsb``, + default ``xlrd``. Engine compatibility : - - - ``xlrd`` supports old-style Excel files (.xls). + - ``xlrd`` supports most old/new Excel file formats. - ``openpyxl`` supports newer Excel file formats. - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). - ``pyxlsb`` supports Binary Excel files. - - .. versionchanged:: 1.2.0 - - The engine `xlrd `_ - now only supports old-style ``.xls`` files. - When ``engine=None``, the following logic will be - used to determine the engine: - - - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt), - then `odf `_ will be used. - - Otherwise if ``path_or_buffer`` is an xls format, - ``xlrd`` will be used. - - Otherwise if `openpyxl `_ is installed, - then ``openpyxl`` will be used. - - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised. - - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. - This case will raise a ``ValueError`` in a future version of pandas. - - .. warning:: - - Please do not report issues when using ``xlrd`` to read ``.xlsx`` files. - This is not supported, switch to using ``openpyxl`` instead. """ - from pandas.io.excel._odfreader import ODFReader - from pandas.io.excel._openpyxl import OpenpyxlReader - from pandas.io.excel._pyxlsb import PyxlsbReader - from pandas.io.excel._xlrd import XlrdReader + from pandas.io.excel._odfreader import _ODFReader + from pandas.io.excel._openpyxl import _OpenpyxlReader + from pandas.io.excel._pyxlsb import _PyxlsbReader + from pandas.io.excel._xlrd import _XlrdReader - _engines: Mapping[str, Any] = { - "xlrd": XlrdReader, - "openpyxl": OpenpyxlReader, - "odf": ODFReader, - "pyxlsb": PyxlsbReader, + _engines = { + "xlrd": _XlrdReader, + "openpyxl": _OpenpyxlReader, + "odf": _ODFReader, + "pyxlsb": _PyxlsbReader, } - def __init__( - self, path_or_buffer, engine=None, storage_options: StorageOptions = None - ): - if engine is not None and engine not in self._engines: + def __init__(self, path_or_buffer, engine=None): + if engine is None: + engine = "xlrd" + if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + if _is_ods_stream(path_or_buffer): + engine = "odf" + else: + ext = os.path.splitext(str(path_or_buffer))[-1] + if ext == ".ods": + engine = "odf" + if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") + self.engine = engine + # Could be a str, ExcelFile, Book, etc. 
self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - # Determine xlrd version if installed - if ( - import_optional_dependency( - "xlrd", raise_on_missing=False, on_version="ignore" - ) - is None - ): - xlrd_version = None - else: - import xlrd - - xlrd_version = LooseVersion(xlrd.__version__) - - if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)): - ext = inspect_excel_format( - content=path_or_buffer, storage_options=storage_options - ) - elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: - # path_or_buffer is path-like, use stringified path - ext = inspect_excel_format( - path=str(self._io), storage_options=storage_options - ) - - if engine is None: - if ext == "ods": - engine = "odf" - elif ext == "xls": - engine = "xlrd" - else: - # GH 35029 - Prefer openpyxl except for xls files - if ( - import_optional_dependency( - "openpyxl", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - engine = "openpyxl" - else: - engine = "xlrd" - - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": - raise ValueError( - f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " - f"only the xls format is supported. Install openpyxl instead." - ) - else: - caller = inspect.stack()[1] - if ( - caller.filename.endswith( - os.path.join("pandas", "io", "excel", "_base.py") - ) - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 - warnings.warn( - f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " - f"only the xls format is supported. As a result, the " - f"openpyxl engine will be used if it is installed and the " - f"engine argument is not specified. Install " - f"openpyxl instead.", - FutureWarning, - stacklevel=stacklevel, - ) - assert engine in self._engines, f"Engine {engine} not recognized" - - self.engine = engine - self.storage_options = storage_options - - self._reader = self._engines[engine](self._io, storage_options=storage_options) + self._reader = self._engines[engine](self._io) def __fspath__(self): return self._io diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_odfreader.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_odfreader.py index c5c3927..40e2665 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_odfreader.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_odfreader.py @@ -2,33 +2,27 @@ from typing import List, cast import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency import pandas as pd -from pandas.io.excel._base import BaseExcelReader +from pandas.io.excel._base import _BaseExcelReader -class ODFReader(BaseExcelReader): +class _ODFReader(_BaseExcelReader): """ Read tables out of OpenDocument formatted files. Parameters ---------- - filepath_or_buffer : string, path to be parsed or + filepath_or_buffer: string, path to be parsed or an open readable stream. 
- storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ - def __init__( - self, - filepath_or_buffer: FilePathOrBuffer, - storage_options: StorageOptions = None, - ): + def __init__(self, filepath_or_buffer: FilePathOrBuffer): import_optional_dependency("odf") - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): @@ -69,7 +63,6 @@ class ODFReader(BaseExcelReader): if table.getAttribute("name") == name: return table - self.close() raise ValueError(f"sheet {name} not found") def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: @@ -191,7 +184,6 @@ class ODFReader(BaseExcelReader): result = cast(pd.Timestamp, result) return result.time() else: - self.close() raise ValueError(f"Unrecognized type {cell_type}") def _get_cell_string_value(self, cell) -> str: diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_odswriter.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_odswriter.py index 0bea19b..0131240 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_odswriter.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_odswriter.py @@ -3,24 +3,18 @@ import datetime from typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union import pandas._libs.json as json -from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import validate_freeze_panes +from pandas.io.excel._util import _validate_freeze_panes from pandas.io.formats.excel import ExcelCell -class ODSWriter(ExcelWriter): +class _ODSWriter(ExcelWriter): engine = "odf" supported_extensions = (".ods",) def __init__( - self, - path: str, - engine: Optional[str] = None, - mode: str = "w", - storage_options: StorageOptions = None, - **engine_kwargs, + self, path: str, engine: Optional[str] = None, mode: str = "w", **engine_kwargs ): from odf.opendocument import OpenDocumentSpreadsheet @@ -29,11 +23,9 @@ class ODSWriter(ExcelWriter): if mode == "a": raise ValueError("Append mode is not supported with odf!") - super().__init__( - path, mode=mode, storage_options=storage_options, **engine_kwargs - ) + super().__init__(path, mode=mode, **engine_kwargs) - self.book = OpenDocumentSpreadsheet() + self.book: OpenDocumentSpreadsheet = OpenDocumentSpreadsheet() self._style_dict: Dict[str, str] = {} def save(self) -> None: @@ -42,7 +34,7 @@ class ODSWriter(ExcelWriter): """ for sheet in self.sheets.values(): self.book.spreadsheet.addElement(sheet) - self.book.save(self.handles.handle) + self.book.save(self.path) def write_cells( self, @@ -50,7 +42,7 @@ class ODSWriter(ExcelWriter): sheet_name: Optional[str] = None, startrow: int = 0, startcol: int = 0, - freeze_panes: Optional[Tuple[int, int]] = None, + freeze_panes: Optional[List] = None, ) -> None: """ Write the frame cells using odf @@ -67,7 +59,7 @@ class ODSWriter(ExcelWriter): wks = Table(name=sheet_name) self.sheets[sheet_name] = wks - if validate_freeze_panes(freeze_panes): + if _validate_freeze_panes(freeze_panes): assert freeze_panes is not None self._create_freeze_panes(sheet_name, freeze_panes) @@ -182,7 +174,7 @@ class ODSWriter(ExcelWriter): Returns ------- style_key : str - Unique style key for later reference in sheet + Unique style key for for later reference in sheet """ from odf.style import ( ParagraphProperties, @@ -223,17 +215,14 @@ class ODSWriter(ExcelWriter): self.book.styles.addElement(odf_style) return name - def 
_create_freeze_panes( - self, sheet_name: str, freeze_panes: Tuple[int, int] - ) -> None: - """ - Create freeze panes in the sheet. + def _create_freeze_panes(self, sheet_name: str, freeze_panes: List[int]) -> None: + """Create freeze panes in the sheet Parameters ---------- sheet_name : str Name of the spreadsheet - freeze_panes : tuple of (int, int) + freeze_panes : list Freeze pane location x and y """ from odf.config import ( diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_openpyxl.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_openpyxl.py index 7de958d..03a30cb 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_openpyxl.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_openpyxl.py @@ -1,57 +1,74 @@ -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import List import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import BaseExcelReader, ExcelWriter -from pandas.io.excel._util import validate_freeze_panes - -if TYPE_CHECKING: - from openpyxl.descriptors.serialisable import Serialisable +from pandas.io.excel._base import ExcelWriter, _BaseExcelReader +from pandas.io.excel._util import _validate_freeze_panes -class OpenpyxlWriter(ExcelWriter): +class _OpenpyxlWriter(ExcelWriter): engine = "openpyxl" supported_extensions = (".xlsx", ".xlsm") - def __init__( - self, - path, - engine=None, - mode: str = "w", - storage_options: StorageOptions = None, - **engine_kwargs, - ): + def __init__(self, path, engine=None, mode="w", **engine_kwargs): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook - super().__init__( - path, mode=mode, storage_options=storage_options, **engine_kwargs - ) + super().__init__(path, mode=mode, **engine_kwargs) - # ExcelWriter replaced "a" by "r+" to allow us to first read the excel file from - # the file and later write to it - if "r+" in self.mode: # Load from existing workbook + if self.mode == "a": # Load from existing workbook from openpyxl import load_workbook - self.book = load_workbook(self.handles.handle) + book = load_workbook(self.path) + self.book = book else: # Create workbook object with default optimized_write=True. self.book = Workbook() if self.book.worksheets: - self.book.remove(self.book.worksheets[0]) + try: + self.book.remove(self.book.worksheets[0]) + except AttributeError: + + # compat - for openpyxl <= 2.4 + self.book.remove_sheet(self.book.worksheets[0]) def save(self): """ Save workbook to disk. """ - self.book.save(self.handles.handle) + return self.book.save(self.path) @classmethod - def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, "Serialisable"]: + def _convert_to_style(cls, style_dict): + """ + Converts a style_dict to an openpyxl style object. + + Parameters + ---------- + style_dict : style dictionary to convert + """ + from openpyxl.style import Style + + xls_style = Style() + for key, value in style_dict.items(): + for nk, nv in value.items(): + if key == "borders": + ( + xls_style.borders.__getattribute__(nk).__setattr__( + "border_style", nv + ) + ) + else: + xls_style.__getattribute__(key).__setattr__(nk, nv) + + return xls_style + + @classmethod + def _convert_to_style_kwargs(cls, style_dict): """ Convert a style_dict to a set of kwargs suitable for initializing or updating-on-copy an openpyxl v2 style object. 
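[Editor's aside, not part of the patch] The mode="a" branch restored above reopens an existing workbook through load_workbook instead of truncating it, so additional sheets can be appended. A usage sketch; report.xlsx is created by the first call and requires openpyxl:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2, 3]})

    # first write creates the file with a "base" sheet
    df.to_excel("report.xlsx", sheet_name="base", engine="openpyxl")

    # append mode keeps "base" and adds "extra" via the load_workbook branch above
    with pd.ExcelWriter("report.xlsx", engine="openpyxl", mode="a") as writer:
        df.to_excel(writer, sheet_name="extra")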
@@ -76,7 +93,7 @@ class OpenpyxlWriter(ExcelWriter): """ _style_key_map = {"borders": "border"} - style_kwargs: Dict[str, Serialisable] = {} + style_kwargs = {} for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] @@ -387,7 +404,7 @@ class OpenpyxlWriter(ExcelWriter): # Write the frame cells using openpyxl. sheet_name = self._get_sheet_name(sheet_name) - _style_cache: Dict[str, Dict[str, Serialisable]] = {} + _style_cache = {} if sheet_name in self.sheets: wks = self.sheets[sheet_name] @@ -396,7 +413,7 @@ class OpenpyxlWriter(ExcelWriter): wks.title = sheet_name self.sheets[sheet_name] = wks - if validate_freeze_panes(freeze_panes): + if _validate_freeze_panes(freeze_panes): wks.freeze_panes = wks.cell( row=freeze_panes[0] + 1, column=freeze_panes[1] + 1 ) @@ -409,7 +426,7 @@ class OpenpyxlWriter(ExcelWriter): if fmt: xcell.number_format = fmt - style_kwargs: Optional[Dict[str, Serialisable]] = {} + style_kwargs = {} if cell.style: key = str(cell.style) style_kwargs = _style_cache.get(key) @@ -449,12 +466,8 @@ class OpenpyxlWriter(ExcelWriter): setattr(xcell, k, v) -class OpenpyxlReader(BaseExcelReader): - def __init__( - self, - filepath_or_buffer: FilePathOrBuffer, - storage_options: StorageOptions = None, - ) -> None: +class _OpenpyxlReader(_BaseExcelReader): + def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: """ Reader using openpyxl engine. @@ -462,11 +475,9 @@ class OpenpyxlReader(BaseExcelReader): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): @@ -485,7 +496,6 @@ class OpenpyxlReader(BaseExcelReader): # https://stackoverflow.com/questions/31416842/ # openpyxl-does-not-close-excel-workbook-in-read-only-mode self.book.close() - super().close() @property def sheet_names(self) -> List[str]: @@ -499,17 +509,16 @@ class OpenpyxlReader(BaseExcelReader): def _convert_cell(self, cell, convert_float: bool) -> Scalar: - from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC - + # TODO: replace with openpyxl constants if cell.is_date: return cell.value - elif cell.data_type == TYPE_ERROR: + elif cell.data_type == "e": return np.nan - elif cell.data_type == TYPE_BOOL: + elif cell.data_type == "b": return bool(cell.value) elif cell.value is None: return "" # compat with xlrd - elif cell.data_type == TYPE_NUMERIC: + elif cell.data_type == "n": # GH5394 if convert_float: val = int(cell.value) diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_pyxlsb.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_pyxlsb.py index de4f7bb..0d96c8c 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_pyxlsb.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_pyxlsb.py @@ -1,31 +1,25 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import BaseExcelReader +from pandas.io.excel._base import _BaseExcelReader -class PyxlsbReader(BaseExcelReader): - def __init__( - self, - filepath_or_buffer: FilePathOrBuffer, - storage_options: StorageOptions = None, - ): +class _PyxlsbReader(_BaseExcelReader): + def __init__(self, 
filepath_or_buffer: FilePathOrBuffer): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer : str, path object, or Workbook + filepath_or_buffer: str, path object, or Workbook Object to be parsed. - storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_util.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_util.py index 4710591..285aeaf 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_util.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_util.py @@ -1,5 +1,3 @@ -from typing import List - from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_integer, is_list_like @@ -23,7 +21,7 @@ def register_writer(klass): _writers[engine_name] = klass -def get_default_writer(ext): +def _get_default_writer(ext): """ Return the default writer for the given extension. @@ -58,7 +56,7 @@ def get_writer(engine_name): raise ValueError(f"No Excel writer '{engine_name}'") from err -def _excel2num(x: str) -> int: +def _excel2num(x): """ Convert Excel column name like 'AB' to 0-based column index. @@ -90,7 +88,7 @@ def _excel2num(x: str) -> int: return index - 1 -def _range2cols(areas: str) -> List[int]: +def _range2cols(areas): """ Convert comma separated list of column names and ranges to indices. @@ -111,19 +109,19 @@ def _range2cols(areas: str) -> List[int]: >>> _range2cols('A,C,Z:AB') [0, 2, 25, 26, 27] """ - cols: List[int] = [] + cols = [] for rng in areas.split(","): if ":" in rng: - rngs = rng.split(":") - cols.extend(range(_excel2num(rngs[0]), _excel2num(rngs[1]) + 1)) + rng = rng.split(":") + cols.extend(range(_excel2num(rng[0]), _excel2num(rng[1]) + 1)) else: cols.append(_excel2num(rng)) return cols -def maybe_convert_usecols(usecols): +def _maybe_convert_usecols(usecols): """ Convert `usecols` into a compatible format for parsing in `parsers.py`. @@ -152,7 +150,7 @@ def maybe_convert_usecols(usecols): return usecols -def validate_freeze_panes(freeze_panes): +def _validate_freeze_panes(freeze_panes): if freeze_panes is not None: if len(freeze_panes) == 2 and all( isinstance(item, int) for item in freeze_panes @@ -169,7 +167,15 @@ def validate_freeze_panes(freeze_panes): return False -def fill_mi_header(row, control_row): +def _trim_excel_header(row): + # trim header row so auto-index inference works + # xlrd uses '' , openpyxl None + while len(row) > 0 and (row[0] == "" or row[0] is None): + row = row[1:] + return row + + +def _fill_mi_header(row, control_row): """ Forward fill blank entries in row but only inside the same parent index. @@ -202,7 +208,7 @@ def fill_mi_header(row, control_row): return row, control_row -def pop_header_name(row, index_col): +def _pop_header_name(row, index_col): """ Pop the header name for MultiIndex parsing. 
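[Editor's aside, not part of the patch] The usecols helpers in this _util.py hunk rest on the base-26 column arithmetic of _excel2num; a standalone sketch of that arithmetic, consistent with the _range2cols('A,C,Z:AB') doctest above:

    def excel2num(col: str) -> int:
        """'A' -> 0, 'Z' -> 25, 'AB' -> 27 (zero-based column index)."""
        index = 0
        for ch in col.strip().upper():
            index = index * 26 + (ord(ch) - ord("A") + 1)
        return index - 1

    assert excel2num("A") == 0
    assert excel2num("Z") == 25
    assert excel2num("AB") == 27
    assert [excel2num(c) for c in "A,C,Z".split(",")] == [0, 2, 25]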
diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_xlrd.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_xlrd.py index c655db4..af82c15 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_xlrd.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_xlrd.py @@ -2,14 +2,13 @@ from datetime import time import numpy as np -from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import BaseExcelReader +from pandas.io.excel._base import _BaseExcelReader -class XlrdReader(BaseExcelReader): - def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): +class _XlrdReader(_BaseExcelReader): + def __init__(self, filepath_or_buffer): """ Reader using xlrd engine. @@ -17,12 +16,10 @@ class XlrdReader(BaseExcelReader): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options : dict, optional - passed to fsspec for appropriate URLs (see ``_get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_xlsxwriter.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_xlsxwriter.py index d7bbec5..85a1bb0 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_xlsxwriter.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_xlsxwriter.py @@ -1,17 +1,14 @@ -from typing import Dict, List, Tuple - import pandas._libs.json as json -from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import validate_freeze_panes +from pandas.io.excel._util import _validate_freeze_panes class _XlsxStyler: # Map from openpyxl-oriented styles to flatter xlsxwriter representation # Ordering necessary for both determinism and because some are keyed by # prefixes of others. - STYLE_MAPPING: Dict[str, List[Tuple[Tuple[str, ...], str]]] = { + STYLE_MAPPING = { "font": [ (("name",), "font_name"), (("sz",), "font_size"), @@ -159,7 +156,7 @@ class _XlsxStyler: return props -class XlsxWriter(ExcelWriter): +class _XlsxWriter(ExcelWriter): engine = "xlsxwriter" supported_extensions = (".xlsx",) @@ -169,12 +166,11 @@ class XlsxWriter(ExcelWriter): engine=None, date_format=None, datetime_format=None, - mode: str = "w", - storage_options: StorageOptions = None, + mode="w", **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. 
- from xlsxwriter import Workbook + import xlsxwriter if mode == "a": raise ValueError("Append mode is not supported with xlsxwriter!") @@ -185,11 +181,10 @@ class XlsxWriter(ExcelWriter): date_format=date_format, datetime_format=datetime_format, mode=mode, - storage_options=storage_options, **engine_kwargs, ) - self.book = Workbook(self.handles.handle, **engine_kwargs) + self.book = xlsxwriter.Workbook(path, **engine_kwargs) def save(self): """ @@ -211,7 +206,7 @@ class XlsxWriter(ExcelWriter): style_dict = {"null": None} - if validate_freeze_panes(freeze_panes): + if _validate_freeze_panes(freeze_panes): wks.freeze_panes(*(freeze_panes)) for cell in cells: diff --git a/venv/lib/python3.8/site-packages/pandas/io/excel/_xlwt.py b/venv/lib/python3.8/site-packages/pandas/io/excel/_xlwt.py index 9a725c1..78efe77 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/excel/_xlwt.py +++ b/venv/lib/python3.8/site-packages/pandas/io/excel/_xlwt.py @@ -1,28 +1,14 @@ -from typing import TYPE_CHECKING, Dict - import pandas._libs.json as json -from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import validate_freeze_panes - -if TYPE_CHECKING: - from xlwt import XFStyle +from pandas.io.excel._util import _validate_freeze_panes -class XlwtWriter(ExcelWriter): +class _XlwtWriter(ExcelWriter): engine = "xlwt" supported_extensions = (".xls",) - def __init__( - self, - path, - engine=None, - encoding=None, - mode: str = "w", - storage_options: StorageOptions = None, - **engine_kwargs, - ): + def __init__(self, path, engine=None, encoding=None, mode="w", **engine_kwargs): # Use the xlwt module as the Excel writer. import xlwt @@ -31,9 +17,7 @@ class XlwtWriter(ExcelWriter): if mode == "a": raise ValueError("Append mode is not supported with xlwt!") - super().__init__( - path, mode=mode, storage_options=storage_options, **engine_kwargs - ) + super().__init__(path, mode=mode, **engine_kwargs) if encoding is None: encoding = "ascii" @@ -45,13 +29,12 @@ class XlwtWriter(ExcelWriter): """ Save workbook to disk. """ - if self.sheets: - # fails when the ExcelWriter is just opened and then closed - self.book.save(self.handles.handle) + return self.book.save(self.path) def write_cells( self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None ): + # Write the frame cells using xlwt. 
sheet_name = self._get_sheet_name(sheet_name) @@ -61,12 +44,12 @@ class XlwtWriter(ExcelWriter): wks = self.book.add_sheet(sheet_name) self.sheets[sheet_name] = wks - if validate_freeze_panes(freeze_panes): + if _validate_freeze_panes(freeze_panes): wks.set_panes_frozen(True) wks.set_horz_split_pos(freeze_panes[0]) wks.set_vert_split_pos(freeze_panes[1]) - style_dict: Dict[str, XFStyle] = {} + style_dict = {} for cell in cells: val, fmt = self._value_with_fmt(cell.val) @@ -118,14 +101,14 @@ class XlwtWriter(ExcelWriter): f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = f"{line_sep.join(it)} " + out = f"{(line_sep).join(it)} " return out else: it = [ f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = f"{field_sep.join(it)} " + out = f"{(field_sep).join(it)} " return out else: item = f"{item}" diff --git a/venv/lib/python3.8/site-packages/pandas/io/feather_format.py b/venv/lib/python3.8/site-packages/pandas/io/feather_format.py index 4226777..dfa4394 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/feather_format.py +++ b/venv/lib/python3.8/site-packages/pandas/io/feather_format.py @@ -1,24 +1,13 @@ """ feather-format compat """ -from typing import AnyStr - -from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.util._decorators import doc from pandas import DataFrame, Int64Index, RangeIndex -from pandas.core import generic -from pandas.io.common import get_handle +from pandas.io.common import get_filepath_or_buffer, stringify_path -@doc(storage_options=generic._shared_docs["storage_options"]) -def to_feather( - df: DataFrame, - path: FilePathOrBuffer[AnyStr], - storage_options: StorageOptions = None, - **kwargs, -): +def to_feather(df: DataFrame, path, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -26,10 +15,6 @@ def to_feather( ---------- df : DataFrame path : string file path, or file-like object - {storage_options} - - .. versionadded:: 1.2.0 - **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. @@ -38,6 +23,8 @@ def to_feather( import_optional_dependency("pyarrow") from pyarrow import feather + path = stringify_path(path) + if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -74,16 +61,10 @@ def to_feather( if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - with get_handle( - path, "wb", storage_options=storage_options, is_text=False - ) as handles: - feather.write_feather(df, handles.handle, **kwargs) + feather.write_feather(df, path, **kwargs) -@doc(storage_options=generic._shared_docs["storage_options"]) -def read_feather( - path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None -): +def read_feather(path, columns=None, use_threads: bool = True): """ Load a feather-format object from the file path. @@ -99,7 +80,7 @@ def read_feather( ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handle (e.g. via builtin ``open`` function) + such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. columns : sequence, default None If not provided, all columns are read. @@ -109,9 +90,6 @@ def read_feather( Whether to parallelize reading using multiple threads. .. versionadded:: 0.24.0 - {storage_options} - - .. 
versionadded:: 1.2.0 Returns ------- @@ -120,10 +98,12 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - with get_handle( - path, "rb", storage_options=storage_options, is_text=False - ) as handles: + path, _, _, should_close = get_filepath_or_buffer(path) - return feather.read_feather( - handles.handle, columns=columns, use_threads=bool(use_threads) - ) + df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) + + # s3fs only validates the credentials when the file is closed. + if should_close: + path.close() + + return df diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/_color_data.py b/venv/lib/python3.8/site-packages/pandas/io/formats/_color_data.py deleted file mode 100644 index e5b72b2..0000000 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/_color_data.py +++ /dev/null @@ -1,155 +0,0 @@ -# GH37967: Enable the use of CSS named colors, as defined in -# matplotlib.colors.CSS4_COLORS, when exporting to Excel. -# This data has been copied here, instead of being imported from matplotlib, -# not to have ``to_excel`` methods require matplotlib. -# source: matplotlib._color_data (3.3.3) -CSS4_COLORS = { - "aliceblue": "F0F8FF", - "antiquewhite": "FAEBD7", - "aqua": "00FFFF", - "aquamarine": "7FFFD4", - "azure": "F0FFFF", - "beige": "F5F5DC", - "bisque": "FFE4C4", - "black": "000000", - "blanchedalmond": "FFEBCD", - "blue": "0000FF", - "blueviolet": "8A2BE2", - "brown": "A52A2A", - "burlywood": "DEB887", - "cadetblue": "5F9EA0", - "chartreuse": "7FFF00", - "chocolate": "D2691E", - "coral": "FF7F50", - "cornflowerblue": "6495ED", - "cornsilk": "FFF8DC", - "crimson": "DC143C", - "cyan": "00FFFF", - "darkblue": "00008B", - "darkcyan": "008B8B", - "darkgoldenrod": "B8860B", - "darkgray": "A9A9A9", - "darkgreen": "006400", - "darkgrey": "A9A9A9", - "darkkhaki": "BDB76B", - "darkmagenta": "8B008B", - "darkolivegreen": "556B2F", - "darkorange": "FF8C00", - "darkorchid": "9932CC", - "darkred": "8B0000", - "darksalmon": "E9967A", - "darkseagreen": "8FBC8F", - "darkslateblue": "483D8B", - "darkslategray": "2F4F4F", - "darkslategrey": "2F4F4F", - "darkturquoise": "00CED1", - "darkviolet": "9400D3", - "deeppink": "FF1493", - "deepskyblue": "00BFFF", - "dimgray": "696969", - "dimgrey": "696969", - "dodgerblue": "1E90FF", - "firebrick": "B22222", - "floralwhite": "FFFAF0", - "forestgreen": "228B22", - "fuchsia": "FF00FF", - "gainsboro": "DCDCDC", - "ghostwhite": "F8F8FF", - "gold": "FFD700", - "goldenrod": "DAA520", - "gray": "808080", - "green": "008000", - "greenyellow": "ADFF2F", - "grey": "808080", - "honeydew": "F0FFF0", - "hotpink": "FF69B4", - "indianred": "CD5C5C", - "indigo": "4B0082", - "ivory": "FFFFF0", - "khaki": "F0E68C", - "lavender": "E6E6FA", - "lavenderblush": "FFF0F5", - "lawngreen": "7CFC00", - "lemonchiffon": "FFFACD", - "lightblue": "ADD8E6", - "lightcoral": "F08080", - "lightcyan": "E0FFFF", - "lightgoldenrodyellow": "FAFAD2", - "lightgray": "D3D3D3", - "lightgreen": "90EE90", - "lightgrey": "D3D3D3", - "lightpink": "FFB6C1", - "lightsalmon": "FFA07A", - "lightseagreen": "20B2AA", - "lightskyblue": "87CEFA", - "lightslategray": "778899", - "lightslategrey": "778899", - "lightsteelblue": "B0C4DE", - "lightyellow": "FFFFE0", - "lime": "00FF00", - "limegreen": "32CD32", - "linen": "FAF0E6", - "magenta": "FF00FF", - "maroon": "800000", - "mediumaquamarine": "66CDAA", - "mediumblue": "0000CD", - "mediumorchid": "BA55D3", - "mediumpurple": "9370DB", - "mediumseagreen": "3CB371", - "mediumslateblue": 
"7B68EE", - "mediumspringgreen": "00FA9A", - "mediumturquoise": "48D1CC", - "mediumvioletred": "C71585", - "midnightblue": "191970", - "mintcream": "F5FFFA", - "mistyrose": "FFE4E1", - "moccasin": "FFE4B5", - "navajowhite": "FFDEAD", - "navy": "000080", - "oldlace": "FDF5E6", - "olive": "808000", - "olivedrab": "6B8E23", - "orange": "FFA500", - "orangered": "FF4500", - "orchid": "DA70D6", - "palegoldenrod": "EEE8AA", - "palegreen": "98FB98", - "paleturquoise": "AFEEEE", - "palevioletred": "DB7093", - "papayawhip": "FFEFD5", - "peachpuff": "FFDAB9", - "peru": "CD853F", - "pink": "FFC0CB", - "plum": "DDA0DD", - "powderblue": "B0E0E6", - "purple": "800080", - "rebeccapurple": "663399", - "red": "FF0000", - "rosybrown": "BC8F8F", - "royalblue": "4169E1", - "saddlebrown": "8B4513", - "salmon": "FA8072", - "sandybrown": "F4A460", - "seagreen": "2E8B57", - "seashell": "FFF5EE", - "sienna": "A0522D", - "silver": "C0C0C0", - "skyblue": "87CEEB", - "slateblue": "6A5ACD", - "slategray": "708090", - "slategrey": "708090", - "snow": "FFFAFA", - "springgreen": "00FF7F", - "steelblue": "4682B4", - "tan": "D2B48C", - "teal": "008080", - "thistle": "D8BFD8", - "tomato": "FF6347", - "turquoise": "40E0D0", - "violet": "EE82EE", - "wheat": "F5DEB3", - "white": "FFFFFF", - "whitesmoke": "F5F5F5", - "yellow": "FFFF00", - "yellowgreen": "9ACD32", -} diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/console.py b/venv/lib/python3.8/site-packages/pandas/io/formats/console.py index ea291bc..bed29e1 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/console.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/console.py @@ -69,25 +69,21 @@ def in_interactive_session(): return not hasattr(main, "__file__") or get_option("mode.sim_interactive") try: - # pandas\io\formats\console.py:72: error: Name '__IPYTHON__' is not - # defined [name-defined] - return __IPYTHON__ or check_main() # type: ignore[name-defined] + return __IPYTHON__ or check_main() # noqa except NameError: return check_main() def in_ipython_frontend(): """ - Check if we're inside an IPython zmq frontend. + Check if we're inside an an IPython zmq frontend. Returns ------- bool """ try: - # pandas\io\formats\console.py:86: error: Name 'get_ipython' is not - # defined [name-defined] - ip = get_ipython() # type: ignore[name-defined] + ip = get_ipython() # noqa return "zmq" in str(type(ip)).lower() except NameError: pass diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/css.py b/venv/lib/python3.8/site-packages/pandas/io/formats/css.py index 8abe13d..b40d2a5 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/css.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/css.py @@ -3,7 +3,6 @@ Utilities for interpreting CSS from Stylers for formatting non-HTML outputs. """ import re -from typing import Dict, Optional import warnings @@ -12,6 +11,8 @@ class CSSWarning(UserWarning): This CSS syntax cannot currently be parsed. """ + pass + def _side_expander(prop_fmt: str): def expand(self, prop, value: str): @@ -19,7 +20,9 @@ def _side_expander(prop_fmt: str): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn(f'Could not expand "{prop}: {value}"', CSSWarning) + warnings.warn( + f'Could not expand "{prop}: {value}"', CSSWarning, + ) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] @@ -32,6 +35,100 @@ class CSSResolver: A callable for parsing and resolving CSS to atomic properties. 
""" + def __call__(self, declarations_str, inherited=None): + """ + The given declarations to atomic properties. + + Parameters + ---------- + declarations_str : str + A list of CSS declarations + inherited : dict, optional + Atomic properties indicating the inherited style context in which + declarations_str is to be resolved. ``inherited`` should already + be resolved, i.e. valid output of this method. + + Returns + ------- + dict + Atomic CSS 2.2 properties. + + Examples + -------- + >>> resolve = CSSResolver() + >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'} + >>> out = resolve(''' + ... border-color: BLUE RED; + ... font-size: 1em; + ... font-size: 2em; + ... font-weight: normal; + ... font-weight: inherit; + ... ''', inherited) + >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE + [('border-bottom-color', 'blue'), + ('border-left-color', 'red'), + ('border-right-color', 'red'), + ('border-top-color', 'blue'), + ('font-family', 'serif'), + ('font-size', '24pt'), + ('font-weight', 'bold')] + """ + props = dict(self.atomize(self.parse(declarations_str))) + if inherited is None: + inherited = {} + + # 1. resolve inherited, initial + for prop, val in inherited.items(): + if prop not in props: + props[prop] = val + + for prop, val in list(props.items()): + if val == "inherit": + val = inherited.get(prop, "initial") + if val == "initial": + val = None + + if val is None: + # we do not define a complete initial stylesheet + del props[prop] + else: + props[prop] = val + + # 2. resolve relative font size + if props.get("font-size"): + if "font-size" in inherited: + em_pt = inherited["font-size"] + assert em_pt[-2:] == "pt" + em_pt = float(em_pt[:-2]) + else: + em_pt = None + props["font-size"] = self.size_to_pt( + props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS + ) + + font_size = float(props["font-size"][:-2]) + else: + font_size = None + + # 3. TODO: resolve other font-relative units + for side in self.SIDES: + prop = f"border-{side}-width" + if prop in props: + props[prop] = self.size_to_pt( + props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS + ) + for prop in [ + f"margin-{side}", + f"padding-{side}", + ]: + if prop in props: + # TODO: support % + props[prop] = self.size_to_pt( + props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS + ) + + return props + UNIT_RATIOS = { "rem": ("pt", 12), "ex": ("em", 0.5), @@ -76,143 +173,15 @@ class CSSResolver: } ) - SIDE_SHORTHANDS = { - 1: [0, 0, 0, 0], - 2: [0, 1, 0, 1], - 3: [0, 1, 2, 1], - 4: [0, 1, 2, 3], - } - - SIDES = ("top", "right", "bottom", "left") - - def __call__( - self, - declarations_str: str, - inherited: Optional[Dict[str, str]] = None, - ) -> Dict[str, str]: - """ - The given declarations to atomic properties. - - Parameters - ---------- - declarations_str : str - A list of CSS declarations - inherited : dict, optional - Atomic properties indicating the inherited style context in which - declarations_str is to be resolved. ``inherited`` should already - be resolved, i.e. valid output of this method. - - Returns - ------- - dict - Atomic CSS 2.2 properties. - - Examples - -------- - >>> resolve = CSSResolver() - >>> inherited = {'font-family': 'serif', 'font-weight': 'bold'} - >>> out = resolve(''' - ... border-color: BLUE RED; - ... font-size: 1em; - ... font-size: 2em; - ... font-weight: normal; - ... font-weight: inherit; - ... 
''', inherited) - >>> sorted(out.items()) # doctest: +NORMALIZE_WHITESPACE - [('border-bottom-color', 'blue'), - ('border-left-color', 'red'), - ('border-right-color', 'red'), - ('border-top-color', 'blue'), - ('font-family', 'serif'), - ('font-size', '24pt'), - ('font-weight', 'bold')] - """ - props = dict(self.atomize(self.parse(declarations_str))) - if inherited is None: - inherited = {} - - props = self._update_initial(props, inherited) - props = self._update_font_size(props, inherited) - return self._update_other_units(props) - - def _update_initial( - self, - props: Dict[str, str], - inherited: Dict[str, str], - ) -> Dict[str, str]: - # 1. resolve inherited, initial - for prop, val in inherited.items(): - if prop not in props: - props[prop] = val - - new_props = props.copy() - for prop, val in props.items(): - if val == "inherit": - val = inherited.get(prop, "initial") - - if val in ("initial", None): - # we do not define a complete initial stylesheet - del new_props[prop] - else: - new_props[prop] = val - return new_props - - def _update_font_size( - self, - props: Dict[str, str], - inherited: Dict[str, str], - ) -> Dict[str, str]: - # 2. resolve relative font size - if props.get("font-size"): - props["font-size"] = self.size_to_pt( - props["font-size"], - self._get_font_size(inherited), - conversions=self.FONT_SIZE_RATIOS, - ) - return props - - def _get_font_size(self, props: Dict[str, str]) -> Optional[float]: - if props.get("font-size"): - font_size_string = props["font-size"] - return self._get_float_font_size_from_pt(font_size_string) - return None - - def _get_float_font_size_from_pt(self, font_size_string: str) -> float: - assert font_size_string.endswith("pt") - return float(font_size_string.rstrip("pt")) - - def _update_other_units(self, props: Dict[str, str]) -> Dict[str, str]: - font_size = self._get_font_size(props) - # 3. TODO: resolve other font-relative units - for side in self.SIDES: - prop = f"border-{side}-width" - if prop in props: - props[prop] = self.size_to_pt( - props[prop], - em_pt=font_size, - conversions=self.BORDER_WIDTH_RATIOS, - ) - - for prop in [f"margin-{side}", f"padding-{side}"]: - if prop in props: - # TODO: support % - props[prop] = self.size_to_pt( - props[prop], - em_pt=font_size, - conversions=self.MARGIN_RATIOS, - ) - return props - def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): warnings.warn(f"Unhandled size: {repr(in_val)}", CSSWarning) return self.size_to_pt("1!!default", conversions=conversions) - match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val) - if match is None: + try: + val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups() + except AttributeError: return _error() - - val, unit = match.groups() if val == "": # hack for 'large' etc. 
val = 1 @@ -255,6 +224,14 @@ class CSSResolver: for prop, value in expand(prop, value): yield prop, value + SIDE_SHORTHANDS = { + 1: [0, 0, 0, 0], + 2: [0, 1, 0, 1], + 3: [0, 1, 2, 1], + 4: [0, 1, 2, 3], + } + SIDES = ("top", "right", "bottom", "left") + expand_border_color = _side_expander("border-{:s}-color") expand_border_style = _side_expander("border-{:s}-style") expand_border_width = _side_expander("border-{:s}-width") diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/csvs.py b/venv/lib/python3.8/site-packages/pandas/io/formats/csvs.py index 6d14d61..5bd51dc 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/csvs.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/csvs.py @@ -3,20 +3,16 @@ Module for formatting output data into CSV files. """ import csv as csvlib +from io import StringIO import os -from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Sequence, Union +from typing import Hashable, List, Mapping, Optional, Sequence, Union +import warnings +from zipfile import ZipFile import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import ( - CompressionOptions, - FilePathOrBuffer, - FloatFormatType, - IndexLabel, - Label, - StorageOptions, -) +from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -26,217 +22,179 @@ from pandas.core.dtypes.generic import ( ) from pandas.core.dtypes.missing import notna -from pandas.core.indexes.api import Index - -from pandas.io.common import get_handle - -if TYPE_CHECKING: - from pandas.io.formats.format import DataFrameFormatter +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + infer_compression, +) class CSVFormatter: def __init__( self, - formatter: "DataFrameFormatter", - path_or_buf: FilePathOrBuffer[str] = "", + obj, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, sep: str = ",", - cols: Optional[Sequence[Label]] = None, - index_label: Optional[IndexLabel] = None, + na_rep: str = "", + float_format: Optional[str] = None, + cols=None, + header: Union[bool, Sequence[Hashable]] = True, + index: bool = True, + index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", - compression: CompressionOptions = "infer", + compression: Union[str, Mapping[str, str], None] = "infer", quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, - quotechar: Optional[str] = '"', + quotechar='"', date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, - storage_options: StorageOptions = None, + decimal=".", ): - self.fmt = formatter + self.obj = obj - self.obj = self.fmt.frame + if path_or_buf is None: + path_or_buf = StringIO() - self.filepath_or_buffer = path_or_buf - self.encoding = encoding - self.compression = compression - self.mode = mode - self.storage_options = storage_options + # Extract compression mode as given, if dict + compression, self.compression_args = get_compression_method(compression) + self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( + path_or_buf, encoding=encoding, compression=compression, mode=mode + ) self.sep = sep - self.index_label = self._initialize_index_label(index_label) + self.na_rep = na_rep + self.float_format = float_format + self.decimal = decimal + + self.header = header + self.index = index + self.index_label = index_label + self.mode = mode + if encoding is 
None: + encoding = "utf-8" + self.encoding = encoding self.errors = errors - self.quoting = quoting or csvlib.QUOTE_MINIMAL - self.quotechar = self._initialize_quotechar(quotechar) + self.compression = infer_compression(self.path_or_buf, compression) + + if quoting is None: + quoting = csvlib.QUOTE_MINIMAL + self.quoting = quoting + + if quoting == csvlib.QUOTE_NONE: + # prevents crash in _csv + quotechar = None + self.quotechar = quotechar + self.doublequote = doublequote self.escapechar = escapechar + self.line_terminator = line_terminator or os.linesep + self.date_format = date_format - self.cols = self._initialize_columns(cols) - self.chunksize = self._initialize_chunksize(chunksize) - @property - def na_rep(self) -> str: - return self.fmt.na_rep + self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) - @property - def float_format(self) -> Optional["FloatFormatType"]: - return self.fmt.float_format - - @property - def decimal(self) -> str: - return self.fmt.decimal - - @property - def header(self) -> Union[bool, Sequence[str]]: - return self.fmt.header - - @property - def index(self) -> bool: - return self.fmt.index - - def _initialize_index_label(self, index_label: Optional[IndexLabel]) -> IndexLabel: - if index_label is not False: - if index_label is None: - return self._get_index_label_from_obj() - elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): - # given a string for a DF with Index - return [index_label] - return index_label - - def _get_index_label_from_obj(self) -> List[str]: - if isinstance(self.obj.index, ABCMultiIndex): - return self._get_index_label_multiindex() - else: - return self._get_index_label_flat() - - def _get_index_label_multiindex(self) -> List[str]: - return [name or "" for name in self.obj.index.names] - - def _get_index_label_flat(self) -> List[str]: - index_label = self.obj.index.name - return [""] if index_label is None else [index_label] - - def _initialize_quotechar(self, quotechar: Optional[str]) -> Optional[str]: - if self.quoting != csvlib.QUOTE_NONE: - # prevents crash in _csv - return quotechar - return None - - @property - def has_mi_columns(self) -> bool: - return bool(isinstance(self.obj.columns, ABCMultiIndex)) - - def _initialize_columns(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: - msg = "cannot specify cols with a MultiIndex on the columns" - raise TypeError(msg) + raise TypeError("cannot specify cols with a MultiIndex on the columns") if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols._format_native_types(**self._number_format) + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, + ) else: cols = list(cols) self.obj = self.obj.loc[:, cols] # update columns to include possible multiplicity of dupes - # and make sure cols is just a list of labels - new_cols = self.obj.columns - if isinstance(new_cols, ABCIndexClass): - return new_cols._format_native_types(**self._number_format) - else: - return list(new_cols) - - def _initialize_chunksize(self, chunksize: Optional[int]) -> int: - if chunksize is None: - return (100000 // (len(self.cols) or 1)) or 1 - return int(chunksize) - - @property - def _number_format(self) -> Dict[str, Any]: - """Dictionary used for storing number formatting settings.""" - return { - "na_rep": self.na_rep, - "float_format": self.float_format, - "date_format": self.date_format, - "quoting": self.quoting, - 
"decimal": self.decimal, - } - - @property - def data_index(self) -> Index: - data_index = self.obj.index - if ( - isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and self.date_format is not None - ): - data_index = Index( - [x.strftime(self.date_format) if notna(x) else "" for x in data_index] + # and make sure sure cols is just a list of labels + cols = self.obj.columns + if isinstance(cols, ABCIndexClass): + cols = cols.to_native_types( + na_rep=na_rep, + float_format=float_format, + date_format=date_format, + quoting=self.quoting, ) - return data_index - - @property - def nlevels(self) -> int: - if self.index: - return getattr(self.data_index, "nlevels", 1) else: - return 0 + cols = list(cols) - @property - def _has_aliases(self) -> bool: - return isinstance(self.header, (tuple, list, np.ndarray, ABCIndexClass)) + # save it + self.cols = cols - @property - def _need_to_save_header(self) -> bool: - return bool(self._has_aliases or self.header) + # preallocate data 2d list + ncols = self.obj.shape[-1] + self.data = [None] * ncols - @property - def write_cols(self) -> Sequence[Label]: - if self._has_aliases: - assert not isinstance(self.header, bool) - if len(self.header) != len(self.cols): - raise ValueError( - f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" - ) - else: - return self.header - else: - return self.cols + if chunksize is None: + chunksize = (100000 // (len(self.cols) or 1)) or 1 + self.chunksize = int(chunksize) - @property - def encoded_labels(self) -> List[Label]: - encoded_labels: List[Label] = [] + self.data_index = obj.index + if ( + isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and date_format is not None + ): + from pandas import Index - if self.index and self.index_label: - assert isinstance(self.index_label, Sequence) - encoded_labels = list(self.index_label) + self.data_index = Index( + [x.strftime(date_format) if notna(x) else "" for x in self.data_index] + ) - if not self.has_mi_columns or self._has_aliases: - encoded_labels += list(self.write_cols) - - return encoded_labels + self.nlevels = getattr(self.data_index, "nlevels", 1) + if not index: + self.nlevels = 0 def save(self) -> None: """ Create the writer & save. """ - # apply compression and byte/text conversion - with get_handle( - self.filepath_or_buffer, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=self.compression, - storage_options=self.storage_options, - ) as handles: + # GH21227 internal compression is not used when file-like passed. + if self.compression and hasattr(self.path_or_buf, "write"): + warnings.warn( + "compression has no effect when passing file-like object as input.", + RuntimeWarning, + stacklevel=2, + ) + # when zip compression is called. + is_zip = isinstance(self.path_or_buf, ZipFile) or ( + not hasattr(self.path_or_buf, "write") and self.compression == "zip" + ) + + if is_zip: + # zipfile doesn't support writing string to archive. uses string + # buffer to receive csv writing and dump into zip compression + # file handle. 
GH21241, GH21118 + f = StringIO() + close = False + elif hasattr(self.path_or_buf, "write"): + f = self.path_or_buf + close = False + else: + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=dict(self.compression_args, method=self.compression), + ) + close = True + + try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( - handles.handle, # type: ignore[arg-type] + f, lineterminator=self.line_terminator, delimiter=self.sep, quoting=self.quoting, @@ -247,56 +205,158 @@ class CSVFormatter: self._save() - def _save(self) -> None: - if self._need_to_save_header: - self._save_header() - self._save_body() + finally: + if is_zip: + # GH17778 handles zip compression separately. + buf = f.getvalue() + if hasattr(self.path_or_buf, "write"): + self.path_or_buf.write(buf) + else: + compression = dict(self.compression_args, method=self.compression) - def _save_header(self) -> None: - if not self.has_mi_columns or self._has_aliases: - self.writer.writerow(self.encoded_labels) + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=compression, + ) + f.write(buf) + close = True + if close: + f.close() + for _fh in handles: + _fh.close() + elif self.should_close: + f.close() + + def _save_header(self): + writer = self.writer + obj = self.obj + index_label = self.index_label + cols = self.cols + has_mi_columns = self.has_mi_columns + header = self.header + encoded_labels: List[str] = [] + + has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) + if not (has_aliases or self.header): + return + if has_aliases: + if len(header) != len(cols): + raise ValueError( + f"Writing {len(cols)} cols but got {len(header)} aliases" + ) + else: + write_cols = header else: - for row in self._generate_multiindex_header_rows(): - self.writer.writerow(row) + write_cols = cols - def _generate_multiindex_header_rows(self) -> Iterator[List[Label]]: - columns = self.obj.columns - for i in range(columns.nlevels): - # we need at least 1 index column to write our col names - col_line = [] - if self.index: - # name is the first column - col_line.append(columns.names[i]) + if self.index: + # should write something for index label + if index_label is not False: + if index_label is None: + if isinstance(obj.index, ABCMultiIndex): + index_label = [] + for i, name in enumerate(obj.index.names): + if name is None: + name = "" + index_label.append(name) + else: + index_label = obj.index.name + if index_label is None: + index_label = [""] + else: + index_label = [index_label] + elif not isinstance( + index_label, (list, tuple, np.ndarray, ABCIndexClass) + ): + # given a string for a DF with Index + index_label = [index_label] - if isinstance(self.index_label, list) and len(self.index_label) > 1: - col_line.extend([""] * (len(self.index_label) - 1)) + encoded_labels = list(index_label) + else: + encoded_labels = [] - col_line.extend(columns._get_level_values(i)) - yield col_line + if not has_mi_columns or has_aliases: + encoded_labels += list(write_cols) + writer.writerow(encoded_labels) + else: + # write out the mi + columns = obj.columns - # Write out the index line if it's not empty. - # Otherwise, we will print out an extraneous - # blank line between the mi and the data rows. 
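[Editor's aside, not part of the patch] The save() path restored in this hunk only applies compression when it opens the target itself; handing it an already-open file object triggers the RuntimeWarning quoted above and plain text is written. A small sketch, with out.csv.gz and out.csv as hypothetical paths:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

    # path given: CSVFormatter opens the handle and gzip-compresses the output
    df.to_csv("out.csv.gz", compression="gzip")

    # open handle given: compression is ignored with a RuntimeWarning
    with open("out.csv", "w") as fh:
        df.to_csv(fh, compression="gzip")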
- if self.encoded_labels and set(self.encoded_labels) != {""}: - yield self.encoded_labels + [""] * len(columns) + # write out the names for each level, then ALL of the values for + # each level + for i in range(columns.nlevels): + + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + + # name is the first column + col_line.append(columns.names[i]) + + if isinstance(index_label, list) and len(index_label) > 1: + col_line.extend([""] * (len(index_label) - 1)) + + col_line.extend(columns._get_level_values(i)) + + writer.writerow(col_line) + + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. + if encoded_labels and set(encoded_labels) != {""}: + encoded_labels.extend([""] * len(columns)) + writer.writerow(encoded_labels) + + def _save(self) -> None: + self._save_header() - def _save_body(self) -> None: nrows = len(self.data_index) - chunks = int(nrows / self.chunksize) + 1 + + # write in chunksize bites + chunksize = self.chunksize + chunks = int(nrows / chunksize) + 1 + for i in range(chunks): - start_i = i * self.chunksize - end_i = min(start_i + self.chunksize, nrows) + start_i = i * chunksize + end_i = min((i + 1) * chunksize, nrows) if start_i >= end_i: break + self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: + data_index = self.data_index + # create the data for a chunk slicer = slice(start_i, end_i) + df = self.obj.iloc[slicer] + blocks = df._mgr.blocks - res = df._mgr.to_native_types(**self._number_format) - data = [res.iget_values(i) for i in range(len(res.items))] + for i in range(len(blocks)): + b = blocks[i] + d = b.to_native_types( + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) - ix = self.data_index[slicer]._format_native_types(**self._number_format) - libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) + for col_loc, col in zip(b.mgr_locs, d): + # self.data is a preallocated list + self.data[col_loc] = col + + ix = data_index.to_native_types( + slicer=slicer, + na_rep=self.na_rep, + float_format=self.float_format, + decimal=self.decimal, + date_format=self.date_format, + quoting=self.quoting, + ) + + libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/excel.py b/venv/lib/python3.8/site-packages/pandas/io/formats/excel.py index 0cad671..bf4586a 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/excel.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/excel.py @@ -5,23 +5,21 @@ Utilities for conversion to writer-agnostic Excel representation. 
from functools import reduce import itertools import re -from typing import Callable, Dict, Iterable, Mapping, Optional, Sequence, Union, cast +from typing import Callable, Dict, Optional, Sequence, Union import warnings import numpy as np -from pandas._libs.lib import is_list_like -from pandas._typing import Label, StorageOptions -from pandas.util._decorators import doc +from pandas._typing import Label from pandas.core.dtypes import missing from pandas.core.dtypes.common import is_float, is_scalar +from pandas.core.dtypes.generic import ABCIndex from pandas import DataFrame, Index, MultiIndex, PeriodIndex -from pandas.core import generic import pandas.core.common as com -from pandas.io.formats._color_data import CSS4_COLORS +from pandas.io.common import stringify_path from pandas.io.formats.css import CSSResolver, CSSWarning from pandas.io.formats.format import get_level_lengths from pandas.io.formats.printing import pprint_thing @@ -32,13 +30,7 @@ class ExcelCell: __slots__ = __fields__ def __init__( - self, - row: int, - col: int, - val, - style=None, - mergestart: Optional[int] = None, - mergeend: Optional[int] = None, + self, row: int, col: int, val, style=None, mergestart=None, mergeend=None ): self.row = row self.col = col @@ -66,58 +58,16 @@ class CSSToExcelConverter: CSS processed by :meth:`__call__`. """ - NAMED_COLORS = CSS4_COLORS - - VERTICAL_MAP = { - "top": "top", - "text-top": "top", - "middle": "center", - "baseline": "bottom", - "bottom": "bottom", - "text-bottom": "bottom", - # OpenXML also has 'justify', 'distributed' - } - - BOLD_MAP = { - "bold": True, - "bolder": True, - "600": True, - "700": True, - "800": True, - "900": True, - "normal": False, - "lighter": False, - "100": False, - "200": False, - "300": False, - "400": False, - "500": False, - } - - ITALIC_MAP = { - "normal": False, - "italic": True, - "oblique": True, - } - - FAMILY_MAP = { - "serif": 1, # roman - "sans-serif": 2, # swiss - "cursive": 4, # script - "fantasy": 5, # decorative - } - # NB: Most of the methods here could be classmethods, as only __init__ # and __call__ make use of instance attributes. We leave them as # instancemethods so that users can easily experiment with extensions # without monkey-patching. 
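[Editor's aside, not part of the patch] A sketch of what CSSToExcelConverter produces for a simple declaration string, following the build_xlstyle/build_font/build_border methods in this file; the exact set of keys depends on remove_none pruning empty groups:

    from pandas.io.formats.excel import CSSToExcelConverter

    convert = CSSToExcelConverter()
    style = convert(
        "font-weight: bold; "
        "border-bottom-style: solid; border-bottom-width: 1pt"
    )
    # nested dict keyed by "font", "border", "alignment", ...;
    # bold text, and a 1pt solid border mapped to the "thin" openxml width
    print(style["font"]["bold"])               # True
    print(style["border"]["bottom"]["style"])  # "thin"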
- inherited: Optional[Dict[str, str]] def __init__(self, inherited: Optional[str] = None): if inherited is not None: - self.inherited = self.compute_css(inherited) - else: - self.inherited = None + inherited = self.compute_css(inherited) + + self.inherited = inherited compute_css = CSSResolver() @@ -141,7 +91,7 @@ class CSSToExcelConverter: properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props: Mapping[str, str]) -> Dict[str, Dict[str, str]]: + def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -165,30 +115,29 @@ class CSSToExcelConverter: remove_none(out) return out - def build_alignment( - self, props: Mapping[str, str] - ) -> Dict[str, Optional[Union[bool, str]]]: + VERTICAL_MAP = { + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", + # OpenXML also has 'justify', 'distributed' + } + + def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), - "vertical": self._get_vertical_alignment(props), - "wrap_text": self._get_is_wrap_text(props), + "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")), + "wrap_text": ( + None + if props.get("white-space") is None + else props["white-space"] not in ("nowrap", "pre", "pre-line") + ), } - def _get_vertical_alignment(self, props: Mapping[str, str]) -> Optional[str]: - vertical_align = props.get("vertical-align") - if vertical_align: - return self.VERTICAL_MAP.get(vertical_align) - return None - - def _get_is_wrap_text(self, props: Mapping[str, str]) -> Optional[bool]: - if props.get("white-space") is None: - return None - return bool(props["white-space"] not in ("nowrap", "pre", "pre-line")) - - def build_border( - self, props: Mapping[str, str] - ) -> Dict[str, Dict[str, Optional[str]]]: + def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: return { side: { "style": self._border_style( @@ -200,7 +149,7 @@ class CSSToExcelConverter: for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style: Optional[str], width: Optional[str]): + def _border_style(self, style: Optional[str], width): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -220,16 +169,26 @@ class CSSToExcelConverter: if style == "none" or style == "hidden": return None - width_name = self._get_width_name(width) - if width_name is None: + if width is None: + width = "2pt" + width = float(width[:-2]) + if width < 1e-5: return None + elif width < 1.3: + width_name = "thin" + elif width < 2.8: + width_name = "medium" + else: + width_name = "thick" - if style in (None, "groove", "ridge", "inset", "outset", "solid"): + if style in (None, "groove", "ridge", "inset", "outset"): # not handled - return width_name + style = "solid" if style == "double": return "double" + if style == "solid": + return width_name if style == "dotted": if width_name in ("hair", "thin"): return "dotted" @@ -239,89 +198,36 @@ class CSSToExcelConverter: return "dashed" return "mediumDashed" - def _get_width_name(self, width_input: Optional[str]) -> Optional[str]: - width = self._width_to_float(width_input) - if width < 1e-5: - return None - elif width < 1.3: - return "thin" - elif width < 2.8: - return "medium" - return "thick" - - def _width_to_float(self, 
width: Optional[str]) -> float: - if width is None: - width = "2pt" - return self._pt_to_float(width) - - def _pt_to_float(self, pt_string: str) -> float: - assert pt_string.endswith("pt") - return float(pt_string.rstrip("pt")) - - def build_fill(self, props: Mapping[str, str]): + def build_fill(self, props: Dict[str, str]): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type fill_color = props.get("background-color") if fill_color not in (None, "transparent", "none"): return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} - def build_number_format(self, props: Mapping[str, str]) -> Dict[str, Optional[str]]: - return {"format_code": props.get("number-format")} + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} - def build_font( - self, props: Mapping[str, str] - ) -> Dict[str, Optional[Union[bool, int, float, str]]]: - font_names = self._get_font_names(props) - decoration = self._get_decoration(props) - return { - "name": font_names[0] if font_names else None, - "family": self._select_font_family(font_names), - "size": self._get_font_size(props), - "bold": self._get_is_bold(props), - "italic": self._get_is_italic(props), - "underline": ("single" if "underline" in decoration else None), - "strike": ("line-through" in decoration) or None, - "color": self.color_to_excel(props.get("color")), - # shadow if nonzero digit before shadow color - "shadow": self._get_shadow(props), - # FIXME: dont leave commented-out - # 'vertAlign':, - # 'charset': , - # 'scheme': , - # 'outline': , - # 'condense': , - } + def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: + size = props.get("font-size") + if size is not None: + assert size.endswith("pt") + size = float(size[:-2]) - def _get_is_bold(self, props: Mapping[str, str]) -> Optional[bool]: - weight = props.get("font-weight") - if weight: - return self.BOLD_MAP.get(weight) - return None - - def _get_is_italic(self, props: Mapping[str, str]) -> Optional[bool]: - font_style = props.get("font-style") - if font_style: - return self.ITALIC_MAP.get(font_style) - return None - - def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]: - decoration = props.get("text-decoration") - if decoration is not None: - return decoration.split() - else: - return () - - def _get_underline(self, decoration: Sequence[str]) -> Optional[str]: - if "underline" in decoration: - return "single" - return None - - def _get_shadow(self, props: Mapping[str, str]) -> Optional[bool]: - if "text-shadow" in props: - return bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) - return None - - def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]: font_names_tmp = re.findall( r"""(?x) ( @@ -334,7 +240,6 @@ class CSSToExcelConverter: """, props.get("font-family", ""), ) - font_names = [] for name in font_names_tmp: if name[:1] == '"': @@ -345,58 +250,88 @@ class CSSToExcelConverter: name = name.strip() if name: font_names.append(name) - return font_names - def _get_font_size(self, props: Mapping[str, str]) -> Optional[float]: - size = props.get("font-size") - if size is None: - return size - return self._pt_to_float(size) - - def _select_font_family(self, font_names) -> Optional[int]: family = None for name in font_names: - family = 
self.FAMILY_MAP.get(name) - if family: + if name == "serif": + family = 1 # roman + break + elif name == "sans-serif": + family = 2 # swiss + break + elif name == "cursive": + family = 4 # script + break + elif name == "fantasy": + family = 5 # decorative break - return family + decoration = props.get("text-decoration") + if decoration is not None: + decoration = decoration.split() + else: + decoration = () - def color_to_excel(self, val: Optional[str]) -> Optional[str]: + return { + "name": font_names[0] if font_names else None, + "family": family, + "size": size, + "bold": self.BOLD_MAP.get(props.get("font-weight")), + "italic": self.ITALIC_MAP.get(props.get("font-style")), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), + # shadow if nonzero digit before shadow color + "shadow": ( + bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + if "text-shadow" in props + else None + ), + # FIXME: dont leave commented-out + # 'vertAlign':, + # 'charset': , + # 'scheme': , + # 'outline': , + # 'condense': , + } + + NAMED_COLORS = { + "maroon": "800000", + "brown": "A52A2A", + "red": "FF0000", + "pink": "FFC0CB", + "orange": "FFA500", + "yellow": "FFFF00", + "olive": "808000", + "green": "008000", + "purple": "800080", + "fuchsia": "FF00FF", + "lime": "00FF00", + "teal": "008080", + "aqua": "00FFFF", + "blue": "0000FF", + "navy": "000080", + "black": "000000", + "gray": "808080", + "grey": "808080", + "silver": "C0C0C0", + "white": "FFFFFF", + } + + def color_to_excel(self, val: Optional[str]): if val is None: return None - - if self._is_hex_color(val): - return self._convert_hex_to_excel(val) - + if val.startswith("#") and len(val) == 7: + return val[1:].upper() + if val.startswith("#") and len(val) == 4: + return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper() try: return self.NAMED_COLORS[val] except KeyError: warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) - return None - def _is_hex_color(self, color_string: str) -> bool: - return bool(color_string.startswith("#")) - - def _convert_hex_to_excel(self, color_string: str) -> str: - code = color_string.lstrip("#") - if self._is_shorthand_color(color_string): - return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper() - else: - return code.upper() - - def _is_shorthand_color(self, color_string: str) -> bool: - """Check if color code is shorthand. - - #FFF is a shorthand as opposed to full #FFFFFF. - """ - code = color_string.lstrip("#") - if len(code) == 3: - return True - elif len(code) == 6: - return False - else: - raise ValueError(f"Unexpected color {color_string}") + def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: + return {"format_code": props.get("number-format")} class ExcelFormatter: @@ -411,7 +346,7 @@ class ExcelFormatter: Format string for floating point numbers cols : sequence, optional Columns to write - header : boolean or sequence of str, default True + header : boolean or list of string, default True Write out column names. 
If a list of string is given it is assumed to be aliases for the column names index : boolean, default True @@ -462,10 +397,10 @@ class ExcelFormatter: if cols is not None: # all missing, raise - if not len(Index(cols).intersection(df.columns)): + if not len(Index(cols) & df.columns): raise KeyError("passes columns are not ALL present dataframe") - if len(Index(cols).intersection(df.columns)) != len(cols): + if len(Index(cols) & df.columns) != len(cols): # Deprecated in GH#17295, enforced in 1.0.0 raise KeyError("Not all names specified in 'columns' are found") @@ -510,7 +445,7 @@ class ExcelFormatter: ) return val - def _format_header_mi(self) -> Iterable[ExcelCell]: + def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( @@ -518,7 +453,8 @@ class ExcelFormatter: "index ('index'=False) is not yet implemented." ) - if not (self._has_aliases or self.header): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if not (has_aliases or self.header): return columns = self.columns @@ -534,30 +470,28 @@ class ExcelFormatter: if self.merge_cells: # Format multi-index as a merged cells. - for lnum, name in enumerate(columns.names): - yield ExcelCell( - row=lnum, - col=coloffset, - val=name, - style=self.header_style, - ) + for lnum in range(len(level_lengths)): + name = columns.names[lnum] + yield ExcelCell(lnum, coloffset, name, self.header_style) for lnum, (spans, levels, level_codes) in enumerate( zip(level_lengths, columns.levels, columns.codes) ): values = levels.take(level_codes) - for i, span_val in spans.items(): - spans_multiple_cells = span_val > 1 - yield ExcelCell( - row=lnum, - col=coloffset + i + 1, - val=values[i], - style=self.header_style, - mergestart=lnum if spans_multiple_cells else None, - mergeend=( - coloffset + i + span_val if spans_multiple_cells else None - ), - ) + for i in spans: + if spans[i] > 1: + yield ExcelCell( + lnum, + coloffset + i + 1, + values[i], + self.header_style, + lnum, + coloffset + i + spans[i], + ) + else: + yield ExcelCell( + lnum, coloffset + i + 1, values[i], self.header_style + ) else: # Format in legacy format with dots to indicate levels. 
for i, values in enumerate(zip(*level_strs)): @@ -566,8 +500,9 @@ class ExcelFormatter: self.rowcounter = lnum - def _format_header_regular(self) -> Iterable[ExcelCell]: - if self._has_aliases or self.header: + def _format_header_regular(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if has_aliases or self.header: coloffset = 0 if self.index: @@ -576,12 +511,11 @@ class ExcelFormatter: coloffset = len(self.df.index[0]) colnames = self.columns - if self._has_aliases: - self.header = cast(Sequence, self.header) + if has_aliases: if len(self.header) != len(self.columns): raise ValueError( - f"Writing {len(self.columns)} cols " - f"but got {len(self.header)} aliases" + f"Writing {len(self.columns)} cols but got {len(self.header)} " + "aliases" ) else: colnames = self.header @@ -591,7 +525,7 @@ class ExcelFormatter: self.rowcounter, colindex + coloffset, colname, self.header_style ) - def _format_header(self) -> Iterable[ExcelCell]: + def _format_header(self): if isinstance(self.columns, MultiIndex): gen = self._format_header_mi() else: @@ -603,24 +537,22 @@ class ExcelFormatter: "" ] * len(self.columns) if reduce(lambda x, y: x and y, map(lambda x: x != "", row)): - # pandas\io\formats\excel.py:618: error: Incompatible types in - # assignment (expression has type "Generator[ExcelCell, None, - # None]", variable has type "Tuple[]") [assignment] - gen2 = ( # type: ignore[assignment] + gen2 = ( ExcelCell(self.rowcounter, colindex, val, self.header_style) for colindex, val in enumerate(row) ) self.rowcounter += 1 return itertools.chain(gen, gen2) - def _format_body(self) -> Iterable[ExcelCell]: + def _format_body(self): if isinstance(self.df.index, MultiIndex): return self._format_hierarchical_rows() else: return self._format_regular_rows() - def _format_regular_rows(self) -> Iterable[ExcelCell]: - if self._has_aliases or self.header: + def _format_regular_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if has_aliases or self.header: self.rowcounter += 1 # output index and index_label? 
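For context (not part of this diff): _format_header_mi and _format_header_regular above both branch on whether header is list-like, in which case it is taken as column aliases. A minimal sketch of that behaviour from the public API, assuming openpyxl is installed; "aliased.xlsx" is only an illustrative path:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# A list-like header is treated as column aliases (the has_aliases
# branch above); its length must match the number of columns.
df.to_excel("aliased.xlsx", header=["first", "second"])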
@@ -655,10 +587,12 @@ class ExcelFormatter: else: coloffset = 0 - yield from self._generate_body(coloffset) + for cell in self._generate_body(coloffset): + yield cell - def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: - if self._has_aliases or self.header: + def _format_hierarchical_rows(self): + has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) + if has_aliases or self.header: self.rowcounter += 1 gcolidx = 0 @@ -696,25 +630,26 @@ class ExcelFormatter: ): values = levels.take( - level_codes, - allow_fill=levels._can_hold_na, - fill_value=levels._na_value, + level_codes, allow_fill=levels._can_hold_na, fill_value=True ) - for i, span_val in spans.items(): - spans_multiple_cells = span_val > 1 - yield ExcelCell( - row=self.rowcounter + i, - col=gcolidx, - val=values[i], - style=self.header_style, - mergestart=( - self.rowcounter + i + span_val - 1 - if spans_multiple_cells - else None - ), - mergeend=gcolidx if spans_multiple_cells else None, - ) + for i in spans: + if spans[i] > 1: + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + self.rowcounter + i + spans[i] - 1, + gcolidx, + ) + else: + yield ExcelCell( + self.rowcounter + i, + gcolidx, + values[i], + self.header_style, + ) gcolidx += 1 else: @@ -722,21 +657,17 @@ class ExcelFormatter: for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): yield ExcelCell( - row=self.rowcounter + idx, - col=gcolidx, - val=indexcolval, - style=self.header_style, + self.rowcounter + idx, + gcolidx, + indexcolval, + self.header_style, ) gcolidx += 1 - yield from self._generate_body(gcolidx) + for cell in self._generate_body(gcolidx): + yield cell - @property - def _has_aliases(self) -> bool: - """Whether the aliases for column names are present.""" - return is_list_like(self.header) - - def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: + def _generate_body(self, coloffset: int): if self.styler is None: styles = None else: @@ -753,12 +684,11 @@ class ExcelFormatter: xlstyle = self.style_converter(";".join(styles[i, colidx])) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) - def get_formatted_cells(self) -> Iterable[ExcelCell]: + def get_formatted_cells(self): for cell in itertools.chain(self._format_header(), self._format_body()): cell.val = self._format_value(cell.val) yield cell - @doc(storage_options=generic._shared_docs["storage_options"]) def write( self, writer, @@ -767,10 +697,9 @@ class ExcelFormatter: startcol=0, freeze_panes=None, engine=None, - storage_options: StorageOptions = None, ): """ - writer : path-like, file-like, or ExcelWriter object + writer : string or ExcelWriter object File path or existing ExcelWriter sheet_name : string, default 'Sheet1' Name of sheet which will contain DataFrame @@ -785,16 +714,6 @@ class ExcelFormatter: write engine to use if writer is a path - you can also set this via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and ``io.excel.xlsm.writer``. - - .. deprecated:: 1.2.0 - - As the `xlwt `__ package is no longer - maintained, the ``xlwt`` engine will be removed in a future - version of pandas. - - {storage_options} - - .. 
versionadded:: 1.2.0 """ from pandas.io.excel import ExcelWriter @@ -805,27 +724,19 @@ class ExcelFormatter: f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) - formatted_cells = self.get_formatted_cells() if isinstance(writer, ExcelWriter): need_save = False else: - # pandas\io\formats\excel.py:808: error: Cannot instantiate - # abstract class 'ExcelWriter' with abstract attributes 'engine', - # 'save', 'supported_extensions' and 'write_cells' [abstract] - writer = ExcelWriter( # type: ignore[abstract] - writer, engine=engine, storage_options=storage_options - ) + writer = ExcelWriter(stringify_path(writer), engine=engine) need_save = True - try: - writer.write_cells( - formatted_cells, - sheet_name, - startrow=startrow, - startcol=startcol, - freeze_panes=freeze_panes, - ) - finally: - # make sure to close opened file handles - if need_save: - writer.close() + formatted_cells = self.get_formatted_cells() + writer.write_cells( + formatted_cells, + sheet_name, + startrow=startrow, + startcol=startcol, + freeze_panes=freeze_panes, + ) + if need_save: + writer.save() diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/format.py b/venv/lib/python3.8/site-packages/pandas/io/formats/format.py index db34b88..c05f79f 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/format.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/format.py @@ -5,6 +5,7 @@ and latex files. This module also applies to display formatting. from contextlib import contextmanager from csv import QUOTE_NONE, QUOTE_NONNUMERIC +from datetime import tzinfo import decimal from functools import partial from io import StringIO @@ -35,17 +36,11 @@ from pandas._config.config import get_option, set_option from pandas._libs import lib from pandas._libs.missing import NA +from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType -from pandas._typing import ( - ArrayLike, - CompressionOptions, - FilePathOrBuffer, - FloatFormatType, - IndexLabel, - Label, - StorageOptions, -) +from pandas._typing import FilePathOrBuffer, Label +from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -72,7 +67,6 @@ from pandas.core.construction import extract_array from pandas.core.indexes.api import Index, MultiIndex, PeriodIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.reshape.concat import concat from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing @@ -80,13 +74,13 @@ from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: from pandas import Categorical, DataFrame, Series - FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] +FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ - str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] + str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]], ] common_docstring = """ @@ -103,7 +97,7 @@ common_docstring = """ index : bool, optional, default True Whether to print index (row) labels. na_rep : str, optional, default 'NaN' - String representation of ``NaN`` to use. + String representation of NAN to use. formatters : list, tuple or dict of one-param. 
functions, optional Formatter functions to apply to columns' elements by position or name. @@ -111,12 +105,7 @@ common_docstring = """ List/tuple must be of length equal to the number of columns. float_format : one-parameter function, optional, default None Formatter function to apply to columns' elements if they are - floats. This function must return a unicode string and will be - applied only to the non-``NaN`` elements, with ``NaN`` being - handled by ``na_rep``. - - .. versionchanged:: 1.2.0 - + floats. The result of this function must be a unicode string. sparsify : bool, optional, default True Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row. @@ -267,20 +256,22 @@ class SeriesFormatter: float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = get_adjustment() + self.adj = _get_adjustment() self._chk_truncate() def _chk_truncate(self) -> None: + from pandas.core.reshape.concat import concat + self.tr_row_num: Optional[int] min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows # used below by min_rows - is_truncated_vertically = max_rows and (len(self.series) > max_rows) + truncate_v = max_rows and (len(self.series) > max_rows) series = self.series - if is_truncated_vertically: + if truncate_v: max_rows = cast(int, max_rows) if min_rows: # if min_rows is set (not None or 0), set max_rows to minimum @@ -296,7 +287,7 @@ class SeriesFormatter: else: self.tr_row_num = None self.tr_series = series - self.is_truncated_vertically = is_truncated_vertically + self.truncate_v = truncate_v def _get_footer(self) -> str: name = self.series.name @@ -315,9 +306,7 @@ class SeriesFormatter: series_name = pprint_thing(name, escape_chars=("\t", "\r", "\n")) footer += f"Name: {series_name}" - if self.length is True or ( - self.length == "truncate" and self.is_truncated_vertically - ): + if self.length is True or (self.length == "truncate" and self.truncate_v): if footer: footer += ", " footer += f"Length: {len(self.series)}" @@ -356,7 +345,6 @@ class SeriesFormatter: None, float_format=self.float_format, na_rep=self.na_rep, - leading_space=self.index, ) def to_string(self) -> str: @@ -369,7 +357,7 @@ class SeriesFormatter: fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() - if self.is_truncated_vertically: + if self.truncate_v: n_header_rows = 0 row_num = self.tr_row_num row_num = cast(int, row_num) @@ -451,7 +439,7 @@ class EastAsianTextAdjustment(TextAdjustment): return [x.rjust(_get_pad(x)) for x in texts] -def get_adjustment() -> TextAdjustment: +def _get_adjustment() -> TextAdjustment: use_east_asian_width = get_option("display.unicode.east_asian_width") if use_east_asian_width: return EastAsianTextAdjustment() @@ -459,12 +447,97 @@ def get_adjustment() -> TextAdjustment: return TextAdjustment() -class DataFrameFormatter: - """Class for processing dataframe formatting options and data.""" +class TableFormatter: + + show_dimensions: Union[bool, str] + is_truncated: bool + formatters: FormattersType + columns: Index + + @property + def should_show_dimensions(self) -> bool: + return self.show_dimensions is True or ( + self.show_dimensions == "truncate" and self.is_truncated + ) + + def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: + if isinstance(self.formatters, (list, tuple)): + if is_integer(i): + i = cast(int, i) + return self.formatters[i] + else: + return None + 
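For context (not part of this diff): _get_formatter above resolves the formatters argument whether it is positional (list/tuple) or keyed by column name. A minimal sketch of the corresponding public behaviour:

import pandas as pd

df = pd.DataFrame({"price": [1.5, 2.25], "qty": [3, 4]})
# formatters may be keyed by column name or given positionally;
# _get_formatter resolves either form to a per-column callable.
print(df.to_string(formatters={"price": "${:,.2f}".format}))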
else: + if is_integer(i) and i not in self.columns: + i = self.columns[i] + return self.formatters.get(i, None) + + @contextmanager + def get_buffer( + self, buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None + ): + """ + Context manager to open, yield and close buffer for filenames or Path-like + objects, otherwise yield buf unchanged. + """ + if buf is not None: + buf = stringify_path(buf) + else: + buf = StringIO() + + if encoding is None: + encoding = "utf-8" + elif not isinstance(buf, str): + raise ValueError("buf is not a file name and encoding is specified.") + + if hasattr(buf, "write"): + yield buf + elif isinstance(buf, str): + with open(buf, "w", encoding=encoding, newline="") as f: + # GH#30034 open instead of codecs.open prevents a file leak + # if we have an invalid encoding argument. + # newline="" is needed to roundtrip correctly on + # windows test_to_latex_filename + yield f + else: + raise TypeError("buf is not a file name and it has no write method") + + def write_result(self, buf: IO[str]) -> None: + """ + Write the result of serialization to buf. + """ + raise AbstractMethodError(self) + + def get_result( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + """ + Perform serialization. Write to buf or return as string if buf is None. + """ + with self.get_buffer(buf, encoding=encoding) as f: + self.write_result(buf=f) + if buf is None: + return f.getvalue() + return None + + +class DataFrameFormatter(TableFormatter): + """ + Render a DataFrame + + self.to_string() : console-friendly tabular output + self.to_html() : html table + self.to_latex() : LaTeX tabular environment table + + """ __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring + col_space: ColspaceType + def __init__( self, frame: "DataFrame", @@ -478,73 +551,484 @@ class DataFrameFormatter: float_format: Optional[FloatFormatType] = None, sparsify: Optional[bool] = None, index_names: bool = True, + line_width: Optional[int] = None, max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, show_dimensions: Union[bool, str] = False, decimal: str = ".", + table_id: Optional[str] = None, + render_links: bool = False, bold_rows: bool = False, escape: bool = True, ): self.frame = frame - self.columns = self._initialize_columns(columns) - self.col_space = self._initialize_colspace(col_space) + self.show_index_names = index_names + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + self.sparsify = sparsify + + self.float_format = float_format + if formatters is None: + self.formatters = {} + elif len(frame.columns) == len(formatters) or isinstance(formatters, dict): + self.formatters = formatters + else: + raise ValueError( + f"Formatters length({len(formatters)}) should match " + f"DataFrame number of columns({len(frame.columns)})" + ) + self.na_rep = na_rep + self.decimal = decimal + if col_space is None: + self.col_space = {} + elif isinstance(col_space, (int, str)): + self.col_space = {"": col_space} + self.col_space.update({column: col_space for column in self.frame.columns}) + elif isinstance(col_space, Mapping): + for column in col_space.keys(): + if column not in self.frame.columns and column != "": + raise ValueError( + f"Col_space is defined for an unknown column: {column}" + ) + self.col_space = col_space + else: + if len(frame.columns) != len(col_space): + raise ValueError( + f"Col_space length({len(col_space)}) should match " + 
f"DataFrame number of columns({len(frame.columns)})" + ) + self.col_space = dict(zip(self.frame.columns, col_space)) + self.header = header self.index = index - self.na_rep = na_rep - self.formatters = self._initialize_formatters(formatters) - self.justify = self._initialize_justify(justify) - self.float_format = float_format - self.sparsify = self._initialize_sparsify(sparsify) - self.show_index_names = index_names - self.decimal = decimal - self.bold_rows = bold_rows - self.escape = escape + self.line_width = line_width self.max_rows = max_rows self.min_rows = min_rows self.max_cols = max_cols + self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) self.show_dimensions = show_dimensions + self.table_id = table_id + self.render_links = render_links - self.max_cols_fitted = self._calc_max_cols_fitted() - self.max_rows_fitted = self._calc_max_rows_fitted() + if justify is None: + self.justify = get_option("display.colheader_justify") + else: + self.justify = justify - self.tr_frame = self.frame - self.truncate() - self.adj = get_adjustment() + self.bold_rows = bold_rows + self.escape = escape - def get_strcols(self) -> List[List[str]]: + if columns is not None: + self.columns = ensure_index(columns) + self.frame = self.frame[self.columns] + else: + self.columns = frame.columns + + self._chk_truncate() + self.adj = _get_adjustment() + + def _chk_truncate(self) -> None: + """ + Checks whether the frame should be truncated. If so, slices + the frame up. + """ + from pandas.core.reshape.concat import concat + + # Cut the data to the information actually printed + max_cols = self.max_cols + max_rows = self.max_rows + self.max_rows_adj: Optional[int] + max_rows_adj: Optional[int] + + if max_cols == 0 or max_rows == 0: # assume we are in the terminal + (w, h) = get_terminal_size() + self.w = w + self.h = h + if self.max_rows == 0: + dot_row = 1 + prompt_row = 1 + if self.show_dimensions: + show_dimension_rows = 3 + # assume we only get here if self.header is boolean. + # i.e. 
not to_latex() where self.header may be List[str] + self.header = cast(bool, self.header) + n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row + # rows available to fill with actual data + max_rows_adj = self.h - n_add_rows + self.max_rows_adj = max_rows_adj + + # Format only rows and columns that could potentially fit the + # screen + if max_cols == 0 and len(self.frame.columns) > w: + max_cols = w + if max_rows == 0 and len(self.frame) > h: + max_rows = h + + if not hasattr(self, "max_rows_adj"): + if max_rows: + if (len(self.frame) > max_rows) and self.min_rows: + # if truncated, set max_rows showed to min_rows + max_rows = min(self.min_rows, max_rows) + self.max_rows_adj = max_rows + if not hasattr(self, "max_cols_adj"): + self.max_cols_adj = max_cols + + max_cols_adj = self.max_cols_adj + max_rows_adj = self.max_rows_adj + + truncate_h = max_cols_adj and (len(self.columns) > max_cols_adj) + truncate_v = max_rows_adj and (len(self.frame) > max_rows_adj) + + frame = self.frame + if truncate_h: + # cast here since if truncate_h is True, max_cols_adj is not None + max_cols_adj = cast(int, max_cols_adj) + if max_cols_adj == 0: + col_num = len(frame.columns) + elif max_cols_adj == 1: + max_cols = cast(int, max_cols) + frame = frame.iloc[:, :max_cols] + col_num = max_cols + else: + col_num = max_cols_adj // 2 + frame = concat( + (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1 + ) + # truncate formatter + if isinstance(self.formatters, (list, tuple)): + truncate_fmt = self.formatters + self.formatters = [ + *truncate_fmt[:col_num], + *truncate_fmt[-col_num:], + ] + self.tr_col_num = col_num + if truncate_v: + # cast here since if truncate_v is True, max_rows_adj is not None + max_rows_adj = cast(int, max_rows_adj) + if max_rows_adj == 1: + row_num = max_rows + frame = frame.iloc[:max_rows, :] + else: + row_num = max_rows_adj // 2 + frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) + self.tr_row_num = row_num + else: + self.tr_row_num = None + + self.tr_frame = frame + self.truncate_h = truncate_h + self.truncate_v = truncate_v + self.is_truncated = bool(self.truncate_h or self.truncate_v) + + def _to_str_columns(self) -> List[List[str]]: """ Render a DataFrame to a list of columns (as lists of strings). 
""" - strcols = self._get_strcols_without_index() + # this method is not used by to_html where self.col_space + # could be a string so safe to cast + col_space = {k: cast(int, v) for k, v in self.col_space.items()} + frame = self.tr_frame + # may include levels names also + + str_index = self._get_formatted_index(frame) + + if not is_list_like(self.header) and not self.header: + stringified = [] + for i, c in enumerate(frame): + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj, + ) + stringified.append(fmt_values) + else: + if is_list_like(self.header): + # cast here since can't be bool if is_list_like + self.header = cast(List[str], self.header) + if len(self.header) != len(self.columns): + raise ValueError( + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" + ) + str_columns = [[label] for label in self.header] + else: + str_columns = self._get_formatted_column_labels(frame) + + if self.show_row_idx_names: + for x in str_columns: + x.append("") + + stringified = [] + for i, c in enumerate(frame): + cheader = str_columns[i] + header_colwidth = max( + col_space.get(c, 0), *(self.adj.len(x) for x in cheader) + ) + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) + + max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) + cheader = self.adj.justify(cheader, max_len, mode=self.justify) + stringified.append(cheader + fmt_values) + + strcols = stringified if self.index: - str_index = self._get_formatted_index(self.tr_frame) strcols.insert(0, str_index) + # Add ... to signal truncated + truncate_h = self.truncate_h + truncate_v = self.truncate_v + + if truncate_h: + col_num = self.tr_col_num + strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index))) + if truncate_v: + n_header_rows = len(str_index) - len(frame) + row_num = self.tr_row_num + # cast here since if truncate_v is True, self.tr_row_num is not None + row_num = cast(int, row_num) + for ix, col in enumerate(strcols): + # infer from above row + cwidth = self.adj.len(strcols[ix][row_num]) + is_dot_col = False + if truncate_h: + is_dot_col = ix == col_num + 1 + if cwidth > 3 or is_dot_col: + my_str = "..." + else: + my_str = ".." + + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] + strcols[ix].insert(row_num + n_header_rows, dot_str) return strcols - @property - def should_show_dimensions(self) -> bool: - return self.show_dimensions is True or ( - self.show_dimensions == "truncate" and self.is_truncated + def write_result(self, buf: IO[str]) -> None: + """ + Render a DataFrame to a console-friendly tabular output. + """ + from pandas import Series + + frame = self.frame + + if len(frame.columns) == 0 or len(frame.index) == 0: + info_line = ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {pprint_thing(frame.columns)}\n" + f"Index: {pprint_thing(frame.index)}" + ) + text = info_line + else: + + strcols = self._to_str_columns() + if self.line_width is None: # no need to wrap around just print + # the whole frame + text = self.adj.adjoin(1, *strcols) + elif ( + not isinstance(self.max_cols, int) or self.max_cols > 0 + ): # need to wrap around + text = self._join_multiline(*strcols) + else: # max_cols == 0. 
Try to fit frame to terminal + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + dif = max_len - self.w + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + # subtract index column + max_cols_adj = n_cols - self.index + # GH-21180. Ensure that we print at least two. + max_cols_adj = max(max_cols_adj, 2) + self.max_cols_adj = max_cols_adj + + # Call again _chk_truncate to cut frame appropriately + # and then generate string representation + self._chk_truncate() + strcols = self._to_str_columns() + text = self.adj.adjoin(1, *strcols) + buf.writelines(text) + + if self.should_show_dimensions: + buf.write(f"\n\n[{len(frame)} rows x {len(frame.columns)} columns]") + + def _join_multiline(self, *args) -> str: + lwidth = self.line_width + adjoin_width = 1 + strcols = list(args) + if self.index: + idx = strcols.pop(0) + lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width + + col_widths = [ + np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 + for col in strcols + ] + + assert lwidth is not None + col_bins = _binify(col_widths, lwidth) + nbins = len(col_bins) + + if self.truncate_v: + # cast here since if truncate_v is True, max_rows_adj is not None + self.max_rows_adj = cast(int, self.max_rows_adj) + nrows = self.max_rows_adj + 1 + else: + nrows = len(self.frame) + + str_lst = [] + st = 0 + for i, ed in enumerate(col_bins): + row = strcols[st:ed] + if self.index: + row.insert(0, idx) + if nbins > 1: + if ed <= len(strcols) and i < nbins - 1: + row.append([" \\"] + [" "] * (nrows - 1)) + else: + row.append([" "] * nrows) + str_lst.append(self.adj.adjoin(adjoin_width, *row)) + st = ed + return "\n\n".join(str_lst) + + def to_string( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + ) -> Optional[str]: + return self.get_result(buf=buf, encoding=encoding) + + def to_latex( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + column_format: Optional[str] = None, + longtable: bool = False, + encoding: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. 
+ """ + from pandas.io.formats.latex import LatexFormatter + + return LatexFormatter( + self, + column_format=column_format, + longtable=longtable, + multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow, + caption=caption, + label=label, + ).get_result(buf=buf, encoding=encoding) + + def _format_col(self, i: int) -> List[str]: + frame = self.tr_frame + formatter = self._get_formatter(i) + return format_array( + frame.iloc[:, i]._values, + formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space.get(frame.columns[i]), + decimal=self.decimal, ) - @property - def is_truncated(self) -> bool: - return bool(self.is_truncated_horizontally or self.is_truncated_vertically) + def to_html( + self, + buf: Optional[FilePathOrBuffer[str]] = None, + encoding: Optional[str] = None, + classes: Optional[Union[str, List, Tuple]] = None, + notebook: bool = False, + border: Optional[int] = None, + ) -> Optional[str]: + """ + Render a DataFrame to a html table. - @property - def is_truncated_horizontally(self) -> bool: - return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) + Parameters + ---------- + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
`` tag, in addition to the default "dataframe". + notebook : {True, False}, optional, default False + Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``
`` tag. Default ``pd.options.display.html.border``. + """ + from pandas.io.formats.html import HTMLFormatter, NotebookFormatter - @property - def is_truncated_vertically(self) -> bool: - return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) + Klass = NotebookFormatter if notebook else HTMLFormatter + return Klass(self, classes=classes, border=border).get_result( + buf=buf, encoding=encoding + ) - @property - def dimensions_info(self) -> str: - return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" + def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: + from pandas.core.indexes.multi import _sparsify + + columns = frame.columns + + if isinstance(columns, MultiIndex): + fmt_columns = columns.format(sparsify=False, adjoin=False) + fmt_columns = list(zip(*fmt_columns)) + dtypes = self.frame.dtypes._values + + # if we have a Float level, they don't use leading space at all + restrict_formatting = any(l.is_floating for l in columns.levels) + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + + def space_format(x, y): + if ( + y not in self.formatters + and need_leadsp[x] + and not restrict_formatting + ): + return " " + y + return y + + str_columns = list( + zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) + ) + if self.sparsify and len(str_columns): + str_columns = _sparsify(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] + else: + fmt_columns = columns.format() + dtypes = self.frame.dtypes + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = [ + [" " + x if not self._get_formatter(i) and need_leadsp[x] else x] + for i, (col, x) in enumerate(zip(columns, fmt_columns)) + ] + # self.str_columns = str_columns + return str_columns @property def has_index_names(self) -> bool: @@ -562,303 +1046,6 @@ class DataFrameFormatter: def show_col_idx_names(self) -> bool: return all((self.has_column_names, self.show_index_names, self.header)) - @property - def max_rows_displayed(self) -> int: - return min(self.max_rows or len(self.frame), len(self.frame)) - - def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: - if sparsify is None: - return get_option("display.multi_sparse") - return sparsify - - def _initialize_formatters( - self, formatters: Optional[FormattersType] - ) -> FormattersType: - if formatters is None: - return {} - elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict): - return formatters - else: - raise ValueError( - f"Formatters length({len(formatters)}) should match " - f"DataFrame number of columns({len(self.frame.columns)})" - ) - - def _initialize_justify(self, justify: Optional[str]) -> str: - if justify is None: - return get_option("display.colheader_justify") - else: - return justify - - def _initialize_columns(self, columns: Optional[Sequence[str]]) -> Index: - if columns is not None: - cols = ensure_index(columns) - self.frame = self.frame[cols] - return cols - else: - return self.frame.columns - - def _initialize_colspace( - self, col_space: Optional[ColspaceArgType] - ) -> ColspaceType: - result: ColspaceType - - if col_space is None: - result = {} - elif isinstance(col_space, (int, str)): - result = {"": col_space} - result.update({column: col_space for column in self.frame.columns}) - elif isinstance(col_space, Mapping): - for column in col_space.keys(): - if column not in self.frame.columns and column != "": - raise ValueError( - f"Col_space is defined for an unknown column: {column}" - 
) - result = col_space - else: - if len(self.frame.columns) != len(col_space): - raise ValueError( - f"Col_space length({len(col_space)}) should match " - f"DataFrame number of columns({len(self.frame.columns)})" - ) - result = dict(zip(self.frame.columns, col_space)) - return result - - def _calc_max_cols_fitted(self) -> Optional[int]: - """Number of columns fitting the screen.""" - if not self._is_in_terminal(): - return self.max_cols - - width, _ = get_terminal_size() - if self._is_screen_narrow(width): - return width - else: - return self.max_cols - - def _calc_max_rows_fitted(self) -> Optional[int]: - """Number of rows with data fitting the screen.""" - max_rows: Optional[int] - - if self._is_in_terminal(): - _, height = get_terminal_size() - if self.max_rows == 0: - # rows available to fill with actual data - return height - self._get_number_of_auxillary_rows() - - if self._is_screen_short(height): - max_rows = height - else: - max_rows = self.max_rows - else: - max_rows = self.max_rows - - return self._adjust_max_rows(max_rows) - - def _adjust_max_rows(self, max_rows: Optional[int]) -> Optional[int]: - """Adjust max_rows using display logic. - - See description here: - https://pandas.pydata.org/docs/dev/user_guide/options.html#frequently-used-options - - GH #37359 - """ - if max_rows: - if (len(self.frame) > max_rows) and self.min_rows: - # if truncated, set max_rows showed to min_rows - max_rows = min(self.min_rows, max_rows) - return max_rows - - def _is_in_terminal(self) -> bool: - """Check if the output is to be shown in terminal.""" - return bool(self.max_cols == 0 or self.max_rows == 0) - - def _is_screen_narrow(self, max_width) -> bool: - return bool(self.max_cols == 0 and len(self.frame.columns) > max_width) - - def _is_screen_short(self, max_height) -> bool: - return bool(self.max_rows == 0 and len(self.frame) > max_height) - - def _get_number_of_auxillary_rows(self) -> int: - """Get number of rows occupied by prompt, dots and dimension info.""" - dot_row = 1 - prompt_row = 1 - num_rows = dot_row + prompt_row - - if self.show_dimensions: - num_rows += len(self.dimensions_info.splitlines()) - - if self.header: - num_rows += 1 - - return num_rows - - def truncate(self) -> None: - """ - Check whether the frame should be truncated. If so, slice the frame up. - """ - if self.is_truncated_horizontally: - self._truncate_horizontally() - - if self.is_truncated_vertically: - self._truncate_vertically() - - def _truncate_horizontally(self) -> None: - """Remove columns, which are not to be displayed and adjust formatters. - - Attributes affected: - - tr_frame - - formatters - - tr_col_num - """ - assert self.max_cols_fitted is not None - col_num = self.max_cols_fitted // 2 - if col_num >= 1: - left = self.tr_frame.iloc[:, :col_num] - right = self.tr_frame.iloc[:, -col_num:] - self.tr_frame = concat((left, right), axis=1) - - # truncate formatter - if isinstance(self.formatters, (list, tuple)): - self.formatters = [ - *self.formatters[:col_num], - *self.formatters[-col_num:], - ] - else: - col_num = cast(int, self.max_cols) - self.tr_frame = self.tr_frame.iloc[:, :col_num] - self.tr_col_num = col_num - - def _truncate_vertically(self) -> None: - """Remove rows, which are not to be displayed. 
- - Attributes affected: - - tr_frame - - tr_row_num - """ - assert self.max_rows_fitted is not None - row_num = self.max_rows_fitted // 2 - if row_num >= 1: - head = self.tr_frame.iloc[:row_num, :] - tail = self.tr_frame.iloc[-row_num:, :] - self.tr_frame = concat((head, tail)) - else: - row_num = cast(int, self.max_rows) - self.tr_frame = self.tr_frame.iloc[:row_num, :] - self.tr_row_num = row_num - - def _get_strcols_without_index(self) -> List[List[str]]: - strcols: List[List[str]] = [] - - if not is_list_like(self.header) and not self.header: - for i, c in enumerate(self.tr_frame): - fmt_values = self.format_col(i) - fmt_values = _make_fixed_width( - strings=fmt_values, - justify=self.justify, - minimum=int(self.col_space.get(c, 0)), - adj=self.adj, - ) - strcols.append(fmt_values) - return strcols - - if is_list_like(self.header): - # cast here since can't be bool if is_list_like - self.header = cast(List[str], self.header) - if len(self.header) != len(self.columns): - raise ValueError( - f"Writing {len(self.columns)} cols " - f"but got {len(self.header)} aliases" - ) - str_columns = [[label] for label in self.header] - else: - str_columns = self._get_formatted_column_labels(self.tr_frame) - - if self.show_row_idx_names: - for x in str_columns: - x.append("") - - for i, c in enumerate(self.tr_frame): - cheader = str_columns[i] - header_colwidth = max( - int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) - ) - fmt_values = self.format_col(i) - fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=header_colwidth, adj=self.adj - ) - - max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) - cheader = self.adj.justify(cheader, max_len, mode=self.justify) - strcols.append(cheader + fmt_values) - - return strcols - - def format_col(self, i: int) -> List[str]: - frame = self.tr_frame - formatter = self._get_formatter(i) - return format_array( - frame.iloc[:, i]._values, - formatter, - float_format=self.float_format, - na_rep=self.na_rep, - space=self.col_space.get(frame.columns[i]), - decimal=self.decimal, - leading_space=self.index, - ) - - def _get_formatter(self, i: Union[str, int]) -> Optional[Callable]: - if isinstance(self.formatters, (list, tuple)): - if is_integer(i): - i = cast(int, i) - return self.formatters[i] - else: - return None - else: - if is_integer(i) and i not in self.columns: - i = self.columns[i] - return self.formatters.get(i, None) - - def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: - from pandas.core.indexes.multi import sparsify_labels - - columns = frame.columns - - if isinstance(columns, MultiIndex): - fmt_columns = columns.format(sparsify=False, adjoin=False) - fmt_columns = list(zip(*fmt_columns)) - dtypes = self.frame.dtypes._values - - # if we have a Float level, they don't use leading space at all - restrict_formatting = any(level.is_floating for level in columns.levels) - need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - - def space_format(x, y): - if ( - y not in self.formatters - and need_leadsp[x] - and not restrict_formatting - ): - return " " + y - return y - - str_columns = list( - zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) - ) - if self.sparsify and len(str_columns): - str_columns = sparsify_labels(str_columns) - - str_columns = [list(x) for x in zip(*str_columns)] - else: - fmt_columns = columns.format() - dtypes = self.frame.dtypes - need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [ - [" " + x if 
not self._get_formatter(i) and need_leadsp[x] else x] - for i, (col, x) in enumerate(zip(columns, fmt_columns)) - ] - # self.str_columns = str_columns - return str_columns - def _get_formatted_index(self, frame: "DataFrame") -> List[str]: # Note: this is only used by to_string() and to_latex(), not by # to_html(). so safe to cast col_space here. @@ -880,7 +1067,7 @@ class DataFrameFormatter: fmt_index = [ tuple( _make_fixed_width( - list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj, ) ) for x in fmt_index @@ -909,232 +1096,6 @@ class DataFrameFormatter: return names -class DataFrameRenderer: - """Class for creating dataframe output in multiple formats. - - Called in pandas.core.generic.NDFrame: - - to_csv - - to_latex - - Called in pandas.core.frame.DataFrame: - - to_html - - to_string - - Parameters - ---------- - fmt : DataFrameFormatter - Formatter with the formating options. - """ - - def __init__(self, fmt: DataFrameFormatter): - self.fmt = fmt - - def to_latex( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - column_format: Optional[str] = None, - longtable: bool = False, - encoding: Optional[str] = None, - multicolumn: bool = False, - multicolumn_format: Optional[str] = None, - multirow: bool = False, - caption: Optional[str] = None, - label: Optional[str] = None, - position: Optional[str] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a LaTeX tabular/longtable environment output. - """ - from pandas.io.formats.latex import LatexFormatter - - latex_formatter = LatexFormatter( - self.fmt, - longtable=longtable, - column_format=column_format, - multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow, - caption=caption, - label=label, - position=position, - ) - string = latex_formatter.to_string() - return save_to_buffer(string, buf=buf, encoding=encoding) - - def to_html( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - classes: Optional[Union[str, List, Tuple]] = None, - notebook: bool = False, - border: Optional[int] = None, - table_id: Optional[str] = None, - render_links: bool = False, - ) -> Optional[str]: - """ - Render a DataFrame to a html table. - - Parameters - ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. - encoding : str, default “utf-8” - Set character encoding. - classes : str or list-like - classes to include in the `class` attribute of the opening - ``
`` tag, in addition to the default "dataframe". - notebook : {True, False}, optional, default False - Whether the generated HTML is for IPython Notebook. - border : int - A ``border=border`` attribute is included in the opening - ``
`` tag. Default ``pd.options.display.html.border``. - table_id : str, optional - A css id is included in the opening `
` tag if specified. - render_links : bool, default False - Convert URLs to HTML links. - """ - from pandas.io.formats.html import HTMLFormatter, NotebookFormatter - - Klass = NotebookFormatter if notebook else HTMLFormatter - - html_formatter = Klass( - self.fmt, - classes=classes, - border=border, - table_id=table_id, - render_links=render_links, - ) - string = html_formatter.to_string() - return save_to_buffer(string, buf=buf, encoding=encoding) - - def to_string( - self, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - line_width: Optional[int] = None, - ) -> Optional[str]: - """ - Render a DataFrame to a console-friendly tabular output. - - Parameters - ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. - encoding: str, default “utf-8” - Set character encoding. - line_width : int, optional - Width to wrap a line in characters. - """ - from pandas.io.formats.string import StringFormatter - - string_formatter = StringFormatter(self.fmt, line_width=line_width) - string = string_formatter.to_string() - return save_to_buffer(string, buf=buf, encoding=encoding) - - def to_csv( - self, - path_or_buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, - sep: str = ",", - columns: Optional[Sequence[Label]] = None, - index_label: Optional[IndexLabel] = None, - mode: str = "w", - compression: CompressionOptions = "infer", - quoting: Optional[int] = None, - quotechar: str = '"', - line_terminator: Optional[str] = None, - chunksize: Optional[int] = None, - date_format: Optional[str] = None, - doublequote: bool = True, - escapechar: Optional[str] = None, - errors: str = "strict", - storage_options: StorageOptions = None, - ) -> Optional[str]: - """ - Render dataframe as comma-separated file. - """ - from pandas.io.formats.csvs import CSVFormatter - - if path_or_buf is None: - created_buffer = True - path_or_buf = StringIO() - else: - created_buffer = False - - csv_formatter = CSVFormatter( - path_or_buf=path_or_buf, - line_terminator=line_terminator, - sep=sep, - encoding=encoding, - errors=errors, - compression=compression, - quoting=quoting, - cols=columns, - index_label=index_label, - mode=mode, - chunksize=chunksize, - quotechar=quotechar, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, - storage_options=storage_options, - formatter=self.fmt, - ) - csv_formatter.save() - - if created_buffer: - assert isinstance(path_or_buf, StringIO) - content = path_or_buf.getvalue() - path_or_buf.close() - return content - - return None - - -def save_to_buffer( - string: str, - buf: Optional[FilePathOrBuffer[str]] = None, - encoding: Optional[str] = None, -) -> Optional[str]: - """ - Perform serialization. Write to buf or return as string if buf is None. - """ - with get_buffer(buf, encoding=encoding) as f: - f.write(string) - if buf is None: - return f.getvalue() - return None - - -@contextmanager -def get_buffer(buf: Optional[FilePathOrBuffer[str]], encoding: Optional[str] = None): - """ - Context manager to open, yield and close buffer for filenames or Path-like - objects, otherwise yield buf unchanged. 
- """ - if buf is not None: - buf = stringify_path(buf) - else: - buf = StringIO() - - if encoding is None: - encoding = "utf-8" - elif not isinstance(buf, str): - raise ValueError("buf is not a file name and encoding is specified.") - - if hasattr(buf, "write"): - yield buf - elif isinstance(buf, str): - with open(buf, "w", encoding=encoding, newline="") as f: - # GH#30034 open instead of codecs.open prevents a file leak - # if we have an invalid encoding argument. - # newline="" is needed to roundtrip correctly on - # windows test_to_latex_filename - yield f - else: - raise TypeError("buf is not a file name and it has no write method") - - # ---------------------------------------------------------------------- # Array formatters @@ -1148,7 +1109,7 @@ def format_array( space: Optional[Union[str, int]] = None, justify: str = "right", decimal: str = ".", - leading_space: Optional[bool] = True, + leading_space: Optional[bool] = None, quoting: Optional[int] = None, ) -> List[str]: """ @@ -1164,7 +1125,7 @@ def format_array( space justify decimal - leading_space : bool, optional, default True + leading_space : bool, optional Whether the array should be formatted with a leading space. When an array as a column of a Series or DataFrame, we do want the leading space to pad between columns. @@ -1231,7 +1192,7 @@ class GenericArrayFormatter: decimal: str = ".", quoting: Optional[int] = None, fixed_width: bool = True, - leading_space: Optional[bool] = True, + leading_space: Optional[bool] = None, ): self.values = values self.digits = digits @@ -1254,7 +1215,7 @@ class GenericArrayFormatter: float_format = get_option("display.float_format") if float_format is None: precision = get_option("display.precision") - float_format = lambda x: f"{x: .{precision:d}f}" + float_format = lambda x: f"{x: .{precision:d}g}" else: float_format = self.float_format @@ -1315,12 +1276,14 @@ class GenericArrayFormatter: tpl = " {v}" fmt_values.append(tpl.format(v=_format(v))) - fmt_values = _trim_zeros_float(str_floats=fmt_values, decimal=".") - return fmt_values class FloatArrayFormatter(GenericArrayFormatter): + """ + + """ + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1352,17 +1315,7 @@ class FloatArrayFormatter(GenericArrayFormatter): if float_format: def base_formatter(v): - assert float_format is not None # for mypy - # pandas\io\formats\format.py:1411: error: "str" not callable - # [operator] - - # pandas\io\formats\format.py:1411: error: Unexpected keyword - # argument "value" for "__call__" of "EngFormatter" [call-arg] - return ( - float_format(value=v) # type: ignore[operator,call-arg] - if notna(v) - else self.na_rep - ) + return float_format(value=v) if notna(v) else self.na_rep else: @@ -1396,19 +1349,8 @@ class FloatArrayFormatter(GenericArrayFormatter): Returns the float values converted into strings using the parameters given at initialisation, as a numpy array """ - - def format_with_na_rep(values: ArrayLike, formatter: Callable, na_rep: str): - mask = isna(values) - formatted = np.array( - [ - formatter(val) if not m else na_rep - for val, m in zip(values.ravel(), mask.ravel()) - ] - ).reshape(values.shape) - return formatted - if self.formatter is not None: - return format_with_na_rep(self.values, self.formatter, self.na_rep) + return np.array([self.formatter(x) for x in self.values]) if self.fixed_width: threshold = get_option("display.chop_threshold") @@ -1429,13 +1371,19 @@ class FloatArrayFormatter(GenericArrayFormatter): # separate the wheat from the chaff values = 
self.values is_complex = is_complex_dtype(values) - values = format_with_na_rep(values, formatter, na_rep) + mask = isna(values) + values = np.array(values, dtype="object") + values[mask] = na_rep + imask = (~mask).ravel() + values.flat[imask] = np.array( + [formatter(val) for val in values.ravel()[imask]] + ) if self.fixed_width: if is_complex: - result = _trim_zeros_complex(values, self.decimal) + result = _trim_zeros_complex(values, self.decimal, na_rep) else: - result = _trim_zeros_float(values, self.decimal) + result = _trim_zeros_float(values, self.decimal, na_rep) return np.asarray(result, dtype="object") return values @@ -1445,11 +1393,9 @@ class FloatArrayFormatter(GenericArrayFormatter): float_format: Optional[FloatFormatType] if self.float_format is None: if self.fixed_width: - if self.leading_space is True: - fmt_str = "{value: .{digits:d}f}" - else: - fmt_str = "{value:.{digits:d}f}" - float_format = partial(fmt_str.format, digits=self.digits) + float_format = partial( + "{value: .{digits:d}f}".format, digits=self.digits + ) else: float_format = self.float_format else: @@ -1481,26 +1427,22 @@ class FloatArrayFormatter(GenericArrayFormatter): ).any() if has_small_values or (too_long and has_large_values): - if self.leading_space is True: - fmt_str = "{value: .{digits:d}e}" - else: - fmt_str = "{value:.{digits:d}e}" - float_format = partial(fmt_str.format, digits=self.digits) + float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) formatted_values = format_values_with(float_format) return formatted_values def _format_strings(self) -> List[str]: + # shortcut + if self.formatter is not None: + return [self.formatter(x) for x in self.values] + return list(self.get_result_as_array()) class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: - if self.leading_space is False: - formatter_str = lambda x: f"{x:d}".format(x=x) - else: - formatter_str = lambda x: f"{x: d}".format(x=x) - formatter = self.formatter or formatter_str + formatter = self.formatter or (lambda x: f"{x: d}") fmt_values = [formatter(x) for x in self.values] return fmt_values @@ -1527,9 +1469,11 @@ class Datetime64Formatter(GenericArrayFormatter): if self.formatter is not None and callable(self.formatter): return [self.formatter(x) for x in values] - fmt_values = values._data._format_native_types( - na_rep=self.nat_rep, date_format=self.date_format - ) + fmt_values = format_array_from_datetime( + values.asi8.ravel(), + format=_get_format_datetime64_from_values(values, self.date_format), + na_rep=self.nat_rep, + ).reshape(values.shape) return fmt_values.tolist() @@ -1537,9 +1481,7 @@ class ExtensionArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: values = extract_array(self.values, extract_numpy=True) - formatter = self.formatter - if formatter is None: - formatter = values._formatter(boxed=True) + formatter = values._formatter(boxed=True) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo @@ -1555,9 +1497,7 @@ class ExtensionArrayFormatter(GenericArrayFormatter): digits=self.digits, space=self.space, justify=self.justify, - decimal=self.decimal, leading_space=self.leading_space, - quoting=self.quoting, ) return fmt_values @@ -1631,12 +1571,11 @@ def format_percentiles( return [i + "%" for i in out] -def is_dates_only( +def _is_dates_only( values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] ) -> bool: # return a boolean if we are only dates (and don't have a timezone) - if 
not isinstance(values, Index): - values = values.ravel() + values = values.ravel() values = DatetimeIndex(values) if values.tz is not None: @@ -1653,40 +1592,49 @@ def is_dates_only( return False -def _format_datetime64(x: Union[NaTType, Timestamp], nat_rep: str = "NaT") -> str: - if x is NaT: +def _format_datetime64( + x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" +) -> str: + if x is None or (is_scalar(x) and isna(x)): return nat_rep + if tz is not None or not isinstance(x, Timestamp): + if getattr(x, "tzinfo", None) is not None: + x = Timestamp(x).tz_convert(tz) + else: + x = Timestamp(x).tz_localize(tz) + return str(x) def _format_datetime64_dateonly( - x: Union[NaTType, Timestamp], - nat_rep: str = "NaT", - date_format: Optional[str] = None, + x: Union[NaTType, Timestamp], nat_rep: str = "NaT", date_format: None = None ) -> str: - if x is NaT: + if x is None or (is_scalar(x) and isna(x)): return nat_rep + if not isinstance(x, Timestamp): + x = Timestamp(x) + if date_format: return x.strftime(date_format) else: return x._date_repr -def get_format_datetime64( - is_dates_only: bool, nat_rep: str = "NaT", date_format: Optional[str] = None +def _get_format_datetime64( + is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None ) -> Callable: if is_dates_only: - return lambda x: _format_datetime64_dateonly( + return lambda x, tz=None: _format_datetime64_dateonly( x, nat_rep=nat_rep, date_format=date_format ) else: - return lambda x: _format_datetime64(x, nat_rep=nat_rep) + return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) -def get_format_datetime64_from_values( +def _get_format_datetime64_from_values( values: Union[np.ndarray, DatetimeArray, DatetimeIndex], date_format: Optional[str] ) -> Optional[str]: """ given values and a date_format, return a string format """ @@ -1695,8 +1643,8 @@ def get_format_datetime64_from_values( # only accepts 1D values values = values.ravel() - ido = is_dates_only(values) - if ido: + is_dates_only = _is_dates_only(values) + if is_dates_only: return date_format or "%Y-%m-%d" return date_format @@ -1705,9 +1653,9 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> List[str]: """ we by definition have a TZ """ values = self.values.astype(object) - ido = is_dates_only(values) - formatter = self.formatter or get_format_datetime64( - ido, date_format=self.date_format + is_dates_only = _is_dates_only(values) + formatter = self.formatter or _get_format_datetime64( + is_dates_only, date_format=self.date_format ) fmt_values = [formatter(x) for x in values] @@ -1727,13 +1675,13 @@ class Timedelta64Formatter(GenericArrayFormatter): self.box = box def _format_strings(self) -> List[str]: - formatter = self.formatter or get_format_timedelta64( + formatter = self.formatter or _get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=self.box ) return [formatter(x) for x in self.values] -def get_format_timedelta64( +def _get_format_timedelta64( values: Union[np.ndarray, TimedeltaIndex, TimedeltaArray], nat_rep: str = "NaT", box: bool = False, @@ -1783,11 +1731,9 @@ def _make_fixed_width( return strings if adj is None: - adjustment = get_adjustment() - else: - adjustment = adj + adj = _get_adjustment() - max_len = max(adjustment.len(x) for x in strings) + max_len = max(adj.len(x) for x in strings) if minimum is not None: max_len = max(minimum, max_len) @@ -1796,74 +1742,57 @@ def _make_fixed_width( if conf_max is not None and max_len > conf_max: max_len = conf_max - def just(x: 
str) -> str: + def just(x): if conf_max is not None: - if (conf_max > 3) & (adjustment.len(x) > max_len): + if (conf_max > 3) & (adj.len(x) > max_len): x = x[: max_len - 3] + "..." return x strings = [just(x) for x in strings] - result = adjustment.justify(strings, max_len, mode=justify) + result = adj.justify(strings, max_len, mode=justify) return result -def _trim_zeros_complex(str_complexes: np.ndarray, decimal: str = ".") -> List[str]: +def _trim_zeros_complex( + str_complexes: np.ndarray, decimal: str = ".", na_rep: str = "NaN" +) -> List[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. """ - trimmed = [ - "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal)) + return [ + "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal, na_rep)) for x in str_complexes ] - # pad strings to the length of the longest trimmed string for alignment - lengths = [len(s) for s in trimmed] - max_length = max(lengths) - padded = [ - s[: -((k - 1) // 2 + 1)] # real part - + (max_length - k) // 2 * "0" - + s[-((k - 1) // 2 + 1) : -((k - 1) // 2)] # + / - - + s[-((k - 1) // 2) : -1] # imaginary part - + (max_length - k) // 2 * "0" - + s[-1] - for s, k in zip(trimmed, lengths) - ] - return padded - def _trim_zeros_float( - str_floats: Union[np.ndarray, List[str]], decimal: str = "." + str_floats: Union[np.ndarray, List[str]], decimal: str = ".", na_rep: str = "NaN" ) -> List[str]: """ Trims zeros, leaving just one before the decimal points if need be. """ trimmed = str_floats - number_regex = re.compile(fr"^\s*[\+-]?[0-9]+\{decimal}[0-9]*$") - def is_number_with_decimal(x): - return re.match(number_regex, x) is not None + def _is_number(x): + return x != na_rep and not x.endswith("inf") - def should_trim(values: Union[np.ndarray, List[str]]) -> bool: - """ - Determine if an array of strings should be trimmed. + def _cond(values): + finite = [x for x in values if _is_number(x)] + has_decimal = [decimal in x for x in finite] - Returns True if all numbers containing decimals (defined by the - above regular expression) within the array end in a zero, otherwise - returns False. - """ - numbers = [x for x in values if is_number_with_decimal(x)] - return len(numbers) > 0 and all(x.endswith("0") for x in numbers) + return ( + len(finite) > 0 + and all(has_decimal) + and all(x.endswith("0") for x in finite) + and not (any(("e" in x) or ("E" in x) for x in finite)) + ) - while should_trim(trimmed): - trimmed = [x[:-1] if is_number_with_decimal(x) else x for x in trimmed] + while _cond(trimmed): + trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. 
- result = [ - x + "0" if is_number_with_decimal(x) and x.endswith(decimal) else x - for x in trimmed - ] - return result + return [x + "0" if x.endswith(decimal) and _is_number(x) else x for x in trimmed] def _has_names(index: Index) -> bool: @@ -1982,6 +1911,26 @@ def set_eng_float_format(accuracy: int = 3, use_eng_prefix: bool = False) -> Non set_option("display.column_space", max(12, accuracy + 9)) +def _binify(cols: List[int], line_width: int) -> List[int]: + adjoin_width = 1 + bins = [] + curr_width = 0 + i_last_column = len(cols) - 1 + for i, w in enumerate(cols): + w_adjoined = w + adjoin_width + curr_width += w_adjoined + if i_last_column == i: + wrap = curr_width + 1 > line_width and i > 0 + else: + wrap = curr_width + 2 > line_width and i > 0 + if wrap: + bins.append(i) + curr_width = w_adjoined + + bins.append(len(cols)) + return bins + + def get_level_lengths( levels: Any, sentinel: Union[bool, object, str] = "" ) -> List[Dict[int, int]]: diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/html.py b/venv/lib/python3.8/site-packages/pandas/io/formats/html.py index b4f7e39..13f0ab1 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/html.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/html.py @@ -3,7 +3,7 @@ Module for formatting output data in HTML. """ from textwrap import dedent -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast +from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option @@ -12,11 +12,16 @@ from pandas._libs import lib from pandas import MultiIndex, option_context from pandas.io.common import is_url -from pandas.io.formats.format import DataFrameFormatter, get_level_lengths +from pandas.io.formats.format import ( + DataFrameFormatter, + TableFormatter, + buffer_put_lines, + get_level_lengths, +) from pandas.io.formats.printing import pprint_thing -class HTMLFormatter: +class HTMLFormatter(TableFormatter): """ Internal class for formatting output data in html. This class is intended for shared functionality between @@ -33,8 +38,6 @@ class HTMLFormatter: formatter: DataFrameFormatter, classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, - table_id: Optional[str] = None, - render_links: bool = False, ) -> None: self.fmt = formatter self.classes = classes @@ -48,35 +51,14 @@ class HTMLFormatter: if border is None: border = cast(int, get_option("display.html.border")) self.border = border - self.table_id = table_id - self.render_links = render_links + self.table_id = self.fmt.table_id + self.render_links = self.fmt.render_links self.col_space = { column: f"{value}px" if isinstance(value, int) else value for column, value in self.fmt.col_space.items() } - def to_string(self) -> str: - lines = self.render() - if any(isinstance(x, str) for x in lines): - lines = [str(x) for x in lines] - return "\n".join(lines) - - def render(self) -> List[str]: - self._write_table() - - if self.should_show_dimensions: - by = chr(215) # × - self.write( - f"
<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>
" - ) - - return self.elements - - @property - def should_show_dimensions(self): - return self.fmt.should_show_dimensions - @property def show_row_idx_names(self) -> bool: return self.fmt.show_row_idx_names @@ -103,8 +85,9 @@ class HTMLFormatter: def _get_columns_formatted_values(self) -> Iterable: return self.columns + # https://github.com/python/mypy/issues/1237 @property - def is_truncated(self) -> bool: + def is_truncated(self) -> bool: # type: ignore return self.fmt.is_truncated @property @@ -205,6 +188,20 @@ class HTMLFormatter: indent -= indent_delta self.write("", indent) + def render(self) -> List[str]: + self._write_table() + + if self.should_show_dimensions: + by = chr(215) # × + self.write( + f"
<p>{len(self.frame)} rows {by} {len(self.frame.columns)} columns</p>
" + ) + + return self.elements + + def write_result(self, buf: IO[str]) -> None: + buffer_put_lines(buf, self.render()) + def _write_table(self, indent: int = 0) -> None: _classes = ["dataframe"] # Default class. use_mathjax = get_option("display.html.use_mathjax") @@ -238,7 +235,7 @@ class HTMLFormatter: self.write("
", indent) def _write_col_header(self, indent: int) -> None: - is_truncated_horizontally = self.fmt.is_truncated_horizontally + truncate_h = self.fmt.truncate_h if isinstance(self.columns, MultiIndex): template = 'colspan="{span:d}" halign="left"' @@ -251,7 +248,7 @@ class HTMLFormatter: level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 for lnum, (records, values) in enumerate(zip(level_lengths, levels)): - if is_truncated_horizontally: + if truncate_h: # modify the header lines ins_col = self.fmt.tr_col_num if self.fmt.sparsify: @@ -348,16 +345,16 @@ class HTMLFormatter: row.extend(self._get_columns_formatted_values()) align = self.fmt.justify - if is_truncated_horizontally: + if truncate_h: ins_col = self.row_levels + self.fmt.tr_col_num row.insert(ins_col, "...") self.write_tr(row, indent, self.indent_delta, header=True, align=align) def _write_row_header(self, indent: int) -> None: - is_truncated_horizontally = self.fmt.is_truncated_horizontally + truncate_h = self.fmt.truncate_h row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( - self.ncols + (1 if is_truncated_horizontally else 0) + self.ncols + (1 if truncate_h else 0) ) self.write_tr(row, indent, self.indent_delta, header=True) @@ -374,7 +371,7 @@ class HTMLFormatter: def _get_formatted_values(self) -> Dict[int, List[str]]: with option_context("display.max_colwidth", None): - fmt_values = {i: self.fmt.format_col(i) for i in range(self.ncols)} + fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} return fmt_values def _write_body(self, indent: int) -> None: @@ -392,8 +389,8 @@ class HTMLFormatter: def _write_regular_rows( self, fmt_values: Mapping[int, List[str]], indent: int ) -> None: - is_truncated_horizontally = self.fmt.is_truncated_horizontally - is_truncated_vertically = self.fmt.is_truncated_vertically + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v nrows = len(self.fmt.tr_frame) @@ -407,7 +404,7 @@ class HTMLFormatter: row: List[str] = [] for i in range(nrows): - if is_truncated_vertically and i == (self.fmt.tr_row_num): + if truncate_v and i == (self.fmt.tr_row_num): str_sep_row = ["..."] * len(row) self.write_tr( str_sep_row, @@ -428,7 +425,7 @@ class HTMLFormatter: row.append("") row.extend(fmt_values[j][i] for j in range(self.ncols)) - if is_truncated_horizontally: + if truncate_h: dot_col_ix = self.fmt.tr_col_num + self.row_levels row.insert(dot_col_ix, "...") self.write_tr( @@ -440,8 +437,8 @@ class HTMLFormatter: ) -> None: template = 'rowspan="{span}" valign="top"' - is_truncated_horizontally = self.fmt.is_truncated_horizontally - is_truncated_vertically = self.fmt.is_truncated_vertically + truncate_h = self.fmt.truncate_h + truncate_v = self.fmt.truncate_v frame = self.fmt.tr_frame nrows = len(frame) @@ -456,10 +453,12 @@ class HTMLFormatter: level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 - if is_truncated_vertically: + if truncate_v: # Insert ... row and adjust idx_values and # level_lengths to take this into account. ins_row = self.fmt.tr_row_num + # cast here since if truncate_v is True, self.fmt.tr_row_num is not None + ins_row = cast(int, ins_row) inserted = False for lnum, records in enumerate(level_lengths): rec_new = {} @@ -520,7 +519,7 @@ class HTMLFormatter: row.append(v) row.extend(fmt_values[j][i] for j in range(self.ncols)) - if is_truncated_horizontally: + if truncate_h: row.insert( self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." 
) @@ -534,7 +533,7 @@ class HTMLFormatter: else: row = [] for i in range(len(frame)): - if is_truncated_vertically and i == (self.fmt.tr_row_num): + if truncate_v and i == (self.fmt.tr_row_num): str_sep_row = ["..."] * len(row) self.write_tr( str_sep_row, @@ -550,7 +549,7 @@ class HTMLFormatter: row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(self.ncols)) - if is_truncated_horizontally: + if truncate_h: row.insert(self.row_levels + self.fmt.tr_col_num, "...") self.write_tr( row, @@ -569,7 +568,7 @@ class NotebookFormatter(HTMLFormatter): """ def _get_formatted_values(self) -> Dict[int, List[str]]: - return {i: self.fmt.format_col(i) for i in range(self.ncols)} + return {i: self.fmt._format_col(i) for i in range(self.ncols)} def _get_columns_formatted_values(self) -> List[str]: return self.columns.format() diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/info.py b/venv/lib/python3.8/site-packages/pandas/io/formats/info.py index 98bd159..db6704f 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/info.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/info.py @@ -1,20 +1,10 @@ -from abc import ABC, abstractmethod +from abc import ABCMeta, abstractmethod import sys -from typing import ( - IO, - TYPE_CHECKING, - Iterable, - Iterator, - List, - Mapping, - Optional, - Sequence, - Union, -) +from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union from pandas._config import get_option -from pandas._typing import Dtype, FrameOrSeriesUnion +from pandas._typing import Dtype, FrameOrSeries from pandas.core.indexes.api import Index @@ -22,7 +12,7 @@ from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.frame import DataFrame + from pandas.core.series import Series # noqa: F401 def _put_str(s: Union[str, Dtype], space: int) -> str: @@ -82,101 +72,101 @@ def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: return f"{num:3.1f}{size_qualifier} PB" -def _initialize_memory_usage( - memory_usage: Optional[Union[bool, str]] = None, -) -> Union[bool, str]: - """Get memory usage based on inputs and display options.""" - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - return memory_usage +class BaseInfo(metaclass=ABCMeta): + def __init__( + self, + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ): + if buf is None: # pragma: no cover + buf = sys.stdout + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + self.data = data + self.verbose = verbose + self.buf = buf + self.max_cols = max_cols + self.memory_usage = memory_usage + self.null_counts = null_counts -class BaseInfo(ABC): - """ - Base class for DataFrameInfo and SeriesInfo. - - Parameters - ---------- - data : DataFrame or Series - Either dataframe or series. - memory_usage : bool or str, optional - If "deep", introspect the data deeply by interrogating object dtypes - for system-level memory consumption, and include it in the returned - values. - """ - - data: FrameOrSeriesUnion - memory_usage: Union[bool, str] - - @property @abstractmethod - def dtypes(self) -> Iterable[Dtype]: + def _get_mem_usage(self, deep: bool) -> int: """ - Dtypes. + Get memory usage in bytes. 
+ + Parameters + ---------- + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. Returns ------- - dtypes : sequence - Dtype of each of the DataFrame's columns (or one series column). - """ - - @property - @abstractmethod - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - - @property - @abstractmethod - def non_null_counts(self) -> Sequence[int]: - """Sequence of non-null counts for all columns or column (if series).""" - - @property - @abstractmethod - def memory_usage_bytes(self) -> int: - """ - Memory usage in bytes. - - Returns - ------- - memory_usage_bytes : int + mem_usage : int Object's total memory usage in bytes. """ - - @property - def memory_usage_string(self) -> str: - """Memory usage in a form of human readable string.""" - return f"{_sizeof_fmt(self.memory_usage_bytes, self.size_qualifier)}\n" - - @property - def size_qualifier(self) -> str: - size_qualifier = "" - if self.memory_usage: - if self.memory_usage != "deep": - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - if ( - "object" in self.dtype_counts - or self.data.index._is_memory_usage_qualified() - ): - size_qualifier = "+" - return size_qualifier + pass @abstractmethod - def render( - self, - *, - buf: Optional[IO[str]], - max_cols: Optional[int], - verbose: Optional[bool], - show_counts: Optional[bool], + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get column names and dtypes. + + Returns + ------- + ids : Index + DataFrame's column names. + dtypes : Series + Dtype of each of the DataFrame's columns. + """ + pass + + @abstractmethod + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool ) -> None: + """ + Append name, non-null count (optional), and dtype for each column to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + dtypes : Series + The DataFrame's columns' dtypes. + show_counts : bool + If True, count of non-NA cells for each column will be appended to `lines`. + """ + pass + + @abstractmethod + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: + """ + Append short summary of columns' names to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + """ + pass + + def info(self) -> None: """ Print a concise summary of a %(klass)s. This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. - %(version_added_sub)s\ Parameters ---------- @@ -203,7 +193,12 @@ class BaseInfo(ABC): consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. - %(show_counts_sub)s + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. 
Returns ------- @@ -218,470 +213,139 @@ class BaseInfo(ABC): -------- %(examples_sub)s """ + lines = [] + + lines.append(str(type(self.data))) + lines.append(self.data.index._summary()) + + ids, dtypes = self._get_ids_and_dtypes() + col_count = len(ids) + + if col_count == 0: + lines.append(f"Empty {type(self.data).__name__}") + fmt.buffer_put_lines(self.buf, lines) + return + + # hack + max_cols = self.max_cols + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) + + max_rows = get_option("display.max_info_rows", len(self.data) + 1) + + if self.null_counts is None: + show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + else: + show_counts = self.null_counts + exceeds_info_cols = col_count > max_cols + + if self.verbose: + self._verbose_repr(lines, ids, dtypes, show_counts) + elif self.verbose is False: # specifically set to False, not necessarily None + self._non_verbose_repr(lines, ids) + else: + if exceeds_info_cols: + self._non_verbose_repr(lines, ids) + else: + self._verbose_repr(lines, ids, dtypes, show_counts) + + # groupby dtype.name to collect e.g. Categorical columns + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() + collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + if self.memory_usage: + # append memory usage of df to display + size_qualifier = "" + if self.memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.data.index._is_memory_usage_qualified(): + size_qualifier = "+" + mem_usage = self._get_mem_usage(deep=deep) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(self.buf, lines) class DataFrameInfo(BaseInfo): - """ - Class storing dataframe-specific info. - """ - - def __init__( - self, - data: "DataFrame", - memory_usage: Optional[Union[bool, str]] = None, - ): - self.data: "DataFrame" = data - self.memory_usage = _initialize_memory_usage(memory_usage) - - @property - def dtype_counts(self) -> Mapping[str, int]: - return _get_dataframe_dtype_counts(self.data) - - @property - def dtypes(self) -> Iterable[Dtype]: - """ - Dtypes. - - Returns - ------- - dtypes - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property - def ids(self) -> Index: - """ - Column names. - - Returns - ------- - ids : Index - DataFrame's column names. 
- """ - return self.data.columns - - @property - def col_count(self) -> int: - """Number of columns to be summarized.""" - return len(self.ids) - - @property - def non_null_counts(self) -> Sequence[int]: - """Sequence of non-null counts for all columns or column (if series).""" - return self.data.count() - - @property - def memory_usage_bytes(self) -> int: - if self.memory_usage == "deep": - deep = True - else: - deep = False + def _get_mem_usage(self, deep: bool) -> int: return self.data.memory_usage(index=True, deep=deep).sum() - def render( - self, - *, - buf: Optional[IO[str]], - max_cols: Optional[int], - verbose: Optional[bool], - show_counts: Optional[bool], + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + return self.data.columns, self.data.dtypes + + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool ) -> None: - printer = DataFrameInfoPrinter( - info=self, - max_cols=max_cols, - verbose=verbose, - show_counts=show_counts, - ) - printer.to_buffer(buf) + col_count = len(ids) + lines.append(f"Data columns (total {col_count} columns):") + id_head = " # " + column_head = "Column" + col_space = 2 -class InfoPrinterAbstract: - """ - Class for printing dataframe or series info. - """ + max_col = max(len(pprint_thing(k)) for k in ids) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space - def to_buffer(self, buf: Optional[IO[str]] = None) -> None: - """Save dataframe info into buffer.""" - table_builder = self._create_table_builder() - lines = table_builder.get_lines() - if buf is None: # pragma: no cover - buf = sys.stdout - fmt.buffer_put_lines(buf, lines) + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space - @abstractmethod - def _create_table_builder(self) -> "TableBuilderAbstract": - """Create instance of table builder.""" - - -class DataFrameInfoPrinter(InfoPrinterAbstract): - """ - Class for printing dataframe info. - - Parameters - ---------- - info : DataFrameInfo - Instance of DataFrameInfo. - max_cols : int, optional - When to switch from the verbose to the truncated output. - verbose : bool, optional - Whether to print the full summary. - show_counts : bool, optional - Whether to show the non-null counts. 
- """ - - def __init__( - self, - info: DataFrameInfo, - max_cols: Optional[int] = None, - verbose: Optional[bool] = None, - show_counts: Optional[bool] = None, - ): - self.info = info - self.data = info.data - self.verbose = verbose - self.max_cols = self._initialize_max_cols(max_cols) - self.show_counts = self._initialize_show_counts(show_counts) - - @property - def max_rows(self) -> int: - """Maximum info rows to be displayed.""" - return get_option("display.max_info_rows", len(self.data) + 1) - - @property - def exceeds_info_cols(self) -> bool: - """Check if number of columns to be summarized does not exceed maximum.""" - return bool(self.col_count > self.max_cols) - - @property - def exceeds_info_rows(self) -> bool: - """Check if number of rows to be summarized does not exceed maximum.""" - return bool(len(self.data) > self.max_rows) - - @property - def col_count(self) -> int: - """Number of columns to be summarized.""" - return self.info.col_count - - def _initialize_max_cols(self, max_cols: Optional[int]) -> int: - if max_cols is None: - return get_option("display.max_info_columns", self.col_count + 1) - return max_cols - - def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: - if show_counts is None: - return bool(not self.exceeds_info_cols and not self.exceeds_info_rows) - else: - return show_counts - - def _create_table_builder(self) -> "DataFrameTableBuilder": - """ - Create instance of table builder based on verbosity and display settings. - """ - if self.verbose: - return DataFrameTableBuilderVerbose( - info=self.info, - with_counts=self.show_counts, - ) - elif self.verbose is False: # specifically set to False, not necessarily None - return DataFrameTableBuilderNonVerbose(info=self.info) - else: - if self.exceeds_info_cols: - return DataFrameTableBuilderNonVerbose(info=self.info) - else: - return DataFrameTableBuilderVerbose( - info=self.info, - with_counts=self.show_counts, + header = _put_str(id_head, space_num) + _put_str(column_head, space) + if show_counts: + counts = self.data.count() + if col_count != len(counts): # pragma: no cover + raise AssertionError( + f"Columns must equal counts ({col_count} != {len(counts)})" ) - - -class TableBuilderAbstract(ABC): - """ - Abstract builder for info table. 
- """ - - _lines: List[str] - info: BaseInfo - - @abstractmethod - def get_lines(self) -> List[str]: - """Product in a form of list of lines (strings).""" - - @property - def data(self) -> FrameOrSeriesUnion: - return self.info.data - - @property - def dtypes(self) -> Iterable[Dtype]: - """Dtypes of each of the DataFrame's columns.""" - return self.info.dtypes - - @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - return self.info.dtype_counts - - @property - def display_memory_usage(self) -> bool: - """Whether to display memory usage.""" - return bool(self.info.memory_usage) - - @property - def memory_usage_string(self) -> str: - """Memory usage string with proper size qualifier.""" - return self.info.memory_usage_string - - @property - def non_null_counts(self) -> Sequence[int]: - return self.info.non_null_counts - - def add_object_type_line(self) -> None: - """Add line with string representation of dataframe to the table.""" - self._lines.append(str(type(self.data))) - - def add_index_range_line(self) -> None: - """Add line with range of indices to the table.""" - self._lines.append(self.data.index._summary()) - - def add_dtypes_line(self) -> None: - """Add summary line with dtypes present in dataframe.""" - collected_dtypes = [ - f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) - ] - self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") - - -class DataFrameTableBuilder(TableBuilderAbstract): - """ - Abstract builder for dataframe info table. - - Parameters - ---------- - info : DataFrameInfo. - Instance of DataFrameInfo. - """ - - def __init__(self, *, info: DataFrameInfo): - self.info: DataFrameInfo = info - - def get_lines(self) -> List[str]: - self._lines = [] - if self.col_count == 0: - self._fill_empty_info() + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null else: - self._fill_non_empty_info() - return self._lines + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" - def _fill_empty_info(self) -> None: - """Add lines to the info table, pertaining to empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self._lines.append(f"Empty {type(self.data).__name__}") - - @abstractmethod - def _fill_non_empty_info(self) -> None: - """Add lines to the info table, pertaining to non-empty dataframe.""" - - @property - def data(self) -> "DataFrame": - """DataFrame.""" - return self.info.data - - @property - def ids(self) -> Index: - """Dataframe columns.""" - return self.info.ids - - @property - def col_count(self) -> int: - """Number of dataframe columns to be summarized.""" - return self.info.col_count - - def add_memory_usage_line(self) -> None: - """Add line containing memory usage.""" - self._lines.append(f"memory usage: {self.memory_usage_string}") - - -class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): - """ - Dataframe info table builder for non-verbose output. 
- """ - - def _fill_non_empty_info(self) -> None: - """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() - - def add_columns_summary_line(self) -> None: - self._lines.append(self.ids._summary(name="Columns")) - - -class TableBuilderVerboseMixin(TableBuilderAbstract): - """ - Mixin for verbose info output. - """ - - SPACING: str = " " * 2 - strrows: Sequence[Sequence[str]] - gross_column_widths: Sequence[int] - with_counts: bool - - @property - @abstractmethod - def headers(self) -> Sequence[str]: - """Headers names of the columns in verbose table.""" - - @property - def header_column_widths(self) -> Sequence[int]: - """Widths of header columns (only titles).""" - return [len(col) for col in self.headers] - - def _get_gross_column_widths(self) -> Sequence[int]: - """Get widths of columns containing both headers and actual content.""" - body_column_widths = self._get_body_column_widths() - return [ - max(*widths) - for widths in zip(self.header_column_widths, body_column_widths) - ] - - def _get_body_column_widths(self) -> Sequence[int]: - """Get widths of table content columns.""" - strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) - return [max(len(x) for x in col) for col in strcols] - - def _gen_rows(self) -> Iterator[Sequence[str]]: - """ - Generator function yielding rows content. - - Each element represents a row comprising a sequence of strings. - """ - if self.with_counts: - return self._gen_rows_with_counts() - else: - return self._gen_rows_without_counts() - - @abstractmethod - def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: - """Iterator with string representation of body data with counts.""" - - @abstractmethod - def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: - """Iterator with string representation of body data without counts.""" - - def add_header_line(self) -> None: - header_line = self.SPACING.join( - [ - _put_str(header, col_width) - for header, col_width in zip(self.headers, self.gross_column_widths) - ] + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype ) - self._lines.append(header_line) - def add_separator_line(self) -> None: - separator_line = self.SPACING.join( - [ - _put_str("-" * header_colwidth, gross_colwidth) - for header_colwidth, gross_colwidth in zip( - self.header_column_widths, self.gross_column_widths - ) - ] + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) ) - self._lines.append(separator_line) - def add_body_lines(self) -> None: - for row in self.strrows: - body_line = self.SPACING.join( - [ - _put_str(col, gross_colwidth) - for col, gross_colwidth in zip(row, self.gross_column_widths) - ] + for i, col in enumerate(ids): + dtype = dtypes.iloc[i] + col = pprint_thing(col) + + line_no = _put_str(f" {i}", space_num) + count = "" + if show_counts: + count = counts.iloc[i] + + lines.append( + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) ) - self._lines.append(body_line) - def _gen_non_null_counts(self) -> 
Iterator[str]: - """Iterator with string representation of non-null counts.""" - for count in self.non_null_counts: - yield f"{count} non-null" - - def _gen_dtypes(self) -> Iterator[str]: - """Iterator with string representation of column dtypes.""" - for dtype in self.dtypes: - yield pprint_thing(dtype) - - -class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): - """ - Dataframe info table builder for verbose output. - """ - - def __init__( - self, - *, - info: DataFrameInfo, - with_counts: bool, - ): - self.info = info - self.with_counts = with_counts - self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() - - def _fill_non_empty_info(self) -> None: - """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_header_line() - self.add_separator_line() - self.add_body_lines() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() - - @property - def headers(self) -> Sequence[str]: - """Headers names of the columns in verbose table.""" - if self.with_counts: - return [" # ", "Column", "Non-Null Count", "Dtype"] - return [" # ", "Column", "Dtype"] - - def add_columns_summary_line(self) -> None: - self._lines.append(f"Data columns (total {self.col_count} columns):") - - def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: - """Iterator with string representation of body data without counts.""" - yield from zip( - self._gen_line_numbers(), - self._gen_columns(), - self._gen_dtypes(), - ) - - def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: - """Iterator with string representation of body data with counts.""" - yield from zip( - self._gen_line_numbers(), - self._gen_columns(), - self._gen_non_null_counts(), - self._gen_dtypes(), - ) - - def _gen_line_numbers(self) -> Iterator[str]: - """Iterator with string representation of column numbers.""" - for i, _ in enumerate(self.ids): - yield f" {i}" - - def _gen_columns(self) -> Iterator[str]: - """Iterator with string representation of column names.""" - for col in self.ids: - yield pprint_thing(col) - - -def _get_dataframe_dtype_counts(df: "DataFrame") -> Mapping[str, int]: - """ - Create mapping between datatypes and their number of occurences. - """ - # groupby dtype.name to collect e.g. Categorical columns - return df.dtypes.value_counts().groupby(lambda x: x.name).sum() + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: + lines.append(ids._summary(name="Columns")) diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/latex.py b/venv/lib/python3.8/site-packages/pandas/io/formats/latex.py index f6f3571..3a3ca84 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/latex.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/latex.py @@ -1,142 +1,79 @@ """ Module for formatting output data in Latex. 
""" -from abc import ABC, abstractmethod -from typing import Iterator, List, Optional, Sequence, Tuple, Type, Union +from typing import IO, List, Optional, Tuple import numpy as np from pandas.core.dtypes.generic import ABCMultiIndex -from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.format import DataFrameFormatter, TableFormatter -def _split_into_full_short_caption( - caption: Optional[Union[str, Tuple[str, str]]] -) -> Tuple[str, str]: - """Extract full and short captions from caption string/tuple. - - Parameters - ---------- - caption : str or tuple, optional - Either table caption string or tuple (full_caption, short_caption). - If string is provided, then it is treated as table full caption, - while short_caption is considered an empty string. - - Returns - ------- - full_caption, short_caption : tuple - Tuple of full_caption, short_caption strings. +class LatexFormatter(TableFormatter): """ - if caption: - if isinstance(caption, str): - full_caption = caption - short_caption = "" - else: - try: - full_caption, short_caption = caption - except ValueError as err: - msg = "caption must be either a string or a tuple of two strings" - raise ValueError(msg) from err - else: - full_caption = "" - short_caption = "" - return full_caption, short_caption - - -class RowStringConverter(ABC): - r"""Converter for dataframe rows into LaTeX strings. + Used to render a DataFrame to a LaTeX tabular/longtable environment output. Parameters ---------- formatter : `DataFrameFormatter` - Instance of `DataFrameFormatter`. - multicolumn: bool, optional - Whether to use \multicolumn macro. - multicolumn_format: str, optional - Multicolumn format. - multirow: bool, optional - Whether to use \multirow macro. + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 columns + longtable : boolean, default False + Use a longtable environment instead of tabular. 
+ See Also + -------- + HTMLFormatter """ def __init__( self, formatter: DataFrameFormatter, + column_format: Optional[str] = None, + longtable: bool = False, multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame + self.bold_rows = self.fmt.bold_rows + self.column_format = column_format + self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow - self.clinebuf: List[List[int]] = [] - self.strcols = self._get_strcols() - self.strrows = list(zip(*self.strcols)) + self.caption = caption + self.label = label + self.escape = self.fmt.escape - def get_strrow(self, row_num: int) -> str: - """Get string representation of the row.""" - row = self.strrows[row_num] - - is_multicol = ( - row_num < self.column_levels and self.fmt.header and self.multicolumn - ) - - is_multirow = ( - row_num >= self.header_levels - and self.fmt.index - and self.multirow - and self.index_levels > 1 - ) - - is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1 - - crow = self._preprocess_row(row) - - if is_multicol: - crow = self._format_multicolumn(crow) - if is_multirow: - crow = self._format_multirow(crow, row_num) - - lst = [] - lst.append(" & ".join(crow)) - lst.append(" \\\\") - if is_cline_maybe_required: - cline = self._compose_cline(row_num, len(self.strcols)) - lst.append(cline) - return "".join(lst) - - @property - def _header_row_num(self) -> int: - """Number of rows in header.""" - return self.header_levels if self.fmt.header else 0 - - @property - def index_levels(self) -> int: - """Integer number of levels in index.""" - return self.frame.index.nlevels - - @property - def column_levels(self) -> int: - return self.frame.columns.nlevels - - @property - def header_levels(self) -> int: - nlevels = self.column_levels - if self.fmt.has_index_names and self.fmt.show_index_names: - nlevels += 1 - return nlevels - - def _get_strcols(self) -> List[List[str]]: - """String representation of the columns.""" - if self.fmt.frame.empty: - strcols = [[self._empty_info_line]] + def write_result(self, buf: IO[str]) -> None: + """ + Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. 
+ """ + # string representation of the columns + if len(self.frame.columns) == 0 or len(self.frame.index) == 0: + info_line = ( + f"Empty {type(self.frame).__name__}\n" + f"Columns: {self.frame.columns}\n" + f"Index: {self.frame.index}" + ) + strcols = [[info_line]] else: - strcols = self.fmt.get_strcols() + strcols = self.fmt._to_str_columns() - # reestablish the MultiIndex that has been joined by get_strcols() + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return "r" + else: + return "l" + + # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( adjoin=False, @@ -153,11 +90,11 @@ class RowStringConverter(ABC): break return [x[0]] + [i if i else " " * len(pad) for i in x[1:]] - gen = (pad_empties(i) for i in out) + out = (pad_empties(i) for i in out) # Add empty spaces for each column level clevels = self.frame.columns.nlevels - out = [[" " * len(i[-1])] * clevels + i for i in gen] + out = [[" " * len(i[-1])] * clevels + i for i in out] # Add the column names to the last index column cnames = self.frame.columns.names @@ -167,27 +104,95 @@ class RowStringConverter(ABC): # Get rid of old multiindex column and add new ones strcols = out + strcols[1:] - return strcols - @property - def _empty_info_line(self): - return ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {self.frame.columns}\n" - f"Index: {self.frame.index}" - ) - - def _preprocess_row(self, row: Sequence[str]) -> List[str]: - """Preprocess elements of the row.""" - if self.fmt.escape: - crow = _escape_symbols(row) + if self.column_format is None: + dtypes = self.frame.dtypes._values + column_format = "".join(map(get_col_type, dtypes)) + if self.fmt.index: + index_format = "l" * self.frame.index.nlevels + column_format = index_format + column_format + elif not isinstance(self.column_format, str): # pragma: no cover + raise AssertionError( + f"column_format must be str or unicode, not {type(column_format)}" + ) else: - crow = [x if x else "{}" for x in row] - if self.fmt.bold_rows and self.fmt.index: - crow = _convert_to_bold(crow, self.index_levels) - return crow + column_format = self.column_format - def _format_multicolumn(self, row: List[str]) -> List[str]: + if self.longtable: + self._write_longtable_begin(buf, column_format) + else: + self._write_tabular_begin(buf, column_format) + + buf.write("\\toprule\n") + + ilevels = self.frame.index.nlevels + clevels = self.frame.columns.nlevels + nlevels = clevels + if self.fmt.has_index_names and self.fmt.show_index_names: + nlevels += 1 + strrows = list(zip(*strcols)) + self.clinebuf: List[List[int]] = [] + + for i, row in enumerate(strrows): + if i == nlevels and self.fmt.header: + buf.write("\\midrule\n") # End of header + if self.longtable: + buf.write("\\endhead\n") + buf.write("\\midrule\n") + buf.write( + f"\\multicolumn{{{len(row)}}}{{r}}" + "{{Continued on next page}} \\\\\n" + ) + buf.write("\\midrule\n") + buf.write("\\endfoot\n\n") + buf.write("\\bottomrule\n") + buf.write("\\endlastfoot\n") + if self.escape: + # escape backslashes first + crow = [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] + else: + crow = [x if x else "{}" for x in row] + if 
self.bold_rows and self.fmt.index: + # bold row labels + crow = [ + f"\\textbf{{{x}}}" + if j < ilevels and x.strip() not in ["", "{}"] + else x + for j, x in enumerate(crow) + ] + if i < clevels and self.fmt.header and self.multicolumn: + # sum up columns to multicolumns + crow = self._format_multicolumn(crow, ilevels) + if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1: + # sum up rows to multirows + crow = self._format_multirow(crow, ilevels, i, strrows) + buf.write(" & ".join(crow)) + buf.write(" \\\\\n") + if self.multirow and i < len(strrows) - 1: + self._print_cline(buf, i, len(strcols)) + + if self.longtable: + self._write_longtable_end(buf) + else: + self._write_tabular_end(buf) + + def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: r""" Combine columns belonging to a group to a single multicolumn entry according to self.multicolumn_format @@ -197,7 +202,7 @@ class RowStringConverter(ABC): will become \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} """ - row2 = row[: self.index_levels] + row2 = list(row[:ilevels]) ncol = 1 coltext = "" @@ -212,7 +217,7 @@ class RowStringConverter(ABC): else: row2.append(coltext) - for c in row[self.index_levels :]: + for c in row[ilevels:]: # if next col has text, write the previous if c.strip(): if coltext: @@ -227,7 +232,9 @@ class RowStringConverter(ABC): append_col() return row2 - def _format_multirow(self, row: List[str], i: int) -> List[str]: + def _format_multirow( + self, row: List[str], ilevels: int, i: int, rows: List[Tuple[str, ...]] + ) -> List[str]: r""" Check following rows, whether row should be a multirow @@ -237,10 +244,10 @@ class RowStringConverter(ABC): b & 0 & \cline{1-2} b & 0 & """ - for j in range(self.index_levels): + for j in range(ilevels): if row[j].strip(): nrow = 1 - for r in self.strrows[i + 1 :]: + for r in rows[i + 1 :]: if not r[j].strip(): nrow += 1 else: @@ -252,574 +259,114 @@ class RowStringConverter(ABC): self.clinebuf.append([i + nrow - 1, j + 1]) return row - def _compose_cline(self, i: int, icol: int) -> str: + def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: """ - Create clines after multirow-blocks are finished. + Print clines after multirow-blocks are finished. """ - lst = [] for cl in self.clinebuf: if cl[0] == i: - lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}") - # remove entries that have been written to buffer - self.clinebuf = [x for x in self.clinebuf if x[0] != i] - return "".join(lst) + buf.write(f"\\cline{{{cl[1]:d}-{icol:d}}}\n") + # remove entries that have been written to buffer + self.clinebuf = [x for x in self.clinebuf if x[0] != i] - -class RowStringIterator(RowStringConverter): - """Iterator over rows of the header or the body of the table.""" - - @abstractmethod - def __iter__(self) -> Iterator[str]: - """Iterate over LaTeX string representations of rows.""" - - -class RowHeaderIterator(RowStringIterator): - """Iterator for the table header rows.""" - - def __iter__(self) -> Iterator[str]: - for row_num in range(len(self.strrows)): - if row_num < self._header_row_num: - yield self.get_strrow(row_num) - - -class RowBodyIterator(RowStringIterator): - """Iterator for the table body rows.""" - - def __iter__(self) -> Iterator[str]: - for row_num in range(len(self.strrows)): - if row_num >= self._header_row_num: - yield self.get_strrow(row_num) - - -class TableBuilderAbstract(ABC): - """ - Abstract table builder producing string representation of LaTeX table. 
- - Parameters - ---------- - formatter : `DataFrameFormatter` - Instance of `DataFrameFormatter`. - column_format: str, optional - Column format, for example, 'rcl' for three columns. - multicolumn: bool, optional - Use multicolumn to enhance MultiIndex columns. - multicolumn_format: str, optional - The alignment for multicolumns, similar to column_format. - multirow: bool, optional - Use multirow to enhance MultiIndex rows. - caption: str, optional - Table caption. - short_caption: str, optional - Table short caption. - label: str, optional - LaTeX label. - position: str, optional - Float placement specifier, for example, 'htb'. - """ - - def __init__( - self, - formatter: DataFrameFormatter, - column_format: Optional[str] = None, - multicolumn: bool = False, - multicolumn_format: Optional[str] = None, - multirow: bool = False, - caption: Optional[str] = None, - short_caption: Optional[str] = None, - label: Optional[str] = None, - position: Optional[str] = None, - ): - self.fmt = formatter - self.column_format = column_format - self.multicolumn = multicolumn - self.multicolumn_format = multicolumn_format - self.multirow = multirow - self.caption = caption - self.short_caption = short_caption - self.label = label - self.position = position - - def get_result(self) -> str: - """String representation of LaTeX table.""" - elements = [ - self.env_begin, - self.top_separator, - self.header, - self.middle_separator, - self.env_body, - self.bottom_separator, - self.env_end, - ] - result = "\n".join([item for item in elements if item]) - trailing_newline = "\n" - result += trailing_newline - return result - - @property - @abstractmethod - def env_begin(self) -> str: - """Beginning of the environment.""" - - @property - @abstractmethod - def top_separator(self) -> str: - """Top level separator.""" - - @property - @abstractmethod - def header(self) -> str: - """Header lines.""" - - @property - @abstractmethod - def middle_separator(self) -> str: - """Middle level separator.""" - - @property - @abstractmethod - def env_body(self) -> str: - """Environment body.""" - - @property - @abstractmethod - def bottom_separator(self) -> str: - """Bottom level separator.""" - - @property - @abstractmethod - def env_end(self) -> str: - """End of the environment.""" - - -class GenericTableBuilder(TableBuilderAbstract): - """Table builder producing string representation of LaTeX table.""" - - @property - def header(self) -> str: - iterator = self._create_row_iterator(over="header") - return "\n".join(list(iterator)) - - @property - def top_separator(self) -> str: - return "\\toprule" - - @property - def middle_separator(self) -> str: - return "\\midrule" if self._is_separator_required() else "" - - @property - def env_body(self) -> str: - iterator = self._create_row_iterator(over="body") - return "\n".join(list(iterator)) - - def _is_separator_required(self) -> bool: - return bool(self.header and self.env_body) - - @property - def _position_macro(self) -> str: - r"""Position macro, extracted from self.position, like [h].""" - return f"[{self.position}]" if self.position else "" - - @property - def _caption_macro(self) -> str: - r"""Caption macro, extracted from self.caption. - - With short caption: - \caption[short_caption]{caption_string}. - - Without short caption: - \caption{caption_string}. 
+ def _write_tabular_begin(self, buf, column_format: str): """ - if self.caption: - return "".join( - [ - r"\caption", - f"[{self.short_caption}]" if self.short_caption else "", - f"{{{self.caption}}}", - ] - ) - return "" - - @property - def _label_macro(self) -> str: - r"""Label macro, extracted from self.label, like \label{ref}.""" - return f"\\label{{{self.label}}}" if self.label else "" - - def _create_row_iterator(self, over: str) -> RowStringIterator: - """Create iterator over header or body of the table. + Write the beginning of a tabular environment or + nested table/tabular environments including caption and label. Parameters ---------- - over : {'body', 'header'} - Over what to iterate. - - Returns - ------- - RowStringIterator - Iterator over body or header. + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + column_format : str + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns """ - iterator_kind = self._select_iterator(over) - return iterator_kind( - formatter=self.fmt, - multicolumn=self.multicolumn, - multicolumn_format=self.multicolumn_format, - multirow=self.multirow, - ) + if self.caption is not None or self.label is not None: + # then write output in a nested table/tabular environment + if self.caption is None: + caption_ = "" + else: + caption_ = f"\n\\caption{{{self.caption}}}" - def _select_iterator(self, over: str) -> Type[RowStringIterator]: - """Select proper iterator over table rows.""" - if over == "header": - return RowHeaderIterator - elif over == "body": - return RowBodyIterator + if self.label is None: + label_ = "" + else: + label_ = f"\n\\label{{{self.label}}}" + + buf.write(f"\\begin{{table}}\n\\centering{caption_}{label_}\n") else: - msg = f"'over' must be either 'header' or 'body', but {over} was provided" - raise ValueError(msg) + # then write output only in a tabular environment + pass + buf.write(f"\\begin{{tabular}}{{{column_format}}}\n") -class LongTableBuilder(GenericTableBuilder): - """Concrete table builder for longtable. + def _write_tabular_end(self, buf): + """ + Write the end of a tabular environment or nested table/tabular + environment. - >>> from pandas import DataFrame - >>> from pandas.io.formats import format as fmt - >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = LongTableBuilder(formatter, caption='a long table', - ... label='tab:long', column_format='lrl') - >>> table = builder.get_result() - >>> print(table) - \\begin{longtable}{lrl} - \\caption{a long table} - \\label{tab:long}\\\\ - \\toprule - {} & a & b \\\\ - \\midrule - \\endfirsthead - \\caption[]{a long table} \\\\ - \\toprule - {} & a & b \\\\ - \\midrule - \\endhead - \\midrule - \\multicolumn{3}{r}{{Continued on next page}} \\\\ - \\midrule - \\endfoot - - \\bottomrule - \\endlastfoot - 0 & 1 & b1 \\\\ - 1 & 2 & b2 \\\\ - \\end{longtable} - - """ + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. 
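For the longtable path shown here, the user-facing switch is to_latex(longtable=True); a minimal sketch of that call (illustrative only; the output requires \usepackage{longtable} on the LaTeX side):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
    print(df.to_latex(longtable=True, caption="a long table", label="tab:long"))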
- @property - def env_begin(self) -> str: - first_row = ( - f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" - ) - elements = [first_row, f"{self._caption_and_label()}"] - return "\n".join([item for item in elements if item]) - - def _caption_and_label(self) -> str: - if self.caption or self.label: - double_backslash = "\\\\" - elements = [f"{self._caption_macro}", f"{self._label_macro}"] - caption_and_label = "\n".join([item for item in elements if item]) - caption_and_label += double_backslash - return caption_and_label + """ + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") + if self.caption is not None or self.label is not None: + buf.write("\\end{table}\n") else: - return "" + pass - @property - def middle_separator(self) -> str: - iterator = self._create_row_iterator(over="header") - - # the content between \endfirsthead and \endhead commands - # mitigates repeated List of Tables entries in the final LaTeX - # document when dealing with longtable environments; GH #34360 - elements = [ - "\\midrule", - "\\endfirsthead", - f"\\caption[]{{{self.caption}}} \\\\" if self.caption else "", - self.top_separator, - self.header, - "\\midrule", - "\\endhead", - "\\midrule", - f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}" - "{{Continued on next page}} \\\\", - "\\midrule", - "\\endfoot\n", - "\\bottomrule", - "\\endlastfoot", - ] - if self._is_separator_required(): - return "\n".join(elements) - return "" - - @property - def bottom_separator(self) -> str: - return "" - - @property - def env_end(self) -> str: - return "\\end{longtable}" - - -class RegularTableBuilder(GenericTableBuilder): - """Concrete table builder for regular table. - - >>> from pandas import DataFrame - >>> from pandas.io.formats import format as fmt - >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab', - ... column_format='lrc') - >>> table = builder.get_result() - >>> print(table) - \\begin{table} - \\centering - \\caption{caption} - \\label{lab} - \\begin{tabular}{lrc} - \\toprule - {} & a & b \\\\ - \\midrule - 0 & 1 & b1 \\\\ - 1 & 2 & b2 \\\\ - \\bottomrule - \\end{tabular} - \\end{table} - - """ - - @property - def env_begin(self) -> str: - elements = [ - f"\\begin{{table}}{self._position_macro}", - "\\centering", - f"{self._caption_macro}", - f"{self._label_macro}", - f"\\begin{{tabular}}{{{self.column_format}}}", - ] - return "\n".join([item for item in elements if item]) - - @property - def bottom_separator(self) -> str: - return "\\bottomrule" - - @property - def env_end(self) -> str: - return "\n".join(["\\end{tabular}", "\\end{table}"]) - - -class TabularBuilder(GenericTableBuilder): - """Concrete table builder for tabular environment. 
- - >>> from pandas import DataFrame - >>> from pandas.io.formats import format as fmt - >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = TabularBuilder(formatter, column_format='lrc') - >>> table = builder.get_result() - >>> print(table) - \\begin{tabular}{lrc} - \\toprule - {} & a & b \\\\ - \\midrule - 0 & 1 & b1 \\\\ - 1 & 2 & b2 \\\\ - \\bottomrule - \\end{tabular} - - """ - - @property - def env_begin(self) -> str: - return f"\\begin{{tabular}}{{{self.column_format}}}" - - @property - def bottom_separator(self) -> str: - return "\\bottomrule" - - @property - def env_end(self) -> str: - return "\\end{tabular}" - - -class LatexFormatter: - r""" - Used to render a DataFrame to a LaTeX tabular/longtable environment output. - - Parameters - ---------- - formatter : `DataFrameFormatter` - longtable : bool, default False - Use longtable environment. - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 columns - multicolumn : bool, default False - Use \multicolumn to enhance MultiIndex columns. - multicolumn_format : str, default 'l' - The alignment for multicolumns, similar to `column_format` - multirow : bool, default False - Use \multirow to enhance MultiIndex rows. - caption : str or tuple, optional - Tuple (full_caption, short_caption), - which results in \caption[short_caption]{full_caption}; - if a single string is passed, no short caption will be set. - label : str, optional - The LaTeX label to be placed inside ``\label{}`` in the output. - position : str, optional - The LaTeX positional argument for tables, to be placed after - ``\begin{}`` in the output. - - See Also - -------- - HTMLFormatter - """ - - def __init__( - self, - formatter: DataFrameFormatter, - longtable: bool = False, - column_format: Optional[str] = None, - multicolumn: bool = False, - multicolumn_format: Optional[str] = None, - multirow: bool = False, - caption: Optional[Union[str, Tuple[str, str]]] = None, - label: Optional[str] = None, - position: Optional[str] = None, - ): - self.fmt = formatter - self.frame = self.fmt.frame - self.longtable = longtable - self.column_format = column_format - self.multicolumn = multicolumn - self.multicolumn_format = multicolumn_format - self.multirow = multirow - self.caption, self.short_caption = _split_into_full_short_caption(caption) - self.label = label - self.position = position - - def to_string(self) -> str: + def _write_longtable_begin(self, buf, column_format: str): """ - Render a DataFrame to a LaTeX tabular, longtable, or table/tabular - environment output. + Write the beginning of a longtable environment including caption and + label if provided by user. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + column_format : str + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns """ - return self.builder.get_result() + buf.write(f"\\begin{{longtable}}{{{column_format}}}\n") - @property - def builder(self) -> TableBuilderAbstract: - """Concrete table builder. 
+ if self.caption is not None or self.label is not None: + if self.caption is None: + pass + else: + buf.write(f"\\caption{{{self.caption}}}") - Returns - ------- - TableBuilder - """ - builder = self._select_builder() - return builder( - formatter=self.fmt, - column_format=self.column_format, - multicolumn=self.multicolumn, - multicolumn_format=self.multicolumn_format, - multirow=self.multirow, - caption=self.caption, - short_caption=self.short_caption, - label=self.label, - position=self.position, - ) + if self.label is None: + pass + else: + buf.write(f"\\label{{{self.label}}}") - def _select_builder(self) -> Type[TableBuilderAbstract]: - """Select proper table builder.""" - if self.longtable: - return LongTableBuilder - if any([self.caption, self.label, self.position]): - return RegularTableBuilder - return TabularBuilder - - @property - def column_format(self) -> Optional[str]: - """Column format.""" - return self._column_format - - @column_format.setter - def column_format(self, input_column_format: Optional[str]) -> None: - """Setter for column format.""" - if input_column_format is None: - self._column_format = ( - self._get_index_format() + self._get_column_format_based_on_dtypes() - ) - elif not isinstance(input_column_format, str): - raise ValueError( - f"column_format must be str or unicode, " - f"not {type(input_column_format)}" - ) + # a double-backslash is required at the end of the line + # as discussed here: + # https://tex.stackexchange.com/questions/219138 + buf.write("\\\\\n") else: - self._column_format = input_column_format + pass - def _get_column_format_based_on_dtypes(self) -> str: - """Get column format based on data type. - - Right alignment for numbers and left - for strings. + @staticmethod + def _write_longtable_end(buf): """ + Write the end of a longtable environment. - def get_col_type(dtype): - if issubclass(dtype.type, np.number): - return "r" - return "l" + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. - dtypes = self.frame.dtypes._values - return "".join(map(get_col_type, dtypes)) - - def _get_index_format(self) -> str: - """Get index column format.""" - return "l" * self.frame.index.nlevels if self.fmt.index else "" - - -def _escape_symbols(row: Sequence[str]) -> List[str]: - """Carry out string replacements for special symbols. - - Parameters - ---------- - row : list - List of string, that may contain special symbols. - - Returns - ------- - list - list of strings with the special symbols replaced. 
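The column-format detection and special-symbol escaping above correspond to the column_format= and escape= keywords of to_latex. A short sketch of the defaults and of overriding them (illustrative, assuming pandas is installed):

    import pandas as pd

    df = pd.DataFrame({"pct_done": [0.5, 0.75], "name": ["a_b", "c%d"]})
    # default: numeric columns right-aligned, strings left-aligned, and _, % escaped
    print(df.to_latex())
    # explicit column format, and no escaping of special LaTeX characters
    print(df.to_latex(column_format="lrl", escape=False))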
- """ - return [ - ( - x.replace("\\", "\\textbackslash ") - .replace("_", "\\_") - .replace("%", "\\%") - .replace("$", "\\$") - .replace("#", "\\#") - .replace("{", "\\{") - .replace("}", "\\}") - .replace("~", "\\textasciitilde ") - .replace("^", "\\textasciicircum ") - .replace("&", "\\&") - if (x and x != "{}") - else "{}" - ) - for x in row - ] - - -def _convert_to_bold(crow: Sequence[str], ilevels: int) -> List[str]: - """Convert elements in ``crow`` to bold.""" - return [ - f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x - for j, x in enumerate(crow) - ] - - -if __name__ == "__main__": - import doctest - - doctest.testmod() + """ + buf.write("\\end{longtable}\n") diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/printing.py b/venv/lib/python3.8/site-packages/pandas/io/formats/printing.py index 128e50d..1cf79dc 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/printing.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/printing.py @@ -12,7 +12,6 @@ from typing import ( Mapping, Optional, Sequence, - Sized, Tuple, TypeVar, Union, @@ -206,7 +205,7 @@ def pprint_thing( translate = escape_chars escape_chars = list(escape_chars.keys()) else: - escape_chars = escape_chars or () + escape_chars = escape_chars or tuple() result = str(thing) for c in escape_chars: @@ -244,7 +243,7 @@ def pprint_thing_encoded( return value.encode(encoding, errors) -def enable_data_resource_formatter(enable: bool) -> None: +def _enable_data_resource_formatter(enable: bool) -> None: if "IPython" not in sys.modules: # definitely not in IPython return @@ -308,7 +307,7 @@ def format_object_summary( name : name, optional defaults to the class name of the obj indent_for_name : bool, default True - Whether subsequent lines should be indented to + Whether subsequent lines should be be indented to align with the name. line_break_each_value : bool, default False If True, inserts a line break for each value of ``obj``. @@ -322,7 +321,7 @@ def format_object_summary( summary string """ from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import get_adjustment + from pandas.io.formats.format import _get_adjustment display_width, _ = get_console_size() if display_width is None: @@ -351,7 +350,7 @@ def format_object_summary( is_truncated = n > max_seq_items # adj can optionally handle unicode eastern asian width - adj = get_adjustment() + adj = _get_adjustment() def _extend_line( s: str, line: str, value: str, display_width: int, next_line_prefix: str @@ -500,11 +499,11 @@ def _justify( # error: Incompatible return value type (got "Tuple[List[Sequence[str]], # List[Sequence[str]]]", expected "Tuple[List[Tuple[str, ...]], # List[Tuple[str, ...]]]") - return head, tail # type: ignore[return-value] + return head, tail # type: ignore def format_object_attrs( - obj: Sized, include_dtype: bool = True + obj: Sequence, include_dtype: bool = True ) -> List[Tuple[str, Union[str, int]]]: """ Return a list of tuples of the (attr, formatted_value) @@ -513,7 +512,7 @@ def format_object_attrs( Parameters ---------- obj : object - Must be sized. 
+ must be iterable include_dtype : bool If False, dtype won't be in the returned list @@ -524,17 +523,15 @@ def format_object_attrs( """ attrs: List[Tuple[str, Union[str, int]]] = [] if hasattr(obj, "dtype") and include_dtype: - # error: "Sized" has no attribute "dtype" - attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined] + # error: "Sequence[Any]" has no attribute "dtype" + attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore if getattr(obj, "name", None) is not None: - # error: "Sized" has no attribute "name" - attrs.append(("name", default_pprint(obj.name))) # type: ignore[attr-defined] - # error: "Sized" has no attribute "names" - elif getattr(obj, "names", None) is not None and any( - obj.names # type: ignore[attr-defined] - ): - # error: "Sized" has no attribute "names" - attrs.append(("names", default_pprint(obj.names))) # type: ignore[attr-defined] + # error: "Sequence[Any]" has no attribute "name" + attrs.append(("name", default_pprint(obj.name))) # type: ignore + # error: "Sequence[Any]" has no attribute "names" + elif getattr(obj, "names", None) is not None and any(obj.names): # type: ignore + # error: "Sequence[Any]" has no attribute "names" + attrs.append(("names", default_pprint(obj.names))) # type: ignore max_seq_items = get_option("display.max_seq_items") or len(obj) if len(obj) > max_seq_items: attrs.append(("length", len(obj))) diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/string.py b/venv/lib/python3.8/site-packages/pandas/io/formats/string.py deleted file mode 100644 index 4ebb78f..0000000 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/string.py +++ /dev/null @@ -1,201 +0,0 @@ -""" -Module for formatting output data in console (to string). -""" -from shutil import get_terminal_size -from typing import Iterable, List, Optional - -import numpy as np - -from pandas.io.formats.format import DataFrameFormatter -from pandas.io.formats.printing import pprint_thing - - -class StringFormatter: - """Formatter for string representation of a dataframe.""" - - def __init__(self, fmt: DataFrameFormatter, line_width: Optional[int] = None): - self.fmt = fmt - self.adj = fmt.adj - self.frame = fmt.frame - self.line_width = line_width - - def to_string(self) -> str: - text = self._get_string_representation() - if self.fmt.should_show_dimensions: - text = "".join([text, self.fmt.dimensions_info]) - return text - - def _get_strcols(self) -> List[List[str]]: - strcols = self.fmt.get_strcols() - if self.fmt.is_truncated: - strcols = self._insert_dot_separators(strcols) - return strcols - - def _get_string_representation(self) -> str: - if self.fmt.frame.empty: - return self._empty_info_line - - strcols = self._get_strcols() - - if self.line_width is None: - # no need to wrap around just print the whole frame - return self.adj.adjoin(1, *strcols) - - if self._need_to_wrap_around: - return self._join_multiline(strcols) - - return self._fit_strcols_to_terminal_width(strcols) - - @property - def _empty_info_line(self) -> str: - return ( - f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(self.frame.columns)}\n" - f"Index: {pprint_thing(self.frame.index)}" - ) - - @property - def _need_to_wrap_around(self) -> bool: - return bool(self.fmt.max_cols is None or self.fmt.max_cols > 0) - - def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: - str_index = self.fmt._get_formatted_index(self.fmt.tr_frame) - index_length = len(str_index) - - if self.fmt.is_truncated_horizontally: - strcols = 
self._insert_dot_separator_horizontal(strcols, index_length) - - if self.fmt.is_truncated_vertically: - strcols = self._insert_dot_separator_vertical(strcols, index_length) - - return strcols - - def _insert_dot_separator_horizontal( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - strcols.insert(self.fmt.tr_col_num + 1, [" ..."] * index_length) - return strcols - - def _insert_dot_separator_vertical( - self, strcols: List[List[str]], index_length: int - ) -> List[List[str]]: - n_header_rows = index_length - len(self.fmt.tr_frame) - row_num = self.fmt.tr_row_num - for ix, col in enumerate(strcols): - cwidth = self.adj.len(col[row_num]) - - if self.fmt.is_truncated_horizontally: - is_dot_col = ix == self.fmt.tr_col_num + 1 - else: - is_dot_col = False - - if cwidth > 3 or is_dot_col: - dots = "..." - else: - dots = ".." - - if ix == 0: - dot_mode = "left" - elif is_dot_col: - cwidth = 4 - dot_mode = "right" - else: - dot_mode = "right" - - dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] - col.insert(row_num + n_header_rows, dot_str) - return strcols - - def _join_multiline(self, strcols_input: Iterable[List[str]]) -> str: - lwidth = self.line_width - adjoin_width = 1 - strcols = list(strcols_input) - - if self.fmt.index: - idx = strcols.pop(0) - lwidth -= np.array([self.adj.len(x) for x in idx]).max() + adjoin_width - - col_widths = [ - np.array([self.adj.len(x) for x in col]).max() if len(col) > 0 else 0 - for col in strcols - ] - - assert lwidth is not None - col_bins = _binify(col_widths, lwidth) - nbins = len(col_bins) - - if self.fmt.is_truncated_vertically: - assert self.fmt.max_rows_fitted is not None - nrows = self.fmt.max_rows_fitted + 1 - else: - nrows = len(self.frame) - - str_lst = [] - start = 0 - for i, end in enumerate(col_bins): - row = strcols[start:end] - if self.fmt.index: - row.insert(0, idx) - if nbins > 1: - if end <= len(strcols) and i < nbins - 1: - row.append([" \\"] + [" "] * (nrows - 1)) - else: - row.append([" "] * nrows) - str_lst.append(self.adj.adjoin(adjoin_width, *row)) - start = end - return "\n\n".join(str_lst) - - def _fit_strcols_to_terminal_width(self, strcols: List[List[str]]) -> str: - from pandas import Series - - lines = self.adj.adjoin(1, *strcols).split("\n") - max_len = Series(lines).str.len().max() - # plus truncate dot col - width, _ = get_terminal_size() - dif = max_len - width - # '+ 1' to avoid too wide repr (GH PR #17023) - adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) - n_cols = len(col_lens) - counter = 0 - while adj_dif > 0 and n_cols > 1: - counter += 1 - mid = int(round(n_cols / 2.0)) - mid_ix = col_lens.index[mid] - col_len = col_lens[mid_ix] - # adjoin adds one - adj_dif -= col_len + 1 - col_lens = col_lens.drop(mid_ix) - n_cols = len(col_lens) - - # subtract index column - max_cols_fitted = n_cols - self.fmt.index - # GH-21180. Ensure that we print at least two. 
- max_cols_fitted = max(max_cols_fitted, 2) - self.fmt.max_cols_fitted = max_cols_fitted - - # Call again _truncate to cut frame appropriately - # and then generate string representation - self.fmt.truncate() - strcols = self._get_strcols() - return self.adj.adjoin(1, *strcols) - - -def _binify(cols: List[int], line_width: int) -> List[int]: - adjoin_width = 1 - bins = [] - curr_width = 0 - i_last_column = len(cols) - 1 - for i, w in enumerate(cols): - w_adjoined = w + adjoin_width - curr_width += w_adjoined - if i_last_column == i: - wrap = curr_width + 1 > line_width and i > 0 - else: - wrap = curr_width + 2 > line_width and i > 0 - if wrap: - bins.append(i) - curr_width = w_adjoined - - bins.append(len(cols)) - return bins diff --git a/venv/lib/python3.8/site-packages/pandas/io/formats/style.py b/venv/lib/python3.8/site-packages/pandas/io/formats/style.py index 6ed31f3..3bbb527 100644 --- a/venv/lib/python3.8/site-packages/pandas/io/formats/style.py +++ b/venv/lib/python3.8/site-packages/pandas/io/formats/style.py @@ -1,6 +1,7 @@ """ Module for applying conditional formatting to DataFrames and Series. """ + from collections import defaultdict from contextlib import contextmanager import copy @@ -17,7 +18,7 @@ from typing import ( Tuple, Union, ) -from uuid import uuid4 +from uuid import uuid1 import numpy as np @@ -32,11 +33,10 @@ from pandas.core.dtypes.common import is_float import pandas as pd from pandas.api.types import is_dict_like, is_list_like -from pandas.core import generic import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice +from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") @@ -89,12 +89,6 @@ class Styler: .. versionadded:: 1.0.0 - uuid_len : int, default 5 - If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate - expressed in hex characters, in range [0, 32]. - - .. 
versionadded:: 1.2.0 - Attributes ---------- env : Jinja2 jinja2.Environment @@ -150,7 +144,6 @@ class Styler: table_attributes: Optional[str] = None, cell_ids: bool = True, na_rep: Optional[str] = None, - uuid_len: int = 5, ): self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) self._todo: List[Tuple[Callable, Tuple, Dict]] = [] @@ -166,10 +159,7 @@ class Styler: self.index = data.index self.columns = data.columns - if not isinstance(uuid_len, int) or not uuid_len >= 0: - raise TypeError("``uuid_len`` must be an integer in range [0, 32].") - self.uuid_len = min(32, uuid_len) - self.uuid = (uuid or uuid4().hex[: self.uuid_len]) + "_" + self.uuid = uuid self.table_styles = table_styles self.caption = caption if precision is None: @@ -181,8 +171,6 @@ class Styler: self.cell_ids = cell_ids self.na_rep = na_rep - self.cell_context: Dict[str, Any] = {} - # display_funcs maps (row, col) -> formatting function def default_display_func(x): @@ -204,11 +192,7 @@ class Styler: """ return self.render() - @doc( - NDFrame.to_excel, - klass="Styler", - storage_options=generic._shared_docs["storage_options"], - ) + @doc(NDFrame.to_excel, klass="Styler") def to_excel( self, excel_writer, @@ -262,7 +246,7 @@ class Styler: precision = self.precision hidden_index = self.hidden_index hidden_columns = self.hidden_columns - uuid = self.uuid + uuid = self.uuid or str(uuid1()).replace("-", "_") ROW_HEADING_CLASS = "row_heading" COL_HEADING_CLASS = "col_heading" INDEX_NAME_CLASS = "index_name" @@ -278,7 +262,7 @@ class Styler: idx_lengths = _get_level_lengths(self.index) col_lengths = _get_level_lengths(self.columns, hidden_columns) - cell_context = self.cell_context + cell_context = dict() n_rlvls = self.data.index.nlevels n_clvls = self.data.columns.nlevels @@ -343,7 +327,7 @@ class Styler: colspan = col_lengths.get((r, c), 0) if colspan > 1: es["attributes"] = [ - format_attr({"key": "colspan", "value": f'"{colspan}"'}) + format_attr({"key": "colspan", "value": colspan}) ] row_es.append(es) head.append(row_es) @@ -389,7 +373,7 @@ class Styler: rowspan = idx_lengths.get((c, r), 0) if rowspan > 1: es["attributes"] = [ - format_attr({"key": "rowspan", "value": f'"{rowspan}"'}) + format_attr({"key": "rowspan", "value": rowspan}) ] row_es.append(es) @@ -433,16 +417,16 @@ class Styler: else: table_attr += ' class="tex2jax_ignore"' - return { - "head": head, - "cellstyle": cellstyle, - "body": body, - "uuid": uuid, - "precision": precision, - "table_styles": table_styles, - "caption": caption, - "table_attributes": table_attr, - } + return dict( + head=head, + cellstyle=cellstyle, + body=body, + uuid=uuid, + precision=precision, + table_styles=table_styles, + caption=caption, + table_attributes=table_attr, + ) def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Styler": """ @@ -491,7 +475,7 @@ class Styler: row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) else: - subset = non_reducing_slice(subset) + subset = _non_reducing_slice(subset) if len(subset) == 1: subset = subset, self.data.columns @@ -515,69 +499,6 @@ class Styler: self._display_funcs[(i, j)] = formatter return self - def set_td_classes(self, classes: DataFrame) -> "Styler": - """ - Add string based CSS class names to data cells that will appear within the - `Styler` HTML result. These classes are added within specified `` elements. 
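The Styler.format signature visible in this hunk is used roughly as follows (illustrative sketch only; rendering requires jinja2, and the na_rep keyword assumes pandas >= 1.0):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"ratio": [0.1234, np.nan]})
    # format string applied to every cell; NaN rendered as "-"
    html = df.style.format("{:.2%}", na_rep="-").render()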
- - Parameters - ---------- - classes : DataFrame - DataFrame containing strings that will be translated to CSS classes, - mapped by identical column and index values that must exist on the - underlying `Styler` data. None, NaN values, and empty strings will - be ignored and not affect the rendered HTML. - - Returns - ------- - self : Styler - - Examples - -------- - >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) - >>> classes = pd.DataFrame([ - ... ["min-val red", "", "blue"], - ... ["red", None, "blue max-val"] - ... ], index=df.index, columns=df.columns) - >>> df.style.set_td_classes(classes) - - Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the - underlying, - - >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"], - ... columns=[["level0", "level0"], ["level1a", "level1b"]]) - >>> classes = pd.DataFrame(["min-val"], index=["a"], - ... columns=[["level0"],["level1a"]]) - >>> df.style.set_td_classes(classes) - - Form of the output with new additional css classes, - - >>> df = pd.DataFrame([[1]]) - >>> css = pd.DataFrame(["other-class"]) - >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css) - >>> s.hide_index().render() - '' - '' - ' ' - ' ' - ' ' - ' ' - ' ' - ' ' - '
0
1
' - """ - classes = classes.reindex_like(self.data) - - mask = (classes.isna()) | (classes.eq("")) - self.cell_context["data"] = { - r: {c: [str(classes.iloc[r, c])]} - for r, rn in enumerate(classes.index) - for c, cn in enumerate(classes.columns) - if not mask.iloc[r, c] - } - - return self - def render(self, **kwargs) -> str: """ Render the built up styles to HTML. @@ -688,7 +609,6 @@ class Styler: Returns None. """ self.ctx.clear() - self.cell_context = {} self._todo = [] def _compute(self): @@ -713,7 +633,7 @@ class Styler: **kwargs, ) -> "Styler": subset = slice(None) if subset is None else subset - subset = non_reducing_slice(subset) + subset = _non_reducing_slice(subset) data = self.data.loc[subset] if axis is not None: result = data.apply(func, axis=axis, result_type="expand", **kwargs) @@ -805,7 +725,7 @@ class Styler: func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: subset = pd.IndexSlice[:] - subset = non_reducing_slice(subset) + subset = _non_reducing_slice(subset) result = self.data.loc[subset].applymap(func) self._update_ctx(result) return self @@ -832,8 +752,7 @@ class Styler: See Also -------- - Styler.where: Updates the HTML representation with a style which is - selected in accordance with the return value of a function. + Styler.where """ self._todo.append( (lambda instance: getattr(instance, "_applymap"), (func, subset), kwargs) @@ -874,7 +793,7 @@ class Styler: See Also -------- - Styler.applymap: Updates the HTML representation with the result. + Styler.applymap """ if other is None: other = "" @@ -903,7 +822,7 @@ class Styler: Set the table attributes. These are the items that show up in the opening ```` tag - in addition to automatic (by default) id. + in addition to to automatic (by default) id. Parameters ---------- @@ -934,7 +853,7 @@ class Styler: See Also -------- - Styler.use: Set the styles on the current Styler. + Styler.use """ return self._todo @@ -955,7 +874,7 @@ class Styler: See Also -------- - Styler.export : Export the styles to applied to the current Styler. + Styler.export """ self._todo.extend(styles) return self @@ -990,46 +909,20 @@ class Styler: self.caption = caption return self - def set_table_styles(self, table_styles, axis=0, overwrite=True) -> "Styler": + def set_table_styles(self, table_styles) -> "Styler": """ Set the table styles on a Styler. These are placed in a ``