# Source code for pandera.dtypes

# pylint: disable=no-member,too-many-public-methods
"""Schema datatypes."""

from enum import Enum
from typing import Optional, Union

import numpy as np
import pandas as pd
from packaging import version

# Alias for the pandas ``ExtensionDtype`` base class; used by
# ``is_extension_dtype`` to recognise extension dtype classes and instances.
PandasExtensionType = pd.core.dtypes.base.ExtensionDtype

# True when the major version of the installed pandas is below 1. The string
# dtype handling in this module branches on this flag.
LEGACY_PANDAS = version.parse(pd.__version__).major < 1  # type: ignore

# Set of numpy scalar types that ``PandasDtype.get_dtype`` routes through
# ``PandasDtype.from_numpy_type``: every type listed in ``np.sctypes`` plus the
# platform-default aliases.
# NOTE(review): ``np.sctypes``, ``np.complex_`` and ``np.float_`` are NumPy 1.x
# names -- confirm the NumPy version range this module has to support.
NUMPY_TYPES = frozenset(
    [item for sublist in np.sctypes.values() for item in sublist]  # type: ignore
).union(
    frozenset([np.complex_, np.int_, np.uint, np.float_, np.str_, np.bool_])
)

# The generic "int", "float" and "complex" aliases are resolved to whatever
# dtype pandas actually infers on this platform, rather than being hard-coded.
# On Windows, pandas defaults to int64 while numpy defaults to int32, so the
# pandas and numpy defaults are recorded separately.
(
    _DEFAULT_PANDAS_INT_TYPE,
    _DEFAULT_PANDAS_FLOAT_TYPE,
    _DEFAULT_PANDAS_COMPLEX_TYPE,
) = (str(pd.Series([sample]).dtype) for sample in (1, 1.0, complex(1)))
_DEFAULT_NUMPY_INT_TYPE, _DEFAULT_NUMPY_FLOAT_TYPE = (
    str(np.dtype(scalar_type)) for scalar_type in (int, float)
)

def is_extension_dtype(dtype):
    """Check if a value is a pandas extension type or instance of one.

    :param dtype: object to inspect.
    :returns: ``True`` if ``dtype`` is an instance of
        ``PandasExtensionType`` or a class that subclasses it, ``False``
        otherwise.
    """
    # The ``isinstance(dtype, type)`` guard is required because ``issubclass``
    # raises ``TypeError`` when its first argument is not a class.
    return isinstance(dtype, PandasExtensionType) or (
        isinstance(dtype, type) and issubclass(dtype, PandasExtensionType)
    )

class PandasDtype(Enum):
    # pylint: disable=line-too-long
    """Enumerate all valid pandas data types.

    ``pandera`` follows the
    `numpy data types <https://numpy.org/doc/stable/reference/arrays.dtypes.html>`_
    subscribed to by ``pandas`` and by default supports using the numpy data
    type string aliases to validate DataFrame or Series dtypes.

    This class simply enumerates the valid numpy dtypes for pandas arrays.
    For convenience ``PandasDtype`` enums can all be accessed in the top-level
    ``pandera`` name space via the same enum name.

    :examples:

    >>> import pandas as pd
    >>> import pandera as pa
    >>>
    >>>
    >>> pa.SeriesSchema(pa.Int).validate(pd.Series([1, 2, 3]))
    0    1
    1    2
    2    3
    dtype: int64
    >>> pa.SeriesSchema(pa.Float).validate(pd.Series([1.1, 2.3, 3.4]))
    0    1.1
    1    2.3
    2    3.4
    dtype: float64
    >>> pa.SeriesSchema(pa.String).validate(pd.Series(["a", "b", "c"]))
    0    a
    1    b
    2    c
    dtype: object

    Alternatively, you can use built-in python scalar types for integers,
    floats, booleans, and strings:

    >>> pa.SeriesSchema(int).validate(pd.Series([1, 2, 3]))
    0    1
    1    2
    2    3
    dtype: int64

    You can also use the pandas string aliases in the schema definition:

    >>> pa.SeriesSchema("int").validate(pd.Series([1, 2, 3]))
    0    1
    1    2
    2    3
    dtype: int64

    .. note::

        ``pandera`` also offers limited support for
        `pandas extension types <https://pandas.pydata.org/docs/user_guide/basics.html#dtypes>`_,
        however since the release of pandas 1.0.0 there are backwards
        incompatible extension types like the ``Integer`` array. The extension
        types, e.g. ``pd.Int64Dtype()`` and their string alias should work
        when supplied to the ``pandas_dtype`` argument, unless otherwise
        specified below, but this functionality is only tested for
        pandas >= 1.0.0. Extension types in earlier versions are not
        guaranteed to work as the ``pandas_dtype`` argument in schemas or
        schema components.

    """

    Bool = "bool"  #: ``"bool"`` numpy dtype
    DateTime = "datetime64[ns]"  #: ``"datetime64[ns]"`` numpy dtype
    Timedelta = "timedelta64[ns]"  #: ``"timedelta64[ns]"`` numpy dtype
    Category = "category"  #: pandas ``"category"`` datatype
    Float = "float"  #: ``"float"`` numpy dtype
    Float16 = "float16"  #: ``"float16"`` numpy dtype
    Float32 = "float32"  #: ``"float32"`` numpy dtype
    Float64 = "float64"  #: ``"float64"`` numpy dtype
    Int = "int"  #: ``"int"`` numpy dtype
    Int8 = "int8"  #: ``"int8"`` numpy dtype
    Int16 = "int16"  #: ``"int16"`` numpy dtype
    Int32 = "int32"  #: ``"int32"`` numpy dtype
    Int64 = "int64"  #: ``"int64"`` numpy dtype
    UInt8 = "uint8"  #: ``"uint8"`` numpy dtype
    UInt16 = "uint16"  #: ``"uint16"`` numpy dtype
    UInt32 = "uint32"  #: ``"uint32"`` numpy dtype
    UInt64 = "uint64"  #: ``"uint64"`` numpy dtype
    INT8 = "Int8"  #: ``"Int8"`` pandas dtype: pandas 0.24.0+
    INT16 = "Int16"  #: ``"Int16"`` pandas dtype: pandas 0.24.0+
    INT32 = "Int32"  #: ``"Int32"`` pandas dtype: pandas 0.24.0+
    INT64 = "Int64"  #: ``"Int64"`` pandas dtype: pandas 0.24.0+
    UINT8 = "UInt8"  #: ``"UInt8"`` pandas dtype: pandas 0.24.0+
    UINT16 = "UInt16"  #: ``"UInt16"`` pandas dtype: pandas 0.24.0+
    UINT32 = "UInt32"  #: ``"UInt32"`` pandas dtype: pandas 0.24.0+
    UINT64 = "UInt64"  #: ``"UInt64"`` pandas dtype: pandas 0.24.0+
    Object = "object"  #: ``"object"`` numpy dtype
    Complex = "complex"  #: ``"complex"`` numpy dtype
    Complex64 = "complex64"  #: ``"complex64"`` numpy dtype
    Complex128 = "complex128"  #: ``"complex128"`` numpy dtype
    Complex256 = "complex256"  #: ``"complex256"`` numpy dtype
    String = "str"  #: ``"str"`` numpy dtype

    #: ``"string"`` pandas dtypes: pandas 1.0.0+. For <1.0.0, this enum will
    #: fall back on the str-as-object-array representation.
    STRING = "string"

    @property
    def str_alias(self):
        """Get datatype string alias.

        The generic ``"int"``, ``"float"`` and ``"complex"`` values resolve to
        the pandas defaults recorded at import time, ``"str"`` resolves to
        ``"object"``, and ``"string"`` resolves to ``"object"`` on legacy
        pandas. Every other member returns its own value.
        """
        return {
            "int": _DEFAULT_PANDAS_INT_TYPE,
            "float": _DEFAULT_PANDAS_FLOAT_TYPE,
            "complex": _DEFAULT_PANDAS_COMPLEX_TYPE,
            "str": "object",
            "string": "object" if LEGACY_PANDAS else "string",
        }.get(self.value, self.value)

    @classmethod
    def from_str_alias(cls, str_alias: str) -> "PandasDtype":
        """Get PandasDtype from string alias.

        :param str_alias: pandas dtype string alias.
        :returns: pandas dtype
        :raises TypeError: if the alias is not one of the recognized strings.
        """
        pandas_dtype = {
            "bool": cls.Bool,
            "datetime64[ns]": cls.DateTime,
            "timedelta64[ns]": cls.Timedelta,
            "category": cls.Category,
            "float": cls.Float,
            "float16": cls.Float16,
            "float32": cls.Float32,
            "float64": cls.Float64,
            "int": cls.Int,
            "int8": cls.Int8,
            "int16": cls.Int16,
            "int32": cls.Int32,
            "int64": cls.Int64,
            "uint8": cls.UInt8,
            "uint16": cls.UInt16,
            "uint32": cls.UInt32,
            "uint64": cls.UInt64,
            "Int8": cls.INT8,
            "Int16": cls.INT16,
            "Int32": cls.INT32,
            "Int64": cls.INT64,
            "UInt8": cls.UINT8,
            "UInt16": cls.UINT16,
            "UInt32": cls.UINT32,
            "UInt64": cls.UINT64,
            "object": cls.Object,
            "complex": cls.Complex,
            "complex64": cls.Complex64,
            "complex128": cls.Complex128,
            "complex256": cls.Complex256,
            "str": cls.String,
            # legacy pandas has no dedicated string dtype
            "string": cls.String if LEGACY_PANDAS else cls.STRING,
        }.get(str_alias)

        if pandas_dtype is None:
            raise TypeError(
                f"pandas dtype string alias '{str_alias}' not recognized"
            )

        return pandas_dtype

    @classmethod
    def from_pandas_api_type(cls, pandas_api_type: str) -> "PandasDtype":
        """Get PandasDtype enum from pandas api type.

        :param pandas_api_type: inferred-type string, presumably the output of
            ``pandas.api.types.infer_dtype``. Any value starting with
            ``"mixed"`` maps to ``Object``.
        :returns: pandas dtype
        :raises TypeError: if the api type string is not recognized.
        """
        if pandas_api_type.startswith("mixed"):
            return cls.Object

        pandas_dtype = {
            "string": cls.String,
            "floating": cls.Float,
            "integer": cls.Int,
            "categorical": cls.Category,
            "boolean": cls.Bool,
            "datetime64": cls.DateTime,
            "datetime": cls.DateTime,
            "timedelta64": cls.Timedelta,
            "timedelta": cls.Timedelta,
        }.get(pandas_api_type)

        if pandas_dtype is None:
            raise TypeError(
                f"pandas api type '{pandas_api_type}' not recognized"
            )

        return pandas_dtype

    @classmethod
    def from_python_type(cls, python_type: type) -> "PandasDtype":
        """Get PandasDtype enum from built-in python type.

        :param python_type: built-in python type. Allowable types are:
            bool, str, int, float, object, and complex.
        :returns: pandas dtype
        :raises TypeError: if the python type is not one of the allowable
            types.
        """
        pandas_dtype = {
            bool: cls.Bool,
            str: cls.String,
            int: cls.Int,
            float: cls.Float,
            object: cls.Object,
            complex: cls.Complex,
        }.get(python_type)

        if pandas_dtype is None:
            raise TypeError(
                f"python type '{python_type}' not recognized as pandas data type"
            )

        return pandas_dtype

    @classmethod
    def from_numpy_type(cls, numpy_type: np.dtype) -> "PandasDtype":
        """Get PandasDtype enum from numpy type.

        :param numpy_type: numpy data type.
        :returns: pandas dtype
        :raises TypeError: if the resolved dtype name is not a recognized
            string alias.
        """
        # Let pandas normalise the numpy type, then look up its canonical name.
        pd_dtype = pd.api.types.pandas_dtype(numpy_type)
        return cls.from_str_alias(pd_dtype.name)

    @classmethod
    def get_dtype(
        cls,
        pandas_dtype_arg: Union[
            str,
            type,
            "PandasDtype",
            "pd.core.dtypes.dtypes.ExtensionDtype",
            np.dtype,
        ],
    ) -> Optional[
        Union["PandasDtype", "pd.core.dtypes.dtypes.ExtensionDtype"]
    ]:
        """Get PandasDtype from schema argument.

        :param pandas_dtype_arg: ``pandas_dtype`` argument specified in schema
            definition.
        :returns: ``None`` if the argument is ``None``; the argument itself if
            it is already a ``PandasDtype`` or an extension dtype instance; an
            instance of the class if it is an extension dtype class; otherwise
            the ``PandasDtype`` resolved from a numpy type, string alias or
            python type.
        :raises TypeError: if an extension dtype class cannot be instantiated
            or the argument cannot be resolved.
        """
        dtype_ = pandas_dtype_arg

        if dtype_ is None:
            return dtype_
        elif isinstance(dtype_, PandasDtype):
            return pandas_dtype_arg
        elif is_extension_dtype(dtype_):
            if isinstance(dtype_, type):
                try:
                    # Convert to str here because some pandas dtypes allow
                    # an empty constructor for compatibility but fail on
                    # str(). e.g: PeriodDtype
                    str(dtype_().name)
                    return dtype_()
                except (TypeError, AttributeError) as err:
                    raise TypeError(
                        f"Pandas dtype {dtype_} cannot be instantiated: "
                        f"{err}\n Usage Tip: Use an instance or a string "
                        "representation."
                    ) from err
            return dtype_

        if dtype_ in NUMPY_TYPES:
            dtype_ = cls.from_numpy_type(dtype_)  # type: ignore
        elif isinstance(dtype_, str):
            dtype_ = cls.from_str_alias(dtype_)
        elif isinstance(dtype_, type):
            dtype_ = cls.from_python_type(dtype_)

        if isinstance(dtype_, PandasDtype):
            return dtype_
        raise TypeError(
            "type of `pandas_dtype` argument not recognized: "
            f"{type(pandas_dtype_arg)}. Please specify a pandera PandasDtype "
            "enum, built-in python type, pandas data type, pandas data type "
            "string alias, or numpy data type string alias"
        )

    @classmethod
    def get_str_dtype(cls, pandas_dtype_arg) -> Optional[str]:
        """Get pandas-compatible string representation of dtype.

        :param pandas_dtype_arg: any value accepted by :meth:`get_dtype`.
        :returns: ``None`` for a ``None`` argument, the ``str_alias`` of the
            resolved ``PandasDtype``, or ``str()`` of an extension dtype.
        """
        pandas_dtype = cls.get_dtype(pandas_dtype_arg)
        if pandas_dtype is None:
            return pandas_dtype
        elif isinstance(pandas_dtype, PandasDtype):
            return pandas_dtype.str_alias
        return str(pandas_dtype)

    def __eq__(self, other):
        """Compare against anything ``get_dtype`` can resolve.

        ``None`` never compares equal. Any other value is resolved with
        :meth:`get_dtype`, which raises ``TypeError`` for unrecognized input.
        """
        # pylint: disable=comparison-with-callable
        if other is None:
            return False
        other_dtype = PandasDtype.get_dtype(other)
        if not isinstance(other_dtype, PandasDtype):
            # ``get_dtype`` returned a pandas extension dtype instance, which
            # has neither ``value`` nor ``str_alias``. Compare its string
            # form, mirroring what ``get_str_dtype`` reports for it.
            return self.str_alias == str(other_dtype)
        if self.value == "string" and LEGACY_PANDAS:
            return PandasDtype.String.value == other_dtype.value
        elif self.value == "string":
            return self.value == other_dtype.value
        return self.str_alias == other_dtype.str_alias

    def __hash__(self):
        # NOTE(review): the hash is the ``id`` of the alias string object, so
        # two members that compare equal through ``__eq__`` are not guaranteed
        # to share a hash. Left unchanged because containers keyed on members
        # elsewhere may rely on the current behavior -- confirm before
        # switching to ``hash(hash_obj)``.
        if self is PandasDtype.Int:
            hash_obj = _DEFAULT_PANDAS_INT_TYPE
        elif self is PandasDtype.Float:
            hash_obj = _DEFAULT_PANDAS_FLOAT_TYPE
        else:
            hash_obj = self.str_alias
        return id(hash_obj)

    @property
    def numpy_dtype(self):
        """Get numpy data type.

        :raises TypeError: for ``Category``, which has no numpy equivalent.
        """
        if self is PandasDtype.Category:
            raise TypeError(
                "the pandas Categorical data type doesn't have a numpy "
                "equivalent."
            )

        # pylint: disable=comparison-with-callable
        if self.value in {"str", "string"}:
            dtype = np.dtype("str")
        else:
            # lower-casing maps the nullable aliases ("Int64", "UInt8", ...)
            # onto the numpy names
            dtype = np.dtype(self.str_alias.lower())
        return dtype

    @property
    def is_int(self) -> bool:
        """Return True if PandasDtype is an integer."""
        return self.value.lower().startswith("int")

    @property
    def is_nullable_int(self) -> bool:
        """Return True if PandasDtype is a nullable integer."""
        return self.value.startswith("Int")

    @property
    def is_nonnullable_int(self) -> bool:
        """Return True if PandasDtype is a non-nullable integer."""
        return self.value.startswith("int")

    @property
    def is_uint(self) -> bool:
        """Return True if PandasDtype is an unsigned integer."""
        return self.value.lower().startswith("uint")

    @property
    def is_nullable_uint(self) -> bool:
        """Return True if PandasDtype is a nullable unsigned integer."""
        return self.value.startswith("UInt")

    @property
    def is_nonnullable_uint(self) -> bool:
        """Return True if PandasDtype is a non-nullable unsigned integer."""
        return self.value.startswith("uint")

    @property
    def is_float(self) -> bool:
        """Return True if PandasDtype is a float."""
        return self.value.startswith("float")

    @property
    def is_complex(self) -> bool:
        """Return True if PandasDtype is a complex number."""
        return self.value.startswith("complex")

    @property
    def is_bool(self) -> bool:
        """Return True if PandasDtype is a boolean."""
        return self is PandasDtype.Bool

    @property
    def is_string(self) -> bool:
        """Return True if PandasDtype is a string."""
        # NOTE(review): list membership falls back to ``__eq__``, so a member
        # whose ``str_alias`` matches ``String``'s also passes this check --
        # confirm that is intended.
        return self in [PandasDtype.String, PandasDtype.STRING]

    @property
    def is_category(self) -> bool:
        """Return True if PandasDtype is a category."""
        return self is PandasDtype.Category

    @property
    def is_datetime(self) -> bool:
        """Return True if PandasDtype is a datetime."""
        return self is PandasDtype.DateTime

    @property
    def is_timedelta(self) -> bool:
        """Return True if PandasDtype is a timedelta."""
        return self is PandasDtype.Timedelta

    @property
    def is_object(self) -> bool:
        """Return True if PandasDtype is an object."""
        return self is PandasDtype.Object

    @property
    def is_continuous(self) -> bool:
        """Return True if PandasDtype is a continuous datatype."""
        return (
            self.is_int
            or self.is_uint
            or self.is_float
            or self.is_complex
            or self.is_datetime
            or self.is_timedelta
        )