Dropping Invalid Rows

New in version 0.16.0

If you wish to use the validation step to remove invalid data, you can pass the drop_invalid_rows=True argument to the schema object on creation. On schema.validate(), if a data-level check fails, the row that caused the failure is removed from the returned dataframe.

drop_invalid_rows will prevent data-level schema errors from being raised and will instead remove the rows which cause the failure.

This functionality is available on DataFrameSchema, SeriesSchema, Column, as well as DataFrameModel schemas.

Note that this functionality works by identifying the index or multi-index of the failing rows. If the index is not unique on the dataframe, this could result in incorrect rows being dropped.

Dropping invalid rows with DataFrameSchema:

import pandas as pd
import pandera as pa

from pandera import Check, Column, DataFrameSchema

# The data already has the declared ``int`` dtype, so the only possible
# failures are data-level ones: rows 0 and 1 fail the ``>= 3`` check.
# (String data without coercion would fail the dtype check instead, which is
# a schema-level error and cannot be resolved by dropping rows.)
df = pd.DataFrame({"counter": [1, 2, 3]})
schema = DataFrameSchema(
    {"counter": Column(int, checks=[Check(lambda x: x >= 3)])},
    drop_invalid_rows=True,
)

# lazy=True is required so that all failing rows are collected and dropped
# together; the returned dataframe keeps only the row at index 2 (counter 3).
schema.validate(df, lazy=True)

Dropping invalid rows with SeriesSchema:

import pandas as pd
import pandera as pa

from pandera import Check, SeriesSchema

# The data already has the declared ``int`` dtype, so only the data-level
# ``>= 3`` check can fail; the failing entries (index 0 and 1) are dropped.
series = pd.Series([1, 2, 3])
schema = SeriesSchema(
    int,
    checks=[Check(lambda x: x >= 3)],
    drop_invalid_rows=True,
)

# lazy=True is required for drop_invalid_rows to take effect.
schema.validate(series, lazy=True)

Dropping invalid rows with Column:

import pandas as pd
import pandera as pa

from pandera import Check, Column

# The data already has the declared ``int`` dtype, so only the data-level
# ``>= 3`` check can fail; rows 0 and 1 are the ones that get dropped.
df = pd.DataFrame({"counter": [1, 2, 3]})
schema = Column(
    int,
    name="counter",
    drop_invalid_rows=True,
    checks=[Check(lambda x: x >= 3)],
)

# lazy=True is required for drop_invalid_rows to take effect.
schema.validate(df, lazy=True)

Dropping invalid rows with DataFrameModel:

import pandas as pd
import pandera as pa

from pandera import Check, DataFrameModel, Field

# Class-based schema: ``counter`` is an int constrained to the range 3 to 5.
class MySchema(DataFrameModel):
    counter: int = Field(in_range={"min_value": 3, "max_value": 5})

    class Config:
        # Remove failing rows instead of raising data-level schema errors.
        drop_invalid_rows = True


# 1, 2 and 6 fall outside the allowed range, so those rows are dropped.
df = pd.DataFrame({"counter": [1, 2, 3, 4, 5, 6]})
MySchema.validate(df, lazy=True)

Note

In order to use drop_invalid_rows=True, lazy=True must be passed to schema.validate(). Lazy Validation enables all schema errors to be collected and raised together, meaning all invalid rows can be dropped together. This provides a clear API for ensuring the validated dataframe contains only valid data.