Data Validation with PySpark Pandas

New in version 0.10.0

PySpark is a distributed compute framework that offers a pandas drop-in replacement dataframe implementation via the pyspark.pandas API. You can use pandera to validate pyspark.pandas DataFrame() and Series() objects directly. First, install pandera with the pyspark extra:

pip install 'pandera[pyspark]'
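
If you manage environments with conda, the same functionality should be available through conda-forge (assuming the pandera-pyspark package name used for pandera's optional extras):

conda install -c conda-forge pandera-pyspark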

Then you can use pandera schemas to validate pyspark.pandas dataframes. In the example below, we'll use the class-based API to define a DataFrameModel for validation.

import pyspark.pandas as ps
import pandas as pd
import pandera as pa

from pandera.typing.pyspark import DataFrame, Series


class Schema(pa.DataFrameModel):
    state: Series[str]
    city: Series[str]
    price: Series[int] = pa.Field(in_range={"min_value": 5, "max_value": 20})


# create a pyspark.pandas dataframe that's validated on object initialization
df = DataFrame[Schema](
    {
        'state': ['FL','FL','FL','CA','CA','CA'],
        'city': [
            'Orlando',
            'Miami',
            'Tampa',
            'San Francisco',
            'Los Angeles',
            'San Diego',
        ],
        'price': [8, 12, 10, 16, 20, 18],
    }
)
print(df)
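
If the data doesn't conform to Schema, pandera raises a SchemaError at initialization. Below is a minimal sketch, reusing the Schema and imports defined above, with hypothetical invalid data (a price outside the allowed range) to illustrate this:

invalid = {
    'state': ['FL'],
    'city': ['Orlando'],
    'price': [42],  # hypothetical value outside the allowed [5, 20] range
}

try:
    DataFrame[Schema](invalid)
except pa.errors.SchemaError as exc:
    # the error message reports the failed in_range check on 'price'
    print(exc)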

You can also use the check_types() decorator to validate pyspark.pandas dataframes at runtime:

@pa.check_types
def function(df: DataFrame[Schema]) -> DataFrame[Schema]:
    return df[df["state"] == "CA"]

print(function(df))
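
check_types() validates both the input and the output annotations. As a sketch, a function whose return value violates the output schema (here by dropping a required column, a hypothetical example) raises a SchemaError when called:

@pa.check_types
def drop_price(df: DataFrame[Schema]) -> DataFrame[Schema]:
    # dropping a required column violates the output schema
    return df.drop(columns=["price"])

try:
    drop_price(df)
except pa.errors.SchemaError as exc:
    print(exc)  # reports the missing 'price' column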

And of course, you can use the object-based API to validate pyspark.pandas dataframes:

schema = pa.DataFrameSchema({
    "state": pa.Column(str),
    "city": pa.Column(str),
    "price": pa.Column(int, pa.Check.in_range(min_value=5, max_value=20))
})
schema(df)
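
As with pandas dataframes, you can pass lazy=True to collect all failure cases instead of raising on the first error, assuming the lazy validation API behaves the same way on pyspark.pandas dataframes. A sketch with hypothetical out-of-range prices:

try:
    # hypothetical invalid data: every price is outside the [5, 20] range
    schema.validate(df.assign(price=100), lazy=True)
except pa.errors.SchemaErrors as exc:
    print(exc.failure_cases)  # all collected failure cases in one report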