[PySpark] Improve validation performance by enabling cache()/unpersist() toggles #1414

Merged
Binary file modified docs/.DS_Store
Binary file not shown.
40 changes: 40 additions & 0 deletions docs/source/pyspark_sql.rst
@@ -246,6 +246,46 @@ By default, validations are enabled and depth is set to ``SCHEMA_AND_DATA`` which
can be changed to ``SCHEMA_ONLY`` or ``DATA_ONLY`` as required by the use case.


Caching control
---------------

*new in 0.17.3*

Given Spark's architecture and Pandera's internal PySpark integration, which relies
on filtering conditions and *count* commands, the PySpark DataFrame being validated
by a Pandera schema may be reprocessed many times: each *count* command triggers a
new underlying *Spark action*. This processing overhead grows with the number of
*schema* and *data* checks added to the Pandera schema.

To avoid this reprocessing, Pandera allows you to cache the PySpark DataFrame before
validation starts, through two environment variables:

.. code-block:: bash

    export PANDERA_PYSPARK_CACHE=True       # Defaults to False: do not `cache()` by default
    export PANDERA_PYSPARK_UNPERSIST=False  # Defaults to True: `unpersist()` by default

The first variable controls whether the DataFrame's current state is cached in your
Spark session before validation starts. The second controls whether that cached state
is kept after validation ends.
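
The same toggles can also be set programmatically through Pandera's ``CONFIG`` object,
as done in the test suite. The sketch below is illustrative and assumes the
``pyspark_cache``/``pyspark_unpersist`` attributes introduced in ``pandera/config.py``;
exporting the environment variables before importing ``pandera`` remains the primary
mechanism:

.. code-block:: python

    import pyspark.sql.types as T

    from pandera.config import CONFIG
    from pandera.pyspark import Column, DataFrameSchema

    # Cache the DataFrame during validation and keep the cached state afterwards
    CONFIG.pyspark_cache = True
    CONFIG.pyspark_unpersist = False

    schema = DataFrameSchema({"product": Column(T.StringType())})
    validated_df = schema.validate(df)  # `df` is an existing PySpark DataFrame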

.. note::

    Whether to cache is a trade-off: if you have enough memory to keep the
    dataframe cached, validation will be faster, as the validation process
    reuses the cached state instead of recomputing the dataframe.

    Keeping the cached state after validation ends matters when the Pandera
    validation of a dataset is not an individual process but one step of a
    pipeline: if a pipeline, in a single Spark session, uses Pandera to evaluate
    all input dataframes before transforming them into a result that is written
    to disk, it may make sense not to throw away the cached state of those
    inputs. Their already-processed state will still be used after validation
    ends, and keeping it in memory may be beneficial, as in the sketch below.
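
As an illustration of that pipeline scenario (the dataframes, column names, and output
path below are made up), validating several inputs in one Spark session and keeping
their cached state for the downstream transformation might look like this:

.. code-block:: python

    import pyspark.sql.types as T

    from pandera.config import CONFIG
    from pandera.pyspark import Column, DataFrameSchema

    # Cache inputs for validation and keep the cached state for later pipeline steps
    CONFIG.pyspark_cache = True
    CONFIG.pyspark_unpersist = False

    orders_schema = DataFrameSchema({"order_id": Column(T.StringType())})
    items_schema = DataFrameSchema({"order_id": Column(T.StringType())})

    validated_orders = orders_schema.validate(orders_df)
    validated_items = items_schema.validate(items_df)

    # The cached inputs are reused here instead of being recomputed from source
    result = validated_orders.join(validated_items, on="order_id", how="inner")
    result.write.mode("overwrite").parquet("/tmp/pipeline_output")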


Registering Custom Checks
-------------------------

7 changes: 6 additions & 1 deletion pandera/backends/pyspark/container.py
@@ -11,7 +11,11 @@
from pandera.api.pyspark.error_handler import ErrorCategory, ErrorHandler
from pandera.api.pyspark.types import is_table
from pandera.backends.pyspark.base import ColumnInfo, PysparkSchemaBackend
from pandera.backends.pyspark.decorators import ValidationScope, validate_scope
from pandera.backends.pyspark.decorators import (
ValidationScope,
validate_scope,
cache_check_obj,
)
from pandera.backends.pyspark.error_formatters import scalar_failure_case
from pandera.config import CONFIG
from pandera.errors import (
@@ -102,6 +106,7 @@

return check_obj

@cache_check_obj()
def validate(
self,
check_obj: DataFrame,
56 changes: 56 additions & 0 deletions pandera/backends/pyspark/decorators.py
@@ -1,16 +1,21 @@
"""This module holds the decorators only valid for pyspark"""

import functools
import logging
import warnings
from contextlib import contextmanager
from enum import Enum
from typing import List, Type

import pyspark.sql

from pyspark.sql import DataFrame

from pandera.api.pyspark.types import PysparkDefaultTypes
from pandera.config import CONFIG, ValidationDepth
from pandera.errors import SchemaError

logger = logging.getLogger(__name__)


class ValidationScope(Enum):
"""Indicates whether a check/validator operates at a schema of data level."""
@@ -126,3 +131,54 @@
return wrapper

return _wrapper


def cache_check_obj():
"""This decorator evaluates if `check_obj` can be cached before validation.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noticing, this isn't true for the other dectorators in this file either but would it make sense to cleariy in the docstring that this is a decorator factory and should decorated with cache_check_obj?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something similar to

    """
    A decorator factory that creates a decorator to evaluate if `check_obj` can be cached before validation.

    As each new data check added to the Pandera schema by the user triggers 
    a new Spark action, Spark reprocesses the `check_obj` DataFrame multiple times.
    To prevent this waste of processing resources and to reduce validation times 
    in complex scenarios, the decorator created by this factory caches the 
    `check_obj` DataFrame before validation and unpersists it afterwards.

    The behavior of the resulting decorator depends on the `PANDERA_PYSPARK_CACHING` 
    environment variable.

    Usage:
        @cache_check_obj()
        def your_function(...):
            ...

    Note: This is not a direct decorator but a factory that returns a decorator.
    """

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I liked the new explanation (I'll make use of it hehe) but I'm not sure if explaining this common design pattern is valuable here. We would need to add this explanation to others decorators too, to keep the standard and we would end bloating the docstrings with repeated information.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed!


As a new Spark action is triggered for every new data check added to the Pandera
schema by the user, Spark keeps reprocessing the `check_obj` dataframe lots of
filipeo2-mck marked this conversation as resolved.
Show resolved Hide resolved
times. To avoid such waste of processing resources and to decrease validation
times for complex scenarios, this decorator makes the `check_obj` be cached
before validation and unpersisted after it occurs.

The execution of this process depends on the `PANDERA_PYSPARK_CACHING` environment
variable.
"""

    def _wrapper(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            # Skip if not enabled
            if CONFIG.pyspark_cache is not True:
                return func(self, *args, **kwargs)

            # Check if decorated function has a `check_obj` kwarg
            try:
                check_obj: DataFrame = kwargs["check_obj"]
            except KeyError as e:
                raise KeyError(
                    "Expected to find a `check_obj` kwarg in the decorated function "
                    f"{func.__name__}. Got {kwargs=}"
                ) from e

            @contextmanager
            def cached_check_obj():
                """Cache the dataframe and unpersist it after function execution."""
                logger.debug("Caching dataframe...")
                check_obj.cache()

                yield  # Execute the decorated function

                if CONFIG.pyspark_unpersist:
                    # If not cached, `.unpersist()` does nothing
                    logger.debug("Unpersisting dataframe...")
                    check_obj.unpersist()

            with cached_check_obj():
                return func(self, *args, **kwargs)

        return wrapper

    return _wrapper
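
For reference, a minimal usage sketch of the decorator factory (illustrative only; the
backend class below is made up). The decorated function must receive the DataFrame as a
`check_obj` keyword argument, otherwise the wrapper raises a `KeyError`:

    from pyspark.sql import DataFrame

    from pandera.backends.pyspark.decorators import cache_check_obj

    class ExampleBackend:
        @cache_check_obj()
        def validate(self, check_obj: DataFrame = None, **kwargs):
            # `check_obj` is cached here when PANDERA_PYSPARK_CACHE=True and
            # unpersisted afterwards unless PANDERA_PYSPARK_UNPERSIST=False.
            return check_obj

    # Must be called with `check_obj` passed by keyword:
    # ExampleBackend().validate(check_obj=some_dataframe)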
12 changes: 12 additions & 0 deletions pandera/config.py
@@ -20,10 +20,14 @@
This should pick up environment variables automatically, e.g.:
export PANDERA_VALIDATION_ENABLED=False
export PANDERA_VALIDATION_DEPTH=DATA_ONLY
export PANDERA_PYSPARK_CACHE=True
export PANDERA_PYSPARK_UNPERSIST=False
"""

validation_enabled: bool = True
validation_depth: ValidationDepth = ValidationDepth.SCHEMA_AND_DATA
pyspark_cache: bool = False
pyspark_unpersist: bool = True


# this config variable should be accessible globally
@@ -35,4 +39,12 @@
validation_depth=os.environ.get(
"PANDERA_VALIDATION_DEPTH", ValidationDepth.SCHEMA_AND_DATA
),
pyspark_cache=os.environ.get(
"PANDERA_PYSPARK_CACHE",
False,
),
pyspark_unpersist=os.environ.get(
"PANDERA_PYSPARK_UNPERSIST",
True,
),
)
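
# Note (illustrative, not part of this diff): these environment variables are read when
# `pandera.config` is imported, so they should be set before importing pandera for the
# toggles to take effect in a job, e.g.:
#
#     export PANDERA_PYSPARK_CACHE=True
#     export PANDERA_PYSPARK_UNPERSIST=False
#     python my_validation_job.py   # hypothetical entry point
#
# Once imported, the same toggles can still be changed at runtime through the global
# CONFIG object (CONFIG.pyspark_cache / CONFIG.pyspark_unpersist), as the tests below do.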
43 changes: 37 additions & 6 deletions tests/pyspark/test_pyspark_config.py
@@ -2,6 +2,7 @@
# pylint:disable=import-outside-toplevel,abstract-method

import pyspark.sql.types as T
import pytest

from pandera.config import CONFIG, ValidationDepth
from pandera.pyspark import (
@@ -24,7 +25,7 @@ def test_disable_validation(self, spark, sample_spark_schema):

CONFIG.validation_enabled = False

pandra_schema = DataFrameSchema(
pandera_schema = DataFrameSchema(
{
"product": Column(T.StringType(), Check.str_startswith("B")),
"price_val": Column(T.IntegerType()),
@@ -41,10 +42,12 @@ class TestSchema(DataFrameModel):
expected = {
"validation_enabled": False,
"validation_depth": ValidationDepth.SCHEMA_AND_DATA,
"pyspark_cache": False,
"pyspark_unpersist": True,
}

assert CONFIG.dict() == expected
assert pandra_schema.validate(input_df) is None
assert pandera_schema.validate(input_df) is None
assert TestSchema.validate(input_df) is None

# pylint:disable=too-many-locals
@@ -63,6 +66,8 @@ def test_schema_only(self, spark, sample_spark_schema):
expected = {
"validation_enabled": True,
"validation_depth": ValidationDepth.SCHEMA_ONLY,
"pyspark_cache": False,
"pyspark_unpersist": True,
}
assert CONFIG.dict() == expected

@@ -132,7 +137,7 @@ def test_data_only(self, spark, sample_spark_schema):
CONFIG.validation_enabled = True
CONFIG.validation_depth = ValidationDepth.DATA_ONLY

pandra_schema = DataFrameSchema(
pandera_schema = DataFrameSchema(
{
"product": Column(T.StringType(), Check.str_startswith("B")),
"price_val": Column(T.IntegerType()),
Expand All @@ -141,11 +146,13 @@ def test_data_only(self, spark, sample_spark_schema):
expected = {
"validation_enabled": True,
"validation_depth": ValidationDepth.DATA_ONLY,
"pyspark_cache": False,
"pyspark_unpersist": True,
}
assert CONFIG.dict() == expected

input_df = spark_df(spark, self.sample_data, sample_spark_schema)
output_dataframeschema_df = pandra_schema.validate(input_df)
output_dataframeschema_df = pandera_schema.validate(input_df)
expected_dataframeschema = {
"DATA": {
"DATAFRAME_CHECK": [
@@ -217,7 +224,7 @@ def test_schema_and_data(self, spark, sample_spark_schema):
CONFIG.validation_enabled = True
CONFIG.validation_depth = ValidationDepth.SCHEMA_AND_DATA

pandra_schema = DataFrameSchema(
pandera_schema = DataFrameSchema(
{
"product": Column(T.StringType(), Check.str_startswith("B")),
"price_val": Column(T.IntegerType()),
Expand All @@ -226,11 +233,13 @@ def test_schema_and_data(self, spark, sample_spark_schema):
expected = {
"validation_enabled": True,
"validation_depth": ValidationDepth.SCHEMA_AND_DATA,
"pyspark_cache": False,
"pyspark_unpersist": True,
}
assert CONFIG.dict() == expected

input_df = spark_df(spark, self.sample_data, sample_spark_schema)
output_dataframeschema_df = pandra_schema.validate(input_df)
output_dataframeschema_df = pandera_schema.validate(input_df)
expected_dataframeschema = {
"DATA": {
"DATAFRAME_CHECK": [
@@ -326,3 +335,25 @@ class TestSchema(DataFrameModel):
dict(output_dataframemodel_df.pandera.errors["SCHEMA"])
== expected_dataframemodel["SCHEMA"]
)

@pytest.mark.parametrize("cache_enabled", [True, False])
@pytest.mark.parametrize("unpersist_enabled", [True, False])
# pylint:disable=too-many-locals
def test_pyspark_cache_settings(
self,
cache_enabled,
unpersist_enabled,
):
"""This function validates that caching/unpersisting works as expected."""
# Set expected properties in Config object
CONFIG.pyspark_cache = cache_enabled
CONFIG.pyspark_unpersist = unpersist_enabled

# Evaluate expected Config
expected = {
"validation_enabled": True,
"validation_depth": ValidationDepth.SCHEMA_AND_DATA,
"pyspark_cache": cache_enabled,
"pyspark_unpersist": unpersist_enabled,
}
assert CONFIG.dict() == expected
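
# Possible extension (illustrative, not part of this diff): the actual cache state of a
# validated DataFrame could also be asserted through PySpark's `is_cached` flag, e.g.:
#
#     CONFIG.pyspark_cache = True
#     CONFIG.pyspark_unpersist = False
#     schema = DataFrameSchema({"product": Column(T.StringType())})
#     input_df = spark_df(spark, self.sample_data, sample_spark_schema)
#     schema.validate(input_df)
#     assert input_df.is_cached  # still cached because unpersisting is disabled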