Commit

check schema-level information from both pyspark df and pandera schema before applying nullable check

Signed-off-by: Filipe Oliveira <[email protected]>
filipeo2-mck committed Nov 7, 2023
1 parent 38e0b1b commit 4f6ce43
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions pandera/backends/pyspark/column.py
@@ -125,11 +125,13 @@ def coerce_dtype(

     @validate_scope(scope=ValidationScope.SCHEMA)
     def check_nullable(self, check_obj: DataFrame, schema):
-        # If True, ignore this `nullable` check
-        passed = schema.nullable
+        passed = True

-        # If False, execute the costly validation
-        if not schema.nullable:
+        # Use schema level information to optimize execution of the `nullable` check:
+        # ignore this check if Pandera Field's `nullable` property is True
+        # (check not necessary) or if df column's `nullable` property is False
+        # (PySpark's nullable ensures the presence of values when creating the df)
+        if (not schema.nullable) and (check_obj.schema[schema.name].nullable):
             passed = (
                 check_obj.filter(col(schema.name).isNull()).limit(1).count()
                 == 0
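
For context, a minimal standalone sketch of the short-circuit this commit introduces, assuming a local SparkSession and a hypothetical non-nullable "name" column (the DataFrame and the `field_nullable` stand-in are illustrative, not part of the patch):

from pyspark.sql import SparkSession, types as T
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# Hypothetical DataFrame whose StructField is declared non-nullable.
df = spark.createDataFrame(
    [("a",), ("b",)],
    schema=T.StructType([T.StructField("name", T.StringType(), nullable=False)]),
)

field_nullable = False  # stand-in for the pandera Field's `nullable` property

# Same condition as the patch: scan only when pandera forbids nulls
# AND PySpark's own schema still allows them.
if (not field_nullable) and df.schema["name"].nullable:
    # `limit(1)` lets Spark stop after the first null row it finds.
    passed = df.filter(col("name").isNull()).limit(1).count() == 0
else:
    passed = True  # check unnecessary, or PySpark already guarantees non-null

print(passed)  # True here: the non-nullable StructField avoids the costly scan

Because the column is declared non-nullable on the PySpark side, the condition is False and no Spark job is submitted at all, which is exactly the saving the commit is after.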
