diff --git a/pandera/backends/pyspark/column.py b/pandera/backends/pyspark/column.py index 9c8b88452..52c3081c1 100644 --- a/pandera/backends/pyspark/column.py +++ b/pandera/backends/pyspark/column.py @@ -125,11 +125,13 @@ def coerce_dtype( @validate_scope(scope=ValidationScope.SCHEMA) def check_nullable(self, check_obj: DataFrame, schema): - # If True, ignore this `nullable` check - passed = schema.nullable + passed = True - # If False, execute the costly validation - if not schema.nullable: + # Use schema level information to optimize execution of the `nullable` check: + # ignore this check if Pandera Field's `nullable` property is True + # (check not necessary) or if df column's `nullable` property is False + # (PySpark's nullable ensures the presence of values when creating the df) + if (not schema.nullable) and (check_obj.schema[schema.name].nullable): passed = ( check_obj.filter(col(schema.name).isNull()).limit(1).count() == 0