diff --git a/ci/requirements-py3.10-pandas1.5.3-pydantic1.10.11.txt b/ci/requirements-py3.10-pandas1.5.3-pydantic1.10.11.txt
index 2cf690f14..581891c4d 100644
--- a/ci/requirements-py3.10-pandas1.5.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.10-pandas1.5.3-pydantic1.10.11.txt
@@ -119,7 +119,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.10-pandas1.5.3-pydantic2.3.0.txt b/ci/requirements-py3.10-pandas1.5.3-pydantic2.3.0.txt
index 21d6448b0..3e49c1930 100644
--- a/ci/requirements-py3.10-pandas1.5.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.10-pandas1.5.3-pydantic2.3.0.txt
@@ -120,7 +120,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.10-pandas2.0.3-pydantic1.10.11.txt b/ci/requirements-py3.10-pandas2.0.3-pydantic1.10.11.txt
index b1ff9ffac..7ed737a55 100644
--- a/ci/requirements-py3.10-pandas2.0.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.10-pandas2.0.3-pydantic1.10.11.txt
@@ -119,7 +119,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.10-pandas2.0.3-pydantic2.3.0.txt b/ci/requirements-py3.10-pandas2.0.3-pydantic2.3.0.txt
index 4ed29d8a4..d9e947f4f 100644
--- a/ci/requirements-py3.10-pandas2.0.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.10-pandas2.0.3-pydantic2.3.0.txt
@@ -120,7 +120,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.11-pandas1.5.3-pydantic1.10.11.txt b/ci/requirements-py3.11-pandas1.5.3-pydantic1.10.11.txt
index b36141458..7abd06f26 100644
--- a/ci/requirements-py3.11-pandas1.5.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.11-pandas1.5.3-pydantic1.10.11.txt
@@ -118,7 +118,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.11-pandas1.5.3-pydantic2.3.0.txt b/ci/requirements-py3.11-pandas1.5.3-pydantic2.3.0.txt
index ce8a6f5e1..c0256e535 100644
--- a/ci/requirements-py3.11-pandas1.5.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.11-pandas1.5.3-pydantic2.3.0.txt
@@ -119,7 +119,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.11-pandas2.0.3-pydantic1.10.11.txt b/ci/requirements-py3.11-pandas2.0.3-pydantic1.10.11.txt
index ff109c080..77df18438 100644
--- a/ci/requirements-py3.11-pandas2.0.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.11-pandas2.0.3-pydantic1.10.11.txt
@@ -118,7 +118,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.11-pandas2.0.3-pydantic2.3.0.txt b/ci/requirements-py3.11-pandas2.0.3-pydantic2.3.0.txt
index ea40a93c6..46d7450c3 100644
--- a/ci/requirements-py3.11-pandas2.0.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.11-pandas2.0.3-pydantic2.3.0.txt
@@ -119,7 +119,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.8-pandas1.5.3-pydantic1.10.11.txt b/ci/requirements-py3.8-pandas1.5.3-pydantic1.10.11.txt
index e3f9f3ad5..d52e166ec 100644
--- a/ci/requirements-py3.8-pandas1.5.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.8-pandas1.5.3-pydantic1.10.11.txt
@@ -121,7 +121,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.8-pandas1.5.3-pydantic2.3.0.txt b/ci/requirements-py3.8-pandas1.5.3-pydantic2.3.0.txt
index 9328c864c..34aff3736 100644
--- a/ci/requirements-py3.8-pandas1.5.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.8-pandas1.5.3-pydantic2.3.0.txt
@@ -122,7 +122,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.8-pandas2.0.3-pydantic1.10.11.txt b/ci/requirements-py3.8-pandas2.0.3-pydantic1.10.11.txt
index 7ebe088f6..21111909f 100644
--- a/ci/requirements-py3.8-pandas2.0.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.8-pandas2.0.3-pydantic1.10.11.txt
@@ -121,7 +121,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.8-pandas2.0.3-pydantic2.3.0.txt b/ci/requirements-py3.8-pandas2.0.3-pydantic2.3.0.txt
index c6d5d4aa6..46c3934da 100644
--- a/ci/requirements-py3.8-pandas2.0.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.8-pandas2.0.3-pydantic2.3.0.txt
@@ -122,7 +122,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.9-pandas1.5.3-pydantic1.10.11.txt b/ci/requirements-py3.9-pandas1.5.3-pydantic1.10.11.txt
index 2f7439cbf..d68b2e3b6 100644
--- a/ci/requirements-py3.9-pandas1.5.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.9-pandas1.5.3-pydantic1.10.11.txt
@@ -119,7 +119,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.9-pandas1.5.3-pydantic2.3.0.txt b/ci/requirements-py3.9-pandas1.5.3-pydantic2.3.0.txt
index 2c1398f1c..46a88818f 100644
--- a/ci/requirements-py3.9-pandas1.5.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.9-pandas1.5.3-pydantic2.3.0.txt
@@ -120,7 +120,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/ci/requirements-py3.9-pandas2.0.3-pydantic1.10.11.txt b/ci/requirements-py3.9-pandas2.0.3-pydantic1.10.11.txt
index e1194fdab..2eea783df 100644
--- a/ci/requirements-py3.9-pandas2.0.3-pydantic1.10.11.txt
+++ b/ci/requirements-py3.9-pandas2.0.3-pydantic1.10.11.txt
@@ -119,7 +119,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==1.10.11 # via -r requirements.in, fastapi
 pygments==2.16.1 # via furo, nbconvert, readme-renderer, rich, sphinx
diff --git a/ci/requirements-py3.9-pandas2.0.3-pydantic2.3.0.txt b/ci/requirements-py3.9-pandas2.0.3-pydantic2.3.0.txt
index 230324e2b..809904486 100644
--- a/ci/requirements-py3.9-pandas2.0.3-pydantic2.3.0.txt
+++ b/ci/requirements-py3.9-pandas2.0.3-pydantic2.3.0.txt
@@ -120,7 +120,7 @@ protobuf==4.24.3 # via -r requirements.in, ray
 psutil==5.9.5 # via distributed, modin
 ptyprocess==0.7.0 # via terminado
 py4j==0.10.9.7 # via pyspark
-pyarrow==13.0.0 # via -r requirements.in
+pyarrow==14.0.1 # via -r requirements.in
 pycparser==2.21 # via cffi
 pydantic==2.3.0 # via -r requirements.in, fastapi
 pydantic-core==2.6.3 # via pydantic
diff --git a/dev/requirements-3.10.txt b/dev/requirements-3.10.txt
index b595c6e2d..19c869a5a 100644
--- a/dev/requirements-3.10.txt
+++ b/dev/requirements-3.10.txt
@@ -358,7 +358,7 @@ ptyprocess==0.7.0
     # via terminado
 py4j==0.10.9.7
     # via pyspark
-pyarrow==13.0.0
+pyarrow==14.0.1
     # via -r requirements.in
 pycparser==2.21
     # via cffi
diff --git a/dev/requirements-3.11.txt b/dev/requirements-3.11.txt
index 17ba4f8c1..f24bbebdd 100644
--- a/dev/requirements-3.11.txt
+++ b/dev/requirements-3.11.txt
@@ -353,7 +353,7 @@ ptyprocess==0.7.0
     # via terminado
 py4j==0.10.9.7
     # via pyspark
-pyarrow==13.0.0
+pyarrow==14.0.1
     # via -r requirements.in
 pycparser==2.21
     # via cffi
diff --git a/dev/requirements-3.8.txt b/dev/requirements-3.8.txt
index 4209d5c52..56862fb90 100644
--- a/dev/requirements-3.8.txt
+++ b/dev/requirements-3.8.txt
@@ -371,7 +371,7 @@ ptyprocess==0.7.0
     # via terminado
 py4j==0.10.9.7
     # via pyspark
-pyarrow==13.0.0
+pyarrow==14.0.1
     # via -r requirements.in
 pycparser==2.21
     # via cffi
diff --git a/dev/requirements-3.9.txt b/dev/requirements-3.9.txt
index 2860c9bb7..ce90b4d3d 100644
--- a/dev/requirements-3.9.txt
+++ b/dev/requirements-3.9.txt
@@ -365,7 +365,7 @@ ptyprocess==0.7.0
     # via terminado
 py4j==0.10.9.7
     # via pyspark
-pyarrow==13.0.0
+pyarrow==14.0.1
     # via -r requirements.in
 pycparser==2.21
     # via cffi
diff --git a/pandera/backends/pyspark/column.py b/pandera/backends/pyspark/column.py
index 7c0ac168e..52c3081c1 100644
--- a/pandera/backends/pyspark/column.py
+++ b/pandera/backends/pyspark/column.py
@@ -125,10 +125,18 @@ def coerce_dtype(

     @validate_scope(scope=ValidationScope.SCHEMA)
     def check_nullable(self, check_obj: DataFrame, schema):
-        isna = (
-            check_obj.filter(col(schema.name).isNull()).limit(1).count() == 0
-        )
-        passed = schema.nullable or isna
+        passed = True
+
+        # Use schema level information to optimize execution of the `nullable` check:
+        # ignore this check if Pandera Field's `nullable` property is True
+        # (check not necessary) or if df column's `nullable` property is False
+        # (PySpark's nullable ensures the presence of values when creating the df)
+        if (not schema.nullable) and (check_obj.schema[schema.name].nullable):
+            passed = (
+                check_obj.filter(col(schema.name).isNull()).limit(1).count()
+                == 0
+            )
+
         return CoreCheckResult(
             check="not_nullable",
             reason_code=SchemaErrorReason.SERIES_CONTAINS_NULLS,
diff --git a/requirements-docs.txt b/requirements-docs.txt
index ebe7cf9c8..7b30b4d34 100644
--- a/requirements-docs.txt
+++ b/requirements-docs.txt
@@ -53,7 +53,9 @@ certifi==2023.7.22
     #   pyproj
     #   requests
 cffi==1.15.1
-    # via argon2-cffi-bindings
+    # via
+    #   argon2-cffi-bindings
+    #   cryptography
 cfgv==3.4.0
     # via pre-commit
 chardet==5.2.0
@@ -87,7 +89,11 @@ colorlog==6.7.0
 commonmark==0.9.1
     # via recommonmark
 coverage[toml]==7.3.1
-    # via pytest-cov
+    # via
+    #   coverage
+    #   pytest-cov
+cryptography==41.0.5
+    # via secretstorage
 dask==2023.9.2
     # via
     #   -r requirements.in
@@ -174,6 +180,10 @@ isort==5.12.0
     #   pylint
 jaraco-classes==3.3.0
     # via keyring
+jeepney==0.8.0
+    # via
+    #   keyring
+    #   secretstorage
 jinja2==3.1.2
     # via
     #   distributed
@@ -358,7 +368,7 @@ ptyprocess==0.7.0
     # via terminado
 py4j==0.10.9.7
     # via pyspark
-pyarrow==13.0.0
+pyarrow==14.0.1
     # via -r requirements.in
 pycparser==2.21
     # via cffi
@@ -468,6 +478,8 @@ rpds-py==0.10.3
     #   referencing
 scipy==1.11.2
     # via -r requirements.in
+secretstorage==3.3.3
+    # via keyring
 send2trash==1.8.2
     # via jupyter-server
 shapely==2.0.1
@@ -578,7 +590,9 @@ twine==4.0.2
 typeguard==4.1.5
     # via -r requirements.in
 typer[all]==0.9.0
-    # via frictionless
+    # via
+    #   frictionless
+    #   typer
 types-click==7.1.8
     # via -r requirements.in
 types-pkg-resources==0.1.3
diff --git a/tests/pyspark/test_pyspark_container.py b/tests/pyspark/test_pyspark_container.py
index 6d9f65d89..87243f8d4 100644
--- a/tests/pyspark/test_pyspark_container.py
+++ b/tests/pyspark/test_pyspark_container.py
@@ -1,5 +1,6 @@
 """Unit tests for pyspark container."""

+from contextlib import nullcontext as does_not_raise
 from pyspark.sql import DataFrame, SparkSession
 import pyspark.sql.types as T
 import pytest
@@ -142,7 +143,7 @@ def test_pyspark_sample():
         ("Butter", 15),
         ("Ice Cream", 10),
         ("Cola", 12),
-        ("Choclate", 7),
+        ("Chocolate", 7),
     ]

     spark_schema = T.StructType(
@@ -185,3 +186,48 @@ def test_pyspark_regex_column():
     df_out = schema.validate(df2)

     assert not df_out.pandera.errors
+
+
+def test_pyspark_nullable():
+    """
+    Test the nullable functionality of pyspark
+    """
+
+    data = [
+        ("Bread", 9),
+        ("Butter", 15),
+        ("Ice Cream", None),
+        ("Cola", 12),
+        ("Chocolate", None),
+    ]
+    spark_schema = T.StructType(
+        [
+            T.StructField("product", T.StringType(), False),
+            T.StructField("price", T.IntegerType(), True),
+        ],
+    )
+    df = spark.createDataFrame(data=data, schema=spark_schema)
+
+    # Check for `nullable=False`
+    schema_nullable_false = DataFrameSchema(
+        columns={
+            "product": Column("str"),
+            "price": Column("int", nullable=False),
+        },
+    )
+    with does_not_raise():
+        df_out = schema_nullable_false.validate(df)
+        assert isinstance(df_out, DataFrame)
+        assert "SERIES_CONTAINS_NULLS" in str(dict(df_out.pandera.errors))
+
+    # Check for `nullable=True`
+    schema_nullable_true = DataFrameSchema(
+        columns={
+            "product": Column("str"),
+            "price": Column("int", nullable=True),
+        },
+    )
+    with does_not_raise():
+        df_out = schema_nullable_true.validate(df)
+        assert isinstance(df_out, DataFrame)
+        assert df_out.pandera.errors == {}
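
The `check_nullable` change in `pandera/backends/pyspark/column.py` short-circuits the null scan using schema-level metadata: the `isNull()` count only runs when the pandera column forbids nulls and the Spark `StructField` for that column is itself nullable; in every other case the check passes without touching the data. Below is a minimal standalone sketch of that decision logic, assuming a local SparkSession; the helper names `needs_null_scan` and `column_has_nulls` are illustrative and not part of pandera's API.

```python
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col
import pyspark.sql.types as T

spark = SparkSession.builder.master("local[1]").appName("nullable-sketch").getOrCreate()

spark_schema = T.StructType(
    [
        T.StructField("product", T.StringType(), False),  # non-nullable in Spark
        T.StructField("price", T.IntegerType(), True),  # nullable in Spark
    ]
)
df = spark.createDataFrame([("Bread", 9), ("Ice Cream", None)], schema=spark_schema)


def needs_null_scan(df: DataFrame, column: str, pandera_nullable: bool) -> bool:
    """Scan only when pandera forbids nulls and Spark cannot already rule them out."""
    return (not pandera_nullable) and df.schema[column].nullable


def column_has_nulls(df: DataFrame, column: str) -> bool:
    # limit(1) keeps the scan cheap: a single matching row is enough to fail.
    return df.filter(col(column).isNull()).limit(1).count() > 0


for name, pandera_nullable in [("product", False), ("price", False), ("price", True)]:
    if needs_null_scan(df, name, pandera_nullable):
        passed = not column_has_nulls(df, name)
    else:
        passed = True  # skipped: nulls are allowed, or Spark already forbids them
    print(name, pandera_nullable, passed)
```

For the data above only the middle case (`price` declared non-nullable in pandera but nullable in Spark) pays for a scan, which is the branch the new `test_pyspark_nullable` test exercises end to end.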
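The skip for Spark-non-nullable columns leans on the premise in the added comment: a column Spark marks as non-nullable should not contain nulls in the first place. For locally created DataFrames that premise is enforced by `createDataFrame` itself (schema verification is on by default), as the hedged sketch below illustrates; data arriving from external sources is not always verified this way, so the comment is best read as an assumption about well-formed inputs rather than a hard guarantee.

```python
import pyspark.sql.types as T
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("nullability-premise").getOrCreate()

# `product` is declared non-nullable, and the second row violates that.
strict_schema = T.StructType([T.StructField("product", T.StringType(), False)])

try:
    spark.createDataFrame([("Bread",), (None,)], schema=strict_schema).collect()
except Exception as exc:  # schema verification rejects the None row
    print(f"rejected as expected: {exc}")
```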