diff --git a/pandera/engines/pandas_engine.py b/pandera/engines/pandas_engine.py index dff4a0859..4d3d63f5c 100644 --- a/pandera/engines/pandas_engine.py +++ b/pandera/engines/pandas_engine.py @@ -1071,7 +1071,7 @@ def from_parametrized_dtype(cls, pd_dtype: pd.IntervalDtype): from geopandas.array import GeometryArray, GeometryDtype, from_shapely import shapely import shapely.geometry - from pyproj import CRS, exceptions + import pyproj GeoPandasObject = Union[ pd.Series, pd.DataFrame, gpd.GeoSeries, gpd.GeoDataFrame @@ -1108,8 +1108,8 @@ def __init__( # pylint:disable=super-init-not-called ) -> None: if crs is not None: try: - CRS.from_user_input(crs) - except exceptions.CRSError as exc: + pyproj.CRS.from_user_input(crs) + except pyproj.exceptions.CRSError as exc: raise TypeError(f"Invalid CRS: {str(crs)}") from exc object.__setattr__(self, "crs", crs) @@ -1145,28 +1145,34 @@ def _coerce_values(self, obj: GeoPandasObject) -> GeoPandasObject: def _coerce_element(self, element: Any) -> Any: try: - if isinstance(element, dict): - coerced_element = shapely.geometry.shape(element) - else: - coerced_element = np.nan - except (TypeError, shapely.errors.GEOSException): - coerced_element = np.nan - return coerced_element + return shapely.geometry.shape(element) + except ( + AttributeError, + TypeError, + shapely.errors.GeometryTypeError, + shapely.errors.GEOSException, + ): + return np.nan def _coerce_crs(self, value: GeoPandasObject) -> GeoPandasObject: - try: - if self.crs is not None: - if value.crs is None: - # Allow assignment of CRS if currently - # null and a non-null value is designated. - # This will only work in the context of - # geopandas because assinging geometry - # CRS to a pandas dataframe isn't supported. - value.crs = self.crs - elif self.crs != value.crs: - value = value.to_crs(self.crs) # type: ignore[operator] - except AttributeError: - ... + if self.crs is not None: + if value.crs is None: + # Allow assignment of CRS if currently + # null and a non-null value is designated. + # This will only work in the context of + # geopandas because assinging geometry + # CRS to a pandas dataframe isn't supported. + value.crs = self.crs + elif ( + isinstance(value, gpd.GeoSeries) and self.crs != value.crs + ): + value = value.to_crs(self.crs) # type: ignore[operator] + elif isinstance(value, gpd.GeoDataFrame) and any( + self.crs != value[col].crs for col in value.columns + ): + for col in value.columns: + if self.crs != value[col].crs: + value[col] = value[col].to_crs(self.crs) return value def coerce(self, data_container: GeoPandasObject) -> GeoPandasObject: @@ -1176,11 +1182,15 @@ def coerce(self, data_container: GeoPandasObject) -> GeoPandasObject: orig_isna = data_container.isna() + # Copy so we don't directly modify container due + # to CRS re-projection, etc.) + data_container = data_container.copy() + # Coerce container data coerced_data = self._coerce_values(data_container) # Coerce container type - if isinstance(coerced_data, GeometryArray): + if isinstance(coerced_data, (GeometryArray, pd.DataFrame)): if isinstance(data_container, (pd.Series, gpd.GeoSeries)): coerced_data = gpd.GeoSeries(coerced_data) else: @@ -1197,12 +1207,8 @@ def coerce(self, data_container: GeoPandasObject) -> GeoPandasObject: failure_cases, ignore_na=False ), ) - return self._coerce_crs(coerced_data) - - def coerce_value(self, value: Any) -> Any: - """Coerce a value to a particular type.""" - coerced = self._coerce_values(value) - return self._coerce_crs(coerced) + coerced = self._coerce_crs(coerced_data) + return coerced def check( # type: ignore self, @@ -1216,10 +1222,10 @@ def check( # type: ignore return False else: return np.full_like(data_container, False, dtype=bool) - if data_container is None: - return True + if self.crs != pandera_dtype.crs and data_container is None: # type: ignore[attr-defined] + return False - # CRS check + # CRS check extends into container if self.crs is not None: if ( isinstance(data_container, gpd.GeoSeries) @@ -1241,12 +1247,6 @@ def check( # type: ignore return np.full_like(data_container, True, dtype=bool) - @classmethod - def from_parametrized_dtype(cls, g: GeometryArray): - """Convert a geometry to - a Pandera :class:`pandera.dtypes.pandas_engine.Geometry`.""" - return cls(crs=g.crs) # type: ignore - def __eq__(self, obj: object) -> bool: if isinstance(obj, type(self)): return obj.crs == self.crs diff --git a/pandera/typing/geopandas.py b/pandera/typing/geopandas.py index 18ba4a6c0..c937105f8 100644 --- a/pandera/typing/geopandas.py +++ b/pandera/typing/geopandas.py @@ -5,18 +5,13 @@ from typing import ( # type: ignore[attr-defined] TYPE_CHECKING, Any, - Dict, Generic, - List, - Tuple, - Type, TypeVar, Union, get_args, _type_check, ) -import numpy as np import pandas as pd from pandera.engines import PYDANTIC_V2 @@ -260,36 +255,3 @@ def _pydantic_validate(cls, obj: Any, field) -> gpd.GeoDataFrame: """ schema_model = cls._get_schema_model(field) return cls.pydantic_validate(obj, schema_model) - - @staticmethod - def from_records( # type: ignore - schema: Type[T], - data: Union[ # type: ignore - np.ndarray, - List[Tuple[Any, ...]], - Dict[Any, Any], - pd.DataFrame, - gpd.GeoDataFrame, - ], - **kwargs, - ) -> "GeoDataFrame[T]": - """ - Convert structured or record ndarray to pandera-validated GeoDataFrame. - - Creates a DataFrame object from a structured ndarray, sequence of tuples - or dicts, DataFrame, or GeoDataFrame. - - See :doc:`pandas:reference/api/pandas.DataFrame.from_records` for - more details. - """ - schema = schema.to_schema() # type: ignore[attr-defined] - schema_index = ( - schema.index.names if schema.index is not None else None - ) - if "index" not in kwargs: - kwargs["index"] = schema_index - data_df = pd.DataFrame.from_records(data=data, **kwargs) - return GeoDataFrame[schema]( # type: ignore - # set the column order according to schema - data_df[[c for c in schema.columns if c in data_df.columns]] - ) diff --git a/tests/geopandas/test_engine.py b/tests/geopandas/test_engine.py new file mode 100644 index 000000000..49f8da1bf --- /dev/null +++ b/tests/geopandas/test_engine.py @@ -0,0 +1,257 @@ +"""Unit tests for the geopandas engine dtype Geometry.""" + +import shapely +import numpy as np +import pandas as pd +import geopandas as gpd +import pytest +from shapely.geometry import Point + +import pandera as pa +from pandera.engines.pandas_engine import Geometry, DateTime + + +def test_engine_geometry_simple(): + """Test Geometry for basic attributes.""" + dtype = Geometry(crs=None) + assert dtype.crs is None + assert str(dtype) == "geometry" + + dtype = Geometry(crs="EPSG:4326") + assert dtype.crs == "EPSG:4326" + + +def test_engine_geometry_equality(): + """Test Geometry for equivalency to other Geometry.""" + dtype = Geometry(crs="EPSG:4326") + assert dtype == Geometry(crs="EPSG:4326") + assert dtype != Geometry(crs="EPSG:3857") + + with pytest.raises(TypeError): + Geometry(crs="this is definitely not a valid crs") + + +@pytest.mark.parametrize( + "pandera_dtype,data_container,invalid", + [ + [Geometry(crs="EPSG:4326"), None, False], + [Geometry(crs="EPSG:25832"), None, "fail"], + [DateTime, None, "fail"], + [ + Geometry(crs="EPSG:4326"), + gpd.GeoSeries([Point([0, 1e6])], crs="EPSG:4326"), + False, + ], + [ + Geometry(crs="EPSG:4326"), + gpd.GeoSeries([Point([0, 1e6])], crs="EPSG:25832"), + "exception", + ], + [ + Geometry(crs="EPSG:4326"), + gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:4326" + ), + "random": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:4326" + ), + } + ), + False, + ], + [ + Geometry(crs="EPSG:25832"), + gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + "random": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + } + ), + "exception", + ], + [ + Geometry(crs="EPSG:4326"), + gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:4326" + ), + "random": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + } + ), + "exception", + ], + [ + Geometry(crs="EPSG:25832"), + gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + "random": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:4326" + ), + } + ), + "exception", + ], + ], +) +def test_engine_geometry_check(pandera_dtype, data_container, invalid): + """Test Geometry for dtype match on data container.""" + + dtype = Geometry(crs="EPSG:4326") + + if invalid == "exception": + with pytest.raises(TypeError): + dtype.check(pandera_dtype, data_container) + return + if invalid == "fail": + assert not np.any(dtype.check(pandera_dtype, data_container)) + + +@pytest.mark.parametrize( + "data_container", + [ + gpd.GeoSeries([Point([0, 1e6])], crs="EPSG:25832"), + gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + "random": gpd.GeoSeries([Point([5e5, 1e6])], crs="EPSG:25832"), + } + ), + gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + "random": gpd.GeoSeries([Point([5e5, 1e6])], crs="EPSG:3857"), + } + ), + pd.DataFrame( + { + "geometry": gpd.GeoSeries( + [Point([5e5, 1e6])], crs="EPSG:25832" + ), + "random": gpd.GeoSeries([Point([5e5, 1e6])], crs="EPSG:25832"), + } + ), + ], +) +def test_engine_geometry_coerce_crs(data_container): + """Test Geometry coerce for GeoSeries CRS reprojection transform.""" + + dtype = Geometry(crs="EPSG:4326") + coerced = dtype.coerce(data_container) + assert not np.any( + shapely.equals_exact( + data_container.to_numpy(), + coerced.to_numpy(), + tolerance=1e-3, + ) + ) + assert np.all(coerced.crs == dtype.crs) + + +@pytest.mark.parametrize( + "data,dims,invalid", + [ + [ + [ + {"type": "Point", "coordinates": [139.86681009, 35.77565643]}, + { + "type": "LineString", + "coordinates": [ + [139.86681009, 35.77565643], + [139.86677824, 35.7756761], + ], + }, + { + "type": "LineString", + "coordinates": [ + [139.86677824, 35.7756761], + [139.86676329, 35.77568168], + ], + }, + ], + 2, + False, + ], + [["POINT (0 0)", "POINT (1 1)"], 2, False], + [[Point(0, 0), Point(1, 1)], 2, False], + [shapely.to_wkb(shapely.points([[0, 0], [1, 1]])), 2, False], + [shapely.points([[0, 0], [1, 1]]), 2, False], + [[1, 2], 2, True], + [ + [ + { + "type": "InvalidPoint!", + "coordinates": [139.86681009, 35.77565643], + }, + { + "type": "LineString", + "coordinates": [ + [139.86681009, 35.77565643], + [139.86677824, 35.7756761], + ], + }, + ], + 2, + True, + ], + [ + [ + { + "type": "Point", + "coordinates": [139.86681009, 35.77565643, 9.031], + }, + { + "type": "LineString", + "coordinates": [ + [139.86681009, 35.77565643, 9.031], + [139.86677824, 35.7756761, 9.037], + ], + }, + { + "type": "LineString", + "coordinates": [ + [139.86677824, 35.7756761, 9.037], + [139.86676329, 35.77568168, 9.041], + ], + }, + ], + 3, + False, + ], + [["POINT (0 0 0)", "POINT (1 1 1)"], 3, False], + [[Point(0, 0, 0), Point(1, 1, 1)], 3, False], + [shapely.to_wkb(shapely.points([[0, 0, 0], [1, 1, 1]])), 3, False], + [shapely.points([[0, 0, 0], [1, 1, 1]]), 3, False], + ], +) +def test_engine_geometry_coerce_data(data, dims: int, invalid: bool): + """Test Geometry input parsing.""" + series = pd.Series(data) + dtype = Geometry() + + if invalid: + with pytest.raises((pa.errors.SchemaError, pa.errors.ParserError)): + dtype.coerce(series) + return + + coerced = dtype.coerce(series) + assert isinstance(coerced, gpd.GeoSeries) + + check_2d = np.all(shapely.has_z(coerced)) and dims == 3 + check_3d = not np.any(shapely.has_z(coerced)) and dims == 2 + assert check_2d or check_3d diff --git a/tests/geopandas/test_geopandas.py b/tests/geopandas/test_geopandas.py index f5e40d7f1..868726e19 100644 --- a/tests/geopandas/test_geopandas.py +++ b/tests/geopandas/test_geopandas.py @@ -5,12 +5,10 @@ except ImportError: from typing_extensions import Annotated # type: ignore -import shapely import pandas as pd import geopandas as gpd import pytest from shapely.geometry import Polygon, Point -from pydantic import BaseModel import pandera as pa from pandera.typing import Series @@ -69,22 +67,6 @@ def geo_check(cls, geo_series: GeoSeries) -> Series[bool]: ) -def test_pydantic_model(): - """Test that GeoDataFrame type can be used in a Pydantic model""" - - class Schema(pa.DataFrameModel): - # pylint: disable=missing-class-docstring - geometry: GeoSeries - - class MyModel(BaseModel): - # pylint: disable=missing-class-docstring - data: GeoDataFrame[Schema] - - obj = MyModel(data=gpd.GeoDataFrame({"geometry": [Point(0, 0)]})) - - assert isinstance(obj.data, gpd.GeoDataFrame) - - @pytest.mark.parametrize( "gdf_args,invalid", [ @@ -180,64 +162,7 @@ class Config: assert gdf.crs == "EPSG:4326" -@pytest.mark.parametrize( - "gdf_args,invalid", - [ - [ - { - "geometry": [Polygon(((0, 0), (0, -2), (-2, -2), (-2, 0)))] - * 2, - "crs": None, - }, - False, - ], - [ - { - "geometry": [Polygon(((0, 0), (0, -2), (-2, -2), (-2, 0)))] - * 2, - "crs": "EPSG:4326", - }, - False, - ], - ], -) -def test_schema_dtype_without_crs(gdf_args, invalid: bool): - """Test Geometry without CRS.""" - # No CRS to validate - class Schema(pa.DataFrameModel): - # pylint: disable=missing-class-docstring - geometry: Geometry() # type: ignore - - # create a geodataframe that's validated on object initialization - if invalid: - with pytest.raises(TypeError): - GeoDataFrame[Schema](**gdf_args) - return - - assert isinstance(GeoDataFrame[Schema](**gdf_args), gpd.GeoDataFrame) - - -def test_schema_dtype_crs_transform(): - """Test Geometry CRS coerce for coordinate transform.""" - - class Schema(pa.DataFrameModel): - # pylint: disable=missing-class-docstring - geometry: Geometry(crs="EPSG:4326") - - class Config: - coerce = True - - gdf = GeoDataFrame[Schema]( - data={"geometry": [Point([0, 1e6])]}, crs="EPSG:25832" - ) - assert isinstance(gdf, gpd.GeoDataFrame) - assert gdf.crs == "EPSG:4326" - golden_gs = gpd.GeoSeries(data=[Point([4.4553, 9.0184])], crs="EPSG:4326") - golden_compare = gdf.geometry.geom_equals_exact(golden_gs, tolerance=1e-3) - assert golden_compare.all() - - -def test_schema_dtype_parametrized_crs(): +def test_schema_parametrized_crs(): """Test Geometry declaration using dtype_kwargs and Annotated.""" gdf = gpd.GeoDataFrame({"geometry": [Point([1, 1])]}, crs="EPSG:4326") @@ -255,13 +180,30 @@ class Schema2(pa.DataFrameModel): assert isinstance(GeoDataFrame[Schema2](gdf), gpd.GeoDataFrame) -def test_schema_dtype_invalid_crs(): - """Test Geometry for invalid CRS.""" - with pytest.raises(TypeError): - Geometry(crs="this is definitely not a valid crs") +def test_schema_multiple_geometry_same_crs(): + """Test GeoDataFrame with multiple GeoSeries columns on same CRS""" + class Schema(pa.DataFrameModel): + # pylint: disable=missing-class-docstring + geometry: Geometry(crs="EPSG:4326") + random: Geometry(crs="EPSG:4326") + + data = { + "geometry": gpd.GeoSeries( + [Point([1, 1])], name="geometry", crs="EPSG:4326" + ), + "random": gpd.GeoSeries( + [Point([2, 2])], name="random", crs="EPSG:4326" + ), + } + + # Both columns should have same CRS + gdf = GeoDataFrame[Schema](data) + pd.testing.assert_series_equal(gdf["geometry"], data["geometry"]) + pd.testing.assert_series_equal(gdf["random"], data["random"]) -def test_schema_dtype_multiple_crs(): + +def test_schema_multiple_geometry_different_crs(): """Test GeoDataFrame with multiple GeoSeries columns on different CRS""" class Schema(pa.DataFrameModel): @@ -330,7 +272,7 @@ class Config: ], ) def test_schema_from_dataframe(data, invalid: bool): - """Test that DataFrameModel works on gpd.GeoDataFrame input.""" + """Test that DataFrameModel works on gpd.GeoDataFrame or pd.DataFrame input.""" class Schema(pa.DataFrameModel): # pylint: disable=missing-class-docstring @@ -359,108 +301,3 @@ class Schema(pa.DataFrameModel): assert isinstance( GeoDataFrame[Schema]({"name": ["a", "b"]}), gpd.GeoDataFrame ) - - -@pytest.mark.parametrize( - "data,dims,invalid", - [ - [ - [ - {"type": "Point", "coordinates": [139.86681009, 35.77565643]}, - { - "type": "LineString", - "coordinates": [ - [139.86681009, 35.77565643], - [139.86677824, 35.7756761], - ], - }, - { - "type": "LineString", - "coordinates": [ - [139.86677824, 35.7756761], - [139.86676329, 35.77568168], - ], - }, - ], - 2, - False, - ], - [["POINT (0 0)", "POINT (1 1)"], 2, False], - [[Point(0, 0), Point(1, 1)], 2, False], - [shapely.to_wkb(shapely.points([[0, 0], [1, 1]])), 2, False], - [shapely.points([[0, 0], [1, 1]]), 2, False], - [[1, 2], 2, True], - [ - [ - { - "type": "InvalidPoint!", - "coordinates": [139.86681009, 35.77565643], - }, - { - "type": "LineString", - "coordinates": [ - [139.86681009, 35.77565643], - [139.86677824, 35.7756761], - ], - }, - ], - 2, - True, - ], - [ - [ - { - "type": "Point", - "coordinates": [139.86681009, 35.77565643, 9.031], - }, - { - "type": "LineString", - "coordinates": [ - [139.86681009, 35.77565643, 9.031], - [139.86677824, 35.7756761, 9.037], - ], - }, - { - "type": "LineString", - "coordinates": [ - [139.86677824, 35.7756761, 9.037], - [139.86676329, 35.77568168, 9.041], - ], - }, - ], - 3, - False, - ], - [["POINT (0 0 0)", "POINT (1 1 1)"], 3, False], - [[Point(0, 0, 0), Point(1, 1, 1)], 3, False], - [shapely.to_wkb(shapely.points([[0, 0, 0], [1, 1, 1]])), 3, False], - [shapely.points([[0, 0, 0], [1, 1, 1]]), 3, False], - ], -) -def test_schema_coerce_input(data, dims: int, invalid: bool): - """Test 3D Geometry input parsing.""" - - class Schema(pa.DataFrameModel): - # pylint: disable=missing-class-docstring - geometry: GeoSeries - - class Config: - coerce = True - - @pa.check("geometry") - @classmethod - def geo_check(cls, geo_series: GeoSeries) -> Series[bool]: - # pylint: disable=missing-function-docstring - return ((dims == 3) & geo_series.has_z) | ( - (dims == 2) & ~geo_series.has_z - ) - - # create a geodataframe that's validated on object initialization - if invalid: - with pytest.raises(pa.errors.SchemaError): - GeoDataFrame[Schema]({"geometry": data}) - return - - assert isinstance( - GeoDataFrame[Schema]({"geometry": data}), gpd.GeoDataFrame - ) diff --git a/tests/geopandas/test_model.py b/tests/geopandas/test_model.py deleted file mode 100644 index bb92806ab..000000000 --- a/tests/geopandas/test_model.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests GeoPandas schema creation and validation from type annotations.""" -# pylint:disable=missing-class-docstring,missing-function-docstring,too-few-public-methods -from typing import Optional - -import pandas as pd -import geopandas as gpd -import pytest -from shapely.geometry import Point - -import pandera as pa -from pandera.typing import Index, Series -from pandera.typing.geopandas import GeoDataFrame, GeoSeries - - -def test_from_records_validates_the_schema(): - """Test that GeoDataFrame[Schema] validates the schema""" - - class Schema(pa.DataFrameModel): - geometry: GeoSeries - state: Series[str] - city: Series[str] - price: Series[float] - postal_code: Optional[Series[int]] = pa.Field(nullable=True) - - raw_data = [ - { - "geometry": Point(0, 0), - "state": "NY", - "city": "New York", - "price": 8.0, - }, - { - "geometry": Point(1, 1), - "state": "FL", - "city": "Miami", - "price": 12.0, - }, - ] - pandera_validated_df = GeoDataFrame.from_records(Schema, raw_data) - assert isinstance(pandera_validated_df, GeoDataFrame) - - pandas_df = gpd.GeoDataFrame(pd.DataFrame.from_records(raw_data)) - pd.testing.assert_frame_equal( - pandera_validated_df, Schema.validate(pandas_df) - ) - - raw_data = [ - { - "geometry": Point(0, 0), - "state": "NY", - "city": "New York", - }, - { - "geometry": Point(1, 1), - "state": "FL", - "city": "Miami", - }, - ] - - with pytest.raises( - pa.errors.SchemaError, - match="^column 'price' not in dataframe", - ): - GeoDataFrame[Schema](raw_data) - - -def test_from_records_sets_the_index_from_schema(): - """Test that GeoDataFrame[Schema] validates the schema""" - - class Schema(pa.DataFrameModel): - geometry: GeoSeries - state: Index[str] = pa.Field(check_name=True) - city: Series[str] - price: Series[float] - - raw_data = [ - { - "geometry": Point(0, 0), - "state": "NY", - "city": "New York", - "price": 8.0, - }, - { - "geometry": Point(1, 1), - "state": "FL", - "city": "Miami", - "price": 12.0, - }, - ] - pandera_validated_df = GeoDataFrame.from_records(Schema, raw_data) - assert isinstance(pandera_validated_df, GeoDataFrame) - - pandas_df = gpd.GeoDataFrame( - pd.DataFrame.from_records(raw_data, index=["state"]) - ) - pd.testing.assert_frame_equal( - pandera_validated_df, Schema.validate(pandas_df) - ) - - -def test_from_records_sorts_the_columns(): - """Test that GeoDataFrame[Schema] validates the schema""" - - class Schema(pa.DataFrameModel): - geometry: GeoSeries - state: Series[str] - city: Series[str] - price: Series[float] - - raw_data = [ - { - "geometry": Point(0, 0), - "city": "New York", - "price": 8.0, - "state": "NY", - }, - { - "geometry": Point(1, 1), - "price": 12.0, - "state": "FL", - "city": "Miami", - }, - ] - pandera_validated_df = GeoDataFrame.from_records(Schema, raw_data) - assert isinstance(pandera_validated_df, GeoDataFrame) - - pandas_df = gpd.GeoDataFrame( - pd.DataFrame.from_records(raw_data)[ - ["geometry", "state", "city", "price"] - ] - ) - pd.testing.assert_frame_equal( - pandera_validated_df, Schema.validate(pandas_df) - ) diff --git a/tests/geopandas/test_pydantic.py b/tests/geopandas/test_pydantic.py new file mode 100644 index 000000000..b9e3de414 --- /dev/null +++ b/tests/geopandas/test_pydantic.py @@ -0,0 +1,118 @@ +"""Tests GeoPandas schema creation and validation from type annotations.""" +# pylint:disable=missing-class-docstring,missing-function-docstring,too-few-public-methods + +import pandas as pd +import geopandas as gpd +import pytest +from shapely.geometry import Point +from pydantic import BaseModel, ValidationError + +import pandera as pa +from pandera.typing.geopandas import GeoDataFrame, GeoSeries + + +def test_pydantic_active_geometry(): + """Test that GeoDataFrame type can be used in a Pydantic model with geometry activated""" + + class Schema(pa.DataFrameModel): + # pylint: disable=missing-class-docstring + geometry: GeoSeries + + class MyModel(BaseModel): + # pylint: disable=missing-class-docstring + data: GeoDataFrame[Schema] + + # gpd.GeoDataFrame input + obj = MyModel( + data=gpd.GeoDataFrame( + { + "geometry": gpd.GeoSeries([Point(0, 0)]), + } + ) + ) + + assert isinstance(obj.data, gpd.GeoDataFrame) + assert obj.data.geometry.name == "geometry" + + # pd.DataFrame input (coerce to gpd.GeoDataFrame) + obj = MyModel( + data=pd.DataFrame( + { + "geometry": gpd.GeoSeries([Point(0, 0)]), + } + ) + ) + + assert isinstance(obj.data, gpd.GeoDataFrame) + + +def test_pydantic_inactive_geometry(): + """Test that GeoDataFrame type can be used in a Pydantic model with geometry not activated""" + + # Geometry column exists but non-standard name + class Schema(pa.DataFrameModel): + # pylint: disable=missing-class-docstring + random: GeoSeries + + class MyModel(BaseModel): + # pylint: disable=missing-class-docstring + data: GeoDataFrame[Schema] + + obj = MyModel( + data=pd.DataFrame( + { + "random": gpd.GeoSeries([Point(0, 0)]), + } + ) + ) + + assert isinstance(obj.data, gpd.GeoDataFrame) + + with pytest.raises( + AttributeError, + match="the active geometry column to use has not been set", + ): + _ = obj.data.geometry + + # Geometry column doesn't exist + class Schema(pa.DataFrameModel): + # pylint: disable=missing-class-docstring + random: str + + class MyModel(BaseModel): + # pylint: disable=missing-class-docstring + data: GeoDataFrame[Schema] + + obj = MyModel( + data=pd.DataFrame( + { + "random": ["a", "b"], + } + ) + ) + + assert isinstance(obj.data, gpd.GeoDataFrame) + + with pytest.raises( + AttributeError, + match="the active geometry column to use has not been set", + ): + _ = obj.data.geometry + + +def test_pydantic_garbage_input(): + """Test that GeoDataFrame type in a Pydantic model will throw an exception with garbage input""" + + class Schema(pa.DataFrameModel): + # pylint: disable=missing-class-docstring + geometry: GeoSeries + + class MyModel(BaseModel): + # pylint: disable=missing-class-docstring + data: GeoDataFrame[Schema] + + with pytest.raises( + ValidationError, + match="Value error, Expected gpd.GeoDataFrame, found", + ): + MyModel(data="invalid")