From 559e30bfc255b9d6bc0cc9cbd83a0dbe8b56394a Mon Sep 17 00:00:00 2001
From: derinwalters <108046357+derinwalters@users.noreply.github.com>
Date: Wed, 18 Oct 2023 23:59:25 +0900
Subject: [PATCH] Bugfix/1278 add_missing_columns assorted bugfixes (#1372)

* Handle add_missing_columns for cases of non-Python-standard dtypes on
  coerce=False, sparse dtypes, and null-like default values other than None
  (e.g. np.nan, pd.NA) per issue #1278

Signed-off-by: Derin Walters

* Fix bug where multiple copies of a missing column are inserted into the
  validated dataframe when add_missing_columns is enabled (#1370)

Signed-off-by: Derin Walters

* pylint issues related to the fix for the bug where multiple copies of a
  missing column are inserted into the validated dataframe when
  add_missing_columns is enabled (#1370)

Signed-off-by: Derin Walters

* do not assign default value to sparse dtypes since that knowledge is
  already baked in (#1278, #1370)

Signed-off-by: Derin Walters

* lint (#1278, #1370)

Signed-off-by: Derin Walters

* explicit dtype setting in test_add_missing_columns_dtype as it seems
  Windows can have different interpretations of "int" (#1278, #1370)

Signed-off-by: Derin Walters

---------

Signed-off-by: Derin Walters
Signed-off-by: Nok
---
 pandera/backends/pandas/array.py     | 11 ++++--
 pandera/backends/pandas/container.py | 46 +++++++++++++++++-----
 tests/core/test_schemas.py           | 59 +++++++++++++++++++++++++++-
 3 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/pandera/backends/pandas/array.py b/pandera/backends/pandas/array.py
index 98c19962a..de7b10750 100644
--- a/pandera/backends/pandas/array.py
+++ b/pandera/backends/pandas/array.py
@@ -52,7 +52,7 @@ def validate(
         )
 
         # fill nans with `default` if it's present
-        if hasattr(schema, "default") and pd.notna(schema.default):
+        if hasattr(schema, "default") and schema.default is not None:
             check_obj = self.set_default(check_obj, schema)
 
         try:
@@ -325,9 +325,14 @@ def run_checks(self, check_obj, schema) -> List[CoreCheckResult]:
 
     def set_default(self, check_obj, schema):
         """Sets the ``schema.default`` value on the ``check_obj``"""
-        if is_field(check_obj):
+        # Ignore sparse dtypes: a default value can't be assigned directly
+        if is_field(check_obj) and not isinstance(
+            check_obj.dtype, pd.SparseDtype
+        ):
             check_obj.fillna(schema.default, inplace=True)
-        else:
+        elif not is_field(check_obj) and not isinstance(
+            check_obj[schema.name].dtype, pd.SparseDtype
+        ):
             check_obj[schema.name].fillna(schema.default, inplace=True)
 
         return check_obj
diff --git a/pandera/backends/pandas/container.py b/pandera/backends/pandas/container.py
index 94133f80f..0afc9eabd 100644
--- a/pandera/backends/pandas/container.py
+++ b/pandera/backends/pandas/container.py
@@ -400,7 +400,10 @@ def add_missing_columns(
         for col_name in check_obj.columns:
             pop_cols = []
             for next_col_name in iter(schema_cols_dict):
-                if next_col_name in column_info.absent_column_names:
+                if (
+                    next_col_name in column_info.absent_column_names
+                    and next_col_name not in concat_ordered_cols
+                ):
                     # Next schema column is missing from dataframe,
                     # so mark for insertion here
                     concat_ordered_cols.append(next_col_name)
@@ -423,15 +426,13 @@ def add_missing_columns(
                 concat_ordered_cols.append(col_name)
 
         # Create companion dataframe of default values for missing columns
-        defaults = {
-            c: schema.columns[c].default
-            for c in column_info.absent_column_names
+        missing_cols_schema = {
+            k: v
+            for k, v in schema.columns.items()
+            if k in column_info.absent_column_names
         }
-
-        missing_obj = pd.DataFrame(
-            data=defaults,
-            columns=column_info.absent_column_names,
-            index=check_obj.index,
+        missing_obj = self._construct_missing_df(
+            check_obj, missing_cols_schema
         )
 
         # Append missing columns
@@ -442,6 +443,33 @@ def add_missing_columns(
 
         return concat_obj
 
+    def _construct_missing_df(
+        self,
+        obj: pd.DataFrame,
+        missing_cols_schema: Dict[str, Any],
+    ) -> pd.DataFrame:
+        """Construct dataframe of missing columns with their default values.
+
+        :param obj: master dataframe from which to take the index.
+        :param missing_cols_schema: dictionary of Column schemas
+        :returns: dataframe of missing columns
+        """
+        missing_obj = pd.DataFrame(
+            data={k: v.default for k, v in missing_cols_schema.items()},
+            index=obj.index,
+        )
+
+        # Can't specify multiple dtypes in frame construction and
+        # constructing the frame as a concatenation of indexed
+        # series is relatively slow due to copying the index for
+        # each one. Coerce dtypes afterwards instead.
+        for c in missing_obj.columns:
+            missing_obj[c] = missing_cols_schema[c].dtype.try_coerce(
+                missing_obj[c]
+            )
+
+        return missing_obj
+
     def strict_filter_columns(
         self, check_obj: pd.DataFrame, schema, column_info: ColumnInfo
     ):
diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py
index cce20903f..9518f12ed 100644
--- a/tests/core/test_schemas.py
+++ b/tests/core/test_schemas.py
@@ -394,8 +394,8 @@ def test_duplicate_columns_dataframe():
     assert not schema.unique_column_names
 
 
-def test_add_missing_columns():
-    """Test that missing columns are added."""
+def test_add_missing_columns_order():
+    """Test that missing columns are added in the correct order."""
     col_labels = ["a", "b", "c"]
 
     # Missing column is first in schema
@@ -475,6 +475,61 @@
     ):
         schema_no_default_not_nullable.validate(frame_missing_first)
 
+    # Validate missing column isn't added multiple times when multiple
+    # trailing columns not in the schema exist in the dataframe
+    # https://github.com/unionai-oss/pandera/issues/1370
+    schema = DataFrameSchema(
+        columns={
+            "col_a": Column(str),
+            "col_missing": Column(str, nullable=True),
+        },
+        add_missing_columns=True,
+    )
+    df = pd.DataFrame(
+        {
+            "col_a": ["a", "b", "c"],
+            "col_b": ["d", "e", "f"],
+            "col_c": ["g", "h", "i"],
+        }
+    )
+    validated_frame_trailing = schema.validate(df)
+    assert validated_frame_trailing.columns.equals(
+        pd.Index(["col_a", "col_missing", "col_b", "col_c"])
+    )
+
+
+def test_add_missing_columns_dtype():
+    """Test that missing columns are added with the correct dtype."""
+    ref_df = pd.DataFrame(
+        {
+            "a": pd.Series([2, 5], dtype=np.int64),
+            "b": pd.Series([9, 9], dtype=np.int64),
+            "c": pd.Series([9, 9], dtype=np.int8),
+            "d": pd.Series([np.nan, np.nan], dtype=np.float64),
+            "e": pd.Series(
+                [7, 7], dtype=pd.SparseDtype(dtype=np.int8, fill_value=5)
+            ),
+            "f": pd.Series([pd.NA, pd.NA], dtype=pd.Int32Dtype()),
+        }
+    )
+
+    schema = DataFrameSchema(
+        columns={
+            "a": Column(np.int64),
+            "b": Column(np.int64, default=9),
+            "c": Column(np.int8, default=9),
+            "d": Column(np.float64, default=np.nan, nullable=True),
+            "e": Column(
+                pd.SparseDtype(dtype=np.int8, fill_value=5), default=7
+            ),
+            "f": Column(pd.Int32Dtype(), default=pd.NA, nullable=True),
+        },
+        add_missing_columns=True,
+        coerce=False,
+    )
+    test_df = schema.validate(pd.DataFrame(data={"a": [2, 5]}))
+    pd.testing.assert_frame_equal(ref_df, test_df)
+
 
 def test_series_schema() -> None:
     """Tests that a SeriesSchema Check behaves as expected for integers and
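
For reference, a minimal usage sketch (outside the patch) of the add_missing_columns behavior these changes target, using only the public API exercised by the new tests above (DataFrameSchema, Column, default, nullable, add_missing_columns, coerce); the column names, defaults, and extra "x" column are illustrative, not taken from the patch.

import numpy as np
import pandas as pd
import pandera as pa

# Schema where "b" and "c" may be absent from the validated dataframe;
# add_missing_columns inserts them with their declared default values.
schema = pa.DataFrameSchema(
    columns={
        "a": pa.Column(np.int64),
        "b": pa.Column(np.int8, default=9),
        "c": pa.Column(np.float64, default=np.nan, nullable=True),
    },
    add_missing_columns=True,
    coerce=False,
)

# "b" and "c" are missing and "x" is a trailing column not in the schema;
# with the fix each missing column is appended exactly once, filled with its
# default, and carries the schema dtype even though coerce=False.
validated = schema.validate(pd.DataFrame({"a": [2, 5], "x": ["p", "q"]}))
print(validated.dtypes)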