From 559e30bfc255b9d6bc0cc9cbd83a0dbe8b56394a Mon Sep 17 00:00:00 2001
From: derinwalters <108046357+derinwalters@users.noreply.github.com>
Date: Wed, 18 Oct 2023 23:59:25 +0900
Subject: [PATCH] Bugfix/1278 add_missing_columns assorted bugfixes (#1372)

* Handle add_missing_columns for cases of non-Python-standard dtypes on
  coerce=False, sparse dtypes, and null-like default values other than None
  (e.g. np.nan, pd.NA) per issue #1278

Signed-off-by: Derin Walters

* Fix bug where multiple copies of a missing column are inserted into the
  validated dataframe when add_missing_columns is enabled (#1370)

Signed-off-by: Derin Walters

* pylint issues related to the fix for the bug where multiple copies of a
  missing column are inserted into the validated dataframe when
  add_missing_columns is enabled (#1370)

Signed-off-by: Derin Walters

* do not assign default value to sparse dtypes since that knowledge is
  already baked in (#1278, #1370)

Signed-off-by: Derin Walters

* lint (#1278, #1370)

Signed-off-by: Derin Walters

* explicit dtype setting in test_add_missing_columns_dtype as it seems
  Windows can have different interpretations of "int" (#1278, #1370)

Signed-off-by: Derin Walters

---------

Signed-off-by: Derin Walters
Signed-off-by: Nok
---
 pandera/backends/pandas/array.py     | 11 ++++--
 pandera/backends/pandas/container.py | 46 +++++++++++++++++-----
 tests/core/test_schemas.py           | 59 +++++++++++++++++++++++++++-
 3 files changed, 102 insertions(+), 14 deletions(-)

diff --git a/pandera/backends/pandas/array.py b/pandera/backends/pandas/array.py
index 98c19962a..de7b10750 100644
--- a/pandera/backends/pandas/array.py
+++ b/pandera/backends/pandas/array.py
@@ -52,7 +52,7 @@ def validate(
         )
 
         # fill nans with `default` if it's present
-        if hasattr(schema, "default") and pd.notna(schema.default):
+        if hasattr(schema, "default") and schema.default is not None:
             check_obj = self.set_default(check_obj, schema)
 
         try:
@@ -325,9 +325,14 @@ def run_checks(self, check_obj, schema) -> List[CoreCheckResult]:
 
     def set_default(self, check_obj, schema):
         """Sets the ``schema.default`` value on the ``check_obj``"""
-        if is_field(check_obj):
+        # Ignore sparse dtypes: a default value can't be assigned directly
+        if is_field(check_obj) and not isinstance(
+            check_obj.dtype, pd.SparseDtype
+        ):
             check_obj.fillna(schema.default, inplace=True)
-        else:
+        elif not is_field(check_obj) and not isinstance(
+            check_obj[schema.name].dtype, pd.SparseDtype
+        ):
             check_obj[schema.name].fillna(schema.default, inplace=True)
 
         return check_obj
diff --git a/pandera/backends/pandas/container.py b/pandera/backends/pandas/container.py
index 94133f80f..0afc9eabd 100644
--- a/pandera/backends/pandas/container.py
+++ b/pandera/backends/pandas/container.py
@@ -400,7 +400,10 @@ def add_missing_columns(
         for col_name in check_obj.columns:
             pop_cols = []
             for next_col_name in iter(schema_cols_dict):
-                if next_col_name in column_info.absent_column_names:
+                if (
+                    next_col_name in column_info.absent_column_names
+                    and next_col_name not in concat_ordered_cols
+                ):
                     # Next schema column is missing from dataframe,
                     # so mark for insertion here
                     concat_ordered_cols.append(next_col_name)
@@ -423,15 +426,13 @@ def add_missing_columns(
                 concat_ordered_cols.append(col_name)
 
         # Create companion dataframe of default values for missing columns
-        defaults = {
-            c: schema.columns[c].default
-            for c in column_info.absent_column_names
+        missing_cols_schema = {
+            k: v
+            for k, v in schema.columns.items()
+            if k in column_info.absent_column_names
         }
-
-        missing_obj = pd.DataFrame(
-            data=defaults,
-            columns=column_info.absent_column_names,
-            index=check_obj.index,
+        missing_obj = self._construct_missing_df(
+            check_obj, missing_cols_schema
         )
 
         # Append missing columns
@@ -442,6 +443,33 @@ def add_missing_columns(
 
         return concat_obj
 
+    def _construct_missing_df(
+        self,
+        obj: pd.DataFrame,
+        missing_cols_schema: Dict[str, Any],
+    ) -> pd.DataFrame:
+        """Construct dataframe of missing columns with their default values.
+
+        :param obj: master dataframe from which to take the index.
+        :param missing_cols_schema: dictionary of Column schemas
+        :returns: dataframe of missing columns
+        """
+        missing_obj = pd.DataFrame(
+            data={k: v.default for k, v in missing_cols_schema.items()},
+            index=obj.index,
+        )
+
+        # Can't specify multiple dtypes in frame construction and
+        # constructing the frame as a concatenation of indexed
+        # series is relatively slow due to copying the index for
+        # each one. Coerce dtypes afterwards instead.
+        for c in missing_obj.columns:
+            missing_obj[c] = missing_cols_schema[c].dtype.try_coerce(
+                missing_obj[c]
+            )
+
+        return missing_obj
+
     def strict_filter_columns(
         self, check_obj: pd.DataFrame, schema, column_info: ColumnInfo
     ):
diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py
index cce20903f..9518f12ed 100644
--- a/tests/core/test_schemas.py
+++ b/tests/core/test_schemas.py
@@ -394,8 +394,8 @@ def test_duplicate_columns_dataframe():
     assert not schema.unique_column_names
 
 
-def test_add_missing_columns():
-    """Test that missing columns are added."""
+def test_add_missing_columns_order():
+    """Test that missing columns are added in the correct order."""
     col_labels = ["a", "b", "c"]
 
     # Missing column is first in schema
@@ -475,6 +475,61 @@
     ):
         schema_no_default_not_nullable.validate(frame_missing_first)
 
+    # Validate missing column isn't added multiple times when multiple
+    # trailing columns not in the schema exist in the dataframe
+    # https://github.com/unionai-oss/pandera/issues/1370
+    schema = DataFrameSchema(
+        columns={
+            "col_a": Column(str),
+            "col_missing": Column(str, nullable=True),
+        },
+        add_missing_columns=True,
+    )
+    df = pd.DataFrame(
+        {
+            "col_a": ["a", "b", "c"],
+            "col_b": ["d", "e", "f"],
+            "col_c": ["g", "h", "i"],
+        }
+    )
+    validated_frame_trailing = schema.validate(df)
+    assert validated_frame_trailing.columns.equals(
+        pd.Index(["col_a", "col_missing", "col_b", "col_c"])
+    )
+
+
+def test_add_missing_columns_dtype():
+    """Test that missing columns are added with the correct dtype."""
+    ref_df = pd.DataFrame(
+        {
+            "a": pd.Series([2, 5], dtype=np.int64),
+            "b": pd.Series([9, 9], dtype=np.int64),
+            "c": pd.Series([9, 9], dtype=np.int8),
+            "d": pd.Series([np.nan, np.nan], dtype=np.float64),
+            "e": pd.Series(
+                [7, 7], dtype=pd.SparseDtype(dtype=np.int8, fill_value=5)
+            ),
+            "f": pd.Series([pd.NA, pd.NA], dtype=pd.Int32Dtype()),
+        }
+    )
+
+    schema = DataFrameSchema(
+        columns={
+            "a": Column(np.int64),
+            "b": Column(np.int64, default=9),
+            "c": Column(np.int8, default=9),
+            "d": Column(np.float64, default=np.nan, nullable=True),
+            "e": Column(
+                pd.SparseDtype(dtype=np.int8, fill_value=5), default=7
+            ),
+            "f": Column(pd.Int32Dtype(), default=pd.NA, nullable=True),
+        },
+        add_missing_columns=True,
+        coerce=False,
+    )
+    test_df = schema.validate(pd.DataFrame(data={"a": [2, 5]}))
+    pd.testing.assert_frame_equal(ref_df, test_df)
+
 
 def test_series_schema() -> None:
     """Tests that a SeriesSchema Check behaves as expected for integers and
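
For reference, a minimal usage sketch (outside the patch) of the add_missing_columns behavior these changes target, using only the public API exercised by the new tests above (DataFrameSchema, Column, default, nullable, add_missing_columns, coerce); the column names, defaults, and extra "x" column are illustrative, not taken from the patch.

import numpy as np
import pandas as pd
import pandera as pa

# Schema where "b" and "c" may be absent from the validated dataframe;
# add_missing_columns inserts them with their declared default values.
schema = pa.DataFrameSchema(
    columns={
        "a": pa.Column(np.int64),
        "b": pa.Column(np.int8, default=9),
        "c": pa.Column(np.float64, default=np.nan, nullable=True),
    },
    add_missing_columns=True,
    coerce=False,
)

# "b" and "c" are missing and "x" is a trailing column not in the schema;
# with the fix each missing column is appended exactly once, filled with its
# default, and carries the schema dtype even though coerce=False.
validated = schema.validate(pd.DataFrame({"a": [2, 5], "x": ["p", "q"]}))
print(validated.dtypes)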