Skip to content

Commit

Permalink
Bugfix/1278 add_missing_columns assorted bugfixes (unionai-oss#1372)
Browse files Browse the repository at this point in the history
* Handle add_missing_columns for non-Python-standard dtypes when coerce=False, sparse dtypes, and non-None null default values, per issue unionai-oss#1278

Signed-off-by: Derin Walters <[email protected]>

* Fix bug where multiple copies of a missing column are inserted into the validated dataframe when add_missing_columns is enabled (unionai-oss#1370)

Signed-off-by: Derin Walters <[email protected]>

* pylint issues related to Fix bug where multiple copies of a missing column are inserted into the validated dataframe when add_missing_columns is enabled (unionai-oss#1370)

Signed-off-by: Derin Walters <[email protected]>

* do not assign default value to sparse dtypes since that knowledge is already baked-in (unionai-oss#1278, unionai-oss#1370)

Signed-off-by: Derin Walters <[email protected]>

* lint (unionai-oss#1278, unionai-oss#1370)

Signed-off-by: Derin Walters <[email protected]>

* Set dtypes explicitly in test_add_missing_columns_dtype, as Windows can interpret "int" differently (unionai-oss#1278, unionai-oss#1370)

Signed-off-by: Derin Walters <[email protected]>

---------

Signed-off-by: Derin Walters <[email protected]>
Signed-off-by: Nok <[email protected]>
  • Loading branch information
derinwalters authored and noklam committed Oct 29, 2023
1 parent 979ad1b commit 559e30b
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 14 deletions.
11 changes: 8 additions & 3 deletions pandera/backends/pandas/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def validate(
)

# fill nans with `default` if it's present
if hasattr(schema, "default") and pd.notna(schema.default):
if hasattr(schema, "default") and schema.default is not None:
check_obj = self.set_default(check_obj, schema)

try:
Expand Down Expand Up @@ -325,9 +325,14 @@ def run_checks(self, check_obj, schema) -> List[CoreCheckResult]:

def set_default(self, check_obj, schema):
    """Fill missing values in ``check_obj`` with ``schema.default``.

    :param check_obj: series or dataframe being validated (mutated in place).
    :param schema: schema supplying the ``default`` fill value (and, for
        dataframe inputs, the ``name`` of the column to fill).
    :returns: the same ``check_obj`` with nulls replaced by the default.
    """
    # Sparse dtypes already encode their fill value, so a default cannot
    # be assigned directly; leave sparse data untouched.
    if is_field(check_obj):
        if not isinstance(check_obj.dtype, pd.SparseDtype):
            check_obj.fillna(schema.default, inplace=True)
    elif not isinstance(check_obj[schema.name].dtype, pd.SparseDtype):
        check_obj[schema.name].fillna(schema.default, inplace=True)

    return check_obj
Expand Down
46 changes: 37 additions & 9 deletions pandera/backends/pandas/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,10 @@ def add_missing_columns(
for col_name in check_obj.columns:
pop_cols = []
for next_col_name in iter(schema_cols_dict):
if next_col_name in column_info.absent_column_names:
if (
next_col_name in column_info.absent_column_names
and next_col_name not in concat_ordered_cols
):
# Next schema column is missing from dataframe,
# so mark for insertion here
concat_ordered_cols.append(next_col_name)
Expand All @@ -423,15 +426,13 @@ def add_missing_columns(
concat_ordered_cols.append(col_name)

# Create companion dataframe of default values for missing columns
defaults = {
c: schema.columns[c].default
for c in column_info.absent_column_names
missing_cols_schema = {
k: v
for k, v in schema.columns.items()
if k in column_info.absent_column_names
}

missing_obj = pd.DataFrame(
data=defaults,
columns=column_info.absent_column_names,
index=check_obj.index,
missing_obj = self._construct_missing_df(
check_obj, missing_cols_schema
)

# Append missing columns
Expand All @@ -442,6 +443,33 @@ def add_missing_columns(

return concat_obj

def _construct_missing_df(
self,
obj: pd.DataFrame,
missing_cols_schema: Dict[str, Any],
) -> pd.DataFrame:
"""Construct dataframe of missing columns with their default values.
:param obj: dataframe of master dataframe from which to take index.
:param missing_cols_schema: dictionary of Column schemas
:returns: dataframe of missing columns
"""
missing_obj = pd.DataFrame(
data={k: v.default for k, v in missing_cols_schema.items()},
index=obj.index,
)

# Can't specify multiple dtypes in frame construction and
# constructing the frame as a concatenation of indexed
# series is relatively slow due to copying the index for
# each one. Coerce dtypes afterwards instead.
for c in missing_obj.columns:
missing_obj[c] = missing_cols_schema[c].dtype.try_coerce(
missing_obj[c]
)

return missing_obj

def strict_filter_columns(
self, check_obj: pd.DataFrame, schema, column_info: ColumnInfo
):
Expand Down
59 changes: 57 additions & 2 deletions tests/core/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,8 +394,8 @@ def test_duplicate_columns_dataframe():
assert not schema.unique_column_names


def test_add_missing_columns():
"""Test that missing columns are added."""
def test_add_missing_columns_order():
"""Test that missing columns are added in the correct order."""
col_labels = ["a", "b", "c"]

# Missing column is first in schema
Expand Down Expand Up @@ -475,6 +475,61 @@ def test_add_missing_columns():
):
schema_no_default_not_nullable.validate(frame_missing_first)

# Validate missing column isn't added multiple times when multiple
# trailing columns not in the schema exists in the dataframe
# https://github.com/unionai-oss/pandera/issues/1370
schema = DataFrameSchema(
columns={
"col_a": Column(str),
"col_missing": Column(str, nullable=True),
},
add_missing_columns=True,
)
df = pd.DataFrame(
{
"col_a": ["a", "b", "c"],
"col_b": ["d", "e", "f"],
"col_c": ["g", "h", "i"],
}
)
validated_frame_trailing = schema.validate(df)
assert validated_frame_trailing.columns.equals(
pd.Index(["col_a", "col_missing", "col_b", "col_c"])
)


def test_add_missing_columns_dtype():
    """Test that missing columns are added with the correct dtype."""
    sparse_int8 = pd.SparseDtype(dtype=np.int8, fill_value=5)

    # Only column "a" is supplied; every other column must be added by
    # add_missing_columns with its declared dtype and default value,
    # even with coerce=False.
    schema = DataFrameSchema(
        columns={
            "a": Column(np.int64),
            "b": Column(np.int64, default=9),
            "c": Column(np.int8, default=9),
            "d": Column(np.float64, default=np.nan, nullable=True),
            "e": Column(sparse_int8, default=7),
            "f": Column(pd.Int32Dtype(), default=pd.NA, nullable=True),
        },
        add_missing_columns=True,
        coerce=False,
    )

    expected = pd.DataFrame(
        {
            "a": pd.Series([2, 5], dtype=np.int64),
            "b": pd.Series([9, 9], dtype=np.int64),
            "c": pd.Series([9, 9], dtype=np.int8),
            "d": pd.Series([np.nan, np.nan], dtype=np.float64),
            "e": pd.Series([7, 7], dtype=sparse_int8),
            "f": pd.Series([pd.NA, pd.NA], dtype=pd.Int32Dtype()),
        }
    )

    validated = schema.validate(pd.DataFrame(data={"a": [2, 5]}))
    pd.testing.assert_frame_equal(expected, validated)


def test_series_schema() -> None:
"""Tests that a SeriesSchema Check behaves as expected for integers and
Expand Down

0 comments on commit 559e30b

Please sign in to comment.