
Commit

Merge pull request #232 from capitalone/develop
Release v0.10.3
Faisal authored Aug 15, 2023
2 parents a2293f4 + 2c4bdf0 commit a9e63b4
Showing 11 changed files with 554 additions and 188 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/edgetest.yml
@@ -16,6 +16,6 @@ jobs:
- id: run-edgetest
uses: fdosani/[email protected]
with:
edgetest-flags: '-c setup.cfg --export'
edgetest-flags: '-c pyproject.toml --export'
base-branch: 'develop'
skip-pr: 'false'
2 changes: 1 addition & 1 deletion .github/workflows/publish-package.yml
@@ -20,7 +20,7 @@ jobs:
with:
python-version: '3.9'
- name: Install dependencies
run: python -m pip install -r requirements.txt .[dev]
run: python -m pip install .[dev]
- name: Build and publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
1 change: 0 additions & 1 deletion MANIFEST.in
@@ -1,3 +1,2 @@
include README.rst
include LICENSE
include requirements.txt
10 changes: 8 additions & 2 deletions datacompy/__init__.py
@@ -13,8 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.10.2"
__version__ = "0.10.3"

from datacompy.core import *
from datacompy.fugue import is_match, report
from datacompy.fugue import (
all_columns_match,
intersect_columns,
is_match,
report,
unq_columns,
)
from datacompy.spark import NUMERIC_SPARK_TYPES, SparkCompare
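The helpers added in datacompy/fugue.py and re-exported here can be called directly from the top-level package. A minimal sketch with illustrative pandas data (the column names and values below are hypothetical, not taken from this commit):

import pandas as pd
import datacompy

# Two small frames with partially overlapping columns (illustrative only)
df1 = pd.DataFrame({"id": [1, 2], "value": [0.1, 0.2], "extra": ["a", "b"]})
df2 = pd.DataFrame({"id": [1, 2], "value": [0.1, 0.25]})

print(datacompy.unq_columns(df1, df2))        # expected: OrderedSet(['extra'])
print(datacompy.intersect_columns(df1, df2))  # expected: OrderedSet(['id', 'value'])
print(datacompy.all_columns_match(df1, df2))  # expected: False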
89 changes: 77 additions & 12 deletions datacompy/fugue.py
@@ -26,13 +26,76 @@
import pandas as pd
import pyarrow as pa
from fugue import AnyDataFrame
from ordered_set import OrderedSet
from triad import Schema

from .core import Compare, render

LOG = logging.getLogger(__name__)
HASH_COL = "__datacompy__hash__"


def unq_columns(df1: AnyDataFrame, df2: AnyDataFrame):
"""Get columns that are unique to df1
Parameters
----------
df1 : ``AnyDataFrame``
First dataframe to check
df2 : ``AnyDataFrame``
Second dataframe to check
Returns
-------
OrderedSet
Set of columns that are unique to df1
"""
col1 = fa.get_column_names(df1)
col2 = fa.get_column_names(df2)
return OrderedSet(col1) - OrderedSet(col2)


def intersect_columns(df1: AnyDataFrame, df2: AnyDataFrame):
"""Get columns that are shared between the two dataframes
Parameters
----------
df1 : ``AnyDataFrame``
First dataframe to check
df2 : ``AnyDataFrame``
Second dataframe to check
Returns
-------
OrderedSet
Set of columns that are shared between the two dataframes
"""
col1 = fa.get_column_names(df1)
col2 = fa.get_column_names(df2)
return OrderedSet(col1) & OrderedSet(col2)


def all_columns_match(df1: AnyDataFrame, df2: AnyDataFrame):
"""Whether the columns all match in the dataframes
Parameters
----------
df1 : ``AnyDataFrame``
First dataframe to check
df2 : ``AnyDataFrame``
Second dataframe to check
Returns
-------
bool
Boolean indicating whether the columns all match in the dataframes
"""
return unq_columns(df1, df2) == unq_columns(df2, df1) == set()


def is_match(
df1: AnyDataFrame,
df2: AnyDataFrame,
@@ -469,8 +532,8 @@ def _distributed_compare(
assert hash_cols in tdf1.schema, f"{hash_cols} not found in {tdf1.schema}"
assert hash_cols in tdf2.schema, f"{hash_cols} not found in {tdf2.schema}"

df1_cols = tdf1.schema.names
df2_cols = tdf2.schema.names
df1_schema = tdf1.schema
df2_schema = tdf2.schema
str_cols = set(f.name for f in tdf1.schema.fields if pa.types.is_string(f.type))
bucket = (
parallelism if parallelism is not None else fa.get_current_parallelism() * 2
@@ -508,17 +571,19 @@ def _serialize(dfs: Iterable[pd.DataFrame], left: bool) -> Iterable[Dict[str, An
distinct=False,
)

def _comp(df: List[Dict[str, Any]]) -> List[List[Any]]:
df1 = (
pd.concat([pickle.loads(r["data"]) for r in df if r["left"]])
.sort_values(df1_cols)
.reset_index(drop=True)
)
df2 = (
pd.concat([pickle.loads(r["data"]) for r in df if not r["left"]])
.sort_values(df2_cols)
.reset_index(drop=True)
def _deserialize(
df: List[Dict[str, Any]], left: bool, schema: Schema
) -> pd.DataFrame:
arr = [pickle.loads(r["data"]) for r in df if r["left"] == left]
if len(arr) > 0:
return pd.concat(arr).sort_values(schema.names).reset_index(drop=True)
return pd.DataFrame(
{k: pd.Series(dtype=v) for k, v in schema.pandas_dtype.items()}
)

def _comp(df: List[Dict[str, Any]]) -> List[List[Any]]:
df1 = _deserialize(df, True, df1_schema)
df2 = _deserialize(df, False, df2_schema)
comp = Compare(
df1=df1,
df2=df2,
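The refactor shown above replaces the inline pd.concat calls in _comp with a shared _deserialize helper, which also covers the case where a bucket contains rows from only one side: it then returns an empty DataFrame built from the saved schema so column dtypes are preserved. A standalone sketch of that empty-frame pattern, using an illustrative dtype mapping in place of schema.pandas_dtype from the diff:

import pandas as pd

# Illustrative stand-in for the schema.pandas_dtype mapping used in the diff above
pandas_dtype = {"id": "int64", "value": "float64", "name": "object"}

# An empty frame that still carries the expected column dtypes
empty = pd.DataFrame({k: pd.Series(dtype=v) for k, v in pandas_dtype.items()})
print(empty.dtypes)  # id int64, value float64, name object
print(len(empty))    # 0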
17 changes: 8 additions & 9 deletions docs/source/developer_instructions.rst
@@ -46,8 +46,8 @@ Run ``python -m pytest`` to run all unittests defined in the subfolder
Management of Requirements
--------------------------

Requirements of the project should be added to ``requirements.txt``. Optional requirements used only for testing,
documentation, or code quality are added to ``setup.py`` and ``EXTRAS_REQUIRE``
Requirements of the project should be added to ``pyproject.toml``. Optional requirements used only for testing,
documentation, or code quality are added to ``pyproject.toml`` in the ``project.optional-dependencies`` section.



@@ -57,16 +57,16 @@ edgetest
edgetest is a utility to help keep requirements up to date and ensure a subset of testing requirements still work.
More on edgetest `here <https://github.com/capitalone/edgetest>`_.

The ``setup.cfg`` has configuration details on how to run edgetest. This process can be automated via GitHub Actions.
The ``pyproject.toml`` has configuration details on how to run edgetest. This process can be automated via GitHub Actions.
(A future addition, which will come soon).

In order to execute edgetest locally you can run the following after installing ``edgetest``:

.. code-block:: bash
edgetest -c setup.cfg -r requirements.txt --export
edgetest -c pyproject.toml --export
This should return output like the following and also update ``requirements.txt``:
This should return output like the following and also update ``pyproject.toml``:

.. code-block:: bash
@@ -77,7 +77,6 @@ This should return output like the following and also updating ``requirements.tx
core True pandas 1.3.5
core True PyYAML 6.0
============= =============== =================== =================
No PEP-517 style requirements in setup.cfg to update. Updating requirements.txt
@@ -110,13 +109,13 @@ Generating distribution archives (PyPI)
After each release the package will need to be uploaded to PyPI. The instructions below are taken
from `packaging.python.org <https://packaging.python.org/tutorials/packaging-projects/#generating-distribution-archives>`_

Update / Install ``setuptools``, ``wheel``, and ``twine``::
Update / Install ``build``, ``wheel``, and ``twine``::

pip install --upgrade setuptools wheel twine
pip install --upgrade build wheel twine

Generate distributions::

python setup.py sdist bdist_wheel
python -m build

Under the ``dist`` folder you should have something as follows::

114 changes: 114 additions & 0 deletions pyproject.toml
@@ -0,0 +1,114 @@
[project]
name = "datacompy"
description = "Dataframe comparison in Python"
readme = "README.md"
authors = [
{ name="Ian Robertson" },
{ name="Dan Coates" },
{ name="Faisal Dosani", email="[email protected]" },
]
maintainers = [
{ name="Faisal Dosani", email="[email protected]" }
]
license = {text = "Apache Software License"}
dependencies = [
"pandas<=2.0.2,>=0.25.0",
"numpy<=1.24.3,>=1.22.0",
"ordered-set<=4.1.0,>=4.0.2",
"fugue<=0.9.0,>=0.8.6",
]
requires-python = ">=3.8.0"
classifiers = [
"Intended Audience :: Developers",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
]

dynamic = ["version"]

[project.urls]
Homepage = "https://github.com/capitalone/datacompy"
Documentation = "https://capitalone.github.io/datacompy/"
Repository = "https://github.com/capitalone/datacompy.git"
"Bug Tracker" = "https://github.com/capitalone/datacompy/issues"
"Source Code" = "https://github.com/capitalone/datacompy"

[tool.setuptools]
packages = ["datacompy"]
zip-safe = false
include-package-data = true

[tool.setuptools.package-data]
"*" = ["templates/*.txt"]

[tool.setuptools.dynamic]
version = {attr = "datacompy.__version__"}

[tool.distutils.bdist_wheel]
python-tag = "py3"

[project.optional-dependencies]
duckdb = ["fugue[duckdb]"]
polars = ["fugue[polars]"]
spark = ["fugue[spark]"]
dask = ["fugue[dask]"]
ray = ["fugue[ray]"]
docs = [
"sphinx",
"furo",
"myst-parser",
]
tests = [
"pytest",
"pytest-cov",
"pytest-spark",
"fugue[polars,duckdb,spark]",
]
qa = [
"pre-commit",
"black",
"isort",
]
build = [
"build",
"twine",
"wheel",
]
edgetest = [
"edgetest",
"edgetest-conda",
]
dev = [
"datacompy[duckdb]",
"datacompy[polars]",
"datacompy[spark]",
"datacompy[docs]",
"datacompy[tests]",
"datacompy[qa]",
"datacompy[build]",
]

[isort]
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
line_length = 88

[edgetest.envs.core]
python_version = "3.9"
conda_install = ["openjdk=8"]
extras = ["dev"]
command = "pytest tests -m 'not integration'"
upgrade = [
"pandas",
"numpy",
"ordered-set",
"fugue",
]
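With the version now resolved dynamically from ``datacompy.__version__`` and the dependency pins living only in ``pyproject.toml``, one way to inspect them after installation is through importlib.metadata. A sketch, assuming datacompy 0.10.3 is installed in the current environment:

from importlib.metadata import requires, version

print(version("datacompy"))  # expected to print 0.10.3 for this release

# Core pins plus markers for optional extras such as [spark] or [docs] declared above
for req in requires("datacompy") or []:
    print(req)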
20 changes: 0 additions & 20 deletions requirements.txt

This file was deleted.

