
Commit

Merge pull request #232 from capitalone/develop
Release v0.10.3
Faisal authored Aug 15, 2023
2 parents a2293f4 + 2c4bdf0 commit a9e63b4
Showing 11 changed files with 554 additions and 188 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/edgetest.yml
@@ -16,6 +16,6 @@ jobs:
- id: run-edgetest
uses: fdosani/[email protected]
with:
edgetest-flags: '-c setup.cfg --export'
edgetest-flags: '-c pyproject.toml --export'
base-branch: 'develop'
skip-pr: 'false'
2 changes: 1 addition & 1 deletion .github/workflows/publish-package.yml
@@ -20,7 +20,7 @@ jobs:
with:
python-version: '3.9'
- name: Install dependencies
run: python -m pip install -r requirements.txt .[dev]
run: python -m pip install .[dev]
- name: Build and publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
1 change: 0 additions & 1 deletion MANIFEST.in
@@ -1,3 +1,2 @@
include README.rst
include LICENSE
include requirements.txt
10 changes: 8 additions & 2 deletions datacompy/__init__.py
@@ -13,8 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.10.2"
__version__ = "0.10.3"

from datacompy.core import *
from datacompy.fugue import is_match, report
from datacompy.fugue import (
all_columns_match,
intersect_columns,
is_match,
report,
unq_columns,
)
from datacompy.spark import NUMERIC_SPARK_TYPES, SparkCompare
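The helpers added in datacompy/fugue.py and re-exported here can be called directly from the top-level package. A minimal sketch with illustrative pandas data (the column names and values below are hypothetical, not taken from this commit):

import pandas as pd
import datacompy

# Two small frames with partially overlapping columns (illustrative only)
df1 = pd.DataFrame({"id": [1, 2], "value": [0.1, 0.2], "extra": ["a", "b"]})
df2 = pd.DataFrame({"id": [1, 2], "value": [0.1, 0.25]})

print(datacompy.unq_columns(df1, df2))        # expected: OrderedSet(['extra'])
print(datacompy.intersect_columns(df1, df2))  # expected: OrderedSet(['id', 'value'])
print(datacompy.all_columns_match(df1, df2))  # expected: False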
89 changes: 77 additions & 12 deletions datacompy/fugue.py
@@ -26,13 +26,76 @@
import pandas as pd
import pyarrow as pa
from fugue import AnyDataFrame
from ordered_set import OrderedSet
from triad import Schema

from .core import Compare, render

LOG = logging.getLogger(__name__)
HASH_COL = "__datacompy__hash__"


def unq_columns(df1: AnyDataFrame, df2: AnyDataFrame):
"""Get columns that are unique to df1
Parameters
----------
df1 : ``AnyDataFrame``
First dataframe to check
df2 : ``AnyDataFrame``
Second dataframe to check
Returns
-------
OrderedSet
Set of columns that are unique to df1
"""
col1 = fa.get_column_names(df1)
col2 = fa.get_column_names(df2)
return OrderedSet(col1) - OrderedSet(col2)


def intersect_columns(df1: AnyDataFrame, df2: AnyDataFrame):
"""Get columns that are shared between the two dataframes
Parameters
----------
df1 : ``AnyDataFrame``
First dataframe to check
df2 : ``AnyDataFrame``
Second dataframe to check
Returns
-------
OrderedSet
Set of columns that are shared between the two dataframes
"""
col1 = fa.get_column_names(df1)
col2 = fa.get_column_names(df2)
return OrderedSet(col1) & OrderedSet(col2)


def all_columns_match(df1: AnyDataFrame, df2: AnyDataFrame):
"""Whether the columns all match in the dataframes
Parameters
----------
df1 : ``AnyDataFrame``
First dataframe to check
df2 : ``AnyDataFrame``
Second dataframe to check
Returns
-------
bool
Boolean indicating whether the columns all match in the dataframes
"""
return unq_columns(df1, df2) == unq_columns(df2, df1) == set()


def is_match(
df1: AnyDataFrame,
df2: AnyDataFrame,
@@ -469,8 +532,8 @@ def _distributed_compare(
assert hash_cols in tdf1.schema, f"{hash_cols} not found in {tdf1.schema}"
assert hash_cols in tdf2.schema, f"{hash_cols} not found in {tdf2.schema}"

df1_cols = tdf1.schema.names
df2_cols = tdf2.schema.names
df1_schema = tdf1.schema
df2_schema = tdf2.schema
str_cols = set(f.name for f in tdf1.schema.fields if pa.types.is_string(f.type))
bucket = (
parallelism if parallelism is not None else fa.get_current_parallelism() * 2
@@ -508,17 +571,19 @@ def _serialize(dfs: Iterable[pd.DataFrame], left: bool) -> Iterable[Dict[str, An
distinct=False,
)

def _comp(df: List[Dict[str, Any]]) -> List[List[Any]]:
df1 = (
pd.concat([pickle.loads(r["data"]) for r in df if r["left"]])
.sort_values(df1_cols)
.reset_index(drop=True)
)
df2 = (
pd.concat([pickle.loads(r["data"]) for r in df if not r["left"]])
.sort_values(df2_cols)
.reset_index(drop=True)
def _deserialize(
df: List[Dict[str, Any]], left: bool, schema: Schema
) -> pd.DataFrame:
arr = [pickle.loads(r["data"]) for r in df if r["left"] == left]
if len(arr) > 0:
return pd.concat(arr).sort_values(schema.names).reset_index(drop=True)
return pd.DataFrame(
{k: pd.Series(dtype=v) for k, v in schema.pandas_dtype.items()}
)

def _comp(df: List[Dict[str, Any]]) -> List[List[Any]]:
df1 = _deserialize(df, True, df1_schema)
df2 = _deserialize(df, False, df2_schema)
comp = Compare(
df1=df1,
df2=df2,
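The refactor shown above replaces the inline pd.concat calls in _comp with a shared _deserialize helper, which also covers the case where a bucket contains rows from only one side: it then returns an empty DataFrame built from the saved schema so column dtypes are preserved. A standalone sketch of that empty-frame pattern, using an illustrative dtype mapping in place of schema.pandas_dtype from the diff:

import pandas as pd

# Illustrative stand-in for the schema.pandas_dtype mapping used in the diff above
pandas_dtype = {"id": "int64", "value": "float64", "name": "object"}

# An empty frame that still carries the expected column dtypes
empty = pd.DataFrame({k: pd.Series(dtype=v) for k, v in pandas_dtype.items()})
print(empty.dtypes)  # id int64, value float64, name object
print(len(empty))    # 0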
17 changes: 8 additions & 9 deletions docs/source/developer_instructions.rst
@@ -46,8 +46,8 @@ Run ``python -m pytest`` to run all unittests defined in the subfolder
Management of Requirements
--------------------------

Requirements of the project should be added to ``requirements.txt``. Optional requirements used only for testing,
documentation, or code quality are added to ``setup.py`` and ``EXTRAS_REQUIRE``
Requirements of the project should be added to ``pyproject.toml``. Optional requirements used only for testing,
documentation, or code quality are added to ``pyproject.toml`` in the ``project.optional-dependencies`` section.



@@ -57,16 +57,16 @@ edgetest
edgetest is a utility to help keep requirements up to date and ensure a subset of testing requirements still work.
More on edgetest `here <https://github.com/capitalone/edgetest>`_.

The ``setup.cfg`` has configuration details on how to run edgetest. This process can be automated via GitHub Actions.
The ``pyproject.toml`` has configuration details on how to run edgetest. This process can be automated via GitHub Actions.
(A future addition, which will come soon).

In order to execute edgetest locally you can run the following after installing ``edgetest``:

.. code-block:: bash
edgetest -c setup.cfg -r requirements.txt --export
edgetest -c pyproject.toml --export
This should return output like the following and also update ``requirements.txt``:
This should return output like the following and also update ``pyproject.toml``:

.. code-block:: bash
@@ -77,7 +77,6 @@ This should return output like the following and also updating ``requirements.tx
core True pandas 1.3.5
core True PyYAML 6.0
============= =============== =================== =================
No PEP-517 style requirements in setup.cfg to update. Updating requirements.txt
@@ -110,13 +109,13 @@ Generating distribution archives (PyPI)
After each release the package will need to be uploaded to PyPI. The instructions below are taken
from `packaging.python.org <https://packaging.python.org/tutorials/packaging-projects/#generating-distribution-archives>`_

Update / Install ``setuptools``, ``wheel``, and ``twine``::
Update / Install ``build``, ``wheel``, and ``twine``::

pip install --upgrade setuptools wheel twine
pip install --upgrade build wheel twine

Generate distributions::

python setup.py sdist bdist_wheel
python -m build

Under the ``dist`` folder you should have something as follows::

114 changes: 114 additions & 0 deletions pyproject.toml
@@ -0,0 +1,114 @@
[project]
name = "datacompy"
description = "Dataframe comparison in Python"
readme = "README.md"
authors = [
{ name="Ian Robertson" },
{ name="Dan Coates" },
{ name="Faisal Dosani", email="[email protected]" },
]
maintainers = [
{ name="Faisal Dosani", email="[email protected]" }
]
license = {text = "Apache Software License"}
dependencies = [
"pandas<=2.0.2,>=0.25.0",
"numpy<=1.24.3,>=1.22.0",
"ordered-set<=4.1.0,>=4.0.2",
"fugue<=0.9.0,>=0.8.6",
]
requires-python = ">=3.8.0"
classifiers = [
"Intended Audience :: Developers",
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
]

dynamic = ["version"]

[project.urls]
Homepage = "https://github.com/capitalone/datacompy"
Documentation = "https://capitalone.github.io/datacompy/"
Repository = "https://github.com/capitalone/datacompy.git"
"Bug Tracker" = "https://github.com/capitalone/datacompy/issues"
"Source Code" = "https://github.com/capitalone/datacompy"

[tool.setuptools]
packages = ["datacompy"]
zip-safe = false
include-package-data = true

[tool.setuptools.package-data]
"*" = ["templates/*.txt"]

[tool.setuptools.dynamic]
version = {attr = "datacompy.__version__"}

[tool.distutils.bdist_wheel]
python-tag = "py3"

[project.optional-dependencies]
duckdb = ["fugue[duckdb]"]
polars = ["fugue[polars]"]
spark = ["fugue[spark]"]
dask = ["fugue[dask]"]
ray = ["fugue[ray]"]
docs = [
"sphinx",
"furo",
"myst-parser",
]
tests = [
"pytest",
"pytest-cov",
"pytest-spark",
"fugue[polars,duckdb,spark]",
]
qa = [
"pre-commit",
"black",
"isort",
]
build = [
"build",
"twine",
"wheel",
]
edgetest = [
"edgetest",
"edgetest-conda",
]
dev = [
"datacompy[duckdb]",
"datacompy[polars]",
"datacompy[spark]",
"datacompy[docs]",
"datacompy[tests]",
"datacompy[qa]",
"datacompy[build]",
]

[isort]
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
line_length = 88

[edgetest.envs.core]
python_version = "3.9"
conda_install = ["openjdk=8"]
extras = ["dev"]
command = "pytest tests -m 'not integration'"
upgrade = [
"pandas",
"numpy",
"ordered-set",
"fugue",
]
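With the version now resolved dynamically from ``datacompy.__version__`` and the dependency pins living only in ``pyproject.toml``, one way to inspect them after installation is through importlib.metadata. A sketch, assuming datacompy 0.10.3 is installed in the current environment:

from importlib.metadata import requires, version

print(version("datacompy"))  # expected to print 0.10.3 for this release

# Core pins plus markers for optional extras such as [spark] or [docs] declared above
for req in requires("datacompy") or []:
    print(req)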
20 changes: 0 additions & 20 deletions requirements.txt

This file was deleted.

