From 3a5bdd18a7d088c810892afc4d531e9aa8b6f1af Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Fri, 23 Sep 2022 11:38:34 -0400 Subject: [PATCH 01/25] DOC --- CHANGELOG.md | 4 ++++ ci/gpu/build.sh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 725c0f56..76e28046 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# dask-cuda 22.12.00 (Date TBD) + +Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.12.00a for the latest changes to this development branch. + # dask-cuda 22.10.00 (Date TBD) Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.10.00a for the latest changes to this development branch. diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 36816e3a..234cc617 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -26,7 +26,7 @@ cd "$WORKSPACE" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` export UCX_PATH=$CONDA_PREFIX -export UCXPY_VERSION=0.28.* +export UCXPY_VERSION=0.29.* unset GIT_DESCRIBE_TAG # Enable NumPy's __array_function__ protocol (needed for NumPy 1.16.x, From 38ac04f320dcb6413b8d007304fb3ac0847cf12f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 4 Oct 2022 09:42:41 -0700 Subject: [PATCH 02/25] unpin dask and distributed for development --- ci/cpu/build.sh | 18 +++++++++++++++--- ci/gpu/build.sh | 2 +- requirements.txt | 4 ++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index e468b1cb..c029956a 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -21,7 +21,10 @@ export GPUCI_CONDA_RETRY_SLEEP=30 # Whether to keep `dask/label/dev` channel in the env. If INSTALL_DASK_MAIN=0, # `dask/label/dev` channel is removed. -export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 + +# Dask version to install when `INSTALL_DASK_MAIN=0` +export DASK_STABLE_VERSION="2022.9.2" # Switch to project root; also root of repo checkout cd "$WORKSPACE" @@ -69,8 +72,17 @@ conda list --show-channel-urls # FIX Added to deal with Anancoda SSL verification issues during conda builds conda config --set ssl_verify False -pip install git+https://github.com/dask/dask.git@2022.9.2 -pip install git+https://github.com/dask/distributed.git@2022.9.2 +# Install latest nightly version for dask and distributed if needed +if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then + gpuci_logger "Installing dask and distributed from dask nightly channel" + gpuci_mamba_retry install -c dask/label/dev \ + "dask/label/dev::dask" \ + "dask/label/dev::distributed" +else + gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall +fi + ################################################################################ # BUILD - Package builds diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index abfe20c2..b88e0ef9 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,7 +35,7 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 # Install dask and distributed from main branch. Usually needed during # development time and disabled before a new dask-cuda release. 
-export INSTALL_DASK_MAIN=0 +export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` export DASK_STABLE_VERSION="2022.9.2" diff --git a/requirements.txt b/requirements.txt index 3d673a95..7b40e89d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -dask==2022.9.2 -distributed==2022.9.2 +dask>=2022.9.2 +distributed>=2022.9.2 pynvml>=11.0.0 numpy>=1.16.0 numba>=0.54 From 23460f6f7300202a2d3a25ffedf6884755fe6fbc Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 10 Oct 2022 10:36:54 +0200 Subject: [PATCH 03/25] Merge 22.08 into 22.10 (#1010) Closes https://github.com/rapidsai/dask-cuda/pull/978 Authors: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/dask-cuda/pull/1010 --- CHANGELOG.md | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 76e28046..0bd48400 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,44 @@ Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.12.00a for the Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.10.00a for the latest changes to this development branch. -# dask-cuda 22.08.00 (Date TBD) +# dask-cuda 22.08.00 (17 Aug 2022) -Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.08.00a for the latest changes to this development branch. +## 🚨 Breaking Changes + +- Fix useless property ([#944](https://github.com/rapidsai/dask-cuda/pull/944)) [@wence-](https://github.com/wence-) + +## 🐛 Bug Fixes + +- Fix `distributed` error related to `loop_in_thread` ([#963](https://github.com/rapidsai/dask-cuda/pull/963)) [@galipremsagar](https://github.com/galipremsagar) +- Add `__rmatmul__` to `ProxyObject` ([#960](https://github.com/rapidsai/dask-cuda/pull/960)) [@jakirkham](https://github.com/jakirkham) +- Always use versioneer command classes in setup.py ([#948](https://github.com/rapidsai/dask-cuda/pull/948)) [@wence-](https://github.com/wence-) +- Do not dispatch removed `cudf.Frame._index` object ([#947](https://github.com/rapidsai/dask-cuda/pull/947)) [@pentschev](https://github.com/pentschev) +- Fix useless property ([#944](https://github.com/rapidsai/dask-cuda/pull/944)) [@wence-](https://github.com/wence-) +- LocalCUDACluster's memory limit: `None` means no limit ([#943](https://github.com/rapidsai/dask-cuda/pull/943)) [@madsbk](https://github.com/madsbk) +- ProxyManager: support `memory_limit=None` ([#941](https://github.com/rapidsai/dask-cuda/pull/941)) [@madsbk](https://github.com/madsbk) +- Remove deprecated `loop` kwarg to `Nanny` in `CUDAWorker` ([#934](https://github.com/rapidsai/dask-cuda/pull/934)) [@pentschev](https://github.com/pentschev) +- Import `cleanup` fixture in `test_dask_cuda_worker.py` ([#924](https://github.com/rapidsai/dask-cuda/pull/924)) [@pentschev](https://github.com/pentschev) + +## 📖 Documentation + +- Switch docs to use common `js` & `css` code ([#967](https://github.com/rapidsai/dask-cuda/pull/967)) [@galipremsagar](https://github.com/galipremsagar) +- Switch `language` from `None` to `"en"` in docs build ([#939](https://github.com/rapidsai/dask-cuda/pull/939)) [@galipremsagar](https://github.com/galipremsagar) + +## 🚀 New Features + +- Add communications bandwidth to benchmarks ([#938](https://github.com/rapidsai/dask-cuda/pull/938)) [@pentschev](https://github.com/pentschev) + +## 🛠️ Improvements + +- 
Pin `dask` & `distributed` for release ([#965](https://github.com/rapidsai/dask-cuda/pull/965)) [@galipremsagar](https://github.com/galipremsagar) +- Test memory_limit=None for CUDAWorker ([#946](https://github.com/rapidsai/dask-cuda/pull/946)) [@wence-](https://github.com/wence-) +- benchmarks: Record total number of workers in dataframe ([#945](https://github.com/rapidsai/dask-cuda/pull/945)) [@wence-](https://github.com/wence-) +- Benchmark refactoring: tidy data and multi-node capability via `--scheduler-file` ([#940](https://github.com/rapidsai/dask-cuda/pull/940)) [@wence-](https://github.com/wence-) +- Add util functions to simplify printing benchmarks results ([#937](https://github.com/rapidsai/dask-cuda/pull/937)) [@pentschev](https://github.com/pentschev) +- Add --multiprocessing-method option to benchmarks ([#933](https://github.com/rapidsai/dask-cuda/pull/933)) [@wence-](https://github.com/wence-) +- Remove click pinning ([#932](https://github.com/rapidsai/dask-cuda/pull/932)) [@charlesbluca](https://github.com/charlesbluca) +- Remove compiler variables ([#929](https://github.com/rapidsai/dask-cuda/pull/929)) [@ajschmidt8](https://github.com/ajschmidt8) +- Unpin `dask` & `distributed` for development ([#927](https://github.com/rapidsai/dask-cuda/pull/927)) [@galipremsagar](https://github.com/galipremsagar) # dask-cuda 22.06.00 (7 Jun 2022) From 76453ab1b643d2abc46961bdfa2a19355d8b92a2 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 12 Oct 2022 18:44:58 +0200 Subject: [PATCH 04/25] Merge 22.10 into 22.12 (#1016) Closes #1015 Authors: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1016 --- CHANGELOG.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bd48400..44cbac4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,29 @@ Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.12.00a for the latest changes to this development branch. -# dask-cuda 22.10.00 (Date TBD) +# dask-cuda 22.10.00 (12 Oct 2022) -Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.10.00a for the latest changes to this development branch. 
+## 🐛 Bug Fixes + +- Revert "Update rearrange_by_column patch for explicit comms" ([#1001](https://github.com/rapidsai/dask-cuda/pull/1001)) [@rjzamora](https://github.com/rjzamora) +- Address CI failures caused by upstream distributed and cupy changes ([#993](https://github.com/rapidsai/dask-cuda/pull/993)) [@rjzamora](https://github.com/rjzamora) +- DeviceSerialized.__reduce_ex__: convert frame to numpy arrays ([#977](https://github.com/rapidsai/dask-cuda/pull/977)) [@madsbk](https://github.com/madsbk) + +## 📖 Documentation + +- Remove line-break that's breaking link ([#982](https://github.com/rapidsai/dask-cuda/pull/982)) [@ntabris](https://github.com/ntabris) +- Dask-cuda best practices ([#976](https://github.com/rapidsai/dask-cuda/pull/976)) [@quasiben](https://github.com/quasiben) + +## 🚀 New Features + +- Add Groupby benchmark ([#979](https://github.com/rapidsai/dask-cuda/pull/979)) [@rjzamora](https://github.com/rjzamora) + +## 🛠️ Improvements + +- Pin `dask` and `distributed` for release ([#1003](https://github.com/rapidsai/dask-cuda/pull/1003)) [@galipremsagar](https://github.com/galipremsagar) +- Update rearrange_by_column patch for explicit comms ([#992](https://github.com/rapidsai/dask-cuda/pull/992)) [@rjzamora](https://github.com/rjzamora) +- benchmarks: Add option to suppress output of point to point data ([#985](https://github.com/rapidsai/dask-cuda/pull/985)) [@wence-](https://github.com/wence-) +- Unpin `dask` and `distributed` for development ([#971](https://github.com/rapidsai/dask-cuda/pull/971)) [@galipremsagar](https://github.com/galipremsagar) # dask-cuda 22.08.00 (17 Aug 2022) From acc96a5dae9e99a107fdeb9d49611bc45706f707 Mon Sep 17 00:00:00 2001 From: Benjamin Zaitlen Date: Tue, 18 Oct 2022 13:55:41 -0400 Subject: [PATCH 05/25] Add feature to get cluster configuration (#1006) Early attempt at resolving https://github.com/rapidsai/dask-cuda/issues/989 Screen Shot 2022-10-04 at 6 37 20 PM In an ideal world, we could call `dask.config.config` or `dask.config.get(...)` and that will have been updated with live values. Unfortunately, that is not the case. Config options are set in all manner of places: cli, env, yaml, kwargs, etc and we are not consistent about storing them. For example, in this PR `self.device_memory_limit = device_memory_limit` is added to `device_host_file.py` just so `client.run` could be accessed from within the worker. This change employs a hodge-podge of techniques to get at options we _think_ may be potentially useful for debugging but in no way is this exhaustive. 
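For reference, a minimal usage sketch of the helpers introduced here (`get_cluster_configuration` / `print_cluster_config`); it assumes a synchronous client with at least one worker, and `print_cluster_config` additionally needs the optional `rich` dependency:

```python
# Usage sketch for the helpers added in this PR (assumes a synchronous client;
# `print_cluster_config` requires the optional `rich` package).
from dask.distributed import Client

from dask_cuda import LocalCUDACluster
from dask_cuda.utils import get_cluster_configuration, print_cluster_config

if __name__ == "__main__":
    with LocalCUDACluster(n_workers=1) as cluster:
        with Client(cluster) as client:
            config = get_cluster_configuration(client)  # plain dict of live settings
            print(config["protocol"], config["nworkers"], config["jit-unspill"])
            print_cluster_config(client)  # pretty-printed table via rich
```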
Authors: - Benjamin Zaitlen (https://github.com/quasiben) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1006 --- dask_cuda/cli/dask_config.py | 95 +++++++++++++ dask_cuda/cli/dask_cuda_worker.py | 2 +- dask_cuda/tests/test_local_cuda_cluster.py | 41 +++++- dask_cuda/utils.py | 151 ++++++++++++++++++++- setup.py | 1 + 5 files changed, 286 insertions(+), 4 deletions(-) create mode 100755 dask_cuda/cli/dask_config.py diff --git a/dask_cuda/cli/dask_config.py b/dask_cuda/cli/dask_config.py new file mode 100755 index 00000000..51c9aa2b --- /dev/null +++ b/dask_cuda/cli/dask_config.py @@ -0,0 +1,95 @@ +from __future__ import absolute_import, division, print_function + +import logging + +import click + +from distributed import Client +from distributed.preloading import validate_preload_argv +from distributed.security import Security + +from ..utils import print_cluster_config + +logger = logging.getLogger(__name__) + + +pem_file_option_type = click.Path(exists=True, resolve_path=True) + + +@click.command(context_settings=dict(ignore_unknown_options=True)) +@click.argument("scheduler", type=str, required=False) +@click.argument( + "preload_argv", nargs=-1, type=click.UNPROCESSED, callback=validate_preload_argv +) +@click.option( + "--scheduler-file", + type=str, + default=None, + help="""Filename to JSON encoded scheduler information. To be used in conjunction + with the equivalent ``dask-scheduler`` option.""", +) +@click.option( + "--get-cluster-configuration", + "get_cluster_conf", + default=False, + is_flag=True, + required=False, + show_default=True, + help="""Print a table of the current cluster configuration""", +) +@click.option( + "--tls-ca-file", + type=pem_file_option_type, + default=None, + help="""CA certificate(s) file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no certificate(s).""", +) +@click.option( + "--tls-cert", + type=pem_file_option_type, + default=None, + help="""Certificate file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no certificate(s).""", +) +@click.option( + "--tls-key", + type=pem_file_option_type, + default=None, + help="""Private key file for TLS (in PEM format). Can be a string (like + ``"path/to/certs"``), or ``None`` for no private key.""", +) +def main( + scheduler, + scheduler_file, + get_cluster_conf, + tls_ca_file, + tls_cert, + tls_key, + **kwargs, +): + if tls_ca_file and tls_cert and tls_key: + security = Security( + tls_ca_file=tls_ca_file, + tls_worker_cert=tls_cert, + tls_worker_key=tls_key, + ) + else: + security = None + + if isinstance(scheduler, str) and scheduler.startswith("-"): + raise ValueError( + "The scheduler address can't start with '-'. Please check " + "your command line arguments, you probably attempted to use " + "unsupported one. Scheduler address: %s" % scheduler + ) + + if get_cluster_conf: + if scheduler_file is not None: + client = Client(scheduler_file=scheduler_file, security=security) + else: + client = Client(scheduler, security=security) + print_cluster_config(client) + + +if __name__ == "__main__": + main() diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index bd327a54..29764684 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -176,7 +176,7 @@ @click.option( "--scheduler-file", type=str, - default="", + default=None, help="""Filename to JSON encoded scheduler information. 
To be used in conjunction with the equivalent ``dask-scheduler`` option.""", ) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index db2bb49c..cd728fe3 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -12,7 +12,13 @@ from dask_cuda import CUDAWorker, LocalCUDACluster, utils from dask_cuda.initialize import initialize -from dask_cuda.utils import MockWorker, get_gpu_count_mig, get_gpu_uuid_from_index +from dask_cuda.utils import ( + MockWorker, + get_cluster_configuration, + get_gpu_count_mig, + get_gpu_uuid_from_index, + print_cluster_config, +) @gen_test(timeout=20) @@ -350,7 +356,9 @@ async def test_gpu_uuid(): async def test_rmm_track_allocations(): rmm = pytest.importorskip("rmm") async with LocalCUDACluster( - rmm_pool_size="2GB", asynchronous=True, rmm_track_allocations=True + rmm_pool_size="2GB", + asynchronous=True, + rmm_track_allocations=True, ) as cluster: async with Client(cluster, asynchronous=True) as client: memory_resource_type = await client.run( @@ -364,3 +372,32 @@ async def test_rmm_track_allocations(): ) for v in memory_resource_upstream_type.values(): assert v is rmm.mr.PoolMemoryResource + + +@gen_test(timeout=20) +async def test_get_cluster_configuration(): + async with LocalCUDACluster( + rmm_pool_size="2GB", + device_memory_limit="30B", + CUDA_VISIBLE_DEVICES="0", + scheduler_port=0, + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + ret = await get_cluster_configuration(client) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["jit-unspill"] is False + assert ret["device-memory-limit"] == 30 + + +def test_print_cluster_config(capsys): + with LocalCUDACluster( + n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol="ucx" + ) as cluster: + with Client(cluster) as client: + print_cluster_config(client) + captured = capsys.readouterr() + assert "Dask Cluster Configuration" in captured.out + assert "ucx" in captured.out + assert "1 B" in captured.out + assert "[plugin]" in captured.out diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index f5be2b24..abdc3ce6 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -1,9 +1,11 @@ import importlib import math +import operator import os import time import warnings from contextlib import suppress +from functools import singledispatch from multiprocessing import cpu_count import numpy as np @@ -13,8 +15,11 @@ import dask import distributed # noqa: required for dask.config.get("distributed.comm.ucx") from dask.config import canonical_name -from dask.utils import parse_bytes +from dask.utils import format_bytes, parse_bytes from distributed import Worker, wait +from distributed.comm import parse_address + +from .proxify_host_file import ProxifyHostFile try: from nvtx import annotate as nvtx_annotate @@ -649,3 +654,147 @@ def get_gpu_uuid_from_index(device_index=0): pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(device_index) return pynvml.nvmlDeviceGetUUID(handle).decode("utf-8") + + +def get_worker_config(dask_worker): + # assume homogenous cluster + plugin_vals = dask_worker.plugins.values() + ret = {} + + # device and host memory configuration + for p in plugin_vals: + ret[f"[plugin] {type(p).__name__}"] = { + v: getattr(p, v) + for v in dir(p) + if not (v.startswith("_") or v in {"setup", "cores"}) + } + + for mem in [ + "memory_limit", + "memory_pause_fraction", + "memory_spill_fraction", + 
"memory_target_fraction", + ]: + ret[mem] = getattr(dask_worker.memory_manager, mem) + + # jit unspilling set + ret["jit-unspill"] = isinstance(dask_worker.data, ProxifyHostFile) + + # get optional device-memory-limit + if ret["jit-unspill"]: + ret["device-memory-limit"] = dask_worker.data.manager._device_memory_limit + else: + has_device = hasattr(dask_worker.data, "device_buffer") + if has_device: + ret["device-memory-limit"] = dask_worker.data.device_buffer.n + + # using ucx ? + scheme, loc = parse_address(dask_worker.scheduler.address) + ret["protocol"] = scheme + if scheme == "ucx": + import ucp + + ret["ucx-transports"] = ucp.get_active_transports() + + # comm timeouts + ret["distributed.comm.timeouts"] = dask.config.get("distributed.comm.timeouts") + + return ret + + +async def get_scheduler_configuration(client): + worker_ttl = await client.run_on_scheduler( + lambda dask_scheduler: dask_scheduler.worker_ttl + ) + extensions = list( + await client.run_on_scheduler( + lambda dask_scheduler: dask_scheduler.extensions.keys() + ) + ) + ret = {} + ret["distributed.scheduler.worker-ttl"] = worker_ttl + ret["active-extensions"] = extensions + + return ret + + +async def _get_cluster_configuration(client): + worker_config = await client.run(get_worker_config) + ret = await get_scheduler_configuration(client) + + # does the cluster have any workers ? + if worker_config: + w = list(worker_config.values())[0] + ret.update(w) + info = client.scheduler_info() + workers = info.get("workers", {}) + ret["nworkers"] = len(workers) + ret["nthreads"] = sum(w["nthreads"] for w in workers.values()) + + return ret + + +@singledispatch +def pretty_print(obj, toplevel): + from rich.pretty import Pretty + + return Pretty(obj) + + +@pretty_print.register(str) +def pretty_print_str(obj, toplevel): + from rich.markup import escape + + return escape(obj) + + +@pretty_print.register(dict) +def pretty_print_dict(obj, toplevel): + from rich.table import Table + + if not obj: + return "No known settings" + formatted_byte_keys = { + "memory_limit", + "device-memory-limit", + "initial_pool_size", + "maximum_pool_size", + } + t = Table( + show_header=toplevel, title="Dask Cluster Configuration" if toplevel else None + ) + t.add_column("Parameter", justify="left", style="bold bright_green") + t.add_column("Value", justify="left", style="bold bright_green") + for k, v in sorted(obj.items(), key=operator.itemgetter(0)): + if k in formatted_byte_keys and v is not None: + v = format_bytes(v) + # need to escape tags: [] + # https://rich.readthedocs.io/en/stable/markup.html?highlight=escape#escaping + t.add_row(pretty_print(k, False), pretty_print(v, False)) + return t + + +def print_cluster_config(client): + """print current Dask cluster configuration""" + if client.asynchronous: + print("Printing cluster configuration works only with synchronous Dask clients") + + data = get_cluster_configuration(client) + try: + from rich.console import Console + except ModuleNotFoundError as e: + error_msg = ( + "Please install rich `python -m pip install rich` " + "to print a table of the current Dask Cluster Configuration" + ) + raise ModuleNotFoundError(error_msg) from e + + formatted = pretty_print(data, True) + Console().print(formatted) + + +def get_cluster_configuration(client): + data = client.sync( + _get_cluster_configuration, client=client, asynchronous=client.asynchronous + ) + return data diff --git a/setup.py b/setup.py index 03511de3..69b825f9 100644 --- a/setup.py +++ b/setup.py @@ -58,5 +58,6 @@ def get_versions(): 
entry_points=""" [console_scripts] dask-cuda-worker=dask_cuda.cli.dask_cuda_worker:go + dask-config=dask_cuda.cli.dask_config:go """, ) From 713df2da37ef4dd9331341fdfaa237b4feed8c1c Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Thu, 20 Oct 2022 12:44:35 +0200 Subject: [PATCH 06/25] Allow specifying fractions as RMM pool initial/maximum size (#1021) This is already supported by `memory_limit`/`device_memory_limit`, and this has been raised in https://github.com/rapidsai/dask-cuda/issues/1020 during discussions on how to make Dask Profiles usable in Dask-CUDA. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1021 --- dask_cuda/cli/dask_cuda_worker.py | 17 ++--- dask_cuda/cuda_worker.py | 5 -- dask_cuda/local_cuda_cluster.py | 18 ++---- dask_cuda/tests/test_dask_cuda_worker.py | 74 ++++++++++++++++++++++ dask_cuda/tests/test_local_cuda_cluster.py | 29 +++++++++ dask_cuda/tests/test_utils.py | 3 + dask_cuda/utils.py | 31 +++++++-- 7 files changed, 147 insertions(+), 30 deletions(-) diff --git a/dask_cuda/cli/dask_cuda_worker.py b/dask_cuda/cli/dask_cuda_worker.py index 29764684..62faeddb 100755 --- a/dask_cuda/cli/dask_cuda_worker.py +++ b/dask_cuda/cli/dask_cuda_worker.py @@ -67,7 +67,8 @@ "--rmm-pool-size", default=None, help="""RMM pool size to initialize each worker with. Can be an integer (bytes), - string (like ``"5GB"`` or ``"5000M"``), or ``None`` to disable RMM pools. + float (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or + ``None`` to disable RMM pools. .. note:: This size is a per-worker configuration, and not cluster-wide.""", @@ -75,14 +76,14 @@ @click.option( "--rmm-maximum-pool-size", default=None, - help="""When ``--rmm-pool-size`` is specified, this argument indicates the maximum pool size. - Can be an integer (bytes), string (like ``"5GB"`` or ``"5000M"``) or ``None``. - By default, the total available memory on the GPU is used. - ``rmm_pool_size`` must be specified to use RMM pool and - to set the maximum pool size. + help="""When ``--rmm-pool-size`` is specified, this argument indicates the maximum + pool size. Can be an integer (bytes), float (fraction of total device memory), + string (like ``"5GB"`` or ``"5000M"``) or ``None``. By default, the total available + memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool and + to set the maximum pool size. - .. note:: - This size is a per-worker configuration, and not cluster-wide.""", + .. 
note:: + This size is a per-worker configuration, and not cluster-wide.""", ) @click.option( "--rmm-managed-memory/--no-rmm-managed-memory", diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index b1bf6e5b..fd85006f 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -8,7 +8,6 @@ from toolz import valmap import dask -from dask.utils import parse_bytes from distributed import Nanny from distributed.core import Server from distributed.deploy.cluster import Cluster @@ -139,10 +138,6 @@ def del_pid_file(): "RMM pool and managed memory are incompatible with asynchronous " "allocator" ) - if rmm_pool_size is not None: - rmm_pool_size = parse_bytes(rmm_pool_size) - if rmm_maximum_pool_size is not None: - rmm_maximum_pool_size = parse_bytes(rmm_maximum_pool_size) else: if enable_nvlink: diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index d604294a..f9054899 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -3,7 +3,6 @@ import warnings import dask -from dask.utils import parse_bytes from distributed import LocalCluster, Nanny, Worker from distributed.worker_memory import parse_memory_limit @@ -100,18 +99,19 @@ class LocalCUDACluster(LocalCluster): Set environment variables to enable UCX RDMA connection manager support, requires ``protocol="ucx"`` and ``enable_infiniband=True``. rmm_pool_size : int, str or None, default None - RMM pool size to initialize each worker with. Can be an integer (bytes), string - (like ``"5GB"`` or ``"5000M"``), or ``None`` to disable RMM pools. + RMM pool size to initialize each worker with. Can be an integer (bytes), float + (fraction of total device memory), string (like ``"5GB"`` or ``"5000M"``), or + ``None`` to disable RMM pools. .. note:: This size is a per-worker configuration, and not cluster-wide. rmm_maximum_pool_size : int, str or None, default None When ``rmm_pool_size`` is set, this argument indicates the maximum pool size. - Can be an integer (bytes), string (like ``"5GB"`` or ``"5000M"``) or ``None``. - By default, the total available memory on the GPU is used. - ``rmm_pool_size`` must be specified to use RMM pool and - to set the maximum pool size. + Can be an integer (bytes), float (fraction of total device memory), string + (like ``"5GB"`` or ``"5000M"``) or ``None``. By default, the total available + memory on the GPU is used. ``rmm_pool_size`` must be specified to use RMM pool + and to set the maximum pool size. .. note:: This size is a per-worker configuration, and not cluster-wide. 
@@ -256,10 +256,6 @@ def __init__( "RMM pool and managed memory are incompatible with asynchronous " "allocator" ) - if self.rmm_pool_size is not None: - self.rmm_pool_size = parse_bytes(self.rmm_pool_size) - if self.rmm_maximum_pool_size is not None: - self.rmm_maximum_pool_size = parse_bytes(self.rmm_maximum_pool_size) else: if enable_nvlink: warnings.warn( diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 238caa03..705a2150 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -13,6 +13,8 @@ from distributed.utils_test import cleanup, loop, loop_in_thread, popen # noqa: F401 from dask_cuda.utils import ( + get_cluster_configuration, + get_device_total_memory, get_gpu_count_mig, get_gpu_uuid_from_index, get_n_gpus, @@ -323,3 +325,75 @@ def test_rmm_track_allocations(loop): # noqa: F811 ) for v in memory_resource_upstream_type.values(): assert v is rmm.mr.PoolMemoryResource + + +@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) +def test_get_cluster_configuration(loop): # noqa: F811 + pytest.importorskip("rmm") + with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen( + [ + "dask-cuda-worker", + "127.0.0.1:9369", + "--host", + "127.0.0.1", + "--device-memory-limit", + "30 B", + "--rmm-pool-size", + "2 GB", + "--rmm-maximum-pool-size", + "3 GB", + "--no-dashboard", + "--rmm-track-allocations", + ] + ): + with Client("127.0.0.1:9369", loop=loop) as client: + assert wait_workers(client, n_gpus=get_n_gpus()) + + ret = get_cluster_configuration(client) + wait(ret) + assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000 + assert ret["jit-unspill"] is False + assert ret["device-memory-limit"] == 30 + + +@patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) +def test_worker_fraction_limits(loop): # noqa: F811 + pytest.importorskip("rmm") + with popen(["dask-scheduler", "--port", "9369", "--no-dashboard"]): + with popen( + [ + "dask-cuda-worker", + "127.0.0.1:9369", + "--host", + "127.0.0.1", + "--device-memory-limit", + "0.1", + "--rmm-pool-size", + "0.2", + "--rmm-maximum-pool-size", + "0.3", + "--no-dashboard", + "--rmm-track-allocations", + ] + ): + with Client("127.0.0.1:9369", loop=loop) as client: + assert wait_workers(client, n_gpus=get_n_gpus()) + + device_total_memory = client.run(get_device_total_memory) + wait(device_total_memory) + _, device_total_memory = device_total_memory.popitem() + + ret = get_cluster_configuration(client) + wait(ret) + + assert ret["device-memory-limit"] == int(device_total_memory * 0.1) + assert ( + ret["[plugin] RMMSetup"]["initial_pool_size"] + == (device_total_memory * 0.2) // 256 * 256 + ) + assert ( + ret["[plugin] RMMSetup"]["maximum_pool_size"] + == (device_total_memory * 0.3) // 256 * 256 + ) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index cd728fe3..79e3cb7d 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -15,6 +15,7 @@ from dask_cuda.utils import ( MockWorker, get_cluster_configuration, + get_device_total_memory, get_gpu_count_mig, get_gpu_uuid_from_index, print_cluster_config, @@ -378,6 +379,7 @@ async def test_rmm_track_allocations(): async def test_get_cluster_configuration(): async with LocalCUDACluster( rmm_pool_size="2GB", + rmm_maximum_pool_size="3GB", device_memory_limit="30B", CUDA_VISIBLE_DEVICES="0", 
scheduler_port=0, @@ -386,11 +388,38 @@ async def test_get_cluster_configuration(): async with Client(cluster, asynchronous=True) as client: ret = await get_cluster_configuration(client) assert ret["[plugin] RMMSetup"]["initial_pool_size"] == 2000000000 + assert ret["[plugin] RMMSetup"]["maximum_pool_size"] == 3000000000 assert ret["jit-unspill"] is False assert ret["device-memory-limit"] == 30 +@gen_test(timeout=20) +async def test_worker_fraction_limits(): + async with LocalCUDACluster( + device_memory_limit=0.1, + rmm_pool_size=0.2, + rmm_maximum_pool_size=0.3, + CUDA_VISIBLE_DEVICES="0", + scheduler_port=0, + asynchronous=True, + ) as cluster: + async with Client(cluster, asynchronous=True) as client: + device_total_memory = await client.run(get_device_total_memory) + _, device_total_memory = device_total_memory.popitem() + ret = await get_cluster_configuration(client) + assert ret["device-memory-limit"] == int(device_total_memory * 0.1) + assert ( + ret["[plugin] RMMSetup"]["initial_pool_size"] + == (device_total_memory * 0.2) // 256 * 256 + ) + assert ( + ret["[plugin] RMMSetup"]["maximum_pool_size"] + == (device_total_memory * 0.3) // 256 * 256 + ) + + def test_print_cluster_config(capsys): + pytest.importorskip("rich") with LocalCUDACluster( n_workers=1, device_memory_limit="1B", jit_unspill=True, protocol="ucx" ) as cluster: diff --git a/dask_cuda/tests/test_utils.py b/dask_cuda/tests/test_utils.py index 30ce6c4b..ca17c097 100644 --- a/dask_cuda/tests/test_utils.py +++ b/dask_cuda/tests/test_utils.py @@ -225,6 +225,9 @@ def test_parse_device_memory_limit(): assert parse_device_memory_limit("auto") == total assert parse_device_memory_limit(0.8) == int(total * 0.8) + assert parse_device_memory_limit(0.8, alignment_size=256) == int( + total * 0.8 // 256 * 256 + ) assert parse_device_memory_limit(1000000000) == 1000000000 assert parse_device_memory_limit("1GB") == 1000000000 diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index abdc3ce6..23179b82 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -80,6 +80,15 @@ def setup(self, worker=None): pool_allocator = False if self.initial_pool_size is None else True + if self.initial_pool_size is not None: + self.initial_pool_size = parse_device_memory_limit( + self.initial_pool_size, alignment_size=256 + ) + if self.maximum_pool_size is not None: + self.maximum_pool_size = parse_device_memory_limit( + self.maximum_pool_size, alignment_size=256 + ) + rmm.reinitialize( pool_allocator=pool_allocator, managed_memory=self.managed_memory, @@ -573,7 +582,7 @@ def nvml_device_index(i, CUDA_VISIBLE_DEVICES): raise ValueError("`CUDA_VISIBLE_DEVICES` must be `str` or `list`") -def parse_device_memory_limit(device_memory_limit, device_index=0): +def parse_device_memory_limit(device_memory_limit, device_index=0, alignment_size=1): """Parse memory limit to be used by a CUDA device. Parameters @@ -585,6 +594,9 @@ def parse_device_memory_limit(device_memory_limit, device_index=0): device_index: int or str The index or UUID of the device from which to obtain the total memory amount. Default: 0. + alignment_size: int + Number of bytes of alignment to use, i.e., allocation must be a multiple of + that size. RMM pool requires 256 bytes alignment. 
Examples -------- @@ -598,18 +610,25 @@ def parse_device_memory_limit(device_memory_limit, device_index=0): >>> parse_device_memory_limit("1GB") 1000000000 """ - if any(device_memory_limit == v for v in [0, "0", None, "auto"]): - return get_device_total_memory(device_index) + + def _align(size, alignment_size): + return size // alignment_size * alignment_size + + if device_memory_limit in {0, "0", None, "auto"}: + return _align(get_device_total_memory(device_index), alignment_size) with suppress(ValueError, TypeError): device_memory_limit = float(device_memory_limit) if isinstance(device_memory_limit, float) and device_memory_limit <= 1: - return int(get_device_total_memory(device_index) * device_memory_limit) + return _align( + int(get_device_total_memory(device_index) * device_memory_limit), + alignment_size, + ) if isinstance(device_memory_limit, str): - return parse_bytes(device_memory_limit) + return _align(parse_bytes(device_memory_limit), alignment_size) else: - return int(device_memory_limit) + return _align(int(device_memory_limit), alignment_size) class MockWorker(Worker): From e9eae43d6bc0cf58a138c1f306a058871896c1d6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 25 Oct 2022 16:22:45 +0100 Subject: [PATCH 07/25] Add benchmark option to use dask-noop (#994) To add some metrics on distributed overheads to our benchmark suite, I propose using https://github.com/gjoseph92/dask-noop/ to rewrite the compute graphs into no-ops. Now we just measure the scheduler and worker-to-worker performance on zero-sized work (or something close thereto). Interestingly the cupy-based transpose runs take _longer_ if I noopify (cc @gjoseph92 any insight here?), but I haven't done anything systematic yet. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/994 --- dask_cuda/benchmarks/common.py | 2 +- dask_cuda/benchmarks/local_cudf_groupby.py | 17 +++++++++++ dask_cuda/benchmarks/local_cudf_merge.py | 13 +++++---- dask_cuda/benchmarks/local_cudf_shuffle.py | 26 ++++++++++------- dask_cuda/benchmarks/local_cupy.py | 18 +++++++++++- .../benchmarks/local_cupy_map_overlap.py | 18 +++++++++++- dask_cuda/benchmarks/utils.py | 29 +++++++++++++++++++ dask_cuda/explicit_comms/dataframe/shuffle.py | 4 ++- 8 files changed, 108 insertions(+), 19 deletions(-) diff --git a/dask_cuda/benchmarks/common.py b/dask_cuda/benchmarks/common.py index 3864a85c..7c489d00 100644 --- a/dask_cuda/benchmarks/common.py +++ b/dask_cuda/benchmarks/common.py @@ -153,7 +153,7 @@ def run_client_from_existing_scheduler(args: Namespace, config: Config): client.shutdown() -def run_create_client(args, config): +def run_create_client(args: Namespace, config: Config): """Create a client + cluster and run Shuts down the cluster at the end of the benchmark""" diff --git a/dask_cuda/benchmarks/local_cudf_groupby.py b/dask_cuda/benchmarks/local_cudf_groupby.py index 379ff930..0a142698 100644 --- a/dask_cuda/benchmarks/local_cudf_groupby.py +++ b/dask_cuda/benchmarks/local_cudf_groupby.py @@ -11,6 +11,7 @@ from dask_cuda.benchmarks.common import Config, execute_benchmark from dask_cuda.benchmarks.utils import ( + as_noop, parse_benchmark_args, print_key_value, print_separator, @@ -20,11 +21,14 @@ def apply_groupby( df, + backend, sort=False, split_out=1, split_every=8, shuffle=None, ): + if backend == "dask-noop" and shuffle == "explicit-comms": + raise RuntimeError("dask-noop not valid for explicit-comms shuffle") 
# Handle special "explicit-comms" case config = {} if shuffle == "explicit-comms": @@ -38,6 +42,8 @@ def apply_groupby( split_every=split_every, shuffle=shuffle, ) + if backend == "dask-noop": + agg = as_noop(agg) wait(agg.persist()) return agg @@ -118,6 +124,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): print("Groupby benchmark") print_separator(separator="-") print_key_value(key="Use shuffle", value=f"{args.shuffle}") + print_key_value(key="Backend", value=f"{args.backend}") print_key_value(key="Output partitions", value=f"{args.split_out}") print_key_value(key="Input partitions", value=f"{args.in_parts}") print_key_value(key="Sort Groups", value=f"{args.sort}") @@ -150,6 +157,7 @@ def create_tidy_results(args, p2p_bw, results): configuration = { "dataframe_type": "cudf" if args.type == "gpu" else "pandas", "shuffle": args.shuffle, + "backend": args.backend, "sort": args.sort, "split_out": args.split_out, "split_every": args.split_every, @@ -232,6 +240,15 @@ def parse_args(): "type": str, "help": "Whether to use shuffle-based groupby.", }, + { + "name": "--backend", + "choices": ["dask", "dask-noop"], + "default": "dask", + "type": str, + "help": ( + "Compute engine to use, dask-noop turns the graph into a noop graph" + ), + }, { "name": [ "-t", diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index 24979606..54bac9bc 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -14,6 +14,7 @@ from dask_cuda.benchmarks.common import Config, execute_benchmark from dask_cuda.benchmarks.utils import ( + as_noop, parse_benchmark_args, print_key_value, print_separator, @@ -146,7 +147,11 @@ def merge(args, ddf1, ddf2): ddf_join = ddf1.merge(ddf2, on=["key"], how="inner", broadcast=broadcast) if args.set_index: ddf_join = ddf_join.set_index("key") + if args.backend == "dask-noop": + ddf_join = as_noop(ddf_join) + t1 = perf_counter() wait(ddf_join.persist()) + return perf_counter() - t1 def bench_once(client, args, write_profile=None): @@ -179,11 +184,9 @@ def bench_once(client, args, write_profile=None): with ctx1: with ctx2: - t1 = perf_counter() - merge(args, ddf_base, ddf_other) - t2 = perf_counter() + duration = merge(args, ddf_base, ddf_other) - return (data_processed, t2 - t1) + return (data_processed, duration) def pretty_print_results(args, address_to_index, p2p_bw, results): @@ -273,7 +276,7 @@ def parse_args(): "-b", "--backend", ], - "choices": ["dask", "explicit-comms"], + "choices": ["dask", "explicit-comms", "dask-noop"], "default": "dask", "type": str, "help": "The backend to use.", diff --git a/dask_cuda/benchmarks/local_cudf_shuffle.py b/dask_cuda/benchmarks/local_cudf_shuffle.py index 02745395..7ff099cc 100644 --- a/dask_cuda/benchmarks/local_cudf_shuffle.py +++ b/dask_cuda/benchmarks/local_cudf_shuffle.py @@ -1,6 +1,6 @@ import contextlib from collections import ChainMap -from time import perf_counter as clock +from time import perf_counter import pandas as pd @@ -13,6 +13,7 @@ import dask_cuda.explicit_comms.dataframe.shuffle from dask_cuda.benchmarks.common import Config, execute_benchmark from dask_cuda.benchmarks.utils import ( + as_noop, parse_benchmark_args, print_key_value, print_separator, @@ -20,16 +21,23 @@ ) -def shuffle_dask(df): - wait(shuffle(df, index="data", shuffle="tasks").persist()) +def shuffle_dask(df, *, noop=False): + result = shuffle(df, index="data", shuffle="tasks") + if noop: + result = as_noop(result) + t1 = perf_counter() + 
wait(result.persist()) + return perf_counter() - t1 def shuffle_explicit_comms(df): + t1 = perf_counter() wait( dask_cuda.explicit_comms.dataframe.shuffle.shuffle( df, column_names="data" ).persist() ) + return perf_counter() - t1 def bench_once(client, args, write_profile=None): @@ -55,14 +63,12 @@ def bench_once(client, args, write_profile=None): ctx = performance_report(filename=args.profile) with ctx: - t1 = clock() - if args.backend == "dask": - shuffle_dask(df) + if args.backend in {"dask", "dask-noop"}: + duration = shuffle_dask(df, noop=args.backend == "dask-noop") else: - shuffle_explicit_comms(df) - t2 = clock() + duration = shuffle_explicit_comms(df) - return (data_processed, t2 - t1) + return (data_processed, duration) def pretty_print_results(args, address_to_index, p2p_bw, results): @@ -143,7 +149,7 @@ def parse_args(): "-b", "--backend", ], - "choices": ["dask", "explicit-comms"], + "choices": ["dask", "explicit-comms", "dask-noop"], "default": "dask", "type": str, "help": "The backend to use.", diff --git a/dask_cuda/benchmarks/local_cupy.py b/dask_cuda/benchmarks/local_cupy.py index 1a81d50d..1c1d12d3 100644 --- a/dask_cuda/benchmarks/local_cupy.py +++ b/dask_cuda/benchmarks/local_cupy.py @@ -12,6 +12,7 @@ from dask_cuda.benchmarks.common import Config, execute_benchmark from dask_cuda.benchmarks.utils import ( + as_noop, parse_benchmark_args, print_key_value, print_separator, @@ -148,8 +149,11 @@ def bench_once(client, args, write_profile=None): with ctx: rng = start_range(message=args.operation, color="purple") + result = func(*func_args) + if args.backend == "dask-noop": + result = as_noop(result) t1 = clock() - wait(client.persist(func(*func_args))) + wait(client.persist(result)) if args.type == "gpu": client.run(lambda xp: xp.cuda.Device().synchronize(), xp) took = clock() - t1 @@ -169,6 +173,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): print("```") print("Roundtrip benchmark") print_separator(separator="-") + print_key_value(key="Backend", value=f"{args.backend}") print_key_value(key="Operation", value=f"{args.operation}") print_key_value(key="Array type", value="cupy" if args.type == "gpu" else "numpy") print_key_value(key="User size", value=f"{args.size}") @@ -206,6 +211,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): def create_tidy_results(args, p2p_bw, results): configuration = { "operation": args.operation, + "backend": args.backend, "array_type": "cupy" if args.type == "gpu" else "numpy", "user_size": args.size, "user_second_size": args.second_size, @@ -304,6 +310,16 @@ def parse_args(): "type": int, "help": "Number of runs (default 3).", }, + { + "name": [ + "-b", + "--backend", + ], + "choices": ["dask", "dask-noop"], + "default": "dask", + "type": str, + "help": "Compute backend to use.", + }, ] return parse_benchmark_args( diff --git a/dask_cuda/benchmarks/local_cupy_map_overlap.py b/dask_cuda/benchmarks/local_cupy_map_overlap.py index 247ea5a8..f4031855 100644 --- a/dask_cuda/benchmarks/local_cupy_map_overlap.py +++ b/dask_cuda/benchmarks/local_cupy_map_overlap.py @@ -14,6 +14,7 @@ from dask_cuda.benchmarks.common import Config, execute_benchmark from dask_cuda.benchmarks.utils import ( + as_noop, parse_benchmark_args, print_key_value, print_separator, @@ -48,8 +49,11 @@ def bench_once(client, args, write_profile=None): ctx = contextlib.nullcontext() with ctx: + result = x.map_overlap(mean_filter, args.kernel_size, shape=ks) + if args.backend == "dask-noop": + result = as_noop(result) t1 = clock() - 
wait(client.persist(x.map_overlap(mean_filter, args.kernel_size, shape=ks))) + wait(client.persist(result)) took = clock() - t1 return (data_processed, took) @@ -60,6 +64,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): print("```") print("Cupy map overlap benchmark") print_separator(separator="-") + print_key_value(key="Backend", value=f"{args.backend}") print_key_value(key="Array type", value="cupy" if args.type == "gpu" else "numpy") print_key_value(key="Size", value=f"{args.size}*{args.size}") print_key_value(key="Chunk size", value=f"{args.chunk_size}") @@ -89,6 +94,7 @@ def pretty_print_results(args, address_to_index, p2p_bw, results): def create_tidy_results(args, p2p_bw, results): configuration = { "array_type": "cupy" if args.type == "gpu" else "numpy", + "backend": args.backend, "user_size": args.size, "chunk_size": args.chunk_size, "ignore_size": args.ignore_size, @@ -175,6 +181,16 @@ def parse_args(): "type": int, "help": "Number of runs", }, + { + "name": [ + "-b", + "--backend", + ], + "choices": ["dask", "dask-noop"], + "default": "dask", + "type": str, + "help": "Compute backend to use.", + }, ] return parse_benchmark_args( diff --git a/dask_cuda/benchmarks/utils.py b/dask_cuda/benchmarks/utils.py index 34454980..8a8419cd 100644 --- a/dask_cuda/benchmarks/utils.py +++ b/dask_cuda/benchmarks/utils.py @@ -18,6 +18,35 @@ from dask_cuda.local_cuda_cluster import LocalCUDACluster +def as_noop(dsk): + """ + Turn the given dask computation into a noop. + + Uses dask-noop (https://github.com/gjoseph92/dask-noop/) + + Parameters + ---------- + dsk + Dask object (on which one could call compute) + + Returns + ------- + New dask object representing the same task graph with no + computation/data attached. + + Raises + ------ + RuntimeError + If dask_noop is not importable + """ + try: + from dask_noop import as_noop + + return as_noop(dsk) + except ImportError: + raise RuntimeError("Requested noop computation but dask-noop not installed.") + + def parse_benchmark_args(description="Generic dask-cuda Benchmark", args_list=[]): parser = argparse.ArgumentParser(description=description) worker_args = parser.add_argument_group(description="Worker configuration") diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index ff72fad2..b1f99869 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import asyncio import functools import inspect @@ -204,7 +206,7 @@ def concat(args, ignore_index=False): def shuffle( df: DataFrame, - column_names: List[str], + column_names: str | List[str], npartitions: Optional[int] = None, ignore_index: bool = False, ) -> DataFrame: From 99894b18eb207edbf959fabd7bc8b32d09d79525 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Tue, 25 Oct 2022 15:09:24 -0400 Subject: [PATCH 08/25] Remove stale labeler (#1024) This PR removes the stale issue labeler workflow Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1024 --- .github/workflows/stale.yaml | 57 ------------------------------------ 1 file changed, 57 deletions(-) delete mode 100644 .github/workflows/stale.yaml diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml deleted file mode 100644 index 8b65da69..00000000 --- a/.github/workflows/stale.yaml +++ 
/dev/null @@ -1,57 +0,0 @@ -name: Mark inactive issues and pull requests - -on: - schedule: - - cron: "0 * * * *" - -jobs: - mark-inactive-30d: - runs-on: ubuntu-latest - steps: - - name: Mark 30 day inactive issues and pull requests - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: > - This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days. - Please close this issue if no further response or action is needed. - Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. - This issue will be labeled `inactive-90d` if there is no activity in the next 60 days. - stale-issue-label: "inactive-30d" - exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-issue-stale: 30 - days-before-issue-close: -1 - stale-pr-message: > - This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days. - Please close this PR if it is no longer required. - Otherwise, please respond with a comment indicating any updates. - This PR will be labeled `inactive-90d` if there is no activity in the next 60 days. - stale-pr-label: "inactive-30d" - exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-pr-stale: 30 - days-before-pr-close: -1 - operations-per-run: 50 - mark-inactive-90d: - runs-on: ubuntu-latest - steps: - - name: Mark 90 day inactive issues and pull requests - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: > - This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days. - Please close this issue if no further response or action is needed. - Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. - stale-issue-label: "inactive-90d" - exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-issue-stale: 90 - days-before-issue-close: -1 - stale-pr-message: > - This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days. - Please close this PR if it is no longer required. - Otherwise, please respond with a comment indicating any updates. - stale-pr-label: "inactive-90d" - exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-pr-stale: 90 - days-before-pr-close: -1 - operations-per-run: 50 From 91d605cd2f4ba76e5a1bd6e035953ab39231792b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Fri, 28 Oct 2022 13:19:09 +0200 Subject: [PATCH 09/25] Switch pre-import not found test to sync definition (#1026) Even though pytest-async is installed, it has not executed async tests that aren't wrapped in gen_cluster. Using gen_cluster is the proper way for Dask testing, but in few exceptions we may not be able to use it, such as when cleanup must be avoided. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1026 --- dask_cuda/tests/test_local_cuda_cluster.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 79e3cb7d..8c169d4b 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -1,3 +1,4 @@ +import asyncio import os import sys from unittest.mock import patch @@ -284,13 +285,17 @@ async def test_pre_import(): # Intentionally not using @gen_test to skip cleanup checks -async def test_pre_import_not_found(): - with raises_with_cause(RuntimeError, None, ImportError, None): - await LocalCUDACluster( - n_workers=1, - pre_import="my_module", - asynchronous=True, - ) +def test_pre_import_not_found(): + async def _test_pre_import_not_found(): + with raises_with_cause(RuntimeError, None, ImportError, None): + await LocalCUDACluster( + n_workers=1, + pre_import="my_module", + asynchronous=True, + silence_logs=True, + ) + + asyncio.run(_test_pre_import_not_found()) @gen_test(timeout=20) From 40bbfedf4fe69c44db9a0fa26a81617a9583f741 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Fri, 28 Oct 2022 13:53:22 +0200 Subject: [PATCH 10/25] Explicit-comms-shuffle: fine control of task scheduling (#1025) In shuffle, use `Client.submit()` to control where tasks are executed and release temporary dataframes ASAP. #### Context In the final step in explicit-comms shuffle, we call `getitem()` to extract the final dataframe partitions from the result of the local shuffle. In some cases, these `getitem()` tasks would not run on the worker that ran the local shuffle, which would result in extra communication and spilling. We now use `submit(..., worker=...)` to make sure that the worker running the local shuffle also runs the `getitem()` task afterwards. Is it possible to do this without the use of `submit()` to avoid the overhead of creating a `Future` for each dataframe partition? Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1025 --- dask_cuda/explicit_comms/dataframe/shuffle.py | 41 ++++++++++++------- docs/source/conf.py | 5 ++- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index b1f99869..294a8efd 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -11,7 +11,6 @@ import dask import dask.dataframe -import distributed from dask.base import compute_as_if_collection, tokenize from dask.dataframe.core import DataFrame, _concat as dd_concat, new_dd_object from dask.dataframe.shuffle import shuffle_group @@ -325,25 +324,37 @@ def shuffle( rank_to_out_part_ids, ignore_index, ) - distributed.wait(list(result_futures.values())) - del df_groups + wait(list(result_futures.values())) - # Step (c): extract individual dataframe-partitions + # Release dataframes from step (a) + for fut in df_groups: + fut.release() + + # Step (c): extract individual dataframe-partitions. We use `submit()` + # to control where the tasks are executed. + # TODO: can we do this without using `submit()` to avoid the overhead + # of creating a Future for each dataframe partition? 
name = f"explicit-comms-shuffle-getitem-{tokenize(name)}" dsk = {} - meta = None - for rank, parts in rank_to_out_part_ids.items(): - for i, part_id in enumerate(parts): - dsk[(name, part_id)] = (getitem, result_futures[rank], i) - if meta is None: - # Get the meta from the first output partition - meta = delayed(make_meta)( - delayed(getitem)(result_futures[rank], i) - ).compute() - assert meta is not None + for rank, worker in enumerate(c.worker_addresses): + if rank in workers: + for i, part_id in enumerate(rank_to_out_part_ids[rank]): + dsk[(name, part_id)] = c.client.submit( + getitem, result_futures[rank], i, workers=[worker] + ) + # Get the meta from the first output partition + meta = delayed(make_meta)(next(iter(dsk.values()))).compute() + + # Create a distributed Dataframe from all the pieces divs = [None] * (len(dsk) + 1) - return new_dd_object(dsk, name, meta, divs).persist() + ret = new_dd_object(dsk, name, meta, divs).persist() + wait(ret) + + # Release all temporary dataframes + for fut in [*result_futures.values(), *dsk.values()]: + fut.release() + return ret def get_rearrange_by_column_tasks_wrapper(func): diff --git a/docs/source/conf.py b/docs/source/conf.py index 2f7825a3..08d8bfdf 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -189,7 +189,8 @@ # -- Extension configuration ------------------------------------------------- - def setup(app): app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") - app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer") + app.add_js_file( + "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer" + ) From 3e5a19b246aa684d2bedfb4673960d895d81dd24 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 31 Oct 2022 16:50:21 +0000 Subject: [PATCH 11/25] Fix recorded time in merge benchmark (#1028) With the merge of #994 I accidentally changed the recorded time in the merge benchmark for the explicit-comms case. Since explicit-comms is eager, we must time the entire operation, not just `wait(result.persist())`. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/dask-cuda/pull/1028 --- dask_cuda/benchmarks/local_cudf_merge.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dask_cuda/benchmarks/local_cudf_merge.py b/dask_cuda/benchmarks/local_cudf_merge.py index 54bac9bc..f26a26ae 100644 --- a/dask_cuda/benchmarks/local_cudf_merge.py +++ b/dask_cuda/benchmarks/local_cudf_merge.py @@ -148,10 +148,13 @@ def merge(args, ddf1, ddf2): if args.set_index: ddf_join = ddf_join.set_index("key") if args.backend == "dask-noop": + t1 = perf_counter() ddf_join = as_noop(ddf_join) - t1 = perf_counter() + noopify_duration = perf_counter() - t1 + else: + noopify_duration = 0 wait(ddf_join.persist()) - return perf_counter() - t1 + return noopify_duration def bench_once(client, args, write_profile=None): @@ -184,7 +187,9 @@ def bench_once(client, args, write_profile=None): with ctx1: with ctx2: - duration = merge(args, ddf_base, ddf_other) + t1 = perf_counter() + noopify_duration = merge(args, ddf_base, ddf_other) + duration = perf_counter() - t1 - noopify_duration return (data_processed, duration) From 3dbc4c9353d043cc5bd4530154809f8918d571d9 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 2 Nov 2022 20:19:53 +0100 Subject: [PATCH 12/25] Install Dask nightly last in CI (#1029) By installing Dask nightly as last step in CI, we prevent any other steps from inadvertently downgrading it before running tests. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mark Sadang (https://github.com/msadang) URL: https://github.com/rapidsai/dask-cuda/pull/1029 --- ci/gpu/build.sh | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b88e0ef9..e41f9976 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -68,18 +68,6 @@ gpuci_mamba_retry install "cudf=${MINOR_VERSION}" \ "cucim" \ "pytest-asyncio=<0.14.0" -# Install latest nightly version for dask and distributed if needed -if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "Installing dask and distributed from dask nightly channel" - gpuci_mamba_retry install -c dask/label/dev \ - "dask/label/dev::dask" \ - "dask/label/dev::distributed" -else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall" - gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall - conda config --system --remove channels dask/label/dev -fi - gpuci_logger "Check versions" python --version @@ -103,6 +91,21 @@ CONDA_BLD_DIR="${WORKSPACE}/.conda-bld" gpuci_conda_retry mambabuild --croot "${CONDA_BLD_DIR}" conda/recipes/dask-cuda --python="${PYTHON}" gpuci_mamba_retry install -c "${CONDA_BLD_DIR}" dask-cuda +################################################################################ +# DASK - Install latest nightly version for dask and distributed if needed. +# Done after everything else to ensure packages are not downgraded. 
+################################################################################ +if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then + gpuci_logger "Installing dask and distributed from dask nightly channel" + gpuci_mamba_retry install -c dask/label/dev \ + "dask/label/dev::dask" \ + "dask/label/dev::distributed" +else + gpuci_logger "gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask==${DASK_STABLE_VERSION} conda-forge::distributed==${DASK_STABLE_VERSION} conda-forge::dask-core==${DASK_STABLE_VERSION} --force-reinstall + conda config --system --remove channels dask/label/dev +fi + ################################################################################ # TEST - Run pytests for ucx-py ################################################################################ From 4de1d8446527b333ed172b2e17982cb4bf334ba8 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 3 Nov 2022 11:03:08 +0100 Subject: [PATCH 13/25] Support the new `Buffer` in cudf (#1033) Support the new `Buffer` in https://github.com/rapidsai/cudf/pull/12009 Fixes #1032 Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1033 --- dask_cuda/get_device_memory_objects.py | 5 +++-- dask_cuda/tests/test_proxify_host_file.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dask_cuda/get_device_memory_objects.py b/dask_cuda/get_device_memory_objects.py index 92055c19..44dc433f 100644 --- a/dask_cuda/get_device_memory_objects.py +++ b/dask_cuda/get_device_memory_objects.py @@ -51,8 +51,9 @@ def get_device_memory_objects_default(obj): return dispatch(obj._pxy_get().obj) if hasattr(obj, "data"): return dispatch(obj.data) - if hasattr(obj, "_owner") and obj._owner is not None: - return dispatch(obj._owner) + owner = getattr(obj, "owner", None) or getattr(obj, "_owner", None) + if owner: + return dispatch(owner) if hasattr(obj, "__cuda_array_interface__"): return [obj] return [] diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 1edcab09..6758385f 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -273,7 +273,7 @@ def test_dataframes_share_dev_mem(): # Even though the two dataframe doesn't point to the same cudf.Buffer object assert view1["a"].data is not view2["a"].data # They still share the same underlying device memory - assert view1["a"].data._owner._owner is view2["a"].data._owner._owner + view1["a"].data.ptr == view2["a"].data.ptr dhf = ProxifyHostFile( local_directory=root_dir, device_memory_limit=160, memory_limit=1000 From b678d493805b0a2e4a6cbb85a935e93a5c063458 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 8 Nov 2022 13:41:43 +0000 Subject: [PATCH 14/25] Make local_directory a required argument for spilling impls (#1023) For automated cleanup when the cluster exits, the on-disk spilling directory needs to live inside the relevant worker's local_directory. Since we do not have a handle on the worker when constructing the keyword arguments to DeviceHostFile or ProxifyHostFile, instead take advantage of dask/distributed#7153 and request that we are called with the worker_local_directory as an argument. Closes #1018. 
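A minimal sketch of the new calling convention (the directory and limit values below are placeholders, not taken from this PR): the spilling containers now receive the worker's local directory as their first positional argument, which newer `distributed` versions supply automatically when the class is used as the worker's `data` callback (hence the parameter name).

```python
import tempfile

from dask_cuda.device_host_file import DeviceHostFile

# Stand-in for the directory distributed would normally pass in
# (the worker's local_directory); purely illustrative.
local_dir = tempfile.mkdtemp()

data = DeviceHostFile(
    local_dir,                   # worker_local_directory (positional)
    device_memory_limit=2**30,   # spill device -> host beyond ~1 GiB
    memory_limit=4 * 2**30,      # spill host -> disk beyond ~4 GiB
)
```

`ProxifyHostFile` follows the same convention through its `worker_local_directory` argument.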
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1023 --- dask_cuda/cuda_worker.py | 2 - dask_cuda/device_host_file.py | 45 +++++++++++++---- dask_cuda/local_cuda_cluster.py | 2 - dask_cuda/proxify_host_file.py | 22 ++++----- dask_cuda/tests/test_device_host_file.py | 13 +---- dask_cuda/tests/test_gds.py | 2 +- dask_cuda/tests/test_proxify_host_file.py | 60 +++++++++++++---------- dask_cuda/tests/test_proxy.py | 2 +- 8 files changed, 82 insertions(+), 66 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index fd85006f..b5c4285d 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -175,7 +175,6 @@ def del_pid_file(): device_memory_limit, device_index=i ), "memory_limit": memory_limit, - "local_directory": local_directory, "shared_filesystem": shared_filesystem, }, ) @@ -187,7 +186,6 @@ def del_pid_file(): device_memory_limit, device_index=i ), "memory_limit": memory_limit, - "local_directory": local_directory, }, ) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index e89ba64b..aa81f104 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -1,13 +1,13 @@ import functools import logging import os +import sys import time import numpy from zict import Buffer, File, Func from zict.common import ZictBase -import dask from distributed.protocol import ( dask_deserialize, dask_serialize, @@ -158,6 +158,8 @@ class DeviceHostFile(ZictBase): Parameters ---------- + worker_local_directory: path + Path where to store serialized objects on disk device_memory_limit: int Number of bytes of CUDA device memory for device LRU cache, spills to host cache once filled. @@ -165,8 +167,6 @@ class DeviceHostFile(ZictBase): Number of bytes of host memory for host LRU cache, spills to disk once filled. Setting this to `0` or `None` means unlimited host memory, implies no spilling to disk. - local_directory: path - Path where to store serialized objects on disk log_spilling: bool If True, all spilling operations will be logged directly to distributed.worker with an INFO loglevel. This will eventually be @@ -175,16 +175,15 @@ class DeviceHostFile(ZictBase): def __init__( self, + # So named such that dask will pass in the worker's local + # directory when constructing this through the "data" callback. + worker_local_directory, + *, device_memory_limit=None, memory_limit=None, - local_directory=None, log_spilling=False, ): - self.disk_func_path = os.path.join( - local_directory or dask.config.get("temporary-directory") or os.getcwd(), - "dask-worker-space", - "storage", - ) + self.disk_func_path = os.path.join(worker_local_directory, "storage") os.makedirs(self.disk_func_path, exist_ok=True) if memory_limit == 0: @@ -236,6 +235,34 @@ def __init__( # For Worker compatibility only, where `fast` is host memory buffer self.fast = self.host_buffer if memory_limit is None else self.host_buffer.fast + if sys.version_info > (3, 8): + + def __new__( + cls, + # So named such that dask will pass in the worker's local + # directory when constructing this through the "data" callback. + worker_local_directory, + *, + device_memory_limit=None, + memory_limit=None, + log_spilling=False, + ): + """ + This is here to support Python 3.8. Right now (to support + 3.8), ZictBase inherits from typing.MutableMapping through + which inspect.signature determines that the signature of + __init__ is just (*args, **kwargs). 
We need to advertise the + correct signature so that distributed will correctly figure + out that it needs to pass the worker's local directory. In + Python 3.9 and later, typing.MutableMapping is just an alias + for collections.abc.MutableMapping and we don't need to do + anything. + + With this pass-through definition of __new__, the + signature of the constructor is correctly determined. + """ + return super().__new__(cls) + def __setitem__(self, key, value): if key in self.device_buffer: # Make sure we register the removal of an existing key diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index f9054899..014d0b4e 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -289,7 +289,6 @@ def __init__( { "device_memory_limit": self.device_memory_limit, "memory_limit": self.memory_limit, - "local_directory": local_directory, "shared_filesystem": shared_filesystem, }, ) @@ -299,7 +298,6 @@ def __init__( { "device_memory_limit": self.device_memory_limit, "memory_limit": self.memory_limit, - "local_directory": local_directory, "log_spilling": log_spilling, }, ) diff --git a/dask_cuda/proxify_host_file.py b/dask_cuda/proxify_host_file.py index 2a1754a0..dd2e23e0 100644 --- a/dask_cuda/proxify_host_file.py +++ b/dask_cuda/proxify_host_file.py @@ -11,6 +11,7 @@ import warnings import weakref from collections import defaultdict +from collections.abc import MutableMapping from typing import ( Any, Callable, @@ -19,7 +20,6 @@ Hashable, Iterable, List, - MutableMapping, Optional, Set, Tuple, @@ -452,17 +452,14 @@ class ProxifyHostFile(MutableMapping): Parameters ---------- + worker_local_directory: str + Path on local machine to store temporary files. + WARNING, this **cannot** change while running thus all serialization to + disk are using the same directory. device_memory_limit: int Number of bytes of CUDA device memory used before spilling to host. memory_limit: int Number of bytes of host memory used before spilling to disk. - local_directory: str or None, default None - Path on local machine to store temporary files. Can be a string (like - ``"path/to/files"``) or ``None`` to fall back on the value of - ``dask.temporary-directory`` in the local Dask configuration, using the - current working directory if this is not set. - WARNING, this **cannot** change while running thus all serialization to - disk are using the same directory. shared_filesystem: bool or None, default None Whether the `local_directory` above is shared between all workers or not. If ``None``, the "jit-unspill-shared-fs" config value are used, which @@ -492,10 +489,12 @@ class ProxifyHostFile(MutableMapping): def __init__( self, + # So named such that dask will pass in the worker's local + # directory when constructing this through the "data" callback. 
+ worker_local_directory: str, *, device_memory_limit: int, memory_limit: int, - local_directory: str = None, shared_filesystem: bool = None, compatibility_mode: bool = None, spill_on_demand: bool = None, @@ -510,10 +509,7 @@ def __init__( # Create an instance of `SpillToDiskProperties` if it doesn't already exist path = pathlib.Path( os.path.join( - local_directory - or dask.config.get("temporary-directory") - or os.getcwd(), - "dask-worker-space", + worker_local_directory, "jit-unspill-disk-storage", ) ).resolve() diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index fe627cd4..e9de6af7 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -1,4 +1,3 @@ -import os from random import randint import numpy as np @@ -24,14 +23,6 @@ def assert_eq(x, y): return dask.array.assert_eq(cupy.asnumpy(x), cupy.asnumpy(y)) -def test_device_host_file_config(tmp_path): - dhf_disk_path = str(tmp_path / "dask-worker-space" / "storage") - with dask.config.set(temporary_directory=str(tmp_path)): - dhf = DeviceHostFile() - assert os.path.exists(dhf_disk_path) - assert dhf.disk_func_path == dhf_disk_path - - @pytest.mark.parametrize("num_host_arrays", [1, 10, 100]) @pytest.mark.parametrize("num_device_arrays", [1, 10, 100]) @pytest.mark.parametrize("array_size_range", [(1, 1000), (100, 100), (1000, 1000)]) @@ -43,7 +34,7 @@ def test_device_host_file_short( dhf = DeviceHostFile( device_memory_limit=1024 * 16, memory_limit=1024 * 16, - local_directory=tmpdir, + worker_local_directory=tmpdir, ) host = [ @@ -81,7 +72,7 @@ def test_device_host_file_step_by_step(tmp_path): dhf = DeviceHostFile( device_memory_limit=1024 * 16, memory_limit=1024 * 16, - local_directory=tmpdir, + worker_local_directory=tmpdir, ) a = np.random.random(1000) diff --git a/dask_cuda/tests/test_gds.py b/dask_cuda/tests/test_gds.py index 257e6f59..c8667025 100644 --- a/dask_cuda/tests/test_gds.py +++ b/dask_cuda/tests/test_gds.py @@ -11,7 +11,7 @@ if ProxifyHostFile._spill_to_disk is None: tmpdir = tempfile.TemporaryDirectory() ProxifyHostFile( - local_directory=tmpdir.name, + worker_local_directory=tmpdir.name, device_memory_limit=1024, memory_limit=1024, ) diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 6758385f..992679dc 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -1,5 +1,4 @@ import re -import tempfile from typing import Iterable from unittest.mock import patch @@ -32,21 +31,24 @@ dask_cuda.proxify_device_objects.dispatch.dispatch(cupy.ndarray) dask_cuda.proxify_device_objects.incompatible_types = () # type: ignore -# Make the "disk" serializer available and use a tmp directory -if ProxifyHostFile._spill_to_disk is None: - # Hold on to `tmpdir` to keep dir alive until exit - tmpdir = tempfile.TemporaryDirectory() - ProxifyHostFile( - local_directory=tmpdir.name, - device_memory_limit=1024, - memory_limit=1024, - ) -assert ProxifyHostFile._spill_to_disk is not None -# In order to use the same tmp dir, we use `root_dir` for all ProxifyHostFile creations -# Notice, we use `../..` to remove the `dask-worker-space/jit-unspill-disk-storage` part -# added by the ProxifyHostFile implicitly. -root_dir = str(ProxifyHostFile._spill_to_disk.root_dir / ".." 
/ "..") +@pytest.fixture(scope="module") +def root_dir(tmp_path_factory): + tmpdir = tmp_path_factory.mktemp("jit-unspill") + # Make the "disk" serializer available and use a tmp directory + if ProxifyHostFile._spill_to_disk is None: + ProxifyHostFile( + worker_local_directory=tmpdir.name, + device_memory_limit=1024, + memory_limit=1024, + ) + assert ProxifyHostFile._spill_to_disk is not None + + # In order to use the same tmp dir, we use `root_dir` for all + # ProxifyHostFile creations Notice, we use `..` to remove the + # `jit-unspill-disk-storage` part added by the + # ProxifyHostFile implicitly. + return str(ProxifyHostFile._spill_to_disk.root_dir / "..") def is_proxies_equal(p1: Iterable[ProxyObject], p2: Iterable[ProxyObject]): @@ -61,9 +63,11 @@ def is_proxies_equal(p1: Iterable[ProxyObject], p2: Iterable[ProxyObject]): return ids1 == ids2 -def test_one_dev_item_limit(): +def test_one_dev_item_limit(root_dir): dhf = ProxifyHostFile( - local_directory=root_dir, device_memory_limit=one_item_nbytes, memory_limit=1000 + worker_local_directory=root_dir, + device_memory_limit=one_item_nbytes, + memory_limit=1000, ) a1 = one_item_array() + 42 @@ -150,10 +154,10 @@ def test_one_dev_item_limit(): assert len(dhf.manager) == 0 -def test_one_item_host_limit(capsys): +def test_one_item_host_limit(capsys, root_dir): memory_limit = sizeof(asproxy(one_item_array(), serializers=("dask", "pickle"))) dhf = ProxifyHostFile( - local_directory=root_dir, + worker_local_directory=root_dir, device_memory_limit=one_item_nbytes, memory_limit=memory_limit, ) @@ -213,7 +217,7 @@ def test_one_item_host_limit(capsys): assert len(dhf.manager) == 0 -def test_spill_on_demand(): +def test_spill_on_demand(root_dir): """ Test spilling on demand by disabling the device_memory_limit and allocating two large buffers that will otherwise fail because @@ -225,7 +229,7 @@ def test_spill_on_demand(): total_mem = get_device_total_memory() dhf = ProxifyHostFile( - local_directory=root_dir, + worker_local_directory=root_dir, device_memory_limit=2 * total_mem, memory_limit=2 * total_mem, spill_on_demand=True, @@ -263,7 +267,7 @@ def task(x): assert_frame_equal(got.to_pandas(), df.to_pandas()) -def test_dataframes_share_dev_mem(): +def test_dataframes_share_dev_mem(root_dir): cudf = pytest.importorskip("cudf") df = cudf.DataFrame({"a": range(10)}) @@ -276,7 +280,7 @@ def test_dataframes_share_dev_mem(): view1["a"].data.ptr == view2["a"].data.ptr dhf = ProxifyHostFile( - local_directory=root_dir, device_memory_limit=160, memory_limit=1000 + worker_local_directory=root_dir, device_memory_limit=160, memory_limit=1000 ) dhf["v1"] = view1 dhf["v2"] = view2 @@ -303,7 +307,7 @@ def test_cudf_get_device_memory_objects(): assert len(res) == 4, "We expect four buffer objects" -def test_externals(): +def test_externals(root_dir): """Test adding objects directly to the manager Add an object directly to the manager makes it count against the @@ -317,7 +321,9 @@ def test_externals(): __delitem__. 
""" dhf = ProxifyHostFile( - local_directory=root_dir, device_memory_limit=one_item_nbytes, memory_limit=1000 + worker_local_directory=root_dir, + device_memory_limit=one_item_nbytes, + memory_limit=1000, ) dhf["k1"] = one_item_array() k1 = dhf["k1"] @@ -353,7 +359,7 @@ def test_externals(): @patch("dask_cuda.proxify_device_objects.incompatible_types", (cupy.ndarray,)) -def test_incompatible_types(): +def test_incompatible_types(root_dir): """Check that ProxifyHostFile unproxifies `cupy.ndarray` on retrieval Notice, in this test we add `cupy.ndarray` to the incompatible_types temporarily. @@ -361,7 +367,7 @@ def test_incompatible_types(): cupy = pytest.importorskip("cupy") cudf = pytest.importorskip("cudf") dhf = ProxifyHostFile( - local_directory=root_dir, device_memory_limit=100, memory_limit=100 + worker_local_directory=root_dir, device_memory_limit=100, memory_limit=100 ) # We expect `dhf` to unproxify `a1` (but not `a2`) on retrieval diff --git a/dask_cuda/tests/test_proxy.py b/dask_cuda/tests/test_proxy.py index 2c21023c..830b403d 100644 --- a/dask_cuda/tests/test_proxy.py +++ b/dask_cuda/tests/test_proxy.py @@ -28,7 +28,7 @@ if ProxifyHostFile._spill_to_disk is None: tmpdir = tempfile.TemporaryDirectory() ProxifyHostFile( - local_directory=tmpdir.name, + worker_local_directory=tmpdir.name, device_memory_limit=1024, memory_limit=1024, ) From 2c99f5a0fdf59a7f89e182c4add9121e0b8dd48d Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Tue, 8 Nov 2022 21:56:51 +0000 Subject: [PATCH 15/25] Fix version constraint (#1036) Should apply custom __new__ if Python < 3.9. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1036 --- dask_cuda/device_host_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index aa81f104..f31d618b 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -235,7 +235,7 @@ def __init__( # For Worker compatibility only, where `fast` is host memory buffer self.fast = self.host_buffer if memory_limit is None else self.host_buffer.fast - if sys.version_info > (3, 8): + if sys.version_info < (3, 9): def __new__( cls, From c8402b5d3dad2a3cb76e14601dd791099251709b Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 15 Nov 2022 15:56:31 +0100 Subject: [PATCH 16/25] Work around Jupyter errors in CI (#1041) Work around the issue reported in #1040 . This probably needs to be fixed elsewhere, so this patch temporarily unblocks CI. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - https://github.com/jakirkham - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/dask-cuda/pull/1041 --- ci/gpu/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e41f9976..f200b57a 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -40,6 +40,10 @@ export INSTALL_DASK_MAIN=1 # Dask version to install when `INSTALL_DASK_MAIN=0` export DASK_STABLE_VERSION="2022.9.2" +# Temporary workaround for Jupyter errors. 
+# See https://github.com/rapidsai/dask-cuda/issues/1040 +export JUPYTER_PLATFORM_DIRS=1 + ################################################################################ # SETUP - Check environment ################################################################################ From b3ed9029a1ad02a61eb7fbd899a5a6826bb5cfac Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 15 Nov 2022 20:17:05 +0100 Subject: [PATCH 17/25] Fixes for handling MIG devices (#950) Addresses errors reported recently in https://github.com/rapidsai/dask-cuda/issues/583#issuecomment-1181505348 . Depends on https://github.com/dask/distributed/pull/6720 . Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/950 --- dask_cuda/initialize.py | 17 ++++++++++------- dask_cuda/tests/test_dask_cuda_worker.py | 7 +++---- dask_cuda/tests/test_local_cuda_cluster.py | 7 ++++--- dask_cuda/utils.py | 15 ++++++++++----- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/dask_cuda/initialize.py b/dask_cuda/initialize.py index bda4d08b..f03f99ec 100644 --- a/dask_cuda/initialize.py +++ b/dask_cuda/initialize.py @@ -6,9 +6,9 @@ import dask import distributed.comm.ucx -from distributed.diagnostics.nvml import has_cuda_context +from distributed.diagnostics.nvml import get_device_index_and_uuid, has_cuda_context -from .utils import get_ucx_config, parse_cuda_visible_device +from .utils import get_ucx_config logger = logging.getLogger(__name__) @@ -35,20 +35,23 @@ def _create_cuda_context(): # Therefore if ``import ucp`` fails we can just continue here. pass - cuda_visible_device = parse_cuda_visible_device( + cuda_visible_device = get_device_index_and_uuid( os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0] ) ctx = has_cuda_context() - if ctx is not False and distributed.comm.ucx.cuda_context_created is False: + if ( + ctx.has_context + and not distributed.comm.ucx.cuda_context_created.has_context + ): distributed.comm.ucx._warn_existing_cuda_context(ctx, os.getpid()) _create_cuda_context_handler() - if distributed.comm.ucx.cuda_context_created is False: + if not distributed.comm.ucx.cuda_context_created.has_context: ctx = has_cuda_context() - if ctx is not False and ctx != cuda_visible_device: + if ctx.has_context and ctx.device_info != cuda_visible_device: distributed.comm.ucx._warn_cuda_context_wrong_device( - cuda_visible_device, ctx, os.getpid() + cuda_visible_device, ctx.device_info, os.getpid() ) except Exception: diff --git a/dask_cuda/tests/test_dask_cuda_worker.py b/dask_cuda/tests/test_dask_cuda_worker.py index 705a2150..951e0269 100644 --- a/dask_cuda/tests/test_dask_cuda_worker.py +++ b/dask_cuda/tests/test_dask_cuda_worker.py @@ -230,7 +230,6 @@ def test_pre_import_not_found(): assert b"ModuleNotFoundError: No module named 'my_module'" in ret.stderr -@patch.dict(os.environ, {"DASK_DISTRIBUTED__DIAGNOSTICS__NVML": "False"}) def test_cuda_mig_visible_devices_and_memory_limit_and_nthreads(loop): # noqa: F811 uuids = get_gpu_count_mig(return_uuids=True)[1] # test only with some MIG Instances assuming the test bed @@ -268,9 +267,9 @@ def get_visible_devices(): wait(result) assert all(len(v.split(",")) == len(uuids) for v in result.values()) for i in range(len(uuids)): - assert set(v.split(",")[i] for v in result.values()) == set( - uuids - ) + assert set( + bytes(v.split(",")[i], "utf-8") for v in result.values() + ) == set(uuids) def test_cuda_visible_devices_uuid(loop): # 
noqa: F811 diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index 8c169d4b..dfb79c57 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -314,7 +314,6 @@ async def test_cluster_worker(): await new_worker.close() -@patch.dict(os.environ, {"DASK_DISTRIBUTED__DIAGNOSTICS__NVML": "False"}) @gen_test(timeout=20) async def test_available_mig_workers(): uuids = get_gpu_count_mig(return_uuids=True)[1] @@ -337,8 +336,10 @@ def get_visible_devices(): result = await client.run(get_visible_devices) assert all(len(v.split(",")) == len(uuids) for v in result.values()) - for i in range(len(uuids)): - assert set(v.split(",")[i] for v in result.values()) == set(uuids) + for i in range(len(cluster.workers)): + assert set(v.split(",")[i] for v in result.values()) == set( + uuid.decode("utf-8") for uuid in uuids + ) @gen_test(timeout=20) diff --git a/dask_cuda/utils.py b/dask_cuda/utils.py index 23179b82..a60c05e7 100644 --- a/dask_cuda/utils.py +++ b/dask_cuda/utils.py @@ -489,7 +489,7 @@ def parse_cuda_visible_device(dev): A device identifier must either be an integer, a string containing an integer or a string containing the device's UUID, beginning with prefix - 'GPU-' or 'MIG-GPU'. + 'GPU-' or 'MIG-'. >>> parse_cuda_visible_device(2) 2 @@ -501,18 +501,23 @@ def parse_cuda_visible_device(dev): Traceback (most recent call last): ... ValueError: Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers or - strings beginning with 'GPU-' or 'MIG-GPU-' prefixes. + strings beginning with 'GPU-' or 'MIG-' prefixes. """ try: return int(dev) except ValueError: - if any(dev.startswith(prefix) for prefix in ["GPU-", "MIG-GPU-", "MIG-"]): + if any( + dev.startswith(prefix) + for prefix in [ + "GPU-", + "MIG-", + ] + ): return dev else: raise ValueError( "Devices in CUDA_VISIBLE_DEVICES must be comma-separated integers " - "or strings beginning with 'GPU-' or 'MIG-GPU-' prefixes" - " or 'MIG-'." + "or strings beginning with 'GPU-' or 'MIG-' prefixes." ) From 48cca0e20679d29ff270f9978d176af9ff005f20 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Wed, 16 Nov 2022 18:24:48 +0100 Subject: [PATCH 18/25] Remove `pytest-asyncio` dependency (#1045) All requirements towards `pytest-asyncio` should have been removed, dropping all remains. Closes #1044 Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Ray Douglass (https://github.com/raydouglass) - https://github.com/jakirkham URL: https://github.com/rapidsai/dask-cuda/pull/1045 --- ci/gpu/build.sh | 5 +---- dask_cuda/tests/test_local_cuda_cluster.py | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index f200b57a..86e4a899 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -63,14 +63,11 @@ conda config --show-sources conda list --show-channel-urls # Installing cucim in order to test GDS spilling -# Pin pytest-asyncio because latest versions modify the default asyncio -# `event_loop_policy`. See https://github.com/dask/distributed/pull/4212 . 
gpuci_mamba_retry install "cudf=${MINOR_VERSION}" \ "dask-cudf=${MINOR_VERSION}" \ "ucx-py=${UCXPY_VERSION}" \ "ucx-proc=*=gpu" \ - "cucim" \ - "pytest-asyncio=<0.14.0" + "cucim" gpuci_logger "Check versions" diff --git a/dask_cuda/tests/test_local_cuda_cluster.py b/dask_cuda/tests/test_local_cuda_cluster.py index dfb79c57..5e407080 100644 --- a/dask_cuda/tests/test_local_cuda_cluster.py +++ b/dask_cuda/tests/test_local_cuda_cluster.py @@ -89,7 +89,6 @@ def get_visible_devices(): @pytest.mark.parametrize("protocol", ["ucx", None]) -@pytest.mark.asyncio @gen_test(timeout=20) async def test_ucx_protocol(protocol): pytest.importorskip("ucp") @@ -103,7 +102,6 @@ async def test_ucx_protocol(protocol): ) -@pytest.mark.asyncio @pytest.mark.filterwarnings("ignore:Exception ignored in") @gen_test(timeout=20) async def test_ucx_protocol_type_error(): From f11abe380c9a9343faed9ecfd7f5bd009c0dfe93 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Wed, 16 Nov 2022 10:19:53 -0800 Subject: [PATCH 19/25] Migrate as much as possible to `pyproject.toml` (#1035) * Upgrade to versioneer 0.28 & run formatting tools on those files * Move most of `setup.py` to `pyproject.toml` * Move all of `setup.cfg` to `pyproject.toml` * Move `pytest.ini` to `pyproject.toml` * Move `.coveragerc` to `pyproject.toml` * Move Flake8 settings to `.flake8` Authors: - https://github.com/jakirkham Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/dask-cuda/pull/1035 --- .coveragerc | 5 - .flake8 | 14 + MANIFEST.in | 2 - conda/environments/builddocs_py37.yml | 9 - conda/recipes/dask-cuda/build.sh | 3 - conda/recipes/dask-cuda/meta.yaml | 23 +- dask_cuda/_version.py | 259 +++- docs/requirements.txt | 4 - pyproject.toml | 142 ++ pytest.ini | 9 - requirements.txt | 5 - setup.cfg | 55 - setup.py | 33 +- versioneer.py | 1885 ------------------------- 14 files changed, 372 insertions(+), 2076 deletions(-) delete mode 100644 .coveragerc create mode 100644 .flake8 delete mode 100644 conda/environments/builddocs_py37.yml delete mode 100644 conda/recipes/dask-cuda/build.sh delete mode 100644 docs/requirements.txt create mode 100644 pyproject.toml delete mode 100644 pytest.ini delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 versioneer.py diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index fd7affab..00000000 --- a/.coveragerc +++ /dev/null @@ -1,5 +0,0 @@ -# Configuration file for Python coverage tests -[run] -disable_warnings = include-ignored -include = dask_cuda/* -omit = dask_cuda/tests/* diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..6d7180c4 --- /dev/null +++ b/.flake8 @@ -0,0 +1,14 @@ +[flake8] +exclude = docs, __init__.py +max-line-length = 88 +ignore = + # Assigning lambda expression + E731 + # Ambiguous variable names + E741 + # line break before binary operator + W503 + # whitespace before : + E203 + # whitespace after , + E231 diff --git a/MANIFEST.in b/MANIFEST.in index fc2c165b..344d51cc 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1 @@ -include requirements.txt -include versioneer.py include dask_cuda/_version.py diff --git a/conda/environments/builddocs_py37.yml b/conda/environments/builddocs_py37.yml deleted file mode 100644 index a3b5ec59..00000000 --- a/conda/environments/builddocs_py37.yml +++ /dev/null @@ -1,9 +0,0 @@ -name: dask_cuda_docs -channels: -- rapidsai-nightly -- conda-forge -dependencies: -- rapids-doc-env -- sphinx-click -- pandas -- dask 
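With the packaging metadata consolidated in `pyproject.toml`, the conda recipe below pulls its run requirements straight from that file via `load_file_data`. Outside of conda-build, the same `[project]` table can be read with a standard TOML parser — a minimal sketch, assuming it is run from the repository root:

```python
import sys

# tomllib ships with Python 3.11+; the tomli backport covers older
# interpreters, mirroring the conditional tomli requirement in the new
# pyproject.toml.
if sys.version_info >= (3, 11):
    import tomllib
else:
    import tomli as tomllib

with open("pyproject.toml", "rb") as f:
    project = tomllib.load(f)["project"]

print(project["name"])          # "dask-cuda"
print(project["dependencies"])  # e.g. "dask >=2022.9.2", "distributed >=2022.9.2", ...
```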
diff --git a/conda/recipes/dask-cuda/build.sh b/conda/recipes/dask-cuda/build.sh deleted file mode 100644 index 06f5d6ee..00000000 --- a/conda/recipes/dask-cuda/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env bash - -python setup.py install --single-version-externally-managed --record=record.txt diff --git a/conda/recipes/dask-cuda/meta.yaml b/conda/recipes/dask-cuda/meta.yaml index 9900d48b..a31628b2 100644 --- a/conda/recipes/dask-cuda/meta.yaml +++ b/conda/recipes/dask-cuda/meta.yaml @@ -2,10 +2,13 @@ # Usage: # conda build -c conda-forge . -{% set data = load_setup_py_data() %} +{% set data = load_file_data("pyproject.toml") %} {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} -{% set py_version=environ.get('CONDA_PY', 36) %} +{% set number = environ.get('GIT_DESCRIBE_NUMBER', 0) %} +{% set py_version = environ.get('CONDA_PY', 36) %} +{% set git_hash = environ.get('GIT_DESCRIBE_HASH', '') %} + package: name: dask-cuda version: {{ version }} @@ -14,16 +17,24 @@ source: git_url: ../../.. build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + number: {{ number }} + string: py{{ py_version }}_{{ git_hash }}_{{ number }} + script: + - {{ PYTHON }} -m pip install . -vv + entry_points: + {% for e in data.get("project", {}).get("scripts", {}).items() %} + - {{ e|join(" = ") }} + {% endfor %} requirements: host: - python - - setuptools + - pip + - tomli + - versioneer >=0.24 run: - python - {% for r in data.get("install_requires", []) %} + {% for r in data.get("project", {}).get("dependencies", []) %} - {{ r }} {% endfor %} diff --git a/dask_cuda/_version.py b/dask_cuda/_version.py index 99300c2a..6310ff96 100644 --- a/dask_cuda/_version.py +++ b/dask_cuda/_version.py @@ -4,16 +4,19 @@ # directories (produced by setup.py build) will contain a much shorter file # that just contains the computed version number. -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) +# This file is released into the public domain. 
+# Generated by versioneer-0.28 +# https://github.com/python-versioneer/python-versioneer """Git implementation of _version.py.""" import errno +import functools import os import re import subprocess import sys +from typing import Callable, Dict def get_keywords(): @@ -40,7 +43,7 @@ def get_config(): cfg = VersioneerConfig() cfg.VCS = "git" cfg.style = "pep440" - cfg.tag_prefix = "" + cfg.tag_prefix = "v" cfg.parentdir_prefix = "dask_cuda-" cfg.versionfile_source = "dask_cuda/_version.py" cfg.verbose = False @@ -51,12 +54,12 @@ class NotThisMethod(Exception): """Exception raised if a method is not valid for the current scenario.""" -LONG_VERSION_PY = {} # type: ignore -HANDLERS = {} +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" + """Create decorator to mark a method as the handler of a VCS.""" def decorate(f): """Store f in HANDLERS[vcs][method].""" @@ -71,20 +74,29 @@ def decorate(f): def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): """Call the given command(s).""" assert isinstance(commands, list) - p = None - for c in commands: + process = None + + popen_kwargs = {} + if sys.platform == "win32": + # This hides the console window if pythonw.exe is used + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + popen_kwargs["startupinfo"] = startupinfo + + for command in commands: try: - dispcmd = str([c] + args) + dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen( - [c] + args, + process = subprocess.Popen( + [command] + args, cwd=cwd, env=env, stdout=subprocess.PIPE, stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, ) break - except EnvironmentError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue @@ -96,15 +108,13 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env= if verbose: print("unable to find command, tried %s" % (commands,)) return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: + stdout = process.communicate()[0].strip().decode() + if process.returncode != 0: if verbose: print("unable to run %s (error)" % dispcmd) print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode + return None, process.returncode + return stdout, process.returncode def versions_from_parentdir(parentdir_prefix, root, verbose): @@ -116,7 +126,7 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): """ rootdirs = [] - for i in range(3): + for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): return { @@ -126,9 +136,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): "error": None, "date": None, } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level + rootdirs.append(root) + root = os.path.dirname(root) # up a level if verbose: print( @@ -147,22 +156,21 @@ def git_get_keywords(versionfile_abs): # _version.py. 
keywords = {} try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: pass return keywords @@ -170,10 +178,14 @@ def git_get_keywords(versionfile_abs): @register_vcs_handler("git", "keywords") def git_versions_from_keywords(keywords, tag_prefix, verbose): """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") date = keywords.get("date") if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 # -like" string, which we must then edit to make compliant), because @@ -186,11 +198,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): if verbose: print("keywords are unexpanded, not using") raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) + refs = {r.strip() for r in refnames.strip("()").split(",")} # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -199,7 +211,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r"\d", r)]) + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -208,6 +220,11 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue if verbose: print("picking %s" % r) return { @@ -230,7 +247,7 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): @register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): """Get version from 'git describe' in the root of the source tree. This only gets called if the git-archive 'subst' keywords were *not* @@ -241,7 +258,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if sys.platform == "win32": GITS = ["git.cmd", "git.exe"] - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + # GIT_DIR can interfere with correct operation of Versioneer. + # It may be intended to be passed to the Versioneer-versioned project, + # but that should not change where we get our version from. + env = os.environ.copy() + env.pop("GIT_DIR", None) + runner = functools.partial(runner, env=env) + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -249,7 +273,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( + describe_out, rc = runner( GITS, [ "describe", @@ -258,7 +282,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): "--always", "--long", "--match", - "%s*" % tag_prefix, + f"{tag_prefix}[[:digit:]]*", ], cwd=root, ) @@ -266,7 +290,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if describe_out is None: raise NotThisMethod("'git describe' failed") describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) if full_out is None: raise NotThisMethod("'git rev-parse' failed") full_out = full_out.strip() @@ -276,6 +300,38 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] # TAG might have hyphens. git_describe = describe_out @@ -292,7 +348,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # TAG-NUM-gHEX mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: - # unparseable. Maybe git-describe is misbehaving? + # unparsable. Maybe git-describe is misbehaving? pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out return pieces @@ -318,13 +374,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): else: # HEX: no tags pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits + out, rc = runner(GITS, ["rev-list", "HEAD", "--left-right"], cwd=root) + pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) return pieces @@ -361,19 +418,66 @@ def render_pep440(pieces): return rendered -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). Exceptions: - 1: no tags. 0.post.devDISTANCE + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] """ if pieces["closest-tag"]: rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. + + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 
0.post0.devDISTANCE + """ + if pieces["closest-tag"]: if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + else: + rendered += ".post0.dev%d" % (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] else: # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] + rendered = "0.post0.dev%d" % pieces["distance"] return rendered @@ -404,12 +508,41 @@ def render_pep440_post(pieces): return rendered +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%d" % pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%s" % pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + def render_pep440_old(pieces): """TAG[.postDISTANCE[.dev0]] . The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: @@ -482,10 +615,14 @@ def render(pieces, style): if style == "pep440": rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) elif style == "pep440-pre": rendered = render_pep440_pre(pieces) elif style == "pep440-post": rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) elif style == "pep440-old": rendered = render_pep440_old(pieces) elif style == "git-describe": @@ -524,7 +661,7 @@ def get_versions(): # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. 
- for i in cfg.versionfile_source.split("/"): + for _ in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: return { diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 3287f78c..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -dask_cuda -numpydoc==1.1.0 -sphinx_click==2.7.1 -sphinx_rtd_theme==0.5.1 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..6ed22d82 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,142 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = [ + "setuptools>=64.0.0", + "tomli ; python_version < '3.11'", + "versioneer>=0.24", +] + +[project] +name = "dask-cuda" +dynamic = [ + "version", +] +description = "Utilities for Dask and CUDA interactions" +readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "NVIDIA Corporation" }, +] +license= { text = "Apache-2.0" } +requires-python = ">=3.8" +dependencies = [ + "dask >=2022.9.2", + "distributed >=2022.9.2", + "pynvml >=11.0.0", + "numpy >=1.18.0", + "numba >=0.54", + "pandas >=1.0", + "zict >=0.1.3", +] +classifiers=[ + "Intended Audience :: Developers", + "Topic :: Database", + "Topic :: Scientific/Engineering", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", +] + +[project.scripts] +dask-cuda-worker = "dask_cuda.cli.dask_cuda_worker:go" +dask-config = "dask_cuda.cli.dask_config:go" + +[project.optional-dependencies] +docs = [ + "numpydoc>=1.1.0", + "sphinx", + "sphinx-click>=2.7.1", + "sphinx-rtd-theme>=0.5.1", +] +test = [ + "pytest", +] + +[project.urls] +Homepage = "https://github.com/rapidsai/dask-cuda" + +[tool.coverage.run] +disable_warnings = [ + "include-ignored", +] +include = [ + "dask_cuda/*", +] +omit = [ + "dask_cuda/tests/*", +] + +[tool.isort] +line_length = 88 +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +combine_as_imports = true +order_by_type = true +known_dask = [ + "dask", + "distributed", +] +known_rapids = [ + "rmm", + "cuml", + "cugraph", + "dask_cudf", + "cudf", + "ucp", +] +known_first_party = [ + "dask_cuda", +] +default_section = "THIRDPARTY" +sections = [ + "FUTURE", + "STDLIB", + "THIRDPARTY", + "DASK", + "RAPIDS", + "FIRSTPARTY", + "LOCALFOLDER", +] +skip = [ + ".eggs", + ".git", + ".hg", + ".mypy_cache", + ".tox", + ".venv", + "build", + "dist", + "__init__.py", + "versioneer.py", +] + +[tool.pytest.ini_options] +filterwarnings = [ + "error::DeprecationWarning", + "error::FutureWarning", + "ignore::DeprecationWarning:pkg_resources", + "ignore:distutils Version classes are deprecated.*:DeprecationWarning:", + # tornado 6.2, remove when dask/distributed#6669 is fixed + "ignore:clear_current is deprecated:DeprecationWarning:", + "ignore:make_current is deprecated:DeprecationWarning:", +] + +[tool.setuptools] +license-files = ["LICENSE"] + +[tool.setuptools.packages.find] +exclude = [ + "docs", + "tests", + "docs.*", + "tests.*", +] + +[tool.versioneer] +VCS = "git" +style = "pep440" +versionfile_source = "dask_cuda/_version.py" +versionfile_build = "dask_cuda/_version.py" +tag_prefix = "v" +parentdir_prefix = "dask_cuda-" diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index c833595a..00000000 --- a/pytest.ini +++ /dev/null @@ -1,9 +0,0 @@ -[pytest] -filterwarnings = - error::DeprecationWarning - error::FutureWarning - 
ignore::DeprecationWarning:pkg_resources - ignore:distutils Version classes are deprecated.*:DeprecationWarning: - # tornado 6.2, remove when dask/distributed#6669 is fixed - ignore:clear_current is deprecated:DeprecationWarning: - ignore:make_current is deprecated:DeprecationWarning: diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 7b40e89d..00000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -dask>=2022.9.2 -distributed>=2022.9.2 -pynvml>=11.0.0 -numpy>=1.16.0 -numba>=0.54 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 143a9d62..00000000 --- a/setup.cfg +++ /dev/null @@ -1,55 +0,0 @@ -[versioneer] -VCS = git -style = pep440 -versionfile_source = dask_cuda/_version.py -versionfile_build = dask_cuda/_version.py -tag_prefix = v -parentdir_prefix = dask_cuda- - -[flake8] -exclude = docs, __init__.py, versioneer.py -max-line-length = 88 -ignore = - # Assigning lambda expression - E731 - # Ambiguous variable names - E741 - # line break before binary operator - W503 - # whitespace before : - E203 - # whitespace after , - E231 - -[isort] -line_length=88 -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -order_by_type=True -known_dask= - dask - distributed -known_rapids= - rmm - cuml - cugraph - dask_cudf - cudf - ucp -known_first_party= - dask_cuda -default_section=THIRDPARTY -sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER -skip= - .eggs - .git - .hg - .mypy_cache - .tox - .venv - build - dist - __init__.py - versioneer.py diff --git a/setup.py b/setup.py index 69b825f9..fa90437b 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,9 @@ import os -from codecs import open -from setuptools import find_packages, setup +from setuptools import setup import versioneer -# Get the long description from the README file -with open(os.path.join(os.path.dirname(__file__), "README.md")) as f: - long_description = f.read() - if "GIT_DESCRIBE_TAG" in os.environ: # Disgusting hack. For pypi uploads we cannot use the # versioneer-provided version for non-release builds, since they @@ -32,32 +27,6 @@ def get_versions(): setup( - name="dask-cuda", version=versioneer.get_version(), cmdclass=versioneer.get_cmdclass(), - description="Utilities for Dask and CUDA interactions", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/rapidsai/dask-cuda", - author="RAPIDS development team", - author_email="mrocklin@nvidia.com", - license="Apache-2.0", - license_files=["LICENSE"], - classifiers=[ - "Intended Audience :: Developers", - "Topic :: Database", - "Topic :: Scientific/Engineering", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - ], - packages=find_packages(exclude=["docs", "tests", "tests.*", "docs.*"]), - python_requires=">=3.8", - install_requires=open("requirements.txt").read().strip().split("\n"), - entry_points=""" - [console_scripts] - dask-cuda-worker=dask_cuda.cli.dask_cuda_worker:go - dask-config=dask_cuda.cli.dask_config:go - """, ) diff --git a/versioneer.py b/versioneer.py deleted file mode 100644 index 2b545405..00000000 --- a/versioneer.py +++ /dev/null @@ -1,1885 +0,0 @@ -# Version: 0.18 - -"""The Versioneer - like a rocketeer, but for versions. - -The Versioneer -============== - -* like a rocketeer, but for versions! 
-* https://github.com/warner/python-versioneer -* Brian Warner -* License: Public Domain -* Compatible With: python2.6, 2.7, 3.2, 3.3, 3.4, 3.5, 3.6, and pypy -* [![Latest Version] -(https://pypip.in/version/versioneer/badge.svg?style=flat) -](https://pypi.python.org/pypi/versioneer/) -* [![Build Status] -(https://travis-ci.org/warner/python-versioneer.png?branch=master) -](https://travis-ci.org/warner/python-versioneer) - -This is a tool for managing a recorded version number in distutils-based -python projects. The goal is to remove the tedious and error-prone "update -the embedded version string" step from your release process. Making a new -release should be as easy as recording a new tag in your version-control -system, and maybe making new tarballs. - - -## Quick Install - -* `pip install versioneer` to somewhere to your $PATH -* add a `[versioneer]` section to your setup.cfg (see below) -* run `versioneer install` in your source tree, commit the results - -## Version Identifiers - -Source trees come from a variety of places: - -* a version-control system checkout (mostly used by developers) -* a nightly tarball, produced by build automation -* a snapshot tarball, produced by a web-based VCS browser, like github's - "tarball from tag" feature -* a release tarball, produced by "setup.py sdist", distributed through PyPI - -Within each source tree, the version identifier (either a string or a number, -this tool is format-agnostic) can come from a variety of places: - -* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows - about recent "tags" and an absolute revision-id -* the name of the directory into which the tarball was unpacked -* an expanded VCS keyword ($Id$, etc) -* a `_version.py` created by some earlier build step - -For released software, the version identifier is closely related to a VCS -tag. Some projects use tag names that include more than just the version -string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool -needs to strip the tag prefix to extract the version identifier. For -unreleased software (between tags), the version identifier should provide -enough information to help developers recreate the same tree, while also -giving them an idea of roughly how old the tree is (after version 1.2, before -version 1.3). Many VCS systems can report a description that captures this, -for example `git describe --tags --dirty --always` reports things like -"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the -0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has -uncommitted changes. - -The version identifier is used for multiple purposes: - -* to allow the module to self-identify its version: `myproject.__version__` -* to choose a name and prefix for a 'setup.py sdist' tarball - -## Theory of Operation - -Versioneer works by adding a special `_version.py` file into your source -tree, where your `__init__.py` can import it. This `_version.py` knows how to -dynamically ask the VCS tool for version information at import time. - -`_version.py` also contains `$Revision$` markers, and the installation -process marks `_version.py` to have this marker rewritten with a tag name -during the `git archive` command. As a result, generated tarballs will -contain enough information to get the proper version. - -To allow `setup.py` to compute a version too, a `versioneer.py` is added to -the top level of your source tree, next to `setup.py` and the `setup.cfg` -that configures it. 
This overrides several distutils/setuptools commands to -compute the version when invoked, and changes `setup.py build` and `setup.py -sdist` to replace `_version.py` with a small static file that contains just -the generated version data. - -## Installation - -See [INSTALL.md](./INSTALL.md) for detailed installation instructions. - -## Version-String Flavors - -Code which uses Versioneer can learn about its version string at runtime by -importing `_version` from your main `__init__.py` file and running the -`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can -import the top-level `versioneer.py` and run `get_versions()`. - -Both functions return a dictionary with different flavors of version -information: - -* `['version']`: A condensed version string, rendered using the selected - style. This is the most commonly used value for the project's version - string. The default "pep440" style yields strings like `0.11`, - `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section - below for alternative styles. - -* `['full-revisionid']`: detailed revision identifier. For Git, this is the - full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". - -* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the - commit date in ISO 8601 format. This will be None if the date is not - available. - -* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that - this is only accurate if run in a VCS checkout, otherwise it is likely to - be False or None - -* `['error']`: if the version string could not be computed, this will be set - to a string describing the problem, otherwise it will be None. It may be - useful to throw an exception in setup.py if this is set, to avoid e.g. - creating tarballs with a version string of "unknown". - -Some variants are more useful than others. Including `full-revisionid` in a -bug report should allow developers to reconstruct the exact code being tested -(or indicate the presence of local changes that should be shared with the -developers). `version` is suitable for display in an "about" box or a CLI -`--version` output: it can be easily compared against release notes and lists -of bugs fixed in various releases. - -The installer adds the following text to your `__init__.py` to place a basic -version in `YOURPROJECT.__version__`: - - from ._version import get_versions - __version__ = get_versions()['version'] - del get_versions - -## Styles - -The setup.cfg `style=` configuration controls how the VCS information is -rendered into a version string. - -The default style, "pep440", produces a PEP440-compliant string, equal to the -un-prefixed tag name for actual releases, and containing an additional "local -version" section with more detail for in-between builds. For Git, this is -TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags ---dirty --always`. For example "0.11+2.g1076c97.dirty" indicates that the -tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and -that this commit is two revisions ("+2") beyond the "0.11" tag. For released -software (exactly equal to a known tag), the identifier will only contain the -stripped tag, e.g. "0.11". - -Other styles are available. See [details.md](details.md) in the Versioneer -source tree for descriptions. - -## Debugging - -Versioneer tries to avoid fatal errors: if something goes wrong, it will tend -to return a version of "0+unknown". 
To investigate the problem, run `setup.py -version`, which will run the version-lookup code in a verbose mode, and will -display the full contents of `get_versions()` (including the `error` string, -which may help identify what went wrong). - -## Known Limitations - -Some situations are known to cause problems for Versioneer. This details the -most significant ones. More can be found on Github -[issues page](https://github.com/warner/python-versioneer/issues). - -### Subprojects - -Versioneer has limited support for source trees in which `setup.py` is not in -the root directory (e.g. `setup.py` and `.git/` are *not* siblings). The are -two common reasons why `setup.py` might not be in the root: - -* Source trees which contain multiple subprojects, such as - [Buildbot](https://github.com/buildbot/buildbot), which contains both - "master" and "slave" subprojects, each with their own `setup.py`, - `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI - distributions (and upload multiple independently-installable tarballs). -* Source trees whose main purpose is to contain a C library, but which also - provide bindings to Python (and perhaps other langauges) in subdirectories. - -Versioneer will look for `.git` in parent directories, and most operations -should get the right version string. However `pip` and `setuptools` have bugs -and implementation details which frequently cause `pip install .` from a -subproject directory to fail to find a correct version string (so it usually -defaults to `0+unknown`). - -`pip install --editable .` should work correctly. `setup.py install` might -work too. - -Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in -some later version. - -[Bug #38](https://github.com/warner/python-versioneer/issues/38) is tracking -this issue. The discussion in -[PR #61](https://github.com/warner/python-versioneer/pull/61) describes the -issue from the Versioneer side in more detail. -[pip PR#3176](https://github.com/pypa/pip/pull/3176) and -[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve -pip to let Versioneer work correctly. - -Versioneer-0.16 and earlier only looked for a `.git` directory next to the -`setup.cfg`, so subprojects were completely unsupported with those releases. - -### Editable installs with setuptools <= 18.5 - -`setup.py develop` and `pip install --editable .` allow you to install a -project into a virtualenv once, then continue editing the source code (and -test) without re-installing after every change. - -"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a -convenient way to specify executable scripts that should be installed along -with the python package. - -These both work as expected when using modern setuptools. When using -setuptools-18.5 or earlier, however, certain operations will cause -`pkg_resources.DistributionNotFound` errors when running the entrypoint -script, which must be resolved by re-installing the package. This happens -when the install happens with one version, then the egg_info data is -regenerated while a different version is checked out. Many setup.py commands -cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into -a different virtualenv), so this can be surprising. - -[Bug #83](https://github.com/warner/python-versioneer/issues/83) describes -this one, but upgrading to a newer version of setuptools should probably -resolve it. 
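As an editorial illustration of the version-string flavors documented above (this block is not part of the deleted `versioneer.py`): a short sketch of how the `get_versions()` dictionary is typically consumed at runtime, assuming the versioneer-generated `dask_cuda/_version.py` named in the project configuration:

```python
# Illustrative only: consuming the get_versions() flavors described above.
# Assumes the generated dask_cuda/_version.py is importable.
from dask_cuda import _version

info = _version.get_versions()
print(info["version"])          # condensed string, e.g. "0.11+2.g1076c97.dirty"
print(info["full-revisionid"])  # full SHA1 of the commit, or None
print(info["dirty"])            # True if the checkout had uncommitted changes
if info["error"]:
    # e.g. an sdist/zipball with neither expanded keywords nor VCS metadata
    raise RuntimeError("could not compute version: %s" % info["error"])
```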
- -### Unicode version strings - -While Versioneer works (and is continually tested) with both Python 2 and -Python 3, it is not entirely consistent with bytes-vs-unicode distinctions. -Newer releases probably generate unicode version strings on py2. It's not -clear that this is wrong, but it may be surprising for applications when then -write these strings to a network connection or include them in bytes-oriented -APIs like cryptographic checksums. - -[Bug #71](https://github.com/warner/python-versioneer/issues/71) investigates -this question. - - -## Updating Versioneer - -To upgrade your project to a new release of Versioneer, do the following: - -* install the new Versioneer (`pip install -U versioneer` or equivalent) -* edit `setup.cfg`, if necessary, to include any new configuration settings - indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. -* re-run `versioneer install` in your source tree, to replace - `SRC/_version.py` -* commit any changed files - -## Future Directions - -This tool is designed to make it easily extended to other version-control -systems: all VCS-specific components are in separate directories like -src/git/ . The top-level `versioneer.py` script is assembled from these -components by running make-versioneer.py . In the future, make-versioneer.py -will take a VCS name as an argument, and will construct a version of -`versioneer.py` that is specific to the given VCS. It might also take the -configuration arguments that are currently provided manually during -installation by editing setup.py . Alternatively, it might go the other -direction and include code from all supported VCS systems, reducing the -number of intermediate scripts. - - -## License - -To make Versioneer easier to embed, all its code is dedicated to the public -domain. The `_version.py` that it creates is also in the public domain. -Specifically, both are released under the Creative Commons "Public Domain -Dedication" license (CC0-1.0), as described in -https://creativecommons.org/publicdomain/zero/1.0/ . - -""" - -from __future__ import print_function - -try: - import configparser -except ImportError: - import ConfigParser as configparser -import errno -import json -import os -import re -import subprocess -import sys - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_root(): - """Get the project root directory. - - We require that all commands are run from the project root, i.e. the - directory that contains setup.py, setup.cfg, and versioneer.py . - """ - root = os.path.realpath(os.path.abspath(os.getcwd())) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - # allow 'python path/to/setup.py COMMAND' - root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0]))) - setup_py = os.path.join(root, "setup.py") - versioneer_py = os.path.join(root, "versioneer.py") - if not (os.path.exists(setup_py) or os.path.exists(versioneer_py)): - err = ( - "Versioneer was unable to run the project root directory. " - "Versioneer requires setup.py to be executed from " - "its immediate directory (like 'python setup.py COMMAND'), " - "or in a way that lets it use sys.argv[0] to find the root " - "(like 'python path/to/setup.py COMMAND')." 
- ) - raise VersioneerBadRootError(err) - try: - # Certain runtime workflows (setup.py install/develop in a setuptools - # tree) execute all dependencies in a single python process, so - # "versioneer" may be imported multiple times, and python's shared - # module-import table will cache the first one. So we can't use - # os.path.dirname(__file__), as that will find whichever - # versioneer.py was first imported, even in later projects. - me = os.path.realpath(os.path.abspath(__file__)) - me_dir = os.path.normcase(os.path.splitext(me)[0]) - vsr_dir = os.path.normcase(os.path.splitext(versioneer_py)[0]) - if me_dir != vsr_dir: - print( - "Warning: build in %s is using versioneer.py from %s" - % (os.path.dirname(me), versioneer_py) - ) - except NameError: - pass - return root - - -def get_config_from_root(root): - """Read the project setup.cfg file to determine Versioneer config.""" - # This might raise EnvironmentError (if setup.cfg is missing), or - # configparser.NoSectionError (if it lacks a [versioneer] section), or - # configparser.NoOptionError (if it lacks "VCS="). See the docstring at - # the top of versioneer.py for instructions on writing your setup.cfg . - setup_cfg = os.path.join(root, "setup.cfg") - parser = configparser.SafeConfigParser() - with open(setup_cfg, "r") as f: - parser.readfp(f) - VCS = parser.get("versioneer", "VCS") # mandatory - - def get(parser, name): - if parser.has_option("versioneer", name): - return parser.get("versioneer", name) - return None - - cfg = VersioneerConfig() - cfg.VCS = VCS - cfg.style = get(parser, "style") or "" - cfg.versionfile_source = get(parser, "versionfile_source") - cfg.versionfile_build = get(parser, "versionfile_build") - cfg.tag_prefix = get(parser, "tag_prefix") - if cfg.tag_prefix in ("''", '""'): - cfg.tag_prefix = "" - cfg.parentdir_prefix = get(parser, "parentdir_prefix") - cfg.verbose = get(parser, "verbose") - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -# these dictionaries contain VCS-specific tools -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen( - [c] + args, - cwd=cwd, - env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr else None), - ) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode - - -LONG_VERSION_PY[ - "git" -] = ''' -# This file helps to compute a version number in source trees obtained from -# git-archive tarball 
(such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). - git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" - git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" - git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "%(STYLE)s" - cfg.tag_prefix = "%(TAG_PREFIX)s" - cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" - cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %%s" %% dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %%s" %% (commands,)) - return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %%s (error)" %% dispcmd) - print("stdout was %%s" %% stdout) - return None, p.returncode - return stdout, p.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. 
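A worked illustration of the parentdir fallback described above (editorial, not part of the deleted file), using dask-cuda's configured `parentdir_prefix = "dask_cuda-"`; the path is hypothetical:

```python
# Illustrative only: how versions_from_parentdir() derives a version when an
# sdist unpacks into a prefix-named directory.
import os

parentdir_prefix = "dask_cuda-"     # from [tool.versioneer] / old setup.cfg
root = "/tmp/dask_cuda-22.12.00"    # hypothetical unpacked tarball directory

dirname = os.path.basename(root)
if dirname.startswith(parentdir_prefix):
    print(dirname[len(parentdir_prefix):])  # -> "22.12.00"
```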
We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %%s but none started with prefix %%s" %% - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %%d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%%s', no digits" %% ",".join(refs - tags)) - if verbose: - print("likely tags: %%s" %% ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. 
"2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %%s" %% r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %%s not under git control" %% root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%%s*" %% tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%%s'" - %% describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%%s' doesn't start with prefix '%%s'" - print(fmt %% (full_tag, tag_prefix)) - pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" - %% (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%%ci", "HEAD"], - cwd=root)[0].strip() - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%%d" %% pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%%d" %% pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%%s" %% pieces["short"] - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%%s" %% pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%%d" %% pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%%s'" %% style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. 
- for i in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} -''' - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. - keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG) :] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r"\d", r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. 
"2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix) :] - if verbose: - print("picking %s" % r) - return { - "version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": None, - "date": date, - } - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return { - "version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, - "error": "no suitable tags", - "date": None, - } - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command( - GITS, - [ - "describe", - "--tags", - "--dirty", - "--always", - "--long", - "--match", - "%s*" % tag_prefix, - ], - cwd=root, - ) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[: git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, - ) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix) :] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ - 0 - ].strip() - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def do_vcs_install(manifest_in, versionfile_source, ipy): - """Git-specific installation logic for Versioneer. - - For Git, this means creating/changing .gitattributes to mark _version.py - for export-subst keyword substitution. - """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - files = [manifest_in, versionfile_source] - if ipy: - files.append(ipy) - try: - me = __file__ - if me.endswith(".pyc") or me.endswith(".pyo"): - me = os.path.splitext(me)[0] + ".py" - versioneer_file = os.path.relpath(me) - except NameError: - versioneer_file = "versioneer.py" - files.append(versioneer_file) - present = False - try: - f = open(".gitattributes", "r") - for line in f.readlines(): - if line.strip().startswith(versionfile_source): - if "export-subst" in line.strip().split()[1:]: - present = True - f.close() - except EnvironmentError: - pass - if not present: - f = open(".gitattributes", "a+") - f.write("%s export-subst\n" % versionfile_source) - f.close() - files.append(".gitattributes") - run_command(GITS, ["add", "--"] + files) - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return { - "version": dirname[len(parentdir_prefix) :], - "full-revisionid": None, - "dirty": False, - "error": None, - "date": None, - } - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print( - "Tried directories %s but none started with prefix %s" - % (str(rootdirs), parentdir_prefix) - ) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -SHORT_VERSION_PY = """ -# This file was generated by 'versioneer.py' (0.18) from -# revision-control system data, or from the parent directory name of an -# unpacked source archive. Distribution tarballs contain a pre-generated copy -# of this file. 
- -import json - -version_json = ''' -%s -''' # END VERSION_JSON - - -def get_versions(): - return json.loads(version_json) -""" - - -def versions_from_file(filename): - """Try to determine the version from _version.py if present.""" - try: - with open(filename) as f: - contents = f.read() - except EnvironmentError: - raise NotThisMethod("unable to read _version.py") - mo = re.search( - r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) - if not mo: - mo = re.search( - r"version_json = '''\r\n(.*)''' # END VERSION_JSON", contents, re.M | re.S - ) - if not mo: - raise NotThisMethod("no version_json in _version.py") - return json.loads(mo.group(1)) - - -def write_to_version_file(filename, versions): - """Write the given version number to the given _version.py file.""" - os.unlink(filename) - contents = json.dumps(versions, sort_keys=True, indent=1, separators=(",", ": ")) - with open(filename, "w") as f: - f.write(SHORT_VERSION_PY % contents) - - print("set %s to '%s'" % (filename, versions["version"])) - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 
0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return { - "version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None, - } - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return { - "version": rendered, - "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], - "error": None, - "date": pieces.get("date"), - } - - -class VersioneerBadRootError(Exception): - """The project root directory is unknown or missing key files.""" - - -def get_versions(verbose=False): - """Get the project version from whatever source is available. - - Returns dict with two keys: 'version' and 'full'. - """ - if "versioneer" in sys.modules: - # see the discussion in cmdclass.py:get_cmdclass() - del sys.modules["versioneer"] - - root = get_root() - cfg = get_config_from_root(root) - - assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg" - handlers = HANDLERS.get(cfg.VCS) - assert handlers, "unrecognized VCS '%s'" % cfg.VCS - verbose = verbose or cfg.verbose - assert ( - cfg.versionfile_source is not None - ), "please set versioneer.versionfile_source" - assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix" - - versionfile_abs = os.path.join(root, cfg.versionfile_source) - - # extract version from first of: _version.py, VCS command (e.g. 'git - # describe'), parentdir. This is meant to work for developers using a - # source checkout, for users of a tarball created by 'setup.py sdist', - # and for users of a tarball/zipball created by 'git archive' or github's - # download-from-tag feature or the equivalent in other VCSes. 
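To make the style renderers defined above concrete, here is an editorial worked example (not part of the diff); the `pieces` values are hypothetical and reuse the sample hash from the docstring:

```python
# Illustrative only: what the render_* helpers above produce for a checkout
# two commits past the v22.10.00 tag with uncommitted changes.
pieces = {
    "closest-tag": "22.10.00",  # tag_prefix "v" already stripped
    "distance": 2,              # commits since that tag
    "short": "1076c97",         # abbreviated commit id (sample from docstring)
    "dirty": True,              # uncommitted changes present
    "long": "1076c978a8d3cfc70f408fe5974aa6c092c949ac",
    "error": None,
    "date": None,
}
# style               -> rendered version string
# "pep440"            -> "22.10.00+2.g1076c97.dirty"
# "pep440-pre"        -> "22.10.00.post.dev2"
# "pep440-post"       -> "22.10.00.post2.dev0+g1076c97"
# "pep440-old"        -> "22.10.00.post2.dev0"
# "git-describe"      -> "22.10.00-2-g1076c97-dirty"
# "git-describe-long" -> "22.10.00-2-g1076c97-dirty"
```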
- - get_keywords_f = handlers.get("get_keywords") - from_keywords_f = handlers.get("keywords") - if get_keywords_f and from_keywords_f: - try: - keywords = get_keywords_f(versionfile_abs) - ver = from_keywords_f(keywords, cfg.tag_prefix, verbose) - if verbose: - print("got version from expanded keyword %s" % ver) - return ver - except NotThisMethod: - pass - - try: - ver = versions_from_file(versionfile_abs) - if verbose: - print("got version from file %s %s" % (versionfile_abs, ver)) - return ver - except NotThisMethod: - pass - - from_vcs_f = handlers.get("pieces_from_vcs") - if from_vcs_f: - try: - pieces = from_vcs_f(cfg.tag_prefix, root, verbose) - ver = render(pieces, cfg.style) - if verbose: - print("got version from VCS %s" % ver) - return ver - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - if verbose: - print("got version from parentdir %s" % ver) - return ver - except NotThisMethod: - pass - - if verbose: - print("unable to compute version") - - return { - "version": "0+unknown", - "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", - "date": None, - } - - -def get_version(): - """Get the short version string for this project.""" - return get_versions()["version"] - - -def get_cmdclass(): - """Get the custom setuptools/distutils subclasses used by Versioneer.""" - if "versioneer" in sys.modules: - del sys.modules["versioneer"] - # this fixes the "python setup.py develop" case (also 'install' and - # 'easy_install .'), in which subdependencies of the main project are - # built (using setup.py bdist_egg) in the same python process. Assume - # a main project A and a dependency B, which use different versions - # of Versioneer. A's setup.py imports A's Versioneer, leaving it in - # sys.modules by the time B's setup.py is executed, causing B to run - # with the wrong versioneer. Setuptools wraps the sub-dep builds in a - # sandbox that restores sys.modules to it's pre-build state, so the - # parent is protected against the child's "import versioneer". By - # removing ourselves from sys.modules here, before the child build - # happens, we protect the child from the parent's versioneer too. - # Also see https://github.com/warner/python-versioneer/issues/52 - - cmds = {} - - # we add "version" to both distutils and setuptools - from distutils.core import Command - - class cmd_version(Command): - description = "report generated version string" - user_options = [] - boolean_options = [] - - def initialize_options(self): - pass - - def finalize_options(self): - pass - - def run(self): - vers = get_versions(verbose=True) - print("Version: %s" % vers["version"]) - print(" full-revisionid: %s" % vers.get("full-revisionid")) - print(" dirty: %s" % vers.get("dirty")) - print(" date: %s" % vers.get("date")) - if vers["error"]: - print(" error: %s" % vers["error"]) - - cmds["version"] = cmd_version - - # we override "build_py" in both distutils and setuptools - # - # most invocation pathways end up running build_py: - # distutils/build -> build_py - # distutils/install -> distutils/build ->.. - # setuptools/bdist_wheel -> distutils/install ->.. - # setuptools/bdist_egg -> distutils/install_lib -> build_py - # setuptools/install -> bdist_egg ->.. - # setuptools/develop -> ? 
- # pip install: - # copies source tree to a tempdir before running egg_info/etc - # if .git isn't copied too, 'git describe' will fail - # then does setup.py bdist_wheel, or sometimes setup.py install - # setup.py egg_info -> ? - - # we override different "build_py" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.build_py import build_py as _build_py - else: - from distutils.command.build_py import build_py as _build_py - - class cmd_build_py(_build_py): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - _build_py.run(self) - # now locate _version.py in the new build/ directory and replace - # it with an updated value - if cfg.versionfile_build: - target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - cmds["build_py"] = cmd_build_py - - if "cx_Freeze" in sys.modules: # cx_freeze enabled? - from cx_Freeze.dist import build_exe as _build_exe - - # nczeczulin reports that py2exe won't like the pep440-style string - # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. - # setup(console=[{ - # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION - # "product_version": versioneer.get_version(), - # ... - - class cmd_build_exe(_build_exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _build_exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - cmds["build_exe"] = cmd_build_exe - del cmds["build_py"] - - if "py2exe" in sys.modules: # py2exe enabled? 
- try: - from py2exe.distutils_buildexe import py2exe as _py2exe # py3 - except ImportError: - from py2exe.build_exe import py2exe as _py2exe # py2 - - class cmd_py2exe(_py2exe): - def run(self): - root = get_root() - cfg = get_config_from_root(root) - versions = get_versions() - target_versionfile = cfg.versionfile_source - print("UPDATING %s" % target_versionfile) - write_to_version_file(target_versionfile, versions) - - _py2exe.run(self) - os.unlink(target_versionfile) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - cmds["py2exe"] = cmd_py2exe - - # we override different "sdist" commands for both environments - if "setuptools" in sys.modules: - from setuptools.command.sdist import sdist as _sdist - else: - from distutils.command.sdist import sdist as _sdist - - class cmd_sdist(_sdist): - def run(self): - versions = get_versions() - self._versioneer_generated_versions = versions - # unless we update this, the command will keep using the old - # version - self.distribution.metadata.version = versions["version"] - return _sdist.run(self) - - def make_release_tree(self, base_dir, files): - root = get_root() - cfg = get_config_from_root(root) - _sdist.make_release_tree(self, base_dir, files) - # now locate _version.py in the new base_dir directory - # (remembering that it may be a hardlink) and replace it with an - # updated value - target_versionfile = os.path.join(base_dir, cfg.versionfile_source) - print("UPDATING %s" % target_versionfile) - write_to_version_file( - target_versionfile, self._versioneer_generated_versions - ) - - cmds["sdist"] = cmd_sdist - - return cmds - - -CONFIG_ERROR = """ -setup.cfg is missing the necessary Versioneer configuration. You need -a section like: - - [versioneer] - VCS = git - style = pep440 - versionfile_source = src/myproject/_version.py - versionfile_build = myproject/_version.py - tag_prefix = - parentdir_prefix = myproject- - -You will also need to edit your setup.py to use the results: - - import versioneer - setup(version=versioneer.get_version(), - cmdclass=versioneer.get_cmdclass(), ...) - -Please read the docstring in ./versioneer.py for configuration instructions, -edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. -""" - -SAMPLE_CONFIG = """ -# See the docstring in versioneer.py for instructions. Note that you must -# re-run 'versioneer.py setup' after changing this section, and commit the -# resulting files. 
- -[versioneer] -#VCS = git -#style = pep440 -#versionfile_source = -#versionfile_build = -#tag_prefix = -#parentdir_prefix = - -""" - -INIT_PY_SNIPPET = """ -from ._version import get_versions -__version__ = get_versions()['version'] -del get_versions -""" - - -def do_setup(): - """Main VCS-independent setup function for installing Versioneer.""" - root = get_root() - try: - cfg = get_config_from_root(root) - except ( - EnvironmentError, - configparser.NoSectionError, - configparser.NoOptionError, - ) as e: - if isinstance(e, (EnvironmentError, configparser.NoSectionError)): - print("Adding sample versioneer config to setup.cfg", file=sys.stderr) - with open(os.path.join(root, "setup.cfg"), "a") as f: - f.write(SAMPLE_CONFIG) - print(CONFIG_ERROR, file=sys.stderr) - return 1 - - print(" creating %s" % cfg.versionfile_source) - with open(cfg.versionfile_source, "w") as f: - LONG = LONG_VERSION_PY[cfg.VCS] - f.write( - LONG - % { - "DOLLAR": "$", - "STYLE": cfg.style, - "TAG_PREFIX": cfg.tag_prefix, - "PARENTDIR_PREFIX": cfg.parentdir_prefix, - "VERSIONFILE_SOURCE": cfg.versionfile_source, - } - ) - - ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") - if os.path.exists(ipy): - try: - with open(ipy, "r") as f: - old = f.read() - except EnvironmentError: - old = "" - if INIT_PY_SNIPPET not in old: - print(" appending to %s" % ipy) - with open(ipy, "a") as f: - f.write(INIT_PY_SNIPPET) - else: - print(" %s unmodified" % ipy) - else: - print(" %s doesn't exist, ok" % ipy) - ipy = None - - # Make sure both the top-level "versioneer.py" and versionfile_source - # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so - # they'll be copied into source distributions. Pip won't be able to - # install the package without this. - manifest_in = os.path.join(root, "MANIFEST.in") - simple_includes = set() - try: - with open(manifest_in, "r") as f: - for line in f: - if line.startswith("include "): - for include in line.split()[1:]: - simple_includes.add(include) - except EnvironmentError: - pass - # That doesn't cover everything MANIFEST.in can do - # (http://docs.python.org/2/distutils/sourcedist.html#commands), so - # it might give some false negatives. Appending redundant 'include' - # lines is safe, though. - if "versioneer.py" not in simple_includes: - print(" appending 'versioneer.py' to MANIFEST.in") - with open(manifest_in, "a") as f: - f.write("include versioneer.py\n") - else: - print(" 'versioneer.py' already in MANIFEST.in") - if cfg.versionfile_source not in simple_includes: - print( - " appending versionfile_source ('%s') to MANIFEST.in" - % cfg.versionfile_source - ) - with open(manifest_in, "a") as f: - f.write("include %s\n" % cfg.versionfile_source) - else: - print(" versionfile_source already in MANIFEST.in") - - # Make VCS-specific changes. For git, this means creating/changing - # .gitattributes to mark _version.py for export-subst keyword - # substitution. 
-    do_vcs_install(manifest_in, cfg.versionfile_source, ipy)
-    return 0
-
-
-def scan_setup_py():
-    """Validate the contents of setup.py against Versioneer's expectations."""
-    found = set()
-    setters = False
-    errors = 0
-    with open("setup.py", "r") as f:
-        for line in f.readlines():
-            if "import versioneer" in line:
-                found.add("import")
-            if "versioneer.get_cmdclass()" in line:
-                found.add("cmdclass")
-            if "versioneer.get_version()" in line:
-                found.add("get_version")
-            if "versioneer.VCS" in line:
-                setters = True
-            if "versioneer.versionfile_source" in line:
-                setters = True
-    if len(found) != 3:
-        print("")
-        print("Your setup.py appears to be missing some important items")
-        print("(but I might be wrong). Please make sure it has something")
-        print("roughly like the following:")
-        print("")
-        print(" import versioneer")
-        print(" setup( version=versioneer.get_version(),")
-        print(" cmdclass=versioneer.get_cmdclass(), ...)")
-        print("")
-        errors += 1
-    if setters:
-        print("You should remove lines like 'versioneer.VCS = ' and")
-        print("'versioneer.versionfile_source = ' . This configuration")
-        print("now lives in setup.cfg, and should be removed from setup.py")
-        print("")
-        errors += 1
-    return errors
-
-
-if __name__ == "__main__":
-    cmd = sys.argv[1]
-    if cmd == "setup":
-        errors = do_setup()
-        errors += scan_setup_py()
-        if errors:
-            sys.exit(1)

From 6a94f237c0d6be99fe5a192b5e4cdf483a4cc976 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen"
Date: Tue, 22 Nov 2022 15:08:37 +0100
Subject: [PATCH 20/25] Support cuDF's built-in spilling (#984)

Support for the [new built-in spilling in cuDF](https://github.com/rapidsai/cudf/pull/12106), so that `device_memory_limit` and `memory_limit` ignore cuDF's device buffers.

This is only implemented for `DeviceHostFile`. Since jit-unspill also targets cuDF and libraries such as cupy aren't supported, I don't think it is important to support cuDF's built-in spilling in `ProxifyHostFile`.

For now, `DeviceHostFile` simply ignores cuDF's device buffers and lets cuDF handle the spilling. This means that `DeviceHostFile` might estimate the device and host memory usage incorrectly ([or more incorrectly than usual](https://github.com/dask/distributed/issues/4568#issuecomment-805049321)).

Authors: - Mads R. B.
Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/984 --- dask_cuda/device_host_file.py | 25 ++- dask_cuda/is_spillable_object.py | 55 +++++++ dask_cuda/proxify_host_file.py | 8 + dask_cuda/tests/test_cudf_builtin_spilling.py | 148 ++++++++++++++++++ dask_cuda/tests/test_device_host_file.py | 12 +- dask_cuda/tests/test_proxify_host_file.py | 2 +- setup.py | 3 +- 7 files changed, 241 insertions(+), 12 deletions(-) create mode 100644 dask_cuda/is_spillable_object.py create mode 100644 dask_cuda/tests/test_cudf_builtin_spilling.py diff --git a/dask_cuda/device_host_file.py b/dask_cuda/device_host_file.py index f31d618b..fb31c3dd 100644 --- a/dask_cuda/device_host_file.py +++ b/dask_cuda/device_host_file.py @@ -1,4 +1,5 @@ import functools +import itertools import logging import os import sys @@ -20,6 +21,7 @@ from distributed.utils import nbytes from .is_device_object import is_device_object +from .is_spillable_object import is_spillable_object from .utils import nvtx_annotate @@ -235,6 +237,9 @@ def __init__( # For Worker compatibility only, where `fast` is host memory buffer self.fast = self.host_buffer if memory_limit is None else self.host_buffer.fast + # Dict of objects that will not be spilled by DeviceHostFile. + self.others = {} + if sys.version_info < (3, 9): def __new__( @@ -268,29 +273,35 @@ def __setitem__(self, key, value): # Make sure we register the removal of an existing key del self[key] - if is_device_object(value): + if is_spillable_object(value): + self.others[key] = value + elif is_device_object(value): self.device_keys.add(key) self.device_buffer[key] = value else: self.host_buffer[key] = value def __getitem__(self, key): - if key in self.device_keys: + if key in self.others: + return self.others[key] + elif key in self.device_keys: return self.device_buffer[key] elif key in self.host_buffer: return self.host_buffer[key] - else: - raise KeyError(key) + raise KeyError(key) def __len__(self): - return len(self.device_buffer) + return len(self.device_buffer) + len(self.others) def __iter__(self): - return iter(self.device_buffer) + return itertools.chain(self.device_buffer, self.others) def __delitem__(self, key): self.device_keys.discard(key) - del self.device_buffer[key] + if key in self.others: + del self.others[key] + else: + del self.device_buffer[key] def evict(self): """Evicts least recently used host buffer (aka, CPU or system memory) diff --git a/dask_cuda/is_spillable_object.py b/dask_cuda/is_spillable_object.py new file mode 100644 index 00000000..9e337aa8 --- /dev/null +++ b/dask_cuda/is_spillable_object.py @@ -0,0 +1,55 @@ +from __future__ import absolute_import, division, print_function + +from typing import Optional + +from dask.utils import Dispatch + +is_spillable_object = Dispatch(name="is_spillable_object") + + +@is_spillable_object.register(list) +@is_spillable_object.register(tuple) +@is_spillable_object.register(set) +@is_spillable_object.register(frozenset) +def _(seq): + return any([is_spillable_object(s) for s in seq]) + + +@is_spillable_object.register(dict) +def _(seq): + return any([is_spillable_object(s) for s in seq.items()]) + + +@is_spillable_object.register(object) +def _(o): + return False + + +@is_spillable_object.register_lazy("cudf") +def register_cudf(): + import cudf + from cudf.core.frame import Frame + + @is_spillable_object.register(Frame) + def is_device_object_cudf_dataframe(df): + return cudf_spilling_status() + + 
@is_spillable_object.register(cudf.BaseIndex) + def is_device_object_cudf_index(s): + return cudf_spilling_status() + + +def cudf_spilling_status() -> Optional[bool]: + """Check the status of cudf's build-in spilling + + Returns: + - True if cudf's internal spilling is enabled, or + - False if it is disabled, or + - None if the current version of cudf doesn't support spilling, or + - None if cudf isn't available. + """ + try: + from cudf.core.buffer.spill_manager import get_global_manager + except ImportError: + return None + return get_global_manager() is not None diff --git a/dask_cuda/proxify_host_file.py b/dask_cuda/proxify_host_file.py index dd2e23e0..f258776e 100644 --- a/dask_cuda/proxify_host_file.py +++ b/dask_cuda/proxify_host_file.py @@ -40,6 +40,7 @@ from . import proxify_device_objects as pdo from .disk_io import SpillToDiskProperties, disk_read, disk_write from .get_device_memory_objects import DeviceMemoryId, get_device_memory_ids +from .is_spillable_object import cudf_spilling_status from .proxify_device_objects import proxify_device_objects, unproxify_device_objects from .proxy_object import ProxyObject @@ -500,6 +501,13 @@ def __init__( spill_on_demand: bool = None, gds_spilling: bool = None, ): + if cudf_spilling_status(): + warnings.warn( + "JIT-Unspill and cuDF's built-in spilling don't work together, please " + "disable one of them by setting either `CUDF_SPILL=off` or " + "`DASK_JIT_UNSPILL=off` environment variable." + ) + # each value of self.store is a tuple containing the proxified # object, as well as a boolean indicating whether any # incompatible types were found when proxifying it diff --git a/dask_cuda/tests/test_cudf_builtin_spilling.py b/dask_cuda/tests/test_cudf_builtin_spilling.py new file mode 100644 index 00000000..3e9519ca --- /dev/null +++ b/dask_cuda/tests/test_cudf_builtin_spilling.py @@ -0,0 +1,148 @@ +import pytest + +from distributed.sizeof import safe_sizeof + +from dask_cuda.device_host_file import DeviceHostFile +from dask_cuda.is_spillable_object import is_spillable_object +from dask_cuda.proxify_host_file import ProxifyHostFile + +cupy = pytest.importorskip("cupy") +pandas = pytest.importorskip("pandas") + +pytest.importorskip( + "cudf.core.buffer.spill_manager", + reason="Current version of cudf doesn't support built-in spilling", +) + +import cudf # noqa: E402 +from cudf.core.buffer.spill_manager import ( # noqa: E402 + SpillManager, + get_global_manager, + set_global_manager, +) +from cudf.testing._utils import assert_eq # noqa: E402 + +if get_global_manager() is not None: + pytest.skip( + reason=( + "cannot test cudf built-in spilling, if already enabled globally. " + "Please set the `CUDF_SPILL=off` environment variable." 
+ ), + allow_module_level=True, + ) + + +@pytest.fixture +def manager(request): + """Fixture to enable and make a spilling manager availabe""" + kwargs = dict(getattr(request, "param", {})) + set_global_manager(manager=SpillManager(**kwargs)) + yield get_global_manager() + set_global_manager(manager=None) + + +def test_is_spillable_object_when_cudf_spilling_disabled(): + pdf = pandas.DataFrame({"a": [1, 2, 3]}) + cdf = cudf.DataFrame({"a": [1, 2, 3]}) + assert not is_spillable_object(pdf) + assert not is_spillable_object(cdf) + + +def test_is_spillable_object_when_cudf_spilling_enabled(manager): + pdf = pandas.DataFrame({"a": [1, 2, 3]}) + cdf = cudf.DataFrame({"a": [1, 2, 3]}) + assert not is_spillable_object(pdf) + assert is_spillable_object(cdf) + + +def test_device_host_file_when_cudf_spilling_is_disabled(tmp_path): + tmpdir = tmp_path / "storage" + tmpdir.mkdir() + dhf = DeviceHostFile( + device_memory_limit=1024 * 16, + memory_limit=1024 * 16, + worker_local_directory=tmpdir, + ) + dhf["pandas"] = pandas.DataFrame({"a": [1, 2, 3]}) + dhf["cudf"] = cudf.DataFrame({"a": [1, 2, 3]}) + + assert set(dhf.others.keys()) == set() + assert set(dhf.device.keys()) == set(["cudf"]) + assert set(dhf.host.keys()) == set(["pandas"]) + assert set(dhf.disk.keys()) == set() + + +def test_device_host_file_step_by_step(tmp_path, manager: SpillManager): + tmpdir = tmp_path / "storage" + tmpdir.mkdir() + pdf = pandas.DataFrame({"a": [1, 2, 3]}) + cdf = cudf.DataFrame({"a": [1, 2, 3]}) + dhf = DeviceHostFile( + device_memory_limit=safe_sizeof(pdf), + memory_limit=safe_sizeof(pdf), + worker_local_directory=tmpdir, + ) + dhf["pa1"] = pdf + dhf["cu1"] = cdf + + assert set(dhf.others.keys()) == set(["cu1"]) + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set(["pa1"]) + assert set(dhf.disk.keys()) == set() + assert_eq(dhf["pa1"], dhf["cu1"]) + + dhf["pa2"] = pdf + assert set(dhf.others.keys()) == set(["cu1"]) + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set(["pa2"]) + assert set(dhf.disk.keys()) == set(["pa1"]) + + dhf["cu2"] = cdf + assert set(dhf.others.keys()) == set(["cu1", "cu2"]) + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set(["pa2"]) + assert set(dhf.disk.keys()) == set(["pa1"]) + + del dhf["cu1"] + assert set(dhf.others.keys()) == set(["cu2"]) + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set(["pa2"]) + assert set(dhf.disk.keys()) == set(["pa1"]) + + del dhf["pa2"] + assert set(dhf.others.keys()) == set(["cu2"]) + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set() + assert set(dhf.disk.keys()) == set(["pa1"]) + + del dhf["pa1"] + assert set(dhf.others.keys()) == set(["cu2"]) + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set() + assert set(dhf.disk.keys()) == set() + + del dhf["cu2"] + assert set(dhf.others.keys()) == set() + assert set(dhf.device.keys()) == set() + assert set(dhf.host.keys()) == set() + assert set(dhf.disk.keys()) == set() + + +def test_proxify_host_file(tmp_path_factory, manager: SpillManager): + # Reuse the spill-to-disk dir, if it exist + if ProxifyHostFile._spill_to_disk is None: + tmpdir = tmp_path_factory.mktemp("jit-unspill") + else: + tmpdir = ProxifyHostFile._spill_to_disk.root_dir / ".." 
+ + with pytest.warns( + UserWarning, + match="JIT-Unspill and cuDF's built-in spilling don't work together", + ): + dhf = ProxifyHostFile( + device_memory_limit=1000, + memory_limit=1000, + worker_local_directory=str(tmpdir), + ) + dhf["cu1"] = cudf.DataFrame({"a": [1, 2, 3]}) + del dhf["cu1"] diff --git a/dask_cuda/tests/test_device_host_file.py b/dask_cuda/tests/test_device_host_file.py index e9de6af7..59e06647 100644 --- a/dask_cuda/tests/test_device_host_file.py +++ b/dask_cuda/tests/test_device_host_file.py @@ -64,6 +64,7 @@ def test_device_host_file_short( assert set(dhf.device.keys()) == set() assert set(dhf.host.keys()) == set() assert set(dhf.disk.keys()) == set() + assert set(dhf.others.keys()) == set() def test_device_host_file_step_by_step(tmp_path): @@ -79,41 +80,46 @@ def test_device_host_file_step_by_step(tmp_path): b = cupy.random.random(1000) dhf["a1"] = a - assert set(dhf.device.keys()) == set() assert set(dhf.host.keys()) == set(["a1"]) assert set(dhf.disk.keys()) == set() + assert set(dhf.others.keys()) == set() dhf["b1"] = b - assert set(dhf.device.keys()) == set(["b1"]) assert set(dhf.host.keys()) == set(["a1"]) assert set(dhf.disk.keys()) == set() + assert set(dhf.others.keys()) == set() dhf["b2"] = b assert set(dhf.device.keys()) == set(["b1", "b2"]) assert set(dhf.host.keys()) == set(["a1"]) assert set(dhf.disk.keys()) == set() + assert set(dhf.others.keys()) == set() dhf["b3"] = b assert set(dhf.device.keys()) == set(["b2", "b3"]) assert set(dhf.host.keys()) == set(["a1", "b1"]) assert set(dhf.disk.keys()) == set() + assert set(dhf.others.keys()) == set() dhf["a2"] = a assert set(dhf.device.keys()) == set(["b2", "b3"]) assert set(dhf.host.keys()) == set(["a2", "b1"]) assert set(dhf.disk.keys()) == set(["a1"]) + assert set(dhf.others.keys()) == set() dhf["b4"] = b assert set(dhf.device.keys()) == set(["b3", "b4"]) assert set(dhf.host.keys()) == set(["a2", "b2"]) assert set(dhf.disk.keys()) == set(["a1", "b1"]) + assert set(dhf.others.keys()) == set() dhf["b4"] = b assert set(dhf.device.keys()) == set(["b3", "b4"]) assert set(dhf.host.keys()) == set(["a2", "b2"]) assert set(dhf.disk.keys()) == set(["a1", "b1"]) + assert set(dhf.others.keys()) == set() assert_eq(dhf["a1"], a) del dhf["a1"] @@ -131,11 +137,13 @@ def test_device_host_file_step_by_step(tmp_path): assert set(dhf.device.keys()) == set() assert set(dhf.host.keys()) == set() assert set(dhf.disk.keys()) == set() + assert set(dhf.others.keys()) == set() dhf["x"] = b dhf["x"] = a assert set(dhf.device.keys()) == set() assert set(dhf.host.keys()) == set(["x"]) + assert set(dhf.others.keys()) == set() @pytest.mark.parametrize("collection", [dict, list, tuple]) diff --git a/dask_cuda/tests/test_proxify_host_file.py b/dask_cuda/tests/test_proxify_host_file.py index 992679dc..09b5c9b4 100644 --- a/dask_cuda/tests/test_proxify_host_file.py +++ b/dask_cuda/tests/test_proxify_host_file.py @@ -45,7 +45,7 @@ def root_dir(tmp_path_factory): assert ProxifyHostFile._spill_to_disk is not None # In order to use the same tmp dir, we use `root_dir` for all - # ProxifyHostFile creations Notice, we use `..` to remove the + # ProxifyHostFile creations. Notice, we use `..` to remove the # `jit-unspill-disk-storage` part added by the # ProxifyHostFile implicitly. 
return str(ProxifyHostFile._spill_to_disk.root_dir / "..") diff --git a/setup.py b/setup.py index fa90437b..3b72644b 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,7 @@ import os -from setuptools import setup - import versioneer +from setuptools import setup if "GIT_DESCRIBE_TAG" in os.environ: # Disgusting hack. For pypi uploads we cannot use the From 4d725e3ec03d08e00fa1257a279dafd5688dac6a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 23 Nov 2022 14:19:12 +0100 Subject: [PATCH 21/25] Re-implement shuffle using staging (#1030) Introduce staging in explicit-comms. The idea is to "stage" the keys of the input on the workers so that a later explicit-comms task can access **and** free the data associated with the keys. Notice, explicit-comms and this new staging approach is still experimental. If or when it gets to a state where it provides a significant performance improvements over a range of workflows, the plan is to tighten up the API. Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/dask-cuda/pull/1030 --- dask_cuda/explicit_comms/comms.py | 73 ++- dask_cuda/explicit_comms/dataframe/shuffle.py | 468 +++++++++--------- 2 files changed, 311 insertions(+), 230 deletions(-) diff --git a/dask_cuda/explicit_comms/comms.py b/dask_cuda/explicit_comms/comms.py index f7726781..0ebd7f0c 100644 --- a/dask_cuda/explicit_comms/comms.py +++ b/dask_cuda/explicit_comms/comms.py @@ -3,10 +3,11 @@ import contextlib import time import uuid -from typing import List, Optional +from typing import Any, Dict, Hashable, Iterable, List, Optional import distributed.comm -from distributed import Client, default_client, get_worker +from dask.utils import stringify +from distributed import Client, Worker, default_client, get_worker from distributed.comm.addressing import parse_address, parse_host_port, unparse_address _default_comms = None @@ -73,7 +74,7 @@ def worker_state(sessionId: Optional[int] = None) -> dict: state: dict Either a single state dict or a dict of state dict """ - worker = get_worker() + worker: Any = get_worker() if not hasattr(worker, "_explicit_comm_state"): worker._explicit_comm_state = {} if sessionId is not None: @@ -147,6 +148,20 @@ async def _stop_ucp_listeners(session_state): del session_state["lf"] +async def _stage_keys(session_state: dict, name: str, keys: set): + worker: Worker = session_state["worker"] + data = worker.data + my_keys = keys.intersection(data) + + stages = session_state.get("stages", {}) + stage = stages.get(name, {}) + for k in my_keys: + stage[k] = data[k] + stages[name] = stage + session_state["stages"] = stages + return (session_state["rank"], my_keys) + + class CommsContext: """Communication handler for explicit communication @@ -260,3 +275,55 @@ def run(self, coroutine, *args, workers=None, lock_workers=False): ) ) return self.client.gather(ret) + + def stage_keys(self, name: str, keys: Iterable[Hashable]) -> Dict[int, set]: + """Staging keys on workers under the given name + + In an explicit-comms task, use `pop_staging_area(..., name)` to access + the staged keys and the associated data. + + Notes + ----- + In the context of explicit-comms, staging is the act of duplicating the + responsibility of Dask keys. When staging a key, the worker owning the + key (as assigned by the Dask scheduler) save a reference to the key and + the associated data to its local staging area. 
From this point on, if + the scheduler cancels the key, the worker (and the task running on the + worker) now has exclusive access to the key and the associated data. + This way, staging makes it possible for long running explicit-comms tasks + to free input data ASAP. + + Parameters + ---------- + name: str + Name for the staging area + keys: iterable + The keys to stage + + Returns + ------- + dict + dict that maps each worker-rank to the workers set of staged keys + """ + key_set = {stringify(k) for k in keys} + return dict(self.run(_stage_keys, name, key_set)) + + +def pop_staging_area(session_state: dict, name: str) -> Dict[str, Any]: + """Pop the staging area called `name` + + This function must be called within a running explicit-comms task. + + Parameters + ---------- + session_state: dict + Worker session state + name: str + Name for the staging area + + Returns + ------- + dict + The staging area, which is a dict that maps keys to their data. + """ + return session_state["stages"].pop(name) diff --git a/dask_cuda/explicit_comms/dataframe/shuffle.py b/dask_cuda/explicit_comms/dataframe/shuffle.py index 294a8efd..6099025d 100644 --- a/dask_cuda/explicit_comms/dataframe/shuffle.py +++ b/dask_cuda/explicit_comms/dataframe/shuffle.py @@ -5,120 +5,186 @@ import inspect from collections import defaultdict from operator import getitem -from typing import Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Set, TypeVar -from toolz import first +import numpy import dask import dask.dataframe -from dask.base import compute_as_if_collection, tokenize -from dask.dataframe.core import DataFrame, _concat as dd_concat, new_dd_object -from dask.dataframe.shuffle import shuffle_group -from dask.dataframe.utils import make_meta -from dask.delayed import delayed +from dask.base import tokenize +from dask.dataframe.core import DataFrame, Series, _concat as dd_concat, new_dd_object +from dask.dataframe.shuffle import group_split_dispatch, hash_object_dispatch from distributed import wait from distributed.protocol import nested_deserialize, to_serialize +from distributed.worker import Worker -from ...proxify_host_file import ProxyManager from .. 
import comms +T = TypeVar("T") -async def send(eps, rank_to_out_parts_list: Dict[int, List[List[DataFrame]]]): - """Notice, items sent are removed from `rank_to_out_parts_list`""" + +async def send( + eps, + myrank, + rank_to_out_part_ids: Dict[int, Set[int]], + out_part_id_to_dataframe: Dict[int, DataFrame], +) -> None: + """Notice, items sent are removed from `out_part_id_to_dataframe`""" futures = [] - for rank, ep in eps.items(): - out_parts_list = rank_to_out_parts_list.pop(rank, None) - if out_parts_list is not None: - futures.append(ep.write([to_serialize(f) for f in out_parts_list])) + for rank, out_part_ids in rank_to_out_part_ids.items(): + if rank != myrank: + msg = { + i: to_serialize(out_part_id_to_dataframe.pop(i)) + for i in (out_part_ids & out_part_id_to_dataframe.keys()) + } + futures.append(eps[rank].write(msg)) await asyncio.gather(*futures) async def recv( - eps, in_nparts: Dict[int, int], out_parts_list: List[List[List[DataFrame]]] -): + eps, + myrank, + rank_to_out_part_ids: Dict[int, Set[int]], + out_part_id_to_dataframe_list: Dict[int, List[DataFrame]], + proxify, +) -> None: """Notice, received items are appended to `out_parts_list`""" - futures = [] - for rank, ep in eps.items(): - if rank in in_nparts: - futures.append(ep.read()) - # Notice, since Dask may convert lists to tuples, we convert them back into lists - out_parts_list.extend( - [[y for y in x] for x in nested_deserialize(await asyncio.gather(*futures))] + async def read_msg(rank: int) -> None: + msg: Dict[int, DataFrame] = nested_deserialize(await eps[rank].read()) + for out_part_id, df in msg.items(): + out_part_id_to_dataframe_list[out_part_id].append(proxify(df)) + + await asyncio.gather( + *(read_msg(rank) for rank in rank_to_out_part_ids if rank != myrank) ) -def sort_in_parts( - in_parts: List[Dict[int, DataFrame]], - rank_to_out_part_ids: Dict[int, List[int]], - ignore_index: bool, - concat_dfs_of_same_output_partition: bool, - concat, -) -> Dict[int, List[List[DataFrame]]]: - """Sort the list of grouped dataframes in `in_parts` - - It returns a dict that for each worker-rank specifies the output partitions: - ''' - for each worker: - for each output partition: - list of dataframes that makes of an output partition - ''' - If `concat_dfs_of_same_output_partition` is True, all the dataframes of an - output partition are concatenated. +def get_proxify(worker: Worker) -> Callable[[T], T]: + """Get function to proxify objects""" + from dask_cuda.proxify_host_file import ProxifyHostFile + + if isinstance(worker.data, ProxifyHostFile): + data = worker.data + return lambda x: data.manager.proxify(x)[0] + return lambda x: x # no-op + + +def compute_map_index(df: Any, column_names, npartitions) -> Series: + """Return a Series that maps each row `df` to a partition ID + + The partitions are determined by hashing the columns given by column_names + unless if `column_names[0] == "_partitions"`, in which case the values of + `column_names[0]` are used as index. Parameters ---------- - in_parts: list of dict of dataframes - List of dataframe groups that need to be shuffled. - rank_to_out_part_ids: dict - dict that for each worker rank specifices a list of partition IDs that - worker should return. If the worker shouldn't return any partitions, - it is excluded from the dict. + df: DataFrame + column_names: list of strings + List of column names on which we want to split. + npartitions: int or None + The desired number of output partitions. 
+ + Returns + ------- + out: Dict[int, DataFrame] + A dictionary mapping integers in {0..k} to dataframes such that the + hash values of `df[col]` are well partitioned. + """ + + if column_names[0] == "_partitions": + ind = df[column_names[0]] + else: + ind = hash_object_dispatch( + df[column_names] if column_names else df, index=False + ) + typ = numpy.min_scalar_type(npartitions * 2) + return (ind % npartitions).astype(typ, copy=False) + + +def single_shuffle_group( + df: DataFrame, column_names, npartitions, ignore_index +) -> Dict[int, DataFrame]: + """Split dataframe based on the indexes returned by `compute_map_index`""" + map_index = compute_map_index(df, column_names, npartitions) + return group_split_dispatch(df, map_index, npartitions, ignore_index=ignore_index) + + +def multi_shuffle_group( + df_meta: DataFrame, + dfs: Dict[str, DataFrame], + column_names, + npartitions, + ignore_index, + proxify, +) -> Dict[int, DataFrame]: + """Split multiple dataframes such that each partition hashes to the same + + Since we concatenate dataframes belonging to the same partition, each + partition ID maps to exactly one dataframe. + + Parameters + ---------- + df_meta: DataFrame + An empty dataframe matching the expected output + dfs: dict of dataframes + The dataframes to split given as a map of stage keys to dataframes + column_names: list of strings + List of column names on which we want to split. + npartitions: int or None + The desired number of output partitions. ignore_index: bool - Ignore index during shuffle. If ``True``, performance may improve, + Ignore index during shuffle. If True, performance may improve, but index values will not be preserved. - concat_dfs_of_same_output_partition: bool - Concatenate all dataframes of the same output partition. + proxify: callable + Function to proxify object. Returns ------- - rank_to_out_parts_list: dict of list of list of DataFrames - Dict that maps each worker rank to its output partitions. + dict of DataFrames + Mapping from partition ID to dataframe. """ - out_part_id_to_dataframes = defaultdict(list) # part_id -> list of dataframes - for bins in in_parts: - for k, v in bins.items(): - out_part_id_to_dataframes[k].append(v) - del bins - - # Create mapping: rank -> list of [list of dataframes] - rank_to_out_parts_list: Dict[int, List[List[DataFrame]]] = {} - for rank, part_ids in rank_to_out_part_ids.items(): - rank_to_out_parts_list[rank] = [out_part_id_to_dataframes[i] for i in part_ids] - del out_part_id_to_dataframes - - # Concatenate all dataframes of the same output partition. - if concat_dfs_of_same_output_partition: - for rank in rank_to_out_part_ids.keys(): - for i in range(len(rank_to_out_parts_list[rank])): - if len(rank_to_out_parts_list[rank][i]) > 1: - rank_to_out_parts_list[rank][i] = [ - concat( - rank_to_out_parts_list[rank][i], ignore_index=ignore_index - ) - ] - return rank_to_out_parts_list - - -async def local_shuffle( + # Grouping each input dataframe, one part for each partition ID. + dfs_grouped: List[Dict[int, DataFrame]] = [] + while dfs: + dfs_grouped.append( + proxify( + single_shuffle_group( + # pop dataframe in any order, to free staged memory ASAP + dfs.popitem()[1], + column_names, + npartitions, + ignore_index, + ) + ) + ) + + # Maps each output partition ID to a dataframe. If the partition is empty, + # an empty dataframe is used. 
+ ret: Dict[int, DataFrame] = {} + for i in range(npartitions): # Iterate over all possible output partition IDs + t = [df_grouped[i] for df_grouped in dfs_grouped] + if len(t) == 1: + ret[i] = t[0] + elif len(t) > 1: + ret[i] = proxify(dd_concat(t, ignore_index=ignore_index)) + else: + ret[i] = df_meta # Empty dataframe + return ret + + +async def shuffle_task( s, - in_nparts: Dict[int, int], - in_parts: List[Dict[int, DataFrame]], - rank_to_out_part_ids: Dict[int, List[int]], - ignore_index: bool, + stage_name, + df_meta, + rank_to_inkeys: Dict[int, set], + rank_to_out_part_ids: Dict[int, Set[int]], + column_names, + npartitions, + ignore_index, ) -> List[DataFrame]: - """Local shuffle operation of the already grouped/partitioned dataframes + """Explicit-comms shuffle task This function is running on each worker participating in the shuffle. @@ -126,18 +192,21 @@ async def local_shuffle( ---------- s: dict Worker session state - in_nparts: dict - dict that for each worker rank specifices the - number of partitions that worker has of the input dataframe. - If the worker doesn't have any partitions, it is excluded from the dict. - in_parts: list of dict of dataframes - List of dataframe groups that need to be shuffled. + stage_name: str + Name of the stage to retrieve the input keys from. + rank_to_inkeys: dict + dict that for each worker rank specifices the set of staged input keys. rank_to_out_part_ids: dict - dict that for each worker rank specifices a list of partition IDs that - worker should return. If the worker shouldn't return any partitions, - it is excluded from the dict. + dict that for each worker rank specifices a set of output partition IDs. + If the worker shouldn't return any partitions, it is excluded from the + dict. Partition IDs are global integers `0..npartitions` and corresponds + to the dict keys returned by `group_split_dispatch`. + column_names: list of strings + List of column names on which we want to split. + npartitions: int or None + The desired number of output partitions. ignore_index: bool - Ignore index during shuffle. If ``True``, performance may improve, + Ignore index during shuffle. If True, performance may improve, but index values will not be preserved. Returns @@ -145,67 +214,49 @@ async def local_shuffle( partitions: list of DataFrames List of dataframe-partitions """ + + proxify = get_proxify(s["worker"]) myrank = s["rank"] eps = s["eps"] - - try: - manager = first(iter(in_parts[0].values()))._pxy_get().manager - except AttributeError: - manager = None - - if isinstance(manager, ProxyManager): - - def concat(args, ignore_index=False): - if len(args) < 2: - return args[0] - - return manager.proxify(dd_concat(args, ignore_index=ignore_index))[0] - - else: - concat = dd_concat - - rank_to_out_parts_list = sort_in_parts( - in_parts, - rank_to_out_part_ids, - ignore_index, - concat_dfs_of_same_output_partition=True, - concat=concat, + stage = comms.pop_staging_area(s, stage_name) + assert stage.keys() == rank_to_inkeys[myrank] + + out_part_id_to_dataframe = multi_shuffle_group( + df_meta=df_meta, + dfs=stage, + column_names=column_names, + npartitions=npartitions, + ignore_index=ignore_index, + proxify=proxify, ) # Communicate all the dataframe-partitions all-to-all. The result is - # `out_parts_list` that for each worker and for each output partition - # contains a list of dataframes received. 
- out_parts_list: List[List[List[DataFrame]]] = [] - futures = [] - if myrank in rank_to_out_parts_list: - futures.append(recv(eps, in_nparts, out_parts_list)) - if myrank in in_nparts: - futures.append(send(eps, rank_to_out_parts_list)) - await asyncio.gather(*futures) + # `out_part_id_to_dataframe_list` that for each output partition maps + # a list of dataframes received. + out_part_id_to_dataframe_list: Dict[int, List[DataFrame]] = defaultdict(list) + await asyncio.gather( + recv(eps, myrank, rank_to_out_part_ids, out_part_id_to_dataframe_list, proxify), + send(eps, myrank, rank_to_out_part_ids, out_part_id_to_dataframe), + ) # At this point `send()` should have pop'ed all output partitions # beside the partitions owned be `myrank`. - assert len(rank_to_out_parts_list) == 1 - - # Concatenate the received dataframes into the final output partitions - ret = [] - for i in range(len(rank_to_out_part_ids[myrank])): - dfs = [] - for out_parts in out_parts_list: - dfs.extend(out_parts[i]) - out_parts[i] = None # type: ignore - dfs.extend(rank_to_out_parts_list[myrank][i]) - rank_to_out_parts_list[myrank][i] = None # type: ignore - if len(dfs) > 1: - ret.append(concat(dfs, ignore_index=ignore_index)) - else: - ret.append(dfs[0]) - return ret + assert rank_to_out_part_ids[myrank] == out_part_id_to_dataframe.keys() + # We can now add them to the output dataframes. + for out_part_id, dataframe in out_part_id_to_dataframe.items(): + out_part_id_to_dataframe_list[out_part_id].append(dataframe) + del out_part_id_to_dataframe + + # Finally, we concatenate the output dataframes into the final output partitions + return [ + proxify(dd_concat(dfs, ignore_index=ignore_index)) + for dfs in out_part_id_to_dataframe_list.values() + ] def shuffle( df: DataFrame, - column_names: str | List[str], + column_names: List[str], npartitions: Optional[int] = None, ignore_index: bool = False, ) -> DataFrame: @@ -230,7 +281,7 @@ def shuffle( The desired number of output partitions. If None, the number of output partitions equals `df.npartitions` ignore_index: bool - Ignore index during shuffle. If True, performance may improve, + Ignore index during shuffle. If True, performance may improve, but index values will not be preserved. Returns @@ -241,118 +292,81 @@ def shuffle( Developer Notes --------------- The implementation consist of three steps: - (a) Extend the dask graph of `df` with a call to `shuffle_group()` for each - dataframe partition and submit the graph. + (a) Stage the partitions of `df` on all workers and then cancel them + thus at this point the Dask Scheduler doesn't know about any of the + the partitions. (b) Submit a task on each worker that shuffle (all-to-all communicate) - the groups from (a) and return a list of dataframe-partitions. + the staged partitions and return a list of dataframe-partitions. (c) Submit a dask graph that extract (using `getitem()`) individual dataframe-partitions from (b). 
""" c = comms.default_comms() - # As default we preserve number of partitions + # The ranks of the output workers + ranks = list(range(len(c.worker_addresses))) + + # By default, we preserve number of partitions if npartitions is None: npartitions = df.npartitions - # Step (a): partition/group each dataframe-partition + # Step (a): + df = df.persist() # Make sure optimizations are apply on the existing graph + wait(df) # Make sure all keys has been materialized on workers name = ( - "explicit-comms-shuffle-group-" + "explicit-comms-shuffle-" f"{tokenize(df, column_names, npartitions, ignore_index)}" ) - df = df.persist() # Making sure optimizations are apply on the existing graph - dsk = dict(df.__dask_graph__()) - output_keys = [] - for input_key in df.__dask_keys__(): - output_key = (name, input_key[1]) - dsk[output_key] = ( - shuffle_group, - input_key, + df_meta: DataFrame = df._meta + + # Stage all keys of `df` on the workers and cancel them, which makes it possible + # for the shuffle to free memory as the partitions of `df` are consumed. + # See CommsContext.stage_keys() for a description of staging. + rank_to_inkeys = c.stage_keys(name=name, keys=df.__dask_keys__()) + c.client.cancel(df) + + # Find the output partition IDs for each worker + div = npartitions // len(ranks) + rank_to_out_part_ids: Dict[int, Set[int]] = {} # rank -> set of partition id + for i, rank in enumerate(ranks): + rank_to_out_part_ids[rank] = set(range(div * i, div * (i + 1))) + for rank, i in zip(ranks, range(div * len(ranks), npartitions)): + rank_to_out_part_ids[rank].add(i) + + # Run `_shuffle()` on each worker + shuffle_result = {} + for rank in ranks: + shuffle_result[rank] = c.submit( + c.worker_addresses[rank], + shuffle_task, + name, + df_meta, + rank_to_inkeys, + rank_to_out_part_ids, column_names, - 0, - npartitions, npartitions, ignore_index, - npartitions, ) - output_keys.append(output_key) - - # Compute `df_groups`, which is a list of futures, one future per partition in `df`. - # Each future points to a dict of length `df.npartitions` that maps each - # partition-id to a DataFrame. - df_groups = compute_as_if_collection(type(df), dsk, output_keys, sync=False) - wait(df_groups) - for f in df_groups: # Check for errors - if f.status == "error": - f.result() # raise exception - - # Step (b): find out which workers has what part of `df_groups`, - # find the number of output each worker should have, - # and submit `local_shuffle()` on each worker. 
- key_to_part = {str(part.key): part for part in df_groups} - in_parts = defaultdict(list) # Map worker -> [list of futures] - for key, workers in c.client.who_has(df_groups).items(): - # Note, if multiple workers have the part, we pick the first worker - in_parts[first(workers)].append(key_to_part[key]) - - # Let's create a dict that specifices the number of partitions each worker has - in_nparts = {} - workers = set() # All ranks that have a partition of `df` - for rank, worker in enumerate(c.worker_addresses): - nparts = len(in_parts.get(worker, ())) - if nparts > 0: - in_nparts[rank] = nparts - workers.add(rank) - workers_sorted = sorted(workers) - - # Find the output partitions for each worker - div = npartitions // len(workers) - rank_to_out_part_ids = {} # rank -> [list of partition id] - for i, rank in enumerate(workers_sorted): - rank_to_out_part_ids[rank] = list(range(div * i, div * (i + 1))) - for rank, i in zip(workers_sorted, range(div * len(workers), npartitions)): - rank_to_out_part_ids[rank].append(i) - - # Run `local_shuffle()` on each worker - result_futures = {} - for rank, worker in enumerate(c.worker_addresses): - if rank in workers: - result_futures[rank] = c.submit( - worker, - local_shuffle, - in_nparts, - in_parts[worker], - rank_to_out_part_ids, - ignore_index, - ) - wait(list(result_futures.values())) - - # Release dataframes from step (a) - for fut in df_groups: - fut.release() + wait(list(shuffle_result.values())) - # Step (c): extract individual dataframe-partitions. We use `submit()` + # Step (d): extract individual dataframe-partitions. We use `submit()` # to control where the tasks are executed. # TODO: can we do this without using `submit()` to avoid the overhead # of creating a Future for each dataframe partition? - name = f"explicit-comms-shuffle-getitem-{tokenize(name)}" - dsk = {} - for rank, worker in enumerate(c.worker_addresses): - if rank in workers: - for i, part_id in enumerate(rank_to_out_part_ids[rank]): - dsk[(name, part_id)] = c.client.submit( - getitem, result_futures[rank], i, workers=[worker] - ) - # Get the meta from the first output partition - meta = delayed(make_meta)(next(iter(dsk.values()))).compute() + dsk = {} + for rank in ranks: + for i, part_id in enumerate(rank_to_out_part_ids[rank]): + dsk[(name, part_id)] = c.client.submit( + getitem, shuffle_result[rank], i, workers=[c.worker_addresses[rank]] + ) # Create a distributed Dataframe from all the pieces divs = [None] * (len(dsk) + 1) - ret = new_dd_object(dsk, name, meta, divs).persist() + ret = new_dd_object(dsk, name, df_meta, divs).persist() wait(ret) # Release all temporary dataframes - for fut in [*result_futures.values(), *dsk.values()]: + for fut in [*shuffle_result.values(), *dsk.values()]: fut.release() return ret From 8effa7ff14246b27042b0c12251f3a76456077d2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 28 Nov 2022 19:20:47 +0530 Subject: [PATCH 22/25] Fix `parse_memory_limit` function call (#1055) This PR addresses a breaking change that was made in upstream distributed: https://github.com/dask/distributed/pull/5669/ Error due to above change: ``` > self.memory_limit = parse_memory_limit( memory_limit=memory_limit, nthreads=1, total_cores=n_workers ) E TypeError: parse_memory_limit() missing 1 required keyword-only argument: 'logger' ``` Introduced a version-based function call because we haven't yet finalized the `dask` pinning for `22.12`, though as of now `2022.11.1` seems to be the most likely release pinning. 
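For illustration only (not part of this patch), a minimal sketch of the version-guarded call pattern applied below in `cuda_worker.py` and `local_cuda_cluster.py`; the helper name `_parse_memory_limit_compat` is hypothetical:

```python
import logging

from distributed.utils import has_arg
from distributed.worker_memory import parse_memory_limit


def _parse_memory_limit_compat(memory_limit, total_cores):
    # Newer `distributed` releases require the keyword-only `logger` argument.
    if has_arg(parse_memory_limit, "logger"):
        return parse_memory_limit(
            memory_limit=memory_limit,
            nthreads=1,  # parsing only depends on the number of workers
            total_cores=total_cores,
            logger=logging.getLogger(__name__),
        )
    # Older `distributed` releases do not accept `logger`.
    return parse_memory_limit(
        memory_limit=memory_limit, nthreads=1, total_cores=total_cores
    )
```
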
Since the dask dev `2022.11.1` packages now contain this breaking change, this check will also help unblock any `22.12` CI jobs that are fetching `2022.11.1` dask dev packages. Once `dask` is pinned to a specific version, this version-based check can be removed. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1055 --- dask_cuda/cuda_worker.py | 15 ++++++++++++--- dask_cuda/local_cuda_cluster.py | 18 +++++++++++++++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/dask_cuda/cuda_worker.py b/dask_cuda/cuda_worker.py index b5c4285d..5e14aba8 100644 --- a/dask_cuda/cuda_worker.py +++ b/dask_cuda/cuda_worker.py @@ -2,6 +2,7 @@ import asyncio import atexit +import logging import os import warnings @@ -15,6 +16,7 @@ enable_proctitle_on_children, enable_proctitle_on_current, ) +from distributed.utils import has_arg from distributed.worker_memory import parse_memory_limit from .device_host_file import DeviceHostFile @@ -84,9 +86,16 @@ def __init__( raise ValueError("nthreads must be higher than 0.") # Set nthreads=1 when parsing mem_limit since it only depends on nprocs - memory_limit = parse_memory_limit( - memory_limit=memory_limit, nthreads=1, total_cores=nprocs - ) + if has_arg(parse_memory_limit, "logger"): + # TODO: Remove has_arg check after 2022.11.1 support is dropped + logger = logging.getLogger(__name__) + memory_limit = parse_memory_limit( + memory_limit=memory_limit, nthreads=1, total_cores=nprocs, logger=logger + ) + else: + memory_limit = parse_memory_limit( + memory_limit=memory_limit, nthreads=1, total_cores=nprocs + ) if pid_file: with open(pid_file, "w") as f: diff --git a/dask_cuda/local_cuda_cluster.py b/dask_cuda/local_cuda_cluster.py index 014d0b4e..ff93532d 100644 --- a/dask_cuda/local_cuda_cluster.py +++ b/dask_cuda/local_cuda_cluster.py @@ -1,9 +1,11 @@ import copy +import logging import os import warnings import dask from distributed import LocalCluster, Nanny, Worker +from distributed.utils import has_arg from distributed.worker_memory import parse_memory_limit from .device_host_file import DeviceHostFile @@ -231,9 +233,19 @@ def __init__( if n_workers < 1: raise ValueError("Number of workers cannot be less than 1.") # Set nthreads=1 when parsing mem_limit since it only depends on n_workers - self.memory_limit = parse_memory_limit( - memory_limit=memory_limit, nthreads=1, total_cores=n_workers - ) + if has_arg(parse_memory_limit, "logger"): + # TODO: Remove has_arg check after 2022.11.1 support is dropped + logger = logging.getLogger(__name__) + self.memory_limit = parse_memory_limit( + memory_limit=memory_limit, + nthreads=1, + total_cores=n_workers, + logger=logger, + ) + else: + self.memory_limit = parse_memory_limit( + memory_limit=memory_limit, nthreads=1, total_cores=n_workers + ) self.device_memory_limit = parse_device_memory_limit( device_memory_limit, device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES) ) From 55375b8b2da020f851a2123de68d1817c4c8a30a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 29 Nov 2022 17:47:04 +0100 Subject: [PATCH 23/25] Ensure linting checks for whole repo in CI (#1053) It often happens that `pre-commit` finds improperly formatted files that are not in the `dask_cuda` directory, therefore it is sensible to check the entire repo. Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Lawrence Mitchell (https://github.com/wence-) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/dask-cuda/pull/1053 --- .pre-commit-config.yaml | 4 ++-- ci/checks/style.sh | 49 ++--------------------------------------- 2 files changed, 4 insertions(+), 49 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a2b60871..bd219066 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,13 +1,13 @@ repos: - repo: https://github.com/pycqa/isort - rev: 5.6.4 + rev: 5.10.1 hooks: - id: isort - repo: https://github.com/ambv/black rev: 22.3.0 hooks: - id: black - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/PyCQA/flake8 rev: 3.8.3 hooks: - id: flake8 diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 48343273..5d01f97d 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -12,50 +12,5 @@ PATH=/opt/conda/bin:$PATH . /opt/conda/etc/profile.d/conda.sh conda activate rapids -# Print versions -echo -e "\nVersions:" -black --version -echo "isort, `isort --vn`" -echo "flake8, `flake8 --version`" - -# Run isort and get results/return code -ISORT=`isort --check-only dask_cuda 2>&1` -ISORT_RETVAL=$? - -# Run black and get results/return code -BLACK=`black --check dask_cuda 2>&1` -BLACK_RETVAL=$? - -# Run flake8 and get results/return code -FLAKE=`flake8 dask_cuda 2>&1` -FLAKE_RETVAL=$? - -# Output results if failure otherwise show pass -if [ "$ISORT_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: isort style check; begin output\n\n" - echo -e "$ISORT" - echo -e "\n\n>>>> FAILED: isort style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: isort style check\n\n" -fi - -if [ "$BLACK_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: black style check; begin output\n\n" - echo -e "$BLACK" - echo -e "\n\n>>>> FAILED: black style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: black style check\n\n" -fi - -if [ "$FLAKE_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n" - echo -e "$FLAKE" - echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: flake8 style check\n\n" -fi - -RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL) -IFS=$'\n' -RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` -exit $RETVAL +# Run pre-commit checks +pre-commit run --hook-stage manual --all-files From ee40483c3fd2f69c2993126f5a28b4e791aa03f8 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 1 Dec 2022 19:48:56 +0530 Subject: [PATCH 24/25] Pin `dask` and `distributed` for release (#1046) This PR pins `dask` and `distributed` to `2022.11.1` for `22.12` release. xref: https://github.com/rapidsai/cudf/pull/12165 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/dask-cuda/pull/1046 --- ci/cpu/build.sh | 4 ++-- ci/gpu/build.sh | 4 ++-- pyproject.toml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index c029956a..5ed0a322 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -21,10 +21,10 @@ export GPUCI_CONDA_RETRY_SLEEP=30 # Whether to keep `dask/label/dev` channel in the env. If INSTALL_DASK_MAIN=0, # `dask/label/dev` channel is removed. 
-export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.9.2" +export DASK_STABLE_VERSION="2022.11.1" # Switch to project root; also root of repo checkout cd "$WORKSPACE" diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 86e4a899..1e2479b7 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,10 +35,10 @@ export NUMPY_EXPERIMENTAL_ARRAY_FUNCTION=1 # Install dask and distributed from main branch. Usually needed during # development time and disabled before a new dask-cuda release. -export INSTALL_DASK_MAIN=1 +export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.9.2" +export DASK_STABLE_VERSION="2022.11.1" # Temporary workaround for Jupyter errors. # See https://github.com/rapidsai/dask-cuda/issues/1040 diff --git a/pyproject.toml b/pyproject.toml index 6ed22d82..4eec772d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,8 @@ authors = [ license= { text = "Apache-2.0" } requires-python = ">=3.8" dependencies = [ - "dask >=2022.9.2", - "distributed >=2022.9.2", + "dask ==2022.11.1", + "distributed ==2022.11.1", "pynvml >=11.0.0", "numpy >=1.18.0", "numba >=0.54", From 8c872886ae74433f8e78a545ffcf934d813578cb Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 8 Dec 2022 09:46:37 -0500 Subject: [PATCH 25/25] update changelog --- CHANGELOG.md | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44cbac4c..680c0d9d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,43 @@ -# dask-cuda 22.12.00 (Date TBD) +# dask-cuda 22.12.00 (8 Dec 2022) -Please see https://github.com/rapidsai/dask-cuda/releases/tag/v22.12.00a for the latest changes to this development branch. 
+## 🚨 Breaking Changes + +- Make local_directory a required argument for spilling impls ([#1023](https://github.com/rapidsai/dask-cuda/pull/1023)) [@wence-](https://github.com/wence-) + +## 🐛 Bug Fixes + +- Fix `parse_memory_limit` function call ([#1055](https://github.com/rapidsai/dask-cuda/pull/1055)) [@galipremsagar](https://github.com/galipremsagar) +- Work around Jupyter errors in CI ([#1041](https://github.com/rapidsai/dask-cuda/pull/1041)) [@pentschev](https://github.com/pentschev) +- Fix version constraint ([#1036](https://github.com/rapidsai/dask-cuda/pull/1036)) [@wence-](https://github.com/wence-) +- Support the new `Buffer` in cudf ([#1033](https://github.com/rapidsai/dask-cuda/pull/1033)) [@madsbk](https://github.com/madsbk) +- Install Dask nightly last in CI ([#1029](https://github.com/rapidsai/dask-cuda/pull/1029)) [@pentschev](https://github.com/pentschev) +- Fix recorded time in merge benchmark ([#1028](https://github.com/rapidsai/dask-cuda/pull/1028)) [@wence-](https://github.com/wence-) +- Switch pre-import not found test to sync definition ([#1026](https://github.com/rapidsai/dask-cuda/pull/1026)) [@pentschev](https://github.com/pentschev) +- Make local_directory a required argument for spilling impls ([#1023](https://github.com/rapidsai/dask-cuda/pull/1023)) [@wence-](https://github.com/wence-) +- Fixes for handling MIG devices ([#950](https://github.com/rapidsai/dask-cuda/pull/950)) [@pentschev](https://github.com/pentschev) + +## 📖 Documentation + +- Merge 22.10 into 22.12 ([#1016](https://github.com/rapidsai/dask-cuda/pull/1016)) [@pentschev](https://github.com/pentschev) +- Merge 22.08 into 22.10 ([#1010](https://github.com/rapidsai/dask-cuda/pull/1010)) [@pentschev](https://github.com/pentschev) + +## 🚀 New Features + +- Allow specifying fractions as RMM pool initial/maximum size ([#1021](https://github.com/rapidsai/dask-cuda/pull/1021)) [@pentschev](https://github.com/pentschev) +- Add feature to get cluster configuration ([#1006](https://github.com/rapidsai/dask-cuda/pull/1006)) [@quasiben](https://github.com/quasiben) +- Add benchmark option to use dask-noop ([#994](https://github.com/rapidsai/dask-cuda/pull/994)) [@wence-](https://github.com/wence-) + +## 🛠️ Improvements + +- Ensure linting checks for whole repo in CI ([#1053](https://github.com/rapidsai/dask-cuda/pull/1053)) [@pentschev](https://github.com/pentschev) +- Pin `dask` and `distributed` for release ([#1046](https://github.com/rapidsai/dask-cuda/pull/1046)) [@galipremsagar](https://github.com/galipremsagar) +- Remove `pytest-asyncio` dependency ([#1045](https://github.com/rapidsai/dask-cuda/pull/1045)) [@pentschev](https://github.com/pentschev) +- Migrate as much as possible to `pyproject.toml` ([#1035](https://github.com/rapidsai/dask-cuda/pull/1035)) [@jakirkham](https://github.com/jakirkham) +- Re-implement shuffle using staging ([#1030](https://github.com/rapidsai/dask-cuda/pull/1030)) [@madsbk](https://github.com/madsbk) +- Explicit-comms-shuffle: fine control of task scheduling ([#1025](https://github.com/rapidsai/dask-cuda/pull/1025)) [@madsbk](https://github.com/madsbk) +- Remove stale labeler ([#1024](https://github.com/rapidsai/dask-cuda/pull/1024)) [@raydouglass](https://github.com/raydouglass) +- Unpin `dask` and `distributed` for development ([#1005](https://github.com/rapidsai/dask-cuda/pull/1005)) [@galipremsagar](https://github.com/galipremsagar) +- Support cuDF's built-in spilling ([#984](https://github.com/rapidsai/dask-cuda/pull/984)) [@madsbk](https://github.com/madsbk) # 
dask-cuda 22.10.00 (12 Oct 2022)