Skip to content

Commit

Permalink
Version 0.5.1
Browse files Browse the repository at this point in the history
  • Loading branch information
kno10 committed Mar 14, 2024
1 parent f700b07 commit 9d6e5c6
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 60 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

For changes to the main Rust package, please see <https://github.com/kno10/rust-kmedoids/blob/main/CHANGELOG.md>

## kmedoids 0.5.1 (2024-03-14)

- DynMSC: fix best loss being reported incorrectly if best k=2
- add minimum k parameter
- bump rayon version (no changes)
- use the pointer-sized np.uintp type for medoids, which should match
Rust usize (also on the wasm32 target).

## kmedoids 0.5.0 (2023-12-10)

- add DynMSC, Silhouette clustering with optimal number of clusters
Expand Down
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors:
title: "Fast k-medoids Clustering in Rust and Python"
journal: "J. Open Source Softw."
doi: 10.21105/joss.04183
version: 0.5.0
version: 0.5.1
date-released: 2023-12-10
license: GPL-3.0
preferred-citation:
Expand Down
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
edition = "2021"
name = "kmedoids"
version = "0.5.0"
version = "0.5.1"
authors = ["Erich Schubert <[email protected]>", "Lars Lenssen <[email protected]>"]
description = "k-Medoids clustering with the FasterPAM algorithm"
homepage = "https://github.com/kno10/python-kmedoids"
Expand All @@ -14,11 +14,11 @@ name = "kmedoids"
crate-type = ["cdylib"]

[dependencies]
rustkmedoids = { version = "0.5.0", package = "kmedoids", git = "https://github.com/kno10/rust-kmedoids" }
rustkmedoids = { version = "0.5.1", package = "kmedoids", git = "https://github.com/kno10/rust-kmedoids" }
numpy = "0.20"
ndarray = "0.15"
rand = "0.8"
rayon = "1.8"
rayon = "1.9"

[dependencies.pyo3]
version = "0.20"
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,14 +153,15 @@ from sklearn.metrics.pairwise import euclidean_distances
X, _ = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
X = X[:10000]
diss = euclidean_distances(X)
kmin = 10
kmax = 20
kmin, kmax = 10, 20
dm = kmedoids.dynmsc(diss, kmax, kmin)
print("Optimal number of clusters according to the Medoid Silhouette:", dm.bestk)
print("Medoid Silhouette over range of k:", dm.losses)
print("Range of k:", dm.rangek)
```

[Full Colab notebook example](https://colab.research.google.com/drive/14vop12NwZ5Si5EuzXHIksKnxZxabecWW).

### Memory Requirements

Because the algorithms require a distance matrix as input, you need O(N²) memory to use these implementations. With single precision, this matrix needs 4·N² bytes, so a typical laptop with 8 GB of RAM could handle data sets of over 40,000 instances, but if your computation of the distance matrix involves copying the matrix, only 30,000 or fewer may be feasible.
Expand Down
105 changes: 53 additions & 52 deletions kmedoids/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def _check_medoids(diss, medoids, init, random_state):
if isinstance(medoids, np.ndarray):
if random_state is not None:
warnings.warn("Seed will be ignored if initial medoids are given")
return medoids
return medoids.astype(np.uintp)
if isinstance(medoids, int):
if init.lower() == "build":
return pam_build(diss, medoids).medoids
Expand All @@ -177,8 +177,8 @@ def _check_medoids(diss, medoids, init, random_state):
elif isinstance(random_state, numbers.Integral):
random_state = np.random.RandomState(random_state)
if not isinstance(random_state, np.random.RandomState):
raise ValueError("Pass a numpy random generator, state or integer seed")
return random_state.choice(diss.shape[0], medoids, False)
raise ValueError("Pass a numpy random generator, RandomState or integer seed")
return random_state.choice(diss.shape[0], medoids, False).astype(np.uintp)
raise ValueError("Specify the number of medoids, or give a numpy array of initial medoids")

def fasterpam(diss, medoids, max_iter=100, init="random", random_state=None, n_cpu=-1):
Expand Down Expand Up @@ -245,22 +245,22 @@ def fasterpam(diss, medoids, max_iter=100, init="random", random_state=None, n_c
else:
raise ValueError("Pass a numpy random generator, state or integer seed")
if dtype == np.float32:
return KMedoidsResult(*_par_fasterpam_f32(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
return KMedoidsResult(*_par_fasterpam_f32(diss, medoids, max_iter, seed, n_cpu))
elif dtype == np.float64:
return KMedoidsResult(*_par_fasterpam_f64(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
return KMedoidsResult(*_par_fasterpam_f64(diss, medoids, max_iter, seed, n_cpu))
elif dtype == np.int32:
return KMedoidsResult(*_par_fasterpam_i32(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
return KMedoidsResult(*_par_fasterpam_i32(diss, medoids, max_iter, seed, n_cpu))
elif dtype == np.int64:
return KMedoidsResult(*_par_fasterpam_i64(diss, medoids.astype(np.uint64), max_iter, seed, n_cpu))
return KMedoidsResult(*_par_fasterpam_i64(diss, medoids, max_iter, seed, n_cpu))
elif random_state is None:
if dtype == np.float32:
return KMedoidsResult(*_fasterpam_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fasterpam_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_fasterpam_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fasterpam_f64(diss, medoids, max_iter))
elif dtype == np.int32:
return KMedoidsResult(*_fasterpam_i32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fasterpam_i32(diss, medoids, max_iter))
elif dtype == np.int64:
return KMedoidsResult(*_fasterpam_i64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fasterpam_i64(diss, medoids, max_iter))
else:
seed = None
if random_state is np.random:
Expand All @@ -272,13 +272,13 @@ def fasterpam(diss, medoids, max_iter=100, init="random", random_state=None, n_c
else:
raise ValueError("Pass a numpy random generator, state or integer seed")
if dtype == np.float32:
return KMedoidsResult(*_rand_fasterpam_f32(diss, medoids.astype(np.uint64), max_iter, seed))
return KMedoidsResult(*_rand_fasterpam_f32(diss, medoids, max_iter, seed))
elif dtype == np.float64:
return KMedoidsResult(*_rand_fasterpam_f64(diss, medoids.astype(np.uint64), max_iter, seed))
return KMedoidsResult(*_rand_fasterpam_f64(diss, medoids, max_iter, seed))
elif dtype == np.int32:
return KMedoidsResult(*_rand_fasterpam_i32(diss, medoids.astype(np.uint64), max_iter, seed))
return KMedoidsResult(*_rand_fasterpam_i32(diss, medoids, max_iter, seed))
elif dtype == np.int64:
return KMedoidsResult(*_rand_fasterpam_i64(diss, medoids.astype(np.uint64), max_iter, seed))
return KMedoidsResult(*_rand_fasterpam_i64(diss, medoids, max_iter, seed))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def fastpam1(diss, medoids, max_iter=100, init="random", random_state=None):
Expand Down Expand Up @@ -327,13 +327,13 @@ def fastpam1(diss, medoids, max_iter=100, init="random", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_fastpam1_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastpam1_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_fastpam1_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastpam1_f64(diss, medoids, max_iter))
elif dtype == np.int32:
return KMedoidsResult(*_fastpam1_i32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastpam1_i32(diss, medoids, max_iter))
elif dtype == np.int64:
return KMedoidsResult(*_fastpam1_i64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastpam1_i64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def pam_build(diss, k):
Expand Down Expand Up @@ -419,13 +419,13 @@ def pam(diss, medoids, max_iter=100, init="build", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_pam_swap_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pam_swap_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_pam_swap_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pam_swap_f64(diss, medoids, max_iter))
elif dtype == np.int32:
return KMedoidsResult(*_pam_swap_i32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pam_swap_i32(diss, medoids, max_iter))
elif dtype == np.int64:
return KMedoidsResult(*_pam_swap_i64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pam_swap_i64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def pammedsil(diss, medoids, max_iter=100, init="build", random_state=None):
Expand Down Expand Up @@ -466,9 +466,9 @@ def pammedsil(diss, medoids, max_iter=100, init="build", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_pammedsil_swap_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pammedsil_swap_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_pammedsil_swap_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pammedsil_swap_f64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def pamsil(diss, medoids, max_iter=100, init="build", random_state=None):
Expand Down Expand Up @@ -508,9 +508,9 @@ def pamsil(diss, medoids, max_iter=100, init="build", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_pamsil_swap_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pamsil_swap_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_pamsil_swap_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_pamsil_swap_f64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def fastmsc(diss, medoids, max_iter=100, init="random", random_state=None):
Expand Down Expand Up @@ -558,9 +558,9 @@ def fastmsc(diss, medoids, max_iter=100, init="random", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_fastmsc_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastmsc_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_fastmsc_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastmsc_f64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None):
Expand Down Expand Up @@ -608,9 +608,9 @@ def fastermsc(diss, medoids, max_iter=100, init="random", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_fastermsc_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastermsc_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_fastermsc_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_fastermsc_f64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state=None):
Expand All @@ -632,12 +632,12 @@ def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state
:type diss: ndarray
:param medoids: maximum number of clusters to find or existing medoids with length of maximum number of clusters to find
:type medoids: int or ndarray
:param minimum_k: minimum number of clusters to find
:type minimum_k: int
:param max_iter: maximum number of iterations
:type max_iter: int
:param init: initialization method
:type init: str, "random", "first" or "build"
:param minimum_k: minimum number of clusters to find
:type minimum_k: int
:param random_state: random seed if no medoids are given
:type random_state: int, RandomState instance or None
Expand All @@ -657,9 +657,9 @@ def dynmsc(diss, medoids, minimum_k=2, max_iter=100, init="random", random_state
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return DynkResult(*_dynmsc_f32(diss, medoids.astype(np.uint64), minimum_k, max_iter))
return DynkResult(*_dynmsc_f32(diss, medoids, minimum_k, max_iter))
elif dtype == np.float64:
return DynkResult(*_dynmsc_f64(diss, medoids.astype(np.uint64), minimum_k, max_iter))
return DynkResult(*_dynmsc_f64(diss, medoids, minimum_k, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def alternating(diss, medoids, max_iter=100, init="random", random_state=None):
Expand Down Expand Up @@ -692,13 +692,13 @@ def alternating(diss, medoids, max_iter=100, init="random", random_state=None):
if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return KMedoidsResult(*_alternating_f32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_alternating_f32(diss, medoids, max_iter))
elif dtype == np.float64:
return KMedoidsResult(*_alternating_f64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_alternating_f64(diss, medoids, max_iter))
elif dtype == np.int32:
return KMedoidsResult(*_alternating_i32(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_alternating_i32(diss, medoids, max_iter))
elif dtype == np.int64:
return KMedoidsResult(*_alternating_i64(diss, medoids.astype(np.uint64), max_iter))
return KMedoidsResult(*_alternating_i64(diss, medoids, max_iter))
raise ValueError("Input data not supported. Use a numpy array of floats.")

def silhouette(diss, labels, samples=False, n_cpu=-1):
Expand Down Expand Up @@ -735,7 +735,7 @@ def silhouette(diss, labels, samples=False, n_cpu=-1):

if not isinstance(diss, np.ndarray):
diss = np.array(diss)
labels = np.unique(labels, return_inverse=True)[1].astype(np.uint64) # ensure labels are 0..k-1
labels = np.unique(labels, return_inverse=True)[1].astype(np.uintp) # ensure labels are 0..k-1

if isinstance(diss, np.ndarray):
dtype = diss.dtype
Expand Down Expand Up @@ -797,16 +797,17 @@ def medoid_silhouette(diss, meds, samples=False):
if not isinstance(diss, np.ndarray):
diss = np.array(diss)
if not isinstance(meds, np.ndarray):
meds = np.array(meds, dtype=np.uint64)
meds = np.array(meds)
meds = meds.astype(np.uintp)

if isinstance(diss, np.ndarray):
dtype = diss.dtype
if dtype == np.float32:
return _medoid_silhouette_f32(diss, meds.astype(np.uint64), samples)
return _medoid_silhouette_f32(diss, meds, samples)
elif dtype == np.float64:
return _medoid_silhouette_f64(diss, meds.astype(np.uint64), samples)
return _medoid_silhouette_f64(diss, meds, samples)
elif dtype == np.int32:
return _medoid_silhouette_i32(diss, meds.astype(np.uint64), samples)
return _medoid_silhouette_i32(diss, meds, samples)
elif dtype == np.int64:
raise ValueError("Input of int64 is currently not supported, as it could overflow the float64 used internally when computing Silhouette. Use diss.astype(numpy.float64) if that is acceptable and you have the necessary memory for this copy.")
raise ValueError("Input data not supported. Use a numpy array of floats.")
Expand Down Expand Up @@ -881,14 +882,14 @@ class KMedoids(SKLearnClusterer):
:param random_state: random seed if no medoids are given
:type random_state: int, RandomState instance or None
:ivar cluster_centers\_: None for 'precomputed'
:type cluster_centers\_: array
:ivar medoid_indices\_: The indices of the medoid rows in X
:type medoid_indices\_: array, shape = (n_clusters,)
:ivar labels\_: Labels of each point
:type labels\_: array, shape = (n_samples,)
:ivar inertia\_: Sum of distances of samples to their closest cluster center
:type inertia\_: float
:ivar cluster_centers_: None for 'precomputed'
:type cluster_centers_: array
:ivar medoid_indices_: The indices of the medoid rows in X
:type medoid_indices_: array, shape = (n_clusters,)
:ivar labels_: Labels of each point
:type labels_: array, shape = (n_samples,)
:ivar inertia_: Sum of distances of samples to their closest cluster center
:type inertia_: float
"""
def __init__(
self,
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "kmedoids"
version = "0.5.0"
version = "0.5.1"
description = "k-Medoids Clustering in Python with FasterPAM"
requires-dist = ["numpy"]
classifier = [
Expand Down
2 changes: 1 addition & 1 deletion tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_fastermsc(self):
def test_dynmsc(self):
dist = np.array([[0, 2, 3, 4, 5], [2, 0, 6, 7, 8], [3, 6, 0, 9, 10], [4, 7, 9, 0, 11], [5, 8, 10, 11, 0]], dtype=np.float32)
dmsc = kmedoids.dynmsc(dist, 3, init='build')
dmsc_rust = kmedoids.kmedoids._dynmsc_f32(dist, dmsc.medoids, 100)
dmsc_rust = kmedoids.kmedoids._dynmsc_f32(dist, dmsc.medoids, 2, 100)
assert dmsc.loss == 0.8761904761904762
assert np.array_equal(dmsc.medoids, dmsc_rust[2])
assert dmsc.loss == dmsc_rust[0]
Expand Down

0 comments on commit 9d6e5c6

Please sign in to comment.