
feat: added export of the exemplars, handle small clustering input by bypassing tSNE which fails, and make dino_vits8 the default
danellecline committed Jun 26, 2024
1 parent 5d083df commit 2306316
Showing 4 changed files with 35 additions and 15 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -110,7 +110,7 @@ Options:

## File organization

The sdcat toolkit generates data in the following folders:
The sdcat toolkit generates data in the following folders. Here, we assume detection and clustering output are written to the same root folder:

```
/data/20230504-MBARI/
@@ -126,7 +126,9 @@ The sdcat toolkit generates data in the following folders:
├── det_filtered # The filtered detections from the model
├── det_filtered_clustered # Clustered detections from the model
├── crops # Crops of the detections
├── dino_vits8... # The model output, i.e. cached embeddings, clustered detections, etc.
├── dino_vits8...date # The clustering results - one folder per run of the clustering algorithm
├── dino_vits8..exemplars.csv # Exemplar embeddings - examples with the highest cosine similarity within a cluster
├── dino_vits8..detections.csv # The detections with the cluster id
├── stats.txt # Statistics of the detections
└── vizresults # Visualizations of the detections (boxes overlaid on images)
├── DSC01833.jpg
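The exemplar CSV shown in the tree above is new in this commit. A minimal sketch of reading it back, assuming the column layout written by `_run_hdbscan_assign` in the cluster.py diff below (`cluster`, `embedding`, `model`, and optionally `image_path`); the file path here is hypothetical:

```python
import ast

import numpy as np
import pandas as pd

# Hypothetical path; the actual file is named '{prefix}_exemplars.csv' inside
# the clustering output folder (see cluster_vits below).
df = pd.read_csv('/data/20230504-MBARI/dino_vits8_exemplars.csv')

# The 'embedding' column round-trips through CSV as a string like '[0.1, 0.2, ...]',
# so parse it back into a float array before computing similarities.
emb = np.array([ast.literal_eval(e) for e in df['embedding']])
print(df[['cluster', 'model']].head(), emb.shape)
```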
34 changes: 26 additions & 8 deletions sdcat/cluster/cluster.py
@@ -57,7 +57,7 @@ def _run_hdbscan_assign(
:param min_samples: The number of samples in a neighborhood for a point
:param ancillary_df: (optional) Ancillary data to include in the clustering
:param out_path: The output path to save the clustering artifacts to
:return: The average similarity score for each cluster, cluster ids, cluster means, and coverage
:return: The average similarity score for each cluster, exemplar_df, cluster ids, cluster means, and coverage
"""
info(f'Clustering using HDBSCAN using alpha {alpha}...')

@@ -80,8 +80,15 @@
# Get the number of samples which is the number of rows in the dataframe - this is used mostly for calculating coverage
num_samples = df.shape[0]

tsne = TSNE(n_components=2, perplexity=40, metric="cosine", n_jobs=8, random_state=42, verbose=True)
embedding = tsne.fit_transform(df.values)
# Perplexity must be less than the number of samples
perplexity = min(30, num_samples - 1)

    # t-SNE does not work well when we have only a few samples
if num_samples > 100:
tsne = TSNE(n_components=2, perplexity=perplexity, metric="cosine", n_jobs=8, random_state=42, verbose=True)
embedding = tsne.fit_transform(df.values)
else:
embedding = df.values
x = MinMaxScaler().fit_transform(embedding) # scale the embedding to 0-1

# Cluster the embeddings using HDBSCAN
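The new guard above exists because scikit-learn's t-SNE requires perplexity to be strictly less than the number of samples and tends to produce unstable layouts on very small inputs. A minimal standalone sketch of the same fallback, assuming scikit-learn's `TSNE`; the `reduce_for_clustering` wrapper name and the 100-sample threshold simply mirror the commit:

```python
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler


def reduce_for_clustering(features: np.ndarray, min_tsne_samples: int = 100) -> np.ndarray:
    """Project features to 2-D with t-SNE, or pass them through unchanged
    when there are too few samples for a stable (or even valid) fit."""
    num_samples = features.shape[0]
    if num_samples > min_tsne_samples:
        # scikit-learn requires perplexity < n_samples; cap it the same way the commit does
        perplexity = min(30, num_samples - 1)
        tsne = TSNE(n_components=2, perplexity=perplexity, metric='cosine', random_state=42)
        embedding = tsne.fit_transform(features)
    else:
        embedding = features
    # Scale to 0-1 before clustering, as in the commit
    return MinMaxScaler().fit_transform(embedding)
```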
@@ -131,13 +138,20 @@ def _run_hdbscan_assign(
# Get the index of the highest scores for each unique cluster sorted in increasing order
# and use this as a representative image for the cluster
max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
# Remove the first and last index which are the unassigned cluster and the noise cluster
max_scores = max_scores[1:-1]
# Remove the last index which is the -1 cluster
max_scores = max_scores[:-1]

# Get the representative embeddings for the max scoring each cluster
    # Get the representative embeddings for the max-scoring exemplars for each cluster and store them in a numpy array
exemplar_emb = [image_emb[i] for i in max_scores]
exemplar_emb = np.array(exemplar_emb)

# Save the exemplar embeddings to a dataframe with some metadata
exemplar_df = pd.DataFrame()
exemplar_df['cluster'] = [f'Unknown C{i}' for i in range(0, len(max_scores))]
if ancillary_df is not None and 'image_path' in ancillary_df.columns:
exemplar_df['image_path'] = ancillary_df.iloc[max_scores]['image_path'].tolist()
exemplar_df['embedding'] = exemplar_emb.tolist()

# Reassign the unknowns to the closest cluster - this is only needed if the coverage is less than 1
clustered = labels >= 0
coverage = np.sum(clustered) / num_samples
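The exemplar picker above takes, for each cluster id, the row whose `score` is highest, then discards the entry for HDBSCAN's noise label. A toy sketch of the same `groupby`/`idxmax` pattern, filtering the noise cluster by its `-1` label rather than by position; `cluster_df` here is hypothetical stand-in data:

```python
import pandas as pd

# Toy stand-in for cluster_df: HDBSCAN labels noise as -1
cluster_df = pd.DataFrame({
    'cluster': [-1, 0, 0, 1, 1],
    'score':   [0.2, 0.9, 0.5, 0.4, 0.8],
})

# Index of the highest-scoring row per cluster, sorted by cluster id
max_scores = cluster_df.sort_values('cluster').groupby('cluster')['score'].idxmax()

# Keep only real clusters; the -1 noise label contributes no exemplar
exemplar_idx = max_scores[max_scores.index >= 0]
print(exemplar_idx.tolist())  # -> [1, 4]
```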
@@ -215,7 +229,7 @@ def _run_hdbscan_assign(
with open(f'{out_path}/{prefix}_summary.json', 'w') as f:
json.dump(params, f)

return avg_sim_scores, clusters, cluster_means, coverage
return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage


def cluster_vits(
@@ -298,7 +312,7 @@ def cluster_vits(
ancillary_df = df_dets

# Cluster the images
cluster_sim, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
cluster_sim, exemplar_df, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
image_emb,
alpha,
cluster_selection_epsilon,
@@ -345,6 +359,10 @@ def cluster_vits(
range(0, len(unique_clusters))]
pool.starmap(cluster_grid, args)

# Save the exemplar embeddings with the model type
exemplar_df['model'] = model
exemplar_df.to_csv(output_path / f'{prefix}_exemplars.csv', index=False)

info(f"Number of images {len(images)}")
info(f"Number of clusters {len(unique_clusters)}")
info(f"Coverage {coverage:.2f}")
6 changes: 3 additions & 3 deletions sdcat/cluster/commands.py
@@ -134,7 +134,7 @@ def run_cluster_det(det_dir, save_dir, device, config_ini, alpha, cluster_select
axis=1)

# Add in a column for the unique crop name for each detection with a unique id
df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster
df['cluster'] = -1 # -1 is the default value and means that the image is not in a cluster

# Remove small or large detections before clustering
size_before = len(df)
@@ -328,11 +328,11 @@ def run_cluster_roi(roi_dir, save_dir, device, config_ini, alpha, cluster_select
crop_path = save_dir / 'crops'
crop_path.mkdir(parents=True, exist_ok=True)
df['crop_path'] = df.apply(lambda row:
f'{crop_path}/{uuid.uuid5(uuid.NAMESPACE_DNS, row["image_path"])}.png',
f'{crop_path}/{Path(row["image_path"]).stem}.png',
axis=1)

# Add in a column for the unique crop name for each detection with a unique id
df['cluster_id'] = -1 # -1 is the default value and means that the image is not in a cluster
df['cluster'] = -1 # -1 is the default value and means that the image is not in a cluster

# Replace any NaNs with 0
    df = df.fillna(0)
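The `run_cluster_roi` change above swaps the crop file name from a deterministic UUID of the full image path to the image's stem. A small sketch contrasting the two (the example path is hypothetical); note that stems can collide when images in different directories share a file name, which the UUID scheme avoided:

```python
import uuid
from pathlib import Path

image_path = '/data/20230504-MBARI/images/DSC01833.jpg'

# Before this commit: a deterministic UUID derived from the full path,
# collision-free across directories but opaque
print(uuid.uuid5(uuid.NAMESPACE_DNS, image_path))  # prints a stable UUID

# After: the human-readable stem, which matches the source image name
print(Path(image_path).stem)  # DSC01833
```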
4 changes: 2 additions & 2 deletions sdcat/config/config.ini
@@ -37,8 +37,8 @@ min_similarity = 0.70
# dino_vits8 has block_size=8 which can be good for very small objects
# dinov2_vits14 has block_size=14
# Smaller block_size means more patches and more accurate fine-grained clustering on smaller objects
;model = dino_vits8
model = dinov2_vits14
model = dino_vits8
;model = dinov2_vits14
;model = dinov2_vitb14

[detect]
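With this change `dino_vits8` (8x8 patches) becomes the default over `dinov2_vits14` (14x14 patches), trading some global robustness for finer-grained tokens on small objects. As a rough sketch of how such a config name could map to a checkpoint via `torch.hub`; sdcat's actual loader is not part of this diff, so `load_backbone` is hypothetical:

```python
import torch


def load_backbone(name: str):
    """Map a config model name to a torch.hub checkpoint (sketch only)."""
    if name.startswith('dinov2'):
        # DINOv2 backbones use 14x14 patches
        return torch.hub.load('facebookresearch/dinov2', name)
    # DINO v1; dino_vits8 uses 8x8 patches, giving more (and finer) tokens
    # per image, which suits very small objects
    return torch.hub.load('facebookresearch/dino:main', name)


model = load_backbone('dino_vits8')
```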
