Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- Remove agora implementation and tests. ([#73](https://github.com/polis-community/red-dwarf/issues/74))
- Migrate from reference HDBSCAN module (in `scikit-learn`) to full-featured HDBSCAN* package.
- Add dependency groups to avoid installing everything. ([#11](https://github.com/polis-community/red-dwarf/issues/11))
- Add simple reducer registry for adding more dimensionality reduction algorithms at runtime.

### Fixes

Expand Down
30 changes: 29 additions & 1 deletion docs/api_reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,27 @@ use in Scikit-Learn workflows, pipelines, and APIs.
options:
show_root_heading: true

### ::: reddwarf.sklearn.cluster.BestPolisKMeans
options:
show_root_heading: true
docstring_style: numpy

### ::: reddwarf.sklearn.model_selection.GridSearchNonCV
options:
show_root_heading: true

### ::: reddwarf.sklearn.pipeline.PatchedPipeline
options:
show_root_heading: true

### ::: reddwarf.sklearn.transformers.SparsityAwareScaler
options:
show_root_heading: true

### ::: reddwarf.sklearn.transformers.SparsityAwareCapturer
options:
show_root_heading: true

## `reddwarf.utils.matrix`

### ::: reddwarf.utils.matrix.generate_raw_matrix
Expand All @@ -58,15 +71,30 @@ use in Scikit-Learn workflows, pipelines, and APIs.
options:
show_root_heading: true

## `reddwarf.utils.reducer.registry`

### ::: reddwarf.utils.reducer.registry.register_reducer
options:
show_root_heading: true

### ::: reddwarf.utils.reducer.registry.get_reducer
options:
show_root_heading: true

### ::: reddwarf.utils.reducer.registry.list_reducers
options:
show_root_heading: true

## `reddwarf.utils.clusterer`

### ::: reddwarf.utils.clusterer.base.run_clusterer
options:
show_root_heading: true

### ::: reddwarf.utils.clusterer.kmeans.find_best_kmeans
### ::: reddwarf.sklearn.cluster.BestPolisKMeans
options:
show_root_heading: true
docstring_style: numpy

## `reddwarf.utils.consensus`

Expand Down
12 changes: 6 additions & 6 deletions reddwarf/implementations/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
simple_filter_matrix,
get_clusterable_participant_ids,
)
from reddwarf.utils.reducer.base import ReducerType, ReducerModel, run_reducer
from reddwarf.utils.clusterer.base import ClustererType, ClustererModel
from reddwarf.utils.reducer.base import ReducerModel, run_reducer
from reddwarf.utils.clusterer.base import ClustererModel
from reddwarf.utils.stats import (
calculate_comment_statistics_dataframes,
populate_priority_calculations_into_statements_df,
Expand Down Expand Up @@ -54,9 +54,9 @@ class PolisClusteringResult:

def run_pipeline(
votes: list[dict],
reducer: ReducerType = "pca",
reducer: str = "pca",
reducer_kwargs: dict = {},
clusterer: ClustererType = "kmeans",
clusterer: str = "kmeans",
clusterer_kwargs: dict = {},
mod_out_statement_ids: list[int] = [],
meta_statement_ids: list[int] = [],
Expand All @@ -79,9 +79,9 @@ def run_pipeline(

Args:
votes (list[dict]): Raw list of vote dicts, with keys for "participant_id", "statement_id", "vote" and "modified"
reducer (ReducerType): Selects the type of reducer model to use.
reducer (str): Selects the type of reducer model to use.
reducer_kwargs (dict): Extra params to pass to reducer model during initialization.
clusterer (ClustererType): Selects the type of clusterer model to use.
clusterer (str): Selects the type of clusterer model to use.
clusterer_kwargs (dict): Extra params to pass to clusterer model during initialization.
mod_out_statement_ids (list[int]): List of statement IDs to moderate/zero out
meta_statement_ids (list[int]): List of meta statement IDs
Expand Down
151 changes: 130 additions & 21 deletions reddwarf/sklearn/cluster.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,34 @@
from typing import Optional
from typing import List, Optional

import numpy as np
from numpy.typing import ArrayLike
from numpy.typing import ArrayLike, NDArray
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import silhouette_score
from sklearn.utils.validation import check_random_state, check_array

from reddwarf.sklearn.model_selection import GridSearchNonCV


def _to_range(r) -> range:
"""
Creates an inclusive range from a list, tuple, or int.

Examples:
_to_range(2) # [2]
_to_range([2, 5]) # [2, 3, 4, 5]
_to_range((2, 5)) # [2, 3, 4, 5]
"""
if isinstance(r, int):
start = end = r
elif isinstance(r, (tuple, list)) and len(r) == 2:
start, end = r
else:
raise ValueError("Expected int or a 2-element tuple/list")

return range(start, end + 1) # inclusive


class PolisKMeans(KMeans):
"""
A modified version of scikit-learn's KMeans that allows partial initialization
Expand Down Expand Up @@ -76,7 +101,7 @@ def __init__(
self.init_centers = init_centers
self.init_centers_used_ = None

def _generate_centers(self, X, x_squared_norms, n_to_generate, random_state):
def _generate_centers(self, X, x_squared_norms, n_to_generate, random_state) -> np.ndarray:
if not isinstance(self._init_strategy, str):
raise ValueError("Internal error: _strategy must be a string.")

Expand Down Expand Up @@ -122,38 +147,45 @@ def fit(self, X, y=None, sample_weight=None):
X, x_squared_norms, self.n_clusters, random_state
)

# Override the init param passed to sklearn with actual centers.
# We take control of the initialization strategy (`k-means++`, `random`,
# `polis`, etc) in our own code.
# Override the init param with our computed centers
super().set_params(init=self.init_centers_used_)

return super().fit(X, y=y, sample_weight=sample_weight)

from sklearn.base import BaseEstimator, TransformerMixin


class PolisKMeansDownsampler(BaseEstimator, TransformerMixin):
"""
A transformer that fits `PolisKMeans` and returns the cluster centers as the
downsampled dataset.
A transformer that fits PolisKMeans and returns cluster centers as downsampled data.

This will support mimicking "base clusters" from the Polis platform.
This supports mimicking "base clusters" from the Polis platform and enables
use in sklearn pipelines where intermediate steps implement both fit and transform.

This enables use in sklearn pipelines, where intermediate steps
are expected to implement both `fit` and `transform`.
Parameters
----------
n_clusters : int, default=100
Number of clusters to form
random_state : int, RandomState instance or None, default=None
Random state for reproducible results
init : {'k-means++', 'random', 'polis'}, default='k-means++'
Initialization strategy
init_centers : array-like of shape (n_clusters, n_features), optional
Initial cluster centers
"""
def __init__(self,
n_clusters=100,
random_state=None,
init="k-means++",
init_centers=None,

def __init__(
self,
n_clusters: int = 100,
random_state: Optional[int] = None,
init: str = "k-means++",
init_centers: Optional[ArrayLike] = None,
):
self.n_clusters = n_clusters
self.random_state = random_state
self.init = init
self.init_centers = init_centers
self.kmeans_ = None

def fit(self, X, y=None):
def fit(self, X, y=None) -> 'PolisKMeansDownsampler':
self.kmeans_ = PolisKMeans(
n_clusters=self.n_clusters,
random_state=self.random_state,
Expand All @@ -163,5 +195,82 @@ def fit(self, X, y=None):
self.kmeans_.fit(X)
return self

def transform(self, X, y=None):
return self.kmeans_.cluster_centers_ if self.kmeans_ else None
def transform(self, X, y=None) -> Optional[np.ndarray]:
return self.kmeans_.cluster_centers_ if self.kmeans_ else None


class BestPolisKMeans(BaseEstimator):
    """
    A clusterer that automatically finds optimal k-means clustering using silhouette scores.

    This class provides a scikit-learn-like interface while handling k-selection
    internally: a ``GridSearchNonCV`` search is run over ``n_clusters`` for a
    ``PolisKMeans`` estimator, and each candidate is scored with
    ``silhouette_score``.

    Parameters
    ----------
    k_bounds : int or list of int, optional
        Values of k to search, as accepted by ``_to_range``: either a single
        int or an inclusive ``[min_k, max_k]`` pair. Falls back to ``[2, 5]``
        when omitted (or otherwise falsy).
    init : {'k-means++', 'random', 'polis'}, default='polis'
        Initialization strategy, forwarded to ``PolisKMeans``.
    init_centers : array-like, optional
        Initial cluster centers, forwarded to ``PolisKMeans``.
    random_state : int, optional
        Random state for reproducible results, forwarded to ``PolisKMeans``.

    Attributes
    ----------
    best_estimator_ : PolisKMeans
        The best estimator reported by the search (``None`` before fitting).
    best_k_ : int
        The optimal number of clusters found (``None`` before fitting).
    best_score_ : float
        The best silhouette score achieved (``None`` before fitting).
    labels_ : ndarray or None
        Cluster labels taken from ``best_estimator_`` after fitting. ``None``
        before fitting, or if the selected estimator exposes no ``labels_``.
    """

    def __init__(
        self,
        k_bounds: Optional[List[int]] = None,
        init: str = "polis",
        init_centers: Optional[ArrayLike] = None,
        random_state: Optional[int] = None,
    ):
        self.k_bounds = k_bounds or [2, 5]
        self.init = init
        self.init_centers = init_centers
        self.random_state = random_state
        self.best_estimator_ = None
        self.best_k_ = None
        self.best_score_ = None
        # Declared here so that reading `labels_` before `fit` yields None
        # instead of raising AttributeError, like the other result attributes.
        self.labels_ = None

    def fit(self, X: NDArray, y=None) -> 'BestPolisKMeans':
        """
        Fit the clusterer and find optimal number of clusters using silhouette scores.

        Parameters
        ----------
        X : ndarray
            Data to cluster.
        y : Ignored
            Accepted only so the estimator can be called as ``fit(X, y)``.

        Returns
        -------
        self : BestPolisKMeans
            The estimator, with ``best_k_``, ``best_score_``,
            ``best_estimator_`` and ``labels_`` populated.
        """
        param_grid = {
            "n_clusters": _to_range(self.k_bounds),
        }

        def scoring_function(estimator, X_data):
            # Score a candidate by the silhouette of the labels it assigns.
            labels = estimator.fit_predict(X_data)
            return silhouette_score(X_data, labels)

        search = GridSearchNonCV(
            param_grid=param_grid,
            scoring=scoring_function,
            estimator=PolisKMeans(
                init=self.init,
                init_centers=self.init_centers,
                random_state=self.random_state,
            ),
        )

        search.fit(X)

        self.best_k_ = search.best_params_['n_clusters']
        self.best_score_ = search.best_score_
        self.best_estimator_ = search.best_estimator_
        # Publish the winning estimator's labels on this object; `fit_predict`
        # reads `self.labels_`, which was previously never assigned.
        # NOTE(review): assumes `search.best_estimator_` is a fitted estimator
        # carrying `labels_`; `getattr` keeps `fit` from failing if it is not.
        self.labels_ = getattr(self.best_estimator_, "labels_", None)

        return self

    def fit_predict(self, X: NDArray, y=None, **kwargs) -> Optional[np.ndarray]:
        """
        Fit the clusterer and return cluster labels.

        Parameters
        ----------
        X : ndarray
            Data to cluster.
        y : Ignored
            Present for API compatibility.
        **kwargs
            Accepted for API compatibility; not used.

        Returns
        -------
        labels : ndarray or None
            Labels from the best estimator, or ``None`` if it exposes none.
        """
        self.fit(X)
        return self.labels_
1 change: 0 additions & 1 deletion reddwarf/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from reddwarf.utils.clusterer.kmeans import *
from reddwarf.utils.matrix import *
from reddwarf.utils.polismath import *
from reddwarf.utils.stats import *
Expand Down
5 changes: 5 additions & 0 deletions reddwarf/utils/clusterer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
def load_builtins() -> None:
    """
    Load the builtin clusterers into the clusterer registry.

    The import happens when this function is called rather than when the
    package is imported, and nothing is returned.
    """
    # Imported purely for its import-time side effects; the bound name is not
    # used, which is why the unused-import lint (F401) is silenced.
    # NOTE(review): presumably `builtins` registers each clusterer when it is
    # imported; confirm against that module.
    import reddwarf.utils.clusterer.builtins  # noqa: F401
Loading