polis-community · patcon · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -30,6 +30,7 @@
 - Remove agora implementation and tests. ([#73](https://github.com/polis-community/red-dwarf/issues/74))
 - Migrate from reference HDBSCAN module (in `scikit-learn`) to full-featured HDBSCAN* package.
 - Add dependency groups to avoid installing everything. ([#11](https://github.com/polis-community/red-dwarf/issues/11))
+- Add simple reducer registry for adding more dimensionality reduction algorithms at runtime.
 
 ### Fixes
 

diff --git a/docs/api_reference.md b/docs/api_reference.md
@@ -26,14 +26,27 @@ use in Scikit-Learn workflows, pipelines, and APIs.
     options:
         show_root_heading: true
 
+### ::: reddwarf.sklearn.cluster.BestPolisKMeans
+    options:
+        show_root_heading: true
+        docstring_style: numpy
+
 ### ::: reddwarf.sklearn.model_selection.GridSearchNonCV
     options:
         show_root_heading: true
 
+### ::: reddwarf.sklearn.pipeline.PatchedPipeline
+    options:
+        show_root_heading: true
+
 ### ::: reddwarf.sklearn.transformers.SparsityAwareScaler
     options:
         show_root_heading: true
 
+### ::: reddwarf.sklearn.transformers.SparsityAwareCapturer
+    options:
+        show_root_heading: true
+
 ## `reddwarf.utils.matrix`
 
 ### ::: reddwarf.utils.matrix.generate_raw_matrix
@@ -58,15 +71,30 @@ use in Scikit-Learn workflows, pipelines, and APIs.
     options:
         show_root_heading: true
 
+## `reddwarf.utils.reducer.registry`
+
+### ::: reddwarf.utils.reducer.registry.register_reducer
+    options:
+        show_root_heading: true
+
+### ::: reddwarf.utils.reducer.registry.get_reducer
+    options:
+        show_root_heading: true
+
+### ::: reddwarf.utils.reducer.registry.list_reducers
+    options:
+        show_root_heading: true
+
 ## `reddwarf.utils.clusterer`
 
 ### ::: reddwarf.utils.clusterer.base.run_clusterer
     options:
         show_root_heading: true
 
-### ::: reddwarf.utils.clusterer.kmeans.find_best_kmeans
+### ::: reddwarf.sklearn.cluster.BestPolisKMeans
     options:
         show_root_heading: true
+        docstring_style: numpy
 
 ## `reddwarf.utils.consensus`
 

diff --git a/reddwarf/implementations/base.py b/reddwarf/implementations/base.py
@@ -11,8 +11,8 @@
     simple_filter_matrix,
     get_clusterable_participant_ids,
 )
-from reddwarf.utils.reducer.base import ReducerType, ReducerModel, run_reducer
-from reddwarf.utils.clusterer.base import ClustererType, ClustererModel
+from reddwarf.utils.reducer.base import ReducerModel, run_reducer
+from reddwarf.utils.clusterer.base import ClustererModel
 from reddwarf.utils.stats import (
     calculate_comment_statistics_dataframes,
     populate_priority_calculations_into_statements_df,
@@ -54,9 +54,9 @@ class PolisClusteringResult:
 
 def run_pipeline(
     votes: list[dict],
-    reducer: ReducerType = "pca",
+    reducer: str = "pca",
     reducer_kwargs: dict = {},
-    clusterer: ClustererType = "kmeans",
+    clusterer: str = "kmeans",
     clusterer_kwargs: dict = {},
     mod_out_statement_ids: list[int] = [],
     meta_statement_ids: list[int] = [],
@@ -79,9 +79,9 @@ def run_pipeline(
 
     Args:
         votes (list[dict]): Raw list of vote dicts, with keys for "participant_id", "statement_id", "vote" and "modified"
-        reducer (ReducerType): Selects the type of reducer model to use.
+        reducer (str): Selects the type of reducer model to use.
         reducer_kwargs (dict): Extra params to pass to reducer model during initialization.
-        clusterer (ClustererType): Selects the type of clusterer model to use.
+        clusterer (str): Selects the type of clusterer model to use.
         clusterer_kwargs (dict): Extra params to pass to clusterer model during initialization.
         mod_out_statement_ids (list[int]): List of statement IDs to moderate/zero out
         meta_statement_ids (list[int]): List of meta statement IDs

diff --git a/reddwarf/sklearn/cluster.py b/reddwarf/sklearn/cluster.py
@@ -1,9 +1,34 @@
-from typing import Optional
+from typing import List, Optional
+
 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.cluster import KMeans, kmeans_plusplus
+from sklearn.metrics import silhouette_score
 from sklearn.utils.validation import check_random_state, check_array
 
+from reddwarf.sklearn.model_selection import GridSearchNonCV
+
+
+def _to_range(r) -> range:
+    """
+    Build a `range` that includes both of its endpoints.
+
+    An int `k` yields just `k`; a 2-element list or tuple `(lo, hi)` yields
+    `lo..hi`, e.g. `_to_range([2, 5])` covers 2, 3, 4, 5.
+    Any other input raises ValueError.
+    """
+    if isinstance(r, int):
+        return range(r, r + 1)
+
+    is_pair = isinstance(r, (tuple, list)) and len(r) == 2
+    if not is_pair:
+        raise ValueError("Expected int or a 2-element tuple/list")
+
+    low, high = r
+    return range(low, high + 1)  # +1 makes the upper bound inclusive
+
+
 class PolisKMeans(KMeans):
     """
     A modified version of scikit-learn's KMeans that allows partial initialization
@@ -76,7 +101,7 @@ def __init__(
         self.init_centers = init_centers
         self.init_centers_used_ = None
 
-    def _generate_centers(self, X, x_squared_norms, n_to_generate, random_state):
+    def _generate_centers(self, X, x_squared_norms, n_to_generate, random_state) -> np.ndarray:
         if not isinstance(self._init_strategy, str):
             raise ValueError("Internal error: _strategy must be a string.")
 
@@ -122,38 +147,45 @@ def fit(self, X, y=None, sample_weight=None):
                 X, x_squared_norms, self.n_clusters, random_state
             )
 
-        # Override the init param passed to sklearn with actual centers.
-        # We take control of the initialization strategy (`k-means++`, `random`,
-        # `polis`, etc) in our own code.
+        # Override the init param with our computed centers
         super().set_params(init=self.init_centers_used_)
-
         return super().fit(X, y=y, sample_weight=sample_weight)
 
-from sklearn.base import BaseEstimator, TransformerMixin
+
 
 class PolisKMeansDownsampler(BaseEstimator, TransformerMixin):
     """
-    A transformer that fits `PolisKMeans` and returns the cluster centers as the
-    downsampled dataset.
+    A transformer that fits PolisKMeans and returns cluster centers as downsampled data.
 
-    This will support mimicking "base clusters" from the Polis platform.
+    This supports mimicking "base clusters" from the Polis platform and enables
+    use in sklearn pipelines where intermediate steps implement both fit and transform.
 
-    This enables use in sklearn pipelines, where intermediate steps
-    are expected to implement both `fit` and `transform`.
+    Parameters
+    ----------
+    n_clusters : int, default=100
+        Number of clusters to form
+    random_state : int, RandomState instance or None, default=None
+        Random state for reproducible results
+    init : {'k-means++', 'random', 'polis'}, default='k-means++'
+        Initialization strategy
+    init_centers : array-like of shape (n_clusters, n_features), optional
+        Initial cluster centers
     """
-    def __init__(self,
-        n_clusters=100,
-        random_state=None,
-        init="k-means++",
-        init_centers=None,
+
+    def __init__(
+        self,
+        n_clusters: int = 100,
+        random_state: Optional[int] = None,
+        init: str = "k-means++",
+        init_centers: Optional[ArrayLike] = None,
     ):
         self.n_clusters = n_clusters
         self.random_state = random_state
         self.init = init
         self.init_centers = init_centers
         self.kmeans_ = None
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None) -> 'PolisKMeansDownsampler':
         self.kmeans_ = PolisKMeans(
             n_clusters=self.n_clusters,
             random_state=self.random_state,
@@ -163,5 +195,82 @@ def fit(self, X, y=None):
         self.kmeans_.fit(X)
         return self
 
-    def transform(self, X, y=None):
-        return self.kmeans_.cluster_centers_ if self.kmeans_ else None
+    def transform(self, X, y=None) -> Optional[np.ndarray]:
+        """Return the fitted model's cluster centers, or None if `kmeans_` is unset; X and y are unused."""
+        if not self.kmeans_:
+            return None
+        return self.kmeans_.cluster_centers_
+
+
+class BestPolisKMeans(BaseEstimator):
+    """
+    A clusterer that automatically finds optimal k-means clustering using silhouette scores.
+
+    This class provides a scikit-learn-like interface while handling k-selection
+    internally using grid search and silhouette scoring.
+
+    Parameters
+    ----------
+    k_bounds : list of int, default=[2, 5]
+        Range of k values to search [min_k, max_k]
+    init : {'k-means++', 'random', 'polis'}, default='polis'
+        Initialization strategy
+    init_centers : array-like, optional
+        Initial cluster centers
+    random_state : int, optional
+        Random state for reproducible results
+
+    Attributes
+    ----------
+    best_estimator_ : PolisKMeans
+        The best fitted estimator
+    best_k_ : int
+        The optimal number of clusters found
+    best_score_ : float
+        The best silhouette score achieved
+    labels_ : ndarray or None
+        Cluster labels taken from `best_estimator_` (None before fitting)
+    """
+
+    def __init__(
+        self,
+        k_bounds: Optional[List[int]] = None,
+        init: str = "polis",
+        init_centers: Optional[ArrayLike] = None,
+        random_state: Optional[int] = None,
+    ):
+        self.k_bounds = k_bounds or [2, 5]
+        self.init = init
+        self.init_centers = init_centers
+        self.random_state = random_state
+        self.best_estimator_ = None
+        self.best_k_ = None
+        self.best_score_ = None
+        self.labels_ = None
+
+    def fit(self, X: NDArray, y=None) -> 'BestPolisKMeans':
+        """Grid-search `n_clusters` over `k_bounds` by silhouette score; `y` is accepted but ignored."""
+        param_grid = {"n_clusters": _to_range(self.k_bounds)}
+
+        def scoring_function(estimator, X_data):
+            labels = estimator.fit_predict(X_data)
+            return silhouette_score(X_data, labels)
+
+        search = GridSearchNonCV(
+            param_grid=param_grid,
+            scoring=scoring_function,
+            estimator=PolisKMeans(
+                init=self.init,
+                init_centers=self.init_centers,
+                random_state=self.random_state,
+            ),
+        )
+        search.fit(X)
+        self.best_k_ = search.best_params_['n_clusters']
+        self.best_score_ = search.best_score_
+        self.best_estimator_ = search.best_estimator_
+        # Mirror the winner's labels here: fit_predict reads `labels_` from this object.
+        self.labels_ = getattr(self.best_estimator_, "labels_", None)
+        return self
+
+    def fit_predict(self, X: NDArray, y=None, **kwargs) -> Optional[np.ndarray]:
+        """Fit the clusterer and return the best estimator's cluster labels."""
+        self.fit(X)
+        return self.labels_
diff --git a/reddwarf/utils/__init__.py b/reddwarf/utils/__init__.py
@@ -1,4 +1,3 @@
-from reddwarf.utils.clusterer.kmeans import *
 from reddwarf.utils.matrix import *
 from reddwarf.utils.polismath import *
 from reddwarf.utils.stats import *

diff --git a/reddwarf/utils/clusterer/__init__.py b/reddwarf/utils/clusterer/__init__.py
@@ -0,0 +1,5 @@
+def load_builtins():
+    """
+    Import `reddwarf.utils.clusterer.builtins` purely for its side effects,
+    so that the builtin clusterers are loaded into the clusterer registry.
+    """
+    from reddwarf.utils.clusterer import builtins as _builtins  # noqa: F401