Coverage for src\cognitivefactory\interactive_clustering\clustering\mpckmeans.py: 89.11%
217 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
1# -*- coding: utf-8 -*-
3"""
4* Name: cognitivefactory.interactive_clustering.clustering.mpckmeans
5* Description: Implementation of constrained mpckmeans clustering algorithms.
6* Author: Esther LENOTRE, David NICOLAZO, Marc TRUTT
7* Created: 10/09/2022
8* Licence: CeCILL-C License v1.0 (https://cecill.info/licences.fr.html)
9"""
11# ==============================================================================
12# IMPORT PYTHON DEPENDENCIES
13# ==============================================================================
15import warnings
17# import random
18from typing import Dict, List, Optional, Set, Tuple
20import numpy as np
21import scipy
22from scipy.sparse import csr_matrix
24from cognitivefactory.interactive_clustering.clustering.abstract import (
25 AbstractConstrainedClustering,
26 rename_clusters_by_order,
27)
28from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager
31# ==============================================================================
32# MPCKMEANS CONSTRAINED CLUSTERING
33# ==============================================================================
34class MPCKMeansConstrainedClustering(AbstractConstrainedClustering):
35 """
36 This class implements the MPCkmeans constrained clustering.
37 It inherits from `AbstractConstrainedClustering`.
39 Forked from https://github.com/Behrouz-Babaki/COP-Kmeans/blob/master/copkmeans/cop_kmeans.py
40 Modified by Esther LENOTRE <git@estherlenotre.fr> according to https://proceedings.mlr.press/v5/givoni09a.html
42 References:
43 - KMeans Clustering: `MacQueen, J. (1967). Some methods for classification and analysis of multivariate observations. Proceedings of the fifth Berkeley symposium on mathematical statistics and probability 1(14), 281–297.`
44 - Constrained _'MPC'_ KMeans Clustering: `Khan, Md. A., Tamim, I., Ahmed, E., & Awal, M. A. (2012). Multiple Parameter Based Clustering (MPC): Prospective Analysis for Effective Clustering in Wireless Sensor Network (WSN) Using K-Means Algorithm. In Wireless Sensor Network (Vol. 04, Issue 01, pp. 18–24). Scientific Research Publishing, Inc. https://doi.org/10.4236/wsn.2012.41003`
46 Example:
47 ```python
48 # Import.
49 from scipy.sparse import csr_matrix
50 from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager
51 from cognitivefactory.interactive_clustering.clustering.dbscan import MPCKMeansConstrainedClustering
53 # NB : use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts.
55 vectors = {
56 "0": csr_matrix([1.00, 0.00, 0.00, 0.00]),
57 "1": csr_matrix([0.95, 0.02, 0.02, 0.01]),
58 "2": csr_matrix([0.98, 0.00, 0.02, 0.00]),
59 "3": csr_matrix([0.99, 0.00, 0.01, 0.00]),
60 "4": csr_matrix([0.50, 0.22, 0.21, 0.07]),
61 "5": csr_matrix([0.50, 0.21, 0.22, 0.07]),
62 "6": csr_matrix([0.01, 0.01, 0.01, 0.97]),
63 "7": csr_matrix([0.00, 0.01, 0.00, 0.99]),
64 "8": csr_matrix([0.00, 0.00, 0.00, 1.00]),
65 }
67 # Define constraints manager.
68 constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys()))
69 constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK")
70 constraints_manager.add_constraint(data_ID1="0", data_ID2="7", constraint_type="MUST_LINK")
71 constraints_manager.add_constraint(data_ID1="0", data_ID2="8", constraint_type="MUST_LINK")
72 constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK")
73 constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK")
74 constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK")
76 cluster_model = MPCKMeansConstrainedClustering()
77 dict_of_predicted_clusters = cluster_model.cluster(
78 constraints_manager=constraints_manager,
79 vectors=vectors,
80 nb_clusters=3,
81 )
83 # Print results.
84 print("Expected results", ";", {"0": 0, "1": 0, "2": 1, "3": 1, "4": 2, "5": 2, "6": 0, "7": 0, "8": 0,})
85 print("Computed results", ":", dict_of_predicted_clusters)
86 ```
88 Warns:
89 FutureWarning: `clustering.mpckmeans.MPCKMeansConstrainedClustering` is still in development and is not fully tested : it is not ready for production use.
90 """
92 # ==============================================================================
93 # INITIALIZATION
94 # ==============================================================================
95 def __init__(
96 self,
97 model: str = "MPC",
98 max_iteration: int = 150,
99 w: float = 1.0,
100 random_seed: Optional[int] = None,
101 **kargs,
102 ) -> None:
103 """
104 The constructor for MPCKMeans Constrainted Clustering class.
106 Args:
107 model (str, optional): The kmeans clustering model to use. Available kmeans models are `"MPC"`. Defaults to `"MPC"`.
108 max_iteration (int, optional): The maximum number of kmeans iteration for convergence. Defaults to `150`.
109 w (float, optional): Weight for the constraints
110 random_seed (Optional[int]): The random seed to use to redo the same clustering. Defaults to `None`.
111 **kargs (dict): Other parameters that can be used in the instantiation.
113 Warns:
114 FutureWarning: `clustering.mpckmeans.MPCKMeansConstrainedClustering` is still in development and is not fully tested : it is not ready for production use.
116 Raises:
117 ValueError: if some parameters are incorrectly set.
118 """
120 # Deprecation warnings
121 warnings.warn(
122 "`clustering.mpckmeans.MPCKMeansConstrainedClustering` is still in development and is not fully tested : it is not ready for production use.",
123 FutureWarning, # DeprecationWarning
124 stacklevel=2,
125 )
127 # Store `self.`model`.
128 if model != "MPC": # TODO use `not in {"MPC"}`.
129 raise ValueError("The `model` '" + str(model) + "' is not implemented.")
130 self.model: str = model
132 # Store 'self.max_iteration`.
133 if max_iteration < 1:
134 raise ValueError("The `max_iteration` must be greater than or equal to 1.")
135 self.max_iteration: int = max_iteration
137 # Store `self.weight`.
138 if w < 0:
139 raise ValueError("The `weight` must be greater than 0.0.")
140 self.w: float = w
142 # Store `self.random_seed`.
143 self.random_seed: Optional[int] = random_seed
145 # Store `self.kargs` for clustering.
146 self.kargs = kargs
148 # Initialize `self.dict_of_predicted_clusters`.
149 self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None
151 # Initialize `ml_graph` and `cl_graph`.
152 self.ml_graph: Dict[str, List[str]] = {}
153 self.cl_graph: Dict[str, List[str]] = {}
155 # ==============================================================================
156 # MAIN - CLUSTER DATA
157 # ==============================================================================
159 def cluster(
160 self,
161 constraints_manager: AbstractConstraintsManager,
162 vectors: Dict[str, csr_matrix],
163 nb_clusters: Optional[int],
164 verbose: bool = False,
165 y=None,
166 **kargs,
167 ) -> Dict[str, int]:
168 """
169 The main method used to cluster data with the KMeans model.
171 Args:
172 constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
173 vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represents the data IDs. This keys have to refer to the list of data IDs managed by the `constraints_manager`. The value of the dictionary represent the vector of each data.
174 nb_clusters (Optional[int]): The number of clusters to compute. Here None.
175 verbose (bool, optional): Enable verbose output. Defaults to `False`.
176 y : Something.
177 **kargs (dict): Other parameters that can be used in the clustering.
179 Raises:
180 ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.
182 Returns:
183 Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
184 """
186 ###
187 ### GET PARAMETERS
188 ###
190 # Store `self.constraints_manager` and `self.list_of_data_IDs`.
191 if not isinstance(constraints_manager, AbstractConstraintsManager):
192 raise ValueError("The `constraints_manager` parameter has to be a `AbstractConstraintsManager` type.")
193 self.constraints_manager: AbstractConstraintsManager = constraints_manager
194 self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()
196 # Store `self.vectors`.
197 if not isinstance(vectors, dict):
198 raise ValueError("The `vectors` parameter has to be a `dict` type.")
199 self.vectors: Dict[str, csr_matrix] = vectors
201 # Store `self.nb_clusters`.
202 if (nb_clusters is None) or (nb_clusters < 2):
203 raise ValueError("The `nb_clusters` '" + str(nb_clusters) + "' must be greater than or equal to 2.")
204 self.nb_clusters: int = min(nb_clusters, len(self.list_of_data_IDs))
206 # TODO: Reformat vectors
207 id_names: np.ndarray = np.array(list(vectors.keys()))
208 X: np.ndarray = np.array(
209 [
210 (np.array(v).flatten() if isinstance(v, (np.ndarray, list)) else v.toarray().flatten())
211 for v in vectors.values()
212 ]
213 )
215 # TODO: reformat constraints
216 self.ml: List[Tuple[int, int]] = [
217 (i, j)
218 for i, data_ID_i in enumerate(self.list_of_data_IDs)
219 for j, data_ID_j in enumerate(self.list_of_data_IDs)
220 if (
221 self.constraints_manager.get_inferred_constraint(
222 data_ID1=data_ID_j,
223 data_ID2=data_ID_i,
224 )
225 == "MUST_LINK"
226 )
227 ]
229 # TODO: reformat constraints
230 self.cl: List[Tuple[int, int]] = [
231 (i, j)
232 for i, data_ID_i in enumerate(self.list_of_data_IDs)
233 for j, data_ID_j in enumerate(self.list_of_data_IDs)
234 if (
235 self.constraints_manager.get_inferred_constraint(
236 data_ID1=data_ID_j,
237 data_ID2=data_ID_i,
238 )
239 == "CANNOT_LINK"
240 )
241 ]
243 # Preprocess constraints
244 ml_graph, cl_graph, neighborhoods = self.preprocess_constraints()
246 # Initialize cluster centers
247 cluster_centers = self._initialize_cluster_centers(X, neighborhoods)
249 # Initialize metrics
250 A = np.identity(X.shape[1])
252 iteration = 0
253 converged = False
255 # Repeat until convergence
256 while not converged and iteration < self.max_iteration:
257 prev_cluster_centers = cluster_centers.copy()
259 # Find farthest pair of points according to each metric
260 farthest = self._find_farthest_pairs_of_points(X, A)
262 # Assign clusters
263 labels = self._assign_clusters(X, y, cluster_centers, A, farthest, ml_graph, cl_graph, self.w)
265 # Estimate means
266 cluster_centers = self._get_cluster_centers(X, labels)
268 # Update metrics
269 A = self._update_metrics(X, labels, cluster_centers, farthest, ml_graph, cl_graph, self.w)
271 # Check for convergence
272 cluster_centers_shift = prev_cluster_centers - cluster_centers
273 converged = np.allclose(cluster_centers_shift, np.zeros(cluster_centers.shape), atol=1e-6, rtol=0)
274 iteration += 1
276 self.cluster_centers = cluster_centers
277 self.labels = labels
279 self.dict_of_predicted_clusters = {id_names[i]: self.labels[i] for i, label in enumerate(labels) if label != -1}
281 self.dict_of_predicted_clusters = rename_clusters_by_order(clusters=self.dict_of_predicted_clusters)
283 for data_ID in self.list_of_data_IDs:
284 if data_ID not in self.dict_of_predicted_clusters.keys(): 284 ↛ 285line 284 didn't jump to line 285, because the condition on line 284 was never true
285 self.dict_of_predicted_clusters[data_ID] = -1
287 return self.dict_of_predicted_clusters
289 # ==============================================================================
290 # USEFUL FUNCTIONS
291 # ==============================================================================
293 def dist(self, i: int, S: List[int], points: np.ndarray) -> float:
294 """
295 Computes the minimum distance of a single point to a group of points.
297 Args:
298 i (int): Index of the single point.
299 S (List[int]): List of the index of the group of points .
300 points (np.ndarray): Array containing all the points.
302 Returns:
303 float: Minimum distance of the single to the group of points.
304 """
306 distances = np.array([np.sqrt(((points[i] - points[j]) ** 2).sum()) for j in S])
307 return distances.min()
309 def _dist(self, x: np.ndarray, y: np.ndarray, A: np.ndarray) -> float:
310 """
311 Computes the Mahalanobis distance between two points.
312 "(x - y)^T A (x - y)"
314 Args:
315 x (np.ndarray): First point.
316 y (np.ndarray): Second point .
317 A (np.ndarray): Inverse of a covariance matrix.
319 Returns:
320 float: Minimum distance of the single to the group of points.
321 """
323 return scipy.spatial.distance.mahalanobis(x, y, A) ** 2
325 def _find_farthest_pairs_of_points(self, X: np.ndarray, A: np.ndarray) -> Tuple[int, int, float]:
326 """
327 Finds the farthest pair of points.
329 Args:
330 X (np.ndarray): Set of points.
331 A (np.ndarray): Positive-definite matrix used for the distances.
333 Returns:
334 Tuple[int, int, float]: Indexes of the farthest pair of points and the corresponding distance.
335 """
337 farthest = None
338 n = X.shape[0]
339 max_distance = 0.0
341 for i in range(n):
342 for j in range(n):
343 if j < i:
344 distance = self._dist(X[i], X[j], A)
345 if distance > max_distance:
346 max_distance = distance
347 farthest = (i, j, distance)
349 assert farthest is not None
351 return farthest
353 def weighted_farthest_first_traversal(self, points: np.ndarray, weights: np.ndarray, k: int) -> List[int]:
354 """
355 Applies weighted farthest first traversal algorithm.
357 Args:
358 points (np.ndarray): Set of points.
359 weights (np.ndarray): Weights for the distances.
360 k (int): Number of points to be traversed
362 Returns:
363 List[int]: Indexes of the traversed points.
364 """
365 traversed = []
367 # Choose the first point randomly (weighted)
368 i = np.random.choice(len(points), size=1, p=weights)[0]
369 traversed.append(i)
371 # Find remaining n - 1 maximally separated points
372 for _ in range(k - 1):
373 max_dst, max_dst_index = 0, None
375 number_of_points = len(points)
376 for j in range(number_of_points):
377 if j not in traversed:
378 dst = self.dist(j, traversed, points)
379 weighted_dst = weights[j] * dst
381 if weighted_dst > max_dst:
382 max_dst = weighted_dst
383 max_dst_index = j
385 traversed.append(max_dst_index)
387 return traversed
389 # ==============================================================================
390 # INITIALIZATION OF CLUSTERS
391 # ==============================================================================
392 def _add_both(self, d, i, j):
393 d[i].add(j)
394 d[j].add(i)
396 def _dfs(self, i, graph, visited, component):
397 visited[i] = True
398 for j in graph[i]:
399 if not visited[j]:
400 self._dfs(j, graph, visited, component)
401 component.append(i)
403 def preprocess_constraints(
404 self,
405 ) -> Tuple[Dict[int, Set[int]], Dict[int, Set[int]], List[List[int]]]:
406 """
407 Initialize each cluster.
408 The choice is based on the neighborhoods created by the initial constraints.
410 Raises:
411 ValueError: if there is a Cannot-link constraint in conflict with a Must-link constraint involving both one same point.
413 Returns:
414 Tuple[Dict[int, set], Dict[int, set], List[List[int]]]:
415 A new list of must-link and cannot-link constraints as well as the lambda starting neighborhoods.
416 """
418 # Get the list of possible indices.
419 indices: List[str] = self.list_of_data_IDs.copy()
421 n: int = len(indices)
423 # Represent the graphs using adjacency-lists.
424 ml_graph: Dict[int, Set[int]] = {}
425 cl_graph: Dict[int, Set[int]] = {}
427 for k in range(n):
428 ml_graph[k] = set()
429 cl_graph[k] = set()
431 for data_ID_i1, data_ID_j1 in self.ml:
432 ml_graph[data_ID_i1].add(data_ID_j1)
433 ml_graph[data_ID_j1].add(data_ID_i1)
435 for data_ID_i2, data_ID_j2 in self.cl:
436 cl_graph[data_ID_i2].add(data_ID_j2)
437 cl_graph[data_ID_j2].add(data_ID_i2)
439 visited = [False for _ in range(n)]
440 neighborhoods = []
441 for index in range(n):
442 if not visited[index] and ml_graph[index]:
443 component: List[int] = []
444 self._dfs(index, ml_graph, visited, component)
445 for x1 in component:
446 for x2 in component:
447 if x1 != x2:
448 ml_graph[x1].add(x2)
449 neighborhoods.append(component)
451 for data_ID_i3, data_ID_j3 in self.cl:
452 for x in ml_graph[data_ID_i3]:
453 self._add_both(cl_graph, x, data_ID_j3)
455 for y in ml_graph[data_ID_j3]:
456 self._add_both(cl_graph, data_ID_i3, y)
458 for a in ml_graph[data_ID_i3]:
459 for b in ml_graph[data_ID_j3]:
460 self._add_both(cl_graph, a, b)
462 for index_1, _ in ml_graph.items():
463 for index_2 in ml_graph[index_1]:
464 if index_2 != index_1 and index_2 in cl_graph[index_1]: 464 ↛ 465line 464 didn't jump to line 465, because the condition on line 464 was never true
465 raise ValueError("Inconsistent constraints between " + str(index_1) + " and " + str(index_2))
467 return ml_graph, cl_graph, neighborhoods
469 def _initialize_cluster_centers(self, X: np.ndarray, neighborhoods: List[List[int]]) -> np.ndarray:
470 """
471 Initialises cluster centers.
473 Args:
474 X (np.ndarray): Set of points.
475 neighborhoods (List[List[int]]): Lists of neighbors for each point.
477 Returns:
478 np.ndarray: Computed centers.
479 """
481 neighborhood_centers = np.array([X[neighborhood].mean(axis=0) for neighborhood in neighborhoods])
482 neighborhood_sizes = np.array([len(neighborhood) for neighborhood in neighborhoods])
483 neighborhood_weights = neighborhood_sizes / neighborhood_sizes.sum()
485 # print('\t', len(neighborhoods), neighborhood_sizes)
487 if len(neighborhoods) > self.nb_clusters: 487 ↛ 492line 487 didn't jump to line 492, because the condition on line 487 was never false
488 cluster_centers = neighborhood_centers[
489 self.weighted_farthest_first_traversal(neighborhood_centers, neighborhood_weights, self.nb_clusters)
490 ]
491 else:
492 if neighborhoods:
493 cluster_centers = neighborhood_centers
494 else:
495 cluster_centers = np.empty((0, X.shape[1]))
497 if len(neighborhoods) < self.nb_clusters:
498 remaining_cluster_centers = X[
499 np.random.choice(X.shape[0], self.nb_clusters - len(neighborhoods), replace=False), :
500 ]
501 cluster_centers = np.concatenate([cluster_centers, remaining_cluster_centers])
503 return cluster_centers
505 # ==============================================================================
506 # COMPUTE CLUSTERS
507 # ==============================================================================
509 def _f_m(self, X: np.ndarray, i: int, j: int, A) -> float:
510 return self._dist(X[i], X[j], A)
512 def _f_c(self, X: np.ndarray, i: int, j: int, A, farthest) -> float:
513 return farthest[2] - self._dist(X[i], X[j], A)
515 def _objective_fn(
516 self, X: np.ndarray, i: int, labels, cluster_centers, cluster_id, A, farthest, ml_graph, cl_graph, w
517 ) -> float:
518 sign, logdet = np.linalg.slogdet(A)
519 log_det_a = sign * logdet
521 if log_det_a == np.inf: 521 ↛ 522line 521 didn't jump to line 522, because the condition on line 521 was never true
522 log_det_a = 0
524 term_d: float = self._dist(X[i], cluster_centers[cluster_id], A) - log_det_a
526 term_m: float = 0
527 for j in ml_graph[i]:
528 if labels[j] >= 0 and labels[j] != cluster_id:
529 term_m += 2 * w * self._f_m(X, i, j, A)
531 term_c: float = 0
532 for k in cl_graph[i]:
533 if labels[k] == cluster_id:
534 # assert self._f_c(i, k, A, farthest) >= 0
535 term_c += 2 * w * self._f_c(X, i, k, A, farthest)
537 return term_d + term_m + term_c
539 def _assign_clusters(self, X, y, cluster_centers, A, farthest, ml_graph, cl_graph, w):
540 labels = np.full(X.shape[0], fill_value=-1)
542 index = list(range(X.shape[0]))
543 np.random.shuffle(index)
544 for i in index:
545 labels[i] = np.argmin(
546 [
547 self._objective_fn(X, i, labels, cluster_centers, cluster_id, A, farthest, ml_graph, cl_graph, w)
548 for cluster_id, cluster_center in enumerate(cluster_centers)
549 ]
550 )
552 # Handle empty clusters
553 n_samples_in_cluster = np.bincount(labels, minlength=self.nb_clusters)
554 empty_clusters = np.where(n_samples_in_cluster == 0)[0]
556 for empty_cluster_id in empty_clusters: 556 ↛ 558line 556 didn't jump to line 558, because the loop on line 556 never started
557 # Get clusters that have at least 2 points and can give one of them to another cluster
558 filled_clusters = np.where(n_samples_in_cluster > 1)[0]
560 # Get points from filled_clusters, and compute distance to their center
561 distances_to_clusters: Dict[str, float] = {}
563 for cluster_id in filled_clusters:
564 available_cluster_points = np.where(labels == cluster_id)[0]
565 for point in available_cluster_points:
566 distances_to_clusters[point] = self._dist(X[point], cluster_centers[cluster_id], A)
568 # Fill empty clusters with the farthest points regarding their respective centers
569 filling_point: float = max(distances_to_clusters, key=distances_to_clusters.get)
570 labels[filling_point] = empty_cluster_id
572 n_samples_in_cluster = np.bincount(labels, minlength=10)
574 empty_clusters = np.where(n_samples_in_cluster == 0)[0]
576 if empty_clusters.size: 576 ↛ 578line 576 didn't jump to line 578, because the condition on line 576 was never true
577 # print("Empty clusters")
578 raise ValueError("Empty Clusters Exception")
579 return labels
581 def _get_cluster_centers(self, X, labels):
582 return np.array([X[labels == i].mean(axis=0) for i in range(self.nb_clusters)])
584 # ==============================================================================
585 # UPDATE METRICS
586 # ==============================================================================
588 def _update_metrics(self, X, labels, cluster_centers, farthest, ml_graph, cl_graph, w):
589 N, D = X.shape
590 A = np.zeros((D, D))
592 for d in range(D):
593 term_x = np.sum([(x[d] - cluster_centers[labels[i], d]) ** 2 for i, x in enumerate(X)])
595 term_m = 0
596 for i in range(N):
597 for j in ml_graph[i]:
598 if labels[i] != labels[j]: 598 ↛ 599line 598 didn't jump to line 599, because the condition on line 598 was never true
599 term_m += 1 / 2 * w * (X[i, d] - X[j, d]) ** 2
601 term_c = 0
602 for k in range(N):
603 for m in cl_graph[k]:
604 if labels[k] == labels[m]: 604 ↛ 605line 604 didn't jump to line 605, because the condition on line 604 was never true
605 tmp = (X[farthest[0], d] - X[farthest[1], d]) ** 2 - (X[k, d] - X[m, d]) ** 2
606 term_c += w * max(tmp, 0)
608 # print('term_x', term_x, 'term_m', term_m, 'term_c', term_c)
610 A[d, d] = N / max(term_x + term_m + term_c, 1e-9)
612 return A