Coverage for src\cognitivefactory\interactive_clustering\clustering\spectral.py: 100.00%
51 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
1# -*- coding: utf-8 -*-
3"""
4* Name: cognitivefactory.interactive_clustering.clustering.spectral
5* Description: Implementation of constrained spectral clustering algorithms.
6* Author: Erwan SCHILD
7* Created: 17/03/2021
8* Licence: CeCILL-C License v1.0 (https://cecill.info/licences.fr.html)
9"""
11# ==============================================================================
12# IMPORTS PYTHON DEPENDENCIES
13# ==============================================================================
15from typing import Dict, List, Optional
17from scipy.sparse import csr_matrix, vstack
18from sklearn.cluster import SpectralClustering
19from sklearn.metrics import pairwise_kernels
21from cognitivefactory.interactive_clustering.clustering.abstract import (
22 AbstractConstrainedClustering,
23 rename_clusters_by_order,
24)
25from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager
28# ==============================================================================
29# SPECTRAL CONSTRAINED CLUSTERING
30# ==============================================================================
class SpectralConstrainedClustering(AbstractConstrainedClustering):
    """
    This class implements the spectral constrained clustering.
    It inherits from `AbstractConstrainedClustering`.

    References:
        - Spectral Clustering: `Ng, A. Y., M. I. Jordan, et Y.Weiss (2002). On Spectral Clustering: Analysis and an algorithm. In T. G. Dietterich, S. Becker, et Z. Ghahramani (Eds.), Advances in Neural Information Processing Systems 14. MIT Press.`
        - Constrained _'SPEC'_ Spectral Clustering: `Kamvar, S. D., D. Klein, et C. D. Manning (2003). Spectral Learning. Proceedings of the international joint conference on artificial intelligence, 561–566.`

    Example:
        ```python
        # Import.
        from scipy.sparse import csr_matrix
        from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager
        from cognitivefactory.interactive_clustering.clustering.spectral import SpectralConstrainedClustering

        # Create an instance of spectral clustering.
        clustering_model = SpectralConstrainedClustering(
            model="SPEC",
            random_seed=1,
        )

        # Define vectors.
        # NB : use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts.
        vectors = {
            "0": csr_matrix([1.00, 0.00, 0.00, 0.00]),
            "1": csr_matrix([0.95, 0.02, 0.02, 0.01]),
            "2": csr_matrix([0.98, 0.00, 0.02, 0.00]),
            "3": csr_matrix([0.99, 0.00, 0.01, 0.00]),
            "4": csr_matrix([0.60, 0.17, 0.16, 0.07]),
            "5": csr_matrix([0.60, 0.16, 0.17, 0.07]),
            "6": csr_matrix([0.01, 0.01, 0.01, 0.97]),
            "7": csr_matrix([0.00, 0.01, 0.00, 0.99]),
            "8": csr_matrix([0.00, 0.00, 0.00, 1.00]),
        }

        # Define constraints manager.
        constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys()))
        constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="3", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="7", data_ID2="8", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="7", constraint_type="CANNOT_LINK")

        # Run clustering.
        dict_of_predicted_clusters = clustering_model.cluster(
            constraints_manager=constraints_manager,
            vectors=vectors,
            nb_clusters=3,
        )

        # Print results.
        print("Expected results", ";", {"0": 0, "1": 0, "2": 0, "3": 0, "4": 1, "5": 1, "6": 2, "7": 2, "8": 2,})
        print("Computed results", ":", dict_of_predicted_clusters)
        ```
    """

    # ==============================================================================
    # INITIALIZATION
    # ==============================================================================
93 def __init__(
94 self, model: str = "SPEC", nb_components: Optional[int] = None, random_seed: Optional[int] = None, **kargs
95 ) -> None:
96 """
97 The constructor for Spectral Constrainted Clustering class.
99 Args:
100 model (str, optional): The spectral clustering model to use. Available spectral models are `"SPEC"` and `"CCSR"`. Defaults to `"SPEC"`.
101 nb_components (Optional[int], optional): The number of eigenvectors to compute in the spectral clustering. If `None`, set the number of components to the number of clusters. Defaults to `None`.
102 random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
103 **kargs (dict): Other parameters that can be used in the instantiation.
105 Raises:
106 ValueError: if some parameters are incorrectly set.
107 """
109 # Store `self.model`.
110 if model != "SPEC": # TODO use `not in {"SPEC"}`. # TODO `"CCSR"` to add after correction.
111 raise ValueError("The `model` '" + str(model) + "' is not implemented.")
112 self.model: str = model
114 # Store `self.nb_components`.
115 if (nb_components is not None) and (nb_components < 2):
116 raise ValueError(
117 "The `nb_components` '" + str(nb_components) + "' must be `None` or greater than or equal to 2."
118 )
119 self.nb_components: Optional[int] = nb_components
121 # Store `self.random_seed`.
122 self.random_seed: Optional[int] = random_seed
124 # Store `self.kargs` for kmeans clustering.
125 self.kargs = kargs
127 # Initialize `self.dict_of_predicted_clusters`.
128 self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None
130 # ==============================================================================
131 # MAIN - CLUSTER DATA
132 # ==============================================================================
133 def cluster(
134 self,
135 constraints_manager: AbstractConstraintsManager,
136 vectors: Dict[str, csr_matrix],
137 nb_clusters: Optional[int],
138 verbose: bool = False,
139 **kargs,
140 ) -> Dict[str, int]:
141 """
142 The main method used to cluster data with the Spectral model.
144 Args:
145 constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
146 vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represents the data IDs. This keys have to refer to the list of data IDs managed by the `constraints_manager`. The value of the dictionary represent the vector of each data.
147 nb_clusters (Optional[int]): The number of clusters to compute.
148 verbose (bool, optional): Enable verbose output. Defaults to `False`.
149 **kargs (dict): Other parameters that can be used in the clustering.
151 Raises:
152 ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.
154 Returns:
155 Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
156 """
158 ###
159 ### GET PARAMETERS
160 ###
162 # Store `self.constraints_manager` and `self.list_of_data_IDs`.
163 if not isinstance(constraints_manager, AbstractConstraintsManager):
164 raise ValueError("The `constraints_manager` parameter has to be a `AbstractConstraintsManager` type.")
165 self.constraints_manager: AbstractConstraintsManager = constraints_manager
166 self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()
168 # Store `self.vectors`.
169 if not isinstance(vectors, dict):
170 raise ValueError("The `vectors` parameter has to be a `dict` type.")
171 self.vectors: Dict[str, csr_matrix] = vectors
173 # Store `self.nb_clusters`.
174 if (nb_clusters is None) or (nb_clusters < 2):
175 raise ValueError("The `nb_clusters` '" + str(nb_clusters) + "' must be greater than or equal to 2.")
176 self.nb_clusters: int = min(nb_clusters, len(self.list_of_data_IDs))
178 # Define `self.current_nb_components`.
179 self.current_nb_components: int = (
180 self.nb_components
181 if ((self.nb_components is not None) and (self.nb_clusters < self.nb_components))
182 else self.nb_clusters
183 )
185 # Compute `self.pairwise_similarity_matrix`.
186 self.pairwise_similarity_matrix: csr_matrix = pairwise_kernels(
187 X=vstack(self.vectors[data_ID] for data_ID in self.constraints_manager.get_list_of_managed_data_IDs()),
188 metric="rbf", # TODO get different pairwise_distances config in **kargs
189 )
191 ###
192 ### RUN SPECTRAL CONSTRAINED CLUSTERING
193 ###
195 # Initialize `self.dict_of_predicted_clusters`.
196 self.dict_of_predicted_clusters = None
198 # Case of `"CCSR"` spectral clustering.
199 # TODO Don't work.
200 ##if self.model == "CCSR":
201 ## self.dict_of_predicted_clusters = self.clustering_spectral_model_CCSR(verbose=verbose)
203 # Case of `"SPEC"` spectral clustering.
204 ##### DEFAULTS : if self.model=="SPEC":
205 self.dict_of_predicted_clusters = self.clustering_spectral_model_SPEC(verbose=verbose)
207 ###
208 ### RETURN PREDICTED CLUSTERS
209 ###
211 return self.dict_of_predicted_clusters
213 # ==============================================================================
214 # IMPLEMENTATION - SPEC SPECTRAL CLUSTERING
215 # ==============================================================================
216 def clustering_spectral_model_SPEC(
217 self,
218 verbose: bool = False,
219 ) -> Dict[str, int]:
220 """
221 Implementation of a simple Spectral clustering algorithm, based affinity matrix modifications.
223 References :
224 - Constrained _'SPEC'_ Spectral Clustering: `Kamvar, S. D., D. Klein, et C. D. Manning (2003). Spectral Learning. Proceedings of the international joint conference on artificial intelligence, 561–566.`
226 Args:
227 verbose (bool, optional): Enable verbose output. Default is `False`.
229 Returns:
230 Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
231 """
233 ###
234 ### MODIFY CONSTRAINTS MATRIX WITH CONSTRAINTS
235 ###
237 # Modify the similarity over data IDs.
238 for ID1, data_ID1 in enumerate(self.list_of_data_IDs):
239 for ID2, data_ID2 in enumerate(self.list_of_data_IDs):
240 # Symetry is already handled in next instructions.
241 if ID1 > ID2:
242 continue
244 # For each `"MUST_LINK"` constraint, set the similarity to 1.0.
245 if (
246 self.constraints_manager.get_inferred_constraint(
247 data_ID1=data_ID1,
248 data_ID2=data_ID2,
249 )
250 == "MUST_LINK"
251 ):
252 self.pairwise_similarity_matrix[ID1, ID2] = 1.0
253 self.pairwise_similarity_matrix[ID2, ID1] = 1.0
255 # For each `"CANNOT_LINK"` constraint, set the similarity to 0.0.
256 elif (
257 self.constraints_manager.get_inferred_constraint(
258 data_ID1=data_ID1,
259 data_ID2=data_ID2,
260 )
261 == "CANNOT_LINK"
262 ):
263 self.pairwise_similarity_matrix[ID1, ID2] = 0.0
264 self.pairwise_similarity_matrix[ID2, ID1] = 0.0
266 ###
267 ### RUN SPECTRAL CONSTRAINED CLUSTERING
268 ### | Define laplacian matrix
269 ### | Compute eigen vectors
270 ### | Cluster eigen vectors
271 ### | Return labels based on eigen vectors clustering
272 ###
274 # Initialize spectral clustering model.
275 self.clustering_model = SpectralClustering(
276 n_clusters=self.nb_clusters,
277 # n_components=self.current_nb_components, #TODO Add if `scikit-learn>=0.24.1`
278 affinity="precomputed",
279 random_state=self.random_seed,
280 **self.kargs,
281 )
283 # Run spectral clustering model.
284 self.clustering_model.fit_predict(X=self.pairwise_similarity_matrix)
286 # Get prediction of spectral clustering model.
287 list_of_clusters: List[int] = self.clustering_model.labels_.tolist()
289 # Define the dictionary of predicted clusters.
290 predicted_clusters: Dict[str, int] = {
291 data_ID: list_of_clusters[ID] for ID, data_ID in enumerate(self.list_of_data_IDs)
292 }
294 # Rename cluster IDs by order.
295 predicted_clusters = rename_clusters_by_order(clusters=predicted_clusters)
297 # Return predicted clusters
298 return predicted_clusters
    # ==============================================================================
    # IMPLEMENTATION - CCSR SPECTRAL CLUSTERING
    # ==============================================================================
    # NOTE(review): the block below is a bare string literal, not executable code.
    # It holds the disabled CCSR implementation (see the TODO on its first line);
    # it is evaluated and discarded at class creation and has no runtime effect.
    """ #TODO : do not work. check if 1) wrong implementation ? 2) cvxopt better ?
    def clustering_spectral_model_CCSR(
        self,
        verbose: bool = False,
    ) -> Dict[str, int]:
        \"""
        Implementation of Constrained Clustering with Spectral Regularization algorithm, based on spectral semidefinite programming interpretation.
        - Source : `Li, Z., Liu, J., & Tang, X. (2009). Constrained clustering via spectral regularization. 2009 IEEE Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/cvpr.2009.5206852`
        - MATLAB Implementation : https://icube-forge.unistra.fr/lampert/TSCC/-/tree/master/methods/Li09
        - SOLVERS comparison : https://pypi.org/project/PICOS/
        - CVXPY solver : https://www.cvxpy.org/index.html

        Args:
            verbose (bool, optional): Enable verbose output. Defaults to `False`.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        \"""

        ###
        ### COMPUTE NORMALIZED LAPLACIAN
        ###

        # Compute the normalized Laplacian.
        normalized_laplacian, diagonal = csgraph_laplacian(
            csgraph=self.pairwise_similarity_matrix,
            normed=True,
            return_diag=True
        )

        ###
        ### COMPUTE EIGENVECTORS OF LAPLACIAN
        ###

        random.seed(self.random_seed)
        v0 = np.random.rand(min(normalized_laplacian.shape))

        # Compute the largest eigen vectors.
        eigen_values, eigen_vectors = eigsh(
            A=normalized_laplacian,
            which="SM",
            k=self.current_nb_components + 1,
            v0=v0,
        )

        # Ignore eigenvector of eigenvalue 0.
        eigen_values = eigen_values[1:]
        eigen_vectors = eigen_vectors.T[1:]

        if verbose:  # pragma: no cover
            print("EIGENVALUES / EIGENVECTORS")
            for k in range(len(eigen_values)):
                print(" ", "=============")
                print(" ", "ID : ", k)
                print(" ", "VAL : ", eigen_values[k])
                print(" ", "VEC : ", eigen_vectors[k])

        ###
        ### FORMALIZE SEMIDEFINITE PROBLEM
        ###

        ## Problem ::
        ## Cost function to minimize : L(z) = 1/2 * z.T * B * z + b.T * z + c
        ## z : variable to find with the SDP problem
        ## z = vec(M)
        ## M >> 0 (semi definite positive), i.e. M = M.T, M.shape=(nb_components, nb_components)
        ## B = sum ( s_ij s_ij.T )  # for ij in MUST_LINK or i,j in CANNOT_LINK
        ## b = -2 * sum ( t_ij * s_ij )  # for ij in MUST_LINK or i,j in CANNOT_LINK
        ## c = sum ( t_ij^2 )  # for ij in MUST_LINK or i,j in CANNOT_LINK
        ## s_ij = vec( eigen_vector_i.T * eigen_vector_j )
        ## t_ij = 1 if MUST_LINK(i,j), 0 if CANNOT_LINK(i,j)

        # Initialization of SDP variables.
        B = np.zeros((self.current_nb_components ** 2, self.current_nb_components ** 2))
        b = np.zeros((self.current_nb_components ** 2, 1))
        c = 0

        for ID1, data_ID1 in enumerate(self.list_of_data_IDs):
            for ID2, data_ID2 in enumerate(self.list_of_data_IDs):

                # Get eigenvectors.
                eigen_vector_i = np.atleast_2d(eigen_vectors.T[ID1])
                eigen_vector_j = np.atleast_2d(eigen_vectors.T[ID2])

                # Compute eigenvectors similarity.
                U = eigen_vector_j.T @ eigen_vector_i
                s = np.atleast_2d(U.ravel())

                # For each `"MUST_LINK"` constraint, ....
                if self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID1,
                    data_ID2=data_ID2,
                ) == "MUST_LINK":

                    # Add the value to SDP variables.
                    B += s.T * s
                    b += - 1 * s.T
                    c += 1 * 1

                # For each `"CANNOT_LINK"` constraint, ....
                if self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID1,
                    data_ID2=data_ID2,
                ) == "CANNOT_LINK":

                    # Add the value to SDP variables.
                    B += s.T * s
                    b += - 0 * s.T
                    c += 0 * 0

        ###
        ### SOLVE SEMIDEFINITE PROBLEM
        ###

        # Create a symetric matrix variable.
        M = cp.Variable((self.current_nb_components, self.current_nb_components))

        ### Define constraints.
        SDP_constraints = []

        # The solution must be positive semidefinite.
        SDP_constraints += [M >> 0]

        # Define cost function to minimize : `( 1/2 * z.T * B * z + b.T * z + c )`.
        self.SDP_problem = cp.Problem(
            cp.Minimize(
                cp.quad_form(cp.atoms.affine.vec.vec(M), B)  # `1/2 * z.T * B * z`.
                + b.T @ cp.atoms.affine.vec.vec(M)  # `b.T * z`.
                + c  # c
            ),
            SDP_constraints,
        )

        # Solve the SDP problem.
        self.SDP_problem.solve(solver="MOSEK")

        if verbose:  # pragma: no cover
            print("SEMIDEFINITE PROBLEM")
            print(" ", "STATUS", ":", self.SDP_problem.status)
            print(" ", "COST FUNCTION VALUE", ":", self.SDP_problem.value)

        ###
        ### CLUSTER EIGEN VECTORS
        ###

        # Define square root of M, and force sqrtM to be symetric.
        sqrtM = sqrtm(M.value).real
        sqrtM = (sqrtM + sqrtM.T) / 2

        # Compute new embeddings for spectral clustering.
        new_vectors_to_clusters = eigen_vectors.T @ sqrtM

        # Initialize kmeans klustering model.
        self.clustering_model = KMeans(
            n_clusters=self.nb_clusters,
            max_iter=10000,
            random_state=self.random_seed,
        )

        # Run kmeans clustering model.
        self.clustering_model.fit_predict(
            X=new_vectors_to_clusters
        )

        # Get prediction of kmeans clustering model.
        list_of_clusters: List[int] = self.clustering_model.labels_.tolist()

        # Define the dictionary of predicted clusters.
        predicted_clusters: Dict[str, int] = {
            data_ID: list_of_clusters[ID]
            for ID, data_ID in enumerate(self.list_of_data_IDs)
        }

        ###
        ### RENAME CLUSTERS BY ORDER
        ###

        # Define a map to be able to rename cluster IDs.
        mapping_of_old_ID_to_new_ID: Dict[int, int] = {}
        new_ID: int = 0
        for data_ID in self.list_of_data_IDs:
            if predicted_clusters[data_ID] not in mapping_of_old_ID_to_new_ID.keys():
                mapping_of_old_ID_to_new_ID[predicted_clusters[data_ID]] = new_ID
                new_ID += 1

        # Rename cluster IDs.
        predicted_clusters = {
            data_ID: mapping_of_old_ID_to_new_ID[predicted_clusters[data_ID]]
            for data_ID in self.list_of_data_IDs
        }

        # Return predicted clusters
        return predicted_clusters
    """