Coverage for src\cognitivefactory\interactive_clustering\clustering\spectral.py: 100.00%

51 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

1# -*- coding: utf-8 -*- 

2 

3""" 

4* Name: cognitivefactory.interactive_clustering.clustering.spectral 

5* Description: Implementation of constrained spectral clustering algorithms. 

6* Author: Erwan SCHILD 

7* Created: 17/03/2021 

8* Licence: CeCILL-C License v1.0 (https://cecill.info/licences.fr.html) 

9""" 

10 

11# ============================================================================== 

12# IMPORTS PYTHON DEPENDENCIES 

13# ============================================================================== 

14 

from typing import Dict, List, Optional

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_kernels

from cognitivefactory.interactive_clustering.clustering.abstract import (
    AbstractConstrainedClustering,
    rename_clusters_by_order,
)
from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager

26 

27 

28# ============================================================================== 

29# SPECTRAL CONSTRAINED CLUSTERING 

30# ============================================================================== 

class SpectralConstrainedClustering(AbstractConstrainedClustering):
    """
    This class implements the spectral constrained clustering.
    It inherits from `AbstractConstrainedClustering`.

    References:
        - Spectral Clustering: `Ng, A. Y., M. I. Jordan, et Y.Weiss (2002). On Spectral Clustering: Analysis and an algorithm. In T. G. Dietterich, S. Becker, et Z. Ghahramani (Eds.), Advances in Neural Information Processing Systems 14. MIT Press.`
        - Constrained _'SPEC'_ Spectral Clustering: `Kamvar, S. D., D. Klein, et C. D. Manning (2003). Spectral Learning. Proceedings of the international joint conference on artificial intelligence, 561–566.`

    Example:
        ```python
        # Import.
        from scipy.sparse import csr_matrix
        from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager
        from cognitivefactory.interactive_clustering.clustering.spectral import SpectralConstrainedClustering

        # Create an instance of spectral clustering.
        clustering_model = SpectralConstrainedClustering(
            model="SPEC",
            random_seed=1,
        )

        # Define vectors.
        # NB : use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts.
        vectors = {
            "0": csr_matrix([1.00, 0.00, 0.00, 0.00]),
            "1": csr_matrix([0.95, 0.02, 0.02, 0.01]),
            "2": csr_matrix([0.98, 0.00, 0.02, 0.00]),
            "3": csr_matrix([0.99, 0.00, 0.01, 0.00]),
            "4": csr_matrix([0.60, 0.17, 0.16, 0.07]),
            "5": csr_matrix([0.60, 0.16, 0.17, 0.07]),
            "6": csr_matrix([0.01, 0.01, 0.01, 0.97]),
            "7": csr_matrix([0.00, 0.01, 0.00, 0.99]),
            "8": csr_matrix([0.00, 0.00, 0.00, 1.00]),
        }

        # Define constraints manager.
        constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys()))
        constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="3", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="7", data_ID2="8", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="7", constraint_type="CANNOT_LINK")

        # Run clustering.
        dict_of_predicted_clusters = clustering_model.cluster(
            constraints_manager=constraints_manager,
            vectors=vectors,
            nb_clusters=3,
        )

        # Print results.
        print("Expected results", ";", {"0": 0, "1": 0, "2": 0, "3": 0, "4": 1, "5": 1, "6": 2, "7": 2, "8": 2,})
        print("Computed results", ":", dict_of_predicted_clusters)
        ```
    """

    # ==============================================================================
    # INITIALIZATION
    # ==============================================================================
    def __init__(
        self, model: str = "SPEC", nb_components: Optional[int] = None, random_seed: Optional[int] = None, **kargs
    ) -> None:
        """
        The constructor for Spectral Constrained Clustering class.

        Args:
            model (str, optional): The spectral clustering model to use. Available spectral models are `"SPEC"` and `"CCSR"`. Defaults to `"SPEC"`.
            nb_components (Optional[int], optional): The number of eigenvectors to compute in the spectral clustering. If `None`, set the number of components to the number of clusters. Defaults to `None`.
            random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
            **kargs (dict): Other parameters that can be used in the instantiation.

        Raises:
            ValueError: if some parameters are incorrectly set.
        """

        # Store `self.model`.
        # NOTE: only `"SPEC"` is accepted for now ; `"CCSR"` is disabled until its implementation is corrected (see dead code at the bottom of this class).
        if model != "SPEC":  # TODO use `not in {"SPEC"}`. # TODO `"CCSR"` to add after correction.
            raise ValueError("The `model` '" + str(model) + "' is not implemented.")
        self.model: str = model

        # Store `self.nb_components`.
        if (nb_components is not None) and (nb_components < 2):
            raise ValueError(
                "The `nb_components` '" + str(nb_components) + "' must be `None` or greater than or equal to 2."
            )
        self.nb_components: Optional[int] = nb_components

        # Store `self.random_seed`.
        self.random_seed: Optional[int] = random_seed

        # Store `self.kargs` : extra parameters forwarded to the underlying `sklearn` spectral clustering model.
        self.kargs = kargs

        # Initialize `self.dict_of_predicted_clusters` (filled by the `.cluster()` method).
        self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None

    # ==============================================================================
    # MAIN - CLUSTER DATA
    # ==============================================================================
    def cluster(
        self,
        constraints_manager: AbstractConstraintsManager,
        vectors: Dict[str, csr_matrix],
        nb_clusters: Optional[int],
        verbose: bool = False,
        **kargs,
    ) -> Dict[str, int]:
        """
        The main method used to cluster data with the Spectral model.

        Args:
            constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
            vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represents the data IDs. This keys have to refer to the list of data IDs managed by the `constraints_manager`. The value of the dictionary represent the vector of each data.
            nb_clusters (Optional[int]): The number of clusters to compute.
            verbose (bool, optional): Enable verbose output. Defaults to `False`.
            **kargs (dict): Other parameters that can be used in the clustering.

        Raises:
            ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        """

        ###
        ### GET PARAMETERS
        ###

        # Store `self.constraints_manager` and `self.list_of_data_IDs`.
        if not isinstance(constraints_manager, AbstractConstraintsManager):
            raise ValueError("The `constraints_manager` parameter has to be a `AbstractConstraintsManager` type.")
        self.constraints_manager: AbstractConstraintsManager = constraints_manager
        self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()

        # Store `self.vectors`.
        if not isinstance(vectors, dict):
            raise ValueError("The `vectors` parameter has to be a `dict` type.")
        # Check that every managed data ID has a vector : otherwise the `vstack` below would fail with an
        # undocumented `KeyError` instead of the documented `ValueError`.
        list_of_missing_data_IDs: List[str] = [
            data_ID for data_ID in self.constraints_manager.get_list_of_managed_data_IDs() if data_ID not in vectors
        ]
        if list_of_missing_data_IDs:
            raise ValueError(
                "The `vectors` parameter is incompatible with the `constraints_manager` : missing data IDs "
                + str(list_of_missing_data_IDs)
                + "."
            )
        self.vectors: Dict[str, csr_matrix] = vectors

        # Store `self.nb_clusters`.
        if (nb_clusters is None) or (nb_clusters < 2):
            raise ValueError("The `nb_clusters` '" + str(nb_clusters) + "' must be greater than or equal to 2.")
        # Cap the requested number of clusters by the number of available data points.
        self.nb_clusters: int = min(nb_clusters, len(self.list_of_data_IDs))

        # Define `self.current_nb_components` : use `self.nb_components` only when it exceeds the number of
        # clusters, because spectral clustering needs at least one eigenvector per cluster.
        self.current_nb_components: int = (
            self.nb_components
            if ((self.nb_components is not None) and (self.nb_clusters < self.nb_components))
            else self.nb_clusters
        )

        # Compute `self.pairwise_similarity_matrix`.
        # NOTE: `pairwise_kernels` returns a dense `numpy.ndarray`, not a sparse matrix.
        self.pairwise_similarity_matrix: np.ndarray = pairwise_kernels(
            X=vstack(self.vectors[data_ID] for data_ID in self.constraints_manager.get_list_of_managed_data_IDs()),
            metric="rbf",  # TODO get different pairwise_distances config in **kargs
        )

        ###
        ### RUN SPECTRAL CONSTRAINED CLUSTERING
        ###

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters = None

        # Case of `"CCSR"` spectral clustering.
        # TODO Don't work.
        ##if self.model == "CCSR":
        ##    self.dict_of_predicted_clusters = self.clustering_spectral_model_CCSR(verbose=verbose)

        # Case of `"SPEC"` spectral clustering.
        ##### DEFAULTS : if self.model=="SPEC":
        self.dict_of_predicted_clusters = self.clustering_spectral_model_SPEC(verbose=verbose)

        ###
        ### RETURN PREDICTED CLUSTERS
        ###

        return self.dict_of_predicted_clusters

    # ==============================================================================
    # IMPLEMENTATION - SPEC SPECTRAL CLUSTERING
    # ==============================================================================
    def clustering_spectral_model_SPEC(
        self,
        verbose: bool = False,
    ) -> Dict[str, int]:
        """
        Implementation of a simple Spectral clustering algorithm, based on affinity matrix modifications.

        References :
            - Constrained _'SPEC'_ Spectral Clustering: `Kamvar, S. D., D. Klein, et C. D. Manning (2003). Spectral Learning. Proceedings of the international joint conference on artificial intelligence, 561–566.`

        Args:
            verbose (bool, optional): Enable verbose output. Default is `False`.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        """

        ###
        ### MODIFY CONSTRAINTS MATRIX WITH CONSTRAINTS
        ###

        # Modify the similarity over data IDs : force maximal similarity for `"MUST_LINK"` pairs and
        # minimal similarity for `"CANNOT_LINK"` pairs, so the spectral embedding respects the constraints.
        for ID1, data_ID1 in enumerate(self.list_of_data_IDs):
            for ID2, data_ID2 in enumerate(self.list_of_data_IDs):
                # Symetry is already handled in next instructions : only visit each unordered pair once.
                if ID1 > ID2:
                    continue

                # For each `"MUST_LINK"` constraint, set the similarity to 1.0.
                if (
                    self.constraints_manager.get_inferred_constraint(
                        data_ID1=data_ID1,
                        data_ID2=data_ID2,
                    )
                    == "MUST_LINK"
                ):
                    self.pairwise_similarity_matrix[ID1, ID2] = 1.0
                    self.pairwise_similarity_matrix[ID2, ID1] = 1.0

                # For each `"CANNOT_LINK"` constraint, set the similarity to 0.0.
                elif (
                    self.constraints_manager.get_inferred_constraint(
                        data_ID1=data_ID1,
                        data_ID2=data_ID2,
                    )
                    == "CANNOT_LINK"
                ):
                    self.pairwise_similarity_matrix[ID1, ID2] = 0.0
                    self.pairwise_similarity_matrix[ID2, ID1] = 0.0

        ###
        ### RUN SPECTRAL CONSTRAINED CLUSTERING
        ### | Define laplacian matrix
        ### | Compute eigen vectors
        ### | Cluster eigen vectors
        ### | Return labels based on eigen vectors clustering
        ###

        # Initialize spectral clustering model on the precomputed (constraint-adjusted) affinity matrix.
        self.clustering_model = SpectralClustering(
            n_clusters=self.nb_clusters,
            # n_components=self.current_nb_components,  # TODO Add if `scikit-learn>=0.24.1`
            affinity="precomputed",
            random_state=self.random_seed,
            **self.kargs,
        )

        # Run spectral clustering model.
        self.clustering_model.fit_predict(X=self.pairwise_similarity_matrix)

        # Get prediction of spectral clustering model.
        list_of_clusters: List[int] = self.clustering_model.labels_.tolist()

        # Define the dictionary of predicted clusters.
        predicted_clusters: Dict[str, int] = {
            data_ID: list_of_clusters[ID] for ID, data_ID in enumerate(self.list_of_data_IDs)
        }

        # Rename cluster IDs by order.
        predicted_clusters = rename_clusters_by_order(clusters=predicted_clusters)

        # Return predicted clusters
        return predicted_clusters

    # ==============================================================================
    # IMPLEMENTATION - CCSR SPECTRAL CLUSTERING
    # ==============================================================================
    # NOTE(review): the following is dead code kept as a bare string literal (evaluated and discarded at
    # class creation). It is preserved verbatim until the implementation is fixed (see TODO below).
    """ #TODO : do not work. check if 1) wrong implementation ? 2) cvxopt better ?
    def clustering_spectral_model_CCSR(
        self,
        verbose: bool = False,
    ) -> Dict[str, int]:
        \"""
        Implementation of Constrained Clustering with Spectral Regularization algorithm, based on spectral semidefinite programming interpretation.
        - Source : `Li, Z., Liu, J., & Tang, X. (2009). Constrained clustering via spectral regularization. 2009 IEEE Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/cvpr.2009.5206852`
        - MATLAB Implementation : https://icube-forge.unistra.fr/lampert/TSCC/-/tree/master/methods/Li09
        - SOLVERS comparison : https://pypi.org/project/PICOS/
        - CVXPY solver : https://www.cvxpy.org/index.html

        Args:
            verbose (bool, optional): Enable verbose output. Defaults to `False`.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        \"""

        ###
        ### COMPUTE NORMALIZED LAPLACIAN
        ###

        # Compute the normalized Laplacian.
        normalized_laplacian, diagonal = csgraph_laplacian(
            csgraph=self.pairwise_similarity_matrix,
            normed=True,
            return_diag=True
        )

        ###
        ### COMPUTE EIGENVECTORS OF LAPLACIAN
        ###

        random.seed(self.random_seed)
        v0 = np.random.rand(min(normalized_laplacian.shape))

        # Compute the largest eigen vectors.
        eigen_values, eigen_vectors = eigsh(
            A=normalized_laplacian,
            which="SM",
            k=self.current_nb_components + 1,
            v0=v0,
        )

        # Ignore eigenvector of eigenvalue 0.
        eigen_values = eigen_values[1:]
        eigen_vectors = eigen_vectors.T[1:]

        if verbose:  # pragma: no cover
            print("EIGENVALUES / EIGENVECTORS")

            for k in range(len(eigen_values)):
                print("  ", "=============")
                print("  ", "ID  : ", k)
                print("  ", "VAL : ", eigen_values[k])
                print("  ", "VEC : ", eigen_vectors[k])

        ###
        ### FORMALIZE SEMIDEFINITE PROBLEM
        ###

        ## Problem ::
        ## Cost function to minimize : L(z) = 1/2 * z.T * B * z + b.T * z + c
            ## z : variable to find with the SDP problem
                ## z = vec(M)
                ## M >> 0 (semi definite positive), i.e. M = M.T, M.shape=(nb_components, nb_components)
            ## B = sum ( s_ij s_ij.T )
                # for ij in MUST_LINK or i,j in CANNOT_LINK
            ## b = -2 * sum ( t_ij * s_ij )
                # for ij in MUST_LINK or i,j in CANNOT_LINK
            ## c = sum ( t_ij^2 )
                # for ij in MUST_LINK or i,j in CANNOT_LINK
            ## s_ij = vec( eigen_vector_i.T * eigen_vector_j )
            ## t_ij = 1 if MUST_LINK(i,j), 0 if CANNOT_LINK(i,j)

        # Initialization of SDP variables.
        B = np.zeros((self.current_nb_components ** 2, self.current_nb_components ** 2))
        b = np.zeros((self.current_nb_components ** 2, 1))
        c = 0

        for ID1, data_ID1 in enumerate(self.list_of_data_IDs):
            for ID2, data_ID2 in enumerate(self.list_of_data_IDs):

                # Get eigenvectors.
                eigen_vector_i = np.atleast_2d(eigen_vectors.T[ID1])
                eigen_vector_j = np.atleast_2d(eigen_vectors.T[ID2])

                # Compute eigenvectors similarity.
                U = eigen_vector_j.T @ eigen_vector_i
                s = np.atleast_2d(U.ravel())


                # For each `"MUST_LINK"` constraint, ....
                if self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID1,
                    data_ID2=data_ID2,
                ) == "MUST_LINK":

                    # Add the value to SDP variables.
                    B += s.T * s
                    b += - 1 * s.T
                    c += 1 * 1

                # For each `"CANNOT_LINK"` constraint, ....
                if self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID1,
                    data_ID2=data_ID2,
                ) == "CANNOT_LINK":

                    # Add the value to SDP variables.
                    B += s.T * s
                    b += - 0 * s.T
                    c += 0 * 0

        ###
        ### SOLVE SEMIDEFINITE PROBLEM
        ###

        # Create a symetric matrix variable.
        M = cp.Variable((self.current_nb_components, self.current_nb_components))

        ### Define constraints.
        SDP_constraints = []

        # The solution must be positive semidefinite.
        SDP_constraints += [M >> 0]

        # Define cost function to minimize : `( 1/2 * z.T * B * z + b.T * z + c )`.
        self.SDP_problem = cp.Problem(
            cp.Minimize(
                cp.quad_form(cp.atoms.affine.vec.vec(M), B)  # `1/2 * z.T * B * z`.
                + b.T @ cp.atoms.affine.vec.vec(M)  # `b.T * z`.
                + c  # c
            ),
            SDP_constraints,
        )

        # Solve the SDP problem.
        self.SDP_problem.solve(solver="MOSEK")
        if verbose:  # pragma: no cover
            print("SEMIDEFINITE PROBLEM")
            print("  ", "STATUS", ":", self.SDP_problem.status)
            print("  ", "COST FUNCTION VALUE", ":", self.SDP_problem.value)

        ###
        ### CLUSTER EIGEN VECTORS
        ###

        # Define square root of M, and force sqrtM to be symetric.
        sqrtM = sqrtm(M.value).real
        sqrtM = (sqrtM + sqrtM.T) / 2

        # Compute new embeddings for spectral clustering.
        new_vectors_to_clusters = eigen_vectors.T @ sqrtM

        # Initialize kmeans klustering model.
        self.clustering_model = KMeans(
            n_clusters=self.nb_clusters,
            max_iter=10000,
            random_state=self.random_seed,
        )

        # Run kmeans clustering model.
        self.clustering_model.fit_predict(
            X=new_vectors_to_clusters
        )

        # Get prediction of kmeans clustering model.
        list_of_clusters: List[int] = self.clustering_model.labels_.tolist()

        # Define the dictionary of predicted clusters.
        predicted_clusters: Dict[str, int] = {
            data_ID: list_of_clusters[ID]
            for ID, data_ID in enumerate(self.list_of_data_IDs)
        }

        ###
        ### RENAME CLUSTERS BY ORDER
        ###

        # Define a map to be able to rename cluster IDs.
        mapping_of_old_ID_to_new_ID: Dict[int, int] = {}
        new_ID: int = 0
        for data_ID in self.list_of_data_IDs:
            if predicted_clusters[data_ID] not in mapping_of_old_ID_to_new_ID.keys():
                mapping_of_old_ID_to_new_ID[predicted_clusters[data_ID]] = new_ID
                new_ID += 1

        # Rename cluster IDs.
        predicted_clusters = {
            data_ID: mapping_of_old_ID_to_new_ID[predicted_clusters[data_ID]]
            for data_ID in self.list_of_data_IDs
        }

        # Return predicted clusters
        return predicted_clusters
    """