Coverage for src\cognitivefactory\interactive_clustering\clustering\affinity_propagation.py: 82.03%

149 statements  

coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

"""
* Name: interactive-clustering/src/clustering/affinity_propagation.py
* Description: Implementation of constrained Affinity Propagation clustering algorithm.
* Author: David NICOLAZO, Esther LENOTRE, Marc TRUTT
* Created: 02/03/2022
* Licence: CeCILL (https://cecill.info/licences.fr.html)
"""

# ==============================================================================
# IMPORT PYTHON DEPENDENCIES
# ==============================================================================

import warnings
from itertools import combinations, product
from typing import Dict, List, Optional, Tuple

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_random_state

from cognitivefactory.interactive_clustering.clustering.abstract import (
    AbstractConstrainedClustering,
    rename_clusters_by_order,
)
from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager


# ==============================================================================
# AFFINITY PROPAGATION CONSTRAINED CLUSTERING
# ==============================================================================


class AffinityPropagationConstrainedClustering(AbstractConstrainedClustering):
    """
    This class implements Affinity Propagation constrained clustering.
    It inherits from `AbstractConstrainedClustering`.

    References:
        - Affinity Propagation Clustering: `Frey, B. J., & Dueck, D. (2007). Clustering by Passing Messages Between Data Points. In Science (Vol. 315, Issue 5814, pp. 972–976). American Association for the Advancement of Science (AAAS). https://doi.org/10.1126/science.1136800`
        - Constrained Affinity Propagation Clustering: `Givoni, I., & Frey, B. J. (2009). Semi-Supervised Affinity Propagation with Instance-Level Constraints. Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics, PMLR 5:161-168.`

    Example:
        ```python
        # Import.
        from scipy.sparse import csr_matrix
        from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager
        from cognitivefactory.interactive_clustering.clustering.affinity_propagation import AffinityPropagationConstrainedClustering

        # Create an instance of affinity propagation clustering.
        clustering_model = AffinityPropagationConstrainedClustering(
            random_seed=1,
        )

        # Define vectors.
        # NB: use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts.
        vectors = {
            "0": csr_matrix([1.00, 0.00, 0.00, 0.00]),
            "1": csr_matrix([0.95, 0.02, 0.02, 0.01]),
            "2": csr_matrix([0.98, 0.00, 0.02, 0.00]),
            "3": csr_matrix([0.99, 0.00, 0.01, 0.00]),
            "4": csr_matrix([0.60, 0.17, 0.16, 0.07]),
            "5": csr_matrix([0.60, 0.16, 0.17, 0.07]),
            "6": csr_matrix([0.01, 0.01, 0.01, 0.97]),
            "7": csr_matrix([0.00, 0.01, 0.00, 0.99]),
            "8": csr_matrix([0.00, 0.00, 0.00, 1.00]),
        }

        # Define constraints manager.
        constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys()))
        constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="3", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="7", data_ID2="8", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="7", constraint_type="CANNOT_LINK")

        # Run clustering.
        dict_of_predicted_clusters = clustering_model.cluster(
            constraints_manager=constraints_manager,
            vectors=vectors,
            ####nb_clusters=None,
        )

        # Print results.
        print("Expected results", ":", {"0": 0, "1": 0, "2": 0, "3": 0, "4": 1, "5": 1, "6": 2, "7": 2, "8": 2,})  # TODO:
        print("Computed results", ":", dict_of_predicted_clusters)
        ```

    Warns:
        FutureWarning: `clustering.affinity_propagation.AffinityPropagationConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.
    """


    def __init__(
        self,
        max_iteration: int = 150,
        convergence_iteration: int = 10,
        random_seed: Optional[int] = None,
        absolute_must_links: bool = True,
        **kargs,
    ) -> None:
        """
        The constructor for the Affinity Propagation constrained clustering.

        Args:
            max_iteration (int, optional): The maximum number of iterations for convergence. Defaults to `150`.
            convergence_iteration (int, optional): The number of iterations with no change needed to consider convergence. Defaults to `10`.
            random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
            absolute_must_links (bool, optional): The option to strictly respect `"MUST_LINK"` type constraints. Defaults to `True`.
            **kargs (dict): Other parameters that can be used in the instantiation.

        Warns:
            FutureWarning: `clustering.affinity_propagation.AffinityPropagationConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.

        Raises:
            ValueError: if some parameters are incorrectly set.
        """

        # Development status warning.
        warnings.warn(
            "`clustering.affinity_propagation.AffinityPropagationConstrainedClustering` is still in development and is not fully tested : it is not ready for production use.",
            FutureWarning,  # DeprecationWarning
            stacklevel=2,
        )

        # Store `self.max_iteration`.
        if max_iteration < 1:
            raise ValueError("The `max_iteration` must be greater than or equal to 1.")
        self.max_iteration: int = max_iteration

        # Store `self.convergence_iteration`.
        if convergence_iteration < 1:
            raise ValueError("The `convergence_iteration` must be greater than or equal to 1.")
        self.convergence_iteration: int = convergence_iteration

        # Store `self.absolute_must_links`.
        self.absolute_must_links: bool = absolute_must_links

        # Store `self.random_seed`.
        self.random_seed: Optional[int] = random_seed

        # Store `self.kargs` for affinity propagation clustering.
        self.kargs = kargs

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None


    # ==============================================================================
    # MAIN - CLUSTER DATA
    # ==============================================================================


    def cluster(
        self,
        constraints_manager: AbstractConstraintsManager,
        vectors: Dict[str, csr_matrix],
        nb_clusters: Optional[int] = None,
        verbose: bool = False,
        **kargs,
    ) -> Dict[str, int]:
        """
        The main method used to cluster data with the Affinity Propagation model.

        Args:
            constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
            vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represent the data IDs. These keys have to refer to the list of data IDs managed by the `constraints_manager`. The values of the dictionary represent the vectors of the data.
            nb_clusters (Optional[int]): The number of clusters to compute. Must be `None` here: Affinity Propagation determines the number of clusters itself.
            verbose (bool, optional): Enable verbose output. Defaults to `False`.
            **kargs (dict): Other parameters that can be used in the clustering.

        Raises:
            ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.

        Returns:
            Dict[str, int]: A dictionary that contains the predicted cluster for each data ID.
        """


        ###
        ### GET PARAMETERS
        ###

        # Store `self.constraints_manager` and `self.list_of_data_IDs`.
        if not isinstance(constraints_manager, AbstractConstraintsManager):
            raise ValueError("The `constraints_manager` parameter has to be an `AbstractConstraintsManager` type.")
        self.constraints_manager: AbstractConstraintsManager = constraints_manager
        self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()

        # Store `self.vectors`.
        if not isinstance(vectors, dict):
            raise ValueError("The `vectors` parameter has to be a `dict` type.")
        self.vectors: Dict[str, csr_matrix] = vectors

        # Store `self.nb_clusters`.
        if nb_clusters is not None:
            raise ValueError("The `nb_clusters` should be 'None' for Affinity Propagation clustering.")
        self.nb_clusters: Optional[int] = None

        ###
        ### RUN AFFINITY PROPAGATION CONSTRAINED CLUSTERING
        ###

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters = None

        # Map each data ID to its index.
        data_ID_to_idx: Dict[str, int] = {v: i for i, v in enumerate(self.list_of_data_IDs)}
        n_sample: int = len(self.list_of_data_IDs)

        # Compute similarity between data points.
        S: np.ndarray = -pairwise_distances(vstack(self.vectors[data_ID] for data_ID in self.list_of_data_IDs))
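        # NB: affinity propagation works on similarities, so the pairwise (euclidean)
        # distances are negated; the "preference" (self-similarity) is set later from
        # the median similarity, the usual default for this algorithm.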


        # Get connected components (closures of MUST_LINK constraints).
        must_link_closures: List[List[str]] = self.constraints_manager.get_connected_components()
        must_links: List[List[int]] = [[data_ID_to_idx[ID] for ID in closure] for closure in must_link_closures]
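        # e.g. with MUST_LINK("0", "1") and MUST_LINK("2", "3") over five points, the closures
        # [["0", "1"], ["2", "3"], ["4"]] would become the index groups [[0, 1], [2, 3], [4]].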


        # Get annotated CANNOT_LINK constraints.
        cannot_links: List[Tuple[int, int]] = []
        for data_ID_i1, data_ID_j1 in combinations(range(n_sample), 2):
            constraint = self.constraints_manager.get_added_constraint(
                self.list_of_data_IDs[data_ID_i1], self.list_of_data_IDs[data_ID_j1]
            )
            if constraint and constraint[0] == "CANNOT_LINK":
                cannot_links.append((data_ID_i1, data_ID_j1))


        # Run constrained affinity propagation.
        cluster_labels: List[int] = _affinity_propagation_constrained(
            S,
            must_links=must_links,
            cannot_links=cannot_links,
            absolute_must_links=self.absolute_must_links,
            max_iteration=self.max_iteration,
            convergence_iteration=self.convergence_iteration,
            random_seed=self.random_seed,
            verbose=verbose,
        )

        # Rename cluster IDs by order.
        self.dict_of_predicted_clusters = rename_clusters_by_order(
            {self.list_of_data_IDs[i]: label for i, label in enumerate(cluster_labels)}
        )

        return self.dict_of_predicted_clusters



# ==============================================================================
# AFFINITY PROPAGATION FROM SKLEARN UNDER CONSTRAINTS
# ==============================================================================


def _equal_similarities_and_preferences(S, preference) -> bool:
    def _all_equal_preferences() -> bool:  # noqa: WPS430 (nested function)
        return bool(np.all(preference == preference.flat[0]))

    def _all_equal_similarities() -> bool:  # noqa: WPS430 (nested function)
        # Create mask to ignore diagonal of S.
        mask = np.ones(S.shape, dtype=bool)
        np.fill_diagonal(mask, 0)

        return bool(np.all(S[mask].flat == S[mask].flat[0]))

    return _all_equal_preferences() and _all_equal_similarities()
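# NB: when every preference and every off-diagonal similarity is identical, message passing
# cannot discriminate exemplars, so `_affinity_propagation_constrained` short-circuits that case.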



def _affinity_propagation_constrained(
    S: np.ndarray,
    must_links: List[List[int]],
    cannot_links: List[Tuple[int, int]],
    absolute_must_links: bool = True,
    max_iteration: int = 150,
    convergence_iteration: int = 10,
    damping: float = 0.5,
    remove_degeneracies: bool = True,
    random_seed: Optional[int] = None,
    verbose: bool = False,
):
    """
    Perform Affinity Propagation Clustering of data.

    Forked from https://github.com/scikit-learn/scikit-learn/blob/98cf537f5c538fdbc9d27b851cf03ce7611b8a48/sklearn/cluster/_affinity_propagation.py
    Modified by David NICOLAZO <git@dabsunter.fr> according to https://proceedings.mlr.press/v5/givoni09a.html

    Args:
        S (np.ndarray): Matrix of similarities between points.
        must_links (List[List[int]]): The list of MUST_LINK closures, i.e. the lists of data indices linked by MUST_LINK constraints.
        cannot_links (List[Tuple[int, int]]): The list of pairs of data indices linked by CANNOT_LINK constraints.
        absolute_must_links (bool, optional): The option to use the absolute must links implementation. Defaults to `True`.
        max_iteration (int, optional): The maximum number of iterations for convergence. Defaults to `150`.
        convergence_iteration (int, optional): The number of iterations with no change needed to consider convergence. Defaults to `10`.
        damping (float, optional): Damping factor between 0.5 and 1. Defaults to `0.5`.
        remove_degeneracies (bool, optional): The option to remove degeneracies in the similarity matrix. Defaults to `True`.
        random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
        verbose (bool, optional): Enable verbose output. Defaults to `False`.

    Returns:
        List[int]: The list of data cluster labels.
    """


    n_samples: int = S.shape[0]
    n_must_links: int = len(must_links)
    n_similarities: int = n_samples + n_must_links

    # Define the preference by the median of the similarity matrix.
    preference = np.array(np.median(S))

    if n_samples == 1 or _equal_similarities_and_preferences(S, preference):  # coverage: never true in tests
        # It makes no sense to run the algorithm in this case, so return 1 or n_samples clusters, depending on preferences.
        # TODO: warnings.warn("All samples have mutually equal similarities. Returning arbitrary cluster center(s).")
        if preference.flat[0] >= S.flat[n_samples - 1]:
            return np.arange(n_samples)
        return [0 for _ in range(n_samples)]


    # Fix random seed.
    random_state = check_random_state(random_seed)

    # Remove degeneracies.
    if remove_degeneracies:  # coverage: always true in tests
        S += (np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) * random_state.randn(n_samples, n_samples)
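        # NB: the noise is scaled by the float epsilon, so it only breaks exact ties between
        # similarities; given the same seed, this tie-breaking is reproducible.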


    # Use only meta-points to force data from MUST_LINK closures to be in the same cluster.
    if absolute_must_links:  # coverage: always true in tests
        Saml = np.zeros((n_must_links, n_must_links))

        for data_ID_i1, data_ID_j1 in product(range(n_must_links), range(n_must_links)):
            Saml[data_ID_i1, data_ID_j1] = max(
                S[k, l] for k, l in product(must_links[data_ID_i1], must_links[data_ID_j1])
            )

        S = Saml

        n_similarities = n_must_links

        preference = np.array(np.median(S))
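
        # Each MUST_LINK closure is thus collapsed into a single meta-point: the similarity
        # between two meta-points is the best (maximum) similarity between their members, so
        # all members of a closure necessarily receive the same cluster.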


    # Use data and meta-points.
    else:
        MPS = np.array(
            [
                [0 if data_ID_i2 in Pm else max(S[data_ID_i2, data_ID_j2] for data_ID_j2 in Pm) for Pm in must_links]
                for data_ID_i2 in range(n_samples)
            ]
        )

        # Update similarity matrix.
        S = np.block([[S, MPS], [MPS.T, 2 * np.min(S) * np.ones((n_must_links, n_must_links))]])
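
        # The augmented matrix keeps the original points and appends one meta-point per
        # MUST_LINK closure; meta-point/meta-point similarities are set to a low value
        # (2 * np.min(S)) to discourage meta-points from being clustered together.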


    # Place preference on the diagonal of S.
    S.flat[:: (n_similarities + 1)] = preference  # noqa: WPS362 (subscript slice)
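
    # NB: the diagonal self-similarity (the "preference") controls how readily a point
    # becomes an exemplar; the median of S is the usual default and tends to yield a
    # moderate number of clusters.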


    if absolute_must_links:  # coverage: always true in tests
        Sp = S

    A = np.zeros((n_similarities, n_similarities))
    R = np.zeros((n_similarities, n_similarities))  # Initialize messages.
    if not absolute_must_links:  # coverage: never true in tests
        # E-I: CL constraints.
        Q1 = np.zeros((n_similarities, n_similarities, len(cannot_links)))  # qj (m, mn) [m,j,n]
        Q2 = np.zeros((n_similarities, n_similarities, len(cannot_links)))  # qj (mn, m) [m,j,n]
        Sp = np.zeros((n_similarities, n_similarities))  # Ŝ

    # Intermediate results.
    tmp = np.zeros((n_similarities, n_similarities))

    # Execute parallel affinity propagation updates.
    e = np.zeros((n_similarities, convergence_iteration))

    ind = np.arange(n_similarities)


    for it in range(max_iteration):  # coverage: always exited via break in tests
        if not absolute_must_links:  # TODO: cannot-link is not supported in absolute_must_links mode; coverage: never true in tests
            # TODO: Verify the cannot-link implementation, as its effect is almost nonexistent.

            # Sp = S + np.sum(Q2, axis=2)
            # np.sum(Q2, axis=2, out=Sp)
            # np.add(S, Sp, Sp)
            Sp[:] = S  # noqa: WPS362 (subscript slice)
            for m in range(n_samples, n_similarities):
                Sp[m, :] += sum(Q2[m, :, n] for n, CLm in enumerate(cannot_links) if m in CLm)

            # Q1 = A + R - Q2
            Q1new = A[:, :, None] + R[:, :, None] - Q2

            # Q2 = - max(0, Q1)
            Q2 = -np.maximum(0, Q1)
            Q1 = Q1new


        # tmp = A + S; compute responsibilities.
        np.add(A, Sp, tmp)
        I = np.argmax(tmp, axis=1)  # noqa: E741 (ambiguous variable)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)

        # tmp = Rnew
        np.subtract(Sp, Y[:, None], tmp)
        tmp[ind, I] = Sp[ind, I] - Y2

        # Damping.
        tmp *= 1 - damping
        R *= damping
        R += tmp
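
        # This is the standard responsibility update of Frey & Dueck (2007),
        #     r(i, k) <- s(i, k) - max_{k' != k} (a(i, k') + s(i, k')),
        # blended with the previous R through the damping factor.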


        # tmp = Rp; compute availabilities.
        np.maximum(R, 0, tmp)
        tmp.flat[:: n_similarities + 1] = R.flat[:: n_similarities + 1]  # noqa: WPS362 (subscript slice)

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[:: n_similarities + 1] = dA  # noqa: WPS362 (subscript slice)

        # Damping.
        tmp *= 1 - damping
        A *= damping
        A -= tmp
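
        # Likewise, this is the standard availability update,
        #     a(i, k) <- min(0, r(k, k) + sum_{i' not in {i, k}} max(0, r(i', k))),
        # with the self-availability a(k, k) accumulating sum_{i' != k} max(0, r(i', k)).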


        # Check for convergence.
        E = (np.diag(A) + np.diag(R)) > 0
        e[:, it % convergence_iteration] = E
        K = np.sum(E, axis=0)

        if it >= convergence_iteration:
            se = np.sum(e, axis=1)
            unconverged = np.sum((se == convergence_iteration) + (se == 0)) != n_similarities  # n_samples
            if (not unconverged and (K > 0)) or (it == max_iteration):
                never_converged = False
                break
    else:
        never_converged = True
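
    # Convergence criterion: the exemplar indicator E (points whose self-responsibility plus
    # self-availability is positive) must stay unchanged for `convergence_iteration` consecutive
    # iterations. NB: `it == max_iteration` can never hold inside `range(max_iteration)`,
    # a quirk inherited from the forked scikit-learn code.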


    I = np.flatnonzero(E)  # noqa: E741 (ambiguous variable)
    K = I.size  # Identify exemplars.

    if K > 0 and not never_converged:  # coverage: always true in tests
        c = np.argmax(Sp[:, I], axis=1)
        c[I] = np.arange(K)  # Identify clusters.
        # Refine the final set of exemplars and clusters and return results.
        for k in range(K):
            ii = np.where(c == k)[0]
            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
            I[k] = ii[j]

        c = np.argmax(Sp[:, I], axis=1)
        c[I] = np.arange(K)
        labels = I[c]
        if absolute_must_links:  # coverage: always true in tests
            # Assign labels to meta-point members.
            real_labels = [-1 for _ in range(n_samples)]
            for label, ml in zip(labels, must_links):
                for i in ml:
                    real_labels[i] = label
            labels = real_labels
        else:
            # Remove meta-points.
            labels = labels[:n_samples]
        # Reduce labels to a sorted, gapless list.
        cluster_centers_indices = np.unique(labels)
        labels = np.searchsorted(cluster_centers_indices, labels)
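        # `np.searchsorted` remaps exemplar indices (arbitrary point indices) to the
        # contiguous range 0..K-1, e.g. labels [4, 4, 9] become [0, 0, 1].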

    else:
        # TODO: if verbose: # pragma: no cover
        # TODO: warnings.warn("Affinity propagation did not converge, this model " "will not have any cluster centers.", ConvergenceWarning)
        labels = [-1 for _ in range(n_samples)]
        cluster_centers_indices = []

    return labels