Coverage for src\cognitivefactory\interactive_clustering\clustering\affinity_propagation.py: 82.03%
149 statements
1"""
2* Name: interactive-clustering/src/clustering/affinity_propagation.py
3* Description: Implementation of constrained Affinity Propagation clustering algorithm.
4* Author: David NICOLAZO, Esther LENOTRE, Marc TRUTT
5* Created: 02/03/2022
6* Licence: CeCILL (https://cecill.info/licences.fr.html)
7"""

# ==============================================================================
# IMPORT PYTHON DEPENDENCIES
# ==============================================================================

import warnings
from itertools import combinations, product
from typing import Dict, List, Optional, Tuple

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_random_state

from cognitivefactory.interactive_clustering.clustering.abstract import (
    AbstractConstrainedClustering,
    rename_clusters_by_order,
)
from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager

# ==============================================================================
# AFFINITY PROPAGATION CONSTRAINED CLUSTERING
# ==============================================================================


class AffinityPropagationConstrainedClustering(AbstractConstrainedClustering):
    """
    This class implements the Affinity Propagation constrained clustering.
    It inherits from `AbstractConstrainedClustering`.

    References:
        - Affinity Propagation Clustering: `Frey, B. J., & Dueck, D. (2007). Clustering by Passing Messages Between Data Points. In Science (Vol. 315, Issue 5814, pp. 972–976). American Association for the Advancement of Science (AAAS). https://doi.org/10.1126/science.1136800`
        - Constrained Affinity Propagation Clustering: `Givoni, I., & Frey, B. J. (2009). Semi-Supervised Affinity Propagation with Instance-Level Constraints. Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics, PMLR 5:161-168.`

    Example:
        ```python
        # Import.
        from scipy.sparse import csr_matrix
        from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager
        from cognitivefactory.interactive_clustering.clustering.affinity_propagation import AffinityPropagationConstrainedClustering

        # Create an instance of affinity propagation clustering.
        clustering_model = AffinityPropagationConstrainedClustering(
            random_seed=1,
        )

        # Define vectors.
        # NB: use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts.
        vectors = {
            "0": csr_matrix([1.00, 0.00, 0.00, 0.00]),
            "1": csr_matrix([0.95, 0.02, 0.02, 0.01]),
            "2": csr_matrix([0.98, 0.00, 0.02, 0.00]),
            "3": csr_matrix([0.99, 0.00, 0.01, 0.00]),
            "4": csr_matrix([0.60, 0.17, 0.16, 0.07]),
            "5": csr_matrix([0.60, 0.16, 0.17, 0.07]),
            "6": csr_matrix([0.01, 0.01, 0.01, 0.97]),
            "7": csr_matrix([0.00, 0.01, 0.00, 0.99]),
            "8": csr_matrix([0.00, 0.00, 0.00, 1.00]),
        }

        # Define constraints manager.
        constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys()))
        constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="3", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="7", data_ID2="8", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="7", constraint_type="CANNOT_LINK")

        # Run clustering.
        dict_of_predicted_clusters = clustering_model.cluster(
            constraints_manager=constraints_manager,
            vectors=vectors,
            # nb_clusters=None,  # not needed: must be None for Affinity Propagation.
        )

        # Print results.
        print("Expected results", ":", {"0": 0, "1": 0, "2": 0, "3": 0, "4": 1, "5": 1, "6": 2, "7": 2, "8": 2})
        print("Computed results", ":", dict_of_predicted_clusters)
        ```

    Warns:
        FutureWarning: `clustering.affinity_propagation.AffinityPropagationConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.
    """

    def __init__(
        self,
        max_iteration: int = 150,
        convergence_iteration: int = 10,
        random_seed: Optional[int] = None,
        absolute_must_links: bool = True,
        **kargs,
    ) -> None:
        """
        The constructor for the Affinity Propagation constrained clustering.

        Args:
            max_iteration (int, optional): The maximum number of iterations for convergence. Defaults to `150`.
            convergence_iteration (int, optional): The number of iterations with no change needed to consider convergence reached. Defaults to `10`.
            random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
            absolute_must_links (bool, optional): The option to strictly respect `"MUST_LINK"` type constraints. Defaults to `True`.
            **kargs (dict): Other parameters that can be used in the instantiation.

        Warns:
            FutureWarning: `clustering.affinity_propagation.AffinityPropagationConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.

        Raises:
            ValueError: if some parameters are incorrectly set.
        """

        # Deprecation warnings
        warnings.warn(
            "`clustering.affinity_propagation.AffinityPropagationConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.",
            FutureWarning,  # DeprecationWarning
            stacklevel=2,
        )

        # Store `self.max_iteration`.
        if max_iteration < 1:
            raise ValueError("The `max_iteration` must be greater than or equal to 1.")
        self.max_iteration: int = max_iteration

        # Store `self.convergence_iteration`.
        if convergence_iteration < 1:
            raise ValueError("The `convergence_iteration` must be greater than or equal to 1.")
        self.convergence_iteration: int = convergence_iteration

        # Store `self.absolute_must_links`.
        self.absolute_must_links: bool = absolute_must_links

        # Store `self.random_seed`.
        self.random_seed: Optional[int] = random_seed

        # Store `self.kargs` for affinity propagation clustering.
        self.kargs = kargs

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None

    # ==============================================================================
    # MAIN - CLUSTER DATA
    # ==============================================================================

    def cluster(
        self,
        constraints_manager: AbstractConstraintsManager,
        vectors: Dict[str, csr_matrix],
        nb_clusters: Optional[int] = None,
        verbose: bool = False,
        **kargs,
    ) -> Dict[str, int]:
160 """
161 The main method used to cluster data with the KMeans model.
163 Args:
164 constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
165 vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represents the data IDs. This keys have to refer to the list of data IDs managed by the `constraints_manager`. The value of the dictionary represent the vector of each data.
166 nb_clusters (Optional[int]): The number of clusters to compute. Here `None`.
167 verbose (bool, optional): Enable verbose output. Defaults to `False`.
168 **kargs (dict): Other parameters that can be used in the clustering.
170 Raises:
171 ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.
173 Returns:
174 Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
175 """

        ###
        ### GET PARAMETERS
        ###

        # Store `self.constraints_manager` and `self.list_of_data_IDs`.
        if not isinstance(constraints_manager, AbstractConstraintsManager):
            raise ValueError("The `constraints_manager` parameter has to be an `AbstractConstraintsManager` type.")
        self.constraints_manager: AbstractConstraintsManager = constraints_manager
        self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()

        # Store `self.vectors`.
        if not isinstance(vectors, dict):
            raise ValueError("The `vectors` parameter has to be a `dict` type.")
        self.vectors: Dict[str, csr_matrix] = vectors

        # Store `self.nb_clusters`.
        if nb_clusters is not None:
            raise ValueError("The `nb_clusters` should be `None` for Affinity Propagation clustering.")
        self.nb_clusters: Optional[int] = None

        ###
        ### RUN AFFINITY PROPAGATION CONSTRAINED CLUSTERING
        ###

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters = None

        # Correspondence from data ID to index.
        data_ID_to_idx: Dict[str, int] = {v: i for i, v in enumerate(self.list_of_data_IDs)}
        n_sample: int = len(self.list_of_data_IDs)

        # Compute similarity between data points.
        S: np.ndarray = -pairwise_distances(vstack(self.vectors[data_ID] for data_ID in self.list_of_data_IDs))
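        # NB: affinity propagation works on similarities (higher = more alike);
        # negated pairwise euclidean distances are used here, so the similarity
        # of identical vectors is 0.0 and all the others are negative.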

        # Get connected components (closures of MUST_LINK constraints).
        must_link_closures: List[List[str]] = self.constraints_manager.get_connected_components()
        must_links: List[List[int]] = [[data_ID_to_idx[ID] for ID in closure] for closure in must_link_closures]
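        # e.g. with MUST_LINK("0","1") and MUST_LINK("2","3") over five points,
        # the closures are [["0", "1"], ["2", "3"], ["4"]], i.e. [[0, 1], [2, 3], [4]].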

        # Get annotated CANNOT_LINK constraints.
        cannot_links: List[Tuple[int, int]] = []
        for data_ID_i1, data_ID_j1 in combinations(range(n_sample), 2):
            constraint = self.constraints_manager.get_added_constraint(
                self.list_of_data_IDs[data_ID_i1], self.list_of_data_IDs[data_ID_j1]
            )
            if constraint and constraint[0] == "CANNOT_LINK":
                cannot_links.append((data_ID_i1, data_ID_j1))
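        # NB: only pairs explicitly annotated as CANNOT_LINK are collected here
        # (via `get_added_constraint`), not constraints inferred by transitivity.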

        # Run constrained affinity propagation.
        cluster_labels: List[int] = _affinity_propagation_constrained(
            S,
            must_links=must_links,
            cannot_links=cannot_links,
            absolute_must_links=self.absolute_must_links,
            max_iteration=self.max_iteration,
            convergence_iteration=self.convergence_iteration,
            random_seed=self.random_seed,
            verbose=verbose,
        )

        # Rename cluster IDs by order.
        self.dict_of_predicted_clusters = rename_clusters_by_order(
            {self.list_of_data_IDs[i]: l for i, l in enumerate(cluster_labels)}
        )

        return self.dict_of_predicted_clusters


# ==============================================================================
# AFFINITY PROPAGATION FROM SKLEARN UNDER CONSTRAINTS
# ==============================================================================


def _equal_similarities_and_preferences(S, preference) -> bool:
    def _all_equal_preferences() -> bool:  # noqa: WPS430 (nested function)
        return bool(np.all(preference == preference.flat[0]))

    def _all_equal_similarities() -> bool:  # noqa: WPS430 (nested function)
        # Create mask to ignore diagonal of S
        mask = np.ones(S.shape, dtype=bool)
        np.fill_diagonal(mask, 0)

        return bool(np.all(S[mask].flat == S[mask].flat[0]))

    return _all_equal_preferences() and _all_equal_similarities()


def _affinity_propagation_constrained(
    S: np.ndarray,
    must_links: List[List[int]],
    cannot_links: List[Tuple[int, int]],
    absolute_must_links: bool = True,
    max_iteration: int = 150,
    convergence_iteration: int = 10,
    damping: float = 0.5,
    remove_degeneracies: bool = True,
    random_seed: Optional[int] = None,
    verbose: bool = False,
):
275 """
276 Perform Affinity Propagation Clustering of data.
278 Forked from https://github.com/scikit-learn/scikit-learn/blob/98cf537f5c538fdbc9d27b851cf03ce7611b8a48/sklearn/cluster/_affinity_propagation.py
279 Modified by David NICOLAZO <git@dabsunter.fr> according to https://proceedings.mlr.press/v5/givoni09a.html
281 Args:
282 S (csr_matrix): Matrix of similarities between points.
283 must_links (List[List[int]]): The list of MUST_LINK closures, i.e. list of list of data linked by MUST_LINK constraints.
284 cannot_links (List[Tuple[int, int]]): The list of data linked by CANNOT_LINK constraints
285 absolute_must_links (bool, optional): The option to use absolute must links implementation. Defaults to `True`.
286 max_iteration (int, optional): The maximum number of iteration for convergence. Defaults to `150`.
287 convergence_iteration (int, optional): The number of iterations with no change to consider a convergence. Default to `15`.
288 damping (float, optional): Damping factor between 0.5 and 1. Defaults to `0.5`.
289 remove_degeneracies (bool, optional): The option to remove degeneracies in the similarity matrix. Defaults to `True`.
290 random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
291 verbose (bool, optional): Enable verbose output. Defaults to `False`.
293 Returns:
294 List[int]: The list of data cluster labels.
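
    Example:
        A minimal sketch on a toy similarity matrix (the values below are illustrative, not taken from a real dataset; the printed labels are indicative only):
        ```python
        import numpy as np

        # Four points: {0, 1} and {2, 3} form two natural groups.
        S = -np.array(
            [
                [0.0, 0.1, 2.0, 2.1],
                [0.1, 0.0, 2.2, 2.0],
                [2.0, 2.2, 0.0, 0.1],
                [2.1, 2.0, 0.1, 0.0],
            ]
        )

        labels = _affinity_propagation_constrained(
            S,
            must_links=[[0, 1], [2], [3]],  # one MUST_LINK closure, two singletons.
            cannot_links=[],  # NB: not yet supported when `absolute_must_links=True`.
            random_seed=1,
        )
        print(labels)  # e.g. [0, 0, 2, 2] (raw exemplar indices, renamed later by the caller).
        ```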
295 """

    n_samples: int = S.shape[0]
    n_must_links: int = len(must_links)
    n_similarities: int = n_samples + n_must_links

    # Define the preference by the median of the similarity matrix.
    preference = np.array(np.median(S))
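    # NB: the preference drives how many exemplars emerge (higher preference,
    # more clusters); the median of S is the usual default, as in scikit-learn.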

    if n_samples == 1 or _equal_similarities_and_preferences(S, preference):
        # It makes no sense to run the algorithm in this case, so return 1 or n_samples clusters, depending on preferences
        # TODO: warnings.warn("All samples have mutually equal similarities. Returning arbitrary cluster center(s).")
        if preference.flat[0] >= S.flat[n_samples - 1]:
            return list(range(n_samples))
        return [0 for _ in range(n_samples)]

    # Fix random seed.
    random_state = check_random_state(random_seed)

    # Remove degeneracies
    if remove_degeneracies:
        S += (np.finfo(S.dtype).eps * S + np.finfo(S.dtype).tiny * 100) * random_state.randn(n_samples, n_samples)
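    # NB: the tiny random perturbation above breaks exact ties in S; without it,
    # identical similarities can make the message passing oscillate between
    # equivalent exemplar choices (same trick as in scikit-learn).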

    # Use only meta-points to force data from MUST_LINK closure to be in the same cluster.
    if absolute_must_links:
        Saml = np.zeros((n_must_links, n_must_links))

        for data_ID_i1, data_ID_j1 in product(range(n_must_links), range(n_must_links)):
            Saml[data_ID_i1, data_ID_j1] = max(
                S[k, l] for k, l in product(must_links[data_ID_i1], must_links[data_ID_j1])
            )
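
        # Each MUST_LINK closure is thus collapsed into one meta-point, and the
        # similarity between two meta-points is the best (max) similarity over
        # all cross pairs of their members, e.g. for closures {0, 1} and {2}:
        # Saml[i, j] = max(S[0, 2], S[1, 2]).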

        S = Saml

        n_similarities = n_must_links

        preference = np.array(np.median(S))

    # Use data and meta-points.
    else:
        MPS = np.array(
            [
                [0 if data_ID_i2 in Pm else max(S[data_ID_i2, data_ID_j2] for data_ID_j2 in Pm) for Pm in must_links]
                for data_ID_i2 in range(n_samples)
            ]
        )

        # Update similarity matrix.
        S = np.block([[S, MPS], [MPS.T, 2 * np.min(S) * np.ones((n_must_links, n_must_links))]])
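        # Layout of the augmented matrix: the top-left block is the original S,
        # the off-diagonal blocks hold point-to-meta-point similarities, and the
        # meta-point/meta-point block is filled with 2 * np.min(S), i.e. a
        # strongly unattractive value, so meta-points avoid electing each other.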

    # Place preference on the diagonal of S
    S.flat[:: (n_similarities + 1)] = preference  # noqa: WPS362 (subscript slice)

    if absolute_must_links:
        Sp = S

    A = np.zeros((n_similarities, n_similarities))
    R = np.zeros((n_similarities, n_similarities))  # Initialize messages
    if not absolute_must_links:
        # E-I: CL constraints
        Q1 = np.zeros((n_similarities, n_similarities, len(cannot_links)))  # qj (m, mn) [m,j,n]
        Q2 = np.zeros((n_similarities, n_similarities, len(cannot_links)))  # qj (mn, m) [m,j,n]
        Sp = np.zeros((n_similarities, n_similarities))  # Ŝ

    # Intermediate results
    tmp = np.zeros((n_similarities, n_similarities))

    # Execute parallel affinity propagation updates
    e = np.zeros((n_similarities, convergence_iteration))

    ind = np.arange(n_similarities)

    for it in range(max_iteration):
        if not absolute_must_links:  # TODO: Cannot link not supported for absolute_must_links mode
            # TODO: Verify Cannot link implementation as its effect is almost inexistent

            # Sp = S + np.sum(Q2, axis=2)
            # np.sum(Q2, axis=2, out=Sp)
            # np.add(S, Sp, Sp)
            Sp[:] = S  # noqa: WPS362 (subscript slice)
            for m in range(n_samples, n_similarities):
                Sp[m, :] += sum(Q2[m, :, n] for n, CLm in enumerate(cannot_links) if m in CLm)

            # Q1 = A + R - Q2
            Q1new = A[:, :, None] + R[:, :, None] - Q2

            # Q2 = - max(0, Q1)
            Q2 = -np.maximum(0, Q1)
            Q1 = Q1new

        # tmp = A + S; compute responsibilities
        np.add(A, Sp, tmp)
        I = np.argmax(tmp, axis=1)  # noqa: E741 (ambiguous variable)
        Y = tmp[ind, I]  # np.max(A + S, axis=1)
        tmp[ind, I] = -np.inf
        Y2 = np.max(tmp, axis=1)

        # tmp = Rnew
        np.subtract(Sp, Y[:, None], tmp)
        tmp[ind, I] = Sp[ind, I] - Y2
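
        # The lines above implement the responsibility update (Frey & Dueck, 2007):
        #   r(i, k) <- s(i, k) - max_{k' != k} (a(i, k') + s(i, k'))
        # Y holds the row maxima of A + Sp and Y2 the runner-up maxima, so the
        # "excluding k" rule only changes the result at the argmax column.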

        # Damping
        tmp *= 1 - damping
        R *= damping
        R += tmp

        # tmp = Rp; compute availabilities
        np.maximum(R, 0, tmp)
        tmp.flat[:: n_similarities + 1] = R.flat[:: n_similarities + 1]  # noqa: WPS362 (subscript slice)

        # tmp = -Anew
        tmp -= np.sum(tmp, axis=0)
        dA = np.diag(tmp).copy()
        tmp.clip(0, np.inf, tmp)
        tmp.flat[:: n_similarities + 1] = dA  # noqa: WPS362 (subscript slice)
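
        # The lines above implement the availability update (Frey & Dueck, 2007):
        #   a(i, k) <- min(0, r(k, k) + sum_{i' not in {i, k}} max(0, r(i', k)))
        #   a(k, k) <- sum_{i' != k} max(0, r(i', k))
        # computed column-wise, with tmp holding -Anew (hence `A -= tmp` below).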

        # Damping
        tmp *= 1 - damping
        A *= damping
        A -= tmp

        # Check for convergence
        E = (np.diag(A) + np.diag(R)) > 0
        e[:, it % convergence_iteration] = E
        K = np.sum(E, axis=0)
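        # E flags the points that currently elect themselves as exemplars;
        # convergence means this exemplar set has been stable for
        # `convergence_iteration` consecutive iterations.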

        if it >= convergence_iteration:
            se = np.sum(e, axis=1)
            unconverged = np.sum((se == convergence_iteration) + (se == 0)) != n_similarities  # n_samples
            if (not unconverged and (K > 0)) or (it == max_iteration):
                never_converged = False
                break
    else:
        never_converged = True

    I = np.flatnonzero(E)  # noqa: E741 (ambiguous variable)
    K = I.size  # Identify exemplars

    if K > 0 and not never_converged:
        c = np.argmax(Sp[:, I], axis=1)
        c[I] = np.arange(K)  # Identify clusters

        # Refine the final set of exemplars and clusters and return results
        for k in range(K):
            ii = np.where(c == k)[0]
            j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
            I[k] = ii[j]

        c = np.argmax(Sp[:, I], axis=1)
        c[I] = np.arange(K)
        labels = I[c]
        if absolute_must_links:
            # Assign labels to meta-points members
            real_labels = [-1 for _ in range(n_samples)]
            for label, ml in zip(labels, must_links):
                for i in ml:
                    real_labels[i] = label
            labels = real_labels
        else:
            # Remove meta-points
            labels = labels[:n_samples]

        # Reduce labels to a sorted, gapless, list
        cluster_centers_indices = np.unique(labels)
        labels = np.searchsorted(cluster_centers_indices, labels)
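        # (e.g. raw labels [7, 7, 2, 9] map to centers [2, 7, 9] and then to
        # gapless labels [1, 1, 0, 2])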
    else:
        # TODO: if verbose:  # pragma: no cover
        # TODO:     warnings.warn("Affinity propagation did not converge, this model will not have any cluster centers.", ConvergenceWarning)
        labels = [-1 for _ in range(n_samples)]
        cluster_centers_indices = []

    return labels