Coverage for src\cognitivefactory\interactive_clustering\clustering\spectral.py: 100.00%

51 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

1# -*- coding: utf-8 -*- 

2 

3""" 

4* Name: cognitivefactory.interactive_clustering.clustering.spectral 

5* Description: Implementation of constrained spectral clustering algorithms. 

6* Author: Erwan SCHILD 

7* Created: 17/03/2021 

8* Licence: CeCILL-C License v1.0 (https://cecill.info/licences.fr.html) 

9""" 

10 

11# ============================================================================== 

12# IMPORTS PYTHON DEPENDENCIES 

13# ============================================================================== 

14 

from typing import Dict, List, Optional

import numpy as np
from scipy.sparse import csr_matrix, vstack
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_kernels

from cognitivefactory.interactive_clustering.clustering.abstract import (
    AbstractConstrainedClustering,
    rename_clusters_by_order,
)
from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager

26 

27 

28# ============================================================================== 

29# SPECTRAL CONSTRAINED CLUSTERING 

30# ============================================================================== 

class SpectralConstrainedClustering(AbstractConstrainedClustering):
    """
    This class implements the spectral constrained clustering.
    It inherits from `AbstractConstrainedClustering`.

    References:
        - Spectral Clustering: `Ng, A. Y., M. I. Jordan, et Y.Weiss (2002). On Spectral Clustering: Analysis and an algorithm. In T. G. Dietterich, S. Becker, et Z. Ghahramani (Eds.), Advances in Neural Information Processing Systems 14. MIT Press.`
        - Constrained _'SPEC'_ Spectral Clustering: `Kamvar, S. D., D. Klein, et C. D. Manning (2003). Spectral Learning. Proceedings of the international joint conference on artificial intelligence, 561–566.`

    Example:
        ```python
        # Import.
        from scipy.sparse import csr_matrix
        from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager
        from cognitivefactory.interactive_clustering.clustering.spectral import SpectralConstrainedClustering

        # Create an instance of spectral clustering.
        clustering_model = SpectralConstrainedClustering(
            model="SPEC",
            random_seed=1,
        )

        # Define vectors.
        # NB : use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts.
        vectors = {
            "0": csr_matrix([1.00, 0.00, 0.00, 0.00]),
            "1": csr_matrix([0.95, 0.02, 0.02, 0.01]),
            "2": csr_matrix([0.98, 0.00, 0.02, 0.00]),
            "3": csr_matrix([0.99, 0.00, 0.01, 0.00]),
            "4": csr_matrix([0.60, 0.17, 0.16, 0.07]),
            "5": csr_matrix([0.60, 0.16, 0.17, 0.07]),
            "6": csr_matrix([0.01, 0.01, 0.01, 0.97]),
            "7": csr_matrix([0.00, 0.01, 0.00, 0.99]),
            "8": csr_matrix([0.00, 0.00, 0.00, 1.00]),
        }

        # Define constraints manager.
        constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys()))
        constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="3", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="7", data_ID2="8", constraint_type="MUST_LINK")
        constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK")
        constraints_manager.add_constraint(data_ID1="4", data_ID2="7", constraint_type="CANNOT_LINK")

        # Run clustering.
        dict_of_predicted_clusters = clustering_model.cluster(
            constraints_manager=constraints_manager,
            vectors=vectors,
            nb_clusters=3,
        )

        # Print results.
        print("Expected results", ";", {"0": 0, "1": 0, "2": 0, "3": 0, "4": 1, "5": 1, "6": 2, "7": 2, "8": 2,})
        print("Computed results", ":", dict_of_predicted_clusters)
        ```
    """

    # ==============================================================================
    # INITIALIZATION
    # ==============================================================================
    def __init__(
        self, model: str = "SPEC", nb_components: Optional[int] = None, random_seed: Optional[int] = None, **kargs
    ) -> None:
        """
        The constructor for Spectral Constrained Clustering class.

        Args:
            model (str, optional): The spectral clustering model to use. Available spectral models are `"SPEC"` and `"CCSR"`. Defaults to `"SPEC"`.
            nb_components (Optional[int], optional): The number of eigenvectors to compute in the spectral clustering. If `None`, set the number of components to the number of clusters. Defaults to `None`.
            random_seed (Optional[int], optional): The random seed to use to redo the same clustering. Defaults to `None`.
            **kargs (dict): Other parameters that can be used in the instantiation.

        Raises:
            ValueError: if some parameters are incorrectly set.
        """

        # Store `self.model`.
        # NOTE: only `"SPEC"` is accepted for now ; `"CCSR"` is disabled until its implementation is corrected (see dead code at the bottom of this class).
        if model != "SPEC":  # TODO use `not in {"SPEC"}`. # TODO `"CCSR"` to add after correction.
            raise ValueError("The `model` '" + str(model) + "' is not implemented.")
        self.model: str = model

        # Store `self.nb_components`.
        if (nb_components is not None) and (nb_components < 2):
            raise ValueError(
                "The `nb_components` '" + str(nb_components) + "' must be `None` or greater than or equal to 2."
            )
        self.nb_components: Optional[int] = nb_components

        # Store `self.random_seed`.
        self.random_seed: Optional[int] = random_seed

        # Store `self.kargs` : extra parameters forwarded to the underlying `sklearn` spectral clustering model.
        self.kargs = kargs

        # Initialize `self.dict_of_predicted_clusters` (filled by the `.cluster()` method).
        self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None

    # ==============================================================================
    # MAIN - CLUSTER DATA
    # ==============================================================================
    def cluster(
        self,
        constraints_manager: AbstractConstraintsManager,
        vectors: Dict[str, csr_matrix],
        nb_clusters: Optional[int],
        verbose: bool = False,
        **kargs,
    ) -> Dict[str, int]:
        """
        The main method used to cluster data with the Spectral model.

        Args:
            constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
            vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represents the data IDs. This keys have to refer to the list of data IDs managed by the `constraints_manager`. The value of the dictionary represent the vector of each data.
            nb_clusters (Optional[int]): The number of clusters to compute.
            verbose (bool, optional): Enable verbose output. Defaults to `False`.
            **kargs (dict): Other parameters that can be used in the clustering.

        Raises:
            ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        """

        ###
        ### GET PARAMETERS
        ###

        # Store `self.constraints_manager` and `self.list_of_data_IDs`.
        if not isinstance(constraints_manager, AbstractConstraintsManager):
            raise ValueError("The `constraints_manager` parameter has to be a `AbstractConstraintsManager` type.")
        self.constraints_manager: AbstractConstraintsManager = constraints_manager
        self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()

        # Store `self.vectors`.
        if not isinstance(vectors, dict):
            raise ValueError("The `vectors` parameter has to be a `dict` type.")
        # Check that every managed data ID has a vector : otherwise the `vstack` below would fail with an
        # undocumented `KeyError` instead of the documented `ValueError`.
        list_of_missing_data_IDs: List[str] = [
            data_ID for data_ID in self.constraints_manager.get_list_of_managed_data_IDs() if data_ID not in vectors
        ]
        if list_of_missing_data_IDs:
            raise ValueError(
                "The `vectors` parameter is incompatible with the `constraints_manager` : missing data IDs "
                + str(list_of_missing_data_IDs)
                + "."
            )
        self.vectors: Dict[str, csr_matrix] = vectors

        # Store `self.nb_clusters`.
        if (nb_clusters is None) or (nb_clusters < 2):
            raise ValueError("The `nb_clusters` '" + str(nb_clusters) + "' must be greater than or equal to 2.")
        # Cap the requested number of clusters by the number of available data points.
        self.nb_clusters: int = min(nb_clusters, len(self.list_of_data_IDs))

        # Define `self.current_nb_components` : use `self.nb_components` only when it exceeds the number of
        # clusters, because spectral clustering needs at least one eigenvector per cluster.
        self.current_nb_components: int = (
            self.nb_components
            if ((self.nb_components is not None) and (self.nb_clusters < self.nb_components))
            else self.nb_clusters
        )

        # Compute `self.pairwise_similarity_matrix`.
        # NOTE: `pairwise_kernels` returns a dense `numpy.ndarray`, not a sparse matrix.
        self.pairwise_similarity_matrix: np.ndarray = pairwise_kernels(
            X=vstack(self.vectors[data_ID] for data_ID in self.constraints_manager.get_list_of_managed_data_IDs()),
            metric="rbf",  # TODO get different pairwise_distances config in **kargs
        )

        ###
        ### RUN SPECTRAL CONSTRAINED CLUSTERING
        ###

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters = None

        # Case of `"CCSR"` spectral clustering.
        # TODO Don't work.
        ##if self.model == "CCSR":
        ##    self.dict_of_predicted_clusters = self.clustering_spectral_model_CCSR(verbose=verbose)

        # Case of `"SPEC"` spectral clustering.
        ##### DEFAULTS : if self.model=="SPEC":
        self.dict_of_predicted_clusters = self.clustering_spectral_model_SPEC(verbose=verbose)

        ###
        ### RETURN PREDICTED CLUSTERS
        ###

        return self.dict_of_predicted_clusters

    # ==============================================================================
    # IMPLEMENTATION - SPEC SPECTRAL CLUSTERING
    # ==============================================================================
    def clustering_spectral_model_SPEC(
        self,
        verbose: bool = False,
    ) -> Dict[str, int]:
        """
        Implementation of a simple Spectral clustering algorithm, based on affinity matrix modifications.

        References :
            - Constrained _'SPEC'_ Spectral Clustering: `Kamvar, S. D., D. Klein, et C. D. Manning (2003). Spectral Learning. Proceedings of the international joint conference on artificial intelligence, 561–566.`

        Args:
            verbose (bool, optional): Enable verbose output. Default is `False`.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        """

        ###
        ### MODIFY CONSTRAINTS MATRIX WITH CONSTRAINTS
        ###

        # Modify the similarity over data IDs : force maximal similarity for `"MUST_LINK"` pairs and
        # minimal similarity for `"CANNOT_LINK"` pairs, so the spectral embedding respects the constraints.
        for ID1, data_ID1 in enumerate(self.list_of_data_IDs):
            for ID2, data_ID2 in enumerate(self.list_of_data_IDs):
                # Symetry is already handled in next instructions : only visit each unordered pair once.
                if ID1 > ID2:
                    continue

                # For each `"MUST_LINK"` constraint, set the similarity to 1.0.
                if (
                    self.constraints_manager.get_inferred_constraint(
                        data_ID1=data_ID1,
                        data_ID2=data_ID2,
                    )
                    == "MUST_LINK"
                ):
                    self.pairwise_similarity_matrix[ID1, ID2] = 1.0
                    self.pairwise_similarity_matrix[ID2, ID1] = 1.0

                # For each `"CANNOT_LINK"` constraint, set the similarity to 0.0.
                elif (
                    self.constraints_manager.get_inferred_constraint(
                        data_ID1=data_ID1,
                        data_ID2=data_ID2,
                    )
                    == "CANNOT_LINK"
                ):
                    self.pairwise_similarity_matrix[ID1, ID2] = 0.0
                    self.pairwise_similarity_matrix[ID2, ID1] = 0.0

        ###
        ### RUN SPECTRAL CONSTRAINED CLUSTERING
        ### | Define laplacian matrix
        ### | Compute eigen vectors
        ### | Cluster eigen vectors
        ### | Return labels based on eigen vectors clustering
        ###

        # Initialize spectral clustering model on the precomputed (constraint-adjusted) affinity matrix.
        self.clustering_model = SpectralClustering(
            n_clusters=self.nb_clusters,
            # n_components=self.current_nb_components,  # TODO Add if `scikit-learn>=0.24.1`
            affinity="precomputed",
            random_state=self.random_seed,
            **self.kargs,
        )

        # Run spectral clustering model.
        self.clustering_model.fit_predict(X=self.pairwise_similarity_matrix)

        # Get prediction of spectral clustering model.
        list_of_clusters: List[int] = self.clustering_model.labels_.tolist()

        # Define the dictionary of predicted clusters.
        predicted_clusters: Dict[str, int] = {
            data_ID: list_of_clusters[ID] for ID, data_ID in enumerate(self.list_of_data_IDs)
        }

        # Rename cluster IDs by order.
        predicted_clusters = rename_clusters_by_order(clusters=predicted_clusters)

        # Return predicted clusters
        return predicted_clusters

    # ==============================================================================
    # IMPLEMENTATION - CCSR SPECTRAL CLUSTERING
    # ==============================================================================
    # NOTE(review): the following is dead code kept as a bare string literal (evaluated and discarded at
    # class creation). It is preserved verbatim until the implementation is fixed (see TODO below).
    """ #TODO : do not work. check if 1) wrong implementation ? 2) cvxopt better ?
    def clustering_spectral_model_CCSR(
        self,
        verbose: bool = False,
    ) -> Dict[str, int]:
        \"""
        Implementation of Constrained Clustering with Spectral Regularization algorithm, based on spectral semidefinite programming interpretation.
        - Source : `Li, Z., Liu, J., & Tang, X. (2009). Constrained clustering via spectral regularization. 2009 IEEE Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/cvpr.2009.5206852`
        - MATLAB Implementation : https://icube-forge.unistra.fr/lampert/TSCC/-/tree/master/methods/Li09
        - SOLVERS comparison : https://pypi.org/project/PICOS/
        - CVXPY solver : https://www.cvxpy.org/index.html

        Args:
            verbose (bool, optional): Enable verbose output. Defaults to `False`.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        \"""

        ###
        ### COMPUTE NORMALIZED LAPLACIAN
        ###

        # Compute the normalized Laplacian.
        normalized_laplacian, diagonal = csgraph_laplacian(
            csgraph=self.pairwise_similarity_matrix,
            normed=True,
            return_diag=True
        )

        ###
        ### COMPUTE EIGENVECTORS OF LAPLACIAN
        ###

        random.seed(self.random_seed)
        v0 = np.random.rand(min(normalized_laplacian.shape))

        # Compute the largest eigen vectors.
        eigen_values, eigen_vectors = eigsh(
            A=normalized_laplacian,
            which="SM",
            k=self.current_nb_components + 1,
            v0=v0,
        )

        # Ignore eigenvector of eigenvalue 0.
        eigen_values = eigen_values[1:]
        eigen_vectors = eigen_vectors.T[1:]

        if verbose:  # pragma: no cover
            print("EIGENVALUES / EIGENVECTORS")

            for k in range(len(eigen_values)):
                print("  ", "=============")
                print("  ", "ID  : ", k)
                print("  ", "VAL : ", eigen_values[k])
                print("  ", "VEC : ", eigen_vectors[k])

        ###
        ### FORMALIZE SEMIDEFINITE PROBLEM
        ###

        ## Problem ::
        ## Cost function to minimize : L(z) = 1/2 * z.T * B * z + b.T * z + c
            ## z : variable to find with the SDP problem
                ## z = vec(M)
                ## M >> 0 (semi definite positive), i.e. M = M.T, M.shape=(nb_components, nb_components)
            ## B = sum ( s_ij s_ij.T )
                # for ij in MUST_LINK or i,j in CANNOT_LINK
            ## b = -2 * sum ( t_ij * s_ij )
                # for ij in MUST_LINK or i,j in CANNOT_LINK
            ## c = sum ( t_ij^2 )
                # for ij in MUST_LINK or i,j in CANNOT_LINK
            ## s_ij = vec( eigen_vector_i.T * eigen_vector_j )
            ## t_ij = 1 if MUST_LINK(i,j), 0 if CANNOT_LINK(i,j)

        # Initialization of SDP variables.
        B = np.zeros((self.current_nb_components ** 2, self.current_nb_components ** 2))
        b = np.zeros((self.current_nb_components ** 2, 1))
        c = 0

        for ID1, data_ID1 in enumerate(self.list_of_data_IDs):
            for ID2, data_ID2 in enumerate(self.list_of_data_IDs):

                # Get eigenvectors.
                eigen_vector_i = np.atleast_2d(eigen_vectors.T[ID1])
                eigen_vector_j = np.atleast_2d(eigen_vectors.T[ID2])

                # Compute eigenvectors similarity.
                U = eigen_vector_j.T @ eigen_vector_i
                s = np.atleast_2d(U.ravel())


                # For each `"MUST_LINK"` constraint, ....
                if self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID1,
                    data_ID2=data_ID2,
                ) == "MUST_LINK":

                    # Add the value to SDP variables.
                    B += s.T * s
                    b += - 1 * s.T
                    c += 1 * 1

                # For each `"CANNOT_LINK"` constraint, ....
                if self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID1,
                    data_ID2=data_ID2,
                ) == "CANNOT_LINK":

                    # Add the value to SDP variables.
                    B += s.T * s
                    b += - 0 * s.T
                    c += 0 * 0

        ###
        ### SOLVE SEMIDEFINITE PROBLEM
        ###

        # Create a symetric matrix variable.
        M = cp.Variable((self.current_nb_components, self.current_nb_components))

        ### Define constraints.
        SDP_constraints = []

        # The solution must be positive semidefinite.
        SDP_constraints += [M >> 0]

        # Define cost function to minimize : `( 1/2 * z.T * B * z + b.T * z + c )`.
        self.SDP_problem = cp.Problem(
            cp.Minimize(
                cp.quad_form(cp.atoms.affine.vec.vec(M), B)  # `1/2 * z.T * B * z`.
                + b.T @ cp.atoms.affine.vec.vec(M)  # `b.T * z`.
                + c  # c
            ),
            SDP_constraints,
        )

        # Solve the SDP problem.
        self.SDP_problem.solve(solver="MOSEK")
        if verbose:  # pragma: no cover
            print("SEMIDEFINITE PROBLEM")
            print("  ", "STATUS", ":", self.SDP_problem.status)
            print("  ", "COST FUNCTION VALUE", ":", self.SDP_problem.value)

        ###
        ### CLUSTER EIGEN VECTORS
        ###

        # Define square root of M, and force sqrtM to be symetric.
        sqrtM = sqrtm(M.value).real
        sqrtM = (sqrtM + sqrtM.T) / 2

        # Compute new embeddings for spectral clustering.
        new_vectors_to_clusters = eigen_vectors.T @ sqrtM

        # Initialize kmeans klustering model.
        self.clustering_model = KMeans(
            n_clusters=self.nb_clusters,
            max_iter=10000,
            random_state=self.random_seed,
        )

        # Run kmeans clustering model.
        self.clustering_model.fit_predict(
            X=new_vectors_to_clusters
        )

        # Get prediction of kmeans clustering model.
        list_of_clusters: List[int] = self.clustering_model.labels_.tolist()

        # Define the dictionary of predicted clusters.
        predicted_clusters: Dict[str, int] = {
            data_ID: list_of_clusters[ID]
            for ID, data_ID in enumerate(self.list_of_data_IDs)
        }

        ###
        ### RENAME CLUSTERS BY ORDER
        ###

        # Define a map to be able to rename cluster IDs.
        mapping_of_old_ID_to_new_ID: Dict[int, int] = {}
        new_ID: int = 0
        for data_ID in self.list_of_data_IDs:
            if predicted_clusters[data_ID] not in mapping_of_old_ID_to_new_ID.keys():
                mapping_of_old_ID_to_new_ID[predicted_clusters[data_ID]] = new_ID
                new_ID += 1

        # Rename cluster IDs.
        predicted_clusters = {
            data_ID: mapping_of_old_ID_to_new_ID[predicted_clusters[data_ID]]
            for data_ID in self.list_of_data_IDs
        }

        # Return predicted clusters
        return predicted_clusters
    """