Coverage for src\cognitivefactory\interactive_clustering\clustering\mpckmeans.py: 89.11%

217 statements  

coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

# -*- coding: utf-8 -*-

"""
* Name: cognitivefactory.interactive_clustering.clustering.mpckmeans
* Description: Implementation of constrained mpckmeans clustering algorithms.
* Author: Esther LENOTRE, David NICOLAZO, Marc TRUTT
* Created: 10/09/2022
* Licence: CeCILL-C License v1.0 (https://cecill.info/licences.fr.html)
"""

# ==============================================================================
# IMPORT PYTHON DEPENDENCIES
# ==============================================================================

import warnings

# import random
from typing import Dict, List, Optional, Set, Tuple

import numpy as np
import scipy.spatial.distance  # Explicit submodule import so that `scipy.spatial.distance.mahalanobis` is available.
from scipy.sparse import csr_matrix

from cognitivefactory.interactive_clustering.clustering.abstract import (
    AbstractConstrainedClustering,
    rename_clusters_by_order,
)
from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager

31# ============================================================================== 

32# MPCKMEANS CONSTRAINED CLUSTERING 

33# ============================================================================== 

34class MPCKMeansConstrainedClustering(AbstractConstrainedClustering): 

35 """ 

36 This class implements the MPCkmeans constrained clustering. 

37 It inherits from `AbstractConstrainedClustering`. 

38 

39 Forked from https://github.com/Behrouz-Babaki/COP-Kmeans/blob/master/copkmeans/cop_kmeans.py 

40 Modified by Esther LENOTRE <git@estherlenotre.fr> according to https://proceedings.mlr.press/v5/givoni09a.html 

41 

42 References: 

43 - KMeans Clustering: `MacQueen, J. (1967). Some methods for classification and analysis of multivariate observations. Proceedings of the fifth Berkeley symposium on mathematical statistics and probability 1(14), 281–297.` 

44 - Constrained _'MPC'_ KMeans Clustering: `Khan, Md. A., Tamim, I., Ahmed, E., & Awal, M. A. (2012). Multiple Parameter Based Clustering (MPC): Prospective Analysis for Effective Clustering in Wireless Sensor Network (WSN) Using K-Means Algorithm. In Wireless Sensor Network (Vol. 04, Issue 01, pp. 18–24). Scientific Research Publishing, Inc. https://doi.org/10.4236/wsn.2012.41003` 

45 

46 Example: 

47 ```python 

48 # Import. 

49 from scipy.sparse import csr_matrix 

50 from cognitivefactory.interactive_clustering.constraints.binary import BinaryConstraintsManager 

51 from cognitivefactory.interactive_clustering.clustering.dbscan import MPCKMeansConstrainedClustering 

52 

53 # NB : use cognitivefactory.interactive_clustering.utils to preprocess and vectorize texts. 

54 

55 vectors = { 

56 "0": csr_matrix([1.00, 0.00, 0.00, 0.00]), 

57 "1": csr_matrix([0.95, 0.02, 0.02, 0.01]), 

58 "2": csr_matrix([0.98, 0.00, 0.02, 0.00]), 

59 "3": csr_matrix([0.99, 0.00, 0.01, 0.00]), 

60 "4": csr_matrix([0.50, 0.22, 0.21, 0.07]), 

61 "5": csr_matrix([0.50, 0.21, 0.22, 0.07]), 

62 "6": csr_matrix([0.01, 0.01, 0.01, 0.97]), 

63 "7": csr_matrix([0.00, 0.01, 0.00, 0.99]), 

64 "8": csr_matrix([0.00, 0.00, 0.00, 1.00]), 

65 } 

66 

67 # Define constraints manager. 

68 constraints_manager = BinaryConstraintsManager(list_of_data_IDs=list(vectors.keys())) 

69 constraints_manager.add_constraint(data_ID1="0", data_ID2="1", constraint_type="MUST_LINK") 

70 constraints_manager.add_constraint(data_ID1="0", data_ID2="7", constraint_type="MUST_LINK") 

71 constraints_manager.add_constraint(data_ID1="0", data_ID2="8", constraint_type="MUST_LINK") 

72 constraints_manager.add_constraint(data_ID1="4", data_ID2="5", constraint_type="MUST_LINK") 

73 constraints_manager.add_constraint(data_ID1="0", data_ID2="4", constraint_type="CANNOT_LINK") 

74 constraints_manager.add_constraint(data_ID1="2", data_ID2="4", constraint_type="CANNOT_LINK") 

75 

76 cluster_model = MPCKMeansConstrainedClustering() 

77 dict_of_predicted_clusters = cluster_model.cluster( 

78 constraints_manager=constraints_manager, 

79 vectors=vectors, 

80 nb_clusters=3, 

81 ) 

82 

83 # Print results. 

84 print("Expected results", ";", {"0": 0, "1": 0, "2": 1, "3": 1, "4": 2, "5": 2, "6": 0, "7": 0, "8": 0,}) 

85 print("Computed results", ":", dict_of_predicted_clusters) 

86 ``` 

87 

88 Warns: 

89 FutureWarning: `clustering.mpckmeans.MPCKMeansConstrainedClustering` is still in development and is not fully tested : it is not ready for production use. 

90 """ 

91 

    # ==============================================================================
    # INITIALIZATION
    # ==============================================================================
    def __init__(
        self,
        model: str = "MPC",
        max_iteration: int = 150,
        w: float = 1.0,
        random_seed: Optional[int] = None,
        **kargs,
    ) -> None:
        """
        The constructor for the MPCKMeans Constrained Clustering class.

        Args:
            model (str, optional): The kmeans clustering model to use. Available kmeans models are `"MPC"`. Defaults to `"MPC"`.
            max_iteration (int, optional): The maximum number of kmeans iteration for convergence. Defaults to `150`.
            w (float, optional): The weight given to constraint violations. Defaults to `1.0`.
            random_seed (Optional[int]): The random seed to use to redo the same clustering. Defaults to `None`.
            **kargs (dict): Other parameters that can be used in the instantiation.

        Warns:
            FutureWarning: `clustering.mpckmeans.MPCKMeansConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.

        Raises:
            ValueError: if some parameters are incorrectly set.
        """

        # Deprecation warnings
        warnings.warn(
            "`clustering.mpckmeans.MPCKMeansConstrainedClustering` is still in development and is not fully tested: it is not ready for production use.",
            FutureWarning,  # DeprecationWarning
            stacklevel=2,
        )

        # Store `self.model`.
        if model != "MPC":  # TODO: use `not in {"MPC"}`.
            raise ValueError("The `model` '" + str(model) + "' is not implemented.")
        self.model: str = model

        # Store `self.max_iteration`.
        if max_iteration < 1:
            raise ValueError("The `max_iteration` must be greater than or equal to 1.")
        self.max_iteration: int = max_iteration

        # Store `self.w`.
        if w < 0:
            raise ValueError("The `w` must be greater than or equal to 0.0.")
        self.w: float = w

        # Store `self.random_seed`.
        self.random_seed: Optional[int] = random_seed

        # Store `self.kargs` for clustering.
        self.kargs = kargs

        # Initialize `self.dict_of_predicted_clusters`.
        self.dict_of_predicted_clusters: Optional[Dict[str, int]] = None

        # Initialize `ml_graph` and `cl_graph`.
        self.ml_graph: Dict[str, List[str]] = {}
        self.cl_graph: Dict[str, List[str]] = {}

    # ==============================================================================
    # MAIN - CLUSTER DATA
    # ==============================================================================

    def cluster(
        self,
        constraints_manager: AbstractConstraintsManager,
        vectors: Dict[str, csr_matrix],
        nb_clusters: Optional[int],
        verbose: bool = False,
        y=None,
        **kargs,
    ) -> Dict[str, int]:
        """
        The main method used to cluster data with the MPCKMeans model.

        Args:
            constraints_manager (AbstractConstraintsManager): A constraints manager over data IDs that will force clustering to respect some conditions during computation.
            vectors (Dict[str, csr_matrix]): The representation of data vectors. The keys of the dictionary represent the data IDs. These keys have to refer to the list of data IDs managed by the `constraints_manager`. The values of the dictionary represent the vector of each data point.
            nb_clusters (Optional[int]): The number of clusters to compute. Must be set to an integer greater than or equal to 2 (a `None` value raises a `ValueError`).
            verbose (bool, optional): Enable verbose output. Defaults to `False`.
            y: Unused. Forwarded to `_assign_clusters`, which ignores it.
            **kargs (dict): Other parameters that can be used in the clustering.

        Raises:
            ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        """

        ###
        ### GET PARAMETERS
        ###

        # Store `self.constraints_manager` and `self.list_of_data_IDs`.
        if not isinstance(constraints_manager, AbstractConstraintsManager):
            raise ValueError("The `constraints_manager` parameter has to be a `AbstractConstraintsManager` type.")
        self.constraints_manager: AbstractConstraintsManager = constraints_manager
        self.list_of_data_IDs: List[str] = self.constraints_manager.get_list_of_managed_data_IDs()

        # Store `self.vectors`.
        if not isinstance(vectors, dict):
            raise ValueError("The `vectors` parameter has to be a `dict` type.")
        self.vectors: Dict[str, csr_matrix] = vectors

        # Store `self.nb_clusters`.
        if (nb_clusters is None) or (nb_clusters < 2):
            raise ValueError("The `nb_clusters` '" + str(nb_clusters) + "' must be greater than or equal to 2.")
        self.nb_clusters: int = min(nb_clusters, len(self.list_of_data_IDs))

        # TODO: Reformat vectors
        id_names: np.ndarray = np.array(list(vectors.keys()))
        X: np.ndarray = np.array(
            [
                (np.array(v).flatten() if isinstance(v, (np.ndarray, list)) else v.toarray().flatten())
                for v in vectors.values()
            ]
        )

        # TODO: reformat constraints
        self.ml: List[Tuple[int, int]] = [
            (i, j)
            for i, data_ID_i in enumerate(self.list_of_data_IDs)
            for j, data_ID_j in enumerate(self.list_of_data_IDs)
            if (
                self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID_j,
                    data_ID2=data_ID_i,
                )
                == "MUST_LINK"
            )
        ]

        # TODO: reformat constraints
        self.cl: List[Tuple[int, int]] = [
            (i, j)
            for i, data_ID_i in enumerate(self.list_of_data_IDs)
            for j, data_ID_j in enumerate(self.list_of_data_IDs)
            if (
                self.constraints_manager.get_inferred_constraint(
                    data_ID1=data_ID_j,
                    data_ID2=data_ID_i,
                )
                == "CANNOT_LINK"
            )
        ]

        # Preprocess constraints
        ml_graph, cl_graph, neighborhoods = self.preprocess_constraints()

        # Initialize cluster centers
        cluster_centers = self._initialize_cluster_centers(X, neighborhoods)

        # Initialize metrics
        A = np.identity(X.shape[1])

        iteration = 0
        converged = False

        # Repeat until convergence
        while not converged and iteration < self.max_iteration:
            prev_cluster_centers = cluster_centers.copy()

            # Find the farthest pair of points according to the current metric
            farthest = self._find_farthest_pairs_of_points(X, A)

            # Assign clusters
            labels = self._assign_clusters(X, y, cluster_centers, A, farthest, ml_graph, cl_graph, self.w)

            # Estimate means
            cluster_centers = self._get_cluster_centers(X, labels)

            # Update metrics
            A = self._update_metrics(X, labels, cluster_centers, farthest, ml_graph, cl_graph, self.w)

            # Check for convergence
            cluster_centers_shift = prev_cluster_centers - cluster_centers
            converged = np.allclose(cluster_centers_shift, np.zeros(cluster_centers.shape), atol=1e-6, rtol=0)
            iteration += 1

        self.cluster_centers = cluster_centers
        self.labels = labels

        self.dict_of_predicted_clusters = {id_names[i]: label for i, label in enumerate(labels) if label != -1}

        self.dict_of_predicted_clusters = rename_clusters_by_order(clusters=self.dict_of_predicted_clusters)

        for data_ID in self.list_of_data_IDs:
            if data_ID not in self.dict_of_predicted_clusters.keys():  # coverage: condition was never true in tests.
                self.dict_of_predicted_clusters[data_ID] = -1

        return self.dict_of_predicted_clusters

    # ==============================================================================
    # USEFUL FUNCTIONS
    # ==============================================================================

    def dist(self, i: int, S: List[int], points: np.ndarray) -> float:
        """
        Computes the minimum Euclidean distance from a single point to a group of points.

        Args:
            i (int): Index of the single point.
            S (List[int]): Indices of the group of points.
            points (np.ndarray): Array containing all the points.

        Returns:
            float: Minimum distance from the single point to the group of points.
        """

        distances = np.array([np.sqrt(((points[i] - points[j]) ** 2).sum()) for j in S])
        return distances.min()
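
    # Illustrative sketch (added note, not part of the original module): for the
    # 1-D points `[[0.0], [3.0], [10.0]]`, the minimum distance from the point at
    # index 0 to the group `[1, 2]` is `min(3.0, 10.0) = 3.0`:
    #
    #   >>> import numpy as np
    #   >>> model = MPCKMeansConstrainedClustering()
    #   >>> model.dist(0, [1, 2], np.array([[0.0], [3.0], [10.0]]))
    #   3.0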

    def _dist(self, x: np.ndarray, y: np.ndarray, A: np.ndarray) -> float:
        """
        Computes the squared Mahalanobis distance between two points:
        "(x - y)^T A (x - y)".

        Args:
            x (np.ndarray): First point.
            y (np.ndarray): Second point.
            A (np.ndarray): Inverse of a covariance matrix.

        Returns:
            float: Squared Mahalanobis distance between the two points.
        """

        return scipy.spatial.distance.mahalanobis(x, y, A) ** 2
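
    # Illustrative sketch (added note, not part of the original module): with `A`
    # equal to the identity matrix, the squared Mahalanobis distance reduces to
    # the squared Euclidean distance:
    #
    #   >>> import numpy as np
    #   >>> model = MPCKMeansConstrainedClustering()
    #   >>> float(model._dist(np.array([2.0, 0.0]), np.array([0.0, 0.0]), np.identity(2)))
    #   4.0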

    def _find_farthest_pairs_of_points(self, X: np.ndarray, A: np.ndarray) -> Tuple[int, int, float]:
        """
        Finds the farthest pair of points.

        Args:
            X (np.ndarray): Set of points.
            A (np.ndarray): Positive-definite matrix used for the distances.

        Returns:
            Tuple[int, int, float]: Indices of the farthest pair of points and the corresponding distance.
        """

        farthest = None
        n = X.shape[0]
        max_distance = 0.0

        for i in range(n):
            for j in range(n):
                if j < i:
                    distance = self._dist(X[i], X[j], A)
                    if distance > max_distance:
                        max_distance = distance
                        farthest = (i, j, distance)

        assert farthest is not None

        return farthest
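
    # Illustrative sketch (added note, not part of the original module): for the
    # 1-D points `[[0.0], [1.0], [5.0]]` and `A = np.identity(1)`, the pairwise
    # squared distances are 1.0 for (1, 0), 16.0 for (2, 1) and 25.0 for (2, 0),
    # so the method returns `(2, 0, 25.0)`. Note the O(n^2) scan over all pairs.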

    def weighted_farthest_first_traversal(self, points: np.ndarray, weights: np.ndarray, k: int) -> List[int]:
        """
        Applies the weighted farthest-first traversal algorithm.

        Args:
            points (np.ndarray): Set of points.
            weights (np.ndarray): Weights for the distances.
            k (int): Number of points to be traversed.

        Returns:
            List[int]: Indices of the traversed points.
        """
        traversed = []

        # Choose the first point randomly (weighted)
        i = np.random.choice(len(points), size=1, p=weights)[0]
        traversed.append(i)

        # Find remaining k - 1 maximally separated points
        for _ in range(k - 1):
            max_dst, max_dst_index = 0, None

            number_of_points = len(points)
            for j in range(number_of_points):
                if j not in traversed:
                    dst = self.dist(j, traversed, points)
                    weighted_dst = weights[j] * dst

                    if weighted_dst > max_dst:
                        max_dst = weighted_dst
                        max_dst_index = j

            traversed.append(max_dst_index)

        return traversed
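
    # Behaviour sketch (added note, not part of the original module): the first
    # index is drawn at random with probability proportional to `weights`; each
    # subsequent index maximizes `weights[j] * dist(j, traversed, points)`. For
    # the 1-D points `[[0.0], [1.0], [10.0]]` with uniform weights and `k=2`,
    # the point at index 2 is always selected: whichever point is drawn first,
    # index 2 maximizes the weighted distance to it (or, if 2 is drawn first,
    # index 0 is added next).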

    # ==============================================================================
    # INITIALIZATION OF CLUSTERS
    # ==============================================================================
    def _add_both(self, d, i, j):
        """Adds each index to the other's adjacency set (symmetric link)."""
        d[i].add(j)
        d[j].add(i)

    def _dfs(self, i, graph, visited, component):
        """Depth-first search that collects the connected component containing `i`."""
        visited[i] = True
        for j in graph[i]:
            if not visited[j]:
                self._dfs(j, graph, visited, component)
        component.append(i)

    def preprocess_constraints(
        self,
    ) -> Tuple[Dict[int, Set[int]], Dict[int, Set[int]], List[List[int]]]:
        """
        Preprocesses the constraints: builds the must-link and cannot-link adjacency graphs,
        takes the transitive closure of the must-link constraints, and derives the starting neighborhoods.

        Raises:
            ValueError: if a cannot-link constraint conflicts with a must-link constraint over the same pair of points.

        Returns:
            Tuple[Dict[int, Set[int]], Dict[int, Set[int]], List[List[int]]]:
                The must-link and cannot-link adjacency graphs, as well as the starting neighborhoods.
        """

        # Get the list of possible indices.
        indices: List[str] = self.list_of_data_IDs.copy()

        n: int = len(indices)

        # Represent the graphs using adjacency-lists.
        ml_graph: Dict[int, Set[int]] = {}
        cl_graph: Dict[int, Set[int]] = {}

        for k in range(n):
            ml_graph[k] = set()
            cl_graph[k] = set()

        for data_ID_i1, data_ID_j1 in self.ml:
            ml_graph[data_ID_i1].add(data_ID_j1)
            ml_graph[data_ID_j1].add(data_ID_i1)

        for data_ID_i2, data_ID_j2 in self.cl:
            cl_graph[data_ID_i2].add(data_ID_j2)
            cl_graph[data_ID_j2].add(data_ID_i2)

        # Take the transitive closure of the must-link constraints: each connected
        # component of the must-link graph forms a starting neighborhood.
        visited = [False for _ in range(n)]
        neighborhoods = []
        for index in range(n):
            if not visited[index] and ml_graph[index]:
                component: List[int] = []
                self._dfs(index, ml_graph, visited, component)
                for x1 in component:
                    for x2 in component:
                        if x1 != x2:
                            ml_graph[x1].add(x2)
                neighborhoods.append(component)

        # Propagate cannot-link constraints across must-link components.
        for data_ID_i3, data_ID_j3 in self.cl:
            for x in ml_graph[data_ID_i3]:
                self._add_both(cl_graph, x, data_ID_j3)

            for y in ml_graph[data_ID_j3]:
                self._add_both(cl_graph, data_ID_i3, y)

            for a in ml_graph[data_ID_i3]:
                for b in ml_graph[data_ID_j3]:
                    self._add_both(cl_graph, a, b)

        # Check the consistency of the constraints.
        for index_1, _ in ml_graph.items():
            for index_2 in ml_graph[index_1]:
                if index_2 != index_1 and index_2 in cl_graph[index_1]:  # coverage: condition was never true in tests.
                    raise ValueError("Inconsistent constraints between " + str(index_1) + " and " + str(index_2))

        return ml_graph, cl_graph, neighborhoods
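
    # Worked example (added note, not part of the original module): with
    # `self.ml = [(0, 1), (1, 2)]` and `self.cl = [(0, 3)]` over 4 points, the
    # transitive closure yields `ml_graph = {0: {1, 2}, 1: {0, 2}, 2: {0, 1}, 3: set()}`,
    # the cannot-link propagation yields `cl_graph[3] == {0, 1, 2}` (and,
    # symmetrically, 3 appears in each of `cl_graph[0]`, `cl_graph[1]` and
    # `cl_graph[2]`), and `neighborhoods` contains a single component made of
    # the indices 0, 1 and 2 (listed in DFS post-order).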

    def _initialize_cluster_centers(self, X: np.ndarray, neighborhoods: List[List[int]]) -> np.ndarray:
        """
        Initializes the cluster centers.
        The choice is based on the neighborhoods created by the initial constraints.

        Args:
            X (np.ndarray): Set of points.
            neighborhoods (List[List[int]]): Must-link connected components, as lists of point indices.

        Returns:
            np.ndarray: Computed centers.
        """

        neighborhood_centers = np.array([X[neighborhood].mean(axis=0) for neighborhood in neighborhoods])
        neighborhood_sizes = np.array([len(neighborhood) for neighborhood in neighborhoods])
        neighborhood_weights = neighborhood_sizes / neighborhood_sizes.sum()

        # print('\t', len(neighborhoods), neighborhood_sizes)

        if len(neighborhoods) > self.nb_clusters:  # coverage: condition was never false in tests.
            cluster_centers = neighborhood_centers[
                self.weighted_farthest_first_traversal(neighborhood_centers, neighborhood_weights, self.nb_clusters)
            ]
        else:
            if neighborhoods:
                cluster_centers = neighborhood_centers
            else:
                cluster_centers = np.empty((0, X.shape[1]))

            if len(neighborhoods) < self.nb_clusters:
                remaining_cluster_centers = X[
                    np.random.choice(X.shape[0], self.nb_clusters - len(neighborhoods), replace=False), :
                ]
                cluster_centers = np.concatenate([cluster_centers, remaining_cluster_centers])

        return cluster_centers
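
    # Behaviour sketch (added note, not part of the original module): with 5
    # neighborhoods of sizes [3, 1, 1, 2, 3] and `nb_clusters=3`, the weights are
    # [0.3, 0.1, 0.1, 0.2, 0.3] and 3 of the 5 neighborhood centers are kept via
    # weighted farthest-first traversal; with only 2 neighborhoods, both centers
    # are kept and the third center is a point of `X` drawn at random.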

    # ==============================================================================
    # COMPUTE CLUSTERS
    # ==============================================================================

    def _f_m(self, X: np.ndarray, i: int, j: int, A) -> float:
        """Must-link violation cost: distance between the two points under the metric `A`."""
        return self._dist(X[i], X[j], A)

    def _f_c(self, X: np.ndarray, i: int, j: int, A, farthest) -> float:
        """Cannot-link violation cost: farthest-pair distance minus the distance between the two points."""
        return farthest[2] - self._dist(X[i], X[j], A)

    def _objective_fn(
        self, X: np.ndarray, i: int, labels, cluster_centers, cluster_id, A, farthest, ml_graph, cl_graph, w
    ) -> float:
        """Computes the cost of assigning point `i` to cluster `cluster_id`: a distance term plus weighted must-link and cannot-link violation penalties."""
        sign, logdet = np.linalg.slogdet(A)
        log_det_a = sign * logdet

        if log_det_a == np.inf:  # coverage: condition was never true in tests.
            log_det_a = 0

        term_d: float = self._dist(X[i], cluster_centers[cluster_id], A) - log_det_a

        term_m: float = 0
        for j in ml_graph[i]:
            if labels[j] >= 0 and labels[j] != cluster_id:
                term_m += 2 * w * self._f_m(X, i, j, A)

        term_c: float = 0
        for k in cl_graph[i]:
            if labels[k] == cluster_id:
                # assert self._f_c(i, k, A, farthest) >= 0
                term_c += 2 * w * self._f_c(X, i, k, A, farthest)

        return term_d + term_m + term_c
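
    # Objective sketch (added note, not part of the original module): the cost of
    # assigning point x_i to cluster c under the metric A is
    #
    #   J(i, c) = ||x_i - mu_c||_A^2 - log det(A)
    #             + sum over j in ML(i) with labels[j] != c of 2 * w * f_m(i, j)
    #             + sum over k in CL(i) with labels[k] == c of 2 * w * f_c(i, k),
    #
    # where only already-assigned must-link neighbors (labels[j] >= 0) contribute.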

    def _assign_clusters(self, X, y, cluster_centers, A, farthest, ml_graph, cl_graph, w):
        """Assigns each point, in random order, to the cluster minimizing `_objective_fn`, then refills any empty cluster with the point farthest from its current center. `y` is unused."""
        labels = np.full(X.shape[0], fill_value=-1)

        index = list(range(X.shape[0]))
        np.random.shuffle(index)
        for i in index:
            labels[i] = np.argmin(
                [
                    self._objective_fn(X, i, labels, cluster_centers, cluster_id, A, farthest, ml_graph, cl_graph, w)
                    for cluster_id, _cluster_center in enumerate(cluster_centers)
                ]
            )

        # Handle empty clusters
        n_samples_in_cluster = np.bincount(labels, minlength=self.nb_clusters)
        empty_clusters = np.where(n_samples_in_cluster == 0)[0]

        for empty_cluster_id in empty_clusters:  # coverage: loop was never entered in tests.
            # Get clusters that have at least 2 points and can give one of them to another cluster
            filled_clusters = np.where(n_samples_in_cluster > 1)[0]

            # Get points from filled_clusters, and compute distance to their center
            distances_to_clusters: Dict[int, float] = {}

            for cluster_id in filled_clusters:
                available_cluster_points = np.where(labels == cluster_id)[0]
                for point in available_cluster_points:
                    distances_to_clusters[point] = self._dist(X[point], cluster_centers[cluster_id], A)

            # Fill empty clusters with the farthest points regarding their respective centers
            filling_point: int = max(distances_to_clusters, key=distances_to_clusters.get)
            labels[filling_point] = empty_cluster_id

            n_samples_in_cluster = np.bincount(labels, minlength=self.nb_clusters)

        empty_clusters = np.where(n_samples_in_cluster == 0)[0]

        if empty_clusters.size:  # coverage: condition was never true in tests.
            # print("Empty clusters")
            raise ValueError("Empty Clusters Exception")
        return labels

    def _get_cluster_centers(self, X, labels):
        """Computes each cluster center as the mean of its assigned points."""
        return np.array([X[labels == i].mean(axis=0) for i in range(self.nb_clusters)])

    # ==============================================================================
    # UPDATE METRICS
    # ==============================================================================

    def _update_metrics(self, X, labels, cluster_centers, farthest, ml_graph, cl_graph, w):
        """Updates the diagonal metric `A`: one weight per dimension, computed from the within-cluster spread and the constraint violation terms."""
        N, D = X.shape
        A = np.zeros((D, D))

        for d in range(D):
            term_x = np.sum([(x[d] - cluster_centers[labels[i], d]) ** 2 for i, x in enumerate(X)])

            term_m = 0
            for i in range(N):
                for j in ml_graph[i]:
                    if labels[i] != labels[j]:  # coverage: condition was never true in tests.
                        term_m += 1 / 2 * w * (X[i, d] - X[j, d]) ** 2

            term_c = 0
            for k in range(N):
                for m in cl_graph[k]:
                    if labels[k] == labels[m]:  # coverage: condition was never true in tests.
                        tmp = (X[farthest[0], d] - X[farthest[1], d]) ** 2 - (X[k, d] - X[m, d]) ** 2
                        term_c += w * max(tmp, 0)

            # print('term_x', term_x, 'term_m', term_m, 'term_c', term_c)

            A[d, d] = N / max(term_x + term_m + term_c, 1e-9)

        return A
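
    # Update sketch (added note, not part of the original module): the metric is
    # restricted to a diagonal matrix, so each dimension d receives the weight
    #
    #   A[d, d] = N / (sum over i of (x_{i,d} - mu_{labels[i],d})^2
    #                  + must-link violation terms + cannot-link violation terms),
    #
    # with the denominator clamped to at least 1e-9 to avoid division by zero.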