Coverage for src\cognitivefactory\interactive_clustering\clustering\abstract.py: 100.00%

1# -*- coding: utf-8 -*-

3"""

4* Name: cognitivefactory.interactive_clustering.clustering.abstract

5* Description: The abstract class used to define constrained clustering algorithms.

6* Author: Erwan SCHILD

7* Created: 17/03/2021

8* Licence: CeCILL-C License v1.0 (https://cecill.info/licences.fr.html)

9"""

11# ==============================================================================

12# IMPORT PYTHON DEPENDENCIES

13# ==============================================================================

15from abc import ABC, abstractmethod

16from typing import Dict, Optional

18from scipy.sparse import csr_matrix

20from cognitivefactory.interactive_clustering.constraints.abstract import AbstractConstraintsManager

23# ==============================================================================

24# ABSTRACT CONSTRAINED CLUSTERING

25# ==============================================================================

class AbstractConstrainedClustering(ABC):
    """
    Base interface for constrained clustering algorithms.

    Concrete algorithms inherit from this class and must implement `cluster`.

    References:
        - Survey on Constrained Clustering : `Lampert, T., T.-B.-H. Dao, B. Lafabregue, N. Serrette, G. Forestier, B. Cremilleux, C. Vrain, et P. Gancarski (2018). Constrained distance based clustering for time-series : a comparative and experimental study. Data Mining and Knowledge Discovery 32(6), 1663–1707.`
    """

    # ==============================================================================
    # ABSTRACT METHOD - CLUSTER
    # ==============================================================================
    @abstractmethod
    def cluster(
        self,
        constraints_manager: AbstractConstraintsManager,
        vectors: Dict[str, csr_matrix],
        nb_clusters: Optional[int],
        verbose: bool = False,
        **kargs,
    ) -> Dict[str, int]:
        """
        (ABSTRACT METHOD)
        Main entry point used to cluster data; every subclass has to implement it.

        Args:
            constraints_manager (AbstractConstraintsManager): The constraints manager over data IDs, forcing the clustering to respect some conditions during computation.
            vectors (Dict[str, csr_matrix]): The vector representation of the data. Keys are the data IDs and have to refer to the data IDs managed by the `constraints_manager`. Values are the vectors of the corresponding data.
            nb_clusters (Optional[int]): The number of clusters to compute. Can be `None` if this parameter is estimated or if the algorithm doesn't need it.
            verbose (bool, optional): Enable verbose output. Defaults to `False`.
            **kargs (dict): Other parameters that can be used in the clustering.

        Raises:
            ValueError: if `vectors` and `constraints_manager` are incompatible, or if some parameters are incorrectly set.

        Returns:
            Dict[str,int]: A dictionary that contains the predicted cluster for each data ID.
        """

66# ==============================================================================

67# RENAME CLUSTERS BY ORDER

68# ==============================================================================

def rename_clusters_by_order(
    clusters: Dict[str, int],
) -> Dict[str, int]:
    """
    Renumber cluster IDs so that they follow the sorted order of data IDs.

    Data IDs are visited in sorted order; the first cluster met is renamed `0`,
    the next unseen cluster `1`, and so on. Data sharing a cluster in the input
    still share a cluster in the output.

    Args:
        clusters (Dict[str, int]): The dictionary of clusters (data ID -> cluster ID).

    Returns:
        Dict[str, int]: The dictionary of renamed clusters, with keys inserted in sorted order of data IDs.
    """

    # Visit data IDs in sorted order so that the renumbering is deterministic.
    ordered_data_ids = sorted(clusters)

    # Give each old cluster ID the next free index the first time it is seen:
    # `len(renumbering)` is evaluated before insertion, so it equals the count
    # of clusters already renamed.
    renumbering: Dict[int, int] = {}
    for data_id in ordered_data_ids:
        renumbering.setdefault(clusters[data_id], len(renumbering))

    # Rebuild the assignment with the new cluster IDs.
    return {data_id: renumbering[clusters[data_id]] for data_id in ordered_data_ids}