Coverage for tests\test_usage_docs.py: 100.00%
35 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
1# -*- coding: utf-8 -*-
3"""
4* Name: interactive-clustering/tests/tests_docs.py
5* Description: Unittests for the documentation.
6* Author: Erwan SCHILD
7* Created: 17/03/2021
8* Licence: CeCILL (https://cecill.info/licences.fr.html)
9"""
12# ==============================================================================
13# test_docs_usage :
14# ==============================================================================
def test_docs_usage():
    """
    Test the `usage` documentation.

    Replays the end-to-end workflow described in the usage docs:
    an initialization step (preprocessing, vectorization, constraints
    manager creation, first clustering) followed by one iteration step
    (constraints sampling, manual annotation, re-clustering).
    """

    # Import dependencies.
    from cognitivefactory.interactive_clustering.clustering.factory import (  # noqa: C0415 (not top level import, it's fine)
        clustering_factory,
    )
    from cognitivefactory.interactive_clustering.constraints.factory import (  # noqa: C0415 (not top level import, it's fine)
        managing_factory,
    )
    from cognitivefactory.interactive_clustering.sampling.factory import (  # noqa: C0415 (not top level import, it's fine)
        sampling_factory,
    )
    from cognitivefactory.interactive_clustering.utils.preprocessing import (  # noqa: C0415 (not top level import, it's fine)
        preprocess,
    )
    from cognitivefactory.interactive_clustering.utils.vectorization import (  # noqa: C0415 (not top level import, it's fine)
        vectorize,
    )

    ###
    ### Initialization step (iteration `0`)
    ###

    # Define dictionary of texts.
    dict_of_texts = {
        "0": "This is my first question.",
        "1": "This is my second item.",
        "2": "This is my third query.",
        "3": "This is my fourth issue.",
        # ...
        "N": "This is my last request.",
    }

    # Preprocess data.
    dict_of_preprocess_texts = preprocess(
        dict_of_texts=dict_of_texts,
        spacy_language_model="fr_core_news_md",
    )  # Apply simple preprocessing. Spacy language model has to be installed. Other parameters are available.

    # Vectorize data.
    dict_of_vectors = vectorize(
        dict_of_texts=dict_of_preprocess_texts,
        vectorizer_type="tfidf",
    )  # Apply TF-IDF vectorization. Other parameters are available.

    # Create an instance of binary constraints manager.
    constraints_manager = managing_factory(
        manager="binary",
        list_of_data_IDs=list(dict_of_texts.keys()),
    )
    assert constraints_manager

    # Create an instance of constrained COP-kmeans clustering.
    clustering_model = clustering_factory(
        algorithm="kmeans",
        random_seed=1,
    )  # Other clustering algorithms are available.
    assert clustering_model

    # Run clustering.
    clustering_result = clustering_model.cluster(
        constraints_manager=constraints_manager,
        vectors=dict_of_vectors,
        nb_clusters=2,
    )
    assert clustering_result

    ###
    ### Iteration step (iteration `N`)
    ###

    # Check if all constraints are already annotated.
    is_finish = constraints_manager.check_completude_of_constraints()

    # Print result
    if is_finish:  # pragma: no cover
        print("All possible constraints are annotated. No more iteration can be run.")
        # break

    # Create an instance of random sampler.
    sampler = sampling_factory(
        algorithm="random",
        random_seed=None,
    )  # Other algorithms are available.

    # Sample constraints to annotated.
    selection = sampler.sample(
        constraints_manager=constraints_manager,
        nb_to_select=3,
        # clustering_result=clustering_result,  # Results from iteration `N-1`.
        # vectors=dict_of_vectors,
    )
    assert len(selection) == 3  # A length-3 selection is necessarily truthy, so no extra emptiness check is needed.

    # Annotate constraints (manual operation).
    ANNOTATIONS = ["MUST_LINK", "CANNOT_LINK", None]
    list_of_annotation = [
        (data_ID1, data_ID2, ANNOTATIONS[i]) for i, (data_ID1, data_ID2) in enumerate(selection)
    ]  # List of triplets with format `(data_ID1, data_ID2, annotation_type)` where `annotation_type` can be "MUST_LINK" or "CANNOT_LINK".

    for annotation in list_of_annotation:
        # Get the annotation
        data_ID1, data_ID2, constraint_type = annotation

        # Add constraints
        try:
            constraints_manager.add_constraint(data_ID1=data_ID1, data_ID2=data_ID2, constraint_type=constraint_type)
        except ValueError as err:
            print(
                err
            )  # An error can occur if parameters are incorrect or if annotation is incompatible with previous annotation.

    # Get min and max range of clusters based on constraints.
    min_n, max_n = constraints_manager.get_min_and_max_number_of_clusters()

    # Choose the number of cluster: midpoint of the allowed range (integer floor division avoids a float round-trip).
    nb_clusters = (min_n + max_n) // 2  # or manual selection.

    # Create an instance of constrained COP-kmeans clustering.
    clustering_model = clustering_factory(
        algorithm="kmeans",
        random_seed=1,
    )  # Other clustering algorithms are available.
    assert clustering_model

    # Run clustering.
    clustering_result = clustering_model.cluster(
        constraints_manager=constraints_manager,  # Annotation since iteration `0`.
        nb_clusters=nb_clusters,
        vectors=dict_of_vectors,
    )  # Clustering results are corrected since the previous iteration.
    assert clustering_result