Coverage for tests\test_usage_docs.py: 100.00%

35 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

# -*- coding: utf-8 -*-

"""
* Name: interactive-clustering/tests/test_usage_docs.py
* Description: Unittests for the documentation.
* Author: Erwan SCHILD
* Created: 17/03/2021
* Licence: CeCILL (https://cecill.info/licences.fr.html)
"""


# ==============================================================================
# test_docs_usage :
# ==============================================================================
def test_docs_usage():
    """
    Test the `usage` documentation: run the end-to-end workflow described in the
    docs (preprocess, vectorize, create a constraints manager, cluster, sample
    constraints, annotate them, then re-cluster) and check each step completes.
    """

    # Import dependencies.
    from cognitivefactory.interactive_clustering.clustering.factory import (  # noqa: C0415 (not top level import, it's fine)
        clustering_factory,
    )
    from cognitivefactory.interactive_clustering.constraints.factory import (  # noqa: C0415 (not top level import, it's fine)
        managing_factory,
    )
    from cognitivefactory.interactive_clustering.sampling.factory import (  # noqa: C0415 (not top level import, it's fine)
        sampling_factory,
    )
    from cognitivefactory.interactive_clustering.utils.preprocessing import (  # noqa: C0415 (not top level import, it's fine)
        preprocess,
    )
    from cognitivefactory.interactive_clustering.utils.vectorization import (  # noqa: C0415 (not top level import, it's fine)
        vectorize,
    )

    ###
    ### Initialization step (iteration `0`)
    ###

    # Define dictionary of texts.
    dict_of_texts = {
        "0": "This is my first question.",
        "1": "This is my second item.",
        "2": "This is my third query.",
        "3": "This is my fourth issue.",
        # ...
        "N": "This is my last request.",
    }

    # Preprocess data.
    dict_of_preprocess_texts = preprocess(
        dict_of_texts=dict_of_texts,
        spacy_language_model="fr_core_news_md",
    )  # Apply simple preprocessing. Spacy language model has to be installed. Other parameters are available.

    # Vectorize data.
    dict_of_vectors = vectorize(
        dict_of_texts=dict_of_preprocess_texts,
        vectorizer_type="tfidf",
    )  # Apply TF-IDF vectorization. Other parameters are available.

    # Create an instance of binary constraints manager.
    constraints_manager = managing_factory(
        manager="binary",
        list_of_data_IDs=list(dict_of_texts.keys()),
    )
    assert constraints_manager

    # Create an instance of constrained COP-kmeans clustering.
    clustering_model = clustering_factory(
        algorithm="kmeans",
        random_seed=1,
    )  # Other clustering algorithms are available.
    assert clustering_model

    # Run clustering.
    clustering_result = clustering_model.cluster(
        constraints_manager=constraints_manager,
        vectors=dict_of_vectors,
        nb_clusters=2,
    )
    assert clustering_result

    ###
    ### Iteration step (iteration `N`)
    ###

    # Check if all constraints are already annotated.
    is_finish = constraints_manager.check_completude_of_constraints()

    # Print result
    if is_finish:  # pragma: no cover
        print("All possible constraints are annotated. No more iteration can be run.")
        # break

    # Create an instance of random sampler.
    sampler = sampling_factory(
        algorithm="random",
        random_seed=None,
    )  # Other algorithms are available.

    # Sample constraints to annotate.
    selection = sampler.sample(
        constraints_manager=constraints_manager,
        nb_to_select=3,
        # clustering_result=clustering_result,  # Results from iteration `N-1`.
        # vectors=dict_of_vectors,
    )
    assert len(selection) == 3

    # Annotate constraints (manual operation).
    # NOTE: `None` is included on purpose to also exercise the unannotated case below.
    ANNOTATIONS = ["MUST_LINK", "CANNOT_LINK", None]
    list_of_annotation = [
        (data_ID1, data_ID2, ANNOTATIONS[i]) for i, (data_ID1, data_ID2) in enumerate(selection)
    ]  # List of triplets with format `(data_ID1, data_ID2, annotation_type)` where `annotation_type` can be "MUST_LINK", "CANNOT_LINK" or None.

    for annotation in list_of_annotation:
        # Get the annotation
        data_ID1, data_ID2, constraint_type = annotation

        # Add constraints
        try:
            constraints_manager.add_constraint(data_ID1=data_ID1, data_ID2=data_ID2, constraint_type=constraint_type)
        except ValueError as err:
            print(
                err
            )  # An error can occur if parameters are incorrect or if annotation is incompatible with previous annotation.

    # Get min and max range of clusters based on constraints.
    min_n, max_n = constraints_manager.get_min_and_max_number_of_clusters()

    # Choose the number of cluster.
    nb_clusters = int((min_n + max_n) / 2)  # or manual selection.

    # Create an instance of constrained COP-kmeans clustering.
    clustering_model = clustering_factory(
        algorithm="kmeans",
        random_seed=1,
    )  # Other clustering algorithms are available.
    assert clustering_model

    # Run clustering.
    clustering_result = clustering_model.cluster(
        constraints_manager=constraints_manager,  # Annotation since iteration `0`.
        nb_clusters=nb_clusters,
        vectors=dict_of_vectors,
    )  # Clustering results are corrected since the previous iteration.
    assert clustering_result