Coverage for tests\test_usage_docs.py: 100.00%
35 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
1# -*- coding: utf-8 -*-
3"""
4* Name: interactive-clustering/tests/tests_docs.py
5* Description: Unittests for the documentation.
6* Author: Erwan SCHILD
7* Created: 17/03/2021
8* Licence: CeCILL (https://cecill.info/licences.fr.html)
9"""
12# ==============================================================================
13# test_docs_usage :
14# ==============================================================================
def test_docs_usage():
    """
    Test the `usage` documentation.

    Replays the end-to-end workflow described in the usage docs:
    an initialization step (preprocessing, vectorization, constraints
    manager creation, first clustering) followed by one iteration step
    (constraints sampling, manual annotation, re-clustering).
    """

    # Import dependencies.
    from cognitivefactory.interactive_clustering.clustering.factory import (  # noqa: C0415 (not top level import, it's fine)
        clustering_factory,
    )
    from cognitivefactory.interactive_clustering.constraints.factory import (  # noqa: C0415 (not top level import, it's fine)
        managing_factory,
    )
    from cognitivefactory.interactive_clustering.sampling.factory import (  # noqa: C0415 (not top level import, it's fine)
        sampling_factory,
    )
    from cognitivefactory.interactive_clustering.utils.preprocessing import (  # noqa: C0415 (not top level import, it's fine)
        preprocess,
    )
    from cognitivefactory.interactive_clustering.utils.vectorization import (  # noqa: C0415 (not top level import, it's fine)
        vectorize,
    )

    ###
    ### Initialization step (iteration `0`)
    ###

    # Define dictionary of texts.
    dict_of_texts = {
        "0": "This is my first question.",
        "1": "This is my second item.",
        "2": "This is my third query.",
        "3": "This is my fourth issue.",
        # ...
        "N": "This is my last request.",
    }

    # Preprocess data.
    dict_of_preprocess_texts = preprocess(
        dict_of_texts=dict_of_texts,
        spacy_language_model="fr_core_news_md",
    )  # Apply simple preprocessing. Spacy language model has to be installed. Other parameters are available.

    # Vectorize data.
    dict_of_vectors = vectorize(
        dict_of_texts=dict_of_preprocess_texts,
        vectorizer_type="tfidf",
    )  # Apply TF-IDF vectorization. Other parameters are available.

    # Create an instance of binary constraints manager.
    constraints_manager = managing_factory(
        manager="binary",
        list_of_data_IDs=list(dict_of_texts.keys()),
    )
    assert constraints_manager

    # Create an instance of constrained COP-kmeans clustering.
    clustering_model = clustering_factory(
        algorithm="kmeans",
        random_seed=1,
    )  # Other clustering algorithms are available.
    assert clustering_model

    # Run clustering.
    clustering_result = clustering_model.cluster(
        constraints_manager=constraints_manager,
        vectors=dict_of_vectors,
        nb_clusters=2,
    )
    assert clustering_result

    ###
    ### Iteration step (iteration `N`)
    ###

    # Check if all constraints are already annotated.
    is_finish = constraints_manager.check_completude_of_constraints()

    # Print result
    if is_finish:  # pragma: no cover
        print("All possible constraints are annotated. No more iteration can be run.")
        # break

    # Create an instance of random sampler.
    sampler = sampling_factory(
        algorithm="random",
        random_seed=None,
    )  # Other algorithms are available.

    # Sample constraints to annotated.
    selection = sampler.sample(
        constraints_manager=constraints_manager,
        nb_to_select=3,
        # clustering_result=clustering_result,  # Results from iteration `N-1`.
        # vectors=dict_of_vectors,
    )
    assert len(selection) == 3  # A length-3 selection is necessarily truthy, so no extra emptiness check is needed.

    # Annotate constraints (manual operation).
    ANNOTATIONS = ["MUST_LINK", "CANNOT_LINK", None]
    list_of_annotation = [
        (data_ID1, data_ID2, ANNOTATIONS[i]) for i, (data_ID1, data_ID2) in enumerate(selection)
    ]  # List of triplets with format `(data_ID1, data_ID2, annotation_type)` where `annotation_type` can be "MUST_LINK" or "CANNOT_LINK".

    for annotation in list_of_annotation:
        # Get the annotation
        data_ID1, data_ID2, constraint_type = annotation

        # Add constraints
        try:
            constraints_manager.add_constraint(data_ID1=data_ID1, data_ID2=data_ID2, constraint_type=constraint_type)
        except ValueError as err:
            print(
                err
            )  # An error can occur if parameters are incorrect or if annotation is incompatible with previous annotation.

    # Get min and max range of clusters based on constraints.
    min_n, max_n = constraints_manager.get_min_and_max_number_of_clusters()

    # Choose the number of cluster: midpoint of the allowed range (integer floor division avoids a float round-trip).
    nb_clusters = (min_n + max_n) // 2  # or manual selection.

    # Create an instance of constrained COP-kmeans clustering.
    clustering_model = clustering_factory(
        algorithm="kmeans",
        random_seed=1,
    )  # Other clustering algorithms are available.
    assert clustering_model

    # Run clustering.
    clustering_result = clustering_model.cluster(
        constraints_manager=constraints_manager,  # Annotation since iteration `0`.
        nb_clusters=nb_clusters,
        vectors=dict_of_vectors,
    )  # Clustering results are corrected since the previous iteration.
    assert clustering_result