Skip to content

Usage

Import dependencies.

from cognitivefactory.interactive_clustering.utils.preprocessing import preprocess
from cognitivefactory.interactive_clustering.utils.vectorization import vectorize
from cognitivefactory.interactive_clustering.constraints.factory import managing_factory
from cognitivefactory.interactive_clustering.clustering.factory import clustering_factory
from cognitivefactory.interactive_clustering.sampling.factory import sampling_factory

Initialization step (iteration 0)

Get data.

# Define the dictionary of texts: each key is a data ID (a string) and each value is the raw text.
# The keys of this dictionary are later given to the constraints manager as the list of data IDs.
dict_of_texts = {
    "0": "This is my first question.",
    "1": "This is my second item.",
    "2": "This is my third query.",
    "3": "This is my fourth issue.",
    # ... (one entry per text)
    "N": "This is my last request.",
}

Preprocess data.

# Preprocess data with a simple preprocessing. Other parameters are available.
# The spaCy language model named here has to be installed.
# NOTE(review): the sample texts are in English while "fr_core_news_md" looks like
# a French spaCy model -- confirm the model matches the language of your data.
spacy_language_model = "fr_core_news_md"
dict_of_preprocess_texts = preprocess(
    dict_of_texts=dict_of_texts,
    spacy_language_model=spacy_language_model,
)

Vectorize data.

# Vectorize the preprocessed texts with TF-IDF. Other parameters are available.
dict_of_vectors = vectorize(dict_of_texts=dict_of_preprocess_texts, vectorizer_type="tfidf")

Initialize constraints manager.

# Create an instance of binary constraints manager.
# The data IDs it manages are the keys of the dictionary of texts.
constraints_manager = managing_factory(
    manager="binary",
    list_of_data_IDs=list(dict_of_texts),
)

Apply first clustering without constraints.

# Create an instance of constrained COP-kmeans clustering, with a fixed random seed.
# Other clustering algorithms are available.
clustering_model = clustering_factory(algorithm="kmeans", random_seed=1)

# Run clustering into 2 clusters.
# No constraint has been added to the constraints manager at this point.
clustering_result = clustering_model.cluster(
    vectors=dict_of_vectors,
    nb_clusters=2,
    constraints_manager=constraints_manager,
)

Iteration step (iteration N)

Check if all possible constraints are annotated.

# Ask the constraints manager whether all possible constraints are already annotated.
is_finish = constraints_manager.check_completude_of_constraints()

# Print the result: when everything is annotated, there is nothing left to iterate on.
if is_finish:
    print("All possible constraints are annotated. No more iteration can be run.")
    # break  # Only meaningful if this iteration step is run inside a loop.

Sample constraints to annotate.

# Create an instance of random sampler, without a fixed random seed.
# Other algorithms are available.
sampler = sampling_factory(algorithm="random", random_seed=None)

# Sample 3 constraints to annotate.
selection = sampler.sample(
    nb_to_select=3,
    constraints_manager=constraints_manager,
    # clustering_result=clustering_result,  # Results from iteration `N-1`.
    # vectors=dict_of_vectors,
)

Annotate constraints (manual operation).

# TODO: Use a graphical interface for interactive clustering.
# WIP: Project `interactive-clustering-gui`.

# List of annotations, to fill in by hand (manual operation).
# Each item is a triplet with format `(data_ID1, data_ID2, annotation_type)`,
# where `annotation_type` can be "MUST_LINK" or "CANNOT_LINK".
list_of_annotation = []

Update constraints manager.

# Add each annotation, a `(data_ID1, data_ID2, constraint_type)` triplet, to the constraints manager.
for data_ID1, data_ID2, constraint_type in list_of_annotation:
    try:
        constraints_manager.add_constraint(
            data_ID1=data_ID1,
            data_ID2=data_ID2,
            constraint_type=constraint_type,
        )
    except ValueError as err:
        # An error can occur if parameters are incorrect or if the annotation is
        # incompatible with a previous annotation: report it and go on with the next one.
        print(err)

Determine the range of possible numbers of clusters.

# Get the minimum and maximum number of clusters, based on the constraints.
min_n, max_n = constraints_manager.get_min_and_max_number_of_clusters()

# Choose the number of clusters: here the midpoint of the range, rounded down
# (a manual selection is also possible).
# Floor division keeps the computation in integers instead of going through a float.
# NOTE(review): assumes `min_n` and `max_n` are non-negative integers -- confirm.
nb_clusters = (min_n + max_n) // 2

Run constrained clustering.

# Create an instance of constrained COP-kmeans clustering, with a fixed random seed.
# Other clustering algorithms are available.
clustering_model = clustering_factory(algorithm="kmeans", random_seed=1)

# Run clustering with the chosen number of clusters.
# The constraints manager holds the annotations made since iteration `0`,
# so clustering results are corrected since the previous iteration.
clustering_result = clustering_model.cluster(
    vectors=dict_of_vectors,
    nb_clusters=nb_clusters,
    constraints_manager=constraints_manager,
)

Analyze clustering results (not implemented here).

# TODO: Evaluate completeness, homogeneity, v-measure, Rand index (basic, adjusted), mutual information (basic, normalized, adjusted), ...
# TODO: Plot clustering.