Coverage for src\cognitivefactory\interactive_clustering\utils\vectorization.py: 100.00%
20 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
1# -*- coding: utf-8 -*-
3"""
4* Name: cognitivefactory.interactive_clustering.utils.vectorization
5* Description: Utility methods to apply NLP vectorization.
6* Author: Erwan SCHILD
7* Created: 17/03/2021
8* Licence: CeCILL (https://cecill.info/licences.fr.html)
9"""
11# ==============================================================================
12# IMPORT PYTHON DEPENDENCIES
13# ==============================================================================
15from typing import Dict
17import spacy
18from scipy.sparse import csr_matrix
19from sklearn.feature_extraction.text import TfidfVectorizer
22# ==============================================================================
23# NLP VECTORIZATION
24# ==============================================================================
def vectorize(
    dict_of_texts: Dict[str, str],
    vectorizer_type: str = "tfidf",
    spacy_language_model: str = "fr_core_news_md",
) -> Dict[str, csr_matrix]:
    """
    Vectorize a dictionary of texts.

    Several vectorizers are available: TFIDF (scikit-learn) or a spaCy language model.
    The returned dictionary has the same keys as `dict_of_texts`.

    References:
        - _Scikit-learn_: `Pedregosa, F., G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R.Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, et E. Duchesnay (2011). Scikit-learn : Machine Learning in Python. Journal of Machine Learning Research 12, 2825–2830.`
        - _Scikit-learn_ _'TfidfVectorizer'_: `https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html`
        - _spaCy_: `Honnibal, M. et I. Montani (2017). spaCy 2 : Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.`
        - _spaCy_ language models: `https://spacy.io/usage/models`

    Args:
        dict_of_texts (Dict[str,str]): A dictionary that contains the texts to vectorize. Each value is converted with `str()` before vectorization.
        vectorizer_type (str, optional): The vectorizer type to use. The type can be `"tfidf"` or `"spacy"`. Defaults to `"tfidf"`.
        spacy_language_model (str, optional): The spaCy language model to use if vectorizer is spacy. Ignored for `"tfidf"`. Defaults to `"fr_core_news_md"`.

    Raises:
        ValueError: Raises error if `vectorizer_type` is not implemented or if the `spacy_language_model` is not installed.

    Returns:
        Dict[str, csr_matrix]: A dictionary that contains the computed vectors, one sparse row per text ID.

    Example:
        ```python
        # Import.
        from cognitivefactory.interactive_clustering.utils.vectorization import vectorize

        # Define data.
        dict_of_texts={
            "0": "comment signaler une perte de carte de paiement",
            "1": "quelle est la procedure pour chercher une carte de credit avalee",
            "2": "ma carte visa a un plafond de paiment trop bas puis je l augmenter",
        }

        # Apply vectorization.
        dict_of_vectors = vectorize(
            dict_of_texts=dict_of_texts,
            vectorizer_type="spacy",
            spacy_language_model="fr_core_news_md",
        )

        # Print results.
        print("Computed results", ":", dict_of_vectors)
        ```
    """

    ###
    ### Case of TFIDF vectorization.
    ###
    if vectorizer_type == "tfidf":
        # Initialize vectorizer: word n-grams (1 to 3), keeping terms found in at least 2 texts.
        # Disabled alternative settings, kept for reference:
        #   min_df=0.0, max_df=0.95, max_features=20000,
        #   ngram_range=(1, 5), analyzer="char_wb", sublinear_tf=True,
        vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 3),
            min_df=2,
        )

        # Apply vectorization on all texts at once (rows follow the dictionary order).
        tfidf_vectorization: csr_matrix = vectorizer.fit_transform([str(text) for text in dict_of_texts.values()])

        # Map each text ID back to its row of the TFIDF matrix.
        return {data_ID: tfidf_vectorization[i] for i, data_ID in enumerate(dict_of_texts)}

    ###
    ### Case of SPACY vectorization.
    ###
    if vectorizer_type == "spacy":
        # Load vectorizer (spaCy language model).
        # Only the document vector is read below, so the other pipeline components are disabled.
        try:
            spacy_nlp = spacy.load(
                name=spacy_language_model,
                disable=[
                    "morphologizer",
                    "parser",
                    "attribute_ruler",
                    "lemmatizer",
                    "ner",
                ],
            )
        except OSError as err:  # `spacy_language_model` is not installed.
            raise ValueError(f"The `spacy_language_model` '{spacy_language_model!s}' is not installed.") from err

        # Apply vectorization: one sparse row per text, built from the spaCy document vector.
        return {data_ID: csr_matrix(spacy_nlp(str(text)).vector) for data_ID, text in dict_of_texts.items()}

    ###
    ### Other case : Raise a `ValueError`.
    ###
    raise ValueError(f"The `vectorizer_type` '{vectorizer_type!s}' is not implemented.")