Coverage for src\cognitivefactory\interactive_clustering\utils\vectorization.py: 100.00%

1# -*- coding: utf-8 -*-

3"""

4* Name: cognitivefactory.interactive_clustering.utils.vectorization

5* Description: Utility methods to apply NLP vectorization.

6* Author: Erwan SCHILD

7* Created: 17/03/2021

8* Licence: CeCILL (https://cecill.info/licences.fr.html)

9"""

11# ==============================================================================

12# IMPORT PYTHON DEPENDENCIES

13# ==============================================================================

15from typing import Dict

17import spacy

18from scipy.sparse import csr_matrix

19from sklearn.feature_extraction.text import TfidfVectorizer

22# ==============================================================================

23# NLP VECTORIZATION

24# ==============================================================================

def vectorize(
    dict_of_texts: Dict[str, str],
    vectorizer_type: str = "tfidf",
    spacy_language_model: str = "fr_core_news_md",
) -> Dict[str, csr_matrix]:
    """
    Vectorize a collection of texts.

    Several vectorizers are available: TFIDF (scikit-learn `TfidfVectorizer`) or a spaCy language model.

    References:
        - _Scikit-learn_: `Pedregosa, F., G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R.Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, et E. Duchesnay (2011). Scikit-learn : Machine Learning in Python. Journal of Machine Learning Research 12, 2825–2830.`
        - _Scikit-learn_ _'TfidfVectorizer'_: `https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html`
        - _spaCy_: `Honnibal, M. et I. Montani (2017). spaCy 2 : Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.`
        - _spaCy_ language models: `https://spacy.io/usage/models`

    Args:
        dict_of_texts (Dict[str,str]): A dictionary that contains the texts to vectorize. Each value is converted with `str()` before being vectorized.
        vectorizer_type (str, optional): The vectorizer type to use. The type can be `"tfidf"` or `"spacy"`. Defaults to `"tfidf"`.
        spacy_language_model (str, optional): The spaCy language model to use if vectorizer is spacy. Defaults to `"fr_core_news_md"`.

    Raises:
        ValueError: Raises error if `vectorizer_type` is not implemented or if the `spacy_language_model` is not installed.

    Returns:
        Dict[str, csr_matrix]: A dictionary that contains the computed vectors, with the same keys as `dict_of_texts`. An empty `dict_of_texts` gives an empty dictionary.

    Example:
        ```python
        # Import.
        from cognitivefactory.interactive_clustering.utils.vectorization import vectorize

        # Define data.
        dict_of_texts={
            "0": "comment signaler une perte de carte de paiement",
            "1": "quelle est la procedure pour chercher une carte de credit avalee",
            "2": "ma carte visa a un plafond de paiment trop bas puis je l augmenter",
        }

        # Apply vectorization.
        dict_of_vectors = vectorize(
            dict_of_texts=dict_of_texts,
            vectorizer_type="spacy",
            spacy_language_model="fr_core_news_md",
        )

        # Print results.
        print("Computed results", ":", dict_of_vectors)
        ```
    """

    ###
    ### Case of TFIDF vectorization.
    ###
    if vectorizer_type == "tfidf":
        # Nothing to vectorize: return an empty result instead of handing an
        # empty corpus to scikit-learn, so that both vectorizer types agree
        # on empty input.
        if not dict_of_texts:
            return {}

        # Initialize vectorizer.
        # NOTE(review): with `min_df=2`, scikit-learn is expected to raise its own
        # `ValueError` when no term occurs in at least two texts - confirm whether
        # callers rely on this.
        vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 3),
            min_df=2,
        )

        # Apply vectorization (rows follow the iteration order of the dictionary).
        tfidf_vectorization: csr_matrix = vectorizer.fit_transform([str(text) for text in dict_of_texts.values()])

        # Map each data ID to its row of the TFIDF matrix.
        return {data_ID: tfidf_vectorization[i] for i, data_ID in enumerate(dict_of_texts)}

    ###
    ### Case of SPACY vectorization.
    ###
    if vectorizer_type == "spacy":
        # Load vectorizer (spaCy language model).
        try:
            spacy_nlp = spacy.load(
                name=spacy_language_model,
                disable=[
                    "morphologizer",  # Not needed
                    "parser",  # Not needed
                    "attribute_ruler",  # Not needed
                    "lemmatizer",  # Not needed
                    "ner",  # Not needed
                ],
            )
        except OSError as err:  # `spacy_language_model` is not installed.
            raise ValueError(f"The `spacy_language_model` '{spacy_language_model}' is not installed.") from err

        # Apply vectorization: one document vector per text, stored as a sparse matrix.
        return {data_ID: csr_matrix(spacy_nlp(str(text)).vector) for data_ID, text in dict_of_texts.items()}

    ###
    ### Other case : Raise a `ValueError`.
    ###
    raise ValueError(f"The `vectorizer_type` '{vectorizer_type}' is not implemented.")