Coverage for src\cognitivefactory\interactive_clustering\utils\vectorization.py: 100.00%

20 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

1# -*- coding: utf-8 -*- 

2 

3""" 

4* Name: cognitivefactory.interactive_clustering.utils.vectorization 

5* Description: Utility methods to apply NLP vectorization. 

6* Author: Erwan SCHILD 

7* Created: 17/03/2021 

8* Licence: CeCILL (https://cecill.info/licences.fr.html) 

9""" 

10 

11# ============================================================================== 

12# IMPORT PYTHON DEPENDENCIES 

13# ============================================================================== 

14 

15from typing import Dict 

16 

17import spacy 

18from scipy.sparse import csr_matrix 

19from sklearn.feature_extraction.text import TfidfVectorizer 

20 

21 

22# ============================================================================== 

23# NLP VECTORIZATION 

24# ============================================================================== 

def vectorize(
    dict_of_texts: Dict[str, str],
    vectorizer_type: str = "tfidf",
    spacy_language_model: str = "fr_core_news_md",
) -> Dict[str, csr_matrix]:
    """
    Vectorize texts and return one sparse vector per text.
    Several vectorizers are available: TF-IDF and spaCy language models.

    References:
        - _Scikit-learn_: `Pedregosa, F., G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R.Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, et E. Duchesnay (2011). Scikit-learn : Machine Learning in Python. Journal of Machine Learning Research 12, 2825–2830.`
        - _Scikit-learn_ _'TfidfVectorizer'_: `https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html`
        - _spaCy_: `Honnibal, M. et I. Montani (2017). spaCy 2 : Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.`
        - _spaCy_ language models: `https://spacy.io/usage/models`

    Args:
        dict_of_texts (Dict[str,str]): A dictionary that contains the texts to vectorize, indexed by data ID.
        vectorizer_type (str, optional): The vectorizer type to use. The type can be `"tfidf"` or `"spacy"`. Defaults to `"tfidf"`.
        spacy_language_model (str, optional): The spaCy language model to use if vectorizer is spacy. Defaults to `"fr_core_news_md"`.

    Raises:
        ValueError: Raises error if `vectorizer_type` is not implemented or if the `spacy_language_model` is not installed.

    Returns:
        Dict[str, csr_matrix]: A dictionary that contains the computed vectors, indexed by the same data IDs as `dict_of_texts`. An empty `dict_of_texts` gives an empty dictionary.

    Example:
        ```python
        # Import.
        from cognitivefactory.interactive_clustering.utils.vectorization import vectorize

        # Define data.
        dict_of_texts={
            "0": "comment signaler une perte de carte de paiement",
            "1": "quelle est la procedure pour chercher une carte de credit avalee",
            "2": "ma carte visa a un plafond de paiment trop bas puis je l augmenter",
        }

        # Apply vectorization.
        dict_of_vectors = vectorize(
            dict_of_texts=dict_of_texts,
            vectorizer_type="spacy",
            spacy_language_model="fr_core_news_md",
        )

        # Print results.
        print("Computed results", ":", dict_of_vectors)
        ```
    """

    ###
    ### Case of TFIDF vectorization.
    ###
    if vectorizer_type == "tfidf":
        # Nothing to vectorize: return an empty result, as the spaCy case does,
        # instead of letting scikit-learn fail on an empty corpus.
        if not dict_of_texts:
            return {}

        # Initialize vectorizer: word n-grams (1 to 3 words) seen in at least two texts.
        # NOTE(review): with `min_df=2`, scikit-learn may still raise its own error
        # when no n-gram is shared by two texts - not handled here.
        vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 3),
            min_df=2,
        )

        # Apply vectorization (row `i` of the matrix corresponds to the `i`-th text).
        tfidf_vectorization: csr_matrix = vectorizer.fit_transform([str(text) for text in dict_of_texts.values()])

        # Return the dictionary of vectors: one matrix row per data ID, in dictionary order.
        return {data_ID: tfidf_vectorization[i] for i, data_ID in enumerate(dict_of_texts)}

    ###
    ### Case of SPACY vectorization.
    ###
    if vectorizer_type == "spacy":
        # Load vectorizer (spaCy language model).
        try:
            spacy_nlp = spacy.load(
                name=spacy_language_model,
                disable=[
                    "morphologizer",  # Not needed
                    "parser",  # Not needed
                    "attribute_ruler",  # Not needed
                    "lemmatizer",  # Not needed
                    "ner",  # Not needed
                ],
            )
        except OSError as err:  # `spacy_language_model` is not installed.
            raise ValueError(f"The `spacy_language_model` '{spacy_language_model}' is not installed.") from err

        # Return the dictionary of vectors: the document vector of each text, stored as a sparse matrix.
        return {data_ID: csr_matrix(spacy_nlp(str(text)).vector) for data_ID, text in dict_of_texts.items()}

    ###
    ### Other case : Raise a `ValueError`.
    ###
    raise ValueError(f"The `vectorizer_type` '{vectorizer_type}' is not implemented.")