Coverage for src\cognitivefactory\interactive_clustering\utils\preprocessing.py: 100.00%

22 statements  

coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

# -*- coding: utf-8 -*-

"""
* Name: cognitivefactory.interactive_clustering.utils.preprocessing
* Description: Utility methods to apply NLP preprocessing.
* Author: Erwan SCHILD
* Created: 17/03/2021
* Licence: CeCILL (https://cecill.info/licences.fr.html)
"""

# ==============================================================================
# IMPORT PYTHON DEPENDENCIES
# ==============================================================================

import unicodedata
from typing import Dict

import spacy

# from nltk.stem.snowball import SnowballStemmer


# ==============================================================================
# NLP PREPROCESSING
# ==============================================================================
def preprocess(
    dict_of_texts: Dict[str, str],
    apply_stopwords_deletion: bool = False,
    apply_parsing_filter: bool = False,
    apply_lemmatization: bool = False,
    spacy_language_model: str = "fr_core_news_md",
) -> Dict[str, str]:
    """
    A method used to preprocess texts.
    It applies simple preprocessing (lowercasing, punctuation deletion, accent removal, whitespace deletion).
    Some options are available to delete stopwords, apply lemmatization, and filter tokens according to their depth in the dependency tree.

    References:
        - _spaCy_: `Honnibal, M. and I. Montani (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.`
        - _spaCy_ language models: `https://spacy.io/usage/models`
        - _NLTK_: `Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O'Reilly Media Inc.`
        - _NLTK_ _'SnowballStemmer'_: `https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.snowball`

    Args:
        dict_of_texts (Dict[str,str]): A dictionary that contains the texts to preprocess.
        apply_stopwords_deletion (bool, optional): The option to delete stopwords. Defaults to `False`.
        apply_parsing_filter (bool, optional): The option to filter tokens based on dependency parsing results. If set, it only keeps `"ROOT"` tokens and their direct children. Defaults to `False`.
        apply_lemmatization (bool, optional): The option to lemmatize tokens. Defaults to `False`.
        spacy_language_model (str, optional): The spaCy language model to use for preprocessing. The model has to be installed. Defaults to `"fr_core_news_md"`.

    Raises:
        ValueError: Raises error if the `spacy_language_model` is not installed.

    Returns:
        Dict[str,str]: A dictionary that contains the preprocessed texts.

    Example:
        ```python
        # Import.
        from cognitivefactory.interactive_clustering.utils.preprocessing import preprocess

        # Define data.
        dict_of_texts = {
            "0": "Comment signaler une perte de carte de paiement ?",
            "1": "Quelle est la procédure pour chercher une carte de crédit avalée ?",
            "2": "Ma carte Visa a un plafond de paiment trop bas, puis-je l'augmenter ?",
        }

        # Apply preprocessing.
        dict_of_preprocessed_texts = preprocess(
            dict_of_texts=dict_of_texts,
            apply_stopwords_deletion=True,
            apply_parsing_filter=False,
            apply_lemmatization=False,
            spacy_language_model="fr_core_news_md",
        )

        # Print results.
        print("Expected results", ":", {
            "0": "signaler perte carte paiement",
            "1": "procedure chercher carte credit avalee",
            "2": "carte visa plafond paiment l augmenter",
        })
        print("Computed results", ":", dict_of_preprocessed_texts)
        ```
    """

    # Initialize dictionary of preprocessed texts.
    dict_of_preprocessed_texts: Dict[str, str] = {}

    # Initialize punctuation translator.
    punctuation_translator = str.maketrans(
        {
            punct: " "
            for punct in (
                ".",
                ",",
                ";",
                ":",
                "!",
                "¡",
                "?",
                "¿",
                "…",
                "•",
                "(",
                ")",
                "{",
                "}",
                "[",
                "]",
                "«",
                "»",
                "^",
                "`",
                "'",
                '"',
                "\\",
                "/",
                "|",
                "-",
                "_",
                "#",
                "&",
                "~",
                "@",
            )
        }
    )
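    # NB: `str.maketrans` accepts a dict mapping characters to their replacements, so the
    # `.translate(...)` call below replaces every punctuation mark listed above with a space.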

    # Load vectorizer (spacy language model).
    try:
        spacy_nlp = spacy.load(
            name=spacy_language_model,
            disable=[
                # "morphologizer",  # Needed for lemmatization.
                # "parser",  # Needed for filtering on dependency parsing.
                # "attribute_ruler",  # Needed for POS tagging.
                # "lemmatizer",  # Needed for lemmatization.
                "ner",  # Not needed.
            ],
        )
    except OSError as err:  # `spacy_language_model` is not installed.
        raise ValueError("The `spacy_language_model` '" + str(spacy_language_model) + "' is not installed.") from err
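    # NB: the language model has to be downloaded beforehand (typically with
    # `python -m spacy download fr_core_news_md`), otherwise `spacy.load` raises `OSError`.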

    # Initialize stemmer.
    ####stemmer = SnowballStemmer(language="french")

    # For each text...
    for key, text in dict_of_texts.items():
        # Force string type.
        preprocessed_text: str = str(text)

        # Apply lowercasing.
        preprocessed_text = preprocessed_text.lower()

        # Apply punctuation deletion (before tokenization).
        preprocessed_text = preprocessed_text.translate(punctuation_translator)

        # Apply tokenization and spaCy pipeline.
        tokens = [
            token
            for token in spacy_nlp(preprocessed_text)
            if (
                # Spaces are not allowed.
                not token.is_space
            )
            and (
                # Punctuation and quotes are not allowed.
                not token.is_punct
                and not token.is_quote
            )
            and (
                # If set, stopwords are not allowed.
                (not apply_stopwords_deletion)
                or (not token.is_stop)
            )
            and (
                # If set, only tokens close to the root of the dependency tree are kept.
                (not apply_parsing_filter)
                or (len(list(token.ancestors)) <= 1)
            )
        ]
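        # NB: a token with at most one ancestor is either the sentence root (no ancestor)
        # or a direct child of the root (one ancestor), which matches the `"ROOT"` tokens
        # and direct children described in the docstring.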

        # Apply retokenization with lemmatization.
        if apply_lemmatization:
            preprocessed_text = " ".join([token.lemma_.strip() for token in tokens])

        # Apply retokenization without lemmatization.
        else:
            preprocessed_text = " ".join([token.text.strip() for token in tokens])
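        # NB: `token.lemma_` is the base form predicted by the spaCy model (a plural like
        # "cartes" is typically reduced to "carte"), while `token.text` keeps the surface form.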

        # Apply accents deletion (after lemmatization).
        preprocessed_text = "".join(
            [char for char in unicodedata.normalize("NFKD", preprocessed_text) if not unicodedata.combining(char)]
        )
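        # NB: NFKD normalization decomposes accented characters into a base letter plus a
        # combining mark (e.g. "é" -> "e" + U+0301), so dropping combining characters
        # removes the accents ("procédure" -> "procedure").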

        # Store preprocessed text.
        dict_of_preprocessed_texts[key] = preprocessed_text

    return dict_of_preprocessed_texts