Coverage for src\cognitivefactory\interactive_clustering\utils\preprocessing.py: 100.00%

22 statements  

coverage.py v7.3.2, created at 2023-11-17 13:31 +0100

# -*- coding: utf-8 -*-

"""
* Name: cognitivefactory.interactive_clustering.utils.preprocessing
* Description: Utility methods to apply NLP preprocessing.
* Author: Erwan SCHILD
* Created: 17/03/2021
* Licence: CeCILL (https://cecill.info/licences.fr.html)
"""

# ==============================================================================
# IMPORT PYTHON DEPENDENCIES
# ==============================================================================

import unicodedata
from typing import Dict

import spacy

# from nltk.stem.snowball import SnowballStemmer


# ==============================================================================
# NLP PREPROCESSING
# ==============================================================================
def preprocess(
    dict_of_texts: Dict[str, str],
    apply_stopwords_deletion: bool = False,
    apply_parsing_filter: bool = False,
    apply_lemmatization: bool = False,
    spacy_language_model: str = "fr_core_news_md",
) -> Dict[str, str]:
    """
    A method used to preprocess texts.
    It applies simple preprocessing (lowercasing, punctuation deletion, accent removal, whitespace deletion).
    Some options are available to delete stopwords, apply lemmatization, and filter tokens according to their depth in the dependency tree.

    References:
        - _spaCy_: `Honnibal, M. and I. Montani (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.`
        - _spaCy_ language models: `https://spacy.io/usage/models`
        - _NLTK_: `Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O'Reilly Media Inc.`
        - _NLTK_ _'SnowballStemmer'_: `https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.snowball`

    Args:
        dict_of_texts (Dict[str,str]): A dictionary that contains the texts to preprocess.
        apply_stopwords_deletion (bool, optional): The option to delete stopwords. Defaults to `False`.
        apply_parsing_filter (bool, optional): The option to filter tokens based on dependency parsing results. If set, it only keeps `"ROOT"` tokens and their direct children. Defaults to `False`.
        apply_lemmatization (bool, optional): The option to lemmatize tokens. Defaults to `False`.
        spacy_language_model (str, optional): The spaCy language model to use for preprocessing. The model has to be installed. Defaults to `"fr_core_news_md"`.

    Raises:
        ValueError: Raises error if the `spacy_language_model` is not installed.

    Returns:
        Dict[str,str]: A dictionary that contains the preprocessed texts.

    Example:
        ```python
        # Import.
        from cognitivefactory.interactive_clustering.utils.preprocessing import preprocess

        # Define data.
        dict_of_texts = {
            "0": "Comment signaler une perte de carte de paiement ?",
            "1": "Quelle est la procédure pour chercher une carte de crédit avalée ?",
            "2": "Ma carte Visa a un plafond de paiment trop bas, puis-je l'augmenter ?",
        }

        # Apply preprocessing.
        dict_of_preprocessed_texts = preprocess(
            dict_of_texts=dict_of_texts,
            apply_stopwords_deletion=True,
            apply_parsing_filter=False,
            apply_lemmatization=False,
            spacy_language_model="fr_core_news_md",
        )

        # Print results.
        print("Expected results", ":", {
            "0": "signaler perte carte paiement",
            "1": "procedure chercher carte credit avalee",
            "2": "carte visa plafond paiment l augmenter",
        })
        print("Computed results", ":", dict_of_preprocessed_texts)
        ```
    """

    # Initialize dictionary of preprocessed texts.
    dict_of_preprocessed_texts: Dict[str, str] = {}

    # Initialize punctuation translator.
    punctuation_translator = str.maketrans(
        {
            punct: " "
            for punct in (
                ".",
                ",",
                ";",
                ":",
                "!",
                "¡",
                "?",
                "¿",
                "…",
                "•",
                "(",
                ")",
                "{",
                "}",
                "[",
                "]",
                "«",
                "»",
                "^",
                "`",
                "'",
                '"',
                "\\",
                "/",
                "|",
                "-",
                "_",
                "#",
                "&",
                "~",
                "@",
            )
        }
    )
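    # NB: `str.maketrans` accepts a dict mapping characters to their replacements, so the
    # `.translate(...)` call below replaces every punctuation mark listed above with a space.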

    # Load vectorizer (spacy language model).
    try:
        spacy_nlp = spacy.load(
            name=spacy_language_model,
            disable=[
                # "morphologizer",  # Needed for lemmatization.
                # "parser",  # Needed for filtering on dependency parsing.
                # "attribute_ruler",  # Needed for POS tagging.
                # "lemmatizer",  # Needed for lemmatization.
                "ner",  # Not needed.
            ],
        )
    except OSError as err:  # `spacy_language_model` is not installed.
        raise ValueError("The `spacy_language_model` '" + str(spacy_language_model) + "' is not installed.") from err
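    # NB: the language model has to be downloaded beforehand (typically with
    # `python -m spacy download fr_core_news_md`), otherwise `spacy.load` raises `OSError`.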

    # Initialize stemmer.
    ####stemmer = SnowballStemmer(language="french")

    # For each text...
    for key, text in dict_of_texts.items():
        # Force string type.
        preprocessed_text: str = str(text)

        # Apply lowercasing.
        preprocessed_text = preprocessed_text.lower()

        # Apply punctuation deletion (before tokenization).
        preprocessed_text = preprocessed_text.translate(punctuation_translator)

        # Apply tokenization and spaCy pipeline.
        tokens = [
            token
            for token in spacy_nlp(preprocessed_text)
            if (
                # Spaces are not allowed.
                not token.is_space
            )
            and (
                # Punctuation and quotes are not allowed.
                not token.is_punct
                and not token.is_quote
            )
            and (
                # If set, stopwords are not allowed.
                (not apply_stopwords_deletion)
                or (not token.is_stop)
            )
            and (
                # If set, only tokens close to the root of the dependency tree are kept.
                (not apply_parsing_filter)
                or (len(list(token.ancestors)) <= 1)
            )
        ]
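        # NB: a token with at most one ancestor is either the sentence root (no ancestor)
        # or a direct child of the root (one ancestor), which matches the `"ROOT"` tokens
        # and direct children described in the docstring.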

        # Apply retokenization with lemmatization.
        if apply_lemmatization:
            preprocessed_text = " ".join([token.lemma_.strip() for token in tokens])

        # Apply retokenization without lemmatization.
        else:
            preprocessed_text = " ".join([token.text.strip() for token in tokens])
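        # NB: `token.lemma_` is the base form predicted by the spaCy model (a plural like
        # "cartes" is typically reduced to "carte"), while `token.text` keeps the surface form.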

        # Apply accents deletion (after lemmatization).
        preprocessed_text = "".join(
            [char for char in unicodedata.normalize("NFKD", preprocessed_text) if not unicodedata.combining(char)]
        )
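        # NB: NFKD normalization decomposes accented characters into a base letter plus a
        # combining mark (e.g. "é" -> "e" + U+0301), so dropping combining characters
        # removes the accents ("procédure" -> "procedure").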

        # Store preprocessed text.
        dict_of_preprocessed_texts[key] = preprocessed_text

    return dict_of_preprocessed_texts