Coverage for src\cognitivefactory\interactive_clustering\utils\preprocessing.py: 100.00%
22 statements
coverage.py v7.3.2, created at 2023-11-17 13:31 +0100
# -*- coding: utf-8 -*-

"""
* Name: cognitivefactory.interactive_clustering.utils.preprocessing
* Description: Utility methods to apply NLP preprocessing.
* Author: Erwan SCHILD
* Created: 17/03/2021
* Licence: CeCILL (https://cecill.info/licences.fr.html)
"""

# ==============================================================================
# IMPORT PYTHON DEPENDENCIES
# ==============================================================================

import unicodedata
from typing import Dict

import spacy

# from nltk.stem.snowball import SnowballStemmer


# ==============================================================================
# NLP PREPROCESSING
# ==============================================================================
def preprocess(
    dict_of_texts: Dict[str, str],
    apply_stopwords_deletion: bool = False,
    apply_parsing_filter: bool = False,
    apply_lemmatization: bool = False,
    spacy_language_model: str = "fr_core_news_md",
) -> Dict[str, str]:
33 """
34 A method used to preprocess texts.
35 It applies simple preprocessing (lowercasing, punctuations deletion, accents replacement, whitespace deletion).
36 Some options are available to delete stopwords, apply lemmatization, and delete tokens according to their depth in the denpendency tree.

    References:
        - _spaCy_: `Honnibal, M. and I. Montani (2017). spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing.`
        - _spaCy_ language models: `https://spacy.io/usage/models`
        - _NLTK_: `Bird, Steven, Edward Loper and Ewan Klein (2009), Natural Language Processing with Python. O'Reilly Media Inc.`
        - _NLTK_ _'SnowballStemmer'_: `https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.snowball`

    Args:
        dict_of_texts (Dict[str,str]): A dictionary that contains the texts to preprocess.
        apply_stopwords_deletion (bool, optional): The option to delete stopwords. Defaults to `False`.
        apply_parsing_filter (bool, optional): The option to filter tokens based on dependency parsing results. If set, it only keeps `"ROOT"` tokens and their direct children. Defaults to `False`.
        apply_lemmatization (bool, optional): The option to lemmatize tokens. Defaults to `False`.
        spacy_language_model (str, optional): The spaCy language model used for tokenization, stopword detection, dependency parsing and lemmatization. The model has to be installed. Defaults to `"fr_core_news_md"`.

    Raises:
        ValueError: Raised if the `spacy_language_model` is not installed.

    Returns:
        Dict[str,str]: A dictionary that contains the preprocessed texts.

    Example:
        ```python
        # Import.
        from cognitivefactory.interactive_clustering.utils.preprocessing import preprocess

        # Define data.
        dict_of_texts = {
            "0": "Comment signaler une perte de carte de paiement ?",
            "1": "Quelle est la procédure pour chercher une carte de crédit avalée ?",
            "2": "Ma carte Visa a un plafond de paiment trop bas, puis-je l'augmenter ?",
        }

        # Apply preprocessing.
        dict_of_preprocessed_texts = preprocess(
            dict_of_texts=dict_of_texts,
            apply_stopwords_deletion=True,
            apply_parsing_filter=False,
            apply_lemmatization=False,
            spacy_language_model="fr_core_news_md",
        )

        # Print results.
        print("Expected results", ":", {
            "0": "signaler perte carte paiement",
            "1": "procedure chercher carte credit avalee",
            "2": "carte visa plafond paiment l augmenter",
        })
        print("Computed results", ":", dict_of_preprocessed_texts)
        ```
    """

    # Initialize dictionary of preprocessed texts.
    dict_of_preprocessed_texts: Dict[str, str] = {}

    # Initialize punctuation translator.
    punctuation_translator = str.maketrans(
        {
            punct: " "
            for punct in (
                ".",
                ",",
                ";",
                ":",
                "!",
                "¡",
                "?",
                "¿",
                "…",
                "•",
                "(",
                ")",
                "{",
                "}",
                "[",
                "]",
                "«",
                "»",
                "^",
                "`",
                "'",
                '"',
                "\\",
                "/",
                "|",
                "-",
                "_",
                "#",
                "&",
                "~",
                "@",
            )
        }
    )

    # Load vectorizer (spacy language model).
    try:
        spacy_nlp = spacy.load(
            name=spacy_language_model,
            disable=[
                # "morphologizer",  # Needed for lemmatization.
                # "parser",  # Needed for filtering on dependency parsing.
                # "attribute_ruler",  # Needed for POS tagging.
                # "lemmatizer",  # Needed for lemmatization.
                "ner",  # Not needed.
            ],
        )
    except OSError as err:  # `spacy_language_model` is not installed.
        raise ValueError("The `spacy_language_model` '" + str(spacy_language_model) + "' is not installed.") from err
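
    # Hint: a missing model can typically be installed with `python -m spacy download fr_core_news_md`.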

    # Initialize stemmer.
    ####stemmer = SnowballStemmer(language="french")

    # For each text...
    for key, text in dict_of_texts.items():
        # Force string type.
        preprocessed_text: str = str(text)

        # Apply lowercasing.
        preprocessed_text = preprocessed_text.lower()

        # Apply punctuation deletion (before tokenization).
        preprocessed_text = preprocessed_text.translate(punctuation_translator)

        # Apply tokenization and spaCy pipeline.
        tokens = [
            token
            for token in spacy_nlp(preprocessed_text)
            if (
                # Spaces are not allowed.
                not token.is_space
            )
            and (
                # Punctuation and quotes are not allowed.
                not token.is_punct
                and not token.is_quote
            )
            and (
                # If set, stopwords are not allowed.
                (not apply_stopwords_deletion)
                or (not token.is_stop)
            )
            and (
                # If set, only keep `"ROOT"` tokens and their direct children (at most one ancestor).
                (not apply_parsing_filter)
                or (len(list(token.ancestors)) <= 1)
            )
        ]

        # Apply retokenization with lemmatization.
        if apply_lemmatization:
            preprocessed_text = " ".join([token.lemma_.strip() for token in tokens])

        # Apply retokenization without lemmatization.
        else:
            preprocessed_text = " ".join([token.text.strip() for token in tokens])
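
        # NFKD normalization decomposes accented characters into a base character plus combining marks;
        # dropping the combining marks below removes the accents (e.g. "é" becomes "e").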
        # Apply accent deletion (after lemmatization).
        preprocessed_text = "".join(
            [char for char in unicodedata.normalize("NFKD", preprocessed_text) if not unicodedata.combining(char)]
        )

        # Store preprocessed text.
        dict_of_preprocessed_texts[key] = preprocessed_text

    return dict_of_preprocessed_texts
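
The module keeps a disabled NLTK `SnowballStemmer` import and initialization (the commented-out lines above), matching the NLTK references in the docstring. Below is a minimal sketch of how stemming could be chained after `preprocess`, assuming NLTK is installed; the `apply_stemming` helper is hypothetical and not part of the module.

```python
# Hypothetical helper, not part of cognitivefactory.interactive_clustering: applies NLTK Snowball
# stemming to texts that `preprocess` has already lowercased, cleaned and space-separated.
from typing import Dict

from nltk.stem.snowball import SnowballStemmer


def apply_stemming(dict_of_preprocessed_texts: Dict[str, str], language: str = "french") -> Dict[str, str]:
    stemmer = SnowballStemmer(language=language)
    return {
        key: " ".join(stemmer.stem(token) for token in text.split())
        for key, text in dict_of_preprocessed_texts.items()
    }


# Usage sketch: stem the output of `preprocess`.
# dict_of_stemmed_texts = apply_stemming(dict_of_preprocessed_texts)
```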