textProcessing/textProcessor.py at master · GraphGrail/textProcessing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308

# coding: utf-8

from nltk.corpus import stopwords
import numpy as np
import pymorphy2
import re
import pandas as pd
import json
import enchant
import copy
import os

from .pymorphySentencePartTagToUniversalTagConverter import PymorphySentencePartTagToUniversalTagConverter

class TextProcessor:
    def __init__(self, fixMisspellings = True, add_POS = False, removeNamedEntities = True):
        self.__pymorphySentencePartTagToUniversalTagConverter = PymorphySentencePartTagToUniversalTagConverter()
        self.__morph = pymorphy2.MorphAnalyzer()
        self.__dictRu = enchant.Dict("ru_RU")
        #self.__dictEn = enchant.Dict("en_EN")

        self.__fixMisspellings = fixMisspellings
        self.__add_POS = add_POS
        self.__removeNamedEntities = removeNamedEntities

        self.__stoplist = set()
        self.__stoplist = set.union(self.__stoplist, set(stopwords.words('english')))
        self.__stoplist = set.union(self.__stoplist, set(stopwords.words('russian')))

        self.setSignificantSentenceParts(set(['NOUN', 'ADJF', 'ADJS', 'COMP', 'VERB', 'INFN', 'PRTF', 'PRTS', 'GRND', 'ADVB']))

    # convert text to list of words, removing named entities, not significant sentence parts and stopwords
    def convertDocument(self, doc, vectorsAsResult = False):
        if pd.isnull(doc):
            return []
        wordList = self.splitDocumentInListOfWords(doc)
        if self.testMode == True:
            print("After splitting:")
            print(wordList)
        self.removeNotInformativeWordsFromList(wordList)
        if self.__removeNamedEntities == True:
            namedEntitiesSet = self.getEntitiesSet(wordList)
            self.removeSpecificWordsFromList(wordList, namedEntitiesSet)
        i = 0
        res = []
        while i < len(wordList):
            if self.__baseModel == None or self.__baseModel.vector_size > 1:
                res = res + self.tryToHandleWord(wordList[i], vectorsAsResult)
            else:
                res.append(self.tryToHandleWord(wordList[i], vectorsAsResult))
            i += 1
        if self.testMode == True:
            print("After all changes:")
            print(res)
        return res
    def convertSequenceOfDocuments(self, docSeq, vectorsAsResult = False):
        res = []
        for doc in docSeq:
            res.append(self.convertDocument(doc, vectorsAsResult))
        return res
    def tryToHandleWord(self, word, vectorsAsResult = False):
        wordAndItsAlternatives = [word]
        if self.wordIsCorrect(wordAndItsAlternatives[0]) == True:
            wordAndItsAlternatives[0] = self.__morph.parse(wordAndItsAlternatives[0].lower())[0].normal_form
        elif self.__fixMisspellings == True:
            wordAndItsAlternatives = wordAndItsAlternatives + self.suggestCloseDocuments(word)
        if self.__baseModel is not None:
            res = []
            searchResult = self.searchForClosestWordInModels(wordAndItsAlternatives[0], vectorsAsResult) # search in models not normalized word, because it could be a jargon. Jargon normalizing by pymorphy would not bring word to appropriate form (probably)
            if searchResult is None: # if we did not wind word without bringing it to normal form then we try to search for normalized word in models
                originalWordNormalized = self.__morph.parse(wordAndItsAlternatives[0].lower())[0].normal_form
                searchResult = self.searchForClosestWordInModels(originalWordNormalized, vectorsAsResult)
            if searchResult is not None:
                res.append(searchResult)
                return res
            i = 1
            # Trying to find appropriate word among proposed
            while i < len(wordAndItsAlternatives):
                splittedWords = wordAndItsAlternatives[i].split(" ")
                self.removeNotInformativeWordsFromList(splittedWords)
                for splittedWord in splittedWords:
                    splittedWord = self.__morph.parse(splittedWord.lower())[0].normal_form
                    searchResult = self.searchForClosestWordInModels(splittedWord, vectorsAsResult)
                    if searchResult is not None:
                        res.append(searchResult)
                if res != []:
                    return res
                i += 1
            return res
        else:
            resultWordList = None
            if len(wordAndItsAlternatives) > 1:
                resultWordList = wordAndItsAlternatives[1].split(" ")
                self.removeNotInformativeWordsFromList(resultWordList)
            else:
                resultWordList = wordAndItsAlternatives
            i = 0
            size = len(resultWordList)
            while i < size:
                resultWordList[i] = self.__morph.parse(resultWordList[i])[0].normal_form
                if self.__add_POS == True:
                    resultWordList[i] = resultWordList[i] + "_" + self.__morph.parse(resultWordList[i])[0].tag.POS
                i += 1
            return resultWordList

    def searchForClosestWordInModels(self, word, vectorsAsResult = False):
        probablePymorphyPOS = str(self.__morph.parse(word.lower())[0].tag.POS)
        probablePOS = None
        if probablePymorphyPOS in list(self.__pymorphySentencePartTagToUniversalTagConverter.keys()): # should POS be transform?
            probablePOS = self.__pymorphySentencePartTagToUniversalTagConverter[str(self.__morph.parse(word.lower())[0].tag.POS)]
        else:
            probablePOS = probablePymorphyPOS
        copiedTags = copy.deepcopy(list(self.__significantModel_POS_Set))
        i = 0
        while i < len(copiedTags):
            if probablePOS == copiedTags[i]:
                del copiedTags[i]
                copiedTags.insert(0, probablePOS)
                break
            i += 1
        res = self._searchForWordInBaseModel(word, copiedTags, vectorsAsResult)
        if res is not None:
            return res
        if self.__supportingModels is not None:
            res = self._searchForSimilarWordInSupportingModels(word, copiedTags, vectorsAsResult)
        return res
    def _searchForWordInBaseModel(self, word, copiedTags, vectorsAsResult):
        for POS in copiedTags:
            try:
                wordWithPOS = word + "_" + POS
                res = self.__baseModel[wordWithPOS]
                if vectorsAsResult == True:
                    return res
                elif self.__add_POS == True:
                    return word + "_" + str(self.__morph.parse(word)[0].tag.POS)
                else:
                    return word
            except:
                pass
        return None
    def _searchForSimilarWordInSupportingModels(self, word, copiedTags, vectorsAsResult):
        for POS in copiedTags:
            for supportModel in self.__supportingModels:
                wordWithPOS = word + "_" + POS
                try:
                    res = self.supportModel[wordWithPOS]
                    similarWords = supportModel.similar_by_word(wordWithPOS, topn=3)
                    for similarWord in similarWords:
                        try:
                            res = self.__baseModel[similarWord[0]]
                            if vectorsAsResult == True:
                                return res
                            elif self.__add_POS == True:
                                return similarWord[0] + "_" + str(self.__morph.parse(similarWord[0])[0].tag.POS)
                            else:
                                return similarWord[0]
                        except:
                            pass
                except:
                    pass
        return None
    # get list of words from text using regular expressions
    @staticmethod
    def splitDocumentInListOfWords(doc):
        #prog = re.compile(r'[А-Яа-яA-Za-z-]{1,}')
        prog = re.compile(r'[А-Яа-я]{1,}-{0,1}[А-Яа-я]{1,}|[A-Za-z]{1,}-{0,1}[A-Za-z]{1,}')
        wordsIterator = prog.finditer(doc)
        resultList = []
        for word in wordsIterator:
            resultList.append(word.group(0))
        return resultList
    def wordIsCorrect(self, word):
        wordTitle = word.title() # make first letter capital and others make lowercase to check by enchant (it is case-sensitive)
        wordUpper = word.upper() # make all letters capital
        correct = self.__dictRu.check(wordTitle) or self.__dictRu.check(wordUpper)
        if correct == False:
            return self.checkWordByPymorphy(wordTitle)
        return correct
    def checkWordByPymorphy(self, word):
        parseResult = self.__morph.parse(word)[0]
        return TextProcessor._accordingToMethodsStackAnalizeIsCorrect(parseResult.methods_stack)
    @staticmethod
    def _accordingToMethodsStackAnalizeIsCorrect(methodStack):
        acceptedAnalizerClasses = ["DictionaryAnalyzer", "KnownSuffixAnalyzer", "KnownPrefixAnalyzer"]
        for method in methodStack:
            if method[0].__class__.__name__ not in acceptedAnalizerClasses:
                return False
        return True
    def closestWordAccordingPymorphy(self, word):
        parseResults = self.__morph.parse(word)
        for parseResult in parseResults:
            if parseResult.methods_stack[0][0].__class__.__name__ == "DictionaryAnalyzer":
                return self.__morph.parse(parseResult.methods_stack[0][1])[0].normal_form
        return None
    # return set of words which are named entities in list l
    def getEntitiesSet(self, l):
        res = set()
        for word in l:
            if self.wordIsNamedEntitie(word):
                res.add(word)
        return res
    # can return not single word but document in case of missing space between words
    def suggestCloseDocuments(self, word):
        res = []
        suggestedByEnchant = self.__dictRu.suggest(word)
        for suggested in suggestedByEnchant:
            res = res + [suggested]
        return res
    def wordIsNamedEntitie(self, word):
        if self.wordIsCorrect(word) == False:
            return False
        tag = self.__morph.parse(word)[0].tag
        namedEntitiesTags = ["Name", "Surn", "Patr", "Geox", "Orgn", "Trad"]
        for entitieTag in namedEntitiesTags:
            if entitieTag in tag:
                return True
        return False
    # removes wordsSet from list of words l
    def removeSpecificWordsFromList(self, l, words_set):
        for word in words_set:
            word = word.lower()
            i = len(l) - 1
            while i >= 0:
                if l[i].lower() == word:
                    del l[i]
                i -= 1
    def removeNotInformativeWordsFromList(self, l):
        i = len(l) - 1
        while i >= 0:
            if self.__dictRu.check(l[i]) and self.__morph.parse(l[i])[0].tag.POS not in self.__significantPymorphy_POS_Set:
                del l[i]
            i -= 1
    def getSignificantSentenceParts(self):
        return self.__significantPymorphy_POS_Set
    def setSignificantSentenceParts(self, significant_POS):
        self.__significantPymorphy_POS_Set = significant_POS
        self.__significantModel_POS_Set = set()
        for pos in significant_POS:
            if pos in list(self.__pymorphySentencePartTagToUniversalTagConverter.keys()):
                self.__significantModel_POS_Set.add(self.__pymorphySentencePartTagToUniversalTagConverter[pos])
            else:
                self.__significantModel_POS_Set.add(pos)
    def getStopList(self):
        return self.__stoplist
    def setStopList(self, stopList):
        self.__stoplist = stoplist
    def setBaseModel(self, model):
        self.__baseModel = model
    def setSupportingModels(self, supportingModels):
        self.__supportingModels = supportingModels
    def save(self, destinationFolder):
        with open(destinationFolder + "/state.json", "w") as outputFile:
            state = json.dumps([list(self.__significantPymorphy_POS_Set),
                                list(self.__stoplist),
                                self.__add_POS,
                               self.__fixMisspellings],
                               separators=(',',':'))
            outputFile.write(state)
            outputFile.close()
        if self.__baseModel != None:
            self.__baseModel.save(destinationFolder + "/" + "baseModel")
        if self.__supportingModels != None:
            with open(destinationFolder + "/numberOfSupportingModels.txt", "w") as outputFile:
                outputFile.write(str(len(self.__supportingModels)))
                i = 0
                while i < len(self.__supportingModels):
                    self.__supportingModels[i].save(destinationFolder + "/" + "supModel_" + str(i))
                    i += 1

    @staticmethod
    def load(destinationFolder):
        obj = TextProcessor()
        with open(destinationFolder + "/state.json", "r") as inputFile:
            state = json.load(inputFile)
            obj.__significantPymorphy_POS_Set = set(state[0])
            obj.__stoplist = set(state[1])
            obj.__add_POS = state[2]
            obj.__fixMisspellings = state[3]
        if os.path.exists(destinationFolder + "/" + "baseModel"):
            obj.__baseModel = gensim.models.KeyedVectors.load(destinationFolder + "/" + "baseModel")
        if os.path.exists(destinationFolder + "/numberOfSupportingModels.txt"):
            with open(destinationFolder + "/numberOfSupportingModels.txt", "r") as inputFile:
                num = int(inputFile.readline())
            obj.__supportingModels = []
            i = 0
            while i < num:
                obj.__supportingModels.append(gensim.models.KeyedVectors.load(destinationFolder + "/" + "supModel_" + str(i)))
                i += 1
        return obj

    __significantPymorphy_POS_Set = None
    __significantModel_POS_Set = None
    __stoplist = None
    __morph = None
    __dictRu = None
    __dictEn = None

    __add_POS = None
    __fixMisspellings = None
    __removeNamedEntities = None

    __pymorphySentencePartTagToUniversalTagConverter = None

    __baseModel = None
    __supportingModels = None

    testMode = False