textProcessingEngModel/documentVectorizer.py at master · GraphGrail/textProcessingEngModel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

# coding: utf-8

import json
import os
import threading

import psutil
from polyglot.detect import Detector

from .offlineWordTranslator.offlineWordTranslator import OfflineWordTranslator
from .onlineMultilanguageTranslator.onlineMultilanguageTranslator import OnlineMultilanguageTranslator
from .textPreprocessorForConcreteLanguage.eng.textPreprocessorEng import TextPreprocessorEng
from .textPreprocessorForConcreteLanguage.rus.textPreprocessorRus import TextPreprocessorRus

class DocumentVectorizer:
    """Convert text documents into lists of word vectors.

    A document is split into English words, either by translating the whole
    text online or by preprocessing it and translating word by word with an
    offline dictionary. Each word is then passed to the converter installed
    with ``setWordToVecConverter``; words for which the converter returns
    ``None`` are dropped.

    Supported offline languages are the keys of the internal language table:
    "English" and "Russian".
    """

    def __init__(self):
        """Build the English and Russian language systems and the online translator."""
        langSys = {"preprocessor": TextPreprocessorEng()}
        self.__languageSystems = {"English": langSys}

        pathToDictionary = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                        "offlineWordTranslator",
                                        "dictionaries",
                                        "custom",
                                        "rusToEngExtendedDict.json")
        # JSON is UTF-8 by specification; be explicit so that loading the
        # dictionary does not depend on the platform's locale encoding.
        with open(pathToDictionary, "r", encoding="utf-8") as fp:
            rusToEngDictionary = json.load(fp)
        offlineWordTranslator = OfflineWordTranslator(rusToEngDictionary)
        langSys = {"preprocessor": TextPreprocessorRus(),
                   "offlineWordTranslator": offlineWordTranslator}
        self.__languageSystems["Russian"] = langSys

        self.__onlineMultilanguageTranslator = OnlineMultilanguageTranslator()

        self.__normalize = False
        self.__fixMisspellings = False
        self.__removeUnsignificantSentenceParts = True
        self.__removeNamedEntities = True

        # When true, vectorizeSequenceOfDocuments prints its progress.
        self.testMode = False

    # lang: "Russian", "English"
    def vectorizeDocument(self, doc, lang = None, useOfflineTranslation = True, useOnlineTranslation = False):
        """Return the list of word vectors for one document.

        Args:
            doc: text of the document.
            lang: "Russian", "English", or None to detect the language
                automatically (only used by the offline path).
            useOfflineTranslation: allow the offline dictionary path.
            useOnlineTranslation: try the online translator first.

        Returns:
            A list with the converter's vector for every word that it could
            convert, or None when no translation path produced a word list
            (online translation failed or no path was enabled).

        Raises:
            ValueError: if the offline path is used with an unsupported language.
        """
        translatedWords = None

        if useOnlineTranslation:
            # First online try with a limited number of attempts.
            translatedWords = self._onlineWordSeparationAndTranslation(doc, maxNumberOfAttempts = 4)

        if translatedWords is None:
            if useOfflineTranslation:
                translatedWords = self._offlineWordSeparationAndTranslation(doc, lang)
            elif useOnlineTranslation:
                # No offline fallback is allowed, so retry online. The
                # original condition also called len() on the None result
                # here and therefore always raised TypeError.
                # maxNumberOfAttempts=None presumably lifts the attempt
                # limit -- TODO confirm against OnlineMultilanguageTranslator.
                translatedWords = self._onlineWordSeparationAndTranslation(doc, maxNumberOfAttempts=None)

        if translatedWords is None:
            return None

        res = []
        for word in translatedWords:
            vector = self.__wordToVecConverter.convert(word)
            if vector is not None:
                res.append(vector)
        return res

    def vectorizeSequenceOfDocuments(self, docSeq, lang = None, useOfflineTranslation = True, useOnlineTranslation = False):
        """Vectorize every document of ``docSeq`` using several threads.

        Worker threads repeatedly claim the next chunk of ``dataPieceLength``
        documents and store ``vectorizeDocument`` results at the matching
        positions of the returned list.

        Note: an exception raised inside a worker is not propagated by
        ``Thread.join``; the entries that worker did not finish stay None.

        Args:
            docSeq: indexable sequence of document texts.
            lang, useOfflineTranslation, useOnlineTranslation: forwarded to
                ``vectorizeDocument`` for every document.

        Returns:
            A list of the same length as ``docSeq`` holding the per-document
            results.
        """
        res = [None] * len(docSeq)

        curPos = 0
        curPosMutex = threading.Lock()

        numberOfThreads = 4
        try:
            physicalCores = psutil.cpu_count(logical=False)
        except Exception:
            # Best effort: keep the default when the core count is unavailable.
            physicalCores = None
        # cpu_count may return None when the count cannot be determined.
        if physicalCores is not None and physicalCores >= 1:
            numberOfThreads = physicalCores

        dataPieceLength = 10

        def vectorizeDocumentsInOneThread():
            nonlocal curPos

            while True:
                # Claim the next chunk; the lock is released even if the
                # progress output fails.
                with curPosMutex:
                    beginIndex = curPos
                    curPos += dataPieceLength
                    if self.testMode:
                        print("Current position of parsed document: " + str(curPos))

                if beginIndex >= len(docSeq):
                    break

                endIndex = min(beginIndex + dataPieceLength, len(docSeq))
                for i in range(beginIndex, endIndex):
                    res[i] = self.vectorizeDocument(docSeq[i],
                                                    lang,
                                                    useOfflineTranslation,
                                                    useOnlineTranslation)

        threadList = [threading.Thread(target=vectorizeDocumentsInOneThread, args=())
                      for _ in range(numberOfThreads)]

        for thr in threadList:
            thr.start()

        for thr in threadList:
            thr.join()

        return res

    # Setters and getters:

    def setWordToVecConverter(self, converter):
        """Install the object whose ``convert(word)`` yields a vector or None."""
        self.__wordToVecConverter = converter

    def getWordToVecConverter(self):
        """Return the currently installed word-to-vector converter (or None)."""
        return self.__wordToVecConverter

    def setPreprocessingParameters(self, normalize, fixMisspellings, removeUnsignificantSentenceParts, removeNamedEntities):
        """Set the flags passed to the preprocessors on the offline path."""
        self.__normalize = normalize
        self.__fixMisspellings = fixMisspellings
        self.__removeUnsignificantSentenceParts = removeUnsignificantSentenceParts
        self.__removeNamedEntities = removeNamedEntities

    # Private functions:

    def _offlineWordSeparationAndTranslation(self, doc, lang):
        """Split ``doc`` into English words without using the online translator.

        English documents are only preprocessed. Documents in another
        supported language are preprocessed, translated word by word with
        that language's offline translator, and each translation is split
        again by the English preprocessor. Words without a translation are
        skipped.

        Raises:
            ValueError: if ``lang`` (given or detected) is not supported.
        """
        # if lang is None detect language of doc automatically
        if lang is None:
            try:
                lang = Detector(doc).language.name
            except Exception:
                # Detection is best effort; fall back to English.
                lang = "English"

        if lang not in self.__languageSystems:
            raise ValueError("Unsupported language of document")

        langSys = self.__languageSystems[lang]
        if lang == "English":
            return langSys["preprocessor"].prepareDocument(
                doc,
                normalize=self.__normalize,
                fixMisspellings=self.__fixMisspellings,
                removeUnsignificantSentenceParts=self.__removeUnsignificantSentenceParts,
                removeNamedEntities=self.__removeNamedEntities)

        # Non-English text: normalization is always switched on here,
        # presumably so that words match the dictionary entries -- TODO confirm.
        wordList = langSys["preprocessor"].prepareDocument(
            doc,
            normalize=True,
            fixMisspellings=self.__fixMisspellings,
            removeUnsignificantSentenceParts=self.__removeUnsignificantSentenceParts,
            removeNamedEntities=self.__removeNamedEntities)

        wordTranslator = langSys["offlineWordTranslator"]
        engPreprocessor = self.__languageSystems["English"]["preprocessor"]
        translatedWords = []
        for word in wordList:
            translation = wordTranslator.translate(word)
            if translation is not None:
                # A translation may consist of several words; split it.
                translatedWords += engPreprocessor.prepareDocument(
                    translation,
                    normalize = False,
                    fixMisspellings = False,
                    removeUnsignificantSentenceParts = False,
                    removeNamedEntities = False)
        return translatedWords

    def _onlineWordSeparationAndTranslation(self, doc, maxNumberOfAttempts):
        """Translate ``doc`` to English online and split it into words.

        Returns:
            The word list produced by the English preprocessor, or None if
            the online translator returned None.
        """
        translation = self.__onlineMultilanguageTranslator.translate(doc, dest="en", maxNumberOfAttempts=maxNumberOfAttempts)
        # Online translation did not succeed
        if translation is None:
            return None
        return self.__languageSystems["English"]["preprocessor"].prepareDocument(
            translation,
            normalize = False,
            fixMisspellings = False,
            removeUnsignificantSentenceParts = True,
            removeNamedEntities = False)

    # Fields (class-level defaults, overridden per instance):

    __wordToVecConverter = None
    __languageSystems = None
    __onlineMultilanguageTranslator = None

    __normalize = None
    __fixMisspellings = None
    __removeUnsignificantSentenceParts = None
    __removeNamedEntities = None

    # Progress output of vectorizeSequenceOfDocuments is off by default.
    testMode = False