-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathHTMLHandler.py
More file actions
101 lines (76 loc) · 2.23 KB
/
HTMLHandler.py
File metadata and controls
101 lines (76 loc) · 2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
def allClosed(dictionary):
    """Report whether every tag counter in *dictionary* is zero.

    Args:
        dictionary: Mapping of tag name to its open-minus-close count.

    Returns:
        True when every value, converted with ``int()``, equals zero
        (an empty mapping therefore counts as closed); False otherwise.
    """
    # Test each counter individually rather than summing them: with a sum,
    # a positive and a negative counter would cancel out and be reported
    # as "closed" even though neither tag is balanced.
    return not any(int(count) for count in dictionary.values())
def isValidTag(char):
    """Return True when *char* is the ``<`` character, False otherwise."""
    opensTag = char == '<'
    return opensTag
def isSlashTag(char):
    """Return True when *char* is an apostrophe or a forward slash.

    NOTE(review): the apostrophe looks like it may have been intended as a
    backslash; confirm before changing, callers currently get True for "'".
    """
    closingMarkers = ("'", '/')
    return char in closingMarkers
def getTagName(index, source):
    """Read the tag name that follows the character at ``source[index]``.

    Args:
        index: Position of the tag's opening character in *source*.
        source: Text being scanned.

    Returns:
        A ``(tagName, up)`` tuple. ``tagName`` is the run of alphabetic
        characters after the opening character (and after the closing
        marker, if present), so a name such as ``h1`` is read as ``h``.
        ``up`` is -1 when the character right after the opening character
        satisfies ``isSlashTag``, otherwise 1.

    Raises:
        IndexError: If *source* ends before a non-alphabetic character
            terminates the name.
    """
    index += 1
    up = 1
    if isSlashTag(source[index]):
        up = -1
        index += 1
    nameStart = index
    # Index character by character (rather than slicing ahead) on purpose:
    # running off the end of the text must still raise IndexError.
    while source[index].isalpha():
        index += 1
    return (source[nameStart:index], up)
def isUniqueTag(tag):
    """Report whether *tag* names an HTML void element.

    Void elements (``<br>``, ``<img>``, ``<hr>``, ...) never have a closing
    tag, so they must not be counted as open when balancing tags.

    Args:
        tag: Tag name without angle brackets; compared case-insensitively.

    Returns:
        True when *tag* is one of the void elements listed below, False
        otherwise (including for non-string input).
    """
    # Full set of void elements from the HTML standard; with only a subset,
    # tags such as <hr> or <meta> would be treated as never closed.
    voidTags = (
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    )
    return isinstance(tag, str) and tag.lower() in voidTags
def updateTagNameValue(dictionary, key, up):
    """Apply the step *up* to the counter stored under *key*, in place.

    Args:
        dictionary: Mapping of tag name to counter; mutated by this call.
        key: Tag name whose counter is updated (created if missing).
        up: Amount to add to the counter.

    Tags for which ``isUniqueTag`` returns true contribute a step of zero,
    and the stored counter is never allowed to drop below zero.
    """
    step = 0 if isUniqueTag(key) else up
    dictionary[key] = max(0, dictionary.get(key, 0) + step)
def getScopeTag(index, source):
    """Extract the text from *index* up to the point where all tags balance.

    Scans *source* one character at a time starting at *index*. Each tag
    opening character found updates a per-tag-name counter (via
    ``getTagName`` and ``updateTagNameValue``); the scan stops as soon as
    ``allClosed`` reports that every counter is zero.

    Args:
        index: Position in *source* where the scan starts.
        source: Text being scanned.

    Returns:
        ``source`` sliced from the starting index through the first ``>``
        found at or after the position where the counters balanced. If no
        ``>`` follows, the slice stops just before that position. If the
        text runs out first, the slice stops at the position where scanning
        failed.
    """
    counters = {}
    start = index
    try:
        while True:
            if isValidTag(source[index]):
                tagName, up = getTagName(index, source)
                updateTagNameValue(counters, tagName, up)
            if allClosed(counters):
                # find() yields -1 when no '>' follows, which makes the
                # slice end at the current position.
                offset = source[index:].find('>')
                return source[start:index + 1 + offset]
            index += 1
    except IndexError:
        # Running past the end of the text is the expected way for the
        # scan to finish on unbalanced input; return what was covered.
        return source[start:index]
def extractLink(tag):
    """Return the value of the first double-quoted ``href`` in *tag*.

    Args:
        tag: Text of a tag, e.g. ``<a href="...">``.

    Returns:
        The characters between ``href="`` and the next ``"``, or None when
        *tag* contains no ``href="`` or the value has no closing quote.
    """
    attr = 'href="'
    attrPos = tag.find(attr)
    if attrPos == -1:
        return None
    valueStart = attrPos + len(attr)
    valueEnd = tag.find('"', valueStart)
    if valueEnd == -1:
        # Unterminated attribute value: report "no link" instead of
        # walking off the end of the string.
        return None
    return tag[valueStart:valueEnd]
def stripTags(value):
    """Remove every ``<...>`` span from *value*.

    Args:
        value: Text to clean. Non-string input is returned unchanged.

    Returns:
        *value* with each span from a ``<`` through the next ``>`` removed.
        A ``<`` with no ``>`` after it, and everything following it, is
        left in place.
    """
    if not isinstance(value, str):
        # Keep the pass-through for non-text input (e.g. None).
        return value
    kept = []
    pos = 0
    while True:
        lt = value.find('<', pos)
        if lt == -1:
            break
        gt = value.find('>', lt)
        if gt == -1:
            break
        kept.append(value[pos:lt])
        pos = gt + 1
    kept.append(value[pos:])
    # Join once at the end instead of rebuilding the string per tag.
    return ''.join(kept)