-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawl_test.go
More file actions
128 lines (115 loc) · 2.95 KB
/
crawl_test.go
File metadata and controls
128 lines (115 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
package main
import (
"net/url"
"testing"
)
// TestNormalizeURL runs normalizeURL over a table of raw URLs and compares
// each result with the expected canonical string. The table expects the
// fragment to be removed, a trailing slash to be removed, and query
// parameters to come back in sorted order.
func TestNormalizeURL(t *testing.T) {
	type normalizeCase struct {
		input string
		want  string
	}

	cases := []normalizeCase{
		{input: "https://example.com/foo#bar", want: "https://example.com/foo"},
		{input: "https://example.com/foo/", want: "https://example.com/foo"},
		{input: "https://example.com/foo?b=2&a=1", want: "https://example.com/foo?a=1&b=2"},
	}

	for i := range cases {
		c := cases[i]
		if got := normalizeURL(c.input); got != c.want {
			t.Errorf("normalizeURL(%q) = %q, want %q", c.input, got, c.want)
		}
	}
}
// TestIsSameDomain checks isSameDomain against a fixed base URL on
// www.baeldung.com. The table expects true for the same host, for the host
// without the "www." prefix, and for a relative path, and false for an
// unrelated host.
func TestIsSameDomain(t *testing.T) {
	base, err := url.Parse("https://www.baeldung.com/spring-boot")
	if err != nil {
		// Without a valid base every case below would be meaningless, so stop here.
		t.Fatalf("parsing base URL: %v", err)
	}

	tests := []struct {
		href string
		want bool
	}{
		{"https://www.baeldung.com/spring-kafka", true},
		{"https://baeldung.com/spring-kafka", true},
		{"/spring-kafka", true},
		{"https://google.com", false},
	}

	for _, tt := range tests {
		if got := isSameDomain(tt.href, base); got != tt.want {
			t.Errorf("isSameDomain(%q, baeldung) = %v, want %v", tt.href, got, tt.want)
		}
	}
}
// TestPathGlob feeds (path, pattern) pairs to pathMatchesGlob and compares the
// result with the expected match flag. The table covers a "/spring-*" prefix
// pattern (two matches and one miss) and a "/docs/*" pattern applied to a
// multi-segment path.
func TestPathGlob(t *testing.T) {
	cases := []struct {
		urlPath, pattern string
		want             bool
	}{
		{urlPath: "/spring-kafka", pattern: "/spring-*", want: true},
		{urlPath: "/spring-boot-start", pattern: "/spring-*", want: true},
		{urlPath: "/java-collections", pattern: "/spring-*", want: false},
		{urlPath: "/docs/api/v2", pattern: "/docs/*", want: true},
	}

	for _, c := range cases {
		matched := pathMatchesGlob(c.urlPath, c.pattern)
		if matched == c.want {
			continue
		}
		t.Errorf("pathMatchesGlob(%q, %q) = %v, want %v", c.urlPath, c.pattern, matched, c.want)
	}
}
// TestDedup exercises a single set created by newURLSet with an ordered
// sequence of add calls: a first insertion, a repeat of the same URL, and a
// different URL. The expected return values are true, false, true.
func TestDedup(t *testing.T) {
	visited := newURLSet()

	// The steps share one set, so their order is significant.
	steps := []struct {
		url     string
		want    bool
		failMsg string
	}{
		{"https://example.com/page1", true, "first add should return true"},
		{"https://example.com/page1", false, "duplicate add should return false"},
		{"https://example.com/page2", true, "different URL should return true"},
	}

	for _, step := range steps {
		if added := visited.add(step.url); added != step.want {
			t.Error(step.failMsg)
		}
	}
}
// TestExtractLinks parses a small HTML document with parseHTML, runs
// extractLinks against the base URL https://example.com/start, and checks
// that:
//   - the relative and absolute same-domain anchors appear as absolute URLs,
//   - no returned link has the host other.com,
//   - the raw fragment-only and javascript: hrefs are not returned verbatim.
func TestExtractLinks(t *testing.T) {
	h := `<html><body>
<a href="/page1">Page 1</a>
<a href="https://example.com/page2">Page 2</a>
<a href="https://other.com/nope">Other site</a>
<a href="#fragment">Fragment only</a>
<a href="javascript:void(0)">JS link</a>
</body></html>`

	base, err := url.Parse("https://example.com/start")
	if err != nil {
		// Nothing below is meaningful without a valid base URL.
		t.Fatalf("parsing base URL: %v", err)
	}

	doc := parseHTML(h)
	links := extractLinks(doc, base)

	// Build a set for easy lookup.
	linkSet := make(map[string]bool, len(links))
	for _, l := range links {
		linkSet[l] = true
	}

	// Should include same-domain pages.
	if !linkSet["https://example.com/page1"] {
		t.Errorf("expected https://example.com/page1 in links, got: %v", links)
	}
	if !linkSet["https://example.com/page2"] {
		t.Errorf("expected https://example.com/page2 in links, got: %v", links)
	}

	// Should exclude other domains.
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			// url.Parse returns a nil *URL on error; report and skip rather
			// than dereferencing it.
			t.Errorf("extracted link %q is not a parseable URL: %v", l, err)
			continue
		}
		if u.Hostname() == "other.com" {
			t.Errorf("other.com link should be excluded, got: %s", l)
		}
	}

	// Should exclude fragment-only and javascript links.
	// NOTE(review): this compares against the raw href text only; if
	// extractLinks resolves hrefs against base before returning them, these
	// exact strings may never appear — confirm the check is strict enough.
	for _, l := range links {
		if l == "#fragment" || l == "javascript:void(0)" {
			t.Errorf("fragment/javascript link should be excluded: %s", l)
		}
	}
}