rawdoc/crawl_test.go at main · RandomCodeSpace/rawdoc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
package main

import (
	"net/url"
	"testing"
)

// TestNormalizeURL feeds normalizeURL a table of raw URLs and checks the
// canonical form it returns. The cases expect the fragment to be dropped,
// a trailing slash to be removed, and query parameters to come back sorted
// by key.
func TestNormalizeURL(t *testing.T) {
	cases := []struct {
		input string
		want  string
	}{
		// Fragment is stripped.
		{input: "https://example.com/foo#bar", want: "https://example.com/foo"},
		// Trailing slash is stripped.
		{input: "https://example.com/foo/", want: "https://example.com/foo"},
		// Query parameters are reordered.
		{input: "https://example.com/foo?b=2&a=1", want: "https://example.com/foo?a=1&b=2"},
	}

	for _, tc := range cases {
		if got := normalizeURL(tc.input); got != tc.want {
			t.Errorf("normalizeURL(%q) = %q, want %q", tc.input, got, tc.want)
		}
	}
}

// TestIsSameDomain checks isSameDomain against a base URL on
// www.baeldung.com. The cases expect the exact host, the same host without
// the "www." prefix, and a relative path to count as the same domain, and an
// unrelated host to be rejected.
func TestIsSameDomain(t *testing.T) {
	base, err := url.Parse("https://www.baeldung.com/spring-boot")
	if err != nil {
		// Without a valid base every case below would be meaningless (and a
		// nil *url.URL would be handed to the code under test), so stop here.
		t.Fatalf("parsing base URL: %v", err)
	}

	tests := []struct {
		href string
		want bool
	}{
		{href: "https://www.baeldung.com/spring-kafka", want: true},
		{href: "https://baeldung.com/spring-kafka", want: true},
		{href: "/spring-kafka", want: true},
		{href: "https://google.com", want: false},
	}
	for _, tt := range tests {
		if got := isSameDomain(tt.href, base); got != tt.want {
			t.Errorf("isSameDomain(%q, baeldung) = %v, want %v", tt.href, got, tt.want)
		}
	}
}

// TestPathGlob runs pathMatchesGlob over a table of URL paths and glob
// patterns and compares the result with the expected match outcome. Note
// that the last case expects "/docs/*" to match a path containing further
// slashes ("/docs/api/v2").
func TestPathGlob(t *testing.T) {
	cases := []struct {
		urlPath string
		pattern string
		want    bool
	}{
		{urlPath: "/spring-kafka", pattern: "/spring-*", want: true},
		{urlPath: "/spring-boot-start", pattern: "/spring-*", want: true},
		{urlPath: "/java-collections", pattern: "/spring-*", want: false},
		{urlPath: "/docs/api/v2", pattern: "/docs/*", want: true},
	}

	for _, tc := range cases {
		if got := pathMatchesGlob(tc.urlPath, tc.pattern); got != tc.want {
			t.Errorf("pathMatchesGlob(%q, %q) = %v, want %v", tc.urlPath, tc.pattern, got, tc.want)
		}
	}
}

// TestDedup verifies the add method of the set returned by newURLSet: the
// test expects true the first time a URL is added, false when the same URL
// is added again, and true for a different URL.
func TestDedup(t *testing.T) {
	visited := newURLSet()

	// The steps run in order against the same set; the order matters because
	// the second step relies on the first having inserted page1.
	steps := []struct {
		url     string
		wantNew bool
		failMsg string
	}{
		{url: "https://example.com/page1", wantNew: true, failMsg: "first add should return true"},
		{url: "https://example.com/page1", wantNew: false, failMsg: "duplicate add should return false"},
		{url: "https://example.com/page2", wantNew: true, failMsg: "different URL should return true"},
	}

	for _, step := range steps {
		if visited.add(step.url) != step.wantNew {
			t.Error(step.failMsg)
		}
	}
}

// TestExtractLinks parses a small HTML page and checks the links that
// extractLinks returns for it relative to https://example.com/start. The
// test expects the two example.com pages to be present as absolute URLs,
// and expects links to other.com, the raw fragment-only href, and the raw
// javascript: href to be absent.
func TestExtractLinks(t *testing.T) {
	h := `<html><body>
		<a href="/page1">Page 1</a>
		<a href="https://example.com/page2">Page 2</a>
		<a href="https://other.com/nope">Other site</a>
		<a href="#fragment">Fragment only</a>
		<a href="javascript:void(0)">JS link</a>
	</body></html>`

	base, err := url.Parse("https://example.com/start")
	if err != nil {
		t.Fatalf("parsing base URL: %v", err)
	}
	doc := parseHTML(h)

	links := extractLinks(doc, base)

	// Build a set for easy lookup.
	linkSet := make(map[string]bool, len(links))
	for _, l := range links {
		linkSet[l] = true
	}

	// Should include same-domain pages.
	for _, want := range []string{
		"https://example.com/page1",
		"https://example.com/page2",
	} {
		if !linkSet[want] {
			t.Errorf("expected %s in links, got: %v", want, links)
		}
	}

	// Should exclude other domains.
	for _, l := range links {
		u, err := url.Parse(l)
		if err != nil {
			// Report instead of dereferencing the nil *url.URL that Parse
			// returns alongside an error.
			t.Errorf("extractLinks returned unparseable link %q: %v", l, err)
			continue
		}
		if u.Hostname() == "other.com" {
			t.Errorf("other.com link should be excluded, got: %s", l)
		}
	}

	// Should exclude fragment-only and javascript links.
	for _, l := range links {
		if l == "#fragment" || l == "javascript:void(0)" {
			t.Errorf("fragment/javascript link should be excluded: %s", l)
		}
	}
}