-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch.go
More file actions
171 lines (144 loc) · 3.83 KB
/
fetch.go
File metadata and controls
171 lines (144 loc) · 3.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
package main
import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/http/cookiejar"
	"strings"
	"time"
)
// fetchOptions carries the caller-tunable settings for a fetch.
type fetchOptions struct {
	// timeout is used as the HTTP client timeout; fetchTier1 substitutes
	// 15 seconds when it is zero.
	timeout time.Duration
	// maxRetries is the total number of attempts fetchTier1 makes; zero is
	// replaced by 3.
	maxRetries int
	// verbose turns diagnostic logging on and quiet suppresses it again:
	// logVerbose prints only when verbose is set and quiet is not.
	verbose, quiet bool
	// headers holds extra request headers, each written as "Name=Value".
	headers []string
}
// fetchResult is the successful outcome of a fetch.
type fetchResult struct {
	// html is the response body converted to a string.
	html string
	// tier identifies which fetch strategy produced the result
	// (fetchTier1 sets it to 1).
	tier int
	// url is the URL the result belongs to; fetchTier1 stores the URL it
	// was asked to fetch.
	url string
}
// httpError describes a response whose HTTP status was not acceptable.
type httpError struct {
	statusCode int    // numeric HTTP status (0 when no response status applies)
	message    string // human-readable description of the failure
}

// Compile-time check that *httpError satisfies the error interface.
var _ error = (*httpError)(nil)

// Error renders the error as "HTTP <status>: <message>".
func (he *httpError) Error() string {
	code, text := he.statusCode, he.message
	return fmt.Sprintf("HTTP %d: %s", code, text)
}
// browserHeaders is the default header set applied to every tier-1 request.
// The values imitate a desktop Chrome browser performing a top-level
// navigation.
var browserHeaders = map[string]string{
	// Client identity and content negotiation.
	"User-Agent":      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
	"Accept":          "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
	"Accept-Language": "en-US,en;q=0.5",

	// Fetch metadata.
	"Sec-Fetch-Dest": "document",
	"Sec-Fetch-Mode": "navigate",
	"Sec-Fetch-Site": "none",
	"Sec-Fetch-User": "?1",

	// Connection and preference hints.
	"Connection":                "keep-alive",
	"Upgrade-Insecure-Requests": "1",
	"DNT":                       "1",
}
// logVerbose prints one formatted diagnostic line to the package-level
// stderr writer. It prints only when opts.verbose is set and opts.quiet is
// not; in every other case it does nothing. A newline is appended to format.
func logVerbose(opts *fetchOptions, format string, args ...any) {
	if !opts.verbose || opts.quiet {
		return
	}
	fmt.Fprintf(stderr, format+"\n", args...)
}
// fetch retrieves rawURL and returns the page on success. It logs the
// attempt through logVerbose and then delegates to fetchTier1, which is the
// only strategy it invokes. On failure the result is nil and the tier-1
// error is returned unchanged.
func fetch(rawURL string, opts *fetchOptions) (*fetchResult, error) {
	logVerbose(opts, "[tier1] %s → fetching", rawURL)

	page, tierErr := fetchTier1(rawURL, opts)
	if tierErr != nil {
		return nil, tierErr
	}
	return page, nil
}
// isEscalatable reports whether err is, or wraps, an *httpError whose status
// is 403 (Forbidden) or 503 (Service Unavailable). A nil error and any error
// without an *httpError in its chain yield false.
//
// errors.As is used rather than a direct type assertion so that an
// *httpError wrapped with fmt.Errorf("...: %w", err) is still recognised.
func isEscalatable(err error) bool {
	var he *httpError
	if !errors.As(err, &he) {
		return false
	}
	return he.statusCode == http.StatusForbidden ||
		he.statusCode == http.StatusServiceUnavailable
}
// fetchTier1 downloads rawURL with a plain HTTP GET, retrying transient
// failures with exponential backoff.
//
// Every call builds its own http.Client with a fresh cookie jar. The client
// timeout is opts.timeout, or 15 seconds when that is zero. opts.maxRetries
// is the total number of attempts (zero means 3). The wait before each retry
// starts at one second, doubles every time and is capped at 30 seconds.
//
// Each request carries browserHeaders, then every opts.headers entry of the
// form "Name=Value" (split on the first '='). Custom headers are applied
// last, so they override the defaults. Entries without '=' are ignored.
//
// Outcomes per attempt:
//   - 200: the body is read and returned with tier 1 and url set to rawURL.
//     A body read failure is returned immediately and is not retried.
//   - transport error, 429 or any status >= 500: remembered and retried.
//   - any other status (403, 404, ...): returned at once as *httpError.
//
// When the attempts run out, the last remembered error is returned. If no
// attempt was made at all (negative maxRetries), an *httpError with status 0
// and message "max retries exceeded" is returned instead.
func fetchTier1(rawURL string, opts *fetchOptions) (*fetchResult, error) {
	jar, err := cookiejar.New(nil)
	if err != nil {
		return nil, fmt.Errorf("cookiejar: %w", err)
	}

	timeout := opts.timeout
	if timeout == 0 {
		timeout = 15 * time.Second
	}
	client := &http.Client{
		Timeout: timeout,
		Jar:     jar,
	}

	maxRetries := opts.maxRetries
	if maxRetries == 0 {
		maxRetries = 3
	}

	var lastErr error
	backoff := time.Second
	for attempt := 1; attempt <= maxRetries; attempt++ {
		if attempt > 1 {
			logVerbose(opts, "[tier1] %s → retry %d/%d after %v", rawURL, attempt, maxRetries, backoff)
			time.Sleep(backoff)
			backoff *= 2
			if backoff > 30*time.Second {
				backoff = 30 * time.Second
			}
		}

		req, err := http.NewRequest(http.MethodGet, rawURL, nil)
		if err != nil {
			return nil, fmt.Errorf("creating request: %w", err)
		}
		for k, v := range browserHeaders {
			req.Header.Set(k, v)
		}
		// Custom headers go last so they win over the browser defaults.
		for _, h := range opts.headers {
			if name, value, ok := strings.Cut(h, "="); ok {
				req.Header.Set(name, value)
			}
		}

		resp, err := client.Do(req)
		if err != nil {
			// Transport-level failure (DNS, connect, timeout, ...): retry.
			lastErr = err
			continue
		}

		status := resp.StatusCode
		if status == http.StatusOK {
			body, err := readBody(resp)
			resp.Body.Close()
			if err != nil {
				return nil, fmt.Errorf("reading body: %w", err)
			}
			return &fetchResult{html: body, tier: 1, url: rawURL}, nil
		}

		// Every non-200 path discards the body and reports the status.
		resp.Body.Close()
		statusErr := &httpError{statusCode: status, message: http.StatusText(status)}

		// NOTE(review): 503 falls under the ">= 500" rule and is therefore
		// retried, exactly as in the previous implementation, where a later
		// "403 || 503" case could never match 503. If 503 was meant to be
		// returned immediately (as 403 is), this condition needs changing.
		if status == http.StatusTooManyRequests || status >= 500 {
			lastErr = statusErr
			continue
		}
		return nil, statusErr
	}

	if lastErr == nil {
		lastErr = &httpError{statusCode: 0, message: "max retries exceeded"}
	}
	return nil, lastErr
}
// readBody reads resp.Body until EOF and returns its contents as a string.
// It does not close the body; that is left to the caller. On a read error
// it returns the empty string together with that error.
func readBody(resp *http.Response) (string, error) {
	raw, readErr := io.ReadAll(resp.Body)
	if readErr == nil {
		return string(raw), nil
	}
	return "", readErr
}