-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.go
More file actions
441 lines (384 loc) · 11.7 KB
/
main.go
File metadata and controls
441 lines (384 loc) · 11.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
package main
import (
"encoding/json"
"flag"
"fmt"
"io"
"net/url"
"os"
"strings"
"time"
"golang.org/x/net/html"
)
// version is the release string that main prints (prefixed with "rawdoc ")
// when the -version flag is given.
var version = "0.1.0"
// config collects every command-line option that main registers with the flag
// package. Fields are grouped by the area of the tool they control.
type config struct {
	// Output: destination file or directory and the output format name
	// (markdown, text, json or yaml), followed by the switches that restrict
	// output to code blocks and that strip link URLs while keeping their text.
	output, format    string
	codeOnly, noLinks bool

	// Crawling: crawl depth (0 means a single page), number of parallel
	// fetches and the page limit; the delay between requests; URL path globs
	// to include and exclude; and whether sitemap.xml is parsed for discovery.
	depth, concurrency, maxPages int
	delay                        time.Duration
	include, exclude             string
	sitemap                      bool

	// HTTP: per-request timeout and total runtime ceiling, per-URL retry
	// count, and the extra headers collected from the repeatable -header flag.
	timeout, maxTime time.Duration
	maxRetries       int
	headers          headerFlags

	// Info: verbose logging to stderr, and suppression of stderr output.
	verbose, quiet bool
}
// headerFlags accumulates the values of a repeatable command-line flag. Its
// String and Set methods let a *headerFlags be registered with flag.Var.
type headerFlags []string

// String renders the collected values as a single comma-separated list.
func (h *headerFlags) String() string {
	values := []string(*h)
	return strings.Join(values, ", ")
}

// Set records one more occurrence of the flag. It never fails.
func (h *headerFlags) Set(value string) error {
	updated := append(*h, value)
	*h = updated
	return nil
}
// stderr is the stream that runSingle writes its verbose [stats], [data] and
// [output] lines to.
var stderr = os.Stderr
// main registers the command-line flags, parses them, validates the target URL
// and the output format, and then hands off to run (or to serveMCP when -serve
// is given).
//
// Exit codes: 0 after -version or -serve, 2 for usage errors (missing or
// invalid URL, unknown format), and 1 when run returns an error.
func main() {
	cfg := new(config)

	// Each helper registers one option under every name it is given, so short
	// and long spellings share the same variable, default and help text.
	stringOpt := func(dst *string, def, usage string, names ...string) {
		for _, name := range names {
			flag.StringVar(dst, name, def, usage)
		}
	}
	intOpt := func(dst *int, def int, usage string, names ...string) {
		for _, name := range names {
			flag.IntVar(dst, name, def, usage)
		}
	}
	boolOpt := func(dst *bool, usage string, names ...string) {
		for _, name := range names {
			flag.BoolVar(dst, name, false, usage)
		}
	}

	stringOpt(&cfg.output, "", "Output file or directory", "o", "output")
	stringOpt(&cfg.format, "markdown", "Output format: markdown|text|json|yaml", "f", "format")
	boolOpt(&cfg.codeOnly, "Extract only code blocks", "code-only")
	boolOpt(&cfg.noLinks, "Strip link URLs, keep text only", "no-links")

	intOpt(&cfg.depth, 0, "Crawl depth, 0 = single page", "d", "depth")
	intOpt(&cfg.concurrency, 5, "Parallel fetches", "c", "concurrency")
	intOpt(&cfg.maxPages, 50, "Page limit for crawling", "max-pages")
	flag.DurationVar(&cfg.delay, "delay", 1*time.Second, "Delay between requests")
	stringOpt(&cfg.include, "", "URL path glob to include", "include")
	stringOpt(&cfg.exclude, "", "URL path glob to exclude", "exclude")
	boolOpt(&cfg.sitemap, "Parse sitemap.xml for URL discovery", "sitemap")

	flag.DurationVar(&cfg.timeout, "timeout", 15*time.Second, "Request timeout")
	flag.DurationVar(&cfg.maxTime, "max-time", 10*time.Minute, "Total runtime ceiling")
	intOpt(&cfg.maxRetries, 3, "Per-URL retries", "max-retries")
	flag.Var(&cfg.headers, "header", "Extra header K=V (repeatable)")

	boolOpt(&cfg.verbose, "Log fetch/tier decisions to stderr", "v", "verbose")
	boolOpt(&cfg.quiet, "Suppress all stderr output", "q", "quiet")

	showVersion := flag.Bool("version", false, "Print version")
	serveMode := flag.Bool("serve", false, "Run as MCP stdio server")

	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: rawdoc <url> [flags]\n\n")
		fmt.Fprintf(os.Stderr, "Fetches web pages and converts them to clean markdown.\n\n")
		flag.PrintDefaults()
	}

	// The flag package stops at the first non-flag argument, so the URL is
	// moved behind the flags before parsing to allow flags in any position.
	reorderArgs()
	flag.Parse()

	// Every case below terminates the process.
	switch {
	case *showVersion:
		fmt.Println("rawdoc " + version)
		os.Exit(0)
	case *serveMode:
		serveMCP()
		os.Exit(0)
	case flag.NArg() < 1:
		fmt.Fprintln(os.Stderr, "error: URL required")
		flag.Usage()
		os.Exit(2)
	}

	// A bare host or path is treated as an https URL.
	target := flag.Arg(0)
	hasScheme := strings.HasPrefix(target, "http://") || strings.HasPrefix(target, "https://")
	if !hasScheme {
		target = "https://" + target
	}
	parsed, err := url.Parse(target)
	if err != nil || parsed.Host == "" {
		fmt.Fprintf(os.Stderr, "error: invalid URL %q\n", flag.Arg(0))
		os.Exit(2)
	}

	// Validate format
	knownFormat := cfg.format == "markdown" || cfg.format == "text" ||
		cfg.format == "json" || cfg.format == "yaml"
	if !knownFormat {
		fmt.Fprintf(os.Stderr, "error: invalid format %q (use markdown, text, json, or yaml)\n", cfg.format)
		os.Exit(2)
	}

	err = run(cfg, parsed)
	if err == nil {
		return
	}
	if !cfg.quiet {
		fmt.Fprintf(os.Stderr, "error: %v\n", err)
	}
	os.Exit(1)
}
// reorderArgs rewrites os.Args so that every flag (and the value that follows
// a non-boolean flag) comes before the positional arguments. Go's flag package
// stops parsing at the first non-flag argument; after this rewrite flags are
// recognised wherever the user placed them relative to the URL.
//
// An argument counts as a flag when it starts with "-". The following argument
// is taken as that flag's value only when it does not itself start with "-",
// the flag has no inline "=value", and isBoolFlag does not know the flag name.
// Relative order within the flags and within the positionals is preserved.
func reorderArgs() {
	pending := os.Args[1:]
	var opts, operands []string

	for len(pending) > 0 {
		cur := pending[0]
		pending = pending[1:]

		if !strings.HasPrefix(cur, "-") {
			operands = append(operands, cur)
			continue
		}
		opts = append(opts, cur)

		// Nothing left to consume, the next token is another flag, or the
		// value is already attached with "=".
		if len(pending) == 0 || strings.HasPrefix(pending[0], "-") || strings.Contains(cur, "=") {
			continue
		}
		// Boolean flags never take a separate value argument.
		if isBoolFlag(strings.TrimLeft(cur, "-")) {
			continue
		}
		opts = append(opts, pending[0])
		pending = pending[1:]
	}

	reordered := make([]string, 0, len(os.Args))
	reordered = append(reordered, os.Args[0])
	reordered = append(reordered, opts...)
	reordered = append(reordered, operands...)
	os.Args = reordered
}
// isBoolFlag reports whether name — a flag name with its leading dashes
// already removed — is one of the program's boolean flags, i.e. a flag that
// never takes a separate value argument.
//
// The list mirrors the flag.Bool / flag.BoolVar registrations in main and has
// to be kept in sync with them. A switch is used rather than a map so that no
// table is allocated on each call.
func isBoolFlag(name string) bool {
	switch name {
	case "code-only", "no-links", "sitemap",
		"v", "verbose", "q", "quiet",
		"version", "serve":
		return true
	}
	return false
}
// run dispatches to the single-page path when the configured crawl depth is
// zero or negative and to the crawler otherwise, returning whatever error the
// chosen path reports.
func run(cfg *config, u *url.URL) error {
	if cfg.depth <= 0 {
		return runSingle(cfg, u)
	}
	return runCrawl(cfg, u)
}
// runSingle fetches one page, converts it to markdown and writes it out in the
// configured format.
//
// The fetched HTML is parsed and passed through stripNoise, extractContent,
// convertToMarkdown and optimizeMarkdown; the result is handed to writeOutput
// together with the page title and its meta description. When cfg.verbose is
// set, size and estimated-token statistics are written to stderr afterwards.
//
// It returns a wrapped error when the fetch or the HTML parse fails, and the
// error from writeOutput unchanged.
func runSingle(cfg *config, u *url.URL) error {
	opts := &fetchOptions{
		timeout:    cfg.timeout,
		maxRetries: cfg.maxRetries,
		verbose:    cfg.verbose,
		quiet:      cfg.quiet,
		headers:    []string(cfg.headers),
	}

	result, err := fetch(u.String(), opts)
	if err != nil {
		return fmt.Errorf("fetch: %w", err)
	}

	doc, err := html.Parse(strings.NewReader(result.html))
	if err != nil {
		return fmt.Errorf("parse HTML: %w", err)
	}

	rawHTMLSize := len(result.html)

	stripNoise(doc)
	content := extractContent(doc, u.Host)
	markdown := convertToMarkdown(content)
	markdown = optimizeMarkdown(markdown)

	title := ""
	if titleNode := findFirst(doc, "title"); titleNode != nil {
		title = strings.TrimSpace(textContent(titleNode))
	}

	// Take the first <meta name="description"> in document order. The found
	// flag is what stops the walk: a bare return only leaves the innermost
	// recursive call, after which outer loops would keep going and let a
	// later tag overwrite the first match.
	description := ""
	found := false
	var findMeta func(*html.Node)
	findMeta = func(n *html.Node) {
		for c := n.FirstChild; c != nil && !found; c = c.NextSibling {
			if c.Type == html.ElementNode && c.Data == "meta" && getAttr(c, "name") == "description" {
				description = strings.TrimSpace(getAttr(c, "content"))
				found = true
				return
			}
			findMeta(c)
		}
	}
	findMeta(doc)

	outputSize, err := writeOutput(cfg, result.url, title, description, markdown, result)
	if err != nil {
		return err
	}

	if cfg.verbose {
		rawTokens := estimateTokens(rawHTMLSize)
		outTokens := estimateTokens(outputSize)
		// Percentage of estimated tokens removed; left at 0 when the input
		// estimate is 0 to avoid dividing by zero.
		savings := 0
		if rawTokens > 0 {
			savings = 100 - (outTokens*100)/rawTokens
		}
		fmt.Fprintf(stderr, "[stats] input: %s (%d tokens) → output: %s (%d tokens) | %d%% saved\n",
			humanSize(rawHTMLSize), rawTokens, humanSize(outputSize), outTokens, savings)
		// Machine-parseable stats line for test scripts
		fmt.Fprintf(stderr, "[data] raw_bytes=%d out_bytes=%d raw_tokens=%d out_tokens=%d saved_pct=%d\n",
			rawHTMLSize, outputSize, rawTokens, outTokens, savings)
		if cfg.output != "" {
			fmt.Fprintf(stderr, "[output] wrote %s to %s\n", cfg.format, cfg.output)
		}
	}

	return nil
}
// writeOutput renders the page in the configured format and writes it to its
// destination, returning the number of bytes rendered.
//
// With cfg.codeOnly set only the fenced code blocks are emitted, regardless of
// cfg.format. Otherwise cfg.format selects json, yaml or text, and any other
// value falls back to markdown.
//
// The rendered text goes to the file named by cfg.output when that is set and
// cfg.depth is 0; in every other case it goes to standard output.
//
// On any error the returned size is 0. Failures to create, write or close the
// output file are wrapped with the step that failed. The close error is
// checked explicitly because data may only reach the disk at close time, so
// ignoring it could report a truncated file as a success.
func writeOutput(cfg *config, pageURL, title, description, markdown string, result *fetchResult) (int, error) {
	var buf strings.Builder

	var err error
	switch {
	case cfg.codeOnly:
		err = writeCodeOnly(&buf, markdown)
	case cfg.format == "json":
		err = writeJSON(&buf, pageURL, title, description, markdown, result)
	case cfg.format == "yaml":
		err = writeYAML(&buf, pageURL, title, description, markdown, result)
	case cfg.format == "text":
		err = writeText(&buf, markdown)
	default:
		err = writeMarkdown(&buf, markdown)
	}
	if err != nil {
		return 0, err
	}

	output := buf.String()

	if cfg.output == "" || cfg.depth != 0 {
		if _, err := io.WriteString(os.Stdout, output); err != nil {
			return 0, err
		}
		return len(output), nil
	}

	f, err := os.Create(cfg.output)
	if err != nil {
		return 0, fmt.Errorf("create output file: %w", err)
	}
	if _, err := io.WriteString(f, output); err != nil {
		// The write error is the one worth reporting; closing here only
		// releases the descriptor.
		_ = f.Close()
		return 0, fmt.Errorf("write output file: %w", err)
	}
	if err := f.Close(); err != nil {
		return 0, fmt.Errorf("close output file: %w", err)
	}
	return len(output), nil
}
// writeMarkdown writes the markdown to w followed by a single newline and
// returns any error reported by the writer.
func writeMarkdown(w io.Writer, markdown string) error {
	if _, err := fmt.Fprintf(w, "%s\n", markdown); err != nil {
		return err
	}
	return nil
}
// writeText writes the markdown to w as plain text by deleting every asterisk
// (the bold "**" and italic "*" markers), followed by a single newline. Other
// markdown syntax is left as it is.
func writeText(w io.Writer, markdown string) error {
	// Stripping "**" and then "*" leaves no asterisk behind, so one pass that
	// removes each '*' yields the same text.
	plain := strings.ReplaceAll(markdown, "*", "")
	_, err := fmt.Fprintf(w, "%s\n", plain)
	return err
}
// codeBlock is one fenced code block pulled out of the generated markdown by
// extractCodeBlocks. The struct tags fix the key names used in JSON output.
type codeBlock struct {
	// Lang is the text that follows the opening ``` fence, with surrounding
	// whitespace trimmed; it is empty when the fence names no language.
	Lang string `json:"lang"`
	// Code holds the lines between the fences, each terminated by '\n'.
	Code string `json:"code"`
}
// extractCodeBlocks scans markdown line by line and returns its fenced code
// blocks in order of appearance.
//
// A line beginning with ``` opens a block, and the rest of that line, trimmed,
// becomes the block's language. The next line beginning with ``` closes it.
// Lines in between are collected verbatim, each followed by '\n'. A block that
// is still open when the input ends is discarded. The result is nil when no
// complete block is found.
func extractCodeBlocks(markdown string) []codeBlock {
	const fence = "```"

	var (
		found   []codeBlock
		current strings.Builder
		lang    string
		open    bool
	)

	for _, ln := range strings.Split(markdown, "\n") {
		isFence := strings.HasPrefix(ln, fence)
		switch {
		case isFence && !open:
			// Opening fence: remember the language tag and start collecting.
			open = true
			lang = strings.TrimSpace(ln[len(fence):])
			current.Reset()
		case isFence:
			// Closing fence: emit what was collected.
			found = append(found, codeBlock{Lang: lang, Code: current.String()})
			open = false
			lang = ""
			current.Reset()
		case open:
			current.WriteString(ln)
			current.WriteByte('\n')
		}
	}
	return found
}
// writeJSON encodes the page as one indented JSON object on w.
//
// The object carries the page URL, title, description, the markdown content,
// the code blocks found in that markdown, result.tier as fetch_tier, the
// current UTC time in RFC 3339 form as fetched_at, and the byte length of the
// markdown as content_length.
//
// code_blocks is always an array: a page without code yields [] rather than
// null, so consumers can iterate over it without a nil check. The error from
// the JSON encoder is returned as is.
func writeJSON(w io.Writer, pageURL, title, description, markdown string, result *fetchResult) error {
	codeBlocks := extractCodeBlocks(markdown)
	if codeBlocks == nil {
		// encoding/json renders a nil slice as null; use an empty slice so
		// the field keeps its array type.
		codeBlocks = []codeBlock{}
	}

	type output struct {
		URL           string      `json:"url"`
		Title         string      `json:"title"`
		Description   string      `json:"description"`
		Content       string      `json:"content"`
		CodeBlocks    []codeBlock `json:"code_blocks"`
		FetchTier     int         `json:"fetch_tier"`
		FetchedAt     string      `json:"fetched_at"`
		ContentLength int         `json:"content_length"`
	}

	out := output{
		URL:           pageURL,
		Title:         title,
		Description:   description,
		Content:       markdown,
		CodeBlocks:    codeBlocks,
		FetchTier:     result.tier,
		FetchedAt:     time.Now().UTC().Format(time.RFC3339),
		ContentLength: len(markdown),
	}

	enc := json.NewEncoder(w)
	enc.SetIndent("", " ")
	return enc.Encode(out)
}
// writeYAML writes the page to w as a YAML document with the keys url, title,
// description, fetch_tier, fetched_at (current UTC time, RFC 3339),
// content_length (byte length of the markdown), content and, when the markdown
// contains fenced code, code_blocks.
//
// The YAML is produced by hand to avoid a dependency for such simple output.
// Scalar values go through yamlQuote; content and each block's code are
// written as literal block scalars ("|").
//
// Inside code_blocks the code key is aligned with lang, and the code lines are
// indented one step further. This keeps both keys in the same mapping and
// makes the lines part of the block scalar.
//
// The first write error stops all further output and is returned.
func writeYAML(w io.Writer, pageURL, title, description, markdown string, result *fetchResult) error {
	codeBlocks := extractCodeBlocks(markdown)

	// emit formats one piece of output and remembers the first failure, so
	// the sequence below can stay linear.
	var werr error
	emit := func(format string, args ...interface{}) {
		if werr == nil {
			_, werr = fmt.Fprintf(w, format, args...)
		}
	}

	emit("url: %s\n", yamlQuote(pageURL))
	emit("title: %s\n", yamlQuote(title))
	emit("description: %s\n", yamlQuote(description))
	emit("fetch_tier: %d\n", result.tier)
	emit("fetched_at: %s\n", time.Now().UTC().Format(time.RFC3339))
	emit("content_length: %d\n", len(markdown))
	emit("content: |\n")
	for _, line := range strings.Split(markdown, "\n") {
		emit(" %s\n", line)
	}

	if len(codeBlocks) > 0 {
		emit("code_blocks:\n")
		for _, b := range codeBlocks {
			emit(" - lang: %s\n", yamlQuote(b.Lang))
			// Same column as "lang" above: both keys belong to one item.
			emit("   code: |\n")
			for _, line := range strings.Split(b.Code, "\n") {
				// Block scalar lines must be indented deeper than their key.
				emit("    %s\n", line)
			}
		}
	}

	return werr
}
// yamlQuote returns s in a form that is safe to place after "key: " in YAML
// and that reads back as the same string.
//
// The empty string becomes "". A value is returned unchanged only when it is
// safe as a plain scalar. It is wrapped in double quotes when any of the
// following holds:
//
//   - it starts or ends with a space (a plain scalar would lose it);
//   - it starts with '-', '?', '~', '+', '.' or a digit (it could be read as
//     an indicator, as null or as a number);
//   - it is one of the words YAML parsers treat as a boolean or null
//     (true, false, yes, no, on, off, y, n, null — in any letter case);
//   - it contains a control character such as a newline or tab, or one of
//     : # ' " { } [ ] , & * ! | > % @ `
//
// Inside the quotes, backslash and double quote are backslash-escaped, newline,
// carriage return and tab are written as \n, \r and \t, and any other control
// byte is written as \xNN. All other bytes are copied through untouched.
func yamlQuote(s string) string {
	if s == "" {
		return `""`
	}

	needsQuotes := s[0] == ' ' || s[len(s)-1] == ' ' ||
		strings.IndexByte("-?~+.0123456789", s[0]) >= 0

	if !needsQuotes {
		switch strings.ToLower(s) {
		case "true", "false", "yes", "no", "on", "off", "y", "n", "null":
			needsQuotes = true
		}
	}

	if !needsQuotes {
		// Every character of interest is ASCII, so a byte scan is enough and
		// leaves multi-byte UTF-8 sequences alone.
		for i := 0; i < len(s); i++ {
			c := s[i]
			if c < 0x20 || c == 0x7f || strings.IndexByte(":#'\"{}[],&*!|>%@`", c) >= 0 {
				needsQuotes = true
				break
			}
		}
	}

	if !needsQuotes {
		return s
	}

	var b strings.Builder
	b.Grow(len(s) + 2)
	b.WriteByte('"')
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case c == '\\':
			b.WriteString(`\\`)
		case c == '"':
			b.WriteString(`\"`)
		case c == '\n':
			b.WriteString(`\n`)
		case c == '\r':
			b.WriteString(`\r`)
		case c == '\t':
			b.WriteString(`\t`)
		case c < 0x20 || c == 0x7f:
			fmt.Fprintf(&b, `\x%02x`, c)
		default:
			b.WriteByte(c)
		}
	}
	b.WriteByte('"')
	return b.String()
}
// writeCodeOnly writes just the fenced code blocks found in markdown to w.
// Each block is re-emitted between ``` fences with its language tag, and
// consecutive blocks are separated by one blank line. Nothing is written when
// the markdown contains no complete code block.
//
// The first write error is returned and stops further output.
func writeCodeOnly(w io.Writer, markdown string) error {
	for i, b := range extractCodeBlocks(markdown) {
		if i > 0 {
			if _, err := fmt.Fprintln(w); err != nil {
				return err
			}
		}
		// b.Code already ends with a newline whenever it is non-empty, so the
		// closing fence lands on its own line.
		if _, err := fmt.Fprintf(w, "```%s\n%s```\n", b.Lang, b.Code); err != nil {
			return err
		}
	}
	return nil
}
// humanSize formats a byte count for display: values below 1024 are printed as
// whole bytes ("512B"), values below 1024 KB with one decimal in kilobytes
// ("1.5KB"), and anything larger with one decimal in megabytes ("2.0MB").
func humanSize(bytes int) string {
	const unit = 1024

	kb := float64(bytes) / unit
	mb := kb / unit

	switch {
	case bytes < unit:
		return fmt.Sprintf("%dB", bytes)
	case kb < unit:
		return fmt.Sprintf("%.1fKB", kb)
	default:
		return fmt.Sprintf("%.1fMB", mb)
	}
}