diff --git a/core/baseline/baseline.go b/core/baseline/baseline.go index 292507a..37947e5 100644 --- a/core/baseline/baseline.go +++ b/core/baseline/baseline.go @@ -240,6 +240,7 @@ func (bl *Baseline) Collect() { } } + bl.Extracteds.Merge(pkg.HTTPLanguageExtract(pkg.ExtractHTTPLanguage(bl.Header, bl.Body))) bl.Hashes = parsers.NewHashes(bl.Raw) bl.Extracteds.Merge(pkg.ProtonExtract(bl.Raw)) bl.Unique = UniqueHash(bl) diff --git a/core/baseline/http_language_test.go b/core/baseline/http_language_test.go new file mode 100644 index 0000000..b0e4a2e --- /dev/null +++ b/core/baseline/http_language_test.go @@ -0,0 +1,66 @@ +package baseline + +import ( + "testing" + + "github.com/chainreactors/spray/core/ihttp" + "github.com/chainreactors/spray/pkg" + "github.com/valyala/fasthttp" +) + +func TestCollectAddsHTTPLanguageExtract(t *testing.T) { + var fastResp fasthttp.Response + fastResp.SetStatusCode(200) + fastResp.Header.Set("Content-Type", "text/html; charset=utf-8") + fastResp.Header.Set("Content-Language", "en-US") + fastResp.SetBodyString(`
// Regular expressions used to pull language hints out of an HTML document.
// Every markup pattern is case-insensitive and lets "." span newlines
// ((?is)), because tags and script/style/comment bodies are frequently
// spread over several lines.
var (
	// htmlTagRe matches the opening <html ...> tag. The \b keeps it from
	// matching longer element names that merely start with "html".
	htmlTagRe = regexp.MustCompile(`(?is)<html\b[^>]*>`)
	// metaTagRe matches a single <meta ...> tag.
	metaTagRe = regexp.MustCompile(`(?is)<meta\b[^>]*>`)
	// attrRe matches one name=value attribute. Group 1 is the name; exactly
	// one of groups 2-4 carries the value (double-quoted, single-quoted or
	// unquoted respectively).
	attrRe = regexp.MustCompile(`(?is)([:\w-]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>/]+))`)
	// scriptStyleRe matches whole <script> and <style> elements and HTML
	// comments, i.e. content that is never rendered as text. Each
	// alternative must consume at least one character: an empty alternative
	// would match at every position and ReplaceAllString would then split
	// every word apart.
	scriptStyleRe = regexp.MustCompile(`(?is)<script\b.*?</script>|<style\b.*?</style>|<!--.*?-->`)
	// tagRe matches any remaining markup tag.
	tagRe = regexp.MustCompile(`(?is)<[^>]+>`)
	// spaceRe matches a run of whitespace.
	spaceRe = regexp.MustCompile(`\s+`)
	// wordRe matches an ASCII word of at least two letters.
	wordRe = regexp.MustCompile(`[a-zA-Z]{2,}`)
)
// validLanguageTag reports whether text has the shape of a normalized
// (lower-case, hyphen-separated) language tag.
//
// The primary subtag must be two or three ASCII letters. Every following
// subtag must be one to eight ASCII letters or digits. A one-character
// subtag is a BCP 47 singleton that introduces an extension or private-use
// sequence ("de-de-u-co-phonebk", "en-x-internal"), so it is accepted only
// when at least one more subtag follows it. This is a shape check only; it
// does not consult the IANA subtag registry.
func validLanguageTag(text string) bool {
	if text == "" {
		return false
	}
	parts := strings.Split(text, "-")
	if len(parts[0]) < 2 || len(parts[0]) > 3 || !allLowerASCII(parts[0]) {
		return false
	}
	rest := parts[1:]
	last := len(rest) - 1
	for i, part := range rest {
		if len(part) == 0 || len(part) > 8 || !allLowerAlphaNum(part) {
			return false
		}
		// A singleton must be followed by the subtags it introduces.
		if len(part) == 1 && i == last {
			return false
		}
	}
	return true
}

// allLowerASCII reports whether every rune of s is in 'a'..'z'.
// It returns true for the empty string; callers check the length themselves.
func allLowerASCII(s string) bool {
	for _, r := range s {
		if r < 'a' || r > 'z' {
			return false
		}
	}
	return true
}

// allLowerAlphaNum reports whether every rune of s is in 'a'..'z' or
// '0'..'9'. It returns true for the empty string; callers check the length
// themselves.
func allLowerAlphaNum(s string) bool {
	for _, r := range s {
		if (r < 'a' || r > 'z') && (r < '0' || r > '9') {
			return false
		}
	}
	return true
}
normalizeLanguageTag(attrs["xml:lang"]) +} + +func metaLanguage(body string) string { + for _, match := range metaTagRe.FindAllString(body, -1) { + attrs := parseTagAttrs(match) + content := attrs["content"] + if content == "" { + continue + } + + if strings.EqualFold(attrs["http-equiv"], "content-language") { + if language := normalizeLanguageTag(content); language != "" { + return language + } + } + + switch strings.ToLower(attrs["name"]) { + case "content-language", "dc.language", "dc.language.iso", "language": + if language := normalizeLanguageTag(content); language != "" { + return language + } + } + + switch strings.ToLower(attrs["property"]) { + case "og:locale", "og:locale:alternate": + if language := normalizeLanguageTag(content); language != "" { + return language + } + } + } + return "" +} + +func parseTagAttrs(tag string) map[string]string { + attrs := make(map[string]string) + for _, match := range attrRe.FindAllStringSubmatch(tag, -1) { + value := "" + for _, group := range match[2:] { + if group != "" { + value = group + break + } + } + attrs[strings.ToLower(match[1])] = html.UnescapeString(strings.TrimSpace(value)) + } + return attrs +} + +func detectBodyLanguage(body string) string { + text := visibleText(body) + if text == "" { + return "" + } + + counts := map[string]int{ + "zh": countRuneRanges(text, [][2]rune{{0x4E00, 0x9FFF}, {0x3400, 0x4DBF}}), + "ja_kana": countRuneRanges(text, [][2]rune{{0x3040, 0x30FF}, {0x31F0, 0x31FF}}), + "ko": countRuneRanges(text, [][2]rune{{0xAC00, 0xD7AF}, {0x1100, 0x11FF}}), + "ar": countRuneRanges(text, [][2]rune{{0x0600, 0x06FF}, {0x0750, 0x077F}}), + "he": countRuneRanges(text, [][2]rune{{0x0590, 0x05FF}}), + "el": countRuneRanges(text, [][2]rune{{0x0370, 0x03FF}}), + "th": countRuneRanges(text, [][2]rune{{0x0E00, 0x0E7F}}), + } + + if counts["ja_kana"] >= 3 { + return "ja" + } + for _, language := range []string{"ko", "zh", "ar", "he", "el", "th"} { + if counts[language] >= 4 { + return language + } + } + + 
// chooseLanguage picks the final language and names where it came from.
//
// Candidates are tried in fixed priority order and the first non-empty one
// wins: the language detected from the body text ("body"), the <html> lang
// attribute ("html"), the Content-Language header ("header"), then a <meta>
// declaration ("meta"). It returns two empty strings when every candidate is
// empty.
func chooseLanguage(detected, htmlLang, contentLanguage, metaLanguage string) (string, string) {
	candidates := [...]struct {
		value  string
		source string
	}{
		{detected, "body"},
		{htmlLang, "html"},
		{contentLanguage, "header"},
		{metaLanguage, "meta"},
	}
	for _, candidate := range candidates {
		if candidate.value != "" {
			return candidate.value, candidate.source
		}
	}
	return "", ""
}
attrs.MetaLanguage != "zh-cn" { + t.Fatalf("meta language = %q, want zh-cn", attrs.MetaLanguage) + } + if attrs.DetectedLanguage != "zh" { + t.Fatalf("detected language = %q, want zh", attrs.DetectedLanguage) + } + if attrs.Language != "zh" || attrs.LanguageSource != "body" { + t.Fatalf("language/source = %q/%q, want zh/body", attrs.Language, attrs.LanguageSource) + } +} + +func TestExtractHTTPLanguageBodyOverridesDeclared(t *testing.T) { + attrs := ExtractHTTPLanguage( + []byte("HTTP/1.1 200 OK\r\nContent-Language: en-US\r\nContent-Type: text/html; charset=utf-8\r\n\r\n"), + []byte(`これは日本語のページです。サービス状態を表示します。`), + ) + + if attrs.ContentLanguage != "en-us" { + t.Fatalf("content language = %q, want en-us", attrs.ContentLanguage) + } + if attrs.HTMLLang != "en" { + t.Fatalf("html lang = %q, want en", attrs.HTMLLang) + } + if attrs.DetectedLanguage != "ja" { + t.Fatalf("detected language = %q, want ja", attrs.DetectedLanguage) + } + if attrs.Language != "ja" || attrs.LanguageSource != "body" { + t.Fatalf("language/source = %q/%q, want ja/body", attrs.Language, attrs.LanguageSource) + } +} + +func TestExtractHTTPLanguageFallsBackToHeader(t *testing.T) { + attrs := ExtractHTTPLanguage( + []byte("HTTP/1.1 200 OK\r\nContent-Language: fr-FR\r\n\r\n"), + nil, + ) + + if attrs.Language != "fr-fr" || attrs.LanguageSource != "header" { + t.Fatalf("language/source = %q/%q, want fr-fr/header", attrs.Language, attrs.LanguageSource) + } +} + +func TestExtractHTTPLanguageDetectsEnglishBody(t *testing.T) { + attrs := ExtractHTTPLanguage( + []byte("HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=utf-8\r\n\r\n"), + []byte(`This is an English product page and the service is available for you from the web console with your account.`), + ) + + if attrs.DetectedLanguage != "en" { + t.Fatalf("detected language = %q, want en", attrs.DetectedLanguage) + } + if attrs.Language != "en" || attrs.LanguageSource != "body" { + t.Fatalf("language/source = %q/%q, want en/body", attrs.Language, 
attrs.LanguageSource) + } +} + +func TestExtractHTTPLanguageDecodesGBKBody(t *testing.T) { + html := `欢迎使用平台,这里展示资产、任务、漏洞和生命周期。` + gbk, _, err := transform.String(simplifiedchinese.GBK.NewEncoder(), html) + if err != nil { + t.Fatalf("encode gbk fixture: %v", err) + } + + attrs := ExtractHTTPLanguage( + []byte("HTTP/1.1 200 OK\r\nContent-Type: text/html; charset=gbk\r\n\r\n"), + []byte(gbk), + ) + + if attrs.DetectedLanguage != "zh" { + t.Fatalf("detected language = %q, want zh", attrs.DetectedLanguage) + } + if attrs.Language != "zh" || attrs.LanguageSource != "body" { + t.Fatalf("language/source = %q/%q, want zh/body", attrs.Language, attrs.LanguageSource) + } +} + +func TestHTTPLanguageExtractReturnsExtracted(t *testing.T) { + attrs := HTTPLanguageAttrs{ + Language: "zh", + LanguageSource: "body", + } + + extracted := HTTPLanguageExtract(attrs) + if len(extracted) != 1 { + t.Fatalf("extracted length = %d, want 1", len(extracted)) + } + if extracted[0].Name != "language" { + t.Fatalf("extracted name = %q, want language", extracted[0].Name) + } + if extracted[0].Severity != "info" { + t.Fatalf("extracted severity = %q, want info", extracted[0].Severity) + } + if len(extracted[0].ExtractResult) != 1 || extracted[0].ExtractResult[0] != "zh" { + t.Fatalf("extract_result = %v, want [zh]", extracted[0].ExtractResult) + } +} + +func TestHTTPLanguageExtractReturnsNilForEmpty(t *testing.T) { + extracted := HTTPLanguageExtract(HTTPLanguageAttrs{}) + if extracted != nil { + t.Fatalf("extracted = %v, want nil for empty language", extracted) + } +}