diff --git a/README.md b/README.md index 2824c96..1ce5485 100644 --- a/README.md +++ b/README.md @@ -253,6 +253,8 @@ Defined in `wrangler.toml` under `[vars]`: | `MAX_ARTICLES_PER_DAY` | `5000` | Global circuit breaker — soft cap per UTC day | | `GEN_PER_IP_PER_HOUR` | `100` | Per-IP article generation budget | | `IDENT_PER_IP_PER_HOUR` | `10` | Per-IP cap on minting new commenter identities | +| `COMMENT_PER_IP_PER_HOUR` | `90` | Per-IP comment creation budget | +| `COMMENT_PER_IP_PER_MINUTE` | `15` | Per-IP comment burst budget | Secrets (set via `wrangler secret put`): diff --git a/package.json b/package.json index 8f7920f..7b4aaa1 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ }, "dependencies": { "hono": "^4.6.14", + "ipaddr.js": "^2.4.0", "react": "^18.3.1", "react-dom": "^18.3.1" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bf6fbe6..98061c7 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -11,6 +11,9 @@ importers: hono: specifier: ^4.6.14 version: 4.12.14 + ipaddr.js: + specifier: ^2.4.0 + version: 2.4.0 react: specifier: ^18.3.1 version: 18.3.1 @@ -935,6 +938,10 @@ packages: resolution: {integrity: sha512-am5zfg3yu6sqn5yjKBNqhnTX7Cv+m00ox+7jbaKkrLMRJ4rAdldd1xPd/JzbBWspqaQv6RSTrgFN95EsfhC+7w==} engines: {node: '>=16.9.0'} + ipaddr.js@2.4.0: + resolution: {integrity: sha512-9VGk3HGanVE6JoZXHiCpnGy5X0jYDnN4EA4lntFPj+1vIWlFhIylq2CrrCOJH9EAhc5CYhq18F2Av2tgoAPsYQ==} + engines: {node: '>= 10'} + is-arrayish@0.3.4: resolution: {integrity: sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==} @@ -1938,6 +1945,8 @@ snapshots: hono@4.12.14: {} + ipaddr.js@2.4.0: {} + is-arrayish@0.3.4: optional: true diff --git a/src/worker/abuse.ts b/src/worker/abuse.ts new file mode 100644 index 0000000..27a04a5 --- /dev/null +++ b/src/worker/abuse.ts @@ -0,0 +1,171 @@ +/** + * Deterministic abuse floor for cases that should not depend on model + * judgment. The LLM moderation prompt handles nuance; this catches obvious + * slurs, protected-class hate, genocidal/Nazi promotion, and sexual-minor + * abuse even when the configured provider is too permissive. + */ + +const DIRECT_SLURS = [ + "nigger", + "nigga", + "kike", + "faggot", + "fag", + "tranny", + "chink", + "gook", + "spic", + "wetback", + "coon", + "raghead", + "towelhead", + "sandnigger", + "zipperhead", + "retard", + "mongoloid", +]; + +const PROTECTED_GROUPS = [ + "jew", + "jews", + "jewish", + "muslim", + "muslims", + "islam", + "islamic", + "islamist", + "islamists", + "christian", + "christians", + "black", + "blacks", + "asian", + "asians", + "arab", + "arabs", + "mexican", + "mexicans", + "immigrant", + "immigrants", + "gay", + "gays", + "lesbian", + "lesbians", + "trans", + "transgender", + "women", + "woman", + "disabled", +]; + +const HATE_ACTIONS = [ + "kill", + "murder", + "rape", + "hang", + "lynch", + "gas", + "exterminate", + "eradicate", + "enslave", + "sterilize", + "castrate", + "deport", + "banish", +]; + +const HATE_DESCRIPTORS = [ + "vermin", + "rats", + "parasites", + "cockroaches", + "subhuman", + "animals", + "rapists", + "pedophiles", + "terrorists", + "disease", + "plague", + "filth", + "scum", + "degenerate", + "degenerates", + "inferior", +]; + +const PROFANE_TARGETING = [ + "fuck", + "suck", + "sucks", + "hate", + "destroy", +]; + +const SEXUAL_MINOR_PATTERNS = [ + /(?:^|-)child(?:ren)?-(?:porn|sex|rape|rapist|molest|abuse)(?:-|$)/, + /(?:^|-)(?:porn|sex|rape|molest|abuse)-child(?:ren)?(?:-|$)/, + /(?:^|-)minor(?:s)?-(?:porn|sex|rape|molest|abuse)(?:-|$)/, + /(?:^|-)(?:cp|loli|lolicon)(?:-|$)/, +]; + +const GENOCIDE_PATTERNS = [ + /(?:^|-)heil-hitler(?:-|$)/, + /(?:^|-)white-power(?:-|$)/, + /(?:^|-)white-supremacy(?:-|$)/, + /(?:^|-)aryan-(?:race|nation|supremacy)(?:-|$)/, + /(?:^|-)gas-the-[a-z0-9-]+/, + /(?:^|-)holocaust-(?:hoax|fake|denial)(?:-|$)/, + /(?:^|-)hitler-did-nothing-wrong(?:-|$)/, + /(?:^|-)(?:kill|exterminate|eradicate|gas)-all-[a-z0-9-]+/, +]; + +export function containsDeterministicDisallowedAbuse(text: string): boolean { + const normalized = normalizeForAbuseScan(text); + if (!normalized) return false; + + for (const term of DIRECT_SLURS) { + if (hasToken(normalized, term)) return true; + } + + for (const pattern of SEXUAL_MINOR_PATTERNS) { + if (pattern.test(normalized)) return true; + } + + for (const pattern of GENOCIDE_PATTERNS) { + if (pattern.test(normalized)) return true; + } + + const targetsProtectedGroup = PROTECTED_GROUPS.some((term) => + hasToken(normalized, term) + ); + if (!targetsProtectedGroup) return false; + + return ( + HATE_ACTIONS.some((term) => hasToken(normalized, term)) || + HATE_DESCRIPTORS.some((term) => hasToken(normalized, term)) || + PROFANE_TARGETING.some((term) => hasToken(normalized, term)) + ); +} + +function normalizeForAbuseScan(text: string): string { + return text + .toLowerCase() + .normalize("NFKD") + .replace(/[\u0300-\u036f]/g, "") + .replace(/0/g, "o") + .replace(/1/g, "i") + .replace(/3/g, "e") + .replace(/4/g, "a") + .replace(/5/g, "s") + .replace(/7/g, "t") + .replace(/@/g, "a") + .replace(/\$/g, "s") + .replace(/[^a-z0-9]+/g, "-") + .replace(/-+/g, "-") + .replace(/^-|-$/g, ""); +} + +function hasToken(normalized: string, token: string): boolean { + const escaped = token.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + return new RegExp(`(?:^|-)${escaped}(?:s|es)?(?:-|$)`).test(normalized); +} diff --git a/src/worker/blocklist.ts b/src/worker/blocklist.ts index 76e6e7e..f2a8da0 100644 --- a/src/worker/blocklist.ts +++ b/src/worker/blocklist.ts @@ -1,3 +1,5 @@ +import { containsDeterministicDisallowedAbuse } from "./abuse"; + /** * Permanent slug-pattern blocklist. Slugs matching any rule below are * refused before generation, before cache lookup, before anything. They @@ -12,6 +14,7 @@ */ export function isPermanentlyBlockedSlug(slug: string): boolean { slug = slug.toLowerCase(); + if (containsDeterministicDisallowedAbuse(slug)) return true; if (slug.startsWith("0-0")) return true; if (slug.includes("0-0-0")) return true; if (slug.includes("strama")) return true; diff --git a/src/worker/comments.ts b/src/worker/comments.ts index 0b2b419..6ba9ac3 100644 --- a/src/worker/comments.ts +++ b/src/worker/comments.ts @@ -6,6 +6,7 @@ import { type Identity, } from "./identity"; import { slugify } from "./slug"; +import ipaddr from "ipaddr.js"; import { rateLimit, clientIp } from "./ratelimit"; import { moderateCommentNow } from "./moderation"; import { requireHuman, challengeResponse } from "./turnstile"; @@ -14,9 +15,16 @@ export interface CommentsEnv { DB: D1Database; ARTICLES: KVNamespace; OPENROUTER_API_KEY: string; + LLM_API_URL?: string; OPENROUTER_MODEL: string; OPENROUTER_MODERATION_MODEL?: string; IDENT_PER_IP_PER_HOUR?: string; + ARTICLE_VOTE_PER_USER_PER_HOUR?: string; + ARTICLE_VOTE_PER_IP_PER_HOUR?: string; + ARTICLE_VOTE_PER_SUBNET_PER_HOUR?: string; + ARTICLE_VOTE_PER_USER_PER_MINUTE?: string; + COMMENT_PER_IP_PER_HOUR?: string; + COMMENT_PER_IP_PER_MINUTE?: string; // Forwarded so turnstile.requireHuman can read its config. Optional; // missing values fall open (no gating). TURNSTILE_SITE_KEY?: string; @@ -32,6 +40,17 @@ const COOKIE_NAME = "hu_uid"; // actually expire. const COOKIE_MAX_AGE = 60 * 60 * 24 * 400; const MAX_BODY_LEN = 2000; +const ARTICLE_VOTE_DEFAULTS = { + userPerHour: 120, + ipPerHour: 240, + subnetPerHour: 600, + userPerMinute: 30, +}; + +const COMMENT_CREATION_DEFAULTS = { + ipPerHour: 90, + ipPerMinute: 15, +}; export interface UserRow { id: string; @@ -135,6 +154,7 @@ async function ensureUser( try { identity = await hallucinateIdentity( env.OPENROUTER_API_KEY, + env.LLM_API_URL, env.OPENROUTER_MODEL || "google/gemini-2.5-flash-lite" ); } catch { @@ -224,7 +244,7 @@ function rootOrderClause(sort: SortMode): string { return "ORDER BY score DESC, created_at DESC"; case "recommended": default: - return ( + return ( "ORDER BY (sqrt(CAST(score AS REAL)) / " + "pow(((? - created_at) / 3600000.0) + 2.0, 0.8)) DESC, " + "created_at DESC" @@ -232,6 +252,170 @@ function rootOrderClause(sort: SortMode): string { } } +function parsePositiveInt(raw: string | undefined, fallback: number): number { + const n = Number.parseInt(raw || "", 10); + return Number.isFinite(n) && n > 0 ? n : fallback; +} + +function ipToSubnet(ip: string): string { + if (!ip || ip === "unknown") return "unknown"; + try { + const parsed = ipaddr.parse(ip); + if (parsed.kind() === "ipv4") { + const bytes = parsed.toByteArray(); + return `${bytes[0]}.${bytes[1]}.${bytes[2]}.0/24`; + } + + if (parsed.kind() === "ipv6") { + if (parsed.isIPv4MappedAddress && parsed.isIPv4MappedAddress()) { + const ipv4 = parsed.toIPv4Address(); + const bytes = ipv4.toByteArray(); + return `${bytes[0]}.${bytes[1]}.${bytes[2]}.0/24`; + } + + const bytes = parsed.toByteArray(); + const parts = [ + (bytes[0] << 8) | bytes[1], + (bytes[2] << 8) | bytes[3], + (bytes[4] << 8) | bytes[5], + (bytes[6] << 8) | bytes[7], + ].map((n) => n.toString(16)); + return `${parts[0]}:${parts[1]}:${parts[2]}:${parts[3]}::/64`; + } + } catch { + return ip; + } + return ip; +} + +async function enforceArticleVoteRateLimits( + c: any, + env: CommentsEnv, + userId: string +): Promise { + const ip = clientIp(c); + const subnet = ipToSubnet(ip); + + const checks = [ + { + scope: "user/hour", + run: () => + rateLimit({ + kv: env.ARTICLES, + bucket: "article-vote", + ip: `user:${userId}`, + limit: parsePositiveInt( + env.ARTICLE_VOTE_PER_USER_PER_HOUR, + ARTICLE_VOTE_DEFAULTS.userPerHour + ), + windowSec: 3600, + }), + }, + { + scope: "ip/hour", + run: () => + rateLimit({ + kv: env.ARTICLES, + bucket: "article-vote", + ip: `ip:${ip}`, + limit: parsePositiveInt( + env.ARTICLE_VOTE_PER_IP_PER_HOUR, + ARTICLE_VOTE_DEFAULTS.ipPerHour + ), + windowSec: 3600, + }), + }, + { + scope: "subnet/hour", + run: () => + rateLimit({ + kv: env.ARTICLES, + bucket: "article-vote", + ip: `subnet:${subnet}`, + limit: parsePositiveInt( + env.ARTICLE_VOTE_PER_SUBNET_PER_HOUR, + ARTICLE_VOTE_DEFAULTS.subnetPerHour + ), + windowSec: 3600, + }), + }, + { + scope: "user/minute", + run: () => + rateLimit({ + kv: env.ARTICLES, + bucket: "article-vote", + ip: `user:${userId}:burst`, + limit: parsePositiveInt( + env.ARTICLE_VOTE_PER_USER_PER_MINUTE, + ARTICLE_VOTE_DEFAULTS.userPerMinute + ), + windowSec: 60, + }), + }, + ]; + + for (const { scope, run } of checks) { + const result = await run(); + if (result.ok) continue; + const err: any = new Error( + `vote limit exceeded (${scope}), max ${result.limit} per window` + ); + err.status = 429; + err.retryAfter = result.retryAfter; + throw err; + } +} + +async function enforceCommentCreationRateLimits( + c: any, + env: CommentsEnv +): Promise { + const ip = clientIp(c); + + const checks = [ + { + scope: "ip/hour", + run: () => + rateLimit({ + kv: env.ARTICLES, + bucket: "comment-create", + ip: `ip:${ip}`, + limit: parsePositiveInt( + env.COMMENT_PER_IP_PER_HOUR, + COMMENT_CREATION_DEFAULTS.ipPerHour + ), + windowSec: 3600, + }), + }, + { + scope: "ip/minute", + run: () => + rateLimit({ + kv: env.ARTICLES, + bucket: "comment-create", + ip: `ip:${ip}:burst`, + limit: parsePositiveInt( + env.COMMENT_PER_IP_PER_MINUTE, + COMMENT_CREATION_DEFAULTS.ipPerMinute + ), + windowSec: 60, + }), + }, + ]; + + for (const { scope, run } of checks) { + const result = await run(); + if (result.ok) continue; + const err: any = new Error( + `comment limit exceeded (${scope}), max ${result.limit} per window` + ); + err.status = 429; + err.retryAfter = result.retryAfter; + throw err; + } +} + function compareDTO(sort: SortMode): (a: CommentDTO, b: CommentDTO) => number { if (sort === "newest") return (a, b) => b.created_at - a.created_at; if (sort === "top") @@ -457,6 +641,7 @@ export function createCommentsApp() { let user: UserRow; try { user = await ensureUser(c, c.env); + await enforceCommentCreationRateLimits(c, c.env); } catch (e: any) { if (e?.status === 429) { return c.json({ error: e.message }, 429, { @@ -521,6 +706,7 @@ export function createCommentsApp() { let user: UserRow; try { user = await ensureUser(c, c.env); + await enforceArticleVoteRateLimits(c, c.env, user.id); } catch (e: any) { if (e?.status === 429) { return c.json({ error: e.message }, 429, { @@ -653,6 +839,7 @@ export function createCommentsApp() { let user: UserRow; try { user = await ensureUser(c, c.env); + await enforceArticleVoteRateLimits(c, c.env, user.id); } catch (e: any) { if (e?.status === 429) { return c.json({ error: e.message }, 429, { diff --git a/src/worker/identity.ts b/src/worker/identity.ts index b7748d3..ce694cf 100644 --- a/src/worker/identity.ts +++ b/src/worker/identity.ts @@ -20,6 +20,7 @@ export interface Identity { export async function hallucinateIdentity( apiKey: string, + apiUrl: string | undefined, model: string ): Promise { const body = { @@ -33,7 +34,7 @@ export async function hallucinateIdentity( ], }; - const res = await fetch("https://openrouter.ai/api/v1/chat/completions", { + const res = await fetch(chatCompletionsUrl(apiUrl), { method: "POST", headers: { "Content-Type": "application/json", @@ -50,6 +51,13 @@ export async function hallucinateIdentity( return parseIdentity(raw); } +function chatCompletionsUrl(apiUrl: string | undefined): string { + const trimmed = apiUrl?.trim(); + return trimmed && trimmed.length > 0 + ? trimmed + : "https://openrouter.ai/api/v1/chat/completions"; +} + export function parseIdentity(raw: string): Identity { const cleaned = raw .trim() diff --git a/src/worker/index.ts b/src/worker/index.ts index b601aea..2c91fc3 100644 --- a/src/worker/index.ts +++ b/src/worker/index.ts @@ -17,6 +17,8 @@ import { countRecentBansByIp, enqueueArticleForModeration, isSlugBanned, + isTitleModerationApproved, + topicRejectedMessage, runSweep, } from "./moderation"; import { createAdminApp } from "./admin"; @@ -32,6 +34,7 @@ export interface Env { DB: D1Database; ASSETS: Fetcher; OPENROUTER_API_KEY: string; + LLM_API_URL?: string; OPENROUTER_MODEL: string; OPENROUTER_MODERATION_MODEL?: string; MAX_ARTICLES_PER_DAY: string; @@ -150,6 +153,35 @@ async function backfillTotal(env: Env): Promise { return count; } +async function filterModeratedIndexItems( + db: D1Database, + items: { slug: string; title: string; generatedAt: number | null }[] +): Promise<{ slug: string; title: string; generatedAt: number | null }[]> { + if (items.length === 0) return items; + + const safeItems = items.filter((it) => !isPermanentlyBlockedSlug(it.slug)); + if (safeItems.length === 0) return []; + + const slugs = safeItems.map((it) => it.slug); + const placeholders = slugs.map(() => "?").join(","); + try { + const { results } = await db + .prepare( + `SELECT slug FROM article_moderation + WHERE status IN ('banned', 'pending', 'checking') + AND slug IN (${placeholders})` + ) + .bind(...slugs) + .all<{ slug: string }>(); + if (!results || results.length === 0) return safeItems; + const blocked = new Set(results.map((r) => r.slug)); + return safeItems.filter((it) => !blocked.has(it.slug)); + } catch (e) { + console.error("index: moderation filter failed", e); + throw e; + } +} + app.get("/api/index", async (c) => { const cursorRaw = c.req.query("cursor"); const cursor = cursorRaw && cursorRaw.length > 0 ? cursorRaw : undefined; @@ -170,6 +202,21 @@ app.get("/api/index", async (c) => { title: k.metadata?.title ?? slugToTitle(k.name), generatedAt: k.metadata?.generatedAt ?? null, })); + let filteredItems = items; + try { + filteredItems = await filterModeratedIndexItems(c.env.DB, items); + } catch { + return c.json( + { + error: "index temporarily unavailable", + items: [], + cursor: list.list_complete ? null : ((list as any).cursor ?? null), + complete: false, + total: null, + }, + 503 + ); + } // Total is only computed on the first page request — subsequent paginated // calls don't need it, and it costs an extra KV read (or full sweep). @@ -182,14 +229,14 @@ app.get("/api/index", async (c) => { total = await backfillTotal(c.env); } // If this first page is the entire dataset, opportunistically reconcile. - if (list.list_complete && total !== items.length) { - total = items.length; + if (list.list_complete && total !== filteredItems.length) { + total = filteredItems.length; try { await c.env.ARTICLES.put(TOTAL_KEY, String(total)); } catch {} } } return c.json({ - items, + items: filteredItems, cursor: list.list_complete ? null : (list as any).cursor ?? null, complete: list.list_complete, total, @@ -380,6 +427,7 @@ app.get("/api/search", async (c) => { // leaves us with enough. const titles = await hallucinateSearchTitles( c.env.OPENROUTER_API_KEY, + c.env.LLM_API_URL, c.env.OPENROUTER_MODERATION_MODEL || c.env.OPENROUTER_MODEL || "google/gemini-2.5-flash-lite", @@ -586,7 +634,23 @@ app.get("/api/page/:slug", async (c) => { ); } - // 4. Daily soft cap (per-namespace counter). + if (!c.env.OPENROUTER_API_KEY) { + return c.json({ error: "OPENROUTER_API_KEY is not configured" }, 500); + } + + const title = slugToTitle(slug); + + // 6. Pre-generation content policy check via moderation model. + const approvedByPolicy = await isTitleModerationApproved(title, c.env); + if (!approvedByPolicy) { + return c.json( + { error: topicRejectedMessage(), banned: true }, + 403, + { "x-robots-tag": "noindex" } + ); + } + + // 7. Daily soft cap (per-namespace counter). const today = new Date().toISOString().slice(0, 10); const counterKey = `__counter:${today}`; const countStr = await c.env.ARTICLES.get(counterKey); @@ -596,10 +660,6 @@ app.get("/api/page/:slug", async (c) => { return c.json({ error: "daily generation cap reached; try again tomorrow" }, 503); } - if (!c.env.OPENROUTER_API_KEY) { - return c.json({ error: "OPENROUTER_API_KEY is not configured" }, 500); - } - // 3. Fetch source context if `from` is present. let sourceContext: GenerateOptions["sourceContext"] = null; if (fromSlug) { @@ -612,8 +672,6 @@ app.get("/api/page/:slug", async (c) => { } } - const title = slugToTitle(slug); - // Pull every prior link-context blurb other articles have written about // this slug. These become CANON the LLM must respect. let priorHints: string[] = []; @@ -625,6 +683,7 @@ app.get("/api/page/:slug", async (c) => { const genOpts: GenerateOptions = { apiKey: c.env.OPENROUTER_API_KEY, + apiUrl: c.env.LLM_API_URL, model: c.env.OPENROUTER_MODEL || "google/gemini-2.0-flash-001", title, slug, diff --git a/src/worker/llm.ts b/src/worker/llm.ts index a808468..d59eaa6 100644 --- a/src/worker/llm.ts +++ b/src/worker/llm.ts @@ -17,6 +17,7 @@ RULES: export interface GenerateOptions { apiKey: string; + apiUrl?: string; model: string; title: string; slug: string; @@ -74,7 +75,7 @@ export async function streamGeneration(opts: GenerateOptions): Promise ""); - throw new Error(`OpenRouter error ${res.status}: ${errText.slice(0, 300)}`); + throw new Error(`LLM provider error ${res.status}: ${errText.slice(0, 300)}`); } const decoder = new TextDecoder(); @@ -154,6 +155,7 @@ Reply with ONLY a JSON array of N strings. No prose, no code fences, no explanat * fewer if the model misbehaves. Never throws. */ export async function hallucinateSearchTitles( apiKey: string, + apiUrl: string | undefined, model: string, query: string, count: number @@ -161,7 +163,7 @@ export async function hallucinateSearchTitles( const userMsg = `Search query: "${query}"\n\nReturn a JSON array of exactly ${count} plausible Halupedia titles inspired by this query. No commentary.`; let raw = ""; try { - const res = await fetch("https://openrouter.ai/api/v1/chat/completions", { + const res = await fetch(chatCompletionsUrl(apiUrl), { method: "POST", headers: { "Content-Type": "application/json", @@ -205,7 +207,7 @@ export async function hallucinateSearchTitles( * Non-streaming fallback (used for retry on malformed output). */ export async function generateOnce(opts: GenerateOptions): Promise { - const res = await fetch("https://openrouter.ai/api/v1/chat/completions", { + const res = await fetch(chatCompletionsUrl(opts.apiUrl), { method: "POST", headers: { "Content-Type": "application/json", @@ -224,7 +226,14 @@ export async function generateOnce(opts: GenerateOptions): Promise { ], }), }); - if (!res.ok) throw new Error(`OpenRouter error ${res.status}`); + if (!res.ok) throw new Error(`LLM provider error ${res.status}`); const json: any = await res.json(); return json?.choices?.[0]?.message?.content ?? ""; } + +function chatCompletionsUrl(apiUrl: string | undefined): string { + const trimmed = apiUrl?.trim(); + return trimmed && trimmed.length > 0 + ? trimmed + : "https://openrouter.ai/api/v1/chat/completions"; +} diff --git a/src/worker/moderation.ts b/src/worker/moderation.ts index df77ae1..90a6bc8 100644 --- a/src/worker/moderation.ts +++ b/src/worker/moderation.ts @@ -1,3 +1,5 @@ +import { containsDeterministicDisallowedAbuse } from "./abuse"; + /** * Content moderation. Two trigger paths: * @@ -19,6 +21,7 @@ export interface ModerationEnv { DB: D1Database; ARTICLES: KVNamespace; OPENROUTER_API_KEY: string; + LLM_API_URL?: string; OPENROUTER_MODEL: string; OPENROUTER_MODERATION_MODEL?: string; } @@ -73,6 +76,32 @@ interface JudgeItem { text: string; } +export function topicRejectedMessage(): string { + return "this topic was rejected by moderation"; +} + +/** + * Synchronous pre-generation title check used by `/api/page/:slug`. + * + * The broader moderation pipeline is async and fail-open, but this gate is + * intentionally in the hot path for fresh article generation so obvious abuse + * does not spend article-generation tokens or enter KV. If the moderation + * model itself fails, we still fail open and let the existing sweep catch + * anything borderline later. + */ +export async function isTitleModerationApproved( + title: string, + env: ModerationEnv +): Promise { + if (containsDeterministicDisallowedAbuse(title)) return false; + const rejected = await judgeBatch( + [{ index: 1, text: title }], + "article title", + env + ); + return !rejected.has(1); +} + /* -------------------------------------------------------------------------- */ /* Deterministic comment-spam detector */ /* -------------------------------------------------------------------------- */ @@ -113,6 +142,7 @@ const ENGAGEMENT_BAIT_PHRASES = [ "informative article", ]; export function isObviousCommentSpam(body: string): boolean { + if (containsDeterministicDisallowedAbuse(body)) return true; const trimmed = body.trim(); if (SPAM_FINGERPRINT.test(trimmed)) return true; // Strip emojis/punctuation and check if what remains is ONLY a bait phrase. @@ -146,11 +176,19 @@ async function judgeBatch( ): Promise> { if (items.length === 0) return new Set(); - const numbered = items + const out = new Set(); + for (const item of items) { + if (containsDeterministicDisallowedAbuse(item.text)) out.add(item.index); + } + + const modelItems = items.filter((item) => !out.has(item.index)); + if (modelItems.length === 0) return out; + + const numbered = modelItems .map((it) => `${it.index}. ${it.text.replace(/\s+/g, " ").slice(0, 500)}`) .join("\n"); - const userMsg = `Review the following ${items.length} ${kind}${items.length === 1 ? "" : "s"} and return the JSON array of 1-based indices to remove (or [] if all are acceptable):\n\n${numbered}`; + const userMsg = `Review the following ${modelItems.length} ${kind}${modelItems.length === 1 ? "" : "s"} and return the JSON array of 1-based indices to remove (or [] if all are acceptable):\n\n${numbered}`; const model = env.OPENROUTER_MODERATION_MODEL || @@ -159,7 +197,7 @@ async function judgeBatch( let raw = ""; try { - const res = await fetch("https://openrouter.ai/api/v1/chat/completions", { + const res = await fetch(chatCompletionsUrl(env.LLM_API_URL), { method: "POST", headers: { "Content-Type": "application/json", @@ -199,8 +237,7 @@ async function judgeBatch( } if (!Array.isArray(arr)) return new Set(); - const valid = new Set(items.map((it) => it.index)); - const out = new Set(); + const valid = new Set(modelItems.map((it) => it.index)); for (const v of arr) { const n = typeof v === "number" ? v : parseInt(String(v), 10); if (Number.isFinite(n) && valid.has(n)) out.add(n); @@ -208,6 +245,13 @@ async function judgeBatch( return out; } +function chatCompletionsUrl(apiUrl: string | undefined): string { + const trimmed = apiUrl?.trim(); + return trimmed && trimmed.length > 0 + ? trimmed + : "https://openrouter.ai/api/v1/chat/completions"; +} + /* -------------------------------------------------------------------------- */ /* Per-write enqueue helpers */ /* -------------------------------------------------------------------------- */ diff --git a/wrangler.toml b/wrangler.toml index 062fb2c..acc16ee 100644 --- a/wrangler.toml +++ b/wrangler.toml @@ -37,12 +37,18 @@ migrations_dir = "migrations" [vars] OPENROUTER_MODEL = "google/gemini-2.5-flash-lite" +# Optional OpenAI-compatible chat completions endpoint override. Leave unset +# for OpenRouter; set locally in .dev.vars to use z.ai or another compatible +# provider without changing code. +# LLM_API_URL = "https://api.z.ai/api/coding/paas/v4/chat/completions" MAX_ARTICLES_PER_DAY = "5000" # Per-IP guards against UA-spoofing scrapers. Bypassable via IP rotation, # but the global daily cap above is the hard backstop. GEN_PER_IP_PER_HOUR = "100" IDENT_PER_IP_PER_HOUR = "10" SEARCH_PER_IP_PER_HOUR = "15" +COMMENT_PER_IP_PER_HOUR = "90" +COMMENT_PER_IP_PER_MINUTE = "15" # --- Cloudflare Turnstile bot gating --------------------------------------- # Activates ONLY when all three secrets below are set. With them missing,