diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 08990831fe81a..5a859d05598ed 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -1075,6 +1075,9 @@ other			.
 
 					SET_YYLLOC();
 
+					/* Reject identifiers containing Unicode whitespace */
+					check_ident_for_unicode_whitespace(yytext, yyleng);
+
 					/* Is it a keyword? */
 					kwnum = ScanKeywordLookup(yytext,
 											  yyextra->keywordlist);
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 2feb2b6cf5a96..8b627f0b0ab92 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -16,6 +16,7 @@
 
 #include <ctype.h>
 
+#include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 #include "parser/scansup.h"
 
@@ -104,6 +105,71 @@ truncate_identifier(char *ident, int len, bool warn)
 	}
 }
 
+/*
+ * check_ident_for_unicode_whitespace() --- reject identifiers containing
+ * Unicode whitespace.
+ *
+ * The flex scanner's identifier rules use byte ranges (\200-\377) that match
+ * any non-ASCII byte, including bytes that form multi-byte Unicode whitespace
+ * characters like NO-BREAK SPACE (U+00A0).  This creates a "Trojan Source"
+ * vulnerability where queries can be visually deceptive:
+ *
+ *		SELECT password is<U+00A0>null FROM users;
+ *
+ * (with a real NO-BREAK SPACE where <U+00A0> is shown) looks like
+ * "password IS NULL" but parses as password aliased to a single identifier,
+ * leaking the password value.  This function detects such identifiers and
+ * rejects them with a syntax error.
+ *
+ * Only the first "len" bytes of "ident" are examined.  Only characters with
+ * the Unicode White_Space property are rejected; other invisible characters
+ * are not detected here.
+ *
+ * The check is performed only when the database encoding is UTF-8.  Other
+ * encodings are left unchecked even though some of them can also represent
+ * a no-break space (for example, byte 0xA0 in LATIN1), which the scanner
+ * likewise accepts as part of an identifier.
+ */
+void
+check_ident_for_unicode_whitespace(const char *ident, int len)
+{
+	int			encoding = GetDatabaseEncoding();
+	int			i;
+
+	/* utf8_to_unicode() below needs UTF-8 input; other encodings are skipped */
+	if (encoding != PG_UTF8)
+		return;
+
+	for (i = 0; i < len;)
+	{
+		unsigned char ch = (unsigned char) ident[i];
+
+		if (IS_HIGHBIT_SET(ch))
+		{
+			int			mblen = pg_mblen(&ident[i]);
+			pg_wchar	uchar;
+
+			/* Ensure we don't read past the end */
+			if (i + mblen > len)
+				break;
+
+			uchar = utf8_to_unicode((const unsigned char *) &ident[i]);
+
+			if (pg_u_prop_white_space(uchar))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("identifier contains Unicode whitespace character U+%04X",
+								(unsigned int) uchar),
+						 errdetail("Unicode whitespace characters are not allowed in identifiers because they are visually indistinguishable from regular spaces."),
+						 errhint("Remove or replace the Unicode whitespace character.")));
+
+			i += mblen;
+		}
+		else
+			i++;
+	}
+}
+
 /*
  * scanner_isspace() --- return true if flex scanner considers char whitespace
  *
diff --git a/src/include/parser/scansup.h b/src/include/parser/scansup.h
index 8f3a9f4c527bd..4fc26b6743904 100644
--- a/src/include/parser/scansup.h
+++ b/src/include/parser/scansup.h
@@ -22,6 +22,8 @@ extern char *downcase_identifier(const char *ident, int len,
 
 extern void truncate_identifier(char *ident, int len, bool warn);
 
+extern void check_ident_for_unicode_whitespace(const char *ident, int len);
+
 extern bool scanner_isspace(char ch);
 
 #endif							/* SCANSUP_H */
diff --git a/src/test/regress/expected/unicode.out b/src/test/regress/expected/unicode.out
index 1e06de2264912..6ee12be2753da 100644
--- a/src/test/regress/expected/unicode.out
+++ b/src/test/regress/expected/unicode.out
@@ -105,3 +105,26 @@ ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
 ERROR:  invalid normalization form: def
+-- Test that Unicode whitespace in unquoted identifiers is rejected.
+-- This prevents "Trojan Source" attacks where visually identical queries
+-- parse with different semantics.
+-- Normal identifiers with non-Latin letters should still work:
+SELECT 1 AS тест;
+ тест 
+------
+    1
+(1 row)
+
+-- U+00A0 NO-BREAK SPACE via Unicode escape in a string (should work as data):
+SELECT U&'\00A0' = ' ' AS nbsp_is_not_regular_space;
+ nbsp_is_not_regular_space 
+---------------------------
+ f
+(1 row)
+
+-- The following line contains U+00A0 (NBSP) between "is" and "null".
+-- It should produce an error about Unicode whitespace in identifiers.
+SELECT 1 is null;
+ERROR:  identifier contains Unicode whitespace character U+00A0
+DETAIL:  Unicode whitespace characters are not allowed in identifiers because they are visually indistinguishable from regular spaces.
+HINT:  Remove or replace the Unicode whitespace character.
diff --git a/src/test/regress/sql/unicode.sql b/src/test/regress/sql/unicode.sql
index e50adb68ed0d5..0e03575a1c49a 100644
--- a/src/test/regress/sql/unicode.sql
+++ b/src/test/regress/sql/unicode.sql
@@ -36,3 +36,17 @@ FROM
 ORDER BY num;
 
 SELECT is_normalized('abc', 'def');  -- run-time error
+
+-- Test that Unicode whitespace in unquoted identifiers is rejected.
+-- This prevents "Trojan Source" attacks where visually identical queries
+-- parse with different semantics.
+
+-- Normal identifiers with non-Latin letters should still work:
+SELECT 1 AS тест;
+
+-- U+00A0 NO-BREAK SPACE via Unicode escape in a string (should work as data):
+SELECT U&'\00A0' = ' ' AS nbsp_is_not_regular_space;
+
+-- The following line contains U+00A0 (NBSP) between "is" and "null".
+-- It should produce an error about Unicode whitespace in identifiers.
+SELECT 1 is null;