Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,61 @@ plugins {
alias(libs.plugins.kover) apply false
alias(libs.plugins.binary.compat) apply false
}

// --- Fixture & lookup-table code generation ----------------------------------
// Thin Gradle wrappers over the offline Python generators under tools/, so every fixture and
// lookup table regenerates via `./gradlew` instead of a bare `python3` invocation. They read
// reference corpora vendored under tools/ and (for the IDNA tables) the untracked Unicode/WPT
// data under .claude/references/, so they are developer tooling: grouped under "codegen" and
// deliberately NOT wired into `check`/`build`. Override the interpreter with `-Ppython=…`.
// Interpreter used to launch the generators: the `python` project property when set, else `python3`.
val pythonInterpreter: String =
    when (val requested = findProperty("python") as String?) {
        null -> "python3"
        else -> requested
    }

// One entry per generator: first = Gradle task name, second = script path (run from the root
// project directory), third = task description.
val generators: List<Triple<String, String, String>> =
    listOf(
        Triple(
            first = "generateUrlTestData",
            second = "tools/url/generate_urltestdata_fixture.py",
            third = "Regenerate the WPT urltestdata conformance fixture from tools/url/urltestdata.json.",
        ),
        Triple(
            first = "generateIdnaMappingTable",
            second = "tools/idna/generate_idna_mapping_table.py",
            third = "Regenerate the UTS-46 IDNA mapping-table data from the bundled Unicode UCD.",
        ),
        Triple(
            first = "generateIdnaValidityTables",
            second = "tools/idna/generate_idna_validity_tables.py",
            third = "Regenerate the UTS-46 IDNA label-validity table data from the bundled Unicode UCD.",
        ),
        Triple(
            first = "generateNfcTables",
            second = "tools/idna/generate_nfc_tables.py",
            third = "Regenerate the NFC normalization table data from the bundled Unicode UCD.",
        ),
        Triple(
            first = "generateNfcTestFixture",
            second = "tools/idna/generate_nfc_test_fixture.py",
            third = "Regenerate the NFC conformance fixture from Unicode NormalizationTest.txt.",
        ),
        Triple(
            first = "generateIdnaConformanceFixture",
            second = "tools/idna/generate_conformance_fixture.py",
            third = "Regenerate the IDNA conformance fixture from the WPT IdnaTestV2 and toascii corpora.",
        ),
    )

// Registers one Exec task per generator entry; each task runs its script with the selected
// interpreter, using the root project directory as the working directory.
val codegenTasks: List<TaskProvider<Exec>> =
    generators.map { generator ->
        val (generatorName, scriptPath, purpose) = generator
        tasks.register<Exec>(generatorName) {
            description = purpose
            group = "codegen"
            workingDir = rootDir
            commandLine(pythonInterpreter, scriptPath)
        }
    }

// Aggregate entry point: depends on every registered generator task, so running it runs them all.
tasks.register("generateFixtures") {
    dependsOn(codegenTasks)
    description = "Run every Python fixture and lookup-table generator under tools/."
    group = "codegen"
}
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,14 @@ internal object HostParser {
}

/**
* The special-scheme domain pipeline (§7.4): UTF-8 percent-decode, run UTS-46 ToASCII, then
* classify the ASCII domain. A ToASCII failure propagates unchanged ([HOST-26]).
* The special-scheme domain pipeline (§7.4): UTF-8 percent-decode, run the WHATWG "domain to
* ASCII" wrapper ([Idna.domainToAsciiForUrl] — UTS-46 with the ASCII fast-path and empty-result
* rule), then classify the ASCII domain. A domain-to-ASCII failure propagates unchanged ([HOST-26]).
*/
private fun parseSpecialDomain(input: String): ParseResult<Host> {
require(input.isNotEmpty()) { "empty domain reached the IDNA pipeline" }
val decoded = PercentCodec.decode(input)
return when (val ascii = Idna.domainToAscii(decoded)) {
return when (val ascii = Idna.domainToAsciiForUrl(decoded)) {
is ParseResult.Err -> ascii
is ParseResult.Ok -> classifyAsciiDomain(ascii.value)
}
Expand Down
41 changes: 41 additions & 0 deletions kuri/src/commonMain/kotlin/org/dexpace/kuri/idna/Idna.kt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ private const val LABEL_SEPARATOR: String = "."
/** First non-ASCII code point; a label with any point `>=` this needs ACE encoding. */
private const val NON_ASCII_MIN: Int = 0x80

/** Distance from an uppercase ASCII letter to its lowercase counterpart (`'A'` -> `'a'`). */
private const val ASCII_CASE_OFFSET: Int = 0x20

/** Largest Unicode scalar value (U+10FFFF). */
private const val MAX_CODE_POINT: Int = 0x10FFFF

Expand Down Expand Up @@ -84,6 +87,35 @@ internal object Idna {
return processLabels(labels, domain).map { it.joinToString(LABEL_SEPARATOR) }
}

/**
 * The URL-layer "domain to ASCII" step for the `Url` profile (`beStrict = false`), layered on top of
 * the pure UTS-46 [domainToAscii] (SPEC §7.4, [HOST-26]).
 *
 * The path taken depends on whether [domain] contains any non-ASCII code unit:
 * - **All-ASCII:** the input is ASCII-lowercased and otherwise returned verbatim, without running
 *   UTS-46. Per the standard, Unicode ToASCII failures on an ASCII domain are validation errors
 *   only — never fatal — for web compatibility, so an invalid `xn--` label such as `xn--pokxncvks`
 *   is kept as-is. (An `IgnoreInvalidPunycode` flag alone would not suffice, because Punycode can
 *   decode and still fail a later validity check.)
 * - **Non-ASCII:** the full UTS-46 pipeline runs via [domainToAscii]; its failure is returned
 *   unchanged.
 *
 * On either path, a result that collapses to the empty string (e.g. a lone soft hyphen, which maps
 * to nothing) is a failure: "if result is the empty string, return failure". The residual
 * forbidden-code-point check is left to the host classifier and is not performed here.
 *
 * @param domain the percent-decoded, assumed-NFC domain text.
 * @return [ParseResult.Ok] holding the ASCII domain, or [ParseResult.Err] when domain-to-ASCII fails.
 */
internal fun domainToAsciiForUrl(domain: String): ParseResult<String> {
    if (!isAsciiString(domain)) {
        // Non-ASCII input: run full UTS-46; only a successful result still needs the empty check.
        return when (val converted = domainToAscii(domain)) {
            is ParseResult.Err -> converted
            is ParseResult.Ok -> {
                val ascii = converted.value
                if (ascii.isEmpty()) idnaError(domain) else ParseResult.Ok(ascii)
            }
        }
    }
    // ASCII fast path: lowercase only. An empty input stays empty here and is therefore rejected.
    val lowered = asciiLowercase(domain)
    return if (lowered.isEmpty()) idnaError(domain) else ParseResult.Ok(lowered)
}

/**
* Runs the inverse display transform (UTS-46 ToUnicode) over [domain]: map, (deferred) NFC,
* then Punycode-decode every `xn--` label. Best-effort — validity failures are non-fatal, so a
Expand Down Expand Up @@ -234,6 +266,15 @@ internal object Idna {
private fun idnaError(domain: String): ParseResult.Err {
    // Every IDNA failure is reported as an invalid-host error that carries the offending domain text.
    val cause = UriParseError.InvalidHost(domain, HostError.IdnaFailed)
    return ParseResult.Err(cause)
}

/**
 * Reports whether [s] is pure ASCII, i.e. no UTF-16 code unit is `0x80` or above (vacuously true
 * for the empty string). An ASCII domain skips UTS-46 ToASCII.
 */
private fun isAsciiString(s: String): Boolean = s.none { unit -> unit.code >= NON_ASCII_MIN }

/**
 * Returns a copy of [s] in which each `A`–`Z` code unit is shifted to `a`–`z`; every other code
 * unit is copied through unchanged.
 */
private fun asciiLowercase(s: String): String {
    val lowered =
        CharArray(s.length) { index ->
            val unit = s[index]
            if (unit in 'A'..'Z') unit + ASCII_CASE_OFFSET else unit
        }
    return lowered.concatToString()
}

/** Splits [input] into Unicode code points, combining well-formed surrogate pairs. */
private fun codePointsOf(input: String): List<Int> {
val result = ArrayList<Int>(input.length)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ private const val KEY_SEPARATOR: String = "\u0000"
* Reconstruction mirrors the WHATWG component serializers (§11) for the getters under test: a path
* as the concatenation of `"/" + segment` (so the empty list renders `""` and the special root
* renders `"/"`), and a `search`/`hash` as `?`/`#` plus the value, with absent or present-but-empty
* both collapsing to `""` (the getter cannot tell them apart). The only residual divergence is the
* deferred IDNA host validity tracked in [KNOWN_FAILURES]. The suite ratchets in both directions: a
* brand-new failure breaks the untracked-regressions test, and a fixed gap breaks the
* baseline-equality test until the baseline is updated.
* both collapsing to `""` (the getter cannot tell them apart). There is no residual divergence:
* [KNOWN_FAILURES] is empty and the suite is at full conformance. It still ratchets in both
* directions — a brand-new failure breaks the untracked-regressions test, and a regression that
* repopulates the live set breaks the baseline-equality test until the baseline is updated.
*/
@Suppress("TooManyFunctions") // small single-purpose reconstruction helpers plus the four ratchet tests.
class UrlConformanceTest {
Expand Down Expand Up @@ -139,32 +139,27 @@ class UrlConformanceTest {
}

@Test
fun `the corpus and known-failures set are non-trivially populated`() {
assertTrue(URL_TEST_CASES.size > KNOWN_FAILURES.size * 2, "passing cases should dwarf known failures")
assertTrue(KNOWN_FAILURES.isNotEmpty(), "deferred corners should yield a tracked baseline")
fun `the corpus is substantial and fully conformant`() {
    // Guards two things: the WPT corpus was actually loaded at a meaningful size, and no case in it
    // currently fails.
    assertTrue(
        URL_TEST_CASES.size > MIN_CORPUS_SIZE,
        "the WPT corpus should be substantial: ${URL_TEST_CASES.size}",
    )
    // The message argument is built eagerly, so evaluate liveFailingKeys() once and reuse it: the
    // set reported on failure is then exactly the set that was asserted on.
    val failingKeys = liveFailingKeys()
    assertTrue(failingKeys.isEmpty(), "every WPT case must parse per spec; failing: $failingKeys")
}

private companion object {
/** The corpus-size floor for [the corpus is substantial and fully conformant]. */
private const val MIN_CORPUS_SIZE: Int = 500

/**
* The tracked baseline of currently-failing case keys (`input + U+0000 + base`). The single
* remaining category is a deferred dependency, not a §8 state-machine bug:
* - **deferred IDNA host validity (6):** a host that UTS-46 rejects (a soft-hyphen label
* that maps to empty, an invalid `xn--` A-label) is accepted because the same UTS-46
* validity steps deferred in `IdnaConformanceTest` are not yet applied, so the parse
* succeeds where WPT requires failure.
*
* None reflect a wrong parsed component; closing the IDNA-validity gap will empty this set,
* which the ratcheting `the known-failures set exactly equals the live failing set` enforces.
* The tracked baseline of currently-failing case keys (`input + U+0000 + base`). It is empty:
* the live parser now satisfies every in-scope WPT case under the `Url` profile. The former
* deferred IDNA host-validity residual (an invalid `xn--` label, a soft-hyphen host that maps
* to empty) closed when the WHATWG "domain to ASCII" wrapper landed -- an ASCII domain is kept
* verbatim and a domain that maps to the empty string fails. The ratcheting `the known-failures
* set exactly equals the live failing set` test now pins the suite at full conformance: any
* regression repopulates the live set and breaks the build until this baseline is updated.
*/
private val KNOWN_FAILURES: Set<String> =
setOf(
// deferred IDNA host validity (host accepted where UTS-46 requires rejection)
"file://\u00ad/p\u0000",
"file://%C2%AD/p\u0000",
"file://xn--/p\u0000",
"https://\u00ad/\u0000",
"https://%C2%AD/\u0000",
"https://xn--/\u0000",
)
private val KNOWN_FAILURES: Set<String> = emptySet()
}
}
Loading