diff --git a/url/src/parser.rs b/url/src/parser.rs index dbdf9b906..3f1780b4b 100644 --- a/url/src/parser.rs +++ b/url/src/parser.rs @@ -378,6 +378,17 @@ impl Parser<'_> { /// https://url.spec.whatwg.org/#concept-basic-url-parser pub fn parse_url(mut self, input: &str) -> ParseResult { let input = Input::new_trim_c0_control_and_space(input, self.violation_fn); + + // WHATWG URL spec change (whatwg/url#874): in scheme start state, + // when the parser sees `<letter>:\`, it sets scheme to "file", + // host to empty, and transitions to path state. The single-letter + // "scheme" + `:\` is unambiguously a Windows drive path, not a URL. + // Forward-slash drive paths (`c:/foo`) are NOT covered by the spec + // change — those remain valid scheme URLs (e.g. `c:` scheme). + if starts_with_windows_drive_letter_path(&input) { + return self.parse_windows_drive_letter_path(input); + } + if let Ok(remaining) = self.parse_scheme(input.clone()) { return self.parse_with_scheme(remaining); } @@ -401,6 +412,38 @@ impl Parser<'_> { } } + /// Per WHATWG URL spec change (whatwg/url#874): handle a top-level input + /// starting with `<letter>:\` as a Windows drive letter file path. + /// Sets scheme to "file", host to empty string, and runs path state over + /// the original input. Path state treats `\` as a path separator for + /// special schemes, producing a path of the form `/C:/path/file`. 
+ fn parse_windows_drive_letter_path(mut self, input: Input<'_>) -> ParseResult { + debug_assert!(self.serialization.is_empty()); + self.serialization.push_str("file://"); + let scheme_end = "file".len() as u32; + let host_start = "file://".len() as u32; + let host_end = host_start; + let host = HostInternal::None; + + let remaining = self.parse_path(SchemeType::File, &mut false, host_end as usize, input); + + let (query_start, fragment_start) = + self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?; + + Ok(Url { + serialization: self.serialization, + scheme_end, + username_end: host_start, + host_start, + host_end, + host, + port: None, + path_start: host_end, + query_start, + fragment_start, + }) + } + pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result, ()> { // starts_with will also fail for empty strings so we can skip that comparison for perf if !input.starts_with(ascii_alpha) { @@ -1795,6 +1838,19 @@ fn starts_with_windows_drive_letter(s: &str) -> bool { && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#')) } +/// Detect the Windows drive letter file-path shape `<letter>:\` at the start +/// of a top-level URL parser input. Per WHATWG URL spec change (whatwg/url#874), +/// this pattern is treated as a Windows drive path (file URL) rather than a +/// single-letter scheme. Forward-slash drive paths are intentionally not +/// matched: `c:/foo` is a valid `c:` scheme URL and must not be rewritten. 
+fn starts_with_windows_drive_letter_path(input: &Input<'_>) -> bool { + let mut iter = input.clone(); + matches!( + (iter.next(), iter.next(), iter.next()), + (Some(a), Some(':'), Some('\\')) if ascii_alpha(a) + ) +} + /// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool { let mut input = input.clone(); diff --git a/url/tests/unit.rs b/url/tests/unit.rs index 828f79756..fb00ad872 100644 --- a/url/tests/unit.rs +++ b/url/tests/unit.rs @@ -1392,3 +1392,98 @@ fn test_parse_url_with_single_byte_control_host() { let url2 = Url::parse(url1.as_str()).unwrap(); assert_eq!(url2, url1); } + +// WHATWG URL spec change (whatwg/url#874): Windows drive letter file paths. +// `<letter>:\` at the start of input is treated as a Windows drive path and +// parsed into a `file:///<letter>:/...` URL. + +#[test] +fn windows_drive_path_basic() { + let url = Url::parse(r"C:\path\file.txt").unwrap(); + assert_eq!(url.as_str(), "file:///C:/path/file.txt"); + assert_eq!(url.scheme(), "file"); + assert_eq!(url.host(), None); + assert_eq!(url.path(), "/C:/path/file.txt"); +} + +#[test] +fn windows_drive_path_different_drives() { + assert_eq!( + Url::parse(r"D:\foo\bar.exe").unwrap().as_str(), + "file:///D:/foo/bar.exe" + ); + assert_eq!( + Url::parse(r"Z:\deep\nested\path.rs").unwrap().as_str(), + "file:///Z:/deep/nested/path.rs" + ); +} + +#[test] +fn windows_drive_path_preserves_drive_case() { + // Per spec: drive letter case is preserved. + assert_eq!( + Url::parse(r"c:\folder\file.txt").unwrap().as_str(), + "file:///c:/folder/file.txt" + ); + assert_eq!( + Url::parse(r"C:\folder\file.txt").unwrap().as_str(), + "file:///C:/folder/file.txt" + ); +} + +#[test] +fn windows_drive_path_mixed_separators() { + // Forward slashes in the body are equivalent to backslashes for special schemes. 
+ assert_eq!( + Url::parse(r"C:\path/mixed\separators/file.txt") + .unwrap() + .as_str(), + "file:///C:/path/mixed/separators/file.txt" + ); +} + +#[test] +fn windows_drive_path_percent_encodes_spaces() { + assert_eq!( + Url::parse(r"C:\path with space\file.txt").unwrap().as_str(), + "file:///C:/path%20with%20space/file.txt" + ); +} + +#[test] +fn windows_drive_path_drops_base() { + // The Windows-drive shortcut at top level ignores any base URL — the input + // is unambiguously absolute (file scheme, empty host). + let base = Url::parse("http://example.org/").unwrap(); + let url = Url::options() + .base_url(Some(&base)) + .parse(r"C:\path\file.node") + .unwrap(); + assert_eq!(url.as_str(), "file:///C:/path/file.node"); + assert_eq!(url.scheme(), "file"); +} + +#[test] +fn windows_drive_path_with_query_and_fragment() { + let url = Url::parse(r"C:\path\file.txt?q=1#frag").unwrap(); + assert_eq!(url.as_str(), "file:///C:/path/file.txt?q=1#frag"); + assert_eq!(url.query(), Some("q=1")); + assert_eq!(url.fragment(), Some("frag")); +} + +// Regression guard: `letter:/...` and `letter://...` shapes must NOT be +// rewritten to `file:///` — these are scheme URLs (single-letter schemes are +// valid per RFC 3986), not Windows drive paths. The spec change is scoped to +// the `<letter>:\` (backslash) shape only. +#[test] +fn windows_drive_path_does_not_rewrite_scheme_urls() { + assert_eq!( + Url::parse("a://example.net").unwrap().as_str(), + "a://example.net" + ); + assert_eq!(Url::parse("h://.").unwrap().as_str(), "h://."); + assert_eq!(Url::parse("w://x:0").unwrap().as_str(), "w://x:0"); + // `c:/foo` is a `c:` scheme URL, not a Windows drive path. Forward-slash + // drive paths remain untouched and require explicit `file:///C:/foo`. + assert_eq!(Url::parse("c:/foo").unwrap().as_str(), "c:/foo"); +}