Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 0 additions & 20 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ unicode-width = { version = "0.2", default-features = false }
url = { version = "2", default-features = false }
which = { version = "8", default-features = false }
windows-sys = { version = "0.61", default-features = false }
zip = { version = "2", default-features = false, features = ["bzip2"] }
zip = { version = "2", default-features = false }

[patch.crates-io]
openai_responses = { git = "https://github.com/JeanMertz/openai-responses-rs" } # <https://github.com/m1guelpf/openai-responses-rs/pull/6>
Expand Down
6 changes: 5 additions & 1 deletion crates/contrib/bookworm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ jp_tool = { workspace = true }
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["std", "derive", "help"] }
convert_case = { workspace = true }
directories = { workspace = true }
htmd = { workspace = true }
indoc = { workspace = true }
reqwest = { workspace = true, features = ["json", "rustls-tls"] }
Expand All @@ -27,7 +28,10 @@ tokio = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true, features = ["env-filter", "fmt"] }
url = { workspace = true, features = ["serde"] }
zip = { workspace = true }
zip = { workspace = true, features = ["bzip2", "deflate"] }

[dev-dependencies]
camino-tempfile = { workspace = true }

[lints]
workspace = true
Expand Down
69 changes: 58 additions & 11 deletions crates/contrib/bookworm/src/dl.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use std::{
collections::HashSet,
env, fs,
fs,
future::Future,
io,
path::{Path, PathBuf},
};

use directories::ProjectDirs;
use reqwest::header::ETAG;
use url::Url;
use zip::ZipArchive;
Expand All @@ -14,6 +15,23 @@ use crate::error::Error;

const DOCS_RS: &str = "https://docs.rs";

/// Returns the directory under which crate documentation is cached when the
/// caller does not supply an explicit root.
///
/// The preferred location is the `crates` subdirectory of the per-user cache
/// directory reported by `ProjectDirs` for the `bookworm` project
/// (e.g. `~/Library/Caches/bookworm/crates` on macOS,
/// `~/.cache/bookworm/crates` on Linux). Keeping the cache there means
/// downloaded documentation survives reboots and is easy to find or delete.
///
/// If `ProjectDirs` cannot determine a user cache directory (e.g. when
/// `$HOME` is unset), the path `bookworm/crates` inside the system temp
/// directory is returned instead.
#[must_use]
pub fn default_crates_root() -> PathBuf {
    match ProjectDirs::from("", "", "bookworm") {
        // Normal case: nest the crate cache inside the user cache directory.
        Some(dirs) => dirs.cache_dir().join("crates"),
        // No resolvable user cache directory: fall back to the temp dir.
        None => std::env::temp_dir().join("bookworm/crates"),
    }
}

#[derive(Default)]
pub struct Config {
pub root: Option<PathBuf>,
Expand Down Expand Up @@ -97,7 +115,7 @@ pub async fn download(config: Config) -> Result<PathBuf, Error> {

let destination = config
.root
.unwrap_or_else(env::temp_dir)
.unwrap_or_else(default_crates_root)
.join(format!("{}/{version}/{etag}", config.crate_name));

if destination.is_dir() {
Expand All @@ -114,7 +132,7 @@ pub async fn download(config: Config) -> Result<PathBuf, Error> {
.await?;

unzip(&bytes, &destination)?;
sanitize(&destination, &config.crate_name)?;
sanitize(&destination)?;
rewrite_urls(&destination, &config.client).await?;

Ok(destination)
Expand Down Expand Up @@ -147,17 +165,42 @@ fn unzip(bytes: &[u8], destination: &Path) -> Result<(), Error> {
Ok(())
}

fn sanitize(path: &Path, crate_name: &str) -> Result<(), Error> {
// Some generated docsets contain more than the default platform. For now,
// it is OK to only parse the "main" platform and remove all the others
/// Remove auxiliary directories from a freshly-extracted docs.rs archive,
/// keeping only the default platform's docs.
///
/// docs.rs can ship multi-platform docsets, where each non-default platform
/// lives in a target-triple-named directory (`x86_64-unknown-linux-gnu/`,
/// `wasm32-unknown-unknown/`, …) that re-nests the full rustdoc layout.
/// The downstream indexer can't tell those apart from the real docs, so it
/// would produce bogus module paths if they remained.
///
/// Detection is structural rather than name-based: rustdoc places each
/// crate's HTML docs in a directory that has an `index.html` file directly
/// inside it. Target-triple wrapper directories don't — they only contain
/// nested crate directories. Keeping dirs that look like crate docs dirs
/// handles hyphenated crate names (`ra-ap-rustc_lexer` -> `ra_ap_rustc_lexer/`),
/// custom `[lib] name = "…"` declarations, and any other naming variation,
/// without needing to know the crate's lib name in advance.
///
/// `src/` and `implementors/` are kept by explicit allow-list — they're part
/// of the rustdoc layout but don't have a top-level `index.html`.
fn sanitize(path: &Path) -> Result<(), Error> {
for item in path.read_dir()? {
let item = item?;
if item.path().is_dir()
&& ![crate_name, "src", "implementors"]
.contains(&item.file_name().to_string_lossy().as_ref())
{
fs::remove_dir_all(item.path())?;
if !item.path().is_dir() {
continue;
}

let name = item.file_name();
if matches!(name.to_string_lossy().as_ref(), "src" | "implementors") {
continue;
}

if item.path().join("index.html").is_file() {
continue;
}

fs::remove_dir_all(item.path())?;
}

Ok(())
Expand Down Expand Up @@ -265,3 +308,7 @@ where

Ok(())
}

#[cfg(test)]
#[path = "dl_tests.rs"]
mod tests;
154 changes: 154 additions & 0 deletions crates/contrib/bookworm/src/dl_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
use std::fs;

use camino_tempfile::tempdir;

use super::*;

/// Create a mock docs.rs extraction tree beneath `root`.
///
/// `entries` is a list of `(dir, has_index)` pairs. For every pair the
/// directory `root/<dir>/` is created (including any missing parents), and
/// when `has_index` is true a placeholder `root/<dir>/index.html` is written
/// into it. The sanitize tests use this to check which directories survive
/// depending solely on whether an `index.html` is present.
///
/// Panics if a directory or file cannot be created.
fn populate(root: &Path, entries: &[(&str, bool)]) {
    for &(name, with_index) in entries {
        let target = root.join(name);
        fs::create_dir_all(&target).expect("create dir");

        if with_index {
            let index = target.join("index.html");
            fs::write(index, b"<html></html>").expect("write index.html");
        }
    }
}

/// A directory that directly contains `index.html` is treated as crate docs
/// and must be left untouched by `sanitize`.
#[test]
fn keeps_crate_docs_directory_with_index_html() {
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    populate(base, &[("serde_json", true)]);

    sanitize(base).expect("sanitize");

    let docs_dir = base.join("serde_json");
    assert!(docs_dir.is_dir());
    assert!(docs_dir.join("index.html").is_file());
}

#[test]
fn keeps_hyphenated_crate_docs_directory() {
    // Regression guard: the docs for `ra-ap-rustc_lexer` live under
    // `ra_ap_rustc_lexer/`, since cargo swaps `-` for `_` in the lib name.
    // The previous name-based sanitize removed that directory.
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let layout = [("ra_ap_rustc_lexer", true)];
    populate(base, &layout);

    sanitize(base).expect("sanitize");

    let docs_dir = base.join("ra_ap_rustc_lexer");
    assert!(
        docs_dir.is_dir(),
        "hyphenated-crate docs directory must be preserved"
    );
}

#[test]
fn keeps_directory_with_custom_lib_name() {
    // A crate published as `foo` can set `[lib] name = "fooz"`, and rustdoc
    // then writes its docs under `fooz/`. Because detection is structural,
    // only the presence of `index.html` matters, not the crates.io name.
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let layout = [("fooz", true)];
    populate(base, &layout);

    sanitize(base).expect("sanitize");

    let docs_dir = base.join("fooz");
    assert!(docs_dir.is_dir());
}

#[test]
fn keeps_src_and_implementors_without_index_html() {
    // `src/` and `implementors/` belong to rustdoc's output layout yet carry
    // no top-level `index.html`; they survive via the explicit allow-list.
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let layout = [("src", false), ("implementors", false)];
    populate(base, &layout);

    sanitize(base).expect("sanitize");

    for kept in ["src", "implementors"] {
        assert!(base.join(kept).is_dir());
    }
}

#[test]
fn removes_target_triple_directory_without_index_html() {
    // In multi-platform docsets every extra platform's full layout is nested
    // under a target-triple directory. That wrapper has no `index.html` of
    // its own (only its child crate directory does), so it gets deleted.
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let layout = [
        ("serde_json", true),
        ("x86_64-unknown-linux-gnu", false),
        ("x86_64-unknown-linux-gnu/serde_json", true),
    ];
    populate(base, &layout);

    sanitize(base).expect("sanitize");

    let default_docs = base.join("serde_json");
    let triple_dir = base.join("x86_64-unknown-linux-gnu");
    assert!(default_docs.is_dir(), "default-platform docs kept");
    assert!(!triple_dir.exists(), "target-triple directory removed");
}

/// Any directory that lacks a direct `index.html` and is not on the
/// allow-list is removed, regardless of its name.
#[test]
fn removes_arbitrary_directory_without_index_html() {
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let layout = [("nuisance", false)];
    populate(base, &layout);

    sanitize(base).expect("sanitize");

    let stray = base.join("nuisance");
    assert!(!stray.exists());
}

/// Regular files sitting directly in the extraction root are never touched;
/// `sanitize` only considers directories.
#[test]
fn leaves_top_level_files_alone() {
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let help_page = base.join("help.html");
    let settings_page = base.join("settings.html");
    fs::write(&help_page, b"help").expect("write file");
    fs::write(&settings_page, b"settings").expect("write file");

    sanitize(base).expect("sanitize");

    assert!(help_page.is_file());
    assert!(settings_page.is_file());
}

#[test]
fn realistic_docs_rs_layout() {
    // End-to-end scenario: a tree shaped like what `unzip` yields from a
    // real docs.rs archive of a hyphenated crate, combining default-platform
    // docs, `src/`, `implementors/`, loose files, and a multi-platform
    // wrapper directory.
    let scratch = tempdir().expect("tempdir");
    let base = scratch.path().as_std_path();
    let layout = [
        ("ra_ap_rustc_lexer", true),
        ("src", false),
        ("implementors", false),
        ("wasm32-unknown-unknown", false),
        ("wasm32-unknown-unknown/ra_ap_rustc_lexer", true),
    ];
    populate(base, &layout);

    let help_page = base.join("help.html");
    let settings_page = base.join("settings.html");
    fs::write(&help_page, b"help").expect("write file");
    fs::write(&settings_page, b"settings").expect("write file");

    sanitize(base).expect("sanitize");

    // Default-platform docs and the allow-listed directories remain.
    assert!(base.join("ra_ap_rustc_lexer/index.html").is_file());
    assert!(base.join("src").is_dir());
    assert!(base.join("implementors").is_dir());
    // The target-triple wrapper is gone.
    assert!(!base.join("wasm32-unknown-unknown").exists());
    // Loose top-level files are untouched.
    assert!(help_page.is_file());
    assert!(settings_page.is_file());
}
3 changes: 2 additions & 1 deletion crates/contrib/bookworm/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ enum Command {
#[arg(short, long)]
version: Option<String>,

/// Root directory to save the documentation to (defaults to temp dir).
/// Root directory to save the documentation to (defaults to the
/// user cache directory, e.g. `~/Library/Caches/bookworm/crates`).
#[arg(short, long)]
root: Option<PathBuf>,
},
Expand Down
4 changes: 3 additions & 1 deletion crates/contrib/bookworm/src/query/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use std::{path::PathBuf, sync::LazyLock};

use reqwest::header::{self, USER_AGENT};

use crate::dl;

pub(crate) static GLOBAL_CLIENT: LazyLock<Client> = LazyLock::new(Client::default);

pub(crate) struct Client {
Expand All @@ -23,7 +25,7 @@ impl Default for Client {
.expect("Client::default()");

Self {
crates_path: std::env::temp_dir().join("bookworm/crates"),
crates_path: dl::default_crates_root(),
http_client,
}
}
Expand Down
Loading