Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions SECURITY.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ repository. Do not open public issues for vulnerabilities.
- Acknowledgement target: 48 hours.
- Coordinated disclosure: we will agree on a timeline with you; default 90 days.
- In-scope: parser memory safety, sandbox escapes, path traversal in CLI/MCP surfaces,
hidden-content exclusion bypasses, determinism-contract violations exploitable for cache
poisoning, dependency advisories in the base tree.
implemented hidden-content exclusion bypasses, determinism-contract violations exploitable for
cache poisoning, dependency advisories in the base tree.
- Out-of-scope until the relevant surface ships: hosted services (none exist; Ethos is OSS-only).

## Hardening guarantees (base build)
Expand All @@ -24,8 +24,11 @@ repository. Do not open public issues for vulnerabilities.
(enforced three ways: dependency policy, clippy disallowed-API lints, runtime no-egress test).
- Resource limits: max file size, page count, parse time; stable failure codes — a PDF that
cannot be parsed safely fails with a stable error code, never a panic.
- Hidden / off-page / low-contrast text is detected, surfaced in `security_report.json`, and
excluded from default chunks.
- Current `security_report.json` findings are limited to warnings emitted by the canonical
document. Image-only pages are surfaced today. Hidden, off-page, and low-contrast text, as well as
PDF object inventories (annotations, actions, attachments, scripts, links), are not yet detected
by the base parser; treat parsed document text and empty inventory arrays as untrusted, not as a
clean bill of health.

## Supported versions

Expand Down
64 changes: 55 additions & 9 deletions adapters/grounding/opendataloader-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
#![forbid(unsafe_code)]
#![warn(missing_docs)]

use std::collections::HashSet;
use std::collections::{HashMap, HashSet};

use ethos_core::grounding::{
Capabilities, CoordinateOrigin, GroundingCell, GroundingElement, GroundingSource,
Expand Down Expand Up @@ -175,6 +175,13 @@ fn bbox_from(value: &Value) -> Result<[i64; 4], AdapterError> {
Ok(out)
}

/// Verifies that a bounding box stays inside the declared dimensions of `page`.
///
/// The first two components of `bbox` must be non-negative, the third must not
/// exceed `page.width`, and the fourth must not exceed `page.height`. A box that
/// sits exactly on the page boundary is accepted.
///
/// # Errors
///
/// Returns an adapter error whose message is `"{label} bbox exceeds page bounds"`
/// when any of the four checks fails; `label` names the kind of object being
/// validated (the callers pass e.g. `"element"`, `"table"`, `"cell"`).
fn bbox_within_page(bbox: [i64; 4], page: &PageGeometry, label: &str) -> Result<(), AdapterError> {
    let [x0, y0, x1, y1] = bbox;
    // Inclusive comparisons: touching the page edge is still "within" the page.
    let fits = x0 >= 0 && y0 >= 0 && x1 <= page.width && y1 <= page.height;
    if fits {
        Ok(())
    } else {
        Err(err(&format!("{label} bbox exceeds page bounds")))
    }
}

fn optional_positive_u32_field(
object: &Value,
field: &str,
Expand Down Expand Up @@ -241,7 +248,7 @@ fn parse_pages(root: &Value) -> Result<(Vec<PageGeometry>, HashSet<u32>), Adapte

fn parse_elements(
root: &Value,
page_numbers: &HashSet<u32>,
pages_by_number: &HashMap<u32, PageGeometry>,
) -> Result<Vec<GroundingElement>, AdapterError> {
let mut elements = Vec::new();
let mut element_ids = HashSet::new();
Expand All @@ -261,9 +268,9 @@ fn parse_elements(
if page_number == 0 {
return Err(err("element.page must be 1-based"));
}
if !page_numbers.contains(&page_number) {
let Some(page) = pages_by_number.get(&page_number) else {
return Err(err("element.page references unknown page"));
}
};
let id = el
.get("id")
.and_then(Value::as_str)
Expand All @@ -275,6 +282,7 @@ fn parse_elements(
return Err(err("duplicate element.id"));
}
let bbox = bbox_from(el.get("bbox").ok_or_else(|| err("missing element.bbox"))?)?;
bbox_within_page(bbox, page, "element")?;
let kind = el
.get("type")
.and_then(Value::as_str)
Expand All @@ -296,7 +304,7 @@ fn parse_elements(

fn parse_tables(
root: &Value,
page_numbers: &HashSet<u32>,
pages_by_number: &HashMap<u32, PageGeometry>,
) -> Result<(bool, Vec<GroundingTable>), AdapterError> {
let Some(tables_value) = root.get("tables") else {
return Ok((false, Vec::new()));
Expand All @@ -321,11 +329,15 @@ fn parse_tables(
if page_number == 0 {
return Err(err("table.page must be 1-based"));
}
if !page_numbers.contains(&page_number) {
let Some(page) = pages_by_number.get(&page_number) else {
return Err(err("table.page references unknown page"));
}
};
let bbox = bbox_from(table.get("bbox").ok_or_else(|| err("missing table.bbox"))?)?;
bbox_within_page(bbox, page, "table")?;
let cells = parse_table_cells(table)?;
for cell in &cells {
bbox_within_page(cell.bbox, page, "cell")?;
}
tables.push(GroundingTable {
id,
page: format!("page-{page_number}"),
Expand Down Expand Up @@ -856,8 +868,14 @@ impl OdlJsonSource {

let (parser_name, parser_version) = parse_tool(root)?;
let (pages, page_numbers) = parse_pages(root)?;
let elements = parse_elements(root, &page_numbers)?;
let (tables_capable, tables) = parse_tables(root, &page_numbers)?;
let pages_by_number = pages
.iter()
.cloned()
.map(|page| (page.index, page))
.collect::<HashMap<_, _>>();
debug_assert_eq!(pages_by_number.len(), page_numbers.len());
let elements = parse_elements(root, &pages_by_number)?;
let (tables_capable, tables) = parse_tables(root, &pages_by_number)?;

Ok(OdlJsonSource {
parser_name,
Expand Down Expand Up @@ -1466,6 +1484,34 @@ mod tests {
);
}

#[test]
fn rejects_documented_subset_bboxes_outside_declared_page_bounds() {
    // Every document declares a single 612x792 page; each case places exactly one
    // bbox outside of it and names the error message fragment that must be reported.
    let cases = [
        // Element bbox extends past the page width.
        (
            r#"{"tool":{"name":"x","version":"1"},"pages":[{"number":1,"width":612,"height":792}],"elements":[{"page":1,"bbox":[1,1,613,2]}]}"#,
            "element bbox exceeds page bounds",
        ),
        // Element bbox starts at a negative coordinate.
        (
            r#"{"tool":{"name":"x","version":"1"},"pages":[{"number":1,"width":612,"height":792}],"elements":[{"page":1,"bbox":[-1,1,2,2]}]}"#,
            "element bbox exceeds page bounds",
        ),
        // Table bbox extends past the page width.
        (
            r#"{"tool":{"name":"x","version":"1"},"pages":[{"number":1,"width":612,"height":792}],"elements":[],"tables":[{"id":"t1","page":1,"bbox":[1,1,613,2],"cells":[]}]}"#,
            "table bbox exceeds page bounds",
        ),
        // Table bbox is fine, but one of its cells extends past the page width.
        (
            r#"{"tool":{"name":"x","version":"1"},"pages":[{"number":1,"width":612,"height":792}],"elements":[],"tables":[{"id":"t1","page":1,"bbox":[1,1,2,2],"cells":[{"row":1,"col":1,"bbox":[1,1,613,2],"text":"x"}]}]}"#,
            "cell bbox exceeds page bounds",
        ),
    ];
    for (json, expected) in cases {
        assert_error_contains(json, expected);
    }
}

#[test]
fn accepts_documented_subset_bboxes_on_declared_page_bounds() {
    // The element, table, and cell bboxes all coincide exactly with the declared
    // 612x792 page, so parsing must succeed: the boundary itself is in bounds.
    let json = r#"{"tool":{"name":"x","version":"1"},"pages":[{"number":1,"width":612,"height":792}],"elements":[{"page":1,"bbox":[0,0,612,792]}],"tables":[{"id":"t1","page":1,"bbox":[0,0,612,792],"cells":[{"row":1,"col":1,"bbox":[0,0,612,792],"text":"x"}]}]}"#;
    let outcome = OdlJsonSource::from_json_str(json);
    outcome.expect("exact page-boundary bboxes are valid");
}

fn assert_error_contains(json: &str, expected: &str) {
let error = OdlJsonSource::from_json_str(json).unwrap_err();
assert!(
Expand Down
79 changes: 77 additions & 2 deletions crates/ethos-cli/src/cmd/rag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use std::collections::{BTreeMap, BTreeSet};

use ethos_core::codes::WarningCode;
use ethos_core::error::EthosError;
use ethos_core::model::{Chunk, Document};
use ethos_core::model::{Chunk, Document, Table};

use crate::{read_document, write_output, Failure, RagChunkArgs};

Expand Down Expand Up @@ -50,6 +50,7 @@ struct PageBounds {
struct RagChunkRefs<'a> {
page_bounds: BTreeMap<&'a str, PageBounds>,
element_pages: BTreeMap<&'a str, &'a str>,
element_texts: BTreeMap<&'a str, Result<String, String>>,
element_span_refs: BTreeMap<&'a str, &'a [String]>,
element_warning_refs: BTreeMap<&'a str, &'a [String]>,
excluded_element_warnings: BTreeMap<&'a str, (&'a str, WarningCode)>,
Expand Down Expand Up @@ -105,6 +106,37 @@ impl<'a> RagChunkRefs<'a> {
.iter()
.map(|element| (element.id.as_str(), element.page.as_str()))
.collect(),
element_texts: {
let table_texts = doc
.payload
.tables
.iter()
.map(|table| (table.id.as_str(), table_chunk_text(table)))
.collect::<BTreeMap<_, _>>();
doc.payload
.elements
.iter()
.map(|element| {
let table_text = match element.table_ref.as_deref() {
Some(table_ref) => match table_texts.get(table_ref) {
Some(text) => Ok(Some(text.clone())),
None => Err(format!(
"element {} table_ref {} does not resolve",
element.id, table_ref
)),
},
None => Ok(None),
};
let text = match (element.text.as_ref(), table_text) {
(Some(text), Ok(_)) => Ok(text.clone()),
(None, Ok(Some(text))) => Ok(text),
(None, Ok(None)) => Ok(String::new()),
(_, Err(message)) => Err(message),
};
(element.id.as_str(), text)
})
.collect()
},
element_span_refs: doc
.payload
.elements
Expand Down Expand Up @@ -135,7 +167,8 @@ fn validate_chunk_refs(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Fai
validate_chunk_element_refs(chunk, refs, &page_refs, &mut backed_pages)?;
validate_chunk_bboxes(chunk, refs, &page_refs, &mut backed_pages)?;
validate_backed_page_refs(chunk, &page_refs, &backed_pages)?;
validate_chunk_warning_refs(chunk, refs)
validate_chunk_warning_refs(chunk, refs)?;
validate_chunk_text(chunk, refs)
}

fn validate_chunk_required_refs(chunk: &Chunk) -> Result<(), Failure> {
Expand Down Expand Up @@ -350,6 +383,48 @@ fn validate_element_default_chunk_warnings(
Ok(())
}

/// Checks that a chunk's text equals the text rebuilt from its referenced elements.
///
/// The expected text is the concatenation, in `chunk.element_refs` order, of each
/// element's entry in `refs.element_texts`, separated by a blank line (`"\n\n"`).
/// An element ref with no entry in `refs.element_texts` contributes an empty string.
///
/// # Errors
///
/// * `Failure::Usage` carrying the stored message when a referenced element's text
///   entry is itself an error (the first such entry in reference order wins).
/// * `Failure::Usage` naming the chunk id when the rebuilt text differs from
///   `chunk.text`.
fn validate_chunk_text(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<(), Failure> {
    let mut pieces: Vec<&str> = Vec::new();
    for element_ref in chunk.element_refs.iter() {
        let piece = match refs.element_texts.get(element_ref.as_str()) {
            // No recorded text for this ref: it adds nothing but still takes a slot.
            None => "",
            Some(Ok(text)) => text.as_str(),
            Some(Err(message)) => return Err(Failure::Usage(message.clone())),
        };
        pieces.push(piece);
    }
    let reconstructed = pieces.join("\n\n");
    // Exact equality is intentional: chunk text is a deterministic derived artifact,
    // not a verification claim using whitespace-normalized matching.
    if chunk.text != reconstructed {
        let message = format!(
            "chunk {} text does not match referenced element text",
            chunk.id
        );
        return Err(Failure::Usage(message));
    }
    Ok(())
}

fn table_chunk_text(table: &Table) -> String {
let mut rows = Vec::new();
for row in 0..table.n_rows {
let mut cols = Vec::new();
for col in 0..table.n_cols {
let text = table
.cells
.iter()
.find(|cell| cell.row == row && cell.col == col)
.map(|cell| cell.text.as_str())
.unwrap_or("");
cols.push(text);
}
rows.push(cols.join(" | "));
}
rows.join("\n")
}

fn rag_chunk_record(chunk: &Chunk, refs: &RagChunkRefs<'_>) -> Result<serde_json::Value, Failure> {
let mut record = serde_json::Map::new();
record.insert(
Expand Down
Loading
Loading