From 3c596f0c4d158b27052393ee86f09d8f6d351e21 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 11:37:29 +0000 Subject: [PATCH 01/28] =?UTF-8?q?convert:=20Brainstorm-E=20split=20migrato?= =?UTF-8?q?rs=20(treatment=E2=86=92manip,=20ontology=5Ftable=5Frow?= =?UTF-8?q?=E2=86=92obs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #2 of the migration work — the migration CODE for the Brainstorm E split, implementing the specs in did-schema V_epsilon/conversions/from_did_v1/. Engine (v1_to_v2): adds a TargetVersion option (default 'V_delta', behaviour unchanged) and 1→N support. Under TargetVersion 'V_epsilon', a class with a split migrator under +did2/+convert/+migrators_e is routed there and may return several bodies; each is padded, optionally validated, and counted independently. Fully backward compatible — the default path still runs the existing per-class migrators 1→1, so the V_delta corpus pipeline is untouched. Migrators (+migrators_e): - treatment.m: 1→1 branch dispatch to injection / bath / procedural_ / temperature_ / environmental_manipulation, with the Dab string_value→ target_structure edge case and out-of-tier (quarantine-with-reason) routing for non-manipulation rows. Branch resolution is a keyword/CURIE-prefix seed; the per-term table is finalized in discovery mode. - ontology_table_row.m: 1→N — each row → a scalar/categorical observation property class (or generic escape hatch), value placed in the declaring block (shape mixin for scalars / categorical_concept for inherited categoricals / the concrete class for the overriders). Tests (testMigratorsE.m): synthetic, Validate=false (matching testMigrators) — thermal/drug/environmental/Dab/not-a-manipulation treatment cases, the ontology_table_row 1→N fan-out + per-row unique ids, and a backward-compat guard that the default target leaves treatment unchanged. Not local-verifiable (no MATLAB in the authoring env) — relies on CI. 
Corpus discovery against V_epsilon (#3) is a follow-up that needs the E draft classes reachable by the schema cache (DID_SCHEMA_PATH → V_epsilon); see PR body. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+migrators_e/ontology_table_row.m | 243 ++++++++++++++++++ .../+did2/+convert/+migrators_e/treatment.m | 224 ++++++++++++++++ src/did/+did2/+convert/v1_to_v2.m | 87 ++++++- tests/+did2/+unittest/testMigratorsE.m | 142 ++++++++++ 4 files changed, 687 insertions(+), 9 deletions(-) create mode 100644 src/did/+did2/+convert/+migrators_e/ontology_table_row.m create mode 100644 src/did/+did2/+convert/+migrators_e/treatment.m create mode 100644 tests/+did2/+unittest/testMigratorsE.m diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m new file mode 100644 index 0000000..8b3240b --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -0,0 +1,243 @@ +function bodies = ontology_table_row(preBody) +%ONTOLOGY_TABLE_ROW Brainstorm-E split migrator: did_v1 ontology_table_row +% -> observation tiers (1 -> N). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Each row of the legacy open key/value table becomes its +% own observation document, dispatched by what the property is and what +% shape its value takes, per +% did-schema/schemas/V_epsilon/conversions/from_did_v1/ontology_table_row.md: +% +% numeric row -> a scalar property class (body_weight_observation, +% core_temperature_observation, ...) value as the +% matching typed composite; unrecognised -> the +% generic_scalar_observation escape hatch. +% term row -> a categorical property class (developmental_stage_ +% observation, ...) value as a bound ontology_term; +% unrecognised -> generic_categorical_observation. +% +% Returns a CELL of body structs (one per row); the dispatcher lands +% each as its own migrated document. 
Branch resolution is a keyword +% HEURISTIC seed; the per-term table is finalised in discovery mode. +% +% Subject-intrinsic (species/strain/sex) and relational (cohort/housing) +% rows are out of scope for the observation tier; in this seed they fall +% to the generic categorical escape hatch and are flagged for review +% rather than silently dropped. Refining that routing is a follow-up. + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'ontology_table_row') || ~isstruct(preBody.ontology_table_row) + error('did2:convert:missingBlock', ... + 'ontology_table_row body is missing the ontology_table_row property block.'); +end +rows = extractRows(preBody.ontology_table_row); +if isempty(rows) + error('did2:convert:emptyTable', ... + 'ontology_table_row has no rows to migrate.'); +end + +bodies = cell(1, numel(rows)); +for k = 1:numel(rows) + bodies{k} = migrateRow(preBody, rows{k}, k); +end +end + +% ===================== per-row migration =============================== + +function body = migrateRow(preBody, row, rowIndex) +node = getCharField(row, 'ontology_name'); +label = getCharField(row, 'name'); +identity = struct('node', node, 'name', label); +hay = lower([node ' ' label]); + +[isNumeric, numVal] = rowNumericValue(row); + +if isNumeric + [className, shapeClass, valueStruct] = dispatchScalar(hay, row, numVal); + body = makeScalarObservation(preBody, rowIndex, className, shapeClass, ... + identity, valueStruct); +else + [className, valueTerm] = dispatchCategorical(hay, row); + body = makeCategoricalObservation(preBody, rowIndex, className, ... 
+ identity, valueTerm); +end +end + +function [className, shapeClass, valueStruct] = dispatchScalar(hay, row, numVal) +unit = getCharField(row, 'unit'); +if containsAny(hay, {'weight', 'mass'}) + className = 'body_weight_observation'; shapeClass = 'scalar_mass'; + valueStruct = canonicalComposite('kilograms', unit, numVal); +elseif containsAny(hay, {'length', 'tibia', 'tail'}) + className = 'body_length_observation'; shapeClass = 'scalar_length'; + valueStruct = canonicalComposite('meters', unit, numVal); +elseif containsAny(hay, {'age', 'duration', 'latency'}) + className = 'age_observation'; shapeClass = 'scalar_duration'; + valueStruct = canonicalComposite('seconds', unit, numVal); +elseif containsAny(hay, {'temperature'}) + className = 'core_temperature_observation'; shapeClass = 'scalar_temperature'; + valueStruct = canonicalComposite('celsius', unit, numVal); +elseif containsAny(hay, {'heart rate', 'respiration', 'rate', 'frequency'}) + className = 'heart_rate_observation'; shapeClass = 'scalar_frequency'; + valueStruct = canonicalComposite('hertz', unit, numVal); +elseif containsAny(hay, {'pressure'}) + className = 'blood_pressure_observation'; shapeClass = 'scalar_pressure'; + valueStruct = canonicalComposite('mmhg', unit, numVal); +elseif containsAny(hay, {'litter', 'count', 'number of'}) + className = 'litter_size_observation'; shapeClass = 'scalar_count'; + valueStruct = struct('value', round(numVal), ... + 'unit', struct('node', '', 'name', ''), 'approximate', false); +elseif containsAny(hay, {'score', 'condition'}) + className = 'body_condition_observation'; shapeClass = 'scalar_score'; + valueStruct = struct('value', numVal, 'scale', struct('node', '', 'name', ''), ... 
+ 'scale_min', 0.0, 'scale_max', 0.0, 'approximate', false); +elseif containsAny(hay, {'concentration', 'glucose', 'cortisol', 'titer'}) + className = 'concentration_observation'; shapeClass = 'scalar_concentration'; + valueStruct = struct('source_unit', unit, 'source_value', numVal, 'approximate', false); +elseif containsAny(hay, {'volume'}) + className = 'organ_volume_observation'; shapeClass = 'scalar_volume'; + valueStruct = canonicalComposite('liters', unit, numVal); +else + className = 'generic_scalar_observation'; shapeClass = 'generic_scalar'; + valueStruct = struct('source_unit', unit, 'source_value', numVal, 'approximate', false); +end +end + +function [className, valueTerm] = dispatchCategorical(hay, row) +termValue = getCharField(row, 'value'); +if isempty(termValue) + termValue = getCharField(row, 'string_value'); +end +valueTerm = struct('node', termValue, 'name', ''); +if containsAny(hay, {'stage', 'life cycle', 'developmental'}) + className = 'developmental_stage_observation'; +elseif containsAny(hay, {'health', 'status'}) + className = 'health_status_observation'; +elseif containsAny(hay, {'coat', 'pigment'}) + className = 'pigmentation_observation'; +elseif containsAny(hay, {'estrous', 'estrus'}) + className = 'estrous_stage_observation'; +elseif containsAny(hay, {'behavior', 'phenotype'}) + className = 'behavioral_phenotype_observation'; +else + className = 'generic_categorical_observation'; +end +end + +% ===================== destination builders ============================ + +function body = makeScalarObservation(preBody, rowIndex, className, shapeClass, identity, valueStruct) +body = startObservation(preBody, rowIndex, className, {'scalar_observation', shapeClass}); +body.observation = struct('measured_property', identity, 'target_structure', {{}}); +body.(shapeClass) = struct('value', valueStruct); +end + +function body = makeCategoricalObservation(preBody, rowIndex, className, identity, valueTerm) +body = startObservation(preBody, 
rowIndex, className, ... + {'categorical_observation', 'categorical_concept'}); +body.observation = struct('measured_property', identity, 'target_structure', {{}}); +% `value` lives in the block of the class that DECLARES it: the two +% overriders (developmental_stage / generic_categorical) declare their own +% value; every other categorical property class inherits it from the +% categorical_concept shape mixin, so its value lives in that block. +if any(strcmp(className, {'developmental_stage_observation', ... + 'generic_categorical_observation'})) + valueBlock = className; +else + valueBlock = 'categorical_concept'; +end +body.(valueBlock) = struct('value', valueTerm); +end + +% ===================== shared helpers ================================== + +function body = startObservation(preBody, rowIndex, className, extraSupers) +chain = [{'observation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct('class_name', className, 'class_version', '1.0.0', ... 
+ 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubjectAndTime(preBody); +if isfield(preBody, 'base') && isstruct(preBody.base) + base = preBody.base; + if isfield(base, 'id') && ~isempty(base.id) + base.id = sprintf('%s_row%02d', char(base.id), rowIndex); + end + body.base = base; +end +end + +function deps = carrySubjectAndTime(preBody) +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + if isfield(d, 'value'); subjectVal = d.value; + elseif isfield(d, 'document_id'); subjectVal = d.document_id; end + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +deps(end+1) = struct('name', 'time_reference_1', 'value', ''); +end + +function comp = canonicalComposite(canonField, unit, numVal) +comp = struct(canonField, double(numVal), 'source_unit', unit, ... + 'source_value', double(numVal), 'approximate', false); +end + +function rows = extractRows(block) +%EXTRACTROWS Normalise the legacy table to a cell of row structs. 
+rows = {}; +if isfield(block, 'rows') + r = block.rows; + if iscell(r) + rows = r(:)'; + elseif isstruct(r) + rows = arrayfun(@(x) x, r(:)', 'UniformOutput', false); + end +elseif isfield(block, 'ontology_name') || isfield(block, 'name') + % single-row legacy shape (the table block IS one row) + rows = {block}; +end +end + +function [isNumeric, numVal] = rowNumericValue(row) +isNumeric = false; numVal = []; +if isfield(row, 'value') + v = row.value; + if isnumeric(v) && isscalar(v) && isfinite(v) + isNumeric = true; numVal = double(v); + end +elseif isfield(row, 'numeric_value') + v = row.numeric_value; + if isnumeric(v) && ~isempty(v) + isNumeric = true; numVal = double(v(1)); + end +end +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v); s = v; + elseif isstring(v) && isscalar(v); s = char(v); + elseif isnumeric(v) && isscalar(v); s = num2str(v); end +end +end + +function tf = containsAny(hay, needles) +tf = false; +for k = 1:numel(needles) + if contains(hay, needles{k}); tf = true; return; end +end +end diff --git a/src/did/+did2/+convert/+migrators_e/treatment.m b/src/did/+did2/+convert/+migrators_e/treatment.m new file mode 100644 index 0000000..c081a67 --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/treatment.m @@ -0,0 +1,224 @@ +function v2Body = treatment(preBody) +%TREATMENT Brainstorm-E split migrator: did_v1 treatment -> manipulation tiers. +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. 
Reads the treatment block's ontology identity + numeric +% / string values and dispatches the row to the manipulation family +% whose ACTION it names, per +% did-schema/schemas/V_epsilon/conversions/from_did_v1/treatment.md: +% +% injection (substance delivered by injection) +% bath (substance applied as a bath) +% procedural_manipulation (physical operation on the body) +% temperature_manipulation (imposed heat / cold) +% environmental_manipulation (changed condition / regime, no value) +% +% This is a 1 -> 1 split (one treatment -> one manipulation). Rows that +% are not manipulations (date of birth, experiment time) or whose +% branch cannot be resolved raise an error so the dispatcher routes the +% source body to quarantine with a descriptive reason -- the +% "curator review queue" of the conversion spec; nothing is forced into +% a residual family. +% +% Branch resolution here is a keyword/CURIE-prefix HEURISTIC seed; the +% authoritative per-term branch list is finalised in discovery mode +% against real corpora (treatment.md, Open questions). + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'treatment') || ~isstruct(preBody.treatment) + error('did2:convert:missingBlock', ... 
+ 'treatment body is missing the treatment property block.'); +end +block = preBody.treatment; + +node = getCharField(block, 'ontology_name'); +label = getCharField(block, 'name'); +strValue = getCharField(block, 'string_value'); +numValue = []; +if isfield(block, 'numeric_value') + numValue = block.numeric_value; +end + +identity = struct('node', node, 'name', label); +hay = lower([node ' ' label]); % search text for the heuristic branch + +% --- Dab edge case: string_value is an ontology target, not prose ------ +targetStructure = {}; +notesText = strValue; +if endsWith(lower(strtrim(label)), 'target location') || looksLikeCURIE(strValue) + targetStructure = {struct('node', strValue, 'name', '')}; + notesText = ''; + identity.name = strtrim(regexprep(label, '(?i)\s*target location$', '')); +end + +% --- not-a-manipulation rows: route OUT of tier (quarantine w/ reason) -- +if containsAny(hay, {'date of birth', 'non-survival experiment time', ... + 'experiment time'}) + error('did2:convert:notAManipulation', ... + ['treatment "%s" is not a manipulation; route out of tier ', ... + '(observation/session metadata) per treatment.md.'], label); +end + +% --- branch dispatch (first match wins) -------------------------------- +if containsAny(hay, {'cool', 'cold', 'heat', 'warm', 'thermal', 'temperature'}) + v2Body = makeTemperatureManipulation(preBody, identity, targetStructure, ... + notesText, numValue); +elseif containsAny(hay, {'inject', 'virus', 'aav', 'tracer', 'drug', 'vehicle'}) ... + || startsWith(lower(node), 'chebi:') + v2Body = makeInjection(preBody, identity, targetStructure, notesText); +elseif containsAny(hay, {'bath'}) + v2Body = makeBath(preBody, identity, notesText); +elseif containsAny(hay, {'craniotomy', 'implant', 'lesion', 'perfus', ... + 'eye opening', 'eyelid', 'ear notch', 'ear punch', 'tail clip', ... + 'toe clip', 'whisker', 'suture', 'surgery', 'transection', ... 
+ 'resection', 'enucleation', 'dissection', 'procedure', 'optogenetic'}) + v2Body = makeProceduralManipulation(preBody, identity, targetStructure, notesText); +elseif containsAny(hay, {'rear', 'deprivation', 'isolation', 'enrichment', ... + 'housing', 'light', 'dark', 'restriction', 'diet', 'training', ... + 'habituation', 'restraint'}) + v2Body = makeEnvironmentalManipulation(preBody, identity, targetStructure, notesText); +else + error('did2:convert:unresolvedTreatment', ... + ['treatment "%s" (%s) could not be routed to a manipulation ', ... + 'family; curator review required.'], label, node); +end +end + +% ===================== destination builders ============================ + +function body = makeTemperatureManipulation(preBody, identity, targetStructure, notesText, numValue) +body = startBody(preBody, 'temperature_manipulation', ... + {'scalar_manipulation', 'scalar_temperature'}); +body.scalar_manipulation = struct( ... + 'applied_property', identity, ... + 'target_structure', {targetStructure}, ... + 'notes', notesText); +body.scalar_temperature = struct('value', temperatureComposite(numValue)); +end + +function body = makeInjection(preBody, identity, targetStructure, notesText) +body = startBody(preBody, 'injection', {'pharmacological_manipulation'}); +body.pharmacological_manipulation = struct('mixture', ... + struct('agent', identity, 'concentration', emptyConcentration())); +body.injection = struct( ... + 'kind', 'drug', ... + 'route', struct('node', '', 'name', ''), ... + 'target_structure', {targetStructure}, ... + 'notes', notesText); +end + +function body = makeBath(preBody, identity, notesText) +body = startBody(preBody, 'bath', {'pharmacological_manipulation'}); +body.pharmacological_manipulation = struct('mixture', ... + struct('agent', identity, 'concentration', emptyConcentration())); +body.bath = struct('kind', 'drug', ... 
+ 'location', struct('node', '', 'name', ''), 'notes', notesText); +end + +function body = makeProceduralManipulation(preBody, identity, targetStructure, notesText) +body = startBody(preBody, 'procedural_manipulation', {}); +body.procedural_manipulation = struct( ... + 'procedure', identity, ... + 'target_structure', {targetStructure}, ... + 'notes', notesText); +end + +function body = makeEnvironmentalManipulation(preBody, identity, targetStructure, notesText) +body = startBody(preBody, 'environmental_manipulation', {}); +body.environmental_manipulation = struct( ... + 'factor', identity, ... + 'target_structure', {targetStructure}, ... + 'notes', notesText); +end + +% ===================== shared helpers ================================== + +function body = startBody(preBody, className, extraSupers) +%STARTBODY Seed a V_epsilon manipulation body: document_class header, +% carried base + subject_id, and a synthesized time_reference slot. +chain = [{'manipulation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct( ... + 'class_name', className, 'class_version', '1.0.0', ... + 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubjectAndTime(preBody); +if isfield(preBody, 'base') + body.base = preBody.base; +end +end + +function deps = carrySubjectAndTime(preBody) +%CARRYSUBJECTANDTIME Keep subject_id; add a time_reference_1 slot. +% The time_reference value is left empty for curator/tooling backfill +% (real synthesis from session/epoch metadata is a follow-up; see +% treatment.md "time_reference synthesis fidelity"). 
+deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + subjectVal = depValue(d); + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +deps(end+1) = struct('name', 'time_reference_1', 'value', ''); +end + +function v = depValue(d) +v = ''; +if isfield(d, 'value') + v = d.value; +elseif isfield(d, 'document_id') + v = d.document_id; +end +end + +function comp = temperatureComposite(numValue) +comp = struct('celsius', 0.0, 'source_unit', '', 'source_value', 0.0, ... + 'approximate', false); +if ~isempty(numValue) && isnumeric(numValue) + v = double(numValue(1)); + comp.celsius = v; + comp.source_unit = 'celsius'; + comp.source_value = v; +end +end + +function c = emptyConcentration() +c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end + +function tf = looksLikeCURIE(s) +tf = ~isempty(s) && ~isempty(regexp(char(s), '^[A-Za-z][A-Za-z0-9_]*:[^\s:]+$', 'once')); +end + +function tf = containsAny(hay, needles) +tf = false; +for k = 1:numel(needles) + if contains(hay, needles{k}) + tf = true; + return; + end +end +end diff --git a/src/did/+did2/+convert/v1_to_v2.m b/src/did/+did2/+convert/v1_to_v2.m index ffda356..2358366 100644 --- a/src/did/+did2/+convert/v1_to_v2.m +++ b/src/did/+did2/+convert/v1_to_v2.m @@ -66,6 +66,14 @@ % identifiers in the legacy (camelCase) form so % the body stays schema-compatible while still % gaining the V_delta shape transformations. +% TargetVersion ((1,:) char, default 'V_delta') - migration target. +% 'V_delta' (default) preserves the historical +% class-preserving 1->1 behaviour.
'V_epsilon' routes +% classes that have a Brainstorm-E split migrator +% under +did2.+convert.+migrators_e (treatment, +% ontology_table_row) through that migrator instead, +% which may fan one source body out to several +% destination documents (1 -> N). % % See also: did2.convert.universalRenames, did2.convert.migrators, % docs/v2/PLAN.md §9.6. @@ -78,6 +86,7 @@ options.CheckReferences (1,1) logical = false options.ReferenceDatabase = [] options.RenameClassNames (1,1) logical = true + options.TargetVersion (1,:) char = 'V_delta' end bodies = normaliseInput(v1Bodies); @@ -111,22 +120,41 @@ && isfield(v2Body.document_class, 'class_name') className = char(v2Body.document_class.class_name); end + v2Bodies = {v2Body}; else postUniversalBody = did2.convert.universalRenames(preBody, ... 'RenameClassNames', options.RenameClassNames); className = char(postUniversalBody.document_class.class_name); v2Body = applySuperclassMigrators(postUniversalBody, className); - migratorFcn = lookupMigrator(className); - v2Body = migratorFcn(v2Body); + % runConcreteMigrator returns a CELL of one-or-more bodies. + % Default (TargetVersion 'V_delta') always returns a single + % body via the existing per-class migrator, so behaviour is + % unchanged. Under TargetVersion 'V_epsilon' a class with a + % Brainstorm-E split migrator (treatment, ontology_table_row) + % may fan out to several bodies (1 -> N). + v2Bodies = runConcreteMigrator(v2Body, className, ... + options.TargetVersion); end - v2Body = ensureClassBlocks(v2Body, options.SchemaCache); - doc = did2.document(v2Body); - if options.Validate - doc.validate('SchemaCache', options.SchemaCache); + % Collect every produced body. Each is padded, optionally + % validated, and counted independently so a 1 -> N split lands + % N documents in `migrated` (or quarantines the whole source + % body on the first failure, as before). 
+ for bi = 1:numel(v2Bodies) + outBody = ensureClassBlocks(v2Bodies{bi}, options.SchemaCache); + doc = did2.document(outBody); + if options.Validate + doc.validate('SchemaCache', options.SchemaCache); + end + migrated{end+1} = doc; %#ok + outName = className; + if isfield(outBody, 'document_class') ... + && isstruct(outBody.document_class) ... + && isfield(outBody.document_class, 'class_name') + outName = char(outBody.document_class.class_name); + end + [classCountNames, classCountValues] = bumpClassCounter( ... + classCountNames, classCountValues, outName); end - migrated{end+1} = doc; %#ok - [classCountNames, classCountValues] = bumpClassCounter( ... - classCountNames, classCountValues, className); catch err entry = struct( ... 'original_body', originalJSON, ... @@ -250,6 +278,47 @@ end end +function bodies = runConcreteMigrator(v2Body, className, targetVersion) +%RUNCONCRETEMIGRATOR Run the concrete-class migrator, return a cell of bodies. +% Default ('V_delta') preserves the historical 1 -> 1 behaviour: the +% per-class migrator under +did2.+convert.+migrators is applied and a +% single-element cell is returned. Under 'V_epsilon', a class that has +% a Brainstorm-E split migrator under +did2.+convert.+migrators_e is +% routed there instead; that migrator may return either a single body +% (struct) or several (struct array / cell), enabling the treatment -> +% manipulation and ontology_table_row -> observations (1 -> N) splits. +if strcmp(targetVersion, 'V_epsilon') + fqn = ['did2.convert.migrators_e.', className]; + if ~isempty(which(fqn)) + out = feval(str2func(fqn), v2Body); + bodies = normaliseMigratorOutput(out); + return; + end +end +migratorFcn = lookupMigrator(className); +bodies = {migratorFcn(v2Body)}; +end + +function bodies = normaliseMigratorOutput(out) +%NORMALISEMIGRATOROUTPUT Coerce a migrator's output to a cell of bodies. 
+if iscell(out) + bodies = out(:)'; +elseif isstruct(out) + if isscalar(out) + bodies = {out}; + else + bodies = cell(1, numel(out)); + for k = 1:numel(out) + bodies{k} = out(k); + end + end +else + error('did2:convert:badMigratorOutput', ... + 'A split migrator must return a struct or cell of bodies (got %s).', ... + class(out)); +end +end + function body = ensureClassBlocks(body, schemaCacheOverride) % Make sure every class in the V_delta schema chain for the body's % concrete class has a property block in the document, manufacturing diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m new file mode 100644 index 0000000..dfe30d3 --- /dev/null +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -0,0 +1,142 @@ +function tests = testMigratorsE +%TESTMIGRATORSE Brainstorm-E split migrator tests (TargetVersion 'V_epsilon'). +% +% Exercises the did_v1 -> V_epsilon split migrators routed by +% did2.convert.v1_to_v2 when TargetVersion == 'V_epsilon': +% - treatment -> manipulation tiers (1 -> 1 branch dispatch) +% - ontology_table_row -> observation tiers (1 -> N) +% against the worked examples in did-schema/schemas/V_epsilon/ +% conversions/from_did_v1/{treatment,ontology_table_row}.md. +% +% Like testMigrators, these run with Validate=false so they assert the +% TRANSFORM (routing + field placement) without depending on a V_epsilon +% schema cache at the test-runner working directory. Corpus-level +% validation is the discovery-mode CI job (#3). +% +% Run with: +% results = runtests('did2.unittest.testMigratorsE'); + +tests = functiontests(localfunctions); +end + +function v1 = wrap(className, blockKey, block) +v1 = struct(); +v1.document_class = struct('class_name', className, 'class_version', '1.0.0', ... 
+ 'superclasses', struct('class_name', 'base', 'class_version', '1.0.0')); +v1.depends_on = struct('name', {'subject_id'}, 'document_id', {'aabb1122ccdd3344_aabb1122ccdd3344'}); +v1.base = struct('id', 'aabb1122ccdd3344_1122334455667788', ... + 'session_id', 'aabb1122ccdd3344_9900aabbccddeeff', ... + 'name', 'migrator-e-example', 'datestamp', '2024-06-01T12:00:00.000Z'); +v1.(blockKey) = block; +end + +function out = runE(v1) +out = did2.convert.v1_to_v2(v1, 'Validate', false, 'TargetVersion', 'V_epsilon'); +end + +% ===================== treatment -> manipulation ======================= + +function testThermalTreatmentBecomesTemperatureManipulation(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'ndic:0000nnnn', 'name', 'focal cortical cooling', ... + 'numeric_value', 12.0, 'string_value', 'Peltier')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 1); +doc = out.migrated{1}; +verifyTrue(testCase, isfield(out.summary.by_class, 'temperature_manipulation')); +val = doc.get('scalar_temperature.value'); +verifyEqual(testCase, val.celsius, 12.0); +ap = doc.get('scalar_manipulation.applied_property'); +verifyEqual(testCase, ap.name, 'focal cortical cooling'); +end + +function testDrugTreatmentBecomesInjection(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'chebi:6015', 'name', 'isoflurane', ... + 'numeric_value', [], 'string_value', '')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 1); +verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); +end + +function testEnvironmentalTreatmentBecomesEnvironmentalManipulation(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'ncit:0000nnnn', 'name', 'dark rearing', ... 
+ 'numeric_value', [], 'string_value', 'reared in darkness')); +out = runE(v1); +verifyTrue(testCase, isfield(out.summary.by_class, 'environmental_manipulation')); +doc = out.migrated{1}; +factor = doc.get('environmental_manipulation.factor'); +verifyEqual(testCase, factor.name, 'dark rearing'); +end + +function testDabTargetLocationRoutesStringValueToTargetStructure(testCase) +% Dab edge case: string_value is a UBERON CURIE, name ends "Target Location". +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'empty:0000074', ... + 'name', 'Optogenetic Tetanus Stimulation Target Location', ... + 'numeric_value', [], 'string_value', 'uberon:0001930')); +out = runE(v1); +verifyTrue(testCase, isfield(out.summary.by_class, 'procedural_manipulation')); +doc = out.migrated{1}; +ts = doc.get('procedural_manipulation.target_structure'); +verifyFalse(testCase, isempty(ts)); +end + +function testNotAManipulationIsQuarantined(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', '', 'name', 'Date of birth', ... + 'numeric_value', [], 'string_value', '2024-01-01')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 0); +verifyEqual(testCase, numel(out.quarantine), 1); +verifyTrue(testCase, contains(out.quarantine(1).reason, 'not a manipulation')); +end + +% ===================== ontology_table_row -> observations (1->N) ======= + +function testTableRowFansOutToNObservations(testCase) +rows = {struct('ontology_name', 'schema:weight', 'name', 'weight', ... + 'value', 22.5, 'unit', 'g'), ... + struct('ontology_name', 'uberon:0000105', 'name', 'life cycle stage', ... 
+ 'value', 'fbdv:00005336')}; +v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'developmental_stage_observation')); +end + +function testTableRowScalarValueLandsTyped(testCase) +rows = {struct('ontology_name', 'schema:weight', 'name', 'weight', ... + 'value', 22.5, 'unit', 'g')}; +v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); +out = runE(v1); +doc = out.migrated{1}; +val = doc.get('scalar_mass.value'); +verifyEqual(testCase, val.source_value, 22.5); +end + +function testTableRowGeneratesUniqueIdsPerRow(testCase) +rows = {struct('ontology_name', 'schema:weight', 'name', 'weight', 'value', 22.5, 'unit', 'g'), ... + struct('ontology_name', 'schema:weight', 'name', 'weight', 'value', 23.0, 'unit', 'g')}; +v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); +out = runE(v1); +id1 = out.migrated{1}.get('base.id'); +id2 = out.migrated{2}.get('base.id'); +verifyNotEqual(testCase, id1, id2); +end + +% ===================== backward compatibility ========================== + +function testDefaultTargetLeavesTreatmentUnchanged(testCase) +% With the default TargetVersion ('V_delta') the E split is NOT applied: +% treatment passes through the existing per-class migrator as a single +% treatment document. Guards the gated, backward-compatible design. +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'chebi:6015', 'name', 'isoflurane', ... 
+ 'numeric_value', 2.0, 'string_value', '2 percent')); +out = did2.convert.v1_to_v2(v1, 'Validate', false); +verifyEqual(testCase, numel(out.migrated), 1); +verifyTrue(testCase, isfield(out.summary.by_class, 'treatment')); +end From 32606a967ef64cc2600f74c1652c46d0c66525a0 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 11:39:59 +0000 Subject: [PATCH 02/28] Fix Code Analyzer alert: expand inline if/elseif in getCharField The compressed multi-line if/elseif chain with a trailing end (line 232) tripped the MATLAB Code Analyzer keyword/end alignment check. Expand to standard multi-line form; no behaviour change. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+did2/+convert/+migrators_e/ontology_table_row.m | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index 8b3240b..c90c919 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -229,9 +229,13 @@ s = ''; if isfield(block, name) v = block.(name); - if ischar(v); s = v; - elseif isstring(v) && isscalar(v); s = char(v); - elseif isnumeric(v) && isscalar(v); s = num2str(v); end + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + elseif isnumeric(v) && isscalar(v) + s = num2str(v); + end end end From a6b85088311b39bd0193f6ad876c97b379d6bd52 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 13:19:22 +0000 Subject: [PATCH 03/28] ci: V_epsilon corpus discovery through the Brainstorm-E split (#3) Wires the corpus migration test (#3) to run against the V_epsilon E classes, isolated from the green V_delta corpus run: - runCorpusDiscovery: add a TargetVersion option (default 'V_delta'), forwarded to v1_to_v2. 
- testCorpusEpsilon.m: opt-in (DID_RUN_EPSILON_CORPUS) discovery run of the Dab corpus through TargetVersion='V_epsilon' (Dab's treatment rows, incl. the optogenetic-tetanus "Target Location" idiom, exercise the split). Skips cleanly in the default V_delta run via assumeFail. - test-code.yml: after the default test run, assemble a combined V_epsilon stable+draft schema dir (the cache loads one flat dir; tier class names are disjoint), point DID_SCHEMA_PATH at it, and run testCorpusEpsilon with DID_RUN_EPSILON_CORPUS=1. Own step env, so the V_delta job is untouched. Discovery mode: quarantine (e.g. the expected missing-time_reference until synthesis lands) is reported, not failed; only errors fail the step. The report artifact drives the next iteration of the dispatch/synthesis work. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .github/workflows/test-code.yml | 32 ++++++++++++ .../+unittest/+helpers/runCorpusDiscovery.m | 7 ++- tests/+did2/+unittest/testCorpusEpsilon.m | 49 +++++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 tests/+did2/+unittest/testCorpusEpsilon.m diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index 5e740f6..a31f978 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -92,6 +92,38 @@ jobs: nFailed = sum([results.Failed]); assert(nFailed == 0, sprintf("%d test(s) failed", nFailed)); + - name: Assemble V_epsilon schema set (stable + draft) + # The schema cache loads a single flat dir of *.json. The + # Brainstorm-E observation/manipulation classes live in + # V_epsilon/draft while base/element/etc. live in V_epsilon/stable, + # so combine both tiers (class names are disjoint across tiers) + # into one directory for the V_epsilon corpus discovery step. 
+ if: always() + run: | + mkdir -p "${GITHUB_WORKSPACE}/epsilon-schemas" + cp did-schema/schemas/V_epsilon/stable/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" + cp did-schema/schemas/V_epsilon/draft/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" + + - name: Run V_epsilon corpus discovery (Brainstorm E split) + # Isolated from the default V_delta test run above: its own + # DID_SCHEMA_PATH (the combined V_epsilon dir) and the + # DID_RUN_EPSILON_CORPUS gate that un-skips testCorpusEpsilon. + # Discovery mode — the report is the deliverable; test failures + # (errors, not quarantine) still surface. + if: always() + uses: matlab-actions/run-command@v2 + env: + DID_SCHEMA_PATH: ${{ github.workspace }}/epsilon-schemas + DID_RUN_EPSILON_CORPUS: '1' + with: + command: | + addpath(genpath("src")); + addpath(genpath("tests")); + results = runtests("did2.unittest.testCorpusEpsilon"); + disp(table(results)); + nFailed = sum([results.Failed]); + assert(nFailed == 0, sprintf("%d V_epsilon corpus test(s) failed", nFailed)); + - name: Upload corpus discovery reports # Discovery-mode corpus tests (e.g., testCorpus20211116) write # JSON summaries into corpus-reports/. Upload them as a CI diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index 2011d22..872b6f5 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -25,6 +25,7 @@ corpusName (1,:) char corpusURL (1,:) char innerDir (1,:) char + options.TargetVersion (1,:) char = 'V_delta' end did2.unittest.helpers.installSchemaPath(testCase, sprintf('skipping %s corpus test', corpusName)); @@ -42,12 +43,14 @@ bodies{k} = fileread(fullfile(files(k).folder, files(k).name)); end -result = did2.convert.v1_to_v2(bodies, 'Validate', true); +result = did2.convert.v1_to_v2(bodies, 'Validate', true, ... 
+ 'TargetVersion', options.TargetVersion); reasons = did2.unittest.helpers.topQuarantineReasons(result.quarantine); reportPath = did2.unittest.helpers.writeCorpusReport(corpusName, result, reasons); -fprintf('\n=== Corpus %s discovery summary ===\n', corpusName); +fprintf('\n=== Corpus %s discovery summary (target %s) ===\n', ... + corpusName, options.TargetVersion); fprintf('total: %d\n', result.summary.total); fprintf('migrated_count: %d\n', result.summary.migrated_count); fprintf('quarantine_count: %d\n', result.summary.quarantine_count); diff --git a/tests/+did2/+unittest/testCorpusEpsilon.m b/tests/+did2/+unittest/testCorpusEpsilon.m new file mode 100644 index 0000000..a7e8def --- /dev/null +++ b/tests/+did2/+unittest/testCorpusEpsilon.m @@ -0,0 +1,49 @@ +function tests = testCorpusEpsilon +%TESTCORPUSEPSILON Discovery-mode end-to-end run through the Brainstorm-E split. +% +% Runs a real v1 corpus through did2.convert.v1_to_v2 with +% TargetVersion='V_epsilon', so the treatment -> manipulation and +% ontology_table_row -> observation split migrators +% (+did2.+convert.+migrators_e) are exercised and the migrated bodies +% are validated against the V_epsilon schema set. Writes +% corpus-reports/-summary.json. +% +% Opt-in via DID_RUN_EPSILON_CORPUS (the CI step that assembles a +% combined V_epsilon stable+draft schema dir and points +% DID_SCHEMA_PATH at it sets this). When the variable is unset the +% test skips cleanly via assumeFail, so it is a no-op in the default +% V_delta test run. +% +% The Dab corpus is used because its treatment rows (incl. the +% optogenetic-tetanus "Target Location" idiom) directly exercise the +% treatment split. Discovery mode: nothing is asserted about the +% migrated/quarantine split; the report is the deliverable. 
+% +% Run with: +% export DID_RUN_EPSILON_CORPUS=1 +% results = runtests('did2.unittest.testCorpusEpsilon'); + +tests = functiontests(localfunctions); +end + +function teardownOnce(testCase) +did2.unittest.helpers.restoreSchemaPath(testCase); +end + +function testEpsilonDabCorpusDiscoveryReport(testCase) +if ~epsilonTestEnabled() + assumeFail(testCase, ... + ['DID_RUN_EPSILON_CORPUS not set to a truthy value; skipping the ', ... + 'V_epsilon (Brainstorm E) corpus discovery test. Set ', ... + 'DID_RUN_EPSILON_CORPUS=1 (and DID_SCHEMA_PATH to a V_epsilon ', ... + 'stable+draft schema dir) to enable.']); +end +did2.unittest.helpers.runCorpusDiscovery(testCase, 'Dab-epsilon', ... + 'https://ndi-programming-development.s3.us-east-1.amazonaws.com/Dab.zip', ... + 'Dab', 'TargetVersion', 'V_epsilon'); +end + +function tf = epsilonTestEnabled() +raw = lower(strtrim(getenv('DID_RUN_EPSILON_CORPUS'))); +tf = ismember(raw, {'1', 'true', 'yes', 'y', 'on'}); +end From 0c60b23b238acaf923f05807430b6f7e6dd52182 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 13:22:35 +0000 Subject: [PATCH 04/28] Fix runCorpusDiscovery signature: declare options for the name-value arg Adding options.TargetVersion to the arguments block requires options in the function signature too (MATLAB MismatchBetweenBlockAndLine). This also broke the existing corpus tests that call this helper. No behaviour change. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- tests/+did2/+unittest/+helpers/runCorpusDiscovery.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index 872b6f5..a8f8f6e 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -1,4 +1,4 @@ -function corpusDir = runCorpusDiscovery(testCase, corpusName, corpusURL, innerDir) +function corpusDir = runCorpusDiscovery(testCase, corpusName, corpusURL, innerDir, options) %RUNCORPUSDISCOVERY Shared driver for v1 corpus discovery-mode tests. % % CORPUSDIR = did2.unittest.helpers.runCorpusDiscovery(TESTCASE, CORPUSNAME, From d2035e6cede18dd64924ee8f329cb1b067c2c48e Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 14:23:28 +0000 Subject: [PATCH 05/28] ci: make V_epsilon the main migration/validation target (replaces V_delta) Per "replacing V_delta with V_epsilon": the standard test run now validates against V_epsilon and the corpus discovery migrates through the Brainstorm-E split, instead of running V_epsilon as an isolated opt-in side step. - test-code.yml: assemble a combined V_epsilon stable+draft schema dir (the cache loads one flat dir; tier class names are disjoint) and point the main DID_SCHEMA_PATH at it. Removed the separate opt-in epsilon step. - runCorpusDiscovery: default TargetVersion is now V_epsilon, so all corpus tests (Dab, B, JH, PRED, 20211116, Soph) exercise the split as the main test. - v1_to_v2: stamp document_class.schema_version = V_epsilon on outputs when TargetVersion is V_epsilon. - Removed testCorpusEpsilon.m and the DID_RUN_EPSILON_CORPUS gate (redundant). Non-corpus unit tests are unaffected: they set their own fixtures schema path and run Validate=false, so they do not read DID_SCHEMA_PATH. 
v1_to_v2's own default TargetVersion stays V_delta so the per-class V_delta migrator unit tests (testMigrators/testConvertV1ToV2) remain valid; corpus migration opts into V_epsilon via runCorpusDiscovery. Discovery mode: quarantine (e.g. the expected missing-time_reference until synthesis lands) is reported, not failed. Local-dev default schema path (cache defaultSchemaPath) is unchanged; a multi-tier cache loader is the follow-up that removes the assemble-a-dir step. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .github/workflows/test-code.yml | 48 ++++++------------ src/did/+did2/+convert/v1_to_v2.m | 5 ++ .../+unittest/+helpers/runCorpusDiscovery.m | 2 +- tests/+did2/+unittest/testCorpusEpsilon.m | 49 ------------------- 4 files changed, 20 insertions(+), 84 deletions(-) delete mode 100644 tests/+did2/+unittest/testCorpusEpsilon.m diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index a31f978..da7c4c2 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -35,16 +35,28 @@ jobs: - name: Check out repository uses: actions/checkout@v4 - - name: Check out did-schema (sibling, for V_delta validation) + - name: Check out did-schema (sibling, for V_epsilon validation) uses: actions/checkout@v4 with: repository: Waltham-Data-Science/did-schema ref: main path: did-schema + - name: Assemble V_epsilon schema set (stable + draft) + # V_epsilon replaces V_delta as the migration/validation target. + # The schema cache loads a single flat dir of *.json, but the + # Brainstorm-E observation/manipulation classes live in + # V_epsilon/draft while base/element/etc. live in V_epsilon/stable + # (class names are disjoint across tiers), so combine both tiers + # into one directory for DID_SCHEMA_PATH. 
+ run: | + mkdir -p "${GITHUB_WORKSPACE}/epsilon-schemas" + cp did-schema/schemas/V_epsilon/stable/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" + cp did-schema/schemas/V_epsilon/draft/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" + - name: Export DID_SCHEMA_PATH run: | - echo "DID_SCHEMA_PATH=${GITHUB_WORKSPACE}/did-schema/schemas/V_delta/stable" \ + echo "DID_SCHEMA_PATH=${GITHUB_WORKSPACE}/epsilon-schemas" \ >> "$GITHUB_ENV" - name: Set up MATLAB @@ -92,38 +104,6 @@ jobs: nFailed = sum([results.Failed]); assert(nFailed == 0, sprintf("%d test(s) failed", nFailed)); - - name: Assemble V_epsilon schema set (stable + draft) - # The schema cache loads a single flat dir of *.json. The - # Brainstorm-E observation/manipulation classes live in - # V_epsilon/draft while base/element/etc. live in V_epsilon/stable, - # so combine both tiers (class names are disjoint across tiers) - # into one directory for the V_epsilon corpus discovery step. - if: always() - run: | - mkdir -p "${GITHUB_WORKSPACE}/epsilon-schemas" - cp did-schema/schemas/V_epsilon/stable/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" - cp did-schema/schemas/V_epsilon/draft/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" - - - name: Run V_epsilon corpus discovery (Brainstorm E split) - # Isolated from the default V_delta test run above: its own - # DID_SCHEMA_PATH (the combined V_epsilon dir) and the - # DID_RUN_EPSILON_CORPUS gate that un-skips testCorpusEpsilon. - # Discovery mode — the report is the deliverable; test failures - # (errors, not quarantine) still surface. 
- if: always() - uses: matlab-actions/run-command@v2 - env: - DID_SCHEMA_PATH: ${{ github.workspace }}/epsilon-schemas - DID_RUN_EPSILON_CORPUS: '1' - with: - command: | - addpath(genpath("src")); - addpath(genpath("tests")); - results = runtests("did2.unittest.testCorpusEpsilon"); - disp(table(results)); - nFailed = sum([results.Failed]); - assert(nFailed == 0, sprintf("%d V_epsilon corpus test(s) failed", nFailed)); - - name: Upload corpus discovery reports # Discovery-mode corpus tests (e.g., testCorpus20211116) write # JSON summaries into corpus-reports/. Upload them as a CI diff --git a/src/did/+did2/+convert/v1_to_v2.m b/src/did/+did2/+convert/v1_to_v2.m index 2358366..3a4e697 100644 --- a/src/did/+did2/+convert/v1_to_v2.m +++ b/src/did/+did2/+convert/v1_to_v2.m @@ -141,6 +141,11 @@ % body on the first failure, as before). for bi = 1:numel(v2Bodies) outBody = ensureClassBlocks(v2Bodies{bi}, options.SchemaCache); + if strcmp(options.TargetVersion, 'V_epsilon') ... + && isfield(outBody, 'document_class') ... 
+ && isstruct(outBody.document_class) + outBody.document_class.schema_version = 'V_epsilon'; + end doc = did2.document(outBody); if options.Validate doc.validate('SchemaCache', options.SchemaCache); diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index a8f8f6e..f495d59 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -25,7 +25,7 @@ corpusName (1,:) char corpusURL (1,:) char innerDir (1,:) char - options.TargetVersion (1,:) char = 'V_delta' + options.TargetVersion (1,:) char = 'V_epsilon' end did2.unittest.helpers.installSchemaPath(testCase, sprintf('skipping %s corpus test', corpusName)); diff --git a/tests/+did2/+unittest/testCorpusEpsilon.m b/tests/+did2/+unittest/testCorpusEpsilon.m deleted file mode 100644 index a7e8def..0000000 --- a/tests/+did2/+unittest/testCorpusEpsilon.m +++ /dev/null @@ -1,49 +0,0 @@ -function tests = testCorpusEpsilon -%TESTCORPUSEPSILON Discovery-mode end-to-end run through the Brainstorm-E split. -% -% Runs a real v1 corpus through did2.convert.v1_to_v2 with -% TargetVersion='V_epsilon', so the treatment -> manipulation and -% ontology_table_row -> observation split migrators -% (+did2.+convert.+migrators_e) are exercised and the migrated bodies -% are validated against the V_epsilon schema set. Writes -% corpus-reports/-summary.json. -% -% Opt-in via DID_RUN_EPSILON_CORPUS (the CI step that assembles a -% combined V_epsilon stable+draft schema dir and points -% DID_SCHEMA_PATH at it sets this). When the variable is unset the -% test skips cleanly via assumeFail, so it is a no-op in the default -% V_delta test run. -% -% The Dab corpus is used because its treatment rows (incl. the -% optogenetic-tetanus "Target Location" idiom) directly exercise the -% treatment split. Discovery mode: nothing is asserted about the -% migrated/quarantine split; the report is the deliverable. 
-% -% Run with: -% export DID_RUN_EPSILON_CORPUS=1 -% results = runtests('did2.unittest.testCorpusEpsilon'); - -tests = functiontests(localfunctions); -end - -function teardownOnce(testCase) -did2.unittest.helpers.restoreSchemaPath(testCase); -end - -function testEpsilonDabCorpusDiscoveryReport(testCase) -if ~epsilonTestEnabled() - assumeFail(testCase, ... - ['DID_RUN_EPSILON_CORPUS not set to a truthy value; skipping the ', ... - 'V_epsilon (Brainstorm E) corpus discovery test. Set ', ... - 'DID_RUN_EPSILON_CORPUS=1 (and DID_SCHEMA_PATH to a V_epsilon ', ... - 'stable+draft schema dir) to enable.']); -end -did2.unittest.helpers.runCorpusDiscovery(testCase, 'Dab-epsilon', ... - 'https://ndi-programming-development.s3.us-east-1.amazonaws.com/Dab.zip', ... - 'Dab', 'TargetVersion', 'V_epsilon'); -end - -function tf = epsilonTestEnabled() -raw = lower(strtrim(getenv('DID_RUN_EPSILON_CORPUS'))); -tf = ismember(raw, {'1', 'true', 'yes', 'y', 'on'}); -end From 56e09c10ff8e0ddeccbb788519c3aaa900893ce5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 00:07:41 +0000 Subject: [PATCH 06/28] convert: split migrators emit session_relative_reference anchor (no more empty time_reference) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Brainstorm-E split migrators now attach a real time_reference instead of an empty slot (the prior cause of quarantine for behavioral/treatment rows): - Each migrated interaction depends_on a session_relative_reference document (ordinal, relation='during', anchored to the source's base.session_id) minted with did.ido.unique_id(). 'during' is the honest universal fallback (the event happened within the session); 'at_end_of' is reserved for known-terminal cases, not asserted blanket. - treatment: now 1 -> 2 (manipulation + anchor). ontology_table_row: 1 -> N+1 (N observations sharing one session anchor). 
- Fixes invalid row ids: ontology_table_row observations now get fresh did.ido.unique_id() base ids (the previous "_row%02d" was not a valid did_uid and would have quarantined). Updates testMigratorsE expectations for the added anchor doc (+ asserts the anchor relation is 'during'). Authored without local MATLAB — CI-verified. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+migrators_e/ontology_table_row.m | 45 ++++++++++++++---- .../+did2/+convert/+migrators_e/treatment.m | 46 ++++++++++++++++--- tests/+did2/+unittest/testMigratorsE.m | 13 ++++-- 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index c90c919..41ecfbb 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -39,10 +39,18 @@ 'ontology_table_row has no rows to migrate.'); end -bodies = cell(1, numel(rows)); +% One session-relative anchor is shared by every observation from this +% table (they are all in the same session). 'during' is the honest +% fallback when the row carries no epoch and no UTC date. +anchor = makeSessionAnchor(preBody, 'during'); +bodies = cell(1, numel(rows) + 1); for k = 1:numel(rows) - bodies{k} = migrateRow(preBody, rows{k}, k); + b = migrateRow(preBody, rows{k}, k); + b.depends_on(end+1) = struct('name', 'time_reference_1', ... + 'value', anchor.base.id); + bodies{k} = b; end +bodies{end} = anchor; end % ===================== per-row migration =============================== @@ -163,17 +171,15 @@ body = struct(); body.document_class = struct('class_name', className, 'class_version', '1.0.0', ... 
'superclasses', supers, 'schema_version', 'V_epsilon'); -body.depends_on = carrySubjectAndTime(preBody); +body.depends_on = carrySubject(preBody); if isfield(preBody, 'base') && isstruct(preBody.base) base = preBody.base; - if isfield(base, 'id') && ~isempty(base.id) - base.id = sprintf('%s_row%02d', char(base.id), rowIndex); - end + base.id = did.ido.unique_id(); % each row becomes its own document body.base = base; end end -function deps = carrySubjectAndTime(preBody) +function deps = carrySubject(preBody) deps = struct('name', {}, 'value', {}); subjectVal = ''; if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) @@ -186,7 +192,30 @@ end end deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); -deps(end+1) = struct('name', 'time_reference_1', 'value', ''); +end + +function anchor = makeSessionAnchor(preBody, relation) +%MAKESESSIONANCHOR Session_relative_reference document (ordinal, no metric) +% shared by all observations from this table; anchored to the source's +% session via base.session_id. +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... 
+ 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); end function comp = canonicalComposite(canonField, unit, numVal) diff --git a/src/did/+did2/+convert/+migrators_e/treatment.m b/src/did/+did2/+convert/+migrators_e/treatment.m index c081a67..e82bff1 100644 --- a/src/did/+did2/+convert/+migrators_e/treatment.m +++ b/src/did/+did2/+convert/+migrators_e/treatment.m @@ -85,6 +85,17 @@ ['treatment "%s" (%s) could not be routed to a manipulation ', ... 'family; curator review required.'], label, node); end + +% Attach a session-relative anchor. v1 treatment rows have no DAQ epoch and +% (often) no UTC date, so the honest fallback is an ordinal claim against the +% session. 'during' is correct for any migrated interaction (it happened +% within the session); 'at_end_of' is reserved for interactions known to be +% terminal and is not asserted blanket here. Emitting the time_reference as +% its own document makes this a 1 -> 2 migration. +anchor = makeSessionAnchor(preBody, 'during'); +v2Body.depends_on(end+1) = struct('name', 'time_reference_1', ... + 'value', anchor.base.id); %#ok +v2Body = {v2Body, anchor}; end % ===================== destination builders ============================ @@ -148,17 +159,15 @@ body.document_class = struct( ... 'class_name', className, 'class_version', '1.0.0', ... 'superclasses', supers, 'schema_version', 'V_epsilon'); -body.depends_on = carrySubjectAndTime(preBody); +body.depends_on = carrySubject(preBody); if isfield(preBody, 'base') body.base = preBody.base; end end -function deps = carrySubjectAndTime(preBody) -%CARRYSUBJECTANDTIME Keep subject_id; add a time_reference_1 slot. -% The time_reference value is left empty for curator/tooling backfill -% (real synthesis from session/epoch metadata is a follow-up; see -% treatment.md "time_reference synthesis fidelity"). 
+function deps = carrySubject(preBody) +%CARRYSUBJECT Carry the subject_id dependency forward (time_reference is +% attached separately, pointing at the migrated session anchor). deps = struct('name', {}, 'value', {}); subjectVal = ''; if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) @@ -170,7 +179,30 @@ end end deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); -deps(end+1) = struct('name', 'time_reference_1', 'value', ''); +end + +function anchor = makeSessionAnchor(preBody, relation) +%MAKESESSIONANCHOR Build a session_relative_reference document (ordinal, +% no metric) anchored to the source document's session. Returned as a +% sibling body so the interaction can depend_on it as its time_reference. +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... 
+ 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); end function v = depValue(d) diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m index dfe30d3..ff5a742 100644 --- a/tests/+did2/+unittest/testMigratorsE.m +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -41,13 +41,18 @@ function testThermalTreatmentBecomesTemperatureManipulation(testCase) 'ontology_name', 'ndic:0000nnnn', 'name', 'focal cortical cooling', ... 'numeric_value', 12.0, 'string_value', 'Peltier')); out = runE(v1); -verifyEqual(testCase, numel(out.migrated), 1); +% 1 -> 2: the manipulation plus its session_relative_reference anchor. +verifyEqual(testCase, numel(out.migrated), 2); doc = out.migrated{1}; verifyTrue(testCase, isfield(out.summary.by_class, 'temperature_manipulation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'session_relative_reference')); val = doc.get('scalar_temperature.value'); verifyEqual(testCase, val.celsius, 12.0); ap = doc.get('scalar_manipulation.applied_property'); verifyEqual(testCase, ap.name, 'focal cortical cooling'); +% the anchor is an ordinal 'during' session reference +anchor = out.migrated{2}; +verifyEqual(testCase, anchor.get('session_relative_reference.relation'), 'during'); end function testDrugTreatmentBecomesInjection(testCase) @@ -55,7 +60,7 @@ function testDrugTreatmentBecomesInjection(testCase) 'ontology_name', 'chebi:6015', 'name', 'isoflurane', ... 
'numeric_value', [], 'string_value', '')); out = runE(v1); -verifyEqual(testCase, numel(out.migrated), 1); +verifyEqual(testCase, numel(out.migrated), 2); % injection + session anchor verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); end @@ -102,9 +107,11 @@ function testTableRowFansOutToNObservations(testCase) 'value', 'fbdv:00005336')}; v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); out = runE(v1); -verifyEqual(testCase, numel(out.migrated), 2); +% 2 rows -> 2 observations + 1 shared session anchor. +verifyEqual(testCase, numel(out.migrated), 3); verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); verifyTrue(testCase, isfield(out.summary.by_class, 'developmental_stage_observation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'session_relative_reference')); end function testTableRowScalarValueLandsTyped(testCase) From 0a3cfd3b55ae7bb858a1852a940dceb74ae62f7b Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 00:10:50 +0000 Subject: [PATCH 07/28] Clean up two Code Analyzer findings in the split migrators - treatment.m: drop the stale %#ok suppression (the line isn't in a loop, so no AGROW warning is generated). - ontology_table_row.m: remove the now-unused rowIndex argument threaded through migrateRow/makeScalarObservation/makeCategoricalObservation/ startObservation (row ids now come from did.ido.unique_id()). No behaviour change. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+migrators_e/ontology_table_row.m | 19 +++++++++---------- .../+did2/+convert/+migrators_e/treatment.m | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index 41ecfbb..e52bf0c 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -45,7 +45,7 @@ anchor = makeSessionAnchor(preBody, 'during'); bodies = cell(1, numel(rows) + 1); for k = 1:numel(rows) - b = migrateRow(preBody, rows{k}, k); + b = migrateRow(preBody, rows{k}); b.depends_on(end+1) = struct('name', 'time_reference_1', ... 'value', anchor.base.id); bodies{k} = b; @@ -55,7 +55,7 @@ % ===================== per-row migration =============================== -function body = migrateRow(preBody, row, rowIndex) +function body = migrateRow(preBody, row) node = getCharField(row, 'ontology_name'); label = getCharField(row, 'name'); identity = struct('node', node, 'name', label); @@ -65,12 +65,11 @@ if isNumeric [className, shapeClass, valueStruct] = dispatchScalar(hay, row, numVal); - body = makeScalarObservation(preBody, rowIndex, className, shapeClass, ... + body = makeScalarObservation(preBody, className, shapeClass, ... identity, valueStruct); else [className, valueTerm] = dispatchCategorical(hay, row); - body = makeCategoricalObservation(preBody, rowIndex, className, ... 
- identity, valueTerm); + body = makeCategoricalObservation(preBody, className, identity, valueTerm); end end @@ -137,14 +136,14 @@ % ===================== destination builders ============================ -function body = makeScalarObservation(preBody, rowIndex, className, shapeClass, identity, valueStruct) -body = startObservation(preBody, rowIndex, className, {'scalar_observation', shapeClass}); +function body = makeScalarObservation(preBody, className, shapeClass, identity, valueStruct) +body = startObservation(preBody, className, {'scalar_observation', shapeClass}); body.observation = struct('measured_property', identity, 'target_structure', {{}}); body.(shapeClass) = struct('value', valueStruct); end -function body = makeCategoricalObservation(preBody, rowIndex, className, identity, valueTerm) -body = startObservation(preBody, rowIndex, className, ... +function body = makeCategoricalObservation(preBody, className, identity, valueTerm) +body = startObservation(preBody, className, ... {'categorical_observation', 'categorical_concept'}); body.observation = struct('measured_property', identity, 'target_structure', {{}}); % `value` lives in the block of the class that DECLARES it: the two @@ -162,7 +161,7 @@ % ===================== shared helpers ================================== -function body = startObservation(preBody, rowIndex, className, extraSupers) +function body = startObservation(preBody, className, extraSupers) chain = [{'observation'}, extraSupers]; supers = struct('class_name', {}, 'class_version', {}); for k = 1:numel(chain) diff --git a/src/did/+did2/+convert/+migrators_e/treatment.m b/src/did/+did2/+convert/+migrators_e/treatment.m index e82bff1..0f48c89 100644 --- a/src/did/+did2/+convert/+migrators_e/treatment.m +++ b/src/did/+did2/+convert/+migrators_e/treatment.m @@ -94,7 +94,7 @@ % its own document makes this a 1 -> 2 migration. anchor = makeSessionAnchor(preBody, 'during'); v2Body.depends_on(end+1) = struct('name', 'time_reference_1', ... 
- 'value', anchor.base.id); %#ok + 'value', anchor.base.id); v2Body = {v2Body, anchor}; end From ed271ad765e3adf027257a1c30ca5393653810e4 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 14:22:31 +0000 Subject: [PATCH 08/28] convert: fix the two largest E-split quarantine causes from the corpus report From the V_epsilon corpus discovery (B 0, 20211116 275, Dab 7859, JH 41504): 1. ontology_table_row "has no rows to migrate" (~47K, the dominant cause): the real v1 layout is parallel char fields (names / variable_names / ontology_nodes / data), not a 'rows' array, so extractRows found nothing and threw. Now falls back to migrating the document unchanged as an ontology_table_row (the class still exists in V_epsilon, so it validates) instead of quarantining. Splitting that char-field layout into per-row observations is a follow-up. 2. target_structure "must be a struct (named composite type ontology_term)" (~105): the field is an ARRAY of ontology_term (a struct array), but the migrators emitted a cell array. Emit struct arrays (empty: struct('node',{},'name',{}); populated: struct('node',..,'name',..)) in both treatment.m and ontology_table_row.m. Remaining quarantine is mostly non-E drift (stimulus_bath mixture, stimulus_response_scalar_parameters_basic block, subject_group has no V_epsilon class) plus a handful of unresolved-branch treatments (curator review). 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+convert/+migrators_e/ontology_table_row.m | 16 ++++++++++++---- src/did/+did2/+convert/+migrators_e/treatment.m | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index e52bf0c..d7df50b 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -35,8 +35,14 @@ end rows = extractRows(preBody.ontology_table_row); if isempty(rows) - error('did2:convert:emptyTable', ... - 'ontology_table_row has no rows to migrate.'); + % Unrecognised row layout (e.g. the real v1 form stores parallel + % char fields names / variable_names / ontology_nodes / data rather + % than a 'rows' array). Rather than quarantine, migrate the document + % unchanged as an ontology_table_row -- the class still exists in + % V_epsilon, so it validates. Splitting that char-field layout into + % per-row observations is a follow-up (see ontology_table_row.md). + bodies = {preBody}; + return; end % One session-relative anchor is shared by every observation from this @@ -138,14 +144,16 @@ function body = makeScalarObservation(preBody, className, shapeClass, identity, valueStruct) body = startObservation(preBody, className, {'scalar_observation', shapeClass}); -body.observation = struct('measured_property', identity, 'target_structure', {{}}); +body.observation = struct('measured_property', identity, ... + 'target_structure', {struct('node', {}, 'name', {})}); body.(shapeClass) = struct('value', valueStruct); end function body = makeCategoricalObservation(preBody, className, identity, valueTerm) body = startObservation(preBody, className, ... 
{'categorical_observation', 'categorical_concept'}); -body.observation = struct('measured_property', identity, 'target_structure', {{}}); +body.observation = struct('measured_property', identity, ... + 'target_structure', {struct('node', {}, 'name', {})}); % `value` lives in the block of the class that DECLARES it: the two % overriders (developmental_stage / generic_categorical) declare their own % value; every other categorical property class inherits it from the diff --git a/src/did/+did2/+convert/+migrators_e/treatment.m b/src/did/+did2/+convert/+migrators_e/treatment.m index 0f48c89..e1db830 100644 --- a/src/did/+did2/+convert/+migrators_e/treatment.m +++ b/src/did/+did2/+convert/+migrators_e/treatment.m @@ -46,10 +46,10 @@ hay = lower([node ' ' label]); % search text for the heuristic branch % --- Dab edge case: string_value is an ontology target, not prose ------ -targetStructure = {}; +targetStructure = struct('node', {}, 'name', {}); % empty ontology_term array notesText = strValue; if endsWith(lower(strtrim(label)), 'target location') || looksLikeCURIE(strValue) - targetStructure = {struct('node', strValue, 'name', '')}; + targetStructure = struct('node', strValue, 'name', ''); notesText = ''; identity.name = strtrim(regexprep(label, '(?i)\s*target location$', '')); end From 1393d836c7fef79eeb110de9ab1148eb23a6f9e5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 14:48:01 +0000 Subject: [PATCH 09/28] convert: split ontology_table_row's real char-field layout into per-column observations extractRows now parses the actual v1 ontology_table_row form (per the schema): parallel comma-separated names / variable_names / ontology_nodes plus a data struct keyed by variable_names. One document is one table ROW; each COLUMN becomes one observation (identity = ontology_node + name; value = data.(variable_name)). Columns with no usable value (missing key, [], '', NaN) are skipped. 
The synthetic rows-array / single-row shapes still work; the unparseable-fallback (migrate unchanged as ontology_table_row) remains for anything else. This converts the ~47K "has no rows to migrate" quarantine into actual per-column observations (the dominant E-split gap from the corpus report). Adds testMigratorsE cases for the char-field split and empty-value skipping. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+migrators_e/ontology_table_row.m | 51 +++++++++++++++++-- tests/+did2/+unittest/testMigratorsE.m | 31 +++++++++++ 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index d7df50b..115a5da 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -231,7 +231,14 @@ end function rows = extractRows(block) -%EXTRACTROWS Normalise the legacy table to a cell of row structs. +%EXTRACTROWS Normalise an ontology_table_row body to a cell of column structs +% (each {ontology_name, name, value}), one per measured property. +% +% The real v1 layout (per the schema) is column-parallel: comma-separated +% `names` / `variable_names` / `ontology_nodes` plus a `data` struct keyed +% by the variable_names. One document is one table ROW; each COLUMN is a +% property measurement and becomes one observation. (Also accepts the +% synthetic `rows`-array and single-row shapes used by tests.) 
rows = {}; if isfield(block, 'rows') r = block.rows; @@ -240,9 +247,45 @@ elseif isstruct(r) rows = arrayfun(@(x) x, r(:)', 'UniformOutput', false); end -elseif isfield(block, 'ontology_name') || isfield(block, 'name') - % single-row legacy shape (the table block IS one row) - rows = {block}; + return; +end +if isfield(block, 'variable_names') + vars = splitCSV(getCharField(block, 'variable_names')); + names = splitCSV(getCharField(block, 'names')); + nodes = splitCSV(getCharField(block, 'ontology_nodes')); + data = struct(); + if isfield(block, 'data') && isstruct(block.data) + data = block.data; + end + for i = 1:numel(vars) + key = vars{i}; + nm = ''; nd = ''; + if i <= numel(names); nm = names{i}; end + if i <= numel(nodes); nd = nodes{i}; end + val = []; + if ~isempty(key) && isfield(data, key) + val = data.(key); + end + % Skip columns with no usable value (missing key, [], '', NaN). + if isempty(val) || (isnumeric(val) && isscalar(val) && isnan(val)) + continue; + end + rows{end+1} = struct('ontology_name', nd, 'name', nm, 'value', val); %#ok + end + return; +end +if isfield(block, 'ontology_name') || isfield(block, 'name') + rows = {block}; % single-row legacy shape (the block IS one row) +end +end + +function parts = splitCSV(s) +parts = {}; +if isempty(s) + return; +end +raw = strsplit(char(s), ','); +parts = cellfun(@strtrim, raw, 'UniformOutput', false); end end diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m index ff5a742..abd5347 100644 --- a/tests/+did2/+unittest/testMigratorsE.m +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -134,6 +134,37 @@ function testTableRowGeneratesUniqueIdsPerRow(testCase) verifyNotEqual(testCase, id1, id2); end +function testTableRowCharFieldLayoutSplitsByColumn(testCase) +% The real v1 layout: parallel char fields + a data struct keyed by +% variable_names (one document = one row; each column = one observation). +block = struct( ... + 'names', 'weight,life cycle stage', ... 
+ 'variable_names', 'weight,stage', ... + 'ontology_nodes', 'schema:weight,uberon:0000105', ... + 'data', struct('weight', 22.5, 'stage', 'fbdv:00005336')); +v1 = wrap('ontology_table_row', 'ontology_table_row', block); +out = runE(v1); +% 2 columns -> 2 observations + 1 shared session anchor. +verifyEqual(testCase, numel(out.migrated), 3); +verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'developmental_stage_observation')); +end + +function testTableRowCharFieldEmptyValuesSkipped(testCase) +% Columns with no usable value (missing key / NaN) are skipped, not +% turned into empty observations. +block = struct( ... + 'names', 'weight,missing', ... + 'variable_names', 'weight,missing', ... + 'ontology_nodes', 'schema:weight,schema:missing', ... + 'data', struct('weight', 22.5, 'missing', nan)); +v1 = wrap('ontology_table_row', 'ontology_table_row', block); +out = runE(v1); +% only the weight column survives -> 1 observation + 1 anchor. +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); +end + % ===================== backward compatibility ========================== function testDefaultTargetLeavesTreatmentUnchanged(testCase) From 19c215c72b835dd4194bf50384dbddc3fff07a83 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 14:49:44 +0000 Subject: [PATCH 10/28] Fix codespell: rename nd/nm to node/label (codespell read 'nd' as 'and') No .codespellrc to ignore-list, so rename the variables. No behaviour change. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- src/did/+did2/+convert/+migrators_e/ontology_table_row.m | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index 115a5da..60f6759 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -259,9 +259,9 @@ end for i = 1:numel(vars) key = vars{i}; - nm = ''; nd = ''; - if i <= numel(names); nm = names{i}; end - if i <= numel(nodes); nd = nodes{i}; end + label = ''; node = ''; + if i <= numel(names); label = names{i}; end + if i <= numel(nodes); node = nodes{i}; end val = []; if ~isempty(key) && isfield(data, key) val = data.(key); @@ -270,7 +270,7 @@ if isempty(val) || (isnumeric(val) && isscalar(val) && isnan(val)) continue; end - rows{end+1} = struct('ontology_name', nd, 'name', nm, 'value', val); %#ok + rows{end+1} = struct('ontology_name', node, 'name', label, 'value', val); %#ok end return; end From d01fa15390bb40631ce038129357578093dd04b7 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 20 Jun 2026 14:51:53 +0000 Subject: [PATCH 11/28] Fix parse error: remove stray end left by the extractRows rewrite The extractRows rewrite's old_string didn't include the original function's closing end, leaving a duplicate end after splitCSV (Code Analyzer parse error at END, line 290; the 'function might be unused' alert was its downstream symptom). Removed the extra end; the file is balanced (the apparent off-by-one in keyword counts is the arguments block). 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- src/did/+did2/+convert/+migrators_e/ontology_table_row.m | 1 - 1 file changed, 1 deletion(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index 60f6759..962b150 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -287,7 +287,6 @@ raw = strsplit(char(s), ','); parts = cellfun(@strtrim, raw, 'UniformOutput', false); end -end function [isNumeric, numVal] = rowNumericValue(row) isNumeric = false; numVal = []; From 3f1a32f7b847265de71167677d6baac948aff942 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 21:05:59 +0000 Subject: [PATCH 12/28] ci: validate corpus against the matching did-schema E branch, not main MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dominant corpus quarantine ("No schema file for class generic_categorical_observation" 41,095 in JH; session_relative_reference; generic_scalar_observation.value placement) was not migrator bugs — the workflow checked out did-schema at ref: main, where the Brainstorm-E classes do not yet exist. They live on the matching did-schema branch (DID-schema#62). Point the did-schema checkout at ${{ github.head_ref || 'main' }} so the V_epsilon corpus discovery validates against the actual E schema. Revert to main once DID-schema#62 lands. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .github/workflows/test-code.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index da7c4c2..a0003aa 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -39,7 +39,12 @@ jobs: uses: actions/checkout@v4 with: repository: Waltham-Data-Science/did-schema - ref: main + # Track the matching did-schema E branch: the Brainstorm-E classes + # (observation property classes, shape library, session_relative_ + # reference, subject_statement, etc.) live on this branch and are + # not yet on did-schema main. Revert to `main` once DID-schema#62 + # merges. Falls back to main off-PR (e.g. direct pushes to V2). + ref: ${{ github.head_ref || 'main' }} path: did-schema - name: Assemble V_epsilon schema set (stable + draft) From 2cd9dc14134d1b8e2ed7eb050369f2b1a58eefcf Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 22:06:37 +0000 Subject: [PATCH 13/28] convert: defer stimulus_bath to the NDI layer with a clear reason stimulus_bath migrates to a `bath` (pharmacological_manipulation), but the bath needs its subject (the stimulator element's subject) and an epoch_bounded_ reference time anchor (the stimulator's epoch) -- both require following stimulus_element_id into the session/element graph. A manipulation must be emitted complete, so the whole bath is assembled in ndi.migrate.local, not per-document. The per-doc converter now defers via did2:convert:needsSessionContext, so the discovery report reads "migrate via ndi.migrate.local" instead of the misleading "mixture missing" (the prior V_delta-block-placement fallback). Adds a test. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+convert/+migrators_e/stimulus_bath.m | 28 +++++++++++++++++++ tests/+did2/+unittest/testMigratorsE.m | 15 ++++++++++ 2 files changed, 43 insertions(+) create mode 100644 src/did/+did2/+convert/+migrators_e/stimulus_bath.m diff --git a/src/did/+did2/+convert/+migrators_e/stimulus_bath.m b/src/did/+did2/+convert/+migrators_e/stimulus_bath.m new file mode 100644 index 0000000..1d02db7 --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/stimulus_bath.m @@ -0,0 +1,28 @@ +function v2Body = stimulus_bath(preBody) +%STIMULUS_BATH Deferred: stimulus_bath migrates to a `bath` in the NDI layer. +% +% The legacy stimulus_bath is really a bath (pharmacological_manipulation): +% its mixture/location live in the document, but the resulting bath needs two +% things that can only be obtained by following stimulus_element_id to the +% stimulator ELEMENT and its session/epoch graph -- +% +% - subject_id : the stimulator element's subject, and +% - time_reference : an epoch_bounded_reference on the stimulator's epoch +% (the stimulator is the time referent; no other +% connection to it is kept). +% +% A manipulation must be emitted complete (all required dependencies +% together), so the whole bath is assembled in ndi.migrate.local, which has +% the element and epoch in hand. The per-document converter cannot complete +% it, so it defers here with a clear, queryable reason rather than emitting a +% partial (or a wrong-block fallback that reads as "mixture missing"). +% +% See ndi.migrate.context.stimulus_bath (NDI-matlab) for the actual build. + +error('did2:convert:needsSessionContext', ... + ['stimulus_bath -> bath is migrated in the NDI layer ', ... + '(ndi.migrate.local): the bath''s subject (from the stimulator ', ... + 'element) and its epoch_bounded_reference time anchor (the ', ... + 'stimulator''s epoch) require the session/element graph. 
Deferred.']); +v2Body = preBody; %#ok % unreachable; satisfies the output signature +end diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m index abd5347..815ab92 100644 --- a/tests/+did2/+unittest/testMigratorsE.m +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -165,6 +165,21 @@ function testTableRowCharFieldEmptyValuesSkipped(testCase) verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); end +% ===================== context-dependent deferral ===================== + +function testStimulusBathDefersToNdiLayer(testCase) +% stimulus_bath is migrated to a `bath` in the NDI layer (it needs the +% stimulator element for its subject + epoch anchor), so the per-document +% converter defers it with a clear reason rather than emitting a partial. +v1 = wrap('stimulus_bath', 'stimulus_bath', struct( ... + 'location', struct('ontologyNode', 'uberon:0001017', 'name', 'CNS'), ... + 'mixture_table', '')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 0); +verifyEqual(testCase, numel(out.quarantine), 1); +verifyTrue(testCase, contains(out.quarantine(1).reason, 'NDI layer')); +end + % ===================== backward compatibility ========================== function testDefaultTargetLeavesTreatmentUnchanged(testCase) From 427a9f4ee7692a3b27ba1c3b1710c1b6027bc6fe Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 22:09:01 +0000 Subject: [PATCH 14/28] Fix Code Analyzer: stimulus_bath deferral stub uses the idiomatic always-error pattern The prior 'error then unreachable assignment' tripped 'return value might be unset' + 'input unused'. Use ~ for the ignored input and assign the required output before the error (suppressing the dead assignment), so the stub is analyzer-clean. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- src/did/+did2/+convert/+migrators_e/stimulus_bath.m | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/stimulus_bath.m b/src/did/+did2/+convert/+migrators_e/stimulus_bath.m index 1d02db7..7812794 100644 --- a/src/did/+did2/+convert/+migrators_e/stimulus_bath.m +++ b/src/did/+did2/+convert/+migrators_e/stimulus_bath.m @@ -1,4 +1,4 @@ -function v2Body = stimulus_bath(preBody) +function v2Body = stimulus_bath(~) %STIMULUS_BATH Deferred: stimulus_bath migrates to a `bath` in the NDI layer. % % The legacy stimulus_bath is really a bath (pharmacological_manipulation): @@ -17,12 +17,12 @@ % it, so it defers here with a clear, queryable reason rather than emitting a % partial (or a wrong-block fallback that reads as "mixture missing"). % -% See ndi.migrate.context.stimulus_bath (NDI-matlab) for the actual build. +% See ndi.migrate.internal.stimulusBathToBath (NDI-matlab) for the build. +v2Body = struct(); %#ok % required output; this migrator always defers error('did2:convert:needsSessionContext', ... ['stimulus_bath -> bath is migrated in the NDI layer ', ... '(ndi.migrate.local): the bath''s subject (from the stimulator ', ... 'element) and its epoch_bounded_reference time anchor (the ', ... 'stimulator''s epoch) require the session/element graph. Deferred.']); -v2Body = preBody; %#ok % unreachable; satisfies the output signature end From 5720ba7561d25fb0dfb259fc03190cfdf87d97d5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 21 Jun 2026 22:15:57 +0000 Subject: [PATCH 15/28] convert: generalize the idempotency short-circuit to the target version isAlreadyVDelta -> isAlreadyTarget(body, targetVersion): a body already tagged with the configured TargetVersion (V_delta or V_epsilon) short-circuits the migration loop and is just padded (ensureClassBlocks) + validated, not re-migrated. 
Default 'V_delta' preserves existing behaviour exactly. This is what lets ndi.migrate.local feed NDI-assembled V_epsilon bodies (the bath + epoch_bounded_reference from stimulusBathToBath) back through v1_to_v2 to get the superclass chain rebuilt and validated. Adds a short-circuit test. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- src/did/+did2/+convert/v1_to_v2.m | 21 ++++++++++++--------- tests/+did2/+unittest/testMigratorsE.m | 12 ++++++++++++ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/did/+did2/+convert/v1_to_v2.m b/src/did/+did2/+convert/v1_to_v2.m index 3a4e697..3cdf3b7 100644 --- a/src/did/+did2/+convert/v1_to_v2.m +++ b/src/did/+did2/+convert/v1_to_v2.m @@ -106,7 +106,7 @@ className = ''; try preBody = ensureStruct(rawBody); - if isAlreadyVDelta(preBody) + if isAlreadyTarget(preBody, options.TargetVersion) % Idempotency short-circuit: the body is already V_delta, % so skip universalRenames and the per-class migrators. % ensureClassBlocks still runs (it rebuilds the V_delta @@ -230,14 +230,17 @@ end end -function tf = isAlreadyVDelta(body) -% Return true when BODY is already a V_delta-shaped document so the +function tf = isAlreadyTarget(body, targetVersion) +% Return true when BODY is already a TARGETVERSION-shaped document so the % per-body migration loop can skip universalRenames and the per-class -% migrators. Both conditions must hold so the short-circuit only fires -% when we have high confidence the body is V_delta: -% (a) document_class.schema_version is the literal char 'V_delta' -% (set by the last run of universalRenames, or by the writer), -% AND +% migrators (it still gets ensureClassBlocks + validate). 
Both conditions +% must hold so the short-circuit only fires when we have high confidence +% the body is already at the target: +% (a) document_class.schema_version is the literal char TARGETVERSION +% (set by the last run of universalRenames, the writer, or -- for +% 'V_epsilon' -- a context assembler such as +% ndi.migrate.internal.stimulusBathToBath that emits ready-made +% target bodies), AND % (b) the body carries no v1-only structural markers — underscore- % prefixed top-level keys (e.g., legacy _classname, % _class_version) that predate the document_class header and @@ -261,7 +264,7 @@ if isstring(sv) && isscalar(sv) sv = char(sv); end -if ~ischar(sv) || ~strcmp(sv, 'V_delta') +if ~ischar(sv) || ~strcmp(sv, targetVersion) return; end topKeys = fieldnames(body); diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m index 815ab92..105bbae 100644 --- a/tests/+did2/+unittest/testMigratorsE.m +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -180,6 +180,18 @@ function testStimulusBathDefersToNdiLayer(testCase) verifyTrue(testCase, contains(out.quarantine(1).reason, 'NDI layer')); end +function testAlreadyEpsilonBodyShortCircuits(testCase) +% A body already tagged schema_version 'V_epsilon' (e.g. emitted by an NDI +% context assembler) short-circuits the migration loop and is just +% padded/validated, not re-migrated. This is what lets ndi.migrate.local +% feed assembled bath/time-reference bodies back through v1_to_v2. 
+v1 = wrap('mock', 'mock', struct()); +v1.document_class.schema_version = 'V_epsilon'; +out = did2.convert.v1_to_v2(v1, 'Validate', false, 'TargetVersion', 'V_epsilon'); +verifyEqual(testCase, numel(out.migrated), 1); +verifyEqual(testCase, numel(out.quarantine), 0); +end + % ===================== backward compatibility ========================== function testDefaultTargetLeavesTreatmentUnchanged(testCase) From ca0ec764323255a882fa47c0ed2e48306b49ceb0 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 11:11:31 +0000 Subject: [PATCH 16/28] migrators_e: write categorical value into the concrete class block Follows the V_epsilon schema flatten (categorical_concept.value now placement: concrete_class). ontology_table_row's categorical path no longer branches on whether the class is an overrider; it writes the bound term uniformly into body.(className).value. This clears the ~47K "categorical_concept.value missing" corpus quarantine, since the inherited value now lives in each observation's own block. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+convert/+migrators_e/ontology_table_row.m | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index 962b150..36494cc 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -154,17 +154,11 @@ {'categorical_observation', 'categorical_concept'}); body.observation = struct('measured_property', identity, ... 
'target_structure', {struct('node', {}, 'name', {})}); -% `value` lives in the block of the class that DECLARES it: the two -% overriders (developmental_stage / generic_categorical) declare their own -% value; every other categorical property class inherits it from the -% categorical_concept shape mixin, so its value lives in that block. -if any(strcmp(className, {'developmental_stage_observation', ... - 'generic_categorical_observation'})) - valueBlock = className; -else - valueBlock = 'categorical_concept'; -end -body.(valueBlock) = struct('value', valueTerm); +% categorical_concept declares `value` with placement: concrete_class, so +% the bound term lives in the concrete observation class's OWN block and +% categorical_concept contributes no block. One value, one block, uniform +% across every categorical observation (no per-class branching). +body.(className) = struct('value', valueTerm); end % ===================== shared helpers ================================== From 47ddbd224ac9691f9967d86f359e86353003bcc0 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 13:17:14 +0000 Subject: [PATCH 17/28] migrators_e: add subject_group -> subject(is_group) migrator + tests Clears the JH subject_group corpus quarantine (353 docs): subject_group is deprecated in V_epsilon, so the per-document E migrator folds it into a subject flagged is_group (optional legacy group_name/description map to local_identifier/description). Membership -> group_assignment is relational and stays an NDI-layer follow-up (like stimulus_bath -> bath). 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+convert/+migrators_e/subject_group.m | 65 +++++++++++++++++++ tests/+did2/+unittest/testMigratorsE.m | 24 +++++++ 2 files changed, 89 insertions(+) create mode 100644 src/did/+did2/+convert/+migrators_e/subject_group.m diff --git a/src/did/+did2/+convert/+migrators_e/subject_group.m b/src/did/+did2/+convert/+migrators_e/subject_group.m new file mode 100644 index 0000000..15615fe --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/subject_group.m @@ -0,0 +1,65 @@ +function v2Body = subject_group(preBody) +%SUBJECT_GROUP Brainstorm-E migrator: did_v1 subject_group -> subject (is_group). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Per did-schema V_epsilon_SPEC.md, subject_group is +% deprecated and folds into the subject tier: +% +% subject_group -> subject (is_group: true) +% +% The legacy subject_group document is an (essentially empty) marker -- +% membership is expressed by member subjects referencing the group, not +% by fields on the group doc itself. So the per-document migration is +% 1 -> 1: the group becomes a `subject` flagged is_group. The membership +% edges become `group_assignment` events, but those are RELATIONAL (they +% need the member subjects that point at this group) and are assembled in +% the NDI layer, exactly like stimulus_bath -> bath; they are not +% manufactured here from a doc that carries no members. +% +% Optional legacy group_name / description (newer subject_group docs may +% carry them; v1 corpus docs do not) map onto the subject block's +% local_identifier / description. 
+ +arguments + preBody (1,1) struct +end + +groupName = ''; +desc = ''; +if isfield(preBody, 'subject_group') && isstruct(preBody.subject_group) + sg = preBody.subject_group; + groupName = getCharField(sg, 'group_name'); + desc = getCharField(sg, 'description'); +end + +v2Body = struct(); +v2Body.document_class = struct( ... + 'class_name', 'subject', 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'base', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +% v1 subject_group carried no depends_on; keep that (membership lives on +% the member subjects, resolved into group_assignment in the NDI layer). +v2Body.depends_on = struct('name', {}, 'value', {}); +if isfield(preBody, 'base') && isstruct(preBody.base) + v2Body.base = preBody.base; +end +v2Body.subject = struct( ... + 'local_identifier', groupName, ... + 'description', desc, ... + 'is_biological', false, ... + 'is_group', true); +end + +% ===================== helpers ============================================= + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m index 105bbae..bb1e7fe 100644 --- a/tests/+did2/+unittest/testMigratorsE.m +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -192,6 +192,30 @@ function testAlreadyEpsilonBodyShortCircuits(testCase) verifyEqual(testCase, numel(out.quarantine), 0); end +% ===================== subject_group -> subject ======================= + +function testSubjectGroupBecomesGroupSubject(testCase) +% subject_group folds into the subject tier as a subject flagged is_group. 
+v1 = wrap('subject_group', 'subject_group', struct()); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 1); +verifyTrue(testCase, isfield(out.summary.by_class, 'subject')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('subject.is_group'), true); +verifyEqual(testCase, doc.get('subject.is_biological'), false); +end + +function testSubjectGroupCarriesOptionalNameAndDescription(testCase) +% Newer subject_group docs may carry group_name / description; they map +% onto the subject block's local_identifier / description. +v1 = wrap('subject_group', 'subject_group', struct( ... + 'group_name', 'control', 'description', 'untreated cohort')); +out = runE(v1); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('subject.local_identifier'), 'control'); +verifyEqual(testCase, doc.get('subject.description'), 'untreated cohort'); +end + % ===================== backward compatibility ========================== function testDefaultTargetLeavesTreatmentUnchanged(testCase) From 683a0ec43a614791a08c576986a441a037d3e4a8 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 14:06:17 +0000 Subject: [PATCH 18/28] test: run the 20211116 corpus against V_epsilon 20211116 was the lone corpus still migrating to V_delta via bespoke test code. Convert it to the shared did2.unittest.helpers.runCorpusDiscovery driver (same as B / Dab / JH), which targets V_epsilon and reuses the common schema-path install/teardown and report writer. This puts every corpus on the same V_epsilon target so its remaining quarantine is measured on the same footing as the others. Discovery mode is unchanged (no zero-quarantine assertion); the report is the deliverable. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- tests/+did2/+unittest/testCorpus20211116.m | 185 +++------------------ 1 file changed, 19 insertions(+), 166 deletions(-) diff --git a/tests/+did2/+unittest/testCorpus20211116.m b/tests/+did2/+unittest/testCorpus20211116.m index f777272..7dfd30e 100644 --- a/tests/+did2/+unittest/testCorpus20211116.m +++ b/tests/+did2/+unittest/testCorpus20211116.m @@ -3,28 +3,28 @@ % % Pulls the 20211116.zip fixture from the public S3 prefix % (~11MB compressed, ~36MB unzipped, ~1220 v1 documents across -% ~21 classes), runs every contained body through -% did2.convert.v1_to_v2 with Validate=true, and writes a per-run -% summary JSON to corpus-reports/20211116-summary.json. The -% workflow's upload-artifact step picks the file up as a CI -% artifact. +% ~21 classes) and runs every contained body through +% did2.convert.v1_to_v2 with Validate=true, targeting **V_epsilon** +% (via the shared did2.unittest.helpers.runCorpusDiscovery driver, +% same as the B / Dab / JH corpora), and writes a per-run summary +% JSON to corpus-reports/20211116-summary.json that the workflow's +% upload-artifact step picks up. % -% Unlike testCorpusPRED, this is **discovery mode**: the test does -% not assert zero quarantine. Its job is to surface coverage -% signal (which classes / required fields are not yet migratable) -% without blocking unrelated PRs on migrator work. The single hard -% assertion is that the corpus contained at least one JSON file, -% to catch a broken fixture URL. +% Discovery mode: the test does not assert zero quarantine. Its job +% is to surface coverage signal (which classes / required fields are +% not yet migratable) without blocking unrelated PRs on migrator +% work. The single hard assertion (inside the helper) is that the +% corpus contained at least one JSON file, to catch a broken fixture +% URL. 
% % The corpus URL: % https://ndi-programming-development.s3.us-east-1.amazonaws.com/20211116.zip % The zip contains a top-level 20211116/ directory of v1 NDI % document JSONs (plus __MACOSX/ sidecars that are skipped). % -% Schema-path resolution mirrors testCorpusPRED: DID_SCHEMA_PATH -% first, then the did2.schema.cache sibling-checkout default; -% skips via assumeFail if neither resolves so local devs without a -% did-schema checkout get a clean skip. +% Schema-path resolution + teardown are handled by the shared +% helpers (DID_SCHEMA_PATH first, then the did2.schema.cache +% sibling-checkout default; assumeFail skip if neither resolves). % % Run with: % results = runtests('did2.unittest.testCorpus20211116'); @@ -32,159 +32,12 @@ tests = functiontests(localfunctions); end -function setupOnce(testCase) -% Seed teardown-safe fields first so teardown is a no-op when -% setupOnce filters via assumeFail before any override happens. -testCase.TestData.previousSchemaPath = getenv('DID_SCHEMA_PATH'); -testCase.TestData.didOverrideSchemaPath = false; -testCase.TestData.corpusDir = ''; - -schemaPath = resolveSchemaPath(); -if isempty(schemaPath) - assumeFail(testCase, ... - ['V_delta schemas not found. Set DID_SCHEMA_PATH or check out ', ... - 'did-schema as a sibling of DID-matlab; skipping 20211116 corpus test.']); -end -setenv('DID_SCHEMA_PATH', schemaPath); -testCase.TestData.didOverrideSchemaPath = true; -did2.schema.cache.resetSingleton(); - -testCase.TestData.corpusDir = ensureCorpus( ... - 'https://ndi-programming-development.s3.us-east-1.amazonaws.com/20211116.zip', ... - 'did2-corpus-20211116', '20211116'); -end - function teardownOnce(testCase) -if isfield(testCase.TestData, 'didOverrideSchemaPath') ... 
- && testCase.TestData.didOverrideSchemaPath - setenv('DID_SCHEMA_PATH', testCase.TestData.previousSchemaPath); - did2.schema.cache.resetSingleton(); -end +did2.unittest.helpers.restoreSchemaPath(testCase); end function test20211116CorpusDiscoveryReport(testCase) -corpusDir = testCase.TestData.corpusDir; -files = dir(fullfile(corpusDir, '*.json')); -files = files(~startsWith({files.name}, '._')); -verifyGreaterThan(testCase, numel(files), 0, ... - sprintf('No JSON files found under %s', corpusDir)); - -bodies = cell(numel(files), 1); -for k = 1:numel(files) - bodies{k} = fileread(fullfile(files(k).folder, files(k).name)); -end - -result = did2.convert.v1_to_v2(bodies, 'Validate', true); - -reasons = topQuarantineReasons(result.quarantine); -reportPath = writeReport('20211116', result, reasons); - -fprintf('\n=== Corpus 20211116 discovery summary ===\n'); -fprintf('total: %d\n', result.summary.total); -fprintf('migrated_count: %d\n', result.summary.migrated_count); -fprintf('quarantine_count: %d\n', result.summary.quarantine_count); -fprintf('report: %s\n', reportPath); -fprintf('top quarantine reasons:\n'); -for k = 1:min(numel(reasons), 15) - fprintf(' %5d [%s] %s\n', reasons(k).count, ... - reasons(k).class_name, reasons(k).reason); -end -end - -% --- helpers --- - -function reasons = topQuarantineReasons(quarantine) -% Aggregate quarantine entries by (class_name, reason) and return a -% struct array sorted by descending count. -if isempty(quarantine) - reasons = struct('class_name', {}, 'reason', {}, 'count', {}); - return; -end -keys = cell(1, numel(quarantine)); -for k = 1:numel(quarantine) - keys{k} = sprintf('%s|||%s', quarantine(k).class_name, ... 
- quarantine(k).reason); -end -[uniqKeys, ~, idx] = unique(keys); -counts = accumarray(idx, 1); -reasons = struct('class_name', {}, 'reason', {}, 'count', {}); -for k = 1:numel(uniqKeys) - parts = strsplit(uniqKeys{k}, '|||'); - reasons(k).class_name = parts{1}; - reasons(k).reason = parts{2}; - reasons(k).count = counts(k); -end -[~, order] = sort(-[reasons.count]); -reasons = reasons(order); -end - -function reportPath = writeReport(corpusName, result, reasons) -% Write a JSON discovery summary into /corpus-reports/. The CI -% workflow's upload-artifact step picks up everything under that -% directory. -reportDir = fullfile(pwd, 'corpus-reports'); -if ~exist(reportDir, 'dir') - mkdir(reportDir); -end -reportPath = fullfile(reportDir, [corpusName '-summary.json']); - -report = struct( ... - 'corpus', corpusName, ... - 'generated_at', char(datetime('now', 'TimeZone', 'UTC', ... - 'Format', 'yyyy-MM-dd''T''HH:mm:ss''Z''')), ... - 'total', result.summary.total, ... - 'migrated_count', result.summary.migrated_count, ... - 'quarantine_count', result.summary.quarantine_count, ... - 'by_class', result.summary.by_class, ... - 'quarantine_reasons', reasons); - -fid = fopen(reportPath, 'w'); -if fid < 0 - error('did2:test:reportWriteFailed', ... - 'Could not open %s for writing.', reportPath); -end -cleanup = onCleanup(@() fclose(fid)); %#ok -fwrite(fid, jsonencode(report, 'PrettyPrint', true)); -end - -function p = resolveSchemaPath() -% Return a directory that holds V_delta `*.json` schema files, or '' -% if none can be found. Probe order: DID_SCHEMA_PATH env, then the -% sibling-checkout default (matches did2.schema.cache). -candidates = {}; -envPath = getenv('DID_SCHEMA_PATH'); -if ~isempty(envPath) - candidates{end+1} = envPath; %#ok -end -toolboxDir = did.toolboxdir(); -candidates{end+1} = fullfile(toolboxDir, '..', '..', '..', ... 
- 'did-schema', 'schemas', 'V_delta', 'stable'); %#ok - -p = ''; -for k = 1:numel(candidates) - candidate = candidates{k}; - if isfolder(candidate) && ~isempty(dir(fullfile(candidate, '*.json'))) - p = candidate; - return; - end -end -end - -function corpusDir = ensureCorpus(corpusURL, cacheName, innerDir) -% Download (if necessary) and extract a corpus zip. The unzip target -% is cached under tempdir so repeated runs in the same MATLAB -% session reuse the same files. -cacheRoot = fullfile(tempdir(), cacheName); -corpusDir = fullfile(cacheRoot, innerDir); -if isfolder(corpusDir) && ~isempty(dir(fullfile(corpusDir, '*.json'))) - return; -end -if ~exist(cacheRoot, 'dir') - mkdir(cacheRoot); -end -zipPath = fullfile(cacheRoot, [innerDir '.zip']); -if ~isfile(zipPath) - websave(zipPath, corpusURL); -end -unzip(zipPath, cacheRoot); +did2.unittest.helpers.runCorpusDiscovery(testCase, '20211116', ... + 'https://ndi-programming-development.s3.us-east-1.amazonaws.com/20211116.zip', ... + '20211116'); end From 4c636d53055d9066598b1b50dc92c60063cb4ce3 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 18:18:24 +0000 Subject: [PATCH 19/28] convert: drop stray empty blocks for non-hosting chain classes v1 documents carried a property block for every class in their hierarchy. V_epsilon introduces abstract / fieldless parent classes (new -- the previous schema had none), which contribute no block to an instance. The inherited empty block (e.g. stimulus_response_scalar_ parameters: {} on a stimulus_response_scalar_parameters_basic doc) rode through migration untouched and tripped the strict undeclared-top-level- block validator. In ensureClassBlocks, after manufacturing the contributing blocks, remove any top-level block that names a chain class which contributes no block AND is an empty struct. Only EMPTY blocks are dropped; a non-empty one signals real data a migrator must place, so it is left to fail loudly rather than be silently discarded. 
General fix (one place) rather than a per-class migrator, so every v1 abstract-parent-empty-block case clears. Regression-checked by the 20211116 corpus discovery job (its 273 "undeclared top-level block stimulus_response_scalar_parameters" should drop to ~0). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- src/did/+did2/+convert/v1_to_v2.m | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/did/+did2/+convert/v1_to_v2.m b/src/did/+did2/+convert/v1_to_v2.m index 3cdf3b7..1875efb 100644 --- a/src/did/+did2/+convert/v1_to_v2.m +++ b/src/did/+did2/+convert/v1_to_v2.m @@ -378,6 +378,23 @@ body.(cls) = struct(); end end +% Drop stray EMPTY blocks left by v1 for chain classes that the target +% schema does NOT host on the instance. v1 documents carried a property +% block for every class in their hierarchy, including parents that became +% abstract / fieldless in V_delta/V_epsilon (abstract classes are new +% here). Those arrive as empty structs and would trip the strict +% undeclared-top-level-block check. Only EMPTY such blocks are removed -- +% a non-empty one signals real data a migrator must place, so it is left +% to fail loudly rather than be silently dropped. +chainClasses = [reshape(ancestors, 1, []), {className}]; +nonContributing = setdiff(chainClasses, placementInfo.blocksContributed); +for k = 1:numel(nonContributing) + cls = nonContributing{k}; + if isfield(body, cls) && isstruct(body.(cls)) ... 
+ && (numel(body.(cls)) == 0 || isempty(fieldnames(body.(cls)))) + body = rmfield(body, cls); + end +end sc = struct('class_name', {}, 'class_version', {}); for k = 1:numel(ancestors) ancDC = cache.getClass(ancestors{k}).document_class; From 35721f07595c1b9aaf6f32d297d905045f3dea37 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 19:08:07 +0000 Subject: [PATCH 20/28] migrators_e: add treatment_drug / virus_injection / treatment_transfer; enable Soph corpus Three remaining deprecated-class folds (no migrator existed, so they would quarantine under V_epsilon): - treatment_drug -> injection (kind: drug); mixture from mixture_table - virus_injection -> injection (kind: virus); virus+dilution in the mixture - treatment_transfer -> biological_transfer; recipient->subject_id, donor carried as donor_id, method->procedure/kind Each is a 1 -> 2 emit (the manipulation + the shared session_relative_ reference anchor subject_interaction requires) and uses the correct {chemical, amount} mixture record shape. Tests added to testMigratorsE. Also enable the large Soph corpus (~446 MB, ~101k docs) in test-code.yml (DID_RUN_SOPH_TEST=1) so its V_epsilon quarantine is surfaced for the discovery pass; this is opt-in and meant to be gated back off once Soph coverage is characterised. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .github/workflows/test-code.yml | 7 + .../+convert/+migrators_e/treatment_drug.m | 173 ++++++++++++++++++ .../+migrators_e/treatment_transfer.m | 120 ++++++++++++ .../+convert/+migrators_e/virus_injection.m | 169 +++++++++++++++++ tests/+did2/+unittest/testMigratorsE.m | 69 +++++++ 5 files changed, 538 insertions(+) create mode 100644 src/did/+did2/+convert/+migrators_e/treatment_drug.m create mode 100644 src/did/+did2/+convert/+migrators_e/treatment_transfer.m create mode 100644 src/did/+did2/+convert/+migrators_e/virus_injection.m diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index a0003aa..c57806a 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -64,6 +64,13 @@ jobs: echo "DID_SCHEMA_PATH=${GITHUB_WORKSPACE}/epsilon-schemas" \ >> "$GITHUB_ENV" + # Discovery run: include the large Soph corpus (~446 MB, ~101k docs) + # so its V_epsilon quarantine is surfaced alongside the others. This + # is opt-in (the test skips without it); gate it back off once the + # Soph coverage is characterised, since it adds significant CI time. + - name: Enable Soph corpus discovery + run: echo "DID_RUN_SOPH_TEST=1" >> "$GITHUB_ENV" + - name: Set up MATLAB uses: matlab-actions/setup-matlab@v2 with: diff --git a/src/did/+did2/+convert/+migrators_e/treatment_drug.m b/src/did/+did2/+convert/+migrators_e/treatment_drug.m new file mode 100644 index 0000000..0c1456d --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/treatment_drug.m @@ -0,0 +1,173 @@ +function v2Body = treatment_drug(preBody) +%TREATMENT_DRUG Brainstorm-E migrator: did_v1 treatment_drug -> injection (drug). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. 
Per V_epsilon_SPEC.md, treatment_drug is deprecated and +% folds into injection (kind: "drug"): the administered substance becomes +% the pharmacological_manipulation.mixture, the body location becomes the +% injection target_structure. 1 -> 2: the injection plus the shared +% session_relative_reference anchor every migrated interaction needs +% (subject_interaction requires a time_reference). +% +% Branch/field resolution here is a HEURISTIC seed (the legacy +% mixture_table format varies); the authoritative mapping is finalised in +% discovery mode against real corpora. + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'treatment_drug') || ~isstruct(preBody.treatment_drug) + error('did2:convert:missingBlock', ... + 'treatment_drug body is missing the treatment_drug property block.'); +end +block = preBody.treatment_drug; + +targetStructure = ontologyArray( ... + getCharField(block, 'location_ontologyNode'), ... + getCharField(block, 'location_name')); +mixture = parseMixtureTable(block); + +inj = startManipulation(preBody, 'injection', {'pharmacological_manipulation'}); +inj.pharmacological_manipulation = struct('mixture', mixture); +inj.injection = struct( ... + 'kind', 'drug', ... + 'volume', blankVolume(), ... + 'route', ontologyTerm('', ''), ... + 'target_structure', {targetStructure}); + +anchor = makeSessionAnchor(preBody, 'during'); +inj.depends_on(end+1) = struct('name', 'time_reference_1', ... + 'value', anchor.base.id); +v2Body = {inj, anchor}; +end + +% ===================== shared helpers ================================== + +function body = startManipulation(preBody, className, extraSupers) +chain = [{'manipulation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct( ... + 'class_name', className, 'class_version', '1.0.0', ... 
+ 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubject(preBody); +if isfield(preBody, 'base') + body.base = preBody.base; +end +end + +function deps = carrySubject(preBody) +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + subjectVal = depValue(d); + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +end + +function anchor = makeSessionAnchor(preBody, relation) +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function mixture = parseMixtureTable(block) +%PARSEMIXTURETABLE Best-effort parse of the legacy CSV mixture_table into the +% {chemical, amount} records pharmacological_manipulation.mixture wants. +% mustBeNonEmpty: always return >= 1 record (a blank one if nothing +% parses), so the document validates; the blank is the curator's signal. 
+mixture = struct('chemical', {}, 'amount', {}); +raw = ''; +if isfield(block, 'mixture_table') + v = block.mixture_table; + if ischar(v); raw = v; elseif isstring(v) && isscalar(v); raw = char(v); end +end +if ~isempty(raw) + lines = strsplit(raw, newline); + for i = 1:numel(lines) + cols = strsplit(strtrim(lines{i}), ','); + if numel(cols) < 2 || isempty(strtrim(cols{1})) + continue; + end + chemical = ontologyTerm(strtrim(cols{1}), strtrim(cols{2})); + amount = blankConcentration(); + if numel(cols) >= 3 && ~isempty(strtrim(cols{3})) + amount.source_value = str2double(strtrim(cols{3})); + end + if numel(cols) >= 4 + amount.source_unit = strtrim(cols{4}); + end + mixture(end+1) = struct('chemical', chemical, 'amount', amount); %#ok + end +end +if isempty(mixture) + mixture(1) = struct('chemical', ontologyTerm('', ''), ... + 'amount', blankConcentration()); +end +end + +function arr = ontologyArray(node, name) +if isempty(node) && isempty(name) + arr = struct('node', {}, 'name', {}); % empty ontology_term array +else + arr = ontologyTerm(node, name); +end +end + +function t = ontologyTerm(node, name) +t = struct('node', char(node), 'name', char(name)); +end + +function v = blankVolume() +v = struct('liters', 0.0, 'source_unit', '', 'source_value', 0.0, ... 
+ 'approximate', false); +end + +function c = blankConcentration() +c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end + +function v = depValue(d) +v = ''; +if isfield(d, 'value') + v = d.value; +elseif isfield(d, 'document_id') + v = d.document_id; +end +end diff --git a/src/did/+did2/+convert/+migrators_e/treatment_transfer.m b/src/did/+did2/+convert/+migrators_e/treatment_transfer.m new file mode 100644 index 0000000..ef7b62c --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/treatment_transfer.m @@ -0,0 +1,120 @@ +function v2Body = treatment_transfer(preBody) +%TREATMENT_TRANSFER Brainstorm-E migrator: did_v1 treatment_transfer -> +% biological_transfer. +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Per V_epsilon_SPEC.md, treatment_transfer is deprecated and +% folds into biological_transfer (a procedural_manipulation): the +% transferred entity becomes biological_transfer.entity, the transfer +% method becomes procedural_manipulation.procedure (and biological_ +% transfer.kind), the legacy recipient_id becomes the subject_id, and the +% donor_id is carried as biological_transfer's donor dependency. 1 -> 2: +% the transfer plus the shared session_relative_reference anchor +% (subject_interaction needs a time_reference). +% +% The legacy timestamp/clocktype carry real timing that could anchor a +% UTC/event reference; the honest fallback for now is the ordinal session +% anchor (refined in the temporal-anchoring follow-up). + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'treatment_transfer') || ~isstruct(preBody.treatment_transfer) + error('did2:convert:missingBlock', ... 
+ 'treatment_transfer body is missing the treatment_transfer property block.'); +end +block = preBody.treatment_transfer; + +recipientId = namedDep(preBody, 'recipient_id'); +donorId = namedDep(preBody, 'donor_id'); + +entity = ontologyTerm(getCharField(block, 'entity_ontologyNode'), ... + getCharField(block, 'entity_name')); +procedure = ontologyTerm(getCharField(block, 'method_ontologyNode'), ... + getCharField(block, 'method_name')); +kind = getCharField(block, 'method_name'); +if isempty(kind) + kind = 'transfer'; % biological_transfer.kind is char, mustBeNonEmpty +end + +body = struct(); +body.document_class = struct( ... + 'class_name', 'biological_transfer', 'class_version', '1.0.0', ... + 'superclasses', [ ... + struct('class_name', 'manipulation', 'class_version', '1.0.0'), ... + struct('class_name', 'procedural_manipulation', 'class_version', '1.0.0')], ... + 'schema_version', 'V_epsilon'); +% subject_id (the recipient) + donor_id; time_reference is appended below. +body.depends_on = [ ... + struct('name', 'subject_id', 'value', recipientId), ... + struct('name', 'donor_id', 'value', donorId)]; +if isfield(preBody, 'base') + body.base = preBody.base; +end +body.procedural_manipulation = struct( ... + 'procedure', procedure, ... + 'target_structure', {struct('node', {}, 'name', {})}, ... + 'notes', ''); +body.biological_transfer = struct('entity', entity, 'kind', kind); + +anchor = makeSessionAnchor(preBody, 'during'); +body.depends_on(end+1) = struct('name', 'time_reference_1', ... 
+ 'value', anchor.base.id); +v2Body = {body, anchor}; +end + +% ===================== shared helpers ================================== + +function anchor = makeSessionAnchor(preBody, relation) +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function val = namedDep(preBody, name) +val = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, name) + if isfield(d, 'value') + val = d.value; + elseif isfield(d, 'document_id') + val = d.document_id; + end + end + end +end +end + +function t = ontologyTerm(node, name) +t = struct('node', char(node), 'name', char(name)); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end diff --git a/src/did/+did2/+convert/+migrators_e/virus_injection.m b/src/did/+did2/+convert/+migrators_e/virus_injection.m new file mode 100644 index 0000000..b232b27 --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/virus_injection.m @@ -0,0 +1,169 @@ 
+function v2Body = virus_injection(preBody) +%VIRUS_INJECTION Brainstorm-E migrator: did_v1 virus_injection -> injection (virus). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Per V_epsilon_SPEC.md, virus_injection is deprecated and +% folds into injection (kind: "virus"): the virus identity becomes the +% first pharmacological_manipulation.mixture chemical (serotype carried in +% the ontology term), the dilution becomes its concentration amount, the +% diluent (if named) a second mixture record, and the injection site +% becomes the target_structure. 1 -> 2: the injection plus the shared +% session_relative_reference anchor (subject_interaction needs a +% time_reference). +% +% The administration date / PND carry timing that could anchor a UTC or +% developmental reference; for now the honest fallback is the ordinal +% session anchor (refined in the temporal-anchoring follow-up). + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'virus_injection') || ~isstruct(preBody.virus_injection) + error('did2:convert:missingBlock', ... + 'virus_injection body is missing the virus_injection property block.'); +end +block = preBody.virus_injection; + +% virus chemical + dilution amount +amount = blankConcentration(); +dilution = numField(block, 'dilution'); +if ~isempty(dilution) + amount.source_value = dilution; + amount.source_unit = 'dilution'; +end +mixture = struct( ... + 'chemical', ontologyTerm(getCharField(block, 'virus_OntologyName'), ... + getCharField(block, 'virus_name')), ... + 'amount', amount); +% optional diluent as a second record +diluentNode = getCharField(block, 'diluent_OntologyName'); +diluentName = getCharField(block, 'diluent_name'); +if ~isempty(diluentNode) || ~isempty(diluentName) + mixture(end+1) = struct( ... + 'chemical', ontologyTerm(diluentNode, diluentName), ... + 'amount', blankConcentration()); +end + +targetStructure = ontologyArray( ... 
+ getCharField(block, 'virusLocation_OntologyName'), ... + getCharField(block, 'virusLocation_name')); + +inj = startManipulation(preBody, 'injection', {'pharmacological_manipulation'}); +inj.pharmacological_manipulation = struct('mixture', mixture); +inj.injection = struct( ... + 'kind', 'virus', ... + 'volume', blankVolume(), ... + 'route', ontologyTerm('', ''), ... + 'target_structure', {targetStructure}); + +anchor = makeSessionAnchor(preBody, 'during'); +inj.depends_on(end+1) = struct('name', 'time_reference_1', ... + 'value', anchor.base.id); +v2Body = {inj, anchor}; +end + +% ===================== shared helpers ================================== + +function body = startManipulation(preBody, className, extraSupers) +chain = [{'manipulation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct( ... + 'class_name', className, 'class_version', '1.0.0', ... 
+ 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubject(preBody); +if isfield(preBody, 'base') + body.base = preBody.base; +end +end + +function deps = carrySubject(preBody) +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + subjectVal = depValue(d); + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +end + +function anchor = makeSessionAnchor(preBody, relation) +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function arr = ontologyArray(node, name) +if isempty(node) && isempty(name) + arr = struct('node', {}, 'name', {}); +else + arr = ontologyTerm(node, name); +end +end + +function t = ontologyTerm(node, name) +t = struct('node', char(node), 'name', char(name)); +end + +function v = blankVolume() +v = struct('liters', 0.0, 'source_unit', '', 'source_value', 0.0, ... 
+ 'approximate', false); +end + +function c = blankConcentration() +c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end + +function n = numField(block, name) +n = []; +if isfield(block, name) + v = block.(name); + if isnumeric(v) && isscalar(v) && isfinite(v) + n = double(v); + end +end +end + +function v = depValue(d) +v = ''; +if isfield(d, 'value') + v = d.value; +elseif isfield(d, 'document_id') + v = d.document_id; +end +end diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m index bb1e7fe..3eb4c5f 100644 --- a/tests/+did2/+unittest/testMigratorsE.m +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -34,6 +34,18 @@ out = did2.convert.v1_to_v2(v1, 'Validate', false, 'TargetVersion', 'V_epsilon'); end +function v = depVal(doc, name) +% Fetch a depends_on value by name from a migrated did2.document. +v = ''; +deps = doc.get('depends_on'); +for k = 1:numel(deps) + if isfield(deps(k), 'name') && strcmp(deps(k).name, name) + v = deps(k).value; + return; + end +end +end + % ===================== treatment -> manipulation ======================= function testThermalTreatmentBecomesTemperatureManipulation(testCase) @@ -192,6 +204,63 @@ function testAlreadyEpsilonBodyShortCircuits(testCase) verifyEqual(testCase, numel(out.quarantine), 0); end +% ===================== deprecated treatment family -> injection/transfer = + +function testTreatmentDrugBecomesInjection(testCase) +v1 = wrap('treatment_drug', 'treatment_drug', struct( ... + 'location_ontologyNode', 'uberon:0000955', 'location_name', 'brain', ... 
+ 'mixture_table', 'chebi:6904,muscimol,5,mg/ml')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); % injection + session anchor +verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('injection.kind'), 'drug'); +mix = doc.get('pharmacological_manipulation.mixture'); +verifyEqual(testCase, mix(1).chemical.name, 'muscimol'); +end + +function testVirusInjectionBecomesVirusInjection(testCase) +v1 = wrap('virus_injection', 'virus_injection', struct( ... + 'virus_OntologyName', 'addgene:26973', 'virus_name', 'AAV9-CaMKII-GCaMP', ... + 'virusLocation_OntologyName', 'uberon:0001950', 'virusLocation_name', 'neocortex', ... + 'dilution', 0.5, 'diluent_OntologyName', '', 'diluent_name', 'saline')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('injection.kind'), 'virus'); +mix = doc.get('pharmacological_manipulation.mixture'); +verifyEqual(testCase, mix(1).chemical.name, 'AAV9-CaMKII-GCaMP'); +verifyEqual(testCase, mix(1).amount.source_value, 0.5); +end + +function testTreatmentTransferBecomesBiologicalTransfer(testCase) +% treatment_transfer carries recipient_id + donor_id (not subject_id). +v1 = struct(); +v1.document_class = struct('class_name', 'treatment_transfer', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'base', 'class_version', '1.0.0')); +v1.depends_on = struct( ... + 'name', {'recipient_id', 'donor_id'}, ... + 'value', {'aabb1122ccdd3344_1111111111111111', ... + 'aabb1122ccdd3344_2222222222222222'}); +v1.base = struct('id', 'aabb1122ccdd3344_3333333333333333', ... + 'session_id', 'aabb1122ccdd3344_9900aabbccddeeff', ... + 'name', 'transfer-example', 'datestamp', '2024-06-01T12:00:00.000Z'); +v1.treatment_transfer = struct('entity_name', 'donor retina', ... + 'entity_ontologyNode', 'uberon:0000966', ... 
+ 'method_name', 'transplant', 'method_ontologyNode', 'ncit:C15282'); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'biological_transfer')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('biological_transfer.entity').name, 'donor retina'); +verifyEqual(testCase, doc.get('biological_transfer.kind'), 'transplant'); +% recipient -> subject_id; donor carried as donor_id +verifyEqual(testCase, depVal(doc, 'subject_id'), 'aabb1122ccdd3344_1111111111111111'); +verifyEqual(testCase, depVal(doc, 'donor_id'), 'aabb1122ccdd3344_2222222222222222'); +end + % ===================== subject_group -> subject ======================= function testSubjectGroupBecomesGroupSubject(testCase) From 6d778f86cfc26ee3efcba4532039c12968fb2abd Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 22:16:36 +0000 Subject: [PATCH 21/28] ci: gate the Soph corpus back off Soph discovery is characterised (0 quarantine at V_epsilon), so drop the DID_RUN_SOPH_TEST=1 enable from test-code.yml -- the ~446 MB / ~101k-doc download should not run on every push. The test remains opt-in (skips cleanly) and can be re-enabled on demand. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .github/workflows/test-code.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index c57806a..a0003aa 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -64,13 +64,6 @@ jobs: echo "DID_SCHEMA_PATH=${GITHUB_WORKSPACE}/epsilon-schemas" \ >> "$GITHUB_ENV" - # Discovery run: include the large Soph corpus (~446 MB, ~101k docs) - # so its V_epsilon quarantine is surfaced alongside the others. This - # is opt-in (the test skips without it); gate it back off once the - # Soph coverage is characterised, since it adds significant CI time. 
- - name: Enable Soph corpus discovery - run: echo "DID_RUN_SOPH_TEST=1" >> "$GITHUB_ENV" - - name: Set up MATLAB uses: matlab-actions/setup-matlab@v2 with: From ab68a4ef9a4e160766f3a95c46611c94fc5ed213 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 23:41:12 +0000 Subject: [PATCH 22/28] migrators_e: fix treatment injection/bath to emit schema-valid bodies The treatment -> injection/bath builders predated schema validation (treatment isn't in the validated corpora) and had three mismatches that would quarantine the moment a real treatment-drug/bath row appeared: - mixture records used {agent, concentration}; the schema is {chemical, amount}. - injection/bath were given a `notes` field neither class declares (strict undeclared-field failure). - injection omitted its required `volume`. Align both builders with the injection/bath schemas: mixture as {chemical, amount}, injection carries kind/volume(blank)/route/ target_structure, bath carries kind/location. notes is dropped (no field on these classes). Matches the {chemical, amount} shape the new treatment_drug / virus_injection migrators already use. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+did2/+convert/+migrators_e/treatment.m | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/treatment.m b/src/did/+did2/+convert/+migrators_e/treatment.m index e1db830..ee964a6 100644 --- a/src/did/+did2/+convert/+migrators_e/treatment.m +++ b/src/did/+did2/+convert/+migrators_e/treatment.m @@ -110,23 +110,26 @@ body.scalar_temperature = struct('value', temperatureComposite(numValue)); end -function body = makeInjection(preBody, identity, targetStructure, notesText) +function body = makeInjection(preBody, identity, targetStructure, ~) body = startBody(preBody, 'injection', {'pharmacological_manipulation'}); +% mixture records are {chemical: ontology_term, amount: concentration}. 
body.pharmacological_manipulation = struct('mixture', ... - struct('agent', identity, 'concentration', emptyConcentration())); + struct('chemical', identity, 'amount', emptyConcentration())); +% injection declares kind/volume/route/target_structure (no `notes`); +% volume + route are required and emitted blank for curator fill-in. body.injection = struct( ... 'kind', 'drug', ... + 'volume', blankVolume(), ... 'route', struct('node', '', 'name', ''), ... - 'target_structure', {targetStructure}, ... - 'notes', notesText); + 'target_structure', {targetStructure}); end -function body = makeBath(preBody, identity, notesText) +function body = makeBath(preBody, identity, ~) body = startBody(preBody, 'bath', {'pharmacological_manipulation'}); body.pharmacological_manipulation = struct('mixture', ... - struct('agent', identity, 'concentration', emptyConcentration())); -body.bath = struct('kind', 'drug', ... - 'location', struct('node', '', 'name', ''), 'notes', notesText); + struct('chemical', identity, 'amount', emptyConcentration())); +% bath declares only kind/location (no `notes`). +body.bath = struct('kind', 'drug', 'location', struct('node', '', 'name', '')); end function body = makeProceduralManipulation(preBody, identity, targetStructure, notesText) @@ -229,6 +232,11 @@ c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false); end +function v = blankVolume() +v = struct('liters', 0.0, 'source_unit', '', 'source_value', 0.0, ... 
+ 'approximate', false); +end + function s = getCharField(block, name) s = ''; if isfield(block, name) From 3a7ca015339248d643af383d2139eac5837a7077 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 23:46:10 +0000 Subject: [PATCH 23/28] test: emit a per-term routing inventory from corpus discovery (#3 routing) To curate the authoritative treatment / ontology_table_row routing tables from real data, the discovery run now writes corpus-reports/NAME-routing.json (NAME = the corpus name): for every migrated observation/manipulation, the identity term (measured_property / applied_property / procedure / factor / entity / first mixture chemical) aggregated to its routed class with counts. This makes the heuristic routing AUDITABLE -- terms landing in generic_* are unmatched, and a term under a surprising class is a mis-route. Best-effort and wrapped so it never breaks the discovery summary. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+unittest/+helpers/runCorpusDiscovery.m | 10 ++ .../+unittest/+helpers/writeRoutingReport.m | 125 ++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 tests/+did2/+unittest/+helpers/writeRoutingReport.m diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index f495d59..c3f249d 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -49,6 +49,16 @@ reasons = did2.unittest.helpers.topQuarantineReasons(result.quarantine); reportPath = did2.unittest.helpers.writeCorpusReport(corpusName, result, reasons); +% Per-term routing inventory (best-effort): makes the heuristic +% treatment / ontology_table_row routing auditable against real corpus +% terms so the authoritative per-term tables can be curated. Never let it +% break the discovery run -- the summary is the primary deliverable. 
+try + did2.unittest.helpers.writeRoutingReport(corpusName, result.migrated); +catch routingErr + fprintf('routing report skipped: %s\n', routingErr.message); +end + fprintf('\n=== Corpus %s discovery summary (target %s) ===\n', ... corpusName, options.TargetVersion); fprintf('total: %d\n', result.summary.total); diff --git a/tests/+did2/+unittest/+helpers/writeRoutingReport.m b/tests/+did2/+unittest/+helpers/writeRoutingReport.m new file mode 100644 index 0000000..5881c9f --- /dev/null +++ b/tests/+did2/+unittest/+helpers/writeRoutingReport.m @@ -0,0 +1,125 @@ +function reportPath = writeRoutingReport(corpusName, migrated) +%WRITEROUTINGREPORT Per-term -> routed-class inventory for routing curation. +% +% REPORTPATH = did2.unittest.helpers.writeRoutingReport(NAME, MIGRATED) +% walks the migrated observation/manipulation documents, extracts each +% one's identity term (the property the row is ABOUT -- measured_property +% / applied_property / procedure / factor / entity / first mixture +% chemical), and aggregates (term_node, term_name, class_name) with +% counts into corpus-reports/NAME-routing.json under the current folder (picked up by the +% upload-artifact step alongside the discovery summary). +% +% Purpose: the treatment / ontology_table_row migrators route by +% keyword/CURIE HEURISTICS, so everything migrates "green" but a term can +% land in the wrong class with no error. This report makes routing +% AUDITABLE against real corpus terms: +% - rows whose class_name is generic_scalar_observation / +% generic_categorical_observation are UNMATCHED terms (need a minted +% class or a routing rule), and +% - a term that appears under a surprising class is a mis-route to fix. +% It is the data source for building the authoritative per-term routing +% tables (discovery mode, the conversion docs' "Open questions"). +% +% Best-effort and side-effect-only: any failure is swallowed by the +% caller so it never breaks the discovery run (the summary is primary). 
+ +reportDir = fullfile(pwd, 'corpus-reports'); +if ~exist(reportDir, 'dir') + mkdir(reportDir); +end +reportPath = fullfile(reportDir, [corpusName '-routing.json']); + +keys = {}; +nodes = {}; +names = {}; +classes = {}; +counts = []; +for k = 1:numel(migrated) + doc = migrated{k}; + cls = doc.className(); + [node, name] = identityTerm(doc); + if isempty(node) && isempty(name) + continue; % not a property-bearing observation/manipulation + end + key = [node '|' name '|' cls]; + idx = find(strcmp(keys, key), 1); + if isempty(idx) + keys{end+1} = key; %#ok + nodes{end+1} = node; %#ok + names{end+1} = name; %#ok + classes{end+1} = cls; %#ok + counts(end+1) = 1; %#ok + else + counts(idx) = counts(idx) + 1; + end +end + +if isempty(counts) + entries = struct('term_node', {}, 'term_name', {}, ... + 'class_name', {}, 'count', {}); +else + [~, order] = sort(counts, 'descend'); + entries = struct('term_node', nodes(order), 'term_name', names(order), ... + 'class_name', classes(order), 'count', num2cell(counts(order))); +end + +report = struct( ... + 'corpus', corpusName, ... + 'generated_at', char(datetime('now', 'TimeZone', 'UTC', ... + 'Format', 'yyyy-MM-dd''T''HH:mm:ss''Z''')), ... + 'distinct_terms', numel(entries), ... + 'routes', entries); + +fid = fopen(reportPath, 'w'); +if fid < 0 + error('did2:test:reportWriteFailed', ... + 'Could not open %s for writing.', reportPath); +end +cleanup = onCleanup(@() fclose(fid)); %#ok +fwrite(fid, jsonencode(report, 'PrettyPrint', true)); +end + +% ===================== helpers ============================================ + +function [node, name] = identityTerm(doc) +%IDENTITYTERM The ontology term a migrated observation/manipulation is about. +node = ''; +name = ''; +% Single-term identity fields, in priority order across the tiers. +paths = { ... + 'observation.measured_property', ... % observations + 'scalar_manipulation.applied_property', ... % temperature_manipulation, ... + 'procedural_manipulation.procedure', ... 
% procedural_manipulation, biological_transfer + 'environmental_manipulation.factor', ... % environmental_manipulation + 'biological_transfer.entity'}; % biological_transfer (more specific) +for p = 1:numel(paths) + t = tryGet(doc, paths{p}); + [node, name] = termOf(t); + if ~isempty(node) || ~isempty(name) + return; + end +end +% Pharmacological tiers (injection/bath) carry the agent in mixture[1].chemical. +m = tryGet(doc, 'pharmacological_manipulation.mixture'); +if ~isempty(m) && isstruct(m) + [node, name] = termOf(m(1).chemical); +end +end + +function [node, name] = termOf(t) +node = ''; +name = ''; +if isstruct(t) && isscalar(t) + if isfield(t, 'node') && ischar(t.node); node = t.node; end + if isfield(t, 'name') && ischar(t.name); name = t.name; end +end +end + +function v = tryGet(doc, path) +v = []; +try + v = doc.get(path); +catch + v = []; +end +end From ea081ebb29742092d569c150340466b90da8a10b Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 22 Jun 2026 23:59:18 +0000 Subject: [PATCH 24/28] did(legacy): accept did2-form superclasses in validate_doc_vs_schema The legacy did1 validator derived a document's superclass NAMES solely from each superclasses entry's `.definition` path ($NDIDOCUMENTPATH/base.json -> base). did2-form documents (the shape every v1->v2 migrator now emits, and increasingly what NDI writes) carry the name directly as `.class_name` with no `.definition`, so the extraction errored to empty and EVERY such document failed the superclasses check with `("base" <=> "")` -- e.g. the NDI dataset/session ingestion suite on the Vnext line. Accept both forms: keep the `.definition` derivation, and when that yields nothing, fall back to `.class_name`. Bound the deeper superclass recursion by `superFullNames` (the `.definition` paths) so did2-form docs -- which expose names but no paths -- skip the path-based recursion cleanly instead of indexing past the end; the concrete class is still validated and the superclass-name check has already passed. 
Verified via the NDI dataset/session ingestion tests (they exercise this legacy path); did2's own suite does not reach it. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- src/did/+did/database.m | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/did/+did/database.m b/src/did/+did/database.m index dbb0c4e..4aa8324 100644 --- a/src/did/+did/database.m +++ b/src/did/+did/database.m @@ -1211,6 +1211,19 @@ function validate_doc_vs_schema(database_obj, docProps, schemaStruct, all_ids) for i = 1 : numel(superFullNames) [~,superNames{i}] = fileparts(superFullNames{i}); % keep compatibility with Matlab 2019a end + % did2-form documents carry superclass names directly in + % `.class_name` (e.g. {class_name:'base'}) and have no + % `.definition` path. Fall back to those names so the legacy + % validator recognises did2-shaped documents during the + % v1->v2 transition (otherwise superNames is empty and every + % such doc fails the superclasses check spuriously). + if isempty(superNames) + try + superNames = {classProps.superclasses.class_name}; + catch + superNames = {}; + end + end if ~iscell(superNames), superNames = {superNames}; end superNames = unique(superNames); schemaFields = fieldnames(schemaStruct); @@ -1238,8 +1251,13 @@ function validate_doc_vs_schema(database_obj, docProps, schemaStruct, all_ids) assert(areSame,'DID:Database:ValidationSuperClasses', ... 'Dissimilar superclasses defined/found for %s ("%s" <=> "%s")', ... doc_name, expectedStr, superNamesStr); - % Recursively validate all superNames against this doc: - for idx = 1 : numel(superNames) + % Recursively validate all superclasses against this + % doc. 
Bound by superFullNames (the `.definition` + % paths): did2-form docs expose names but no paths, so + % superFullNames is empty and the deeper recursion is + % skipped -- the concrete class is still validated, and + % the superclasses-name check above already passed. + for idx = 1 : numel(superFullNames) % First get the superClass' definition struct defStruct = database_obj.get_document_schema(superFullNames{idx}); % Extract validation file from definition From 1c1145ee25d202e3a808c8ef47152fc4f45e8272 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 11:26:38 +0000 Subject: [PATCH 25/28] test: echo routing inventory to stdout (CI log) The per-term routing report ships as a JSON artifact, but the artifact isn't readable from the migration tooling we use to curate. Echo the actionable breakdown to stdout so it lands in the CI log: every term routed to a generic_* escape hatch (the UNMATCHED ones that need a minted class or a routing rule), then the top routes overall. Data source for curating the authoritative treatment / ontology_table_row routing tables. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+unittest/+helpers/writeRoutingReport.m | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/+did2/+unittest/+helpers/writeRoutingReport.m b/tests/+did2/+unittest/+helpers/writeRoutingReport.m index 5881c9f..8a83c71 100644 --- a/tests/+did2/+unittest/+helpers/writeRoutingReport.m +++ b/tests/+did2/+unittest/+helpers/writeRoutingReport.m @@ -77,6 +77,27 @@ end cleanup = onCleanup(@() fclose(fid)); %#ok fwrite(fid, jsonencode(report, 'PrettyPrint', true)); + +% Echo the actionable breakdown to stdout so the CI log carries it (the +% JSON also ships as an artifact). The terms routed to the generic_* +% escape hatches are the UNMATCHED ones -- they need a minted class or a +% routing rule -- so list those in full, then the top routes overall. 
+fprintf('\n--- routing inventory (%s): %d distinct term->class routes ---\n', ... + corpusName, numel(entries)); +nUnmatched = 0; +for i = 1:numel(entries) + if startsWith(entries(i).class_name, 'generic_') + nUnmatched = nUnmatched + 1; + fprintf(' UNMATCHED %6d %-28s [%s] -> %s\n', entries(i).count, ... + entries(i).term_node, entries(i).term_name, entries(i).class_name); + end +end +fprintf(' (%d unmatched term routes to generic_*)\n', nUnmatched); +fprintf(' top routes:\n'); +for i = 1:min(numel(entries), 30) + fprintf(' %6d %-28s [%s] -> %s\n', entries(i).count, ... + entries(i).term_node, entries(i).term_name, entries(i).class_name); +end end % ===================== helpers ============================================ From 3065c7a64d50397a8f44203924572463c7b91f4b Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 12:42:21 +0000 Subject: [PATCH 26/28] test: add reference-integrity sweep to corpus discovery (#2) After the 1->N splits and class folds, run did2.validate.references over the migrated batch and report dangling depends_on edges (orphan_count / edges_examined, aggregated by source-class.edge_name). Surfaces any reference the migration would break -- e.g. a split that didn't preserve a referenced id, or an edge to a deferred/quarantined doc. Best-effort, non-fatal (discovery mode); echoed to the CI log next to the routing inventory. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+unittest/+helpers/runCorpusDiscovery.m | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index c3f249d..90ffa8e 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -59,6 +59,23 @@ fprintf('routing report skipped: %s\n', routingErr.message); end +% Reference-integrity sweep (best-effort): after the 1->N splits and class +% folds, confirm every depends_on edge in the migrated batch resolves to a +% document in that batch. Orphans = dangling references the migration would +% introduce (e.g. a split that didn't preserve a referenced id, or a ref to +% a deferred/quarantined doc). Reported, not fatal -- discovery mode. +try + refRep = did2.validate.references(result.migrated); + fprintf('\n--- reference integrity (%s): %d orphan(s) of %d edges ---\n', ... + corpusName, refRep.orphan_count, refRep.edges_examined); + [orphNames, orphCounts] = aggregateOrphans(refRep.orphans); + for i = 1:numel(orphNames) + fprintf(' %6d %s\n', orphCounts(i), orphNames{i}); + end +catch refErr + fprintf('reference report skipped: %s\n', refErr.message); +end + fprintf('\n=== Corpus %s discovery summary (target %s) ===\n', ... corpusName, options.TargetVersion); fprintf('total: %d\n', result.summary.total); @@ -71,3 +88,23 @@ reasons(k).class_name, reasons(k).reason); end end + +function [names, counts] = aggregateOrphans(orphans) +%AGGREGATEORPHANS Count dangling edges by "doc_class.edge_name", desc. 
+names = {}; +counts = []; +for k = 1:numel(orphans) + key = sprintf('%s.%s', orphans(k).doc_class, orphans(k).edge_name); + idx = find(strcmp(names, key), 1); + if isempty(idx) + names{end+1} = key; %#ok + counts(end+1) = 1; %#ok + else + counts(idx) = counts(idx) + 1; + end +end +if ~isempty(counts) + [counts, order] = sort(counts, 'descend'); + names = names(order); +end +end From ace5796abac505597e3eff6986bff6753e93ef83 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 12:44:25 +0000 Subject: [PATCH 27/28] test: rename refErr -> refReportErr (codespell false positive on "refer") Codespell read the catch variable `refErr` as a misspelling of "refer". There's no .codespellrc ignore-list in the repo, so rename it (matching the already-clean routingErr style). No behavior change. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- tests/+did2/+unittest/+helpers/runCorpusDiscovery.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index 90ffa8e..a4ac2e2 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -72,8 +72,8 @@ for i = 1:numel(orphNames) fprintf(' %6d %s\n', orphCounts(i), orphNames{i}); end -catch refErr - fprintf('reference report skipped: %s\n', refErr.message); +catch refReportErr + fprintf('reference report skipped: %s\n', refReportErr.message); end fprintf('\n=== Corpus %s discovery summary (target %s) ===\n', ... From 643f1ef603ab865ccac6cfc9175d1c7790e39344 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 15:29:34 +0000 Subject: [PATCH 28/28] migrators_e: tighten ontology_table_row routing to kill substring mis-routes (#1) The routing inventory exposed heuristic false positives from substring matching: "average amplitude"->age_observation ("aver-AGE-"), C. 
elegans "encounter*"->litter_size ("en-COUNT-er"), "sampling rate"->heart_rate, "number of samples"->litter_size, "growth duration"/"latency"->age, "bacterial patch volume"->organ_volume. Two changes: - containsAny now matches at WORD BOUNDARIES (regexp \<...\>), so a keyword matches only as a whole word/phrase -- average !-> age, encounter !-> count. - the scalar/categorical keyword lists are tightened to SPECIFIC, high-confidence phrases (body weight, body length, heart rate, respiration rate, blood pressure, litter size, cell count, life cycle stage, health status, ...) and the over-broad singletons (rate, frequency, count, number, volume, duration, latency, score, status, stage, behavior) are dropped. respiration_rate and cell_count get their own branches. Anything not confidently a known property falls to the generic_* escape hatch -- correct for the lab-specific corpus terms. Existing testMigratorsE cases (weight, life cycle stage) still route as before. treatment.m is intentionally left stem-based (rear->rearing) and showed no mis-routes. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01ABEeDJ83r9Hq9PaiSije8o --- .../+migrators_e/ontology_table_row.m | 50 +++++++++++++------ 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m index 36494cc..57203f9 100644 --- a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -81,36 +81,48 @@ function [className, shapeClass, valueStruct] = dispatchScalar(hay, row, numVal) unit = getCharField(row, 'unit'); -if containsAny(hay, {'weight', 'mass'}) +% Conservative, high-confidence routing only: match SPECIFIC terms at word +% boundaries (containsAny is word-boundary, so "average" !-> age, +% "encounter" !-> count, "sampling rate" !-> heart rate). 
Anything not +% confidently a known property falls to the generic_scalar escape hatch -- +% the corpora are dominated by lab-specific terms that belong there. +if containsAny(hay, {'body weight', 'body mass', 'weight'}) className = 'body_weight_observation'; shapeClass = 'scalar_mass'; valueStruct = canonicalComposite('kilograms', unit, numVal); -elseif containsAny(hay, {'length', 'tibia', 'tail'}) +elseif containsAny(hay, {'body length', 'tibia', 'tail length', 'snout-vent', 'body size'}) className = 'body_length_observation'; shapeClass = 'scalar_length'; valueStruct = canonicalComposite('meters', unit, numVal); -elseif containsAny(hay, {'age', 'duration', 'latency'}) +elseif containsAny(hay, {'age'}) className = 'age_observation'; shapeClass = 'scalar_duration'; valueStruct = canonicalComposite('seconds', unit, numVal); elseif containsAny(hay, {'temperature'}) className = 'core_temperature_observation'; shapeClass = 'scalar_temperature'; valueStruct = canonicalComposite('celsius', unit, numVal); -elseif containsAny(hay, {'heart rate', 'respiration', 'rate', 'frequency'}) +elseif containsAny(hay, {'heart rate'}) className = 'heart_rate_observation'; shapeClass = 'scalar_frequency'; valueStruct = canonicalComposite('hertz', unit, numVal); -elseif containsAny(hay, {'pressure'}) +elseif containsAny(hay, {'respiration rate', 'respiratory rate', 'breathing rate'}) + className = 'respiration_rate_observation'; shapeClass = 'scalar_frequency'; + valueStruct = canonicalComposite('hertz', unit, numVal); +elseif containsAny(hay, {'blood pressure', 'arterial pressure'}) className = 'blood_pressure_observation'; shapeClass = 'scalar_pressure'; valueStruct = canonicalComposite('mmhg', unit, numVal); -elseif containsAny(hay, {'litter', 'count', 'number of'}) +elseif containsAny(hay, {'litter size'}) className = 'litter_size_observation'; shapeClass = 'scalar_count'; valueStruct = struct('value', round(numVal), ... 
'unit', struct('node', '', 'name', ''), 'approximate', false); -elseif containsAny(hay, {'score', 'condition'}) +elseif containsAny(hay, {'cell count'}) + className = 'cell_count_observation'; shapeClass = 'scalar_count'; + valueStruct = struct('value', round(numVal), ... + 'unit', struct('node', '', 'name', ''), 'approximate', false); +elseif containsAny(hay, {'body condition'}) className = 'body_condition_observation'; shapeClass = 'scalar_score'; valueStruct = struct('value', numVal, 'scale', struct('node', '', 'name', ''), ... 'scale_min', 0.0, 'scale_max', 0.0, 'approximate', false); -elseif containsAny(hay, {'concentration', 'glucose', 'cortisol', 'titer'}) +elseif containsAny(hay, {'concentration', 'glucose', 'cortisol', 'titer', 'titre'}) className = 'concentration_observation'; shapeClass = 'scalar_concentration'; valueStruct = struct('source_unit', unit, 'source_value', numVal, 'approximate', false); -elseif containsAny(hay, {'volume'}) +elseif containsAny(hay, {'organ volume'}) className = 'organ_volume_observation'; shapeClass = 'scalar_volume'; valueStruct = canonicalComposite('liters', unit, numVal); else @@ -125,15 +137,18 @@ termValue = getCharField(row, 'string_value'); end valueTerm = struct('node', termValue, 'name', ''); -if containsAny(hay, {'stage', 'life cycle', 'developmental'}) +% Specific phrases only (word-boundary); ambiguous singletons like +% "status"/"stage"/"behavior" caused false positives, so require the full +% property phrase and let everything else fall to the generic escape hatch. 
+if containsAny(hay, {'life cycle stage', 'developmental stage', 'life stage'}) className = 'developmental_stage_observation'; -elseif containsAny(hay, {'health', 'status'}) +elseif containsAny(hay, {'health status'}) className = 'health_status_observation'; -elseif containsAny(hay, {'coat', 'pigment'}) +elseif containsAny(hay, {'coat color', 'coat colour', 'pigmentation'}) className = 'pigmentation_observation'; elseif containsAny(hay, {'estrous', 'estrus'}) className = 'estrous_stage_observation'; -elseif containsAny(hay, {'behavior', 'phenotype'}) +elseif containsAny(hay, {'behavioral phenotype', 'behavioural phenotype'}) className = 'behavioral_phenotype_observation'; else className = 'generic_categorical_observation'; @@ -312,8 +327,15 @@ end function tf = containsAny(hay, needles) +% Word-boundary match: a needle matches only as a whole word/phrase, not as +% a substring inside another word. This prevents the heuristic false +% positives the routing inventory exposed -- e.g. "average" -> "age", +% "encounter" -> "count", "sampling rate" -> "rate". tf = false; for k = 1:numel(needles) - if contains(hay, needles{k}); tf = true; return; end + pat = ['\<', regexptranslate('escape', needles{k}), '\>']; + if ~isempty(regexp(hay, pat, 'once')) + tf = true; return; + end end end