diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml index 5e740f6..a0003aa 100644 --- a/.github/workflows/test-code.yml +++ b/.github/workflows/test-code.yml @@ -35,16 +35,33 @@ jobs: - name: Check out repository uses: actions/checkout@v4 - - name: Check out did-schema (sibling, for V_delta validation) + - name: Check out did-schema (sibling, for V_epsilon validation) uses: actions/checkout@v4 with: repository: Waltham-Data-Science/did-schema - ref: main + # Track the matching did-schema E branch: the Brainstorm-E classes + # (observation property classes, shape library, session_relative_ + # reference, subject_statement, etc.) live on this branch and are + # not yet on did-schema main. Revert to `main` once DID-schema#62 + # merges. Falls back to main off-PR (e.g. direct pushes to V2). + ref: ${{ github.head_ref || 'main' }} path: did-schema + - name: Assemble V_epsilon schema set (stable + draft) + # V_epsilon replaces V_delta as the migration/validation target. + # The schema cache loads a single flat dir of *.json, but the + # Brainstorm-E observation/manipulation classes live in + # V_epsilon/draft while base/element/etc. live in V_epsilon/stable + # (class names are disjoint across tiers), so combine both tiers + # into one directory for DID_SCHEMA_PATH. 
+ run: | + mkdir -p "${GITHUB_WORKSPACE}/epsilon-schemas" + cp did-schema/schemas/V_epsilon/stable/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" + cp did-schema/schemas/V_epsilon/draft/*.json "${GITHUB_WORKSPACE}/epsilon-schemas/" + - name: Export DID_SCHEMA_PATH run: | - echo "DID_SCHEMA_PATH=${GITHUB_WORKSPACE}/did-schema/schemas/V_delta/stable" \ + echo "DID_SCHEMA_PATH=${GITHUB_WORKSPACE}/epsilon-schemas" \ >> "$GITHUB_ENV" - name: Set up MATLAB diff --git a/src/did/+did/database.m b/src/did/+did/database.m index dbb0c4e..4aa8324 100644 --- a/src/did/+did/database.m +++ b/src/did/+did/database.m @@ -1211,6 +1211,19 @@ function validate_doc_vs_schema(database_obj, docProps, schemaStruct, all_ids) for i = 1 : numel(superFullNames) [~,superNames{i}] = fileparts(superFullNames{i}); % keep compatibility with Matlab 2019a end + % did2-form documents carry superclass names directly in + % `.class_name` (e.g. {class_name:'base'}) and have no + % `.definition` path. Fall back to those names so the legacy + % validator recognises did2-shaped documents during the + % v1->v2 transition (otherwise superNames is empty and every + % such doc fails the superclasses check spuriously). + if isempty(superNames) + try + superNames = {classProps.superclasses.class_name}; + catch + superNames = {}; + end + end if ~iscell(superNames), superNames = {superNames}; end superNames = unique(superNames); schemaFields = fieldnames(schemaStruct); @@ -1238,8 +1251,13 @@ function validate_doc_vs_schema(database_obj, docProps, schemaStruct, all_ids) assert(areSame,'DID:Database:ValidationSuperClasses', ... 'Dissimilar superclasses defined/found for %s ("%s" <=> "%s")', ... doc_name, expectedStr, superNamesStr); - % Recursively validate all superNames against this doc: - for idx = 1 : numel(superNames) + % Recursively validate all superclasses against this + % doc. 
Bound by superFullNames (the `.definition` + % paths): did2-form docs expose names but no paths, so + % superFullNames is empty and the deeper recursion is + % skipped -- the concrete class is still validated, and + % the superclasses-name check above already passed. + for idx = 1 : numel(superFullNames) % First get the superClass' definition struct defStruct = database_obj.get_document_schema(superFullNames{idx}); % Extract validation file from definition diff --git a/src/did/+did2/+convert/+migrators_e/ontology_table_row.m b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m new file mode 100644 index 0000000..57203f9 --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/ontology_table_row.m @@ -0,0 +1,341 @@ +function bodies = ontology_table_row(preBody) +%ONTOLOGY_TABLE_ROW Brainstorm-E split migrator: did_v1 ontology_table_row +% -> observation tiers (1 -> N). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Each row of the legacy open key/value table becomes its +% own observation document, dispatched by what the property is and what +% shape its value takes, per +% did-schema/schemas/V_epsilon/conversions/from_did_v1/ontology_table_row.md: +% +% numeric row -> a scalar property class (body_weight_observation, +% core_temperature_observation, ...) value as the +% matching typed composite; unrecognised -> the +% generic_scalar_observation escape hatch. +% term row -> a categorical property class (developmental_stage_ +% observation, ...) value as a bound ontology_term; +% unrecognised -> generic_categorical_observation. +% +% Returns a CELL of body structs (one per row); the dispatcher lands +% each as its own migrated document. Branch resolution is a keyword +% HEURISTIC seed; the per-term table is finalised in discovery mode. 
+%
+%   Subject-intrinsic (species/strain/sex) and relational (cohort/housing)
+%   rows are out of scope for the observation tier; in this seed they fall
+%   to the generic categorical escape hatch rather than being silently
+%   dropped. Refining that routing is a follow-up.
+
+arguments
+    preBody (1,1) struct
+end
+
+if ~isfield(preBody, 'ontology_table_row') || ~isstruct(preBody.ontology_table_row)
+    error('did2:convert:missingBlock', ...
+        'ontology_table_row body is missing the ontology_table_row property block.');
+end
+rows = extractRows(preBody.ontology_table_row);
+if isempty(rows)
+    % No usable rows: the layout is unrecognised, an explicit `rows`
+    % array is empty, or a column-parallel table (names / variable_names
+    % / ontology_nodes / data) had every column skipped for lack of a
+    % value. Rather than quarantine, migrate the document unchanged as
+    % an ontology_table_row -- per the original note, the class still
+    % exists in V_epsilon (see ontology_table_row.md).
+    bodies = {preBody};
+    return;
+end
+
+% One session-relative anchor is shared by every observation from this
+% table (they are all in the same session). 'during' is the honest
+% fallback when the row carries no epoch and no UTC date.
+anchor = makeSessionAnchor(preBody, 'during');
+bodies = cell(1, numel(rows) + 1);
+for k = 1:numel(rows)
+    b = migrateRow(preBody, rows{k});
+    b.depends_on(end+1) = struct('name', 'time_reference_1', ...
+ 'value', anchor.base.id); + bodies{k} = b; +end +bodies{end} = anchor; +end + +% ===================== per-row migration =============================== + +function body = migrateRow(preBody, row) +node = getCharField(row, 'ontology_name'); +label = getCharField(row, 'name'); +identity = struct('node', node, 'name', label); +hay = lower([node ' ' label]); + +[isNumeric, numVal] = rowNumericValue(row); + +if isNumeric + [className, shapeClass, valueStruct] = dispatchScalar(hay, row, numVal); + body = makeScalarObservation(preBody, className, shapeClass, ... + identity, valueStruct); +else + [className, valueTerm] = dispatchCategorical(hay, row); + body = makeCategoricalObservation(preBody, className, identity, valueTerm); +end +end + +function [className, shapeClass, valueStruct] = dispatchScalar(hay, row, numVal) +unit = getCharField(row, 'unit'); +% Conservative, high-confidence routing only: match SPECIFIC terms at word +% boundaries (containsAny is word-boundary, so "average" !-> age, +% "encounter" !-> count, "sampling rate" !-> heart rate). Anything not +% confidently a known property falls to the generic_scalar escape hatch -- +% the corpora are dominated by lab-specific terms that belong there. 
+if containsAny(hay, {'body weight', 'body mass', 'weight'}) + className = 'body_weight_observation'; shapeClass = 'scalar_mass'; + valueStruct = canonicalComposite('kilograms', unit, numVal); +elseif containsAny(hay, {'body length', 'tibia', 'tail length', 'snout-vent', 'body size'}) + className = 'body_length_observation'; shapeClass = 'scalar_length'; + valueStruct = canonicalComposite('meters', unit, numVal); +elseif containsAny(hay, {'age'}) + className = 'age_observation'; shapeClass = 'scalar_duration'; + valueStruct = canonicalComposite('seconds', unit, numVal); +elseif containsAny(hay, {'temperature'}) + className = 'core_temperature_observation'; shapeClass = 'scalar_temperature'; + valueStruct = canonicalComposite('celsius', unit, numVal); +elseif containsAny(hay, {'heart rate'}) + className = 'heart_rate_observation'; shapeClass = 'scalar_frequency'; + valueStruct = canonicalComposite('hertz', unit, numVal); +elseif containsAny(hay, {'respiration rate', 'respiratory rate', 'breathing rate'}) + className = 'respiration_rate_observation'; shapeClass = 'scalar_frequency'; + valueStruct = canonicalComposite('hertz', unit, numVal); +elseif containsAny(hay, {'blood pressure', 'arterial pressure'}) + className = 'blood_pressure_observation'; shapeClass = 'scalar_pressure'; + valueStruct = canonicalComposite('mmhg', unit, numVal); +elseif containsAny(hay, {'litter size'}) + className = 'litter_size_observation'; shapeClass = 'scalar_count'; + valueStruct = struct('value', round(numVal), ... + 'unit', struct('node', '', 'name', ''), 'approximate', false); +elseif containsAny(hay, {'cell count'}) + className = 'cell_count_observation'; shapeClass = 'scalar_count'; + valueStruct = struct('value', round(numVal), ... 
+ 'unit', struct('node', '', 'name', ''), 'approximate', false); +elseif containsAny(hay, {'body condition'}) + className = 'body_condition_observation'; shapeClass = 'scalar_score'; + valueStruct = struct('value', numVal, 'scale', struct('node', '', 'name', ''), ... + 'scale_min', 0.0, 'scale_max', 0.0, 'approximate', false); +elseif containsAny(hay, {'concentration', 'glucose', 'cortisol', 'titer', 'titre'}) + className = 'concentration_observation'; shapeClass = 'scalar_concentration'; + valueStruct = struct('source_unit', unit, 'source_value', numVal, 'approximate', false); +elseif containsAny(hay, {'organ volume'}) + className = 'organ_volume_observation'; shapeClass = 'scalar_volume'; + valueStruct = canonicalComposite('liters', unit, numVal); +else + className = 'generic_scalar_observation'; shapeClass = 'generic_scalar'; + valueStruct = struct('source_unit', unit, 'source_value', numVal, 'approximate', false); +end +end + +function [className, valueTerm] = dispatchCategorical(hay, row) +termValue = getCharField(row, 'value'); +if isempty(termValue) + termValue = getCharField(row, 'string_value'); +end +valueTerm = struct('node', termValue, 'name', ''); +% Specific phrases only (word-boundary); ambiguous singletons like +% "status"/"stage"/"behavior" caused false positives, so require the full +% property phrase and let everything else fall to the generic escape hatch. 
+if containsAny(hay, {'life cycle stage', 'developmental stage', 'life stage'}) + className = 'developmental_stage_observation'; +elseif containsAny(hay, {'health status'}) + className = 'health_status_observation'; +elseif containsAny(hay, {'coat color', 'coat colour', 'pigmentation'}) + className = 'pigmentation_observation'; +elseif containsAny(hay, {'estrous', 'estrus'}) + className = 'estrous_stage_observation'; +elseif containsAny(hay, {'behavioral phenotype', 'behavioural phenotype'}) + className = 'behavioral_phenotype_observation'; +else + className = 'generic_categorical_observation'; +end +end + +% ===================== destination builders ============================ + +function body = makeScalarObservation(preBody, className, shapeClass, identity, valueStruct) +body = startObservation(preBody, className, {'scalar_observation', shapeClass}); +body.observation = struct('measured_property', identity, ... + 'target_structure', {struct('node', {}, 'name', {})}); +body.(shapeClass) = struct('value', valueStruct); +end + +function body = makeCategoricalObservation(preBody, className, identity, valueTerm) +body = startObservation(preBody, className, ... + {'categorical_observation', 'categorical_concept'}); +body.observation = struct('measured_property', identity, ... + 'target_structure', {struct('node', {}, 'name', {})}); +% categorical_concept declares `value` with placement: concrete_class, so +% the bound term lives in the concrete observation class's OWN block and +% categorical_concept contributes no block. One value, one block, uniform +% across every categorical observation (no per-class branching). 
+body.(className) = struct('value', valueTerm); +end + +% ===================== shared helpers ================================== + +function body = startObservation(preBody, className, extraSupers) +chain = [{'observation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct('class_name', className, 'class_version', '1.0.0', ... + 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubject(preBody); +if isfield(preBody, 'base') && isstruct(preBody.base) + base = preBody.base; + base.id = did.ido.unique_id(); % each row becomes its own document + body.base = base; +end +end + +function deps = carrySubject(preBody) +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + if isfield(d, 'value'); subjectVal = d.value; + elseif isfield(d, 'document_id'); subjectVal = d.document_id; end + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +end + +function anchor = makeSessionAnchor(preBody, relation) +%MAKESESSIONANCHOR Session_relative_reference document (ordinal, no metric) +% shared by all observations from this table; anchored to the source's +% session via base.session_id. +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... 
+ 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function comp = canonicalComposite(canonField, unit, numVal) +comp = struct(canonField, double(numVal), 'source_unit', unit, ... + 'source_value', double(numVal), 'approximate', false); +end + +function rows = extractRows(block) +%EXTRACTROWS Normalise an ontology_table_row body to a cell of column structs +% (each {ontology_name, name, value}), one per measured property. +% +% The real v1 layout (per the schema) is column-parallel: comma-separated +% `names` / `variable_names` / `ontology_nodes` plus a `data` struct keyed +% by the variable_names. One document is one table ROW; each COLUMN is a +% property measurement and becomes one observation. (Also accepts the +% synthetic `rows`-array and single-row shapes used by tests.) +rows = {}; +if isfield(block, 'rows') + r = block.rows; + if iscell(r) + rows = r(:)'; + elseif isstruct(r) + rows = arrayfun(@(x) x, r(:)', 'UniformOutput', false); + end + return; +end +if isfield(block, 'variable_names') + vars = splitCSV(getCharField(block, 'variable_names')); + names = splitCSV(getCharField(block, 'names')); + nodes = splitCSV(getCharField(block, 'ontology_nodes')); + data = struct(); + if isfield(block, 'data') && isstruct(block.data) + data = block.data; + end + for i = 1:numel(vars) + key = vars{i}; + label = ''; node = ''; + if i <= numel(names); label = names{i}; end + if i <= numel(nodes); node = nodes{i}; end + val = []; + if ~isempty(key) && isfield(data, key) + val = data.(key); + end + % Skip columns with no usable value (missing key, [], '', NaN). 
+        if isempty(val) || (isnumeric(val) && isscalar(val) && isnan(val))
+            continue;
+        end
+        rows{end+1} = struct('ontology_name', node, 'name', label, 'value', val); %#ok
+    end
+    return;
+end
+if isfield(block, 'ontology_name') || isfield(block, 'name')
+    rows = {block}; % single-row legacy shape (the block IS one row)
+end
+end
+
+function parts = splitCSV(s)
+parts = {};
+if isempty(s)
+    return;
+end
+raw = strsplit(char(s), ',');
+parts = cellfun(@strtrim, raw, 'UniformOutput', false);
+end
+
+function [isNumeric, numVal] = rowNumericValue(row)
+isNumeric = false; numVal = [];  % not numeric unless a finite number is found below
+if isfield(row, 'value')
+    v = row.value;
+    if isnumeric(v) && isscalar(v) && isfinite(v)
+        isNumeric = true; numVal = double(v);
+    end
+elseif isfield(row, 'numeric_value')
+    v = row.numeric_value;
+    if isnumeric(v) && ~isempty(v) && isfinite(v(1))  % reject NaN/Inf, same as the 'value' branch
+        isNumeric = true; numVal = double(v(1));
+    end
+end
+end
+
+function s = getCharField(block, name)
+s = '';
+if isfield(block, name)
+    v = block.(name);
+    if ischar(v)
+        s = v;
+    elseif isstring(v) && isscalar(v)
+        s = char(v);
+    elseif isnumeric(v) && isscalar(v)
+        s = num2str(v);
+    end
+end
+end
+
+function tf = containsAny(hay, needles)
+% Word-boundary match: a needle matches only as a whole word/phrase, not as
+% a substring inside another word. This prevents the heuristic false
+% positives the routing inventory exposed -- e.g. "average" -> "age",
+% "encounter" -> "count", "sampling rate" -> "rate".
+tf = false;
+for k = 1:numel(needles)
+    pat = ['\<', regexptranslate('escape', needles{k}), '\>'];
+    if ~isempty(regexp(hay, pat, 'once'))
+        tf = true; return;
+    end
+end
+end
diff --git a/src/did/+did2/+convert/+migrators_e/stimulus_bath.m b/src/did/+did2/+convert/+migrators_e/stimulus_bath.m
new file mode 100644
index 0000000..7812794
--- /dev/null
+++ b/src/did/+did2/+convert/+migrators_e/stimulus_bath.m
@@ -0,0 +1,28 @@
+function v2Body = stimulus_bath(~)
+%STIMULUS_BATH Deferred: stimulus_bath migrates to a `bath` in the NDI layer.
+% +% The legacy stimulus_bath is really a bath (pharmacological_manipulation): +% its mixture/location live in the document, but the resulting bath needs two +% things that can only be obtained by following stimulus_element_id to the +% stimulator ELEMENT and its session/epoch graph -- +% +% - subject_id : the stimulator element's subject, and +% - time_reference : an epoch_bounded_reference on the stimulator's epoch +% (the stimulator is the time referent; no other +% connection to it is kept). +% +% A manipulation must be emitted complete (all required dependencies +% together), so the whole bath is assembled in ndi.migrate.local, which has +% the element and epoch in hand. The per-document converter cannot complete +% it, so it defers here with a clear, queryable reason rather than emitting a +% partial (or a wrong-block fallback that reads as "mixture missing"). +% +% See ndi.migrate.internal.stimulusBathToBath (NDI-matlab) for the build. + +v2Body = struct(); %#ok % required output; this migrator always defers +error('did2:convert:needsSessionContext', ... + ['stimulus_bath -> bath is migrated in the NDI layer ', ... + '(ndi.migrate.local): the bath''s subject (from the stimulator ', ... + 'element) and its epoch_bounded_reference time anchor (the ', ... + 'stimulator''s epoch) require the session/element graph. Deferred.']); +end diff --git a/src/did/+did2/+convert/+migrators_e/subject_group.m b/src/did/+did2/+convert/+migrators_e/subject_group.m new file mode 100644 index 0000000..15615fe --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/subject_group.m @@ -0,0 +1,65 @@ +function v2Body = subject_group(preBody) +%SUBJECT_GROUP Brainstorm-E migrator: did_v1 subject_group -> subject (is_group). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. 
Per did-schema V_epsilon_SPEC.md, subject_group is +% deprecated and folds into the subject tier: +% +% subject_group -> subject (is_group: true) +% +% The legacy subject_group document is an (essentially empty) marker -- +% membership is expressed by member subjects referencing the group, not +% by fields on the group doc itself. So the per-document migration is +% 1 -> 1: the group becomes a `subject` flagged is_group. The membership +% edges become `group_assignment` events, but those are RELATIONAL (they +% need the member subjects that point at this group) and are assembled in +% the NDI layer, exactly like stimulus_bath -> bath; they are not +% manufactured here from a doc that carries no members. +% +% Optional legacy group_name / description (newer subject_group docs may +% carry them; v1 corpus docs do not) map onto the subject block's +% local_identifier / description. + +arguments + preBody (1,1) struct +end + +groupName = ''; +desc = ''; +if isfield(preBody, 'subject_group') && isstruct(preBody.subject_group) + sg = preBody.subject_group; + groupName = getCharField(sg, 'group_name'); + desc = getCharField(sg, 'description'); +end + +v2Body = struct(); +v2Body.document_class = struct( ... + 'class_name', 'subject', 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'base', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +% v1 subject_group carried no depends_on; keep that (membership lives on +% the member subjects, resolved into group_assignment in the NDI layer). +v2Body.depends_on = struct('name', {}, 'value', {}); +if isfield(preBody, 'base') && isstruct(preBody.base) + v2Body.base = preBody.base; +end +v2Body.subject = struct( ... + 'local_identifier', groupName, ... + 'description', desc, ... + 'is_biological', false, ... 
+    'is_group', true);
+end
+
+% ===================== helpers =============================================
+
+function s = getCharField(block, name)
+s = '';
+if isfield(block, name)
+    v = block.(name);
+    if ischar(v)
+        s = v;
+    elseif isstring(v) && isscalar(v)
+        s = char(v);
+    end
+end
+end
diff --git a/src/did/+did2/+convert/+migrators_e/treatment.m b/src/did/+did2/+convert/+migrators_e/treatment.m
new file mode 100644
index 0000000..ee964a6
--- /dev/null
+++ b/src/did/+did2/+convert/+migrators_e/treatment.m
@@ -0,0 +1,264 @@
+function v2Body = treatment(preBody)
+%TREATMENT Brainstorm-E split migrator: did_v1 treatment -> manipulation tiers.
+%
+%   Routed from did2.convert.v1_to_v2 only when TargetVersion ==
+%   'V_epsilon'. Reads the treatment block's ontology identity + numeric
+%   / string values and dispatches the row to the manipulation family
+%   whose ACTION it names, per
+%   did-schema/schemas/V_epsilon/conversions/from_did_v1/treatment.md:
+%
+%     injection                  (substance delivered by injection)
+%     bath                       (substance applied as a bath)
+%     procedural_manipulation    (physical operation on the body)
+%     temperature_manipulation   (imposed heat / cold)
+%     environmental_manipulation (changed condition / regime, no value)
+%
+%   Each treatment maps to exactly one manipulation; the function returns
+%   a 1x2 cell {manipulation, session anchor}. Rows that are not
+%   manipulations (date of birth, experiment time) or whose branch cannot
+%   be resolved raise an error; per the conversion spec the dispatcher
+%   is expected to quarantine the source body with that reason (the
+%   "curator review queue"), so nothing is forced into a residual family.
+%
+%   Branch resolution here is a keyword/CURIE-prefix HEURISTIC seed; the
+%   authoritative per-term branch list is finalised in discovery mode
+%   against real corpora (treatment.md, Open questions).
+
+arguments
+    preBody (1,1) struct
+end
+
+if ~isfield(preBody, 'treatment') || ~isstruct(preBody.treatment)
+    error('did2:convert:missingBlock', ...
+        'treatment body is missing the treatment property block.');
+end
+block = preBody.treatment;
+
+node = getCharField(block, 'ontology_name');
+label = getCharField(block, 'name');
+strValue = getCharField(block, 'string_value');
+numValue = [];
+if isfield(block, 'numeric_value')
+    numValue = block.numeric_value;
+end
+
+identity = struct('node', node, 'name', label);
+hay = lower([node ' ' label]);   % search text for the heuristic branch
+
+% --- Dab edge case: string_value is an ontology target, not prose ------
+targetStructure = struct('node', {}, 'name', {});   % empty ontology_term array
+notesText = strValue;
+if endsWith(lower(strtrim(label)), 'target location') || looksLikeCURIE(strValue)
+    targetStructure = struct('node', strValue, 'name', '');
+    notesText = '';
+    identity.name = strtrim(regexprep(label, '(?i)\s*target location\s*$', ''));  % \s*$: guard above trims, so tolerate trailing blanks
+end
+
+% --- not-a-manipulation rows: route OUT of tier (quarantine w/ reason) --
+if containsAny(hay, {'date of birth', 'non-survival experiment time', ...
+        'experiment time'})
+    error('did2:convert:notAManipulation', ...
+        ['treatment "%s" is not a manipulation; route out of tier ', ...
+         '(observation/session metadata) per treatment.md.'], label);
+end
+
+% --- branch dispatch (first match wins) --------------------------------
+if containsAny(hay, {'cool', 'cold', 'heat', 'warm', 'thermal', 'temperature'})
+    v2Body = makeTemperatureManipulation(preBody, identity, targetStructure, ...
+        notesText, numValue);
+elseif containsAny(hay, {'inject', 'virus', 'aav', 'tracer', 'drug', 'vehicle'}) ...
+        || startsWith(lower(node), 'chebi:')
+    v2Body = makeInjection(preBody, identity, targetStructure, notesText);
+elseif containsAny(hay, {'bath'})
+    v2Body = makeBath(preBody, identity, notesText);
+elseif containsAny(hay, {'craniotomy', 'implant', 'lesion', 'perfus', ...
+        'eye opening', 'eyelid', 'ear notch', 'ear punch', 'tail clip', ...
+        'toe clip', 'whisker', 'suture', 'surgery', 'transection', ...
+        'resection', 'enucleation', 'dissection', 'procedure', 'optogenetic'})
+    v2Body = makeProceduralManipulation(preBody, identity, targetStructure, notesText);
+elseif containsAny(hay, {'rear', 'deprivation', 'isolation', 'enrichment', ...
+        'housing', 'light', 'dark', 'restriction', 'diet', 'training', ...
+        'habituation', 'restraint'})
+    v2Body = makeEnvironmentalManipulation(preBody, identity, targetStructure, notesText);
+else
+    error('did2:convert:unresolvedTreatment', ...
+        ['treatment "%s" (%s) could not be routed to a manipulation ', ...
+         'family; curator review required.'], label, node);
+end
+
+% Attach a session-relative anchor. v1 treatment rows have no DAQ epoch and
+% (often) no UTC date, so the honest fallback is an ordinal claim against the
+% session. 'during' is correct for any migrated interaction (it happened
+% within the session); 'at_end_of' is reserved for interactions known to be
+% terminal and is not asserted blanket here. Emitting the time_reference as
+% its own document makes this a 1 -> 2 migration.
+anchor = makeSessionAnchor(preBody, 'during');
+v2Body.depends_on(end+1) = struct('name', 'time_reference_1', ...
+    'value', anchor.base.id);
+v2Body = {v2Body, anchor};
+end
+
+% ===================== destination builders ============================
+
+function body = makeTemperatureManipulation(preBody, identity, targetStructure, notesText, numValue)
+body = startBody(preBody, 'temperature_manipulation', ...
+    {'scalar_manipulation', 'scalar_temperature'});
+body.scalar_manipulation = struct( ...
+    'applied_property', identity, ...
+    'target_structure', {targetStructure}, ...
+    'notes', notesText);
+body.scalar_temperature = struct('value', temperatureComposite(numValue));
+end
+
+function body = makeInjection(preBody, identity, targetStructure, ~)
+body = startBody(preBody, 'injection', {'pharmacological_manipulation'});
+% mixture records are {chemical: ontology_term, amount: concentration}.
+body.pharmacological_manipulation = struct('mixture', ...
+    struct('chemical', identity, 'amount', emptyConcentration()));
+% injection has no `notes`; volume + route are emitted blank for curator
+% fill-in. NOTE(review): kind is always 'drug', even for virus/tracer rows.
+body.injection = struct( ...
+    'kind', 'drug', ...
+    'volume', blankVolume(), ...
+    'route', struct('node', '', 'name', ''), ...
+    'target_structure', {targetStructure});
+end
+
+function body = makeBath(preBody, identity, ~)
+body = startBody(preBody, 'bath', {'pharmacological_manipulation'});
+body.pharmacological_manipulation = struct('mixture', ...
+    struct('chemical', identity, 'amount', emptyConcentration()));
+% bath gets only kind/location here; the notes argument is ignored.
+body.bath = struct('kind', 'drug', 'location', struct('node', '', 'name', ''));
+end
+
+function body = makeProceduralManipulation(preBody, identity, targetStructure, notesText)
+body = startBody(preBody, 'procedural_manipulation', {});
+body.procedural_manipulation = struct( ...
+    'procedure', identity, ...
+    'target_structure', {targetStructure}, ...
+    'notes', notesText);
+end
+
+function body = makeEnvironmentalManipulation(preBody, identity, targetStructure, notesText)
+body = startBody(preBody, 'environmental_manipulation', {});
+body.environmental_manipulation = struct( ...
+    'factor', identity, ...
+    'target_structure', {targetStructure}, ...
+    'notes', notesText);
+end
+
+% ===================== shared helpers ==================================
+
+function body = startBody(preBody, className, extraSupers)
+%STARTBODY Seed a V_epsilon manipulation body: document_class header,
+%   the carried base, and a subject_id dependency (no time_reference yet).
+chain = [{'manipulation'}, extraSupers];
+supers = struct('class_name', {}, 'class_version', {});
+for k = 1:numel(chain)
+    supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok
+end
+body = struct();
+body.document_class = struct( ...
+ 'class_name', className, 'class_version', '1.0.0', ... + 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubject(preBody); +if isfield(preBody, 'base') + body.base = preBody.base; +end +end + +function deps = carrySubject(preBody) +%CARRYSUBJECT Carry the subject_id dependency forward (time_reference is +% attached separately, pointing at the migrated session anchor). +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + subjectVal = depValue(d); + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +end + +function anchor = makeSessionAnchor(preBody, relation) +%MAKESESSIONANCHOR Build a session_relative_reference document (ordinal, +% no metric) anchored to the source document's session. Returned as a +% sibling body so the interaction can depend_on it as its time_reference. +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... 
+    'name', 'migrated_session_anchor', 'datestamp', ds);
+anchor.time_reference = struct('is_approximate', true);
+anchor.session_relative_reference = struct('relation', relation);
+end
+
+function v = depValue(d)
+v = '';
+if isfield(d, 'value')
+    v = d.value;
+elseif isfield(d, 'document_id')
+    v = d.document_id;
+end
+end
+
+function comp = temperatureComposite(numValue)
+comp = struct('celsius', 0.0, 'source_unit', '', 'source_value', 0.0, ...
+    'approximate', false);   % blank composite: returned for empty / non-numeric / NaN / Inf input
+if ~isempty(numValue) && isnumeric(numValue) && isfinite(numValue(1))  % NaN/Inf would not survive JSON encoding as a number
+    v = double(numValue(1));
+    comp.celsius = v;
+    comp.source_unit = 'celsius';
+    comp.source_value = v;
+end
+end
+
+function c = emptyConcentration()
+c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false);
+end
+
+function v = blankVolume()
+v = struct('liters', 0.0, 'source_unit', '', 'source_value', 0.0, ...
+    'approximate', false);
+end
+
+function s = getCharField(block, name)
+s = '';
+if isfield(block, name)
+    v = block.(name);
+    if ischar(v)
+        s = v;
+    elseif isstring(v) && isscalar(v)
+        s = char(v);
+    end
+end
+end
+
+function tf = looksLikeCURIE(s)
+tf = ~isempty(s) && ~isempty(regexp(char(s), '^[A-Za-z][A-Za-z0-9_]*:[^\s:]+$', 'once'));
+end
+
+function tf = containsAny(hay, needles)
+tf = false;   % word-START match: 'heat' hits "heating" but not "wheat" / "sheath"
+for k = 1:numel(needles)
+    if ~isempty(regexp(hay, ['\<', regexptranslate('escape', needles{k})], 'once'))
+        tf = true;   % trade-off: mid-word hits (e.g. "microinjection") no longer match
+        return;
+    end
+end
+end
diff --git a/src/did/+did2/+convert/+migrators_e/treatment_drug.m b/src/did/+did2/+convert/+migrators_e/treatment_drug.m
new file mode 100644
index 0000000..0c1456d
--- /dev/null
+++ b/src/did/+did2/+convert/+migrators_e/treatment_drug.m
@@ -0,0 +1,173 @@
+function v2Body = treatment_drug(preBody)
+%TREATMENT_DRUG Brainstorm-E migrator: did_v1 treatment_drug -> injection (drug).
+%
+%   Routed from did2.convert.v1_to_v2 only when TargetVersion ==
+%   'V_epsilon'.
Per V_epsilon_SPEC.md, treatment_drug is deprecated and +% folds into injection (kind: "drug"): the administered substance becomes +% the pharmacological_manipulation.mixture, the body location becomes the +% injection target_structure. 1 -> 2: the injection plus the shared +% session_relative_reference anchor every migrated interaction needs +% (subject_interaction requires a time_reference). +% +% Branch/field resolution here is a HEURISTIC seed (the legacy +% mixture_table format varies); the authoritative mapping is finalised in +% discovery mode against real corpora. + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'treatment_drug') || ~isstruct(preBody.treatment_drug) + error('did2:convert:missingBlock', ... + 'treatment_drug body is missing the treatment_drug property block.'); +end +block = preBody.treatment_drug; + +targetStructure = ontologyArray( ... + getCharField(block, 'location_ontologyNode'), ... + getCharField(block, 'location_name')); +mixture = parseMixtureTable(block); + +inj = startManipulation(preBody, 'injection', {'pharmacological_manipulation'}); +inj.pharmacological_manipulation = struct('mixture', mixture); +inj.injection = struct( ... + 'kind', 'drug', ... + 'volume', blankVolume(), ... + 'route', ontologyTerm('', ''), ... + 'target_structure', {targetStructure}); + +anchor = makeSessionAnchor(preBody, 'during'); +inj.depends_on(end+1) = struct('name', 'time_reference_1', ... + 'value', anchor.base.id); +v2Body = {inj, anchor}; +end + +% ===================== shared helpers ================================== + +function body = startManipulation(preBody, className, extraSupers) +chain = [{'manipulation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct( ... + 'class_name', className, 'class_version', '1.0.0', ... 
+ 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubject(preBody); +if isfield(preBody, 'base') + body.base = preBody.base; +end +end + +function deps = carrySubject(preBody) +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + subjectVal = depValue(d); + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +end + +function anchor = makeSessionAnchor(preBody, relation) +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function mixture = parseMixtureTable(block) +%PARSEMIXTURETABLE Best-effort parse of the legacy CSV mixture_table into the +% {chemical, amount} records pharmacological_manipulation.mixture wants. +% mustBeNonEmpty: always return >= 1 record (a blank one if nothing +% parses), so the document validates; the blank is the curator's signal. 
+mixture = struct('chemical', {}, 'amount', {}); +raw = ''; +if isfield(block, 'mixture_table') + v = block.mixture_table; + if ischar(v); raw = v; elseif isstring(v) && isscalar(v); raw = char(v); end +end +if ~isempty(raw) + lines = strsplit(raw, newline); + for i = 1:numel(lines) + cols = strsplit(strtrim(lines{i}), ','); + if numel(cols) < 2 || isempty(strtrim(cols{1})) + continue; + end + chemical = ontologyTerm(strtrim(cols{1}), strtrim(cols{2})); + amount = blankConcentration(); + if numel(cols) >= 3 && ~isempty(strtrim(cols{3})) + amount.source_value = str2double(strtrim(cols{3})); + end + if numel(cols) >= 4 + amount.source_unit = strtrim(cols{4}); + end + mixture(end+1) = struct('chemical', chemical, 'amount', amount); %#ok + end +end +if isempty(mixture) + mixture(1) = struct('chemical', ontologyTerm('', ''), ... + 'amount', blankConcentration()); +end +end + +function arr = ontologyArray(node, name) +if isempty(node) && isempty(name) + arr = struct('node', {}, 'name', {}); % empty ontology_term array +else + arr = ontologyTerm(node, name); +end +end + +function t = ontologyTerm(node, name) +t = struct('node', char(node), 'name', char(name)); +end + +function v = blankVolume() +v = struct('liters', 0.0, 'source_unit', '', 'source_value', 0.0, ... 
+ 'approximate', false); +end + +function c = blankConcentration() +c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end + +function v = depValue(d) +v = ''; +if isfield(d, 'value') + v = d.value; +elseif isfield(d, 'document_id') + v = d.document_id; +end +end diff --git a/src/did/+did2/+convert/+migrators_e/treatment_transfer.m b/src/did/+did2/+convert/+migrators_e/treatment_transfer.m new file mode 100644 index 0000000..ef7b62c --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/treatment_transfer.m @@ -0,0 +1,120 @@ +function v2Body = treatment_transfer(preBody) +%TREATMENT_TRANSFER Brainstorm-E migrator: did_v1 treatment_transfer -> +% biological_transfer. +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Per V_epsilon_SPEC.md, treatment_transfer is deprecated and +% folds into biological_transfer (a procedural_manipulation): the +% transferred entity becomes biological_transfer.entity, the transfer +% method becomes procedural_manipulation.procedure (and biological_ +% transfer.kind), the legacy recipient_id becomes the subject_id, and the +% donor_id is carried as biological_transfer's donor dependency. 1 -> 2: +% the transfer plus the shared session_relative_reference anchor +% (subject_interaction needs a time_reference). +% +% The legacy timestamp/clocktype carry real timing that could anchor a +% UTC/event reference; the honest fallback for now is the ordinal session +% anchor (refined in the temporal-anchoring follow-up). + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'treatment_transfer') || ~isstruct(preBody.treatment_transfer) + error('did2:convert:missingBlock', ... 
+ 'treatment_transfer body is missing the treatment_transfer property block.'); +end +block = preBody.treatment_transfer; + +recipientId = namedDep(preBody, 'recipient_id'); +donorId = namedDep(preBody, 'donor_id'); + +entity = ontologyTerm(getCharField(block, 'entity_ontologyNode'), ... + getCharField(block, 'entity_name')); +procedure = ontologyTerm(getCharField(block, 'method_ontologyNode'), ... + getCharField(block, 'method_name')); +kind = getCharField(block, 'method_name'); +if isempty(kind) + kind = 'transfer'; % biological_transfer.kind is char, mustBeNonEmpty +end + +body = struct(); +body.document_class = struct( ... + 'class_name', 'biological_transfer', 'class_version', '1.0.0', ... + 'superclasses', [ ... + struct('class_name', 'manipulation', 'class_version', '1.0.0'), ... + struct('class_name', 'procedural_manipulation', 'class_version', '1.0.0')], ... + 'schema_version', 'V_epsilon'); +% subject_id (the recipient) + donor_id; time_reference is appended below. +body.depends_on = [ ... + struct('name', 'subject_id', 'value', recipientId), ... + struct('name', 'donor_id', 'value', donorId)]; +if isfield(preBody, 'base') + body.base = preBody.base; +end +body.procedural_manipulation = struct( ... + 'procedure', procedure, ... + 'target_structure', {struct('node', {}, 'name', {})}, ... + 'notes', ''); +body.biological_transfer = struct('entity', entity, 'kind', kind); + +anchor = makeSessionAnchor(preBody, 'during'); +body.depends_on(end+1) = struct('name', 'time_reference_1', ... 
+ 'value', anchor.base.id); +v2Body = {body, anchor}; +end + +% ===================== shared helpers ================================== + +function anchor = makeSessionAnchor(preBody, relation) +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function val = namedDep(preBody, name) +val = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, name) + if isfield(d, 'value') + val = d.value; + elseif isfield(d, 'document_id') + val = d.document_id; + end + end + end +end +end + +function t = ontologyTerm(node, name) +t = struct('node', char(node), 'name', char(name)); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end diff --git a/src/did/+did2/+convert/+migrators_e/virus_injection.m b/src/did/+did2/+convert/+migrators_e/virus_injection.m new file mode 100644 index 0000000..b232b27 --- /dev/null +++ b/src/did/+did2/+convert/+migrators_e/virus_injection.m @@ -0,0 +1,169 @@ 
+function v2Body = virus_injection(preBody) +%VIRUS_INJECTION Brainstorm-E migrator: did_v1 virus_injection -> injection (virus). +% +% Routed from did2.convert.v1_to_v2 only when TargetVersion == +% 'V_epsilon'. Per V_epsilon_SPEC.md, virus_injection is deprecated and +% folds into injection (kind: "virus"): the virus identity becomes the +% first pharmacological_manipulation.mixture chemical (serotype carried in +% the ontology term), the dilution becomes its concentration amount, the +% diluent (if named) a second mixture record, and the injection site +% becomes the target_structure. 1 -> 2: the injection plus the shared +% session_relative_reference anchor (subject_interaction needs a +% time_reference). +% +% The administration date / PND carry timing that could anchor a UTC or +% developmental reference; for now the honest fallback is the ordinal +% session anchor (refined in the temporal-anchoring follow-up). + +arguments + preBody (1,1) struct +end + +if ~isfield(preBody, 'virus_injection') || ~isstruct(preBody.virus_injection) + error('did2:convert:missingBlock', ... + 'virus_injection body is missing the virus_injection property block.'); +end +block = preBody.virus_injection; + +% virus chemical + dilution amount +amount = blankConcentration(); +dilution = numField(block, 'dilution'); +if ~isempty(dilution) + amount.source_value = dilution; + amount.source_unit = 'dilution'; +end +mixture = struct( ... + 'chemical', ontologyTerm(getCharField(block, 'virus_OntologyName'), ... + getCharField(block, 'virus_name')), ... + 'amount', amount); +% optional diluent as a second record +diluentNode = getCharField(block, 'diluent_OntologyName'); +diluentName = getCharField(block, 'diluent_name'); +if ~isempty(diluentNode) || ~isempty(diluentName) + mixture(end+1) = struct( ... + 'chemical', ontologyTerm(diluentNode, diluentName), ... + 'amount', blankConcentration()); +end + +targetStructure = ontologyArray( ... 
+ getCharField(block, 'virusLocation_OntologyName'), ... + getCharField(block, 'virusLocation_name')); + +inj = startManipulation(preBody, 'injection', {'pharmacological_manipulation'}); +inj.pharmacological_manipulation = struct('mixture', mixture); +inj.injection = struct( ... + 'kind', 'virus', ... + 'volume', blankVolume(), ... + 'route', ontologyTerm('', ''), ... + 'target_structure', {targetStructure}); + +anchor = makeSessionAnchor(preBody, 'during'); +inj.depends_on(end+1) = struct('name', 'time_reference_1', ... + 'value', anchor.base.id); +v2Body = {inj, anchor}; +end + +% ===================== shared helpers ================================== + +function body = startManipulation(preBody, className, extraSupers) +chain = [{'manipulation'}, extraSupers]; +supers = struct('class_name', {}, 'class_version', {}); +for k = 1:numel(chain) + supers(end+1) = struct('class_name', chain{k}, 'class_version', '1.0.0'); %#ok +end +body = struct(); +body.document_class = struct( ... + 'class_name', className, 'class_version', '1.0.0', ... 
+ 'superclasses', supers, 'schema_version', 'V_epsilon'); +body.depends_on = carrySubject(preBody); +if isfield(preBody, 'base') + body.base = preBody.base; +end +end + +function deps = carrySubject(preBody) +deps = struct('name', {}, 'value', {}); +subjectVal = ''; +if isfield(preBody, 'depends_on') && isstruct(preBody.depends_on) + for k = 1:numel(preBody.depends_on) + d = preBody.depends_on(k); + if isfield(d, 'name') && strcmp(d.name, 'subject_id') + subjectVal = depValue(d); + end + end +end +deps(end+1) = struct('name', 'subject_id', 'value', subjectVal); +end + +function anchor = makeSessionAnchor(preBody, relation) +sessionId = ''; +ds = '2024-01-01T00:00:00.000Z'; +if isfield(preBody, 'base') && isstruct(preBody.base) + if isfield(preBody.base, 'session_id'); sessionId = preBody.base.session_id; end + if isfield(preBody.base, 'datestamp') && ~isempty(preBody.base.datestamp) + ds = preBody.base.datestamp; + end +end +anchor = struct(); +anchor.document_class = struct('class_name', 'session_relative_reference', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'time_reference', 'class_version', '1.0.0'), ... + 'schema_version', 'V_epsilon'); +anchor.depends_on = struct('name', 'session_id', 'value', sessionId); +anchor.base = struct('id', did.ido.unique_id(), 'session_id', sessionId, ... + 'name', 'migrated_session_anchor', 'datestamp', ds); +anchor.time_reference = struct('is_approximate', true); +anchor.session_relative_reference = struct('relation', relation); +end + +function arr = ontologyArray(node, name) +if isempty(node) && isempty(name) + arr = struct('node', {}, 'name', {}); +else + arr = ontologyTerm(node, name); +end +end + +function t = ontologyTerm(node, name) +t = struct('node', char(node), 'name', char(name)); +end + +function v = blankVolume() +v = struct('liters', 0.0, 'source_unit', '', 'source_value', 0.0, ... 
+ 'approximate', false); +end + +function c = blankConcentration() +c = struct('source_unit', '', 'source_value', 0.0, 'approximate', false); +end + +function s = getCharField(block, name) +s = ''; +if isfield(block, name) + v = block.(name); + if ischar(v) + s = v; + elseif isstring(v) && isscalar(v) + s = char(v); + end +end +end + +function n = numField(block, name) +n = []; +if isfield(block, name) + v = block.(name); + if isnumeric(v) && isscalar(v) && isfinite(v) + n = double(v); + end +end +end + +function v = depValue(d) +v = ''; +if isfield(d, 'value') + v = d.value; +elseif isfield(d, 'document_id') + v = d.document_id; +end +end diff --git a/src/did/+did2/+convert/v1_to_v2.m b/src/did/+did2/+convert/v1_to_v2.m index ffda356..1875efb 100644 --- a/src/did/+did2/+convert/v1_to_v2.m +++ b/src/did/+did2/+convert/v1_to_v2.m @@ -66,6 +66,14 @@ % identifiers in the legacy (camelCase) form so % the body stays schema-compatible while still % gaining the V_delta shape transformations. +% TargetVersion ((1,:) char, default 'V_delta') - migration target. +% 'V_delta' (default) preserves the historical +% class-preserving 1->1 behaviour. 'V_epsilon' routes +% classes that have a Brainstorm-E split migrator +% under +did2.+convert.+migrators_e (e.g. treatment, +% ontology_table_row) through that migrator instead, +% which may fan one source body out to several +% destination documents (1 -> N). % % See also: did2.convert.universalRenames, did2.convert.migrators, % docs/v2/PLAN.md §9.6. @@ -78,6 +86,7 @@ options.CheckReferences (1,1) logical = false options.ReferenceDatabase = [] options.RenameClassNames (1,1) logical = true + options.TargetVersion (1,:) char = 'V_delta' end bodies = normaliseInput(v1Bodies); @@ -97,7 +106,7 @@ className = ''; try preBody = ensureStruct(rawBody); - if isAlreadyVDelta(preBody) + if isAlreadyTarget(preBody, options.TargetVersion) % Idempotency short-circuit: the body is already V_delta, % so skip universalRenames and the per-class migrators.
% ensureClassBlocks still runs (it rebuilds the V_delta @@ -111,22 +120,46 @@ && isfield(v2Body.document_class, 'class_name') className = char(v2Body.document_class.class_name); end + v2Bodies = {v2Body}; else postUniversalBody = did2.convert.universalRenames(preBody, ... 'RenameClassNames', options.RenameClassNames); className = char(postUniversalBody.document_class.class_name); v2Body = applySuperclassMigrators(postUniversalBody, className); - migratorFcn = lookupMigrator(className); - v2Body = migratorFcn(v2Body); + % runConcreteMigrator returns a CELL of one-or-more bodies. + % Default (TargetVersion 'V_delta') always returns a single + % body via the existing per-class migrator, so behaviour is + % unchanged. Under TargetVersion 'V_epsilon' a class with a + % Brainstorm-E split migrator (e.g. treatment, ontology_table_row) + % may fan out to several bodies (1 -> N). + v2Bodies = runConcreteMigrator(v2Body, className, ... + options.TargetVersion); end - v2Body = ensureClassBlocks(v2Body, options.SchemaCache); - doc = did2.document(v2Body); - if options.Validate - doc.validate('SchemaCache', options.SchemaCache); + % Collect every produced body. Each is padded, optionally + % validated, appended and counted in turn (1 -> N split lands N + % docs in `migrated`). A throw on body k quarantines the source; + % NOTE(review): bodies 1..k-1 are already appended -- confirm rollback. + for bi = 1:numel(v2Bodies) + outBody = ensureClassBlocks(v2Bodies{bi}, options.SchemaCache); + if strcmp(options.TargetVersion, 'V_epsilon') ... + && isfield(outBody, 'document_class') ... + && isstruct(outBody.document_class) + outBody.document_class.schema_version = 'V_epsilon'; + end + doc = did2.document(outBody); + if options.Validate + doc.validate('SchemaCache', options.SchemaCache); + end + migrated{end+1} = doc; %#ok + outName = className; + if isfield(outBody, 'document_class') ... + && isstruct(outBody.document_class) ...
+ && isfield(outBody.document_class, 'class_name') + outName = char(outBody.document_class.class_name); + end + [classCountNames, classCountValues] = bumpClassCounter( ... + classCountNames, classCountValues, outName); end - migrated{end+1} = doc; %#ok - [classCountNames, classCountValues] = bumpClassCounter( ... - classCountNames, classCountValues, className); catch err entry = struct( ... 'original_body', originalJSON, ... @@ -197,14 +230,17 @@ end end -function tf = isAlreadyVDelta(body) -% Return true when BODY is already a V_delta-shaped document so the +function tf = isAlreadyTarget(body, targetVersion) +% Return true when BODY is already a TARGETVERSION-shaped document so the % per-body migration loop can skip universalRenames and the per-class -% migrators. Both conditions must hold so the short-circuit only fires -% when we have high confidence the body is V_delta: -% (a) document_class.schema_version is the literal char 'V_delta' -% (set by the last run of universalRenames, or by the writer), -% AND +% migrators (it still gets ensureClassBlocks + validate). 
Both conditions +% must hold so the short-circuit only fires when we have high confidence +% the body is already at the target: +% (a) document_class.schema_version is the literal char TARGETVERSION +% (set by the last run of universalRenames, the writer, or -- for +% 'V_epsilon' -- a context assembler such as +% ndi.migrate.internal.stimulusBathToBath that emits ready-made +% target bodies), AND % (b) the body carries no v1-only structural markers — underscore- % prefixed top-level keys (e.g., legacy _classname, % _class_version) that predate the document_class header and @@ -228,7 +264,7 @@ if isstring(sv) && isscalar(sv) sv = char(sv); end -if ~ischar(sv) || ~strcmp(sv, 'V_delta') +if ~ischar(sv) || ~strcmp(sv, targetVersion) return; end topKeys = fieldnames(body); @@ -250,6 +286,47 @@ end end +function bodies = runConcreteMigrator(v2Body, className, targetVersion) +%RUNCONCRETEMIGRATOR Run the concrete-class migrator, return a cell of bodies. +% Default ('V_delta') preserves the historical 1 -> 1 behaviour: the +% per-class migrator under +did2.+convert.+migrators is applied and a +% single-element cell is returned. Under 'V_epsilon', a class that has +% a Brainstorm-E split migrator under +did2.+convert.+migrators_e is +% routed there instead; that migrator may return either a single body +% (struct) or several (struct array / cell), enabling the treatment -> +% manipulation and ontology_table_row -> observations (1 -> N) splits. +if strcmp(targetVersion, 'V_epsilon') + fqn = ['did2.convert.migrators_e.', className]; + if ~isempty(which(fqn)) + out = feval(str2func(fqn), v2Body); + bodies = normaliseMigratorOutput(out); + return; + end +end +migratorFcn = lookupMigrator(className); +bodies = {migratorFcn(v2Body)}; +end + +function bodies = normaliseMigratorOutput(out) +%NORMALISEMIGRATOROUTPUT Coerce a migrator's output to a cell of bodies. 
+if iscell(out) + bodies = out(:)'; +elseif isstruct(out) + if isscalar(out) + bodies = {out}; + else + bodies = cell(1, numel(out)); + for k = 1:numel(out) + bodies{k} = out(k); + end + end +else + error('did2:convert:badMigratorOutput', ... + 'A split migrator must return a struct or cell of bodies (got %s).', ... + class(out)); +end +end + function body = ensureClassBlocks(body, schemaCacheOverride) % Make sure every class in the V_delta schema chain for the body's % concrete class has a property block in the document, manufacturing @@ -301,6 +378,23 @@ body.(cls) = struct(); end end +% Drop stray EMPTY blocks left by v1 for chain classes that the target +% schema does NOT host on the instance. v1 documents carried a property +% block for every class in their hierarchy, including parents that became +% abstract / fieldless in V_delta/V_epsilon (abstract classes are new +% here). Those arrive as empty structs and would trip the strict +% undeclared-top-level-block check. Only EMPTY such blocks are removed -- +% a non-empty one signals real data a migrator must place, so it is left +% to fail loudly rather than be silently dropped. +chainClasses = [reshape(ancestors, 1, []), {className}]; +nonContributing = setdiff(chainClasses, placementInfo.blocksContributed); +for k = 1:numel(nonContributing) + cls = nonContributing{k}; + if isfield(body, cls) && isstruct(body.(cls)) ... 
+ && (numel(body.(cls)) == 0 || isempty(fieldnames(body.(cls)))) + body = rmfield(body, cls); + end +end sc = struct('class_name', {}, 'class_version', {}); for k = 1:numel(ancestors) ancDC = cache.getClass(ancestors{k}).document_class; diff --git a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m index 2011d22..a4ac2e2 100644 --- a/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m +++ b/tests/+did2/+unittest/+helpers/runCorpusDiscovery.m @@ -1,4 +1,4 @@ -function corpusDir = runCorpusDiscovery(testCase, corpusName, corpusURL, innerDir) +function corpusDir = runCorpusDiscovery(testCase, corpusName, corpusURL, innerDir, options) %RUNCORPUSDISCOVERY Shared driver for v1 corpus discovery-mode tests. % % CORPUSDIR = did2.unittest.helpers.runCorpusDiscovery(TESTCASE, CORPUSNAME, @@ -25,6 +25,7 @@ corpusName (1,:) char corpusURL (1,:) char innerDir (1,:) char + options.TargetVersion (1,:) char = 'V_epsilon' end did2.unittest.helpers.installSchemaPath(testCase, sprintf('skipping %s corpus test', corpusName)); @@ -42,12 +43,41 @@ bodies{k} = fileread(fullfile(files(k).folder, files(k).name)); end -result = did2.convert.v1_to_v2(bodies, 'Validate', true); +result = did2.convert.v1_to_v2(bodies, 'Validate', true, ... + 'TargetVersion', options.TargetVersion); reasons = did2.unittest.helpers.topQuarantineReasons(result.quarantine); reportPath = did2.unittest.helpers.writeCorpusReport(corpusName, result, reasons); -fprintf('\n=== Corpus %s discovery summary ===\n', corpusName); +% Per-term routing inventory (best-effort): makes the heuristic +% treatment / ontology_table_row routing auditable against real corpus +% terms so the authoritative per-term tables can be curated. Never let it +% break the discovery run -- the summary is the primary deliverable. 
+try + did2.unittest.helpers.writeRoutingReport(corpusName, result.migrated); +catch routingErr + fprintf('routing report skipped: %s\n', routingErr.message); +end + +% Reference-integrity sweep (best-effort): after the 1->N splits and class +% folds, confirm every depends_on edge in the migrated batch resolves to a +% document in that batch. Orphans = dangling references the migration would +% introduce (e.g. a split that didn't preserve a referenced id, or a ref to +% a deferred/quarantined doc). Reported, not fatal -- discovery mode. +try + refRep = did2.validate.references(result.migrated); + fprintf('\n--- reference integrity (%s): %d orphan(s) of %d edges ---\n', ... + corpusName, refRep.orphan_count, refRep.edges_examined); + [orphNames, orphCounts] = aggregateOrphans(refRep.orphans); + for i = 1:numel(orphNames) + fprintf(' %6d %s\n', orphCounts(i), orphNames{i}); + end +catch refReportErr + fprintf('reference report skipped: %s\n', refReportErr.message); +end + +fprintf('\n=== Corpus %s discovery summary (target %s) ===\n', ... + corpusName, options.TargetVersion); fprintf('total: %d\n', result.summary.total); fprintf('migrated_count: %d\n', result.summary.migrated_count); fprintf('quarantine_count: %d\n', result.summary.quarantine_count); @@ -58,3 +88,23 @@ reasons(k).class_name, reasons(k).reason); end end + +function [names, counts] = aggregateOrphans(orphans) +%AGGREGATEORPHANS Count dangling edges by "doc_class.edge_name", desc. 
+names = {}; +counts = []; +for k = 1:numel(orphans) + key = sprintf('%s.%s', orphans(k).doc_class, orphans(k).edge_name); + idx = find(strcmp(names, key), 1); + if isempty(idx) + names{end+1} = key; %#ok + counts(end+1) = 1; %#ok + else + counts(idx) = counts(idx) + 1; + end +end +if ~isempty(counts) + [counts, order] = sort(counts, 'descend'); + names = names(order); +end +end diff --git a/tests/+did2/+unittest/+helpers/writeRoutingReport.m b/tests/+did2/+unittest/+helpers/writeRoutingReport.m new file mode 100644 index 0000000..8a83c71 --- /dev/null +++ b/tests/+did2/+unittest/+helpers/writeRoutingReport.m @@ -0,0 +1,146 @@ +function reportPath = writeRoutingReport(corpusName, migrated) +%WRITEROUTINGREPORT Per-term -> routed-class inventory for routing curation. +% +% REPORTPATH = did2.unittest.helpers.writeRoutingReport(NAME, MIGRATED) +% walks the migrated observation/manipulation documents, extracts each +% one's identity term (the property the row is ABOUT -- measured_property +% / applied_property / procedure / factor / entity / first mixture +% chemical), and aggregates (term_node, term_name, class_name) with +% counts into ./corpus-reports/NAME-routing.json (picked up by the +% upload-artifact step alongside the discovery summary). +% +% Purpose: the treatment / ontology_table_row migrators route by +% keyword/CURIE HEURISTICS, so everything migrates "green" but a term can +% land in the wrong class with no error. This report makes routing +% AUDITABLE against real corpus terms: +% - rows whose class_name is generic_scalar_observation / +% generic_categorical_observation are UNMATCHED terms (need a minted +% class or a routing rule), and +% - a term that appears under a surprising class is a mis-route to fix. +% It is the data source for building the authoritative per-term routing +% tables (discovery mode, the conversion docs' "Open questions").
+% +% Best-effort and side-effect-only: any failure is swallowed by the +% caller so it never breaks the discovery run (the summary is primary). + +reportDir = fullfile(pwd, 'corpus-reports'); +if ~exist(reportDir, 'dir') + mkdir(reportDir); +end +reportPath = fullfile(reportDir, [corpusName '-routing.json']); + +keys = {}; +nodes = {}; +names = {}; +classes = {}; +counts = []; +for k = 1:numel(migrated) + doc = migrated{k}; + cls = doc.className(); + [node, name] = identityTerm(doc); + if isempty(node) && isempty(name) + continue; % not a property-bearing observation/manipulation + end + key = [node '|' name '|' cls]; + idx = find(strcmp(keys, key), 1); + if isempty(idx) + keys{end+1} = key; %#ok + nodes{end+1} = node; %#ok + names{end+1} = name; %#ok + classes{end+1} = cls; %#ok + counts(end+1) = 1; %#ok + else + counts(idx) = counts(idx) + 1; + end +end + +if isempty(counts) + entries = struct('term_node', {}, 'term_name', {}, ... + 'class_name', {}, 'count', {}); +else + [~, order] = sort(counts, 'descend'); + entries = struct('term_node', nodes(order), 'term_name', names(order), ... + 'class_name', classes(order), 'count', num2cell(counts(order))); +end + +report = struct( ... + 'corpus', corpusName, ... + 'generated_at', char(datetime('now', 'TimeZone', 'UTC', ... + 'Format', 'yyyy-MM-dd''T''HH:mm:ss''Z''')), ... + 'distinct_terms', numel(entries), ... + 'routes', entries); + +fid = fopen(reportPath, 'w'); +if fid < 0 + error('did2:test:reportWriteFailed', ... + 'Could not open %s for writing.', reportPath); +end +cleanup = onCleanup(@() fclose(fid)); %#ok +fwrite(fid, jsonencode(report, 'PrettyPrint', true)); + +% Echo the actionable breakdown to stdout so the CI log carries it (the +% JSON also ships as an artifact). The terms routed to the generic_* +% escape hatches are the UNMATCHED ones -- they need a minted class or a +% routing rule -- so list those in full, then the top routes overall. 
+fprintf('\n--- routing inventory (%s): %d distinct term->class routes ---\n', ... + corpusName, numel(entries)); +nUnmatched = 0; +for i = 1:numel(entries) + if startsWith(entries(i).class_name, 'generic_') + nUnmatched = nUnmatched + 1; + fprintf(' UNMATCHED %6d %-28s [%s] -> %s\n', entries(i).count, ... + entries(i).term_node, entries(i).term_name, entries(i).class_name); + end +end +fprintf(' (%d unmatched term routes to generic_*)\n', nUnmatched); +fprintf(' top routes:\n'); +for i = 1:min(numel(entries), 30) + fprintf(' %6d %-28s [%s] -> %s\n', entries(i).count, ... + entries(i).term_node, entries(i).term_name, entries(i).class_name); +end +end + +% ===================== helpers ============================================ + +function [node, name] = identityTerm(doc) +%IDENTITYTERM The ontology term a migrated observation/manipulation is about. +node = ''; +name = ''; +% Single-term identity fields, tried in order; the first non-blank term wins. +paths = { ... + 'observation.measured_property', ... % observations + 'scalar_manipulation.applied_property', ... % temperature_manipulation, ... + 'procedural_manipulation.procedure', ... % procedural_manipulation, biological_transfer + 'environmental_manipulation.factor', ... % environmental_manipulation + 'biological_transfer.entity'}; % biological_transfer (fallback when procedure is blank) +for p = 1:numel(paths) + t = tryGet(doc, paths{p}); + [node, name] = termOf(t); + if ~isempty(node) || ~isempty(name) + return; + end +end +% Pharmacological tiers (injection/bath) carry the agent in mixture[1].chemical.
+m = tryGet(doc, 'pharmacological_manipulation.mixture'); +if ~isempty(m) && isstruct(m) + [node, name] = termOf(m(1).chemical); +end +end + +function [node, name] = termOf(t) +node = ''; +name = ''; +if isstruct(t) && isscalar(t) + if isfield(t, 'node') && ischar(t.node); node = t.node; end + if isfield(t, 'name') && ischar(t.name); name = t.name; end +end +end + +function v = tryGet(doc, path) +v = []; +try + v = doc.get(path); +catch + v = []; +end +end diff --git a/tests/+did2/+unittest/testCorpus20211116.m b/tests/+did2/+unittest/testCorpus20211116.m index f777272..7dfd30e 100644 --- a/tests/+did2/+unittest/testCorpus20211116.m +++ b/tests/+did2/+unittest/testCorpus20211116.m @@ -3,28 +3,28 @@ % % Pulls the 20211116.zip fixture from the public S3 prefix % (~11MB compressed, ~36MB unzipped, ~1220 v1 documents across -% ~21 classes), runs every contained body through -% did2.convert.v1_to_v2 with Validate=true, and writes a per-run -% summary JSON to corpus-reports/20211116-summary.json. The -% workflow's upload-artifact step picks the file up as a CI -% artifact. +% ~21 classes) and runs every contained body through +% did2.convert.v1_to_v2 with Validate=true, targeting **V_epsilon** +% (via the shared did2.unittest.helpers.runCorpusDiscovery driver, +% same as the B / Dab / JH corpora), and writes a per-run summary +% JSON to corpus-reports/20211116-summary.json that the workflow's +% upload-artifact step picks up. % -% Unlike testCorpusPRED, this is **discovery mode**: the test does -% not assert zero quarantine. Its job is to surface coverage -% signal (which classes / required fields are not yet migratable) -% without blocking unrelated PRs on migrator work. The single hard -% assertion is that the corpus contained at least one JSON file, -% to catch a broken fixture URL. +% Discovery mode: the test does not assert zero quarantine. 
Its job +% is to surface coverage signal (which classes / required fields are +% not yet migratable) without blocking unrelated PRs on migrator +% work. The single hard assertion (inside the helper) is that the +% corpus contained at least one JSON file, to catch a broken fixture +% URL. % % The corpus URL: % https://ndi-programming-development.s3.us-east-1.amazonaws.com/20211116.zip % The zip contains a top-level 20211116/ directory of v1 NDI % document JSONs (plus __MACOSX/ sidecars that are skipped). % -% Schema-path resolution mirrors testCorpusPRED: DID_SCHEMA_PATH -% first, then the did2.schema.cache sibling-checkout default; -% skips via assumeFail if neither resolves so local devs without a -% did-schema checkout get a clean skip. +% Schema-path resolution + teardown are handled by the shared +% helpers (DID_SCHEMA_PATH first, then the did2.schema.cache +% sibling-checkout default; assumeFail skip if neither resolves). % % Run with: % results = runtests('did2.unittest.testCorpus20211116'); @@ -32,159 +32,12 @@ tests = functiontests(localfunctions); end -function setupOnce(testCase) -% Seed teardown-safe fields first so teardown is a no-op when -% setupOnce filters via assumeFail before any override happens. -testCase.TestData.previousSchemaPath = getenv('DID_SCHEMA_PATH'); -testCase.TestData.didOverrideSchemaPath = false; -testCase.TestData.corpusDir = ''; - -schemaPath = resolveSchemaPath(); -if isempty(schemaPath) - assumeFail(testCase, ... - ['V_delta schemas not found. Set DID_SCHEMA_PATH or check out ', ... - 'did-schema as a sibling of DID-matlab; skipping 20211116 corpus test.']); -end -setenv('DID_SCHEMA_PATH', schemaPath); -testCase.TestData.didOverrideSchemaPath = true; -did2.schema.cache.resetSingleton(); - -testCase.TestData.corpusDir = ensureCorpus( ... - 'https://ndi-programming-development.s3.us-east-1.amazonaws.com/20211116.zip', ... 
- 'did2-corpus-20211116', '20211116'); -end - function teardownOnce(testCase) -if isfield(testCase.TestData, 'didOverrideSchemaPath') ... - && testCase.TestData.didOverrideSchemaPath - setenv('DID_SCHEMA_PATH', testCase.TestData.previousSchemaPath); - did2.schema.cache.resetSingleton(); -end +did2.unittest.helpers.restoreSchemaPath(testCase); end function test20211116CorpusDiscoveryReport(testCase) -corpusDir = testCase.TestData.corpusDir; -files = dir(fullfile(corpusDir, '*.json')); -files = files(~startsWith({files.name}, '._')); -verifyGreaterThan(testCase, numel(files), 0, ... - sprintf('No JSON files found under %s', corpusDir)); - -bodies = cell(numel(files), 1); -for k = 1:numel(files) - bodies{k} = fileread(fullfile(files(k).folder, files(k).name)); -end - -result = did2.convert.v1_to_v2(bodies, 'Validate', true); - -reasons = topQuarantineReasons(result.quarantine); -reportPath = writeReport('20211116', result, reasons); - -fprintf('\n=== Corpus 20211116 discovery summary ===\n'); -fprintf('total: %d\n', result.summary.total); -fprintf('migrated_count: %d\n', result.summary.migrated_count); -fprintf('quarantine_count: %d\n', result.summary.quarantine_count); -fprintf('report: %s\n', reportPath); -fprintf('top quarantine reasons:\n'); -for k = 1:min(numel(reasons), 15) - fprintf(' %5d [%s] %s\n', reasons(k).count, ... - reasons(k).class_name, reasons(k).reason); -end -end - -% --- helpers --- - -function reasons = topQuarantineReasons(quarantine) -% Aggregate quarantine entries by (class_name, reason) and return a -% struct array sorted by descending count. -if isempty(quarantine) - reasons = struct('class_name', {}, 'reason', {}, 'count', {}); - return; -end -keys = cell(1, numel(quarantine)); -for k = 1:numel(quarantine) - keys{k} = sprintf('%s|||%s', quarantine(k).class_name, ... 
- quarantine(k).reason); -end -[uniqKeys, ~, idx] = unique(keys); -counts = accumarray(idx, 1); -reasons = struct('class_name', {}, 'reason', {}, 'count', {}); -for k = 1:numel(uniqKeys) - parts = strsplit(uniqKeys{k}, '|||'); - reasons(k).class_name = parts{1}; - reasons(k).reason = parts{2}; - reasons(k).count = counts(k); -end -[~, order] = sort(-[reasons.count]); -reasons = reasons(order); -end - -function reportPath = writeReport(corpusName, result, reasons) -% Write a JSON discovery summary into /corpus-reports/. The CI -% workflow's upload-artifact step picks up everything under that -% directory. -reportDir = fullfile(pwd, 'corpus-reports'); -if ~exist(reportDir, 'dir') - mkdir(reportDir); -end -reportPath = fullfile(reportDir, [corpusName '-summary.json']); - -report = struct( ... - 'corpus', corpusName, ... - 'generated_at', char(datetime('now', 'TimeZone', 'UTC', ... - 'Format', 'yyyy-MM-dd''T''HH:mm:ss''Z''')), ... - 'total', result.summary.total, ... - 'migrated_count', result.summary.migrated_count, ... - 'quarantine_count', result.summary.quarantine_count, ... - 'by_class', result.summary.by_class, ... - 'quarantine_reasons', reasons); - -fid = fopen(reportPath, 'w'); -if fid < 0 - error('did2:test:reportWriteFailed', ... - 'Could not open %s for writing.', reportPath); -end -cleanup = onCleanup(@() fclose(fid)); %#ok -fwrite(fid, jsonencode(report, 'PrettyPrint', true)); -end - -function p = resolveSchemaPath() -% Return a directory that holds V_delta `*.json` schema files, or '' -% if none can be found. Probe order: DID_SCHEMA_PATH env, then the -% sibling-checkout default (matches did2.schema.cache). -candidates = {}; -envPath = getenv('DID_SCHEMA_PATH'); -if ~isempty(envPath) - candidates{end+1} = envPath; %#ok -end -toolboxDir = did.toolboxdir(); -candidates{end+1} = fullfile(toolboxDir, '..', '..', '..', ... 
- 'did-schema', 'schemas', 'V_delta', 'stable'); %#ok - -p = ''; -for k = 1:numel(candidates) - candidate = candidates{k}; - if isfolder(candidate) && ~isempty(dir(fullfile(candidate, '*.json'))) - p = candidate; - return; - end -end -end - -function corpusDir = ensureCorpus(corpusURL, cacheName, innerDir) -% Download (if necessary) and extract a corpus zip. The unzip target -% is cached under tempdir so repeated runs in the same MATLAB -% session reuse the same files. -cacheRoot = fullfile(tempdir(), cacheName); -corpusDir = fullfile(cacheRoot, innerDir); -if isfolder(corpusDir) && ~isempty(dir(fullfile(corpusDir, '*.json'))) - return; -end -if ~exist(cacheRoot, 'dir') - mkdir(cacheRoot); -end -zipPath = fullfile(cacheRoot, [innerDir '.zip']); -if ~isfile(zipPath) - websave(zipPath, corpusURL); -end -unzip(zipPath, cacheRoot); +did2.unittest.helpers.runCorpusDiscovery(testCase, '20211116', ... + 'https://ndi-programming-development.s3.us-east-1.amazonaws.com/20211116.zip', ... + '20211116'); end diff --git a/tests/+did2/+unittest/testMigratorsE.m b/tests/+did2/+unittest/testMigratorsE.m new file mode 100644 index 0000000..3eb4c5f --- /dev/null +++ b/tests/+did2/+unittest/testMigratorsE.m @@ -0,0 +1,300 @@ +function tests = testMigratorsE +%TESTMIGRATORSE Brainstorm-E split migrator tests (TargetVersion 'V_epsilon'). +% +% Exercises the did_v1 -> V_epsilon split migrators routed by +% did2.convert.v1_to_v2 when TargetVersion == 'V_epsilon': +% - treatment -> manipulation tiers (1 -> 1 branch dispatch) +% - ontology_table_row -> observation tiers (1 -> N) +% against the worked examples in did-schema/schemas/V_epsilon/ +% conversions/from_did_v1/{treatment,ontology_table_row}.md. +% +% Like testMigrators, these run with Validate=false so they assert the +% TRANSFORM (routing + field placement) without depending on a V_epsilon +% schema cache at the test-runner working directory. Corpus-level +% validation is the discovery-mode CI job (#3). 
+% +% Run with: +% results = runtests('did2.unittest.testMigratorsE'); + +tests = functiontests(localfunctions); +end + +function v1 = wrap(className, blockKey, block) % minimal v1 body: document_class + depends_on + base, with `block` stored under blockKey +v1 = struct(); +v1.document_class = struct('class_name', className, 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'base', 'class_version', '1.0.0')); +v1.depends_on = struct('name', {'subject_id'}, 'document_id', {'aabb1122ccdd3344_aabb1122ccdd3344'}); % NOTE(review): keyed 'document_id' here, but the hand-built v1 fixture in testTreatmentTransferBecomesBiologicalTransfer and depVal both use 'value' -- confirm which key v1 bodies carry +v1.base = struct('id', 'aabb1122ccdd3344_1122334455667788', ... + 'session_id', 'aabb1122ccdd3344_9900aabbccddeeff', ... + 'name', 'migrator-e-example', 'datestamp', '2024-06-01T12:00:00.000Z'); +v1.(blockKey) = block; +end + +function out = runE(v1) % convert one v1 body with validation off and TargetVersion 'V_epsilon' +out = did2.convert.v1_to_v2(v1, 'Validate', false, 'TargetVersion', 'V_epsilon'); +end + +function v = depVal(doc, name) +% Fetch a depends_on value by name from a migrated did2.document. +v = ''; +deps = doc.get('depends_on'); +for k = 1:numel(deps) + if isfield(deps(k), 'name') && strcmp(deps(k).name, name) + v = deps(k).value; + return; + end +end +end + +% ===================== treatment -> manipulation ======================= + +function testThermalTreatmentBecomesTemperatureManipulation(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'ndic:0000nnnn', 'name', 'focal cortical cooling', ... + 'numeric_value', 12.0, 'string_value', 'Peltier')); +out = runE(v1); +% 1 -> 2: the manipulation plus its session_relative_reference anchor.
+verifyEqual(testCase, numel(out.migrated), 2); +doc = out.migrated{1}; +verifyTrue(testCase, isfield(out.summary.by_class, 'temperature_manipulation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'session_relative_reference')); +val = doc.get('scalar_temperature.value'); +verifyEqual(testCase, val.celsius, 12.0); +ap = doc.get('scalar_manipulation.applied_property'); +verifyEqual(testCase, ap.name, 'focal cortical cooling'); +% the anchor is an ordinal 'during' session reference +anchor = out.migrated{2}; +verifyEqual(testCase, anchor.get('session_relative_reference.relation'), 'during'); +end + +function testDrugTreatmentBecomesInjection(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'chebi:6015', 'name', 'isoflurane', ... + 'numeric_value', [], 'string_value', '')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); % injection + session anchor +verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); +end + +function testEnvironmentalTreatmentBecomesEnvironmentalManipulation(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'ncit:0000nnnn', 'name', 'dark rearing', ... + 'numeric_value', [], 'string_value', 'reared in darkness')); +out = runE(v1); +verifyTrue(testCase, isfield(out.summary.by_class, 'environmental_manipulation')); +doc = out.migrated{1}; +factor = doc.get('environmental_manipulation.factor'); +verifyEqual(testCase, factor.name, 'dark rearing'); +end + +function testDabTargetLocationRoutesStringValueToTargetStructure(testCase) +% Dab edge case: string_value is a UBERON CURIE, name ends "Target Location". +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'empty:0000074', ... + 'name', 'Optogenetic Tetanus Stimulation Target Location', ... 
+ 'numeric_value', [], 'string_value', 'uberon:0001930')); +out = runE(v1); +verifyTrue(testCase, isfield(out.summary.by_class, 'procedural_manipulation')); +doc = out.migrated{1}; +ts = doc.get('procedural_manipulation.target_structure'); +verifyFalse(testCase, isempty(ts)); +end + +function testNotAManipulationIsQuarantined(testCase) +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', '', 'name', 'Date of birth', ... + 'numeric_value', [], 'string_value', '2024-01-01')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 0); +verifyEqual(testCase, numel(out.quarantine), 1); +verifyTrue(testCase, contains(out.quarantine(1).reason, 'not a manipulation')); +end + +% ===================== ontology_table_row -> observations (1->N) ======= + +function testTableRowFansOutToNObservations(testCase) +rows = {struct('ontology_name', 'schema:weight', 'name', 'weight', ... + 'value', 22.5, 'unit', 'g'), ... + struct('ontology_name', 'uberon:0000105', 'name', 'life cycle stage', ... + 'value', 'fbdv:00005336')}; +v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); +out = runE(v1); +% 2 rows -> 2 observations + 1 shared session anchor. +verifyEqual(testCase, numel(out.migrated), 3); +verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'developmental_stage_observation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'session_relative_reference')); +end + +function testTableRowScalarValueLandsTyped(testCase) +rows = {struct('ontology_name', 'schema:weight', 'name', 'weight', ... 
+ 'value', 22.5, 'unit', 'g')}; +v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); +out = runE(v1); +doc = out.migrated{1}; +val = doc.get('scalar_mass.value'); +verifyEqual(testCase, val.source_value, 22.5); +end + +function testTableRowGeneratesUniqueIdsPerRow(testCase) +rows = {struct('ontology_name', 'schema:weight', 'name', 'weight', 'value', 22.5, 'unit', 'g'), ... + struct('ontology_name', 'schema:weight', 'name', 'weight', 'value', 23.0, 'unit', 'g')}; +v1 = wrap('ontology_table_row', 'ontology_table_row', struct('rows', {rows})); +out = runE(v1); +id1 = out.migrated{1}.get('base.id'); +id2 = out.migrated{2}.get('base.id'); +verifyNotEqual(testCase, id1, id2); +end + +function testTableRowCharFieldLayoutSplitsByColumn(testCase) +% The real v1 layout: parallel char fields + a data struct keyed by +% variable_names (one document = one row; each column = one observation). +block = struct( ... + 'names', 'weight,life cycle stage', ... + 'variable_names', 'weight,stage', ... + 'ontology_nodes', 'schema:weight,uberon:0000105', ... + 'data', struct('weight', 22.5, 'stage', 'fbdv:00005336')); +v1 = wrap('ontology_table_row', 'ontology_table_row', block); +out = runE(v1); +% 2 columns -> 2 observations + 1 shared session anchor. +verifyEqual(testCase, numel(out.migrated), 3); +verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); +verifyTrue(testCase, isfield(out.summary.by_class, 'developmental_stage_observation')); +end + +function testTableRowCharFieldEmptyValuesSkipped(testCase) +% Columns with no usable value (missing key / NaN) are skipped, not +% turned into empty observations. +block = struct( ... + 'names', 'weight,missing', ... + 'variable_names', 'weight,missing', ... + 'ontology_nodes', 'schema:weight,schema:missing', ... 
+ 'data', struct('weight', 22.5, 'missing', nan)); +v1 = wrap('ontology_table_row', 'ontology_table_row', block); +out = runE(v1); +% only the weight column survives -> 1 observation + 1 anchor. +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'body_weight_observation')); +end + +% ===================== context-dependent deferral ===================== + +function testStimulusBathDefersToNdiLayer(testCase) +% stimulus_bath is migrated to a `bath` in the NDI layer (it needs the +% stimulator element for its subject + epoch anchor), so the per-document +% converter defers it with a clear reason rather than emitting a partial. +v1 = wrap('stimulus_bath', 'stimulus_bath', struct( ... + 'location', struct('ontologyNode', 'uberon:0001017', 'name', 'CNS'), ... + 'mixture_table', '')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 0); +verifyEqual(testCase, numel(out.quarantine), 1); +verifyTrue(testCase, contains(out.quarantine(1).reason, 'NDI layer')); +end + +function testAlreadyEpsilonBodyShortCircuits(testCase) +% A body already tagged schema_version 'V_epsilon' (e.g. emitted by an NDI +% context assembler) short-circuits the migration loop and is just +% padded/validated, not re-migrated. This is what lets ndi.migrate.local +% feed assembled bath/time-reference bodies back through v1_to_v2. +v1 = wrap('mock', 'mock', struct()); +v1.document_class.schema_version = 'V_epsilon'; +out = did2.convert.v1_to_v2(v1, 'Validate', false, 'TargetVersion', 'V_epsilon'); +verifyEqual(testCase, numel(out.migrated), 1); +verifyEqual(testCase, numel(out.quarantine), 0); +end + +% ===================== deprecated treatment family -> injection/transfer = + +function testTreatmentDrugBecomesInjection(testCase) +v1 = wrap('treatment_drug', 'treatment_drug', struct( ... + 'location_ontologyNode', 'uberon:0000955', 'location_name', 'brain', ... 
+ 'mixture_table', 'chebi:6904,muscimol,5,mg/ml')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); % injection + session anchor +verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('injection.kind'), 'drug'); +mix = doc.get('pharmacological_manipulation.mixture'); +verifyEqual(testCase, mix(1).chemical.name, 'muscimol'); +end + +function testVirusInjectionBecomesVirusInjection(testCase) +v1 = wrap('virus_injection', 'virus_injection', struct( ... + 'virus_OntologyName', 'addgene:26973', 'virus_name', 'AAV9-CaMKII-GCaMP', ... + 'virusLocation_OntologyName', 'uberon:0001950', 'virusLocation_name', 'neocortex', ... + 'dilution', 0.5, 'diluent_OntologyName', '', 'diluent_name', 'saline')); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'injection')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('injection.kind'), 'virus'); +mix = doc.get('pharmacological_manipulation.mixture'); +verifyEqual(testCase, mix(1).chemical.name, 'AAV9-CaMKII-GCaMP'); +verifyEqual(testCase, mix(1).amount.source_value, 0.5); +end + +function testTreatmentTransferBecomesBiologicalTransfer(testCase) +% treatment_transfer carries recipient_id + donor_id (not subject_id). +v1 = struct(); +v1.document_class = struct('class_name', 'treatment_transfer', ... + 'class_version', '1.0.0', ... + 'superclasses', struct('class_name', 'base', 'class_version', '1.0.0')); +v1.depends_on = struct( ... + 'name', {'recipient_id', 'donor_id'}, ... + 'value', {'aabb1122ccdd3344_1111111111111111', ... + 'aabb1122ccdd3344_2222222222222222'}); +v1.base = struct('id', 'aabb1122ccdd3344_3333333333333333', ... + 'session_id', 'aabb1122ccdd3344_9900aabbccddeeff', ... + 'name', 'transfer-example', 'datestamp', '2024-06-01T12:00:00.000Z'); +v1.treatment_transfer = struct('entity_name', 'donor retina', ... + 'entity_ontologyNode', 'uberon:0000966', ... 
+ 'method_name', 'transplant', 'method_ontologyNode', 'ncit:C15282'); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 2); +verifyTrue(testCase, isfield(out.summary.by_class, 'biological_transfer')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('biological_transfer.entity').name, 'donor retina'); +verifyEqual(testCase, doc.get('biological_transfer.kind'), 'transplant'); +% recipient -> subject_id; donor carried as donor_id +verifyEqual(testCase, depVal(doc, 'subject_id'), 'aabb1122ccdd3344_1111111111111111'); +verifyEqual(testCase, depVal(doc, 'donor_id'), 'aabb1122ccdd3344_2222222222222222'); +end + +% ===================== subject_group -> subject ======================= + +function testSubjectGroupBecomesGroupSubject(testCase) +% subject_group folds into the subject tier as a subject flagged is_group. +v1 = wrap('subject_group', 'subject_group', struct()); +out = runE(v1); +verifyEqual(testCase, numel(out.migrated), 1); +verifyTrue(testCase, isfield(out.summary.by_class, 'subject')); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('subject.is_group'), true); +verifyEqual(testCase, doc.get('subject.is_biological'), false); +end + +function testSubjectGroupCarriesOptionalNameAndDescription(testCase) +% Newer subject_group docs may carry group_name / description; they map +% onto the subject block's local_identifier / description. +v1 = wrap('subject_group', 'subject_group', struct( ... 
+ 'group_name', 'control', 'description', 'untreated cohort')); +out = runE(v1); +doc = out.migrated{1}; +verifyEqual(testCase, doc.get('subject.local_identifier'), 'control'); +verifyEqual(testCase, doc.get('subject.description'), 'untreated cohort'); +end + +% ===================== backward compatibility ========================== + +function testDefaultTargetLeavesTreatmentUnchanged(testCase) +% With the default TargetVersion ('V_delta') the E split is NOT applied: +% treatment passes through the existing per-class migrator as a single +% treatment document. Guards the gated, backward-compatible design. +v1 = wrap('treatment', 'treatment', struct( ... + 'ontology_name', 'chebi:6015', 'name', 'isoflurane', ... + 'numeric_value', 2.0, 'string_value', '2 percent')); +out = did2.convert.v1_to_v2(v1, 'Validate', false); +verifyEqual(testCase, numel(out.migrated), 1); +verifyTrue(testCase, isfield(out.summary.by_class, 'treatment')); +end