From 5bfb64144d254ea98786aeda72d6033852fc010e Mon Sep 17 00:00:00 2001 From: Mattias Holm Date: Sat, 20 Jun 2026 10:45:02 +0200 Subject: [PATCH 1/5] Initial GEDCOM 5 import support --- Sources/Gedcom/GedcomDialect.swift | 39 +++ Sources/Gedcom/GedcomFile.swift | 303 ++++++++++++++++++ Sources/Gedcom/Header.swift | 9 +- .../GedcomTests/GedcomImprovementsTests.swift | 160 +++++++++ 4 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 Sources/Gedcom/GedcomDialect.swift diff --git a/Sources/Gedcom/GedcomDialect.swift b/Sources/Gedcom/GedcomDialect.swift new file mode 100644 index 0000000..765fd88 --- /dev/null +++ b/Sources/Gedcom/GedcomDialect.swift @@ -0,0 +1,39 @@ +// +// SPDX-License-Identifier: Apache-2.0 +// +// Copyright 2026 Mattias Holm +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +public enum GedcomDialect: Equatable { + case gedcom5(version: String) + case gedcom7(version: String) + case unknown(version: String?) + + public static func from(version: String?) 
-> GedcomDialect { + guard let version, !version.isEmpty else { + return .unknown(version: nil) + } + + if version.hasPrefix("5.") { + return .gedcom5(version: version) + } + + if version.hasPrefix("7.") { + return .gedcom7(version: version) + } + + return .unknown(version: version) + } +} diff --git a/Sources/Gedcom/GedcomFile.swift b/Sources/Gedcom/GedcomFile.swift index d855dc2..12328d8 100644 --- a/Sources/Gedcom/GedcomFile.swift +++ b/Sources/Gedcom/GedcomFile.swift @@ -34,6 +34,8 @@ public class GedcomFile { public var sourceRecords: [Source] = [] public var submitterRecords: [Submitter] = [] public var extensionRecords: [GedcomExtensionNode] = [] + public var sourceDialect: GedcomDialect = .unknown(version: nil) + public var exportDialect: GedcomDialect { .gedcom7(version: "7.0") } public var familyRecordsMap: [String: Family] = [:] public var individualRecordsMap: [String: Individual] = [:] @@ -43,6 +45,10 @@ public class GedcomFile { public var sourceRecordsMap: [String: Source] = [:] public var submitterRecordsMap: [String: Submitter] = [:] + private var generatedMultimediaRecordIndex = 1 + private var generatedSourceRecordIndex = 1 + private var liftedGedcom5Records: [Record] = [] + public init(withArchive path: URL, encoding: String.Encoding = .utf8) throws { self.url = path self.archive = try Archive(url: path, accessMode: .read, pathEncoding: nil) @@ -65,6 +71,7 @@ public class GedcomFile { } try parse(encoding: encoding) + try prepareRecordsForBuild() try build() } @@ -79,6 +86,7 @@ public class GedcomFile { } try parse(encoding: encoding) + try prepareRecordsForBuild() try build() } @@ -151,6 +159,300 @@ public class GedcomFile { "SUBM" : \GedcomFile.submitterRecords, ] + func prepareRecordsForBuild() throws { + sourceDialect = GedcomDialect.from(version: gedcomVersion(in: recordLines)) + + switch sourceDialect { + case .gedcom5: + convertGedcom5RecordsToGedcom7() + case .gedcom7, .unknown: + if containsTag("CONC", in: recordLines) { + throw 
GedcomError.badRecord + } + } + } + + private func gedcomVersion(in records: [Record]) -> String? { + guard let header = records.first(where: { $0.line.tag == "HEAD" }), + let gedc = header.children.first(where: { $0.line.tag == "GEDC" }), + let vers = gedc.children.first(where: { $0.line.tag == "VERS" }) else { + return nil + } + return vers.line.value + } + + private func containsTag(_ tag: String, in records: [Record]) -> Bool { + records.contains { record in + record.line.tag == tag || containsTag(tag, in: record.children) + } + } + + private func convertGedcom5RecordsToGedcom7() { + generatedMultimediaRecordIndex = 1 + generatedSourceRecordIndex = 1 + liftedGedcom5Records = [] + + for record in recordLines { + convertGedcom5RecordToGedcom7(record) + if record.line.level == 0 && record.line.tag == "NOTE" { + record.line.tag = "SNOTE" + } + } + + if !liftedGedcom5Records.isEmpty { + if let trailerIndex = recordLines.firstIndex(where: { $0.line.tag == "TRLR" }) { + recordLines.insert(contentsOf: liftedGedcom5Records, at: trailerIndex) + } else { + recordLines.append(contentsOf: liftedGedcom5Records) + } + } + } + + private func convertGedcom5RecordToGedcom7(_ record: Record) { + var convertedChildren: [Record] = [] + + for child in record.children { + if child.line.tag == "CONC" || child.line.tag == "CONT" { + if record.line.value == nil { + record.line.value = "" + } + if child.line.tag == "CONT" { + record.line.value?.append("\n\(child.line.value ?? "")") + } else { + record.line.value?.append(child.line.value ?? 
"") + } + } else { + convertGedcom5RecordToGedcom7(child) + convertedChildren.append(child) + } + } + + record.children = convertedChildren + + if record.line.tag == "DATE" { + convertGedcom5DateRecordToGedcom7(record) + } else if record.line.tag == "OBJE" && record.line.level > 0 && !isPointerValue(record.line.value) { + liftGedcom5InlineMultimedia(record) + } else if record.line.tag == "SOUR" && record.line.level > 0 && !isPointerValue(record.line.value) { + liftGedcom5InlineSource(record) + } + } + + private func liftGedcom5InlineMultimedia(_ record: Record) { + let xref = nextGeneratedXref(prefix: "O", existing: existingXrefs()) + let multimediaRecord = Record(level: 0, xref: xref, tag: "OBJE") + var linkChildren: [Record] = [] + var fileChildren: [Record] = [] + + for child in record.children { + switch child.line.tag { + case "FILE": + let file = clonedRecord(child, level: 1) + multimediaRecord.children.append(file) + case "FORM": + fileChildren.append(clonedRecord(child, level: 2)) + case "TITL": + fileChildren.append(clonedRecord(child, level: 2)) + case "CROP": + linkChildren.append(child) + default: + multimediaRecord.children.append(clonedRecord(child, level: 1)) + } + } + + if let file = multimediaRecord.children.first(where: { $0.line.tag == "FILE" }) { + file.children.append(contentsOf: fileChildren) + } else if let path = record.line.value, + !path.isEmpty { + let file = Record(level: 1, tag: "FILE", value: path) + file.children = fileChildren + multimediaRecord.children.insert(file, at: 0) + } + + convertGedcom5MultimediaFormValues(in: multimediaRecord) + record.line.value = xref + record.children = linkChildren + liftedGedcom5Records.append(multimediaRecord) + } + + private func liftGedcom5InlineSource(_ record: Record) { + let xref = nextGeneratedXref(prefix: "S", existing: existingXrefs()) + let sourceRecord = Record(level: 0, xref: xref, tag: "SOUR") + var citationChildren: [Record] = [] + + if let title = record.line.value, !title.isEmpty { + 
sourceRecord.children.append(Record(level: 1, tag: "TITL", value: title)) + } + + for child in record.children { + switch child.line.tag { + case "PAGE", "DATA", "EVEN", "QUAY", "OBJE", "NOTE", "SNOTE": + citationChildren.append(child) + default: + sourceRecord.children.append(clonedRecord(child, level: 1)) + } + } + + record.line.value = xref + record.children = citationChildren + liftedGedcom5Records.append(sourceRecord) + } + + private func convertGedcom5MultimediaFormValues(in record: Record) { + if record.line.tag == "FORM", let value = record.line.value { + record.line.value = gedcom7MediaType(forGedcom5Form: value) + } + + for child in record.children { + convertGedcom5MultimediaFormValues(in: child) + } + } + + private func gedcom7MediaType(forGedcom5Form form: String) -> String { + switch form.lowercased() { + case "bmp": + return "image/bmp" + case "gif": + return "image/gif" + case "jpg", "jpeg": + return "image/jpeg" + case "ole": + return "application/ole" + case "pcx": + return "image/vnd.zbrush.pcx" + case "tif", "tiff": + return "image/tiff" + case "wav": + return "audio/wav" + default: + return form + } + } + + private func isPointerValue(_ value: String?) 
-> Bool { + guard let value else { + return false + } + return value.hasPrefix("@") && value.hasSuffix("@") && value.count > 2 + } + + private func existingXrefs() -> Set { + Set(recordLines.compactMap(\.line.xref) + liftedGedcom5Records.compactMap(\.line.xref)) + } + + private func nextGeneratedXref(prefix: String, existing: Set) -> String { + if prefix == "O" { + defer { generatedMultimediaRecordIndex += 1 } + var xref = "@O\(generatedMultimediaRecordIndex)@" + while existing.contains(xref) { + generatedMultimediaRecordIndex += 1 + xref = "@O\(generatedMultimediaRecordIndex)@" + } + return xref + } + + defer { generatedSourceRecordIndex += 1 } + var xref = "@S\(generatedSourceRecordIndex)@" + while existing.contains(xref) { + generatedSourceRecordIndex += 1 + xref = "@S\(generatedSourceRecordIndex)@" + } + return xref + } + + private func clonedRecord(_ record: Record, level: Int) -> Record { + let clone = Record(level: level, xref: record.line.xref, tag: record.line.tag, value: record.line.value) + clone.children = record.children.map { clonedRecord($0, level: level + 1) } + return clone + } + + private func convertGedcom5DateRecordToGedcom7(_ record: Record) { + guard let value = record.line.value?.trimmingCharacters(in: .whitespacesAndNewlines), + !value.isEmpty else { + return + } + + if let interpreted = convertGedcom5InterpretedDate(value) { + record.line.value = interpreted.date + appendPhrase(interpreted.phrase, to: record) + return + } + + if let dualYear = convertGedcom5DualYearDate(value) { + record.line.value = dualYear.date + appendPhrase(dualYear.phrase, to: record) + } + } + + private func convertGedcom5InterpretedDate(_ value: String) -> (date: String, phrase: String)? 
{ + let prefix = "INT " + guard value.hasPrefix(prefix), + let phraseStart = value.firstIndex(of: "("), + value.last == ")", + phraseStart > value.index(value.startIndex, offsetBy: prefix.count) else { + return nil + } + + let date = value[value.index(value.startIndex, offsetBy: prefix.count).. (date: String, phrase: String)? { + let parts = value.split(separator: " ", omittingEmptySubsequences: false) + var convertedParts: [String] = [] + var converted = false + + for part in parts { + if !converted, + let slash = part.firstIndex(of: "/") { + let firstYear = part[.. { mutableSelf[keyPath: wkp] = child.line.value ?? "" + } else if let wkp = kp as? WritableKeyPath { + mutableSelf[keyPath: wkp] = child.line.value ?? "" } } } @@ -351,6 +355,8 @@ public class Header : RecordProtocol, GedcomExtensionContainer { public var place: HeaderPlace? public var copyright: String? public var lang: String? + public var characterEncoding: String? + public var file: String? public var submitter: String? public var note: NoteStructure? @@ -365,6 +371,8 @@ public class Header : RecordProtocol, GedcomExtensionContainer { "SUBM" : \Header.submitter, "COPR" : \Header.copyright, "LANG" : \Header.lang, + "CHAR" : \Header.characterEncoding, + "FILE" : \Header.file, "PLAC" : \Header.place, "NOTE" : \Header.note, "SOTE" : \Header.note, @@ -456,4 +464,3 @@ public class Header : RecordProtocol, GedcomExtensionContainer { return record } } - diff --git a/Tests/GedcomTests/GedcomImprovementsTests.swift b/Tests/GedcomTests/GedcomImprovementsTests.swift index 6da5f3e..4b8cc79 100644 --- a/Tests/GedcomTests/GedcomImprovementsTests.swift +++ b/Tests/GedcomTests/GedcomImprovementsTests.swift @@ -140,6 +140,166 @@ import Foundation #expect(ged.uri(forExtensionTag: "_FOO") == URL(string: "https://openorbit.org/gedcom/extensions/unknown/_FOO")!) 
} + @Test func testGedcom551ImportsAsGedcom7Output() throws { + let content = """ +0 HEAD +1 SOUR LegacyApp +1 GEDC +2 VERS 5.5.1 +2 FORM LINEAGE-LINKED +1 CHAR UTF-8 +1 FILE legacy.ged +0 @N1@ NOTE This is a long +1 CONC legacy note +1 CONT with another line. +0 @I1@ INDI +1 NAME Jane /Doe/ +1 NOTE @N1@ +0 TRLR +""" + let ged = try loadGedcom(content) + + #expect(ged.sourceDialect == .gedcom5(version: "5.5.1")) + #expect(ged.header.gedc.vers == "7.0") + #expect(ged.header.gedc.form == "LINEAGE-LINKED") + #expect(ged.header.characterEncoding == "UTF-8") + #expect(ged.header.file == "legacy.ged") + #expect(ged.sharedNoteRecords.count == 1) + #expect(ged.sharedNoteRecords[0].xref == "@N1@") + #expect(ged.sharedNoteRecords[0].text == "This is a longlegacy note\nwith another line.") + + let exported = ged.exportContent() + #expect(exported.contains("2 VERS 7.0\n")) + #expect(exported.contains("0 @N1@ SNOTE This is a longlegacy note\n1 CONT with another line.\n")) + #expect(!exported.contains("2 FORM LINEAGE-LINKED\n")) + #expect(!exported.contains("1 CHAR UTF-8\n")) + #expect(!exported.contains("1 FILE legacy.ged\n")) + #expect(!exported.contains("0 @N1@ NOTE")) + } + + @Test func testConcIsRejectedForGedcom7Input() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 7.0 +0 @I1@ INDI +1 NOTE First +2 CONC second +0 TRLR +""" + + do { + _ = try loadGedcom(content) + Issue.record("GEDCOM 7 input must not accept CONC") + } catch GedcomError.badRecord { + #expect(true) + } catch { + Issue.record("Unexpected error: \(error)") + } + } + + @Test func testGedcom551DateConversionToGedcom7Phrases() throws { + let content = """ +0 HEAD +1 SOUR LegacyApp +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Date /Tester/ +1 BIRT +2 DATE INT 1800 (interpreted from census) +1 DEAT +2 DATE 2 FEB 1711/12 +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let birth = try #require(individual.events.first(where: { $0.kind == .BIRT })) + 
let death = try #require(individual.events.first(where: { $0.kind == .DEAT })) + + #expect(birth.date?.date == "1800") + #expect(birth.date?.phrase == "interpreted from census") + #expect(death.date?.date == "2 FEB 1711") + #expect(death.date?.phrase == "2 FEB 1711/12") + + let exported = ged.exportContent() + #expect(exported.contains("1 BIRT\n2 DATE 1800\n3 PHRASE interpreted from census\n")) + #expect(exported.contains("1 DEAT\n2 DATE 2 FEB 1711\n3 PHRASE 2 FEB 1711/12\n")) + } + + @Test func testGedcom551InlineMultimediaLiftedToRecord() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Media /Tester/ +1 OBJE +2 FILE photos/john.jpg +3 FORM jpg +2 TITL John Smith portrait +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let link = try #require(individual.multimediaLinks.first) + let multimedia = try #require(ged.multimediaRecords.first) + let file = try #require(multimedia.files.first) + + #expect(link.xref == multimedia.xref) + #expect(file.path == "photos/john.jpg") + #expect(file.form.form == "image/jpeg") + #expect(file.title == "John Smith portrait") + + let exported = ged.exportContent() + #expect(exported.contains("1 OBJE @O1@\n")) + #expect(exported.contains("0 @O1@ OBJE\n1 FILE photos/john.jpg\n2 FORM image/jpeg\n2 TITL John Smith portrait\n")) + } + + @Test func testGedcom551InlineSourceLiftedToRecord() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Source /Tester/ +1 DEAT +2 DATE 1910 +2 SOUR Letter from Alice Smith, 13 April 1946 +3 TEXT My father passed away back in 1910. 
+0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let death = try #require(individual.events.first(where: { $0.kind == .DEAT })) + let citation = try #require(death.citations.first) + let source = try #require(ged.sourceRecords.first) + + #expect(citation.xref == source.xref) + #expect(source.title == "Letter from Alice Smith, 13 April 1946") + #expect(source.text?.text == "My father passed away back in 1910.") + + let exported = ged.exportContent() + #expect(exported.contains("2 SOUR @S1@\n")) + #expect(exported.contains("0 @S1@ SOUR\n1 TITL Letter from Alice Smith, 13 April 1946\n1 TEXT My father passed away back in 1910.\n")) + } + + @Test func testCommonLineEndingsImport() throws { + let lines = [ + "0 HEAD", + "1 GEDC", + "2 VERS 7.0", + "0 TRLR", + "" + ] + + for separator in ["\n", "\r\n", "\r"] { + let ged = try loadGedcom(lines.joined(separator: separator)) + #expect(ged.sourceDialect == .gedcom7(version: "7.0")) + #expect(ged.exportContent() == "0 HEAD\n1 GEDC\n2 VERS 7.0\n0 TRLR\n") + } + } + private func loadGedcom(_ content: String) throws -> GedcomFile { let url = FileManager.default.temporaryDirectory .appendingPathComponent(UUID().uuidString) From 432e99b8e012f729e8b01a8da8b1daa58b5df89a Mon Sep 17 00:00:00 2001 From: Mattias Holm Date: Sat, 20 Jun 2026 10:53:01 +0200 Subject: [PATCH 2/5] Structural conversions ROMN / FONE -> TRAN RELA -> ROLE OTHER + PHRASE --- Sources/Gedcom/GedcomFile.swift | 63 ++++++ .../GedcomTests/Gedcom5ConversionTests.swift | 189 ++++++++++++++++++ .../GedcomTests/GedcomImprovementsTests.swift | 123 ------------ 3 files changed, 252 insertions(+), 123 deletions(-) create mode 100644 Tests/GedcomTests/Gedcom5ConversionTests.swift diff --git a/Sources/Gedcom/GedcomFile.swift b/Sources/Gedcom/GedcomFile.swift index 12328d8..226b749 100644 --- a/Sources/Gedcom/GedcomFile.swift +++ b/Sources/Gedcom/GedcomFile.swift @@ -235,6 +235,69 @@ public class GedcomFile { 
liftGedcom5InlineMultimedia(record) } else if record.line.tag == "SOUR" && record.line.level > 0 && !isPointerValue(record.line.value) { liftGedcom5InlineSource(record) + } else if record.line.tag == "ROMN" || record.line.tag == "FONE" { + convertGedcom5NameVariationToTranslation(record) + } else if record.line.tag == "RELA" { + convertGedcom5RelationshipToRole(record) + } + } + + private func convertGedcom5NameVariationToTranslation(_ record: Record) { + let originalTag = record.line.tag + var convertedChildren: [Record] = [] + var language: String? + + for child in record.children { + if child.line.tag == "TYPE" { + language = gedcom7Language(forGedcom5NameVariation: child.line.value, tag: originalTag) + } else { + convertedChildren.append(child) + } + } + + record.line.tag = "TRAN" + record.children = convertedChildren + record.children.insert( + Record(level: record.line.level + 1, + tag: "LANG", + value: language ?? defaultLanguage(forGedcom5NameVariation: originalTag)), + at: 0 + ) + } + + private func gedcom7Language(forGedcom5NameVariation type: String?, tag: String) -> String { + guard let type else { + return defaultLanguage(forGedcom5NameVariation: tag) + } + + switch type.lowercased() { + case "hangul": + return "ko-hang" + case "kana": + return "ja-hrkt" + case "pinyin": + return "und-Latn-pinyin" + case "romaji": + return "ja-Latn" + case "wadegiles": + return "zh-Latn-wadegile" + default: + return defaultLanguage(forGedcom5NameVariation: tag) + } + } + + private func defaultLanguage(forGedcom5NameVariation tag: String) -> String { + tag == "ROMN" ? 
"und-Latn" : "und" + } + + private func convertGedcom5RelationshipToRole(_ record: Record) { + let phrase = record.line.value?.trimmingCharacters(in: .whitespacesAndNewlines) + + record.line.tag = "ROLE" + record.line.value = "OTHER" + + if let phrase, !phrase.isEmpty { + appendPhrase(phrase, to: record) } } diff --git a/Tests/GedcomTests/Gedcom5ConversionTests.swift b/Tests/GedcomTests/Gedcom5ConversionTests.swift new file mode 100644 index 0000000..53e4c65 --- /dev/null +++ b/Tests/GedcomTests/Gedcom5ConversionTests.swift @@ -0,0 +1,189 @@ +import Testing +import Foundation +@testable import Gedcom + +@Suite struct Gedcom5ConversionTests { + @Test func testGedcom551ImportsAsGedcom7Output() throws { + let content = """ +0 HEAD +1 SOUR LegacyApp +1 GEDC +2 VERS 5.5.1 +2 FORM LINEAGE-LINKED +1 CHAR UTF-8 +1 FILE legacy.ged +0 @N1@ NOTE This is a long +1 CONC legacy note +1 CONT with another line. +0 @I1@ INDI +1 NAME Jane /Doe/ +1 NOTE @N1@ +0 TRLR +""" + let ged = try loadGedcom(content) + + #expect(ged.sourceDialect == .gedcom5(version: "5.5.1")) + #expect(ged.header.gedc.vers == "7.0") + #expect(ged.header.gedc.form == "LINEAGE-LINKED") + #expect(ged.header.characterEncoding == "UTF-8") + #expect(ged.header.file == "legacy.ged") + #expect(ged.sharedNoteRecords.count == 1) + #expect(ged.sharedNoteRecords[0].xref == "@N1@") + #expect(ged.sharedNoteRecords[0].text == "This is a longlegacy note\nwith another line.") + + let exported = ged.exportContent() + #expect(exported.contains("2 VERS 7.0\n")) + #expect(exported.contains("0 @N1@ SNOTE This is a longlegacy note\n1 CONT with another line.\n")) + #expect(!exported.contains("2 FORM LINEAGE-LINKED\n")) + #expect(!exported.contains("1 CHAR UTF-8\n")) + #expect(!exported.contains("1 FILE legacy.ged\n")) + #expect(!exported.contains("0 @N1@ NOTE")) + } + + @Test func testGedcom551DateConversionToGedcom7Phrases() throws { + let content = """ +0 HEAD +1 SOUR LegacyApp +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Date 
/Tester/ +1 BIRT +2 DATE INT 1800 (interpreted from census) +1 DEAT +2 DATE 2 FEB 1711/12 +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let birth = try #require(individual.events.first(where: { $0.kind == .BIRT })) + let death = try #require(individual.events.first(where: { $0.kind == .DEAT })) + + #expect(birth.date?.date == "1800") + #expect(birth.date?.phrase == "interpreted from census") + #expect(death.date?.date == "2 FEB 1711") + #expect(death.date?.phrase == "2 FEB 1711/12") + + let exported = ged.exportContent() + #expect(exported.contains("1 BIRT\n2 DATE 1800\n3 PHRASE interpreted from census\n")) + #expect(exported.contains("1 DEAT\n2 DATE 2 FEB 1711\n3 PHRASE 2 FEB 1711/12\n")) + } + + @Test func testGedcom551InlineMultimediaLiftedToRecord() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Media /Tester/ +1 OBJE +2 FILE photos/john.jpg +3 FORM jpg +2 TITL John Smith portrait +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let link = try #require(individual.multimediaLinks.first) + let multimedia = try #require(ged.multimediaRecords.first) + let file = try #require(multimedia.files.first) + + #expect(link.xref == multimedia.xref) + #expect(file.path == "photos/john.jpg") + #expect(file.form.form == "image/jpeg") + #expect(file.title == "John Smith portrait") + + let exported = ged.exportContent() + #expect(exported.contains("1 OBJE @O1@\n")) + #expect(exported.contains("0 @O1@ OBJE\n1 FILE photos/john.jpg\n2 FORM image/jpeg\n2 TITL John Smith portrait\n")) + } + + @Test func testGedcom551InlineSourceLiftedToRecord() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Source /Tester/ +1 DEAT +2 DATE 1910 +2 SOUR Letter from Alice Smith, 13 April 1946 +3 TEXT My father passed away back in 1910. 
+0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let death = try #require(individual.events.first(where: { $0.kind == .DEAT })) + let citation = try #require(death.citations.first) + let source = try #require(ged.sourceRecords.first) + + #expect(citation.xref == source.xref) + #expect(source.title == "Letter from Alice Smith, 13 April 1946") + #expect(source.text?.text == "My father passed away back in 1910.") + + let exported = ged.exportContent() + #expect(exported.contains("2 SOUR @S1@\n")) + #expect(exported.contains("0 @S1@ SOUR\n1 TITL Letter from Alice Smith, 13 April 1946\n1 TEXT My father passed away back in 1910.\n")) + } + + @Test func testGedcom551NameVariationStructuresBecomeTranslations() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME /橘/ 逸勢 +2 ROMN /Tachibana/ no Hayanari +3 TYPE romaji +2 FONE /たちばな/ の はやなり +3 TYPE kana +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let name = try #require(individual.names.first) + + #expect(name.translations.count == 2) + #expect(name.translations[0].name == "/Tachibana/ no Hayanari") + #expect(name.translations[0].lang == "ja-Latn") + #expect(name.translations[1].name == "/たちばな/ の はやなり") + #expect(name.translations[1].lang == "ja-hrkt") + + let exported = ged.exportContent() + #expect(exported.contains("2 TRAN /Tachibana/ no Hayanari\n3 LANG ja-Latn\n")) + #expect(exported.contains("2 TRAN /たちばな/ の はやなり\n3 LANG ja-hrkt\n")) + } + + @Test func testGedcom551RelationshipStructureBecomesRolePhrase() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Associated /Person/ +1 ASSO @I2@ +2 RELA Honorary uncle +0 @I2@ INDI +1 NAME Other /Person/ +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let association = try #require(individual.associations.first) + + 
#expect(association.role?.kind == .OTHER) + #expect(association.role?.phrase == "Honorary uncle") + + let exported = ged.exportContent() + #expect(exported.contains("1 ASSO @I2@\n2 ROLE OTHER\n3 PHRASE Honorary uncle\n")) + } + + private func loadGedcom(_ content: String) throws -> GedcomFile { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent(UUID().uuidString) + .appendingPathExtension("ged") + try content.write(to: url, atomically: true, encoding: .utf8) + defer { try? FileManager.default.removeItem(at: url) } + return try GedcomFile(withFile: url) + } +} diff --git a/Tests/GedcomTests/GedcomImprovementsTests.swift b/Tests/GedcomTests/GedcomImprovementsTests.swift index 4b8cc79..e54741b 100644 --- a/Tests/GedcomTests/GedcomImprovementsTests.swift +++ b/Tests/GedcomTests/GedcomImprovementsTests.swift @@ -140,43 +140,6 @@ import Foundation #expect(ged.uri(forExtensionTag: "_FOO") == URL(string: "https://openorbit.org/gedcom/extensions/unknown/_FOO")!) } - @Test func testGedcom551ImportsAsGedcom7Output() throws { - let content = """ -0 HEAD -1 SOUR LegacyApp -1 GEDC -2 VERS 5.5.1 -2 FORM LINEAGE-LINKED -1 CHAR UTF-8 -1 FILE legacy.ged -0 @N1@ NOTE This is a long -1 CONC legacy note -1 CONT with another line. 
-0 @I1@ INDI -1 NAME Jane /Doe/ -1 NOTE @N1@ -0 TRLR -""" - let ged = try loadGedcom(content) - - #expect(ged.sourceDialect == .gedcom5(version: "5.5.1")) - #expect(ged.header.gedc.vers == "7.0") - #expect(ged.header.gedc.form == "LINEAGE-LINKED") - #expect(ged.header.characterEncoding == "UTF-8") - #expect(ged.header.file == "legacy.ged") - #expect(ged.sharedNoteRecords.count == 1) - #expect(ged.sharedNoteRecords[0].xref == "@N1@") - #expect(ged.sharedNoteRecords[0].text == "This is a longlegacy note\nwith another line.") - - let exported = ged.exportContent() - #expect(exported.contains("2 VERS 7.0\n")) - #expect(exported.contains("0 @N1@ SNOTE This is a longlegacy note\n1 CONT with another line.\n")) - #expect(!exported.contains("2 FORM LINEAGE-LINKED\n")) - #expect(!exported.contains("1 CHAR UTF-8\n")) - #expect(!exported.contains("1 FILE legacy.ged\n")) - #expect(!exported.contains("0 @N1@ NOTE")) - } - @Test func testConcIsRejectedForGedcom7Input() throws { let content = """ 0 HEAD @@ -198,92 +161,6 @@ import Foundation } } - @Test func testGedcom551DateConversionToGedcom7Phrases() throws { - let content = """ -0 HEAD -1 SOUR LegacyApp -1 GEDC -2 VERS 5.5.1 -0 @I1@ INDI -1 NAME Date /Tester/ -1 BIRT -2 DATE INT 1800 (interpreted from census) -1 DEAT -2 DATE 2 FEB 1711/12 -0 TRLR -""" - let ged = try loadGedcom(content) - let individual = try #require(ged.individualRecords.first) - let birth = try #require(individual.events.first(where: { $0.kind == .BIRT })) - let death = try #require(individual.events.first(where: { $0.kind == .DEAT })) - - #expect(birth.date?.date == "1800") - #expect(birth.date?.phrase == "interpreted from census") - #expect(death.date?.date == "2 FEB 1711") - #expect(death.date?.phrase == "2 FEB 1711/12") - - let exported = ged.exportContent() - #expect(exported.contains("1 BIRT\n2 DATE 1800\n3 PHRASE interpreted from census\n")) - #expect(exported.contains("1 DEAT\n2 DATE 2 FEB 1711\n3 PHRASE 2 FEB 1711/12\n")) - } - - @Test func 
testGedcom551InlineMultimediaLiftedToRecord() throws { - let content = """ -0 HEAD -1 GEDC -2 VERS 5.5.1 -0 @I1@ INDI -1 NAME Media /Tester/ -1 OBJE -2 FILE photos/john.jpg -3 FORM jpg -2 TITL John Smith portrait -0 TRLR -""" - let ged = try loadGedcom(content) - let individual = try #require(ged.individualRecords.first) - let link = try #require(individual.multimediaLinks.first) - let multimedia = try #require(ged.multimediaRecords.first) - let file = try #require(multimedia.files.first) - - #expect(link.xref == multimedia.xref) - #expect(file.path == "photos/john.jpg") - #expect(file.form.form == "image/jpeg") - #expect(file.title == "John Smith portrait") - - let exported = ged.exportContent() - #expect(exported.contains("1 OBJE @O1@\n")) - #expect(exported.contains("0 @O1@ OBJE\n1 FILE photos/john.jpg\n2 FORM image/jpeg\n2 TITL John Smith portrait\n")) - } - - @Test func testGedcom551InlineSourceLiftedToRecord() throws { - let content = """ -0 HEAD -1 GEDC -2 VERS 5.5.1 -0 @I1@ INDI -1 NAME Source /Tester/ -1 DEAT -2 DATE 1910 -2 SOUR Letter from Alice Smith, 13 April 1946 -3 TEXT My father passed away back in 1910. 
-0 TRLR -""" - let ged = try loadGedcom(content) - let individual = try #require(ged.individualRecords.first) - let death = try #require(individual.events.first(where: { $0.kind == .DEAT })) - let citation = try #require(death.citations.first) - let source = try #require(ged.sourceRecords.first) - - #expect(citation.xref == source.xref) - #expect(source.title == "Letter from Alice Smith, 13 April 1946") - #expect(source.text?.text == "My father passed away back in 1910.") - - let exported = ged.exportContent() - #expect(exported.contains("2 SOUR @S1@\n")) - #expect(exported.contains("0 @S1@ SOUR\n1 TITL Letter from Alice Smith, 13 April 1946\n1 TEXT My father passed away back in 1910.\n")) - } - @Test func testCommonLineEndingsImport() throws { let lines = [ "0 HEAD", From 26b5523ad1f75eb4667e57c704f149e5082b3fc4 Mon Sep 17 00:00:00 2001 From: Mattias Holm Date: Sat, 20 Jun 2026 10:57:23 +0200 Subject: [PATCH 3/5] GEDCOM 5 value conversions --- Sources/Gedcom/GedcomFile.swift | 136 +++++++++++++++++- .../GedcomTests/Gedcom5ConversionTests.swift | 73 ++++++++++ 2 files changed, 207 insertions(+), 2 deletions(-) diff --git a/Sources/Gedcom/GedcomFile.swift b/Sources/Gedcom/GedcomFile.swift index 226b749..ab5496a 100644 --- a/Sources/Gedcom/GedcomFile.swift +++ b/Sources/Gedcom/GedcomFile.swift @@ -208,7 +208,7 @@ public class GedcomFile { } } - private func convertGedcom5RecordToGedcom7(_ record: Record) { + private func convertGedcom5RecordToGedcom7(_ record: Record, parentTag: String?) { var convertedChildren: [Record] = [] for child in record.children { @@ -222,7 +222,7 @@ public class GedcomFile { record.line.value?.append(child.line.value ?? 
"") } } else { - convertGedcom5RecordToGedcom7(child) + convertGedcom5RecordToGedcom7(child, parentTag: record.line.tag) convertedChildren.append(child) } } @@ -239,6 +239,90 @@ public class GedcomFile { convertGedcom5NameVariationToTranslation(record) } else if record.line.tag == "RELA" { convertGedcom5RelationshipToRole(record) + } else { + convertGedcom5PayloadValueToGedcom7(record, parentTag: parentTag) + } + } + + private func convertGedcom5RecordToGedcom7(_ record: Record) { + convertGedcom5RecordToGedcom7(record, parentTag: nil) + } + + private func convertGedcom5PayloadValueToGedcom7(_ record: Record, parentTag: String?) { + switch record.line.tag { + case "AGE": + convertGedcom5AgeValue(record) + case "SEX": + convertGedcom5SexValue(record) + case "STAT": + convertGedcom5StatusValue(record) + case "PEDI": + record.line.value = record.line.value?.uppercased() + case "RESN": + record.line.value = record.line.value? + .components(separatedBy: ",") + .map { $0.trimmingCharacters(in: .whitespacesAndNewlines).uppercased() } + .joined(separator: ", ") + case "TYPE" where parentTag == "NAME": + record.line.value = record.line.value?.uppercased() + default: + break + } + } + + private func convertGedcom5AgeValue(_ record: Record) { + guard let value = record.line.value?.trimmingCharacters(in: .whitespacesAndNewlines), + !value.isEmpty else { + return + } + + switch value.uppercased() { + case "CHILD": + record.line.value = "< 8y" + appendPhrase("Child", to: record) + case "INFANT": + record.line.value = "< 1y" + appendPhrase("Infant", to: record) + case "STILLBORN": + record.line.value = "0y" + appendPhrase("Stillborn", to: record) + default: + break + } + } + + private func convertGedcom5SexValue(_ record: Record) { + guard let value = record.line.value?.trimmingCharacters(in: .whitespacesAndNewlines), + let first = value.first else { + record.line.value = "U" + return + } + + switch String(first).uppercased() { + case "M": + record.line.value = "M" + case "F": + 
record.line.value = "F" + case "X": + record.line.value = "X" + default: + record.line.value = "U" + } + } + + private func convertGedcom5StatusValue(_ record: Record) { + guard let value = record.line.value?.trimmingCharacters(in: .whitespacesAndNewlines), + !value.isEmpty else { + return + } + + switch value.uppercased() { + case "DNS/CAN": + record.line.value = "DNS_CAN" + case "PRE-1970": + record.line.value = "PRE_1970" + default: + record.line.value = value.uppercased() } } @@ -435,6 +519,11 @@ public class GedcomFile { return } + if let calendar = convertGedcom5CalendarEscape(value) { + record.line.value = calendar + return + } + if let interpreted = convertGedcom5InterpretedDate(value) { record.line.value = interpreted.date appendPhrase(interpreted.phrase, to: record) @@ -444,7 +533,22 @@ public class GedcomFile { if let dualYear = convertGedcom5DualYearDate(value) { record.line.value = dualYear.date appendPhrase(dualYear.phrase, to: record) + return } + + if let range = convertGedcom5DateRange(value) { + record.line.value = range + } + } + + private func convertGedcom5CalendarEscape(_ value: String) -> String? { + if value.hasPrefix("@#ROMAN@") { + return value.replacingOccurrences(of: "@#ROMAN@", with: "_ROMAN", options: [.anchored]) + } + if value.hasPrefix("@#UNKNOWN@") { + return value.replacingOccurrences(of: "@#UNKNOWN@", with: "_UNKNOWN", options: [.anchored]) + } + return nil } private func convertGedcom5InterpretedDate(_ value: String) -> (date: String, phrase: String)? { @@ -501,6 +605,34 @@ public class GedcomFile { return (convertedParts.joined(separator: " "), value) } + private func convertGedcom5DateRange(_ value: String) -> String? { + let prefix = "BET " + guard value.hasPrefix(prefix), + let separatorRange = value.range(of: " AND ") else { + return nil + } + + let start = value[value.index(value.startIndex, offsetBy: prefix.count).. 
endYear else { + return nil + } + + return "BET \(end) AND \(start)" + } + + private func trailingYear(in value: String) -> Int? { + value + .split(separator: " ") + .last + .flatMap { Int($0) } + } + private func appendPhrase(_ phrase: String, to record: Record) { guard !phrase.isEmpty else { return diff --git a/Tests/GedcomTests/Gedcom5ConversionTests.swift b/Tests/GedcomTests/Gedcom5ConversionTests.swift index 53e4c65..0e18e86 100644 --- a/Tests/GedcomTests/Gedcom5ConversionTests.swift +++ b/Tests/GedcomTests/Gedcom5ConversionTests.swift @@ -178,6 +178,79 @@ import Foundation #expect(exported.contains("1 ASSO @I2@\n2 ROLE OTHER\n3 PHRASE Honorary uncle\n")) } + @Test func testGedcom551SimplePayloadValuesNormalizeBeforeTypedParsing() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 RESN confidential, privacy +1 NAME Value /Tester/ +2 TYPE birth +1 SEX Male +1 BIRT +2 AGE CHILD +1 FAMC @F1@ +2 PEDI birth +2 STAT challenged +1 BAPL +2 STAT DNS/CAN +3 DATE 1 JAN 1900 +0 @F1@ FAM +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let name = try #require(individual.names.first) + let birth = try #require(individual.events.first(where: { $0.kind == .BIRT })) + let familyChild = try #require(individual.childOfFamilies.first) + let ordinance = try #require(individual.ldsDetails.first(where: { $0.kind == .BAPL })) + + #expect(individual.restrictions == [.CONFIDENTIAL, .PRIVACY]) + #expect(individual.sex == .male) + #expect(name.type?.kind == .BIRTH) + #expect(birth.age?.age == "< 8y") + #expect(birth.age?.phrase == "Child") + #expect(familyChild.pedigree?.kind == .BIRTH) + #expect(familyChild.status?.kind == .CHALLENGED) + #expect(ordinance.status?.kind == .DNS_CAN) + + let exported = ged.exportContent() + #expect(exported.contains("1 RESN CONFIDENTIAL, PRIVACY\n")) + #expect(exported.contains("2 TYPE BIRTH\n")) + #expect(exported.contains("1 SEX M\n")) + #expect(exported.contains("2 
AGE < 8y\n3 PHRASE Child\n")) + #expect(exported.contains("2 PEDI BIRTH\n2 STAT CHALLENGED\n")) + #expect(exported.contains("2 STAT DNS_CAN\n3 DATE 1 JAN 1900\n")) + } + + @Test func testGedcom551DatePayloadValuesNormalizeBeforeTypedParsing() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Date /Payload/ +1 RESI +2 DATE BET 1900 AND 1880 +1 EVEN +2 TYPE Roman date +2 DATE @#ROMAN@ 71 +0 TRLR +""" + let ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + let residence = try #require(individual.attributes.first(where: { $0.kind == .RESI })) + let event = try #require(individual.events.first(where: { $0.kind == .EVEN })) + + #expect(residence.date?.date == "BET 1880 AND 1900") + #expect(event.date?.date == "_ROMAN 71") + + let exported = ged.exportContent() + #expect(exported.contains("2 DATE BET 1880 AND 1900\n")) + #expect(exported.contains("2 DATE _ROMAN 71\n")) + } + private func loadGedcom(_ content: String) throws -> GedcomFile { let url = FileManager.default.temporaryDirectory .appendingPathComponent(UUID().uuidString) From 72f5e2a28368cfbb221dd7e7daa8bc8c8448f02e Mon Sep 17 00:00:00 2001 From: Mattias Holm Date: Sat, 20 Jun 2026 13:40:14 +0200 Subject: [PATCH 4/5] More conversions AFN, RFN, RIN -> EXID WAC -> INIL SUBN -> _SUBN --- Sources/Gedcom/GedcomFile.swift | 23 ++++++++++- .../GedcomTests/Gedcom5ConversionTests.swift | 38 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/Sources/Gedcom/GedcomFile.swift b/Sources/Gedcom/GedcomFile.swift index ab5496a..4059ce2 100644 --- a/Sources/Gedcom/GedcomFile.swift +++ b/Sources/Gedcom/GedcomFile.swift @@ -233,17 +233,38 @@ public class GedcomFile { convertGedcom5DateRecordToGedcom7(record) } else if record.line.tag == "OBJE" && record.line.level > 0 && !isPointerValue(record.line.value) { liftGedcom5InlineMultimedia(record) - } else if record.line.tag == "SOUR" && record.line.level > 0 && 
!isPointerValue(record.line.value) { + } else if record.line.tag == "SOUR" && parentTag != "HEAD" && record.line.level > 0 && !isPointerValue(record.line.value) { liftGedcom5InlineSource(record) } else if record.line.tag == "ROMN" || record.line.tag == "FONE" { convertGedcom5NameVariationToTranslation(record) } else if record.line.tag == "RELA" { convertGedcom5RelationshipToRole(record) + } else if record.line.tag == "AFN" || record.line.tag == "RFN" || record.line.tag == "RIN" { + convertGedcom5IdentifierToExternalIdentifier(record) + } else if record.line.tag == "WAC" { + record.line.tag = "INIL" + convertGedcom5PayloadValueToGedcom7(record, parentTag: parentTag) + } else if record.line.tag == "SUBN" { + record.line.tag = "_SUBN" } else { convertGedcom5PayloadValueToGedcom7(record, parentTag: parentTag) } } + private func convertGedcom5IdentifierToExternalIdentifier(_ record: Record) { + let originalTag = record.line.tag + record.line.tag = "EXID" + + if record.children.first(where: { $0.line.tag == "TYPE" }) == nil { + record.children.insert( + Record(level: record.line.level + 1, + tag: "TYPE", + value: "GEDCOM 5.5.1 \(originalTag)"), + at: 0 + ) + } + } + private func convertGedcom5RecordToGedcom7(_ record: Record) { convertGedcom5RecordToGedcom7(record, parentTag: nil) } diff --git a/Tests/GedcomTests/Gedcom5ConversionTests.swift b/Tests/GedcomTests/Gedcom5ConversionTests.swift index 0e18e86..0b4ce8b 100644 --- a/Tests/GedcomTests/Gedcom5ConversionTests.swift +++ b/Tests/GedcomTests/Gedcom5ConversionTests.swift @@ -251,6 +251,44 @@ import Foundation #expect(exported.contains("2 DATE _ROMAN 71\n")) } + @Test func testGedcom551LegacyIdentifiersWacAndSubmissionPreserveAsGedcom7() throws { + let content = """ +0 HEAD +1 SOUR LegacyApp +1 GEDC +2 VERS 5.5.1 +0 @I1@ INDI +1 NAME Legacy /Tester/ +1 AFN AFN-123 +1 RFN RFN-456 +1 RIN RIN-789 +1 WAC +2 STAT PRE-1970 +3 DATE 2 FEB 1900 +0 @SUBN1@ SUBN +1 SUBM @U1@ +0 @U1@ SUBM +1 NAME Submitter /One/ +0 TRLR +""" + let 
ged = try loadGedcom(content) + let individual = try #require(ged.individualRecords.first) + + #expect(individual.identifiers.count == 3) + #expect(individual.ldsDetails.first?.kind == .INIL) + #expect(individual.ldsDetails.first?.status?.kind == .PRE_1970) + #expect(ged.extensionRecords.first?.tag == "_SUBN") + #expect(ged.extensionRecords.first?.xref == "@SUBN1@") + + let exported = ged.exportContent() + #expect(exported.contains("1 EXID AFN-123\n2 TYPE GEDCOM 5.5.1 AFN\n")) + #expect(exported.contains("1 EXID RFN-456\n2 TYPE GEDCOM 5.5.1 RFN\n")) + #expect(exported.contains("1 EXID RIN-789\n2 TYPE GEDCOM 5.5.1 RIN\n")) + #expect(exported.contains("1 INIL\n2 STAT PRE_1970\n3 DATE 2 FEB 1900\n")) + #expect(exported.contains("2 TAG _SUBN https://openorbit.org/gedcom/extensions/vendor-legacyapp/_SUBN\n")) + #expect(exported.contains("0 @SUBN1@ _SUBN\n1 SUBM @U1@\n")) + } + private func loadGedcom(_ content: String) throws -> GedcomFile { let url = FileManager.default.temporaryDirectory .appendingPathComponent(UUID().uuidString) From 60a63a369e554c5a83abbfbdb8aebf478573ea31 Mon Sep 17 00:00:00 2001 From: Mattias Holm Date: Sat, 20 Jun 2026 13:51:50 +0200 Subject: [PATCH 5/5] Encoding support --- Sources/Gedcom/Error.swift | 2 + Sources/Gedcom/GedcomFile.swift | 108 +++++++++++++++--- .../GedcomTests/Gedcom5ConversionTests.swift | 84 +++++++++++++- 3 files changed, 177 insertions(+), 17 deletions(-) diff --git a/Sources/Gedcom/Error.swift b/Sources/Gedcom/Error.swift index 8da9d02..726cbf2 100644 --- a/Sources/Gedcom/Error.swift +++ b/Sources/Gedcom/Error.swift @@ -27,4 +27,6 @@ public enum GedcomError : Error { case badNamePiece case badURL case badSchema + case badEncoding + case unsupportedEncoding(String) } diff --git a/Sources/Gedcom/GedcomFile.swift b/Sources/Gedcom/GedcomFile.swift index 4059ce2..edd7952 100644 --- a/Sources/Gedcom/GedcomFile.swift +++ b/Sources/Gedcom/GedcomFile.swift @@ -35,6 +35,8 @@ public class GedcomFile { public var submitterRecords: 
[Submitter] = [] public var extensionRecords: [GedcomExtensionNode] = [] public var sourceDialect: GedcomDialect = .unknown(version: nil) + public private(set) var sourceEncoding: String.Encoding? + public private(set) var sourceEncodingLabel: String? public var exportDialect: GedcomDialect { .gedcom7(version: "7.0") } public var familyRecordsMap: [String: Family] = [:] @@ -49,7 +51,7 @@ public class GedcomFile { private var generatedSourceRecordIndex = 1 private var liftedGedcom5Records: [Record] = [] - public init(withArchive path: URL, encoding: String.Encoding = .utf8) throws { + public init(withArchive path: URL, encoding: String.Encoding? = nil) throws { self.url = path self.archive = try Archive(url: path, accessMode: .read, pathEncoding: nil) @@ -65,27 +67,17 @@ public class GedcomFile { self.data!.append(data) } - if data!.starts(with: [0xef, 0xbb, 0xbf]) { - // File starts with a BOM, drop it - data!.removeFirst(3) - } - - try parse(encoding: encoding) + try parse(encoding: resolveImportEncoding(preferred: encoding)) try prepareRecordsForBuild() try build() } - public init(withFile path: URL, encoding: String.Encoding = .utf8) throws { + public init(withFile path: URL, encoding: String.Encoding? = nil) throws { self.url = path self.archive = nil self.data = try Data(contentsOf: path) - if data!.starts(with: [0xef, 0xbb, 0xbf]) { - // File starts with a BOM, drop it - data!.removeFirst(3) - } - - try parse(encoding: encoding) + try parse(encoding: resolveImportEncoding(preferred: encoding)) try prepareRecordsForBuild() try build() } @@ -99,8 +91,94 @@ public class GedcomFile { } return String(data: data, encoding: encoding) } + + private func resolveImportEncoding(preferred encoding: String.Encoding?) 
throws -> String.Encoding { + if let encoding { + removeUTF8ByteOrderMarkIfNeeded(for: encoding) + sourceEncoding = encoding + sourceEncodingLabel = "explicit" + return encoding + } + + guard let data else { + throw GedcomError.badEncoding + } + + if data.starts(with: [0xef, 0xbb, 0xbf]) { + self.data?.removeFirst(3) + sourceEncoding = .utf8 + sourceEncodingLabel = "UTF-8" + return .utf8 + } + + if data.starts(with: [0xff, 0xfe]) || data.starts(with: [0xfe, 0xff]) { + sourceEncoding = .utf16 + sourceEncodingLabel = "UNICODE" + return .utf16 + } + + if let characterEncoding = declaredCharacterEncoding(in: data) { + guard let encoding = swiftEncoding(forGedcomCharacterEncoding: characterEncoding) else { + throw GedcomError.unsupportedEncoding(characterEncoding) + } + sourceEncoding = encoding + sourceEncodingLabel = characterEncoding + return encoding + } + + sourceEncoding = .utf8 + sourceEncodingLabel = "UTF-8" + return .utf8 + } + + private func removeUTF8ByteOrderMarkIfNeeded(for encoding: String.Encoding) { + guard encoding == .utf8, data?.starts(with: [0xef, 0xbb, 0xbf]) == true else { + return + } + data?.removeFirst(3) + } + + private func declaredCharacterEncoding(in data: Data) -> String? { + let prefix = data.prefix(8192) + guard let headerText = String(data: prefix, encoding: .isoLatin1) else { + return nil + } + + for line in headerText.split(whereSeparator: { $0 == "\n" || $0 == "\r" }) { + let parts = line.split(separator: " ", maxSplits: 2, omittingEmptySubsequences: true) + guard parts.count >= 3, parts[0] == "1", parts[1] == "CHAR" else { + continue + } + return parts[2].trimmingCharacters(in: .whitespacesAndNewlines).uppercased() + } + + return nil + } + + private func swiftEncoding(forGedcomCharacterEncoding encoding: String) -> String.Encoding? 
{ + switch encoding + .trimmingCharacters(in: .whitespacesAndNewlines) + .uppercased() + .replacingOccurrences(of: "_", with: "-") { + case "UTF-8", "UTF8": + return .utf8 + case "UNICODE", "UTF-16", "UTF16": + return .utf16 + case "ASCII", "US-ASCII": + return .ascii + case "ANSI", "WINDOWS-1252", "CP1252": + return .windowsCP1252 + case "MACROMAN", "MAC-ROMAN", "MACOSROMAN": + return .macOSRoman + default: + return nil + } + } + func parse(encoding: String.Encoding) throws { - let gedcom = dataAsString(encoding: encoding)! + guard let gedcom = dataAsString(encoding: encoding) else { + throw GedcomError.badEncoding + } var recordStack: [Record] = [] var errorOnLine: Int? diff --git a/Tests/GedcomTests/Gedcom5ConversionTests.swift b/Tests/GedcomTests/Gedcom5ConversionTests.swift index 0b4ce8b..09e9744 100644 --- a/Tests/GedcomTests/Gedcom5ConversionTests.swift +++ b/Tests/GedcomTests/Gedcom5ConversionTests.swift @@ -69,6 +69,81 @@ import Foundation #expect(exported.contains("1 DEAT\n2 DATE 2 FEB 1711\n3 PHRASE 2 FEB 1711/12\n")) } + @Test func testGedcom551AutoDetectsAnsiEncoding() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +1 CHAR ANSI +0 @I1@ INDI +1 NAME André /Müller/ +0 TRLR +""" + let data = try #require(content.data(using: .windowsCP1252)) + let ged = try loadGedcom(data) + + #expect(ged.sourceEncoding == .windowsCP1252) + #expect(ged.sourceEncodingLabel == "ANSI") + #expect(ged.individualRecords.first?.names.first?.name == "André /Müller/") + } + + @Test func testGedcom551AutoDetectsUnicodeEncoding() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +1 CHAR UNICODE +0 @I1@ INDI +1 NAME Unicode /Tester/ +0 TRLR +""" + let data = try #require(content.data(using: .utf16)) + let ged = try loadGedcom(data) + + #expect(ged.sourceEncoding == .utf16) + #expect(ged.sourceEncodingLabel == "UNICODE") + #expect(ged.individualRecords.first?.names.first?.name == "Unicode /Tester/") + } + + @Test func 
testGedcom551ExplicitEncodingOverrideIsStillSupported() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +1 CHAR UTF-8 +0 @I1@ INDI +1 NAME André /Müller/ +0 TRLR +""" + let data = try #require(content.data(using: .windowsCP1252)) + let ged = try loadGedcom(data, encoding: .windowsCP1252) + + #expect(ged.sourceEncoding == .windowsCP1252) + #expect(ged.sourceEncodingLabel == "explicit") + #expect(ged.individualRecords.first?.names.first?.name == "André /Müller/") + } + + @Test func testGedcom551AnselEncodingIsReportedUnsupported() throws { + let content = """ +0 HEAD +1 GEDC +2 VERS 5.5.1 +1 CHAR ANSEL +0 @I1@ INDI +1 NAME Ansel /Tester/ +0 TRLR +""" + + do { + _ = try loadGedcom(content) + #expect(Bool(false), "ANSEL should not be reported as supported without a portable decoder") + } catch GedcomError.unsupportedEncoding(let encoding) { + #expect(encoding == "ANSEL") + } catch { + throw error + } + } + @Test func testGedcom551InlineMultimediaLiftedToRecord() throws { let content = """ 0 HEAD @@ -290,11 +365,16 @@ import Foundation } private func loadGedcom(_ content: String) throws -> GedcomFile { + let data = try #require(content.data(using: .utf8)) + return try loadGedcom(data) + } + + private func loadGedcom(_ data: Data, encoding: String.Encoding? = nil) throws -> GedcomFile { let url = FileManager.default.temporaryDirectory .appendingPathComponent(UUID().uuidString) .appendingPathExtension("ged") - try content.write(to: url, atomically: true, encoding: .utf8) + try data.write(to: url) defer { try? FileManager.default.removeItem(at: url) } - return try GedcomFile(withFile: url) + return try GedcomFile(withFile: url, encoding: encoding) } }