sign · AmitMY · May 20, 2026 · May 20, 2026 · cursor · May 20, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -19,12 +19,13 @@ RUN pip install --no-cache-dir ".[web]"
 # Download the wordnet data and initialize the database
 # CILI is for the Collaborative Interlingual Index
 # ODENET is for the German WordNet (linked to CILI)
-# TODO: this should be done in a separate volume
 RUN python -m wn download omw:1.4 cili odenet:1.4
 
-# Load data extensions
+# Load data extensions and merge them into their base lexicons so the
+# Open Multilingual Wordnet ends up with one lexicon per language.
 COPY extensions/wikidata-lexemes/output ./extensions/wikidata-lexemes/output
-RUN python -c "import wn; import sys; [wn.add(f) for f in sys.argv[1:]]" extensions/wikidata-lexemes/output/*.xml
+COPY extensions/wikidata-lexemes/merge_extension.py ./extensions/wikidata-lexemes/merge_extension.py
+RUN python extensions/wikidata-lexemes/merge_extension.py extensions/wikidata-lexemes/output/*.xml
 
 # Run ANALYZE so SQLite has query planner statistics baked into the image
 RUN python -c "from wn._db import connect; c = connect(); c.execute('ANALYZE')"

diff --git a/docs/api/wn.constants.rst b/docs/api/wn.constants.rst
@@ -254,6 +254,12 @@ Parts of Speech
    - ``p`` -- Adposition
    - ``x`` -- Other
    - ``u`` -- Unknown
+   - ``h`` -- Pronoun
+   - ``d`` -- Determiner
+   - ``m`` -- Numeral
+   - ``i`` -- Interjection
+   - ``q`` -- Interrogative
+   - ``y`` -- Particle
 
 .. autodata:: NOUN
 .. autodata:: VERB
@@ -280,6 +286,35 @@ Parts of Speech
 
 .. autodata:: OTHER
 .. autodata:: UNKNOWN
+.. autodata:: PRONOUN
+.. data:: PRON
+
+   Alias of :py:data:`PRONOUN`
+
+.. autodata:: DETERMINER
+.. data:: DET
+
+   Alias of :py:data:`DETERMINER`
+
+.. autodata:: NUMERAL
+.. data:: NUM
+
+   Alias of :py:data:`NUMERAL`
+
+.. autodata:: INTERJECTION
+.. data:: INTJ
+
+   Alias of :py:data:`INTERJECTION`
+
+.. autodata:: INTERROGATIVE
+.. data:: INTRG
+
+   Alias of :py:data:`INTERROGATIVE`
+
+.. autodata:: PARTICLE
+.. data:: PART
+
+   Alias of :py:data:`PARTICLE`
 
 
 Adjective Positions

diff --git a/extensions/wikidata-lexemes/README.md b/extensions/wikidata-lexemes/README.md
@@ -40,8 +40,22 @@ The script generates ~130 language-specific XML files in Global WordNet LMF form
 - etc.
 
 Each file contains lexical entries with:
-- Lemma and part of speech
+- Lemma and part of speech (short WN-LMF codes — `h` pronoun, `d` determiner, `m` numeral, `i` interjection, `q` interrogative, `y` particle, plus the existing `n/v/a/r/s/t/c/p/x`)
 - Sense definitions and glosses
 - Usage examples (where available)
 - Sense relations (synonyms, antonyms, hypernyms, hyponyms)
 - "Interlingual index" like links to English senses
+
+## Loading into the database
+
+`wn.add(file.xml)` stores an extension as a separate lexicon. To get one
+lexicon per language instead, use the bundled merge utility:
+
+```bash
+python merge_extension.py output/*.xml
+```
+
+It calls `wn.add` and then rewrites the database so the extension's
+entries, senses and synsets belong to the base lexicon (e.g. `omw-en:1.4`),
+and deletes the now-empty extension lexicon row. The base lexicon must
+already be in the database; otherwise the extension is skipped. See
+[goodmami/wn#304](https://github.com/goodmami/wn/issues/304) for context.
diff --git a/extensions/wikidata-lexemes/_pos_map.py b/extensions/wikidata-lexemes/_pos_map.py
@@ -0,0 +1,47 @@
+"""Map verbose Wikidata POS labels to short WN-LMF POS codes."""
+from wn.constants import (
+    ADP,
+    ADV,
+    CONJ,
+    DET,
+    INTERROGATIVE,
+    INTJ,
+    NUM,
+    OTHER,
+    PART,
+    PRON,
+)
+
+# Verbose Wikidata POS labels, grouped by the short WN-LMF code each group
+# collapses to. The groups are flattened into POS_MAP in the order listed.
+_LABELS_BY_CODE = (
+    (
+        PRON,
+        (
+            "pronoun",
+            "personal pronoun",
+            "possessive determiner",
+            "reflexive personal pronoun",
+            "reflexive pronoun",
+            "reciprocal pronoun",
+            "interrogative pronoun",
+            "indefinite pronoun",
+            "relative pronoun",
+            "demonstrative pronoun",
+            "definite pronoun",
+            "pronominal locution",
+        ),
+    ),
+    (
+        DET,
+        (
+            "determiner",
+            "definite article",
+            "indefinite article",
+            "demonstrative determiner",
+        ),
+    ),
+    (
+        ADV,
+        (
+            "adverbial phrase",
+            "adverbial locution",
+            "adverbial particle",
+            "interrogative adverb",
+            "prepositional adverb",
+        ),
+    ),
+    (ADP, ("preposition", "prepositional locution")),
+    (CONJ, ("conjunction",)),
+    (NUM, ("numeral",)),
+    (INTJ, ("interjection",)),
+    (INTERROGATIVE, ("interrogative word", "interrogative expression")),
+    (PART, ("grammatical particle", "infinitive marker")),
+    (OTHER, ("acronym",)),
+)
+
+# Lookup table: verbose Wikidata POS label -> short WN-LMF POS code.
+POS_MAP: dict[str, str] = {
+    label: code for code, labels in _LABELS_BY_CODE for label in labels
+}
diff --git a/extensions/wikidata-lexemes/create_extensions.py b/extensions/wikidata-lexemes/create_extensions.py
@@ -6,8 +6,11 @@
 
 import ijson
 import requests
+from _pos_map import POS_MAP
 from tqdm import tqdm
 
+from wn.constants import OTHER
+
 DATA_PATH = Path(__file__).parent / "latest-lexemes.json.bz2"
 EXTENSIONS_DIR = Path(__file__).parent / "output"
 EXTRAS_DIR = Path(__file__).parent / "extras"
@@ -72,6 +75,7 @@
     "toponym",
 }
 
+
 SENSE_RELATIONS = {
     "P5973": "similar",  # synonym
     "P5974": "antonym",  # antonym
@@ -291,6 +295,7 @@ def build_xml_entry(
     if not pos_q:
         return None
     pos_name = get_label(pos_q)
+    pos_code = POS_MAP.get(pos_name, OTHER)
 
     senses = lexeme.get("senses", [])
     if not senses:
@@ -322,7 +327,9 @@ def build_xml_entry(
         sense_content += "      </Sense>"
         sense_entries.append(sense_content)
 
-        synset_content = f'    <Synset id="{synset_id}"{ili_attr}>\n'
+        synset_content = (
+            f'    <Synset id="{synset_id}"{ili_attr} partOfSpeech="{pos_code}">\n'
+        )
         synset_content += f'      <Definition>{escape_xml(gloss)}</Definition>\n'
         synset_content += "    </Synset>"
         synset_entries.append(synset_content)
@@ -333,7 +340,7 @@ def build_xml_entry(
     entry_xml = (
         f'    <LexicalEntry id="{lexeme_id}">\n'
         f'      <Lemma writtenForm="{escape_xml(lemma)}"'
-        f' partOfSpeech="{escape_xml(pos_name)}"/>\n'
+        f' partOfSpeech="{pos_code}"/>\n'
         + "\n".join(sense_entries)
         + "\n    </LexicalEntry>"
     )

diff --git a/extensions/wikidata-lexemes/merge_extension.py b/extensions/wikidata-lexemes/merge_extension.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+"""Merge a WN-LMF lexicon extension into its base lexicon.
+
+`wn.add()` stores an extension as a separate lexicon row, which means
+queries against the base lexicon won't see the extension's entries.
+This script loads the extension normally and then rewrites the
+``lexicon_rowid`` columns of every content table so the extension's
+rows appear under the base lexicon — leaving one lexicon per language
+in the database.
+
+See https://github.com/goodmami/wn/issues/304 for the original motivation.
+
+Usage::
+
+    python merge_extension.py path/to/extension.xml [more.xml ...]
+"""
+from __future__ import annotations
+
+import logging
+import re
+import sys
+from contextlib import contextmanager
+from pathlib import Path
+
+import wn
+from wn import lmf
+from wn._db import connect
+from wn._util import format_lexicon_specifier
+
+log = logging.getLogger("wn")
+
+# Matches an opening ``<LexicalEntry ...>`` tag (as bytes) and captures the
+# value of an ``id="..."`` attribute inside it.
+_FIRST_ENTRY_ID_RE = re.compile(rb'<LexicalEntry\b[^>]*\bid="([^"]+)"')
+
+# Tables with a ``lexicon_rowid`` column. Keep in sync with wn/schema.sql.
+_LEXICON_OWNED_TABLES = (
+    "entries",
+    "forms",
+    "pronunciations",
+    "tags",
+    "synsets",
+    "synset_relations",
+    "definitions",
+    "synset_examples",
+    "senses",
+    "sense_relations",
+    "sense_synset_relations",
+    "sense_examples",
+    "counts",
+    "syntactic_behaviours",
+)
+
+# Owned tables that are left out of the temporary-index pass.
+_SKIP_TEMP_INDEX = frozenset({"forms", "syntactic_behaviours"})
+
+# Tables in _LEXICON_OWNED_TABLES that lack an index on lexicon_rowid in
+# the shipped schema. We create a temporary index over the bulk merge so
+# each UPDATE is O(rows-to-update) instead of O(table-size). Derived from
+# the owned list (same order) so the two cannot drift apart.
+_UNINDEXED_TABLES = tuple(
+    name for name in _LEXICON_OWNED_TABLES if name not in _SKIP_TEMP_INDEX
+)
+
+
+def _get_lexicon_rowid(cur, specifier: str) -> int | None:
+    """Return the rowid of the lexicon whose specifier is *specifier*.
+
+    Runs one parameterized SELECT against the ``lexicons`` table through
+    *cur* and returns None when no row matches.
+    """
+    query = "SELECT rowid FROM lexicons WHERE specifier = ?"
+    found = cur.execute(query, (specifier,)).fetchone()
+    if not found:
+        return None
+    return found[0]
+
+
+def _first_entry_id(xml_path: Path) -> str | None:
+    """Return the id captured from the first LexicalEntry tag in *xml_path*.
+
+    The file is read in full as bytes and searched with
+    ``_FIRST_ENTRY_ID_RE``; the captured value is decoded as UTF-8.
+    Returns None when the pattern does not match anywhere in the file.
+    """
+    with open(xml_path, "rb") as stream:
+        payload = stream.read()
+    found = _FIRST_ENTRY_ID_RE.search(payload)
+    if found is None:
+        return None
+    return found.group(1).decode("utf-8")
+
+
+def _is_already_merged(cur, xml_path: Path, base_rowid: int) -> bool:
+    """Tell whether the file's first entry already sits under the base lexicon.
+
+    Looks up the first LexicalEntry id found in *xml_path* and checks the
+    ``entries`` table for a row with that id owned by *base_rowid*.  A
+    file with no detectable entry id is reported as not merged.
+    """
+    first_id = _first_entry_id(xml_path)
+    if first_id is None:
+        return False
+    hit = cur.execute(
+        "SELECT 1 FROM entries WHERE id = ? AND lexicon_rowid = ?",
+        (first_id, base_rowid),
+    ).fetchone()
+    return hit is not None
+
+
+def merge_extension(xml_path: Path) -> None:
+    """Add *xml_path* and absorb it into its base lexicon.
+
+    Only the first lexicon reported by ``lmf.scan_lexicons`` is inspected.
+    If it carries no ``extends`` info the file is loaded with ``wn.add``
+    and left untouched.  Otherwise the extension is added, every row it
+    owns in ``_LEXICON_OWNED_TABLES`` is re-pointed at the base lexicon,
+    and the extension's rows in ``lexicon_extensions``,
+    ``lexicon_dependencies`` and ``lexicons`` are deleted.
+
+    The file is skipped (with a log message) when it contains no
+    lexicons, when the base lexicon is not in the database, or when
+    ``_is_already_merged`` reports an earlier merge.
+
+    Raises:
+        RuntimeError: if the extension's lexicon row cannot be found
+            after ``wn.add`` returns.
+    """
+    infos = lmf.scan_lexicons(xml_path)
+    if not infos:
+        log.warning("%s: no lexicons found, skipping", xml_path)
+        return
+
+    # NOTE(review): only the first lexicon in the file is considered for
+    # merging; presumably each extension file holds exactly one -- confirm.
+    info = infos[0]
+    extends = info.get("extends")
+    if not extends:
+        log.info("%s: not a lexicon extension, loading as-is", xml_path)
+        wn.add(xml_path, progress_handler=None)
+        return
+
+    ext_spec = format_lexicon_specifier(info["id"], info["version"])
+    base_spec = format_lexicon_specifier(extends["id"], extends["version"])
+
+    conn = connect()
+    cur = conn.cursor()
+
+    base_rowid = _get_lexicon_rowid(cur, base_spec)
+    if base_rowid is None:
+        log.info(
+            "skipping %s: base lexicon %s is not in the database",
+            ext_spec,
+            base_spec,
+        )
+        return
+
+    # A finished merge deletes the extension's lexicon row (see the end of
+    # this function), so an earlier merge is detected by finding the
+    # file's first entry under the base lexicon instead.
+    if _is_already_merged(cur, xml_path, base_rowid):
+        log.info("%s already merged into %s", ext_spec, base_spec)
+        return
+
+    wn.add(xml_path, progress_handler=None)
+
+    ext_rowid = _get_lexicon_rowid(cur, ext_spec)
+    if ext_rowid is None:
+        raise RuntimeError(f"failed to add extension {ext_spec}")
+
+    # Re-own the rows and remove the extension's bookkeeping as one unit.
+    # NOTE(review): presumably ``conn`` is a sqlite3 connection, whose
+    # context manager commits on success and rolls back on error -- confirm.
+    with conn:
+        for table in _LEXICON_OWNED_TABLES:
+            # Table names come from the module-level constant, never from
+            # input, so interpolating them into the statement is safe.
+            cur.execute(
+                f"UPDATE {table} SET lexicon_rowid = ? WHERE lexicon_rowid = ?",
+                (base_rowid, ext_rowid),
+            )
+        cur.execute(
+            "DELETE FROM lexicon_extensions WHERE extension_rowid = ?",
+            (ext_rowid,),
+        )
+        cur.execute(
+            "DELETE FROM lexicon_dependencies WHERE dependent_rowid = ?",
+            (ext_rowid,),
+        )
+        cur.execute("DELETE FROM lexicons WHERE rowid = ?", (ext_rowid,))
+
+    log.info("Merged %s into %s", ext_spec, base_spec)
+
+
+@contextmanager
+def _bulk_merge_indexes(conn):
+    """Keep temporary ``lexicon_rowid`` indexes alive for the managed block.
+
+    On entry, creates one ``tmp_merge_<table>_lex`` index per table in
+    ``_UNINDEXED_TABLES`` (if not already present).  On exit, including
+    exit by exception, drops those indexes again.
+    """
+    index_by_table = {
+        table: f"tmp_merge_{table}_lex" for table in _UNINDEXED_TABLES
+    }
+    for table, index in index_by_table.items():
+        conn.execute(
+            f"CREATE INDEX IF NOT EXISTS {index} ON {table}(lexicon_rowid)"
+        )
+    try:
+        yield
+    finally:
+        for index in index_by_table.values():
+            conn.execute(f"DROP INDEX IF EXISTS {index}")
+
+
+def main(argv: list[str]) -> int:
+    """Merge each existing path in *argv* and return a process exit code.
+
+    With no arguments, prints the module docstring and returns 1.
+    Otherwise every path that exists is passed to ``merge_extension``
+    while the temporary merge indexes are in place; missing paths are
+    logged and skipped, and 0 is returned.
+    """
+    if not argv:
+        print(__doc__)
+        return 1
+
+    with _bulk_merge_indexes(connect()):
+        for raw in argv:
+            xml_file = Path(raw)
+            if xml_file.exists():
+                merge_extension(xml_file)
+            else:
+                log.warning("skipping missing path: %s", xml_file)
+    return 0
+
+
+# Script entry point: emit INFO-level log records as bare messages and use
+# main()'s return value as the process exit status.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    sys.exit(main(sys.argv[1:]))