Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ RUN pip install --no-cache-dir ".[web]"
# Download the wordnet data and initialize the database
# CILI is for the Collaborative Interlingual Index
# ODENET is for the German WordNet (linked to CILI)
# TODO: this should be done in a separate volume
RUN python -m wn download omw:1.4 cili odenet:1.4

# Load data extensions
# Load data extensions and merge them into their base lexicons so the
# Open Multilingual Wordnet ends up with one lexicon per language.
COPY extensions/wikidata-lexemes/output ./extensions/wikidata-lexemes/output
RUN python -c "import wn; import sys; [wn.add(f) for f in sys.argv[1:]]" extensions/wikidata-lexemes/output/*.xml
COPY extensions/wikidata-lexemes/merge_extension.py ./extensions/wikidata-lexemes/merge_extension.py
RUN python extensions/wikidata-lexemes/merge_extension.py extensions/wikidata-lexemes/output/*.xml

# Run ANALYZE so SQLite has query planner statistics baked into the image
RUN python -c "from wn._db import connect; c = connect(); c.execute('ANALYZE')"
Expand Down
35 changes: 35 additions & 0 deletions docs/api/wn.constants.rst
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,12 @@ Parts of Speech
- ``p`` -- Adposition
- ``x`` -- Other
- ``u`` -- Unknown
- ``h`` -- Pronoun
- ``d`` -- Determiner
- ``m`` -- Numeral
- ``i`` -- Interjection
- ``q`` -- Interrogative
- ``y`` -- Particle

.. autodata:: NOUN
.. autodata:: VERB
Expand All @@ -280,6 +286,35 @@ Parts of Speech

.. autodata:: OTHER
.. autodata:: UNKNOWN
.. autodata:: PRONOUN
.. data:: PRON

Alias of :py:data:`PRONOUN`

.. autodata:: DETERMINER
.. data:: DET

Alias of :py:data:`DETERMINER`

.. autodata:: NUMERAL
.. data:: NUM

Alias of :py:data:`NUMERAL`

.. autodata:: INTERJECTION
.. data:: INTJ

Alias of :py:data:`INTERJECTION`

.. autodata:: INTERROGATIVE
.. data:: INTRG

Alias of :py:data:`INTERROGATIVE`

.. autodata:: PARTICLE
.. data:: PART

Alias of :py:data:`PARTICLE`


Adjective Positions
Expand Down
16 changes: 15 additions & 1 deletion extensions/wikidata-lexemes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,22 @@ The script generates ~130 language-specific XML files in Global WordNet LMF form
- etc.

Each file contains lexical entries with:
- Lemma and part of speech
- Lemma and part of speech (short WN-LMF codes — `h` pronoun, `d` determiner, `m` numeral, `i` interjection, `q` interrogative, `y` particle, plus the existing `n/v/a/r/s/t/c/p/x`)
- Sense definitions and glosses
- Usage examples (where available)
- Sense relations (synonyms, antonyms, hypernyms, hyponyms)
- "Interlingual index" like links to English senses

## Loading into the database

`wn.add(file.xml)` stores an extension as a separate lexicon. To get one
lexicon per language instead, use the bundled merge utility:

```bash
python merge_extension.py output/*.xml
```

It calls `wn.add` and then rewrites the database so the extension's
entries, senses and synsets belong to the base lexicon (e.g. `omw-en:1.4`),
and deletes the now-empty extension lexicon row. See
[goodmami/wn#304](https://github.com/goodmami/wn/issues/304) for context.
47 changes: 47 additions & 0 deletions extensions/wikidata-lexemes/_pos_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""Map verbose Wikidata POS labels to short WN-LMF POS codes."""
from wn.constants import (
ADP,
ADV,
CONJ,
DET,
INTERROGATIVE,
INTJ,
NUM,
OTHER,
PART,
PRON,
)

# Lookup table from a verbose, lowercase Wikidata part-of-speech label to
# the short WN-LMF part-of-speech constant imported from ``wn.constants``.
# Several fine-grained labels collapse onto the same code.  Labels that
# are not listed here are left to the caller to handle, e.g.
# create_extensions.py uses ``POS_MAP.get(label, OTHER)``.
POS_MAP: dict[str, str] = {
# --- pronouns ---
"pronoun": PRON,
"personal pronoun": PRON,
# NOTE(review): a "determiner" label filed under PRON rather than DET --
# confirm this is intended.
"possessive determiner": PRON,
"reflexive personal pronoun": PRON,
"reflexive pronoun": PRON,
"reciprocal pronoun": PRON,
"interrogative pronoun": PRON,
"indefinite pronoun": PRON,
"relative pronoun": PRON,
"demonstrative pronoun": PRON,
"definite pronoun": PRON,
"pronominal locution": PRON,
# --- determiners and articles ---
"determiner": DET,
"definite article": DET,
"indefinite article": DET,
"demonstrative determiner": DET,
# --- adverbial categories ---
"adverbial phrase": ADV,
"adverbial locution": ADV,
"adverbial particle": ADV,
"interrogative adverb": ADV,
"prepositional adverb": ADV,
# --- adpositions ---
"preposition": ADP,
"prepositional locution": ADP,
# --- other single-label categories ---
"conjunction": CONJ,
"numeral": NUM,
"interjection": INTJ,
# --- interrogatives that are neither pronouns nor adverbs above ---
"interrogative word": INTERROGATIVE,
"interrogative expression": INTERROGATIVE,
# --- particles ---
"grammatical particle": PART,
"infinitive marker": PART,
# --- explicitly mapped to the catch-all code ---
"acronym": OTHER,
}
11 changes: 9 additions & 2 deletions extensions/wikidata-lexemes/create_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@

import ijson
import requests
from _pos_map import POS_MAP
from tqdm import tqdm

from wn.constants import OTHER

DATA_PATH = Path(__file__).parent / "latest-lexemes.json.bz2"
EXTENSIONS_DIR = Path(__file__).parent / "output"
EXTRAS_DIR = Path(__file__).parent / "extras"
Expand Down Expand Up @@ -72,6 +75,7 @@
"toponym",
}


SENSE_RELATIONS = {
"P5973": "similar", # synonym
"P5974": "antonym", # antonym
Expand Down Expand Up @@ -291,6 +295,7 @@ def build_xml_entry(
if not pos_q:
return None
pos_name = get_label(pos_q)
pos_code = POS_MAP.get(pos_name, OTHER)

senses = lexeme.get("senses", [])
if not senses:
Expand Down Expand Up @@ -322,7 +327,9 @@ def build_xml_entry(
sense_content += " </Sense>"
sense_entries.append(sense_content)

synset_content = f' <Synset id="{synset_id}"{ili_attr}>\n'
synset_content = (
f' <Synset id="{synset_id}"{ili_attr} partOfSpeech="{pos_code}">\n'
)
synset_content += f' <Definition>{escape_xml(gloss)}</Definition>\n'
synset_content += " </Synset>"
synset_entries.append(synset_content)
Expand All @@ -333,7 +340,7 @@ def build_xml_entry(
entry_xml = (
f' <LexicalEntry id="{lexeme_id}">\n'
f' <Lemma writtenForm="{escape_xml(lemma)}"'
f' partOfSpeech="{escape_xml(pos_name)}"/>\n'
f' partOfSpeech="{pos_code}"/>\n'
+ "\n".join(sense_entries)
+ "\n </LexicalEntry>"
)
Expand Down
187 changes: 187 additions & 0 deletions extensions/wikidata-lexemes/merge_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""Merge a WN-LMF lexicon extension into its base lexicon.

`wn.add()` stores an extension as a separate lexicon row, which means
queries against the base lexicon won't see the extension's entries.
This script loads the extension normally and then rewrites the
``lexicon_rowid`` columns of every content table so the extension's
rows appear under the base lexicon — leaving one lexicon per language
in the database.

See https://github.com/goodmami/wn/issues/304 for the original motivation.

Usage::

python merge_extension.py path/to/extension.xml [more.xml ...]
"""
from __future__ import annotations

import logging
import re
import sys
from contextlib import contextmanager
from pathlib import Path

import wn
from wn import lmf
from wn._db import connect
from wn._util import format_lexicon_specifier

# Log under the "wn" logger name rather than this script's module name
# (presumably so messages follow the wn package's logging setup -- confirm).
log = logging.getLogger("wn")

# Matches an opening ``<LexicalEntry ...>`` tag in raw bytes and captures
# the value of its ``id`` attribute.  It is applied with ``.search()``, so
# only the first entry in a file is ever consulted.
_FIRST_ENTRY_ID_RE = re.compile(rb'<LexicalEntry\b[^>]*\bid="([^"]+)"')

# Tables with a ``lexicon_rowid`` column. Keep in sync with wn/schema.sql.
# Every table listed here has its extension-owned rows re-pointed at the
# base lexicon during a merge.
_LEXICON_OWNED_TABLES = (
"entries",
"forms",
"pronunciations",
"tags",
"synsets",
"synset_relations",
"definitions",
"synset_examples",
"senses",
"sense_relations",
"sense_synset_relations",
"sense_examples",
"counts",
"syntactic_behaviours",
)

# Tables in _LEXICON_OWNED_TABLES that lack an index on lexicon_rowid in
# the shipped schema. A temporary index is created on each of them for the
# duration of the bulk merge so each UPDATE is O(rows-to-update) instead
# of O(table-size).
_UNINDEXED_TABLES = (
"entries",
"pronunciations",
"tags",
"synsets",
"synset_relations",
"definitions",
"synset_examples",
"senses",
"sense_relations",
"sense_synset_relations",
"sense_examples",
"counts",
)


def _get_lexicon_rowid(cur, specifier: str) -> int | None:
    """Return the rowid of the lexicon named by *specifier*.

    Args:
        cur: Database cursor used to run the lookup.
        specifier: Value to match against ``lexicons.specifier``.

    Returns:
        The first column of the matching row, or ``None`` when the
        query yields no row.
    """
    query = "SELECT rowid FROM lexicons WHERE specifier = ?"
    found = cur.execute(query, (specifier,)).fetchone()
    if not found:
        return None
    return found[0]


def _first_entry_id(xml_path: Path) -> str | None:
    """Return the ``id`` of the first ``<LexicalEntry>`` in *xml_path*.

    The whole file is read as bytes and searched with
    ``_FIRST_ENTRY_ID_RE``; no XML parsing takes place.

    Returns:
        The captured id decoded as UTF-8, or ``None`` when the pattern
        does not match anywhere in the file.
    """
    with open(xml_path, "rb") as stream:
        raw = stream.read()
    found = _FIRST_ENTRY_ID_RE.search(raw)
    if not found:
        return None
    return found.group(1).decode("utf-8")


def _is_already_merged(cur, xml_path: Path, base_rowid: int) -> bool:
    """Tell whether *xml_path* appears to be merged into the base lexicon.

    Only the first entry id found in the file is probed: the file counts
    as merged when an ``entries`` row with that id already has
    ``lexicon_rowid`` equal to *base_rowid*.

    Returns:
        ``False`` when the file has no detectable entry id or no such
        row exists, ``True`` otherwise.
    """
    probe_id = _first_entry_id(xml_path)
    if probe_id is None:
        return False
    hit = cur.execute(
        "SELECT 1 FROM entries WHERE id = ? AND lexicon_rowid = ?",
        (probe_id, base_rowid),
    ).fetchone()
    return hit is not None


def merge_extension(xml_path: Path) -> None:
    """Add *xml_path* and absorb it into its base lexicon.

    The file is scanned with ``lmf.scan_lexicons`` and only the first
    lexicon it reports is inspected.  Depending on what is found:

    * no lexicons -- a warning is logged and nothing is loaded;
    * no ``extends`` info -- the file is loaded with ``wn.add`` as-is;
    * base lexicon absent from the database -- the file is skipped;
    * first entry id already present under the base lexicon -- the file
      is treated as already merged and skipped;
    * otherwise -- the file is added, every row in the
      ``_LEXICON_OWNED_TABLES`` tables that points at the extension is
      re-pointed at the base lexicon, and the extension's rows in
      ``lexicon_extensions``, ``lexicon_dependencies`` and ``lexicons``
      are deleted.

    Args:
        xml_path: Path to a WN-LMF XML file.

    Raises:
        RuntimeError: If the extension lexicon cannot be found in the
            database after ``wn.add`` returns.
    """
    infos = lmf.scan_lexicons(xml_path)
    if not infos:
        log.warning("%s: no lexicons found, skipping", xml_path)
        return

    info = infos[0]
    extends = info.get("extends")
    if not extends:
        log.info("%s: not a lexicon extension, loading as-is", xml_path)
        wn.add(xml_path, progress_handler=None)
        return

    ext_spec = format_lexicon_specifier(info["id"], info["version"])
    base_spec = format_lexicon_specifier(extends["id"], extends["version"])

    conn = connect()
    cur = conn.cursor()

    base_rowid = _get_lexicon_rowid(cur, base_spec)
    if base_rowid is None:
        log.info(
            "skipping %s: base lexicon %s is not in the database",
            ext_spec,
            base_spec,
        )
        return

    # A successful merge deletes the extension's lexicon row, so a later
    # run would otherwise re-add the same entry ids under a fresh
    # extension lexicon and then collide with the rows already moved to
    # the base lexicon.  Bail out before touching the database.
    if _is_already_merged(cur, xml_path, base_rowid):
        log.info("%s already merged into %s", ext_spec, base_spec)
        return

    wn.add(xml_path, progress_handler=None)

    ext_rowid = _get_lexicon_rowid(cur, ext_spec)
    if ext_rowid is None:
        raise RuntimeError(f"failed to add extension {ext_spec}")

    # Re-point the rows and drop the extension's bookkeeping inside one
    # ``with conn`` block so the rewrite is applied as a unit (presumably
    # a sqlite3 connection, which commits on success and rolls back on
    # error -- confirm against wn._db.connect).
    with conn:
        # Table names come from the module-level constant, never from
        # user input, so interpolating them into the SQL is safe here.
        for table in _LEXICON_OWNED_TABLES:
            cur.execute(
                f"UPDATE {table} SET lexicon_rowid = ? WHERE lexicon_rowid = ?",
                (base_rowid, ext_rowid),
            )

        cur.execute(
            "DELETE FROM lexicon_extensions WHERE extension_rowid = ?",
            (ext_rowid,),
        )
        cur.execute(
            "DELETE FROM lexicon_dependencies WHERE dependent_rowid = ?",
            (ext_rowid,),
        )
        cur.execute("DELETE FROM lexicons WHERE rowid = ?", (ext_rowid,))

    log.info("Merged %s into %s", ext_spec, base_spec)


@contextmanager
def _bulk_merge_indexes(conn):
    """Keep temporary ``lexicon_rowid`` indexes alive for a ``with`` block.

    On entry an index named ``tmp_merge_<table>_lex`` is created (if not
    already present) for every table in ``_UNINDEXED_TABLES``.  On exit,
    including when the body raises, each of those indexes is dropped.

    Args:
        conn: Database connection the DDL statements are executed on.
    """
    index_by_table = {
        table: f"tmp_merge_{table}_lex" for table in _UNINDEXED_TABLES
    }
    for table, index_name in index_by_table.items():
        conn.execute(
            f"CREATE INDEX IF NOT EXISTS {index_name} ON {table}(lexicon_rowid)"
        )
    try:
        yield
    finally:
        for index_name in index_by_table.values():
            conn.execute(f"DROP INDEX IF EXISTS {index_name}")


def main(argv: list[str]) -> int:
    """Merge every extension file named in *argv*.

    Args:
        argv: Command-line arguments without the program name; each one
            is treated as a path to a WN-LMF XML file.

    Returns:
        ``1`` after printing the module docstring when *argv* is empty,
        ``0`` otherwise.  Paths that do not exist are logged and skipped
        without affecting the return value.
    """
    if argv:
        # The temporary indexes stay in place across all files and are
        # dropped once the loop finishes or fails.
        with _bulk_merge_indexes(connect()):
            for candidate in map(Path, argv):
                if candidate.exists():
                    merge_extension(candidate)
                else:
                    log.warning("skipping missing path: %s", candidate)
        return 0

    print(__doc__)
    return 1


if __name__ == "__main__":
    # Script entry point: print bare INFO-level messages and use main()'s
    # return value as the process exit status.
    logging.basicConfig(format="%(message)s", level=logging.INFO)
    raise SystemExit(main(sys.argv[1:]))
Loading
Loading