Skip to content

Python API

The Python bindings are distributed as the theopendictionary package on PyPI. They are native extensions built with PyO3.

Terminal window
pip install theopendictionary

Requires Python 3.8.1+.

from theopendictionary import OpenDictionary, compile
# Compile XML to bytes
xml = """
<dictionary name="My Dictionary">
<entry term="hello">
<ety>
<sense pos="intj">
<definition value="A greeting">
<example value="Hello, world!" />
</definition>
</sense>
</ety>
</entry>
</dictionary>
"""
compiled_bytes = compile(xml)
dictionary = OpenDictionary(compiled_bytes)
results = dictionary.lookup("hello")
print(results[0].entry.term) # "hello"
print(results[0].entry.etymologies) # [Etymology(...)]

Compiles an ODXML string into binary .odict data (as a bytes object). This data can be passed to OpenDictionary() or saved to disk.

from theopendictionary import compile
data = compile("<dictionary><entry term='hi'><ety><sense><definition value='greeting'/></sense></ety></entry></dictionary>")

The main class for working with compiled dictionaries.

Creates a dictionary from compiled binary data (as returned by compile()) or directly from an XML string.

from theopendictionary import OpenDictionary, compile
# From compiled bytes
data = compile(xml_string)
dictionary = OpenDictionary(data)
# Directly from XML string
dictionary = OpenDictionary(xml_string)

await OpenDictionary.load(dictionary: str, options: LoadOptions | None = None) -> OpenDictionary

Section titled “await OpenDictionary.load(dictionary: str, options: LoadOptions | None = None) -> OpenDictionary”

Loads a dictionary from a file path, alias, or remote identifier. This is an async method.

  • If dictionary is a path to a .odict file, it loads from disk.
  • If it matches the format org/lang (e.g. wiktionary/eng), it downloads from the remote registry.
import asyncio
from theopendictionary import OpenDictionary, LoadOptions, RemoteLoadOptions
async def main():
    # Load from file
    dictionary = await OpenDictionary.load("./my-dictionary.odict")
    # Load from remote registry
    dictionary = await OpenDictionary.load("wiktionary/eng")
    # Load with options
    opts = LoadOptions(
        config_dir="./config",
        remote=RemoteLoadOptions(caching=True)
    )
    dictionary = await OpenDictionary.load("wiktionary/eng", opts)

asyncio.run(main())

| Property | Type | Description |
| --- | --- | --- |
| `min_rank` | `int \| None` | The minimum rank value across all entries, or `None` if no entries have ranks |
| `max_rank` | `int \| None` | The maximum rank value across all entries, or `None` if no entries have ranks |

save(path: str, quality: int | None = None, window_size: int | None = None) -> None

Section titled “save(path: str, quality: int | None = None, window_size: int | None = None) -> None”

Saves the dictionary to disk as a .odict file. Optionally configure Brotli compression.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `path` | `str` | — | Output file path |
| `quality` | `int \| None` | `None` | Brotli compression level (0–11) |
| `window_size` | `int \| None` | `None` | Brotli window size (0–22) |

dictionary.save("output.odict")
dictionary.save("output.odict", quality=11, window_size=22)

lookup(query, split=None, follow=None, insensitive=None) -> list[LookupResult]

Section titled “lookup(query, split=None, follow=None, insensitive=None) -> list[LookupResult]”

Looks up one or more terms by exact match.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `query` | `str \| list[str]` | — | Term(s) to look up |
| `split` | `int \| None` | `None` | Minimum word length for compound splitting |
| `follow` | `bool \| None` | `None` | Follow `see` cross-references until an entry with etymologies is found |
| `insensitive` | `bool \| None` | `None` | Enable case-insensitive matching |

# Simple lookup
results = dictionary.lookup("cat")
# Multiple terms
results = dictionary.lookup(["cat", "dog"])
# Follow cross-references, case-insensitive
results = dictionary.lookup("RaN", follow=True, insensitive=True)
# results[0].entry.term == "run"
# results[0].directed_from.term == "ran"
# Compound word splitting
results = dictionary.lookup("catdog", split=3)

split(query, min_length=None, follow=None, insensitive=None) -> list[LookupResult]

Section titled “split(query, min_length=None, follow=None, insensitive=None) -> list[LookupResult]”

Splits one or more compound terms into component dictionary entries. Unlike lookup(..., split=N), this does not try the whole query first.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `query` | `str \| list[str]` | — | Term(s) to split |
| `min_length` | `int \| None` | `None` | Minimum character length for each segment |
| `follow` | `bool \| None` | `None` | Follow `see` cross-references |
| `insensitive` | `bool \| None` | `None` | Enable case-insensitive matching |

results = dictionary.split("catdog", min_length=3)
results = dictionary.split("CATdog", min_length=3, insensitive=True)

Returns all terms defined in the dictionary, sorted alphabetically.

words = dictionary.lexicon()
# ["cat", "dog", "run", ...]

Creates a full-text search index for the dictionary.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `options` | `IndexOptions \| None` | `None` | Indexing configuration |

from theopendictionary import IndexOptions
dictionary.index()
dictionary.index(IndexOptions(overwrite=True, memory=50_000_000))

search(query: str, options=None) -> list[Entry]

Section titled “search(query: str, options=None) -> list[Entry]”

Runs a full-text search across the dictionary. Requires an index (call index() first).

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `query` | `str` | — | Search query |
| `options` | `SearchOptions \| None` | `None` | Search configuration |

from theopendictionary import SearchOptions
dictionary.index()
results = dictionary.search("domesticated mammal")
results = dictionary.search("greeting", SearchOptions(limit=5))

tokenize(text: str, follow=None, insensitive=None) -> list[Token]

Section titled “tokenize(text: str, follow=None, insensitive=None) -> list[Token]”

Tokenizes text using NLP-based segmentation and matches each token against the dictionary. Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages.

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `text` | `str` | — | Text to tokenize |
| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references. Accepts `True`/`False` or a number (nonzero = follow) |
| `insensitive` | `bool \| None` | `None` | Case-insensitive matching |

tokens = dictionary.tokenize("the cat ran")
for token in tokens:
    print(token.lemma, token.entries)

| Property | Type | Description |
| --- | --- | --- |
| `entry` | `Entry` | The matched entry |
| `directed_from` | `Entry \| None` | The original entry if a `see` redirect was followed |

| Property | Type | Description |
| --- | --- | --- |
| `term` | `str` | The headword |
| `rank` | `int \| None` | Optional frequency rank |
| `see_also` | `str \| None` | Cross-reference target term |
| `etymologies` | `list[Etymology]` | List of etymologies |
| `media` | `list[MediaURL]` | Media URLs |

| Property | Type | Description |
| --- | --- | --- |
| `lemma` | `str` | The original token text |
| `language` | `str \| None` | Detected language code |
| `script` | `str` | Detected script name |
| `kind` | `str` | Token kind |
| `start` | `int` | Start offset in the original text |
| `end` | `int` | End offset in the original text |
| `entries` | `list[LookupResult]` | Matched dictionary entries |

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `directory` | `str \| None` | `None` | Custom directory for the index |
| `memory` | `int \| None` | `None` | Memory arena per thread in bytes (must be >15MB) |
| `overwrite` | `bool \| None` | `None` | Overwrite existing index |

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `directory` | `str \| None` | `None` | Custom index directory |
| `threshold` | `int \| None` | `None` | Relevance threshold |
| `autoindex` | `bool \| None` | `None` | Auto-create index if missing |
| `limit` | `int \| None` | `None` | Maximum results |

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `min_length` | `int \| None` | `None` | Minimum character length for each segment |
| `follow` | `bool \| None` | `None` | Follow `see` cross-references |
| `insensitive` | `bool \| None` | `None` | Enable case-insensitive matching |

| Property | Type | Description |
| --- | --- | --- |
| `kind` | `EnumWrapper \| None` | The pronunciation system (e.g. IPA, Pinyin) |
| `value` | `str` | The pronunciation notation |
| `media` | `list[MediaURL]` | Audio URLs |

| Property | Type | Description |
| --- | --- | --- |
| `src` | `str` | URL or path to the media file |
| `mime_type` | `str \| None` | MIME type (e.g. `audio/mpeg`) |
| `description` | `str \| None` | Description of the media |