@inproceedings{ylonen-2022-wiktextract,
title = "Wiktextract: {W}iktionary as Machine-Readable Structured Data",
author = "Ylonen, Tatu",
editor = "Calzolari, Nicoletta and
B{\'e}chet, Fr{\'e}d{\'e}ric and
Blache, Philippe and
Choukri, Khalid and
Cieri, Christopher and
Declerck, Thierry and
Goggi, Sara and
Isahara, Hitoshi and
Maegaard, Bente and
Mariani, Joseph and
Mazo, H{\'e}l{\`e}ne and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.140/",
pages = "1317--1325",
abstract = "We present a machine-readable structured data version of Wiktionary. Unlike previous Wiktionary extractions, the new extractor, Wiktextract, fully interprets and expands templates and Lua modules in Wiktionary. This enables it to perform a more complete, robust, and maintainable extraction. The extracted data is multilingual and includes lemmas, inflected forms, translations, etymology, usage examples, pronunciations (including URLs of sound files), lexical and semantic relations, and various morphological, syntactic, semantic, topical, and dialectal annotations. We extract all data from the English Wiktionary. Comparing against previous extractions from language-specific dictionaries, we find that its coverage for non-English languages often matches or exceeds the coverage in the language-specific editions, with the added benefit that all glosses are in English. The data is freely available and regularly updated, enabling anyone to add more data and correct errors by editing Wiktionary. The extracted data is in JSON format and designed to be easy to use by researchers, downstream resources, and application developers."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ylonen-2022-wiktextract">
<titleInfo>
<title>Wiktextract: Wiktionary as Machine-Readable Structured Data</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tatu</namePart>
<namePart type="family">Ylonen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frédéric</namePart>
<namePart type="family">Béchet</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philippe</namePart>
<namePart type="family">Blache</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christopher</namePart>
<namePart type="family">Cieri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Goggi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hitoshi</namePart>
<namePart type="family">Isahara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hélène</namePart>
<namePart type="family">Mazo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present a machine-readable structured data version of Wiktionary. Unlike previous Wiktionary extractions, the new extractor, Wiktextract, fully interprets and expands templates and Lua modules in Wiktionary. This enables it to perform a more complete, robust, and maintainable extraction. The extracted data is multilingual and includes lemmas, inflected forms, translations, etymology, usage examples, pronunciations (including URLs of sound files), lexical and semantic relations, and various morphological, syntactic, semantic, topical, and dialectal annotations. We extract all data from the English Wiktionary. Comparing against previous extractions from language-specific dictionaries, we find that its coverage for non-English languages often matches or exceeds the coverage in the language-specific editions, with the added benefit that all glosses are in English. The data is freely available and regularly updated, enabling anyone to add more data and correct errors by editing Wiktionary. The extracted data is in JSON format and designed to be easy to use by researchers, downstream resources, and application developers.</abstract>
<identifier type="citekey">ylonen-2022-wiktextract</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.140/</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>1317</start>
<end>1325</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Wiktextract: Wiktionary as Machine-Readable Structured Data
%A Ylonen, Tatu
%Y Calzolari, Nicoletta
%Y Béchet, Frédéric
%Y Blache, Philippe
%Y Choukri, Khalid
%Y Cieri, Christopher
%Y Declerck, Thierry
%Y Goggi, Sara
%Y Isahara, Hitoshi
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Mazo, Hélène
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F ylonen-2022-wiktextract
%X We present a machine-readable structured data version of Wiktionary. Unlike previous Wiktionary extractions, the new extractor, Wiktextract, fully interprets and expands templates and Lua modules in Wiktionary. This enables it to perform a more complete, robust, and maintainable extraction. The extracted data is multilingual and includes lemmas, inflected forms, translations, etymology, usage examples, pronunciations (including URLs of sound files), lexical and semantic relations, and various morphological, syntactic, semantic, topical, and dialectal annotations. We extract all data from the English Wiktionary. Comparing against previous extractions from language-specific dictionaries, we find that its coverage for non-English languages often matches or exceeds the coverage in the language-specific editions, with the added benefit that all glosses are in English. The data is freely available and regularly updated, enabling anyone to add more data and correct errors by editing Wiktionary. The extracted data is in JSON format and designed to be easy to use by researchers, downstream resources, and application developers.
%U https://aclanthology.org/2022.lrec-1.140/
%P 1317-1325
Markdown (Informal)
[Wiktextract: Wiktionary as Machine-Readable Structured Data](https://aclanthology.org/2022.lrec-1.140/) (Ylonen, LREC 2022)
ACL