Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from . import tweets2013_ia
from . import vaswani
from . import wapo
from . import wikiclir
from . import wikir
from . import trec_fair_2021
from . import trec_cast # must be after wapo,car,msmarco_passage
Expand Down
93 changes: 93 additions & 0 deletions ir_datasets/datasets/wikiclir.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import contextlib
from pathlib import Path
from typing import NamedTuple
import ir_datasets
from ir_datasets.util import TarExtractAll, DownloadConfig, RelativePath, Lazy
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.formats import TsvDocs, TsvQueries, TrecQrels

# Top-level dataset identifier: used below for the on-disk home directory,
# the download-config context, the documentation YAML filename, the query/doc
# namespace and the registry key prefix.
NAME = 'wikiclir'

# Module-level logger obtained from ir_datasets' logging helper.
_logger = ir_datasets.log.easy()

# Human-readable meaning of each relevance grade found in the qrels files.
# Passed to every TrecQrels instance created in _init(); listed from the
# highest grade to the lowest.
QRELS_DEFS = {
    2: "Document assigned to the (English) cross-lingual mate",
    1: "All other articles that link to the mate, and are linked by the mate",
}


class WikiClirQuery(NamedTuple):
    """One WikiCLIR query record.

    Used as the ``query_cls`` for the TSV queries file, so the field order
    here must match the column order of that file.
    """
    # Unique identifier of the query.
    query_id: str
    # Title column of the query record.
    title: str
    # NOTE(review): presumably the first sentence of the English article the
    # query was derived from -- confirm against the queries file.
    first_sent: str


class WikiClirDoc(NamedTuple):
    """One WikiCLIR document record.

    Used as the ``doc_cls`` for the per-language TSV documents files, so the
    field order here must match the column order of those files.
    """
    # Unique identifier of the document.
    doc_id: str
    # Title column of the document record.
    title: str
    # Text column of the document record.
    text: str


def _init():
    """Build and register the WikiCLIR base dataset and its language subsets.

    Every subset shares the same English queries file; each one pairs it with
    the documents and qrels files of a single target language, and keeps only
    the queries that have at least one judgment in that language's qrels.

    Side effects:
        Registers the base dataset under ``NAME`` and each subset under
        ``f'{NAME}/{subset_id}'`` in ``ir_datasets.registry``.

    Returns:
        A ``(base, subsets)`` tuple, where ``base`` is the documentation-only
        ``Dataset`` and ``subsets`` maps each subset ID (e.g. ``'de'``,
        ``'en-simple'``) to its ``Dataset``.
    """
    base_path = ir_datasets.util.home_path() / NAME
    download_config = DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    # Each entry is (directory name inside the archive, language code of the
    # documents, subset ID). The language code doubles as the file suffix,
    # except for Simple English (handled in the loop below).
    langs = [
        ('arabic', 'ar', 'ar'),
        ('catalan', 'ca', 'ca'),
        ('chinese', 'zh', 'zh'),
        ('czech', 'cs', 'cs'),
        ('dutch', 'nl', 'nl'),
        ('finnish', 'fi', 'fi'),
        ('french', 'fr', 'fr'),
        ('german', 'de', 'de'),
        ('italian', 'it', 'it'),
        ('japanese', 'ja', 'ja'),
        ('korean', 'ko', 'ko'),
        ('norwegian_(bokmal)', 'no', 'no'),
        ('norwegian_(nynorsk)', 'nn', 'nn'),
        ('polish', 'pl', 'pl'),
        ('portuguese', 'pt', 'pt'),
        ('romanian', 'ro', 'ro'),
        ('russian', 'ru', 'ru'),
        ('simple_english', 'en', 'en-simple'),
        ('spanish', 'es', 'es'),
        ('swahili', 'sw', 'sw'),
        ('swedish', 'sv', 'sv'),
        ('tagalog', 'tl', 'tl'),
        ('turkish', 'tr', 'tr'),
        ('ukrainian', 'uk', 'uk'),
        ('vietnamese', 'vi', 'vi'),
    ]

    # All files come from the single 'source' archive, wrapped with
    # TarExtractAll and targeted at <base_path>/source. Kept under its own
    # name instead of rebinding the download config.
    source_dlc = TarExtractAll(download_config['source'], base_path / 'source')

    # The English queries file is shared by every language subset.
    queries = TsvQueries(
        RelativePath(source_dlc, 'wiki-clir/english/wiki_en.queries'),
        namespace=NAME,
        query_cls=WikiClirQuery,
        lang='en',
    )

    base = Dataset(documentation('_'))

    subsets = {}
    for source_path, lang, dsid in langs:
        # Simple English files are suffixed "simple", not with the "en" code.
        file_suffix = 'simple' if dsid == 'en-simple' else lang
        qrels = TrecQrels(
            RelativePath(source_dlc, f'wiki-clir/{source_path}/en2{file_suffix}.rel'),
            QRELS_DEFS,
            format_3col=True,
        )
        # Restrict the shared query set to the queries judged in this subset.
        qids = _qid_filter(qrels)
        docs = TsvDocs(
            RelativePath(source_dlc, f'wiki-clir/{source_path}/wiki_{file_suffix}.documents'),
            doc_cls=WikiClirDoc,
            namespace=NAME,
            lang=lang,
        )
        subsets[dsid] = Dataset(
            docs,
            FilteredQueries(queries, qids, mode='include'),
            qrels,
            documentation(dsid),
        )

    ir_datasets.registry.register(NAME, base)
    # Register in sorted order so registration is independent of table order.
    for subset_id in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', subsets[subset_id])

    return base, subsets

def _qid_filter(qrels):
    """Return a deferred set of the query IDs that occur in *qrels*.

    Args:
        qrels: An object exposing ``qrels_iter()``, whose items each have a
            ``query_id`` attribute.

    Returns:
        A ``Lazy`` wrapping a callable that builds the set of distinct query
        IDs. Nothing is read from ``qrels`` when this function is called.
        NOTE(review): presumably ``Lazy`` evaluates the callable on first use
        -- confirm against ``ir_datasets.util.Lazy``.
    """
    return Lazy(lambda: {qrel.query_id for qrel in qrels.qrels_iter()})


# Build the datasets at import time; _init() also registers them with
# ir_datasets.registry, so importing this module is what makes them available.
collection, subsets = _init()
17 changes: 17 additions & 0 deletions ir_datasets/docs/bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,23 @@ @inproceedings{Voorhees1999Trec8
booktitle = {TREC}
}

@inproceedings{sasaki-etal-2018-cross,
title = "Cross-Lingual Learning-to-Rank with Shared Representations",
author = "Sasaki, Shota and
Sun, Shuo and
Schamoni, Shigehiko and
Duh, Kevin and
Inui, Kentaro",
booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)",
month = jun,
year = "2018",
address = "New Orleans, Louisiana",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N18-2073",
doi = "10.18653/v1/N18-2073",
pages = "458--463"
}

@article{Lawrie2022HC4,
author = {Dawn Lawrie and James Mayfield and Douglas W. Oard and Eugene Yang},
title = {HC4: A New Suite of Test Collections for Ad Hoc CLIR},
Expand Down
240 changes: 240 additions & 0 deletions ir_datasets/docs/wikiclir.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
_:
pretty_name: "WikiCLIR"
desc: '
<p>
A Cross-Language IR (CLIR) collection that pairs English queries with documents in other languages,
built from Wikipedia.
</p>
<ul>
<li><a href="https://www.cs.jhu.edu/~kevinduh/papers/sasaki18letor.pdf">Dataset Paper</a></li>
<li><a href="https://www.cs.jhu.edu/~kevinduh/a/wikiclir2018/">Dataset Information</a></li>
</ul>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

ar:
desc: '
<p>
WikiCLIR with Arabic documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

ca:
desc: '
<p>
WikiCLIR with Catalan documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

zh:
desc: '
<p>
WikiCLIR with Chinese documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

cs:
desc: '
<p>
WikiCLIR with Czech documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

nl:
desc: '
<p>
WikiCLIR with Dutch documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

fi:
desc: '
<p>
WikiCLIR with Finnish documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

fr:
desc: '
<p>
WikiCLIR with French documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

de:
desc: '
<p>
WikiCLIR with German documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

it:
desc: '
<p>
WikiCLIR with Italian documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

ja:
desc: '
<p>
WikiCLIR with Japanese documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

ko:
desc: '
<p>
WikiCLIR with Korean documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

no:
desc: '
<p>
WikiCLIR with Norwegian (Bokmål) documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

nn:
desc: '
<p>
WikiCLIR with Norwegian (Nynorsk) documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

pl:
desc: '
<p>
WikiCLIR with Polish documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

pt:
desc: '
<p>
WikiCLIR with Portuguese documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

ro:
desc: '
<p>
WikiCLIR with Romanian documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

ru:
desc: '
<p>
WikiCLIR with Russian documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

en-simple:
desc: '
<p>
WikiCLIR with Simple English documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

es:
desc: '
<p>
WikiCLIR with Spanish documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

sw:
desc: '
<p>
WikiCLIR with Swahili documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

sv:
desc: '
<p>
WikiCLIR with Swedish documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

tl:
desc: '
<p>
WikiCLIR with Tagalog documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

tr:
desc: '
<p>
WikiCLIR with Turkish documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

uk:
desc: '
<p>
WikiCLIR with Ukrainian documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

vi:
desc: '
<p>
WikiCLIR with Vietnamese documents.
</p>
'
bibtex_ids:
- 'sasaki-etal-2018-cross'

8 changes: 8 additions & 0 deletions ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -3641,6 +3641,14 @@
}
},

"wikiclir": {
"source": {
"url": "https://www.cs.jhu.edu/~kevinduh/a/wikiclir2018/wiki-clir.tar.gz",
"size_hint": 7036445773,
"expected_md5": "705abb611eb8cbab9ced2b8767a3bdb6"
}
},

"wikir": {
"en1k": {
"url": "https://zenodo.org/record/3565761/files/wikIR1k.zip?download=1",
Expand Down
Loading