Skip to content
This repository was archived by the owner on May 19, 2025. It is now read-only.

Commit 721d9cb

Browse files
MIRACL (allenai#248)
* wip * miracl * remove unnecessary auth
1 parent 632abcf commit 721d9cb

File tree

7 files changed

+3357
-1
lines changed

7 files changed

+3357
-1
lines changed

ir_datasets/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from . import kilt
2424
from . import lotte
2525
from . import medline
26+
from . import miracl
2627
from . import mmarco
2728
from . import mr_tydi
2829
from . import msmarco_document

ir_datasets/datasets/miracl.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import ir_datasets
2+
from typing import NamedTuple
3+
from ir_datasets.util import DownloadConfig, GzipExtract
4+
from ir_datasets.datasets.base import Dataset, YamlDocumentation
5+
from ir_datasets.formats import JsonlDocs, TsvQueries, TrecQrels, TrecScoredDocs
6+
7+
# Top-level dataset identifier; reused in this module for the documentation
# path, the download context, the home sub-directory and the registry prefix.
NAME = 'miracl'

_logger = ir_datasets.log.easy()

# Human-readable meaning of each relevance label; handed to TrecQrels for the
# splits that ship qrels.
QRELS_DEFS = {
    0: 'Not Relevant',
    1: 'Relevant',
}
15+
16+
class MiraclDoc(NamedTuple):
    """A single document record from a MIRACL corpus.

    Fields are positional, in the order ``doc_id``, ``title``, ``text``.
    """
    doc_id: str
    title: str
    text: str

    def default_text(self) -> str:
        """Return the title and the text joined by a single space."""
        return f'{self.title} {self.text}'
22+
23+
24+
def _init():
    """Build and register every MIRACL dataset.

    For each language, one document collection is built from that language's
    gzip-compressed JSONL corpus files and shared by all of the language's
    sub-datasets. A dataset is then created for each topic set the language
    provides: ``train`` and ``dev`` carry queries and qrels, while ``test-a``
    and ``test-b`` carry queries only.

    Returns:
        A ``(collection, subsets)`` tuple, where ``collection`` is the
        collection built for the *last* entry of ``langs`` and ``subsets``
        maps each sub-dataset key (e.g. ``'ar'``, ``'ar/dev'``) to its
        ``Dataset``.
    """
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    base_path = ir_datasets.util.home_path()/NAME
    dlc = DownloadConfig.context(NAME, base_path)

    subsets = {}

    # (language code, number of corpus files, topic sets available)
    langs = [
        ('ar', 5, {'train', 'dev', 'test-a', 'test-b'}),
        ('bn', 1, {'train', 'dev', 'test-a', 'test-b'}),
        ('de', 32, {'dev', 'test-b'}),
        ('en', 66, {'train', 'dev', 'test-a', 'test-b'}),
        ('es', 21, {'train', 'dev', 'test-b'}),
        ('fa', 5, {'train', 'dev', 'test-b'}),
        ('fi', 4, {'train', 'dev', 'test-a', 'test-b'}),
        ('fr', 30, {'train', 'dev', 'test-b'}),
        ('hi', 2, {'train', 'dev', 'test-b'}),
        ('id', 3, {'train', 'dev', 'test-a', 'test-b'}),
        ('ja', 14, {'train', 'dev', 'test-a', 'test-b'}),
        ('ko', 3, {'train', 'dev', 'test-a', 'test-b'}),
        ('ru', 20, {'train', 'dev', 'test-a', 'test-b'}),
        ('sw', 1, {'train', 'dev', 'test-a', 'test-b'}),
        ('te', 2, {'train', 'dev', 'test-a', 'test-b'}),
        ('th', 2, {'train', 'dev', 'test-a', 'test-b'}),
        ('yo', 1, {'dev', 'test-b'}),
        ('zh', 10, {'train', 'dev', 'test-b'}),
    ]

    # (topic set name, whether qrels are attached to it)
    splits = [
        ('train', True),
        ('dev', True),
        ('test-a', False),
        ('test-b', False),
    ]

    for lang, n_doc_files, topic_sets in langs:
        namespace = f'{NAME}/{lang}'
        collection = JsonlDocs(
            [GzipExtract(dlc[f'v1.0/{lang}/corpus/{i}']) for i in range(n_doc_files)],
            doc_cls=MiraclDoc,
            # presumably MiraclDoc field name -> key in the corpus JSON records
            mapping={'doc_id': 'docid', 'title': 'title', 'text': 'text'},
            namespace=namespace,
            lang=lang,
            count_hint=ir_datasets.util.count_hint(namespace),
            docstore_path=base_path/'v1.0'/lang/'docs.pklz4')
        subsets[lang] = Dataset(collection, documentation(lang))

        for split, has_qrels in splits:
            if split not in topic_sets:
                continue
            key = f'{lang}/{split}'
            # Components are built in the order collection, queries, [qrels],
            # documentation so lookups in ``dlc`` happen in a fixed order.
            components = [
                collection,
                TsvQueries(dlc[f'v1.0/{lang}/{split}/topics'], namespace=namespace, lang=lang),
            ]
            if has_qrels:
                components.append(TrecQrels(dlc[f'v1.0/{lang}/{split}/qrels'], QRELS_DEFS))
            subsets[key] = Dataset(*components, documentation(key))

    ir_datasets.registry.register(NAME, Dataset(documentation('_')))
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    # NOTE: ``collection`` here is the loop variable left over from the final
    # language in ``langs``; kept as-is because it is part of the return value.
    return collection, subsets
90+
91+
92+
# Executed at import time: _init() registers the MIRACL datasets with
# ir_datasets.registry and its results are bound as module-level names.
collection, subsets = _init()

ir_datasets/docs/bibliography.bib

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,13 @@ @inproceedings{Yanai2007Image
946946
doi = {10.1145/1242572.1242816},
947947
}
948948

949+
@article{Zhang2022Miracl,
950+
title={Making a MIRACL: Multilingual information retrieval across a continuum of languages},
951+
author={Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy},
952+
journal={arXiv preprint arXiv:2210.09984},
953+
year={2022}
954+
}
955+
949956
@article{Lawrie2022HC4,
950957
author = {Dawn Lawrie and James Mayfield and Douglas W. Oard and Eugene Yang},
951958
title = {HC4: A New Suite of Test Collections for Ad Hoc CLIR},

0 commit comments

Comments
 (0)