Skip to content

Commit 89e22d8

Browse files
improved HTML/XML parser, TREC 7 and 8
* wip: improved html parsing * character set detection from provided tags; cleanup; tests * tests for entity encodings * revert aquaint changes * revert formats/trec changes * customizable title tag * added trec disks 4&5 to show off new html parser * deprecating trec-robust04 * missed a spot * fixed recursive globs fixes #160 * working on documentation * various cleanup, metadata, and documentation
1 parent 137a900 commit 89e22d8

File tree

15 files changed

+608
-9
lines changed

15 files changed

+608
-9
lines changed

ir_datasets/datasets/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from . import codec
1414
from . import cord19
1515
from . import cranfield
16+
from . import disks45
1617
from . import dpr_w100
1718
from . import codesearchnet
1819
from . import gov

ir_datasets/datasets/base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,14 @@ def documentation(self):
401401
return {}
402402

403403

404+
class Deprecated:
405+
def __init__(self, message):
406+
self._message = message
407+
408+
def deprecated(self):
409+
return self._message
410+
411+
404412
class ExpectedFile:
405413
def __init__(self, path, expected_md5=None, instructions=None):
406414
self._path = Path(path)

ir_datasets/datasets/disks45.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
import ir_datasets
2+
from ir_datasets.util import GzipExtract, TarExtract, Lazy, DownloadConfig
3+
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
4+
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
5+
6+
7+
NAME = 'disks45'
8+
9+
10+
QREL_DEFS = {
11+
2: 'highly relevant',
12+
1: 'relevant',
13+
0: 'not relevant',
14+
}
15+
16+
QREL_DEFS_TREC78 = {
17+
1: 'relevant',
18+
0: 'not relevant',
19+
}
20+
21+
DUA = ("Please confirm you agree to the TREC data usage agreement found at "
22+
"<https://trec.nist.gov/data/cd45/index.html>")
23+
24+
25+
# folds from Huston & Croft 2014 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.646.7749>
26+
ROBUST04_FOLDS = {
27+
'fold1': {'302', '303', '309', '316', '317', '319', '323', '331', '336', '341', '356', '357', '370', '373', '378', '381', '383', '392', '394', '406', '410', '411', '414', '426', '428', '433', '447', '448', '601', '607', '608', '612', '617', '619', '635', '641', '642', '646', '647', '654', '656', '662', '665', '669', '670', '679', '684', '690', '692', '700'},
28+
'fold2': {'301', '308', '312', '322', '327', '328', '338', '343', '348', '349', '352', '360', '364', '365', '369', '371', '374', '386', '390', '397', '403', '419', '422', '423', '424', '432', '434', '440', '446', '602', '604', '611', '623', '624', '627', '632', '638', '643', '651', '652', '663', '674', '675', '678', '680', '683', '688', '689', '695', '698'},
29+
'fold3': {'306', '307', '313', '321', '324', '326', '334', '347', '351', '354', '358', '361', '362', '363', '376', '380', '382', '396', '404', '413', '415', '417', '427', '436', '437', '439', '444', '445', '449', '450', '603', '605', '606', '614', '620', '622', '626', '628', '631', '637', '644', '648', '661', '664', '666', '671', '677', '685', '687', '693'},
30+
'fold4': {'320', '325', '330', '332', '335', '337', '342', '344', '350', '355', '368', '377', '379', '387', '393', '398', '402', '405', '407', '408', '412', '420', '421', '425', '430', '431', '435', '438', '616', '618', '625', '630', '633', '636', '639', '649', '650', '653', '655', '657', '659', '667', '668', '672', '673', '676', '682', '686', '691', '697'},
31+
'fold5': {'304', '305', '310', '311', '314', '315', '318', '329', '333', '339', '340', '345', '346', '353', '359', '366', '367', '372', '375', '384', '385', '388', '389', '391', '395', '399', '400', '401', '409', '416', '418', '429', '441', '442', '443', '609', '610', '613', '615', '621', '629', '634', '640', '645', '658', '660', '681', '694', '696', '699'}
32+
}
33+
34+
35+
def _init():
36+
documentation = YamlDocumentation(f'docs/{NAME}.yaml')
37+
base_path = ir_datasets.util.home_path()/NAME
38+
dlc = DownloadConfig.context(NAME, base_path, dua=DUA)
39+
subsets = {}
40+
41+
collection_nocr = TrecDocs(dlc['docs'],
42+
path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'],
43+
namespace=NAME,
44+
lang='en',
45+
expected_file_count=2295,
46+
count_hint=ir_datasets.util.count_hint(NAME),
47+
parser='sax',
48+
docstore_path=base_path/'corpus.nocr.pklz4')
49+
50+
robust_queries = TrecQueries(GzipExtract(dlc['robust04-queries']), namespace=NAME, lang='en')
51+
robust_qrels = TrecQrels(dlc['robust04-qrels'], QREL_DEFS)
52+
53+
base = Dataset(documentation('_'))
54+
55+
subsets['nocr'] = Dataset(
56+
collection_nocr,
57+
documentation('nocr'))
58+
59+
subsets['nocr/trec-robust-2004'] = Dataset(
60+
collection_nocr,
61+
robust_queries,
62+
robust_qrels,
63+
documentation('nocr/trec-robust-2004'))
64+
65+
for fold in ROBUST04_FOLDS:
66+
qid_filter = make_filter(fold)
67+
subsets[f'nocr/trec-robust-2004/{fold}'] = Dataset(
68+
collection_nocr,
69+
FilteredQueries(robust_queries, qid_filter),
70+
FilteredQrels(robust_qrels, qid_filter),
71+
documentation(f'nocr/trec-robust-2004/{fold}'))
72+
73+
subsets['nocr/trec8'] = Dataset(
74+
collection_nocr,
75+
TrecQrels(TarExtract(dlc['trec8-qrels'], 'qrels.trec8.adhoc.parts1-5'), QREL_DEFS_TREC78),
76+
TrecQueries(GzipExtract(dlc['trec8-queries']), namespace=NAME, lang='en'),
77+
documentation('nocr/trec8'))
78+
79+
subsets['nocr/trec7'] = Dataset(
80+
collection_nocr,
81+
TrecQrels([
82+
GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part1.gz')),
83+
GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part2.gz')),
84+
GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part3.gz')),
85+
GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part4.gz')),
86+
GzipExtract(TarExtract(dlc['trec7-qrels'], 'qrels.trec7.adhoc.part5.gz')),
87+
], QREL_DEFS_TREC78),
88+
TrecQueries(GzipExtract(dlc['trec7-queries']), namespace=NAME, lang='en'),
89+
documentation('nocr/trec7'))
90+
91+
ir_datasets.registry.register(NAME, base)
92+
for s in sorted(subsets):
93+
ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])
94+
95+
return base, subsets
96+
97+
98+
def make_filter(fold):
99+
return Lazy(lambda: ROBUST04_FOLDS[fold])
100+
101+
102+
base, subsets = _init()

ir_datasets/datasets/trec_robust04.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import ir_datasets
22
from ir_datasets.util import GzipExtract, Lazy, DownloadConfig
33
from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries
4-
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation
4+
from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation, Deprecated
55

66

77
NAME = 'trec-robust04'
@@ -27,6 +27,9 @@
2727
}
2828

2929

30+
DEPRECATED_MESSAGE = '{} is deprecated. Consider using {} instead, which provides better parsing of the corpus.'
31+
32+
3033
def _init():
3134
documentation = YamlDocumentation(f'docs/{NAME}.yaml')
3235
base_path = ir_datasets.util.home_path()/NAME
@@ -38,15 +41,21 @@ def _init():
3841
queries = TrecQueries(GzipExtract(dlc['queries']), namespace=NAME, lang='en')
3942
qrels = TrecQrels(dlc['qrels'], QREL_DEFS)
4043

41-
base = Dataset(collection, queries, qrels, documentation('_'))
44+
base = Dataset(
45+
collection,
46+
queries,
47+
qrels,
48+
documentation('_'),
49+
Deprecated(DEPRECATED_MESSAGE.format(NAME, f'disks45/nocr/trec-robust-2004')))
4250

4351
for fold in FOLDS:
4452
qid_filter = make_filter(fold)
4553
subsets[fold] = Dataset(
4654
FilteredQueries(queries, qid_filter),
4755
FilteredQrels(qrels, qid_filter),
4856
collection,
49-
documentation(fold))
57+
documentation(fold),
58+
Deprecated(DEPRECATED_MESSAGE.format(f'{NAME}/{fold}', f'disks45/nocr/trec-robust-2004/{fold}')))
5059

5160
ir_datasets.registry.register(NAME, base)
5261
for s in sorted(subsets):

ir_datasets/docs/bibliography.bib

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,28 @@ @inproceedings{petroni-etal-2021-kilt
867867
pages = "2523--2544",
868868
}
869869

870+
@misc{Voorhees1996Disks45,
871+
title = {NIST TREC Disks 4 and 5: Retrieval Test Collections Document Set},
872+
author = {Ellen M. Voorhees},
873+
doi = {10.18434/t47g6m},
874+
year = {1996},
875+
publisher = {National Institute of Standards and Technology}
876+
}
877+
878+
@inproceedings{Voorhees1998Trec7,
879+
title = {Overview of the Seventh Text Retrieval Conference (TREC-7)},
880+
author = {Ellen M. Voorhees and Donna Harman},
881+
year = {1998},
882+
booktitle = {TREC}
883+
}
884+
885+
@inproceedings{Voorhees1999Trec8,
886+
title = {Overview of the Eighth Text Retrieval Conference (TREC-8)},
887+
author = {Ellen M. Voorhees and Donna Harman},
888+
year = {1999},
889+
booktitle = {TREC}
890+
}
891+
870892
@article{Lawrie2022HC4,
871893
author = {Dawn Lawrie and James Mayfield and Douglas W. Oard and Eugene Yang},
872894
title = {HC4: A New Suite of Test Collections for Ad Hoc CLIR},

ir_datasets/docs/disks45.yaml

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
_:
2+
pretty_name: 'TREC Disks 4 and 5'
3+
desc: '
4+
<p>
5+
TREC Disks 4 and 5, including documents from the Financial Times, the Congressional Record,
6+
the Federal Register, the Foreign Broadcast Information Service, and the Los Angeles Times.
7+
</p>
8+
<p>
9+
This dataset is a placeholder for the complete collection, but at this time, only the version
10+
of the dataset without the Congressional Record (<a class="ds-ref">disks45/nocr</a>) is provided.
11+
</p>
12+
<ul>
13+
<li><a href="https://trec.nist.gov/data/cd45/">Information and access to TREC Disks 4 and 5.</a></li>
14+
</ul>
15+
'
16+
docs_instructions: &inst "docs available from NIST"
17+
data_access: '
18+
<p>
19+
To use this dataset, you need a copy of <a href="https://trec.nist.gov/data/cd45/index.html">TREC
20+
Disks 4 and 5</a>, provided by NIST.
21+
</p>
22+
<p>
23+
Your organization may already have a copy. If this is the case, you may only need to complete a new
24+
"Individual Agreement". Otherwise, your organization will need to file the "Organizational agreement"
25+
with NIST. It can take some time to process, but you will end up with a password-protected download link.
26+
</p>
27+
<p>
28+
ir_datasets needs the following directories from the source:
29+
</p>
30+
<ul>
31+
<li><kbd>FBIS</kbd></li>
32+
<li><kbd>FR94</kbd></li>
33+
<li><kbd>FT</kbd></li>
34+
<li><kbd>LATIMES</kbd></li>
35+
</ul>
36+
<p>
37+
ir_datasets expects the above directories to be copied/linked under <kbd>~/.ir_datasets/disks45/corpus</kbd>.
38+
The source document files themselves can either be compressed or uncompressed (it seems they have been distributed
39+
both ways in the past). If ir_datasets does not find the files it is expecting, it will raise an error.
40+
</p>
41+
'
42+
43+
nocr:
44+
desc: '
45+
<p>
46+
A version of <a class="ds-ref">disks45</a> without the Congressional Record. This is the typical setting for
47+
tasks like TREC 7, TREC 8, and TREC Robust 2004.
48+
</p>
49+
'
50+
docs_instructions: *inst
51+
bibtex_ids: ['Voorhees1996Disks45']
52+
53+
nocr/trec-robust-2004:
54+
desc: '
55+
<p>
56+
The TREC Robust retrieval task focuses on "improving the consistency of retrieval technology by
57+
focusing on poorly performing topics."
58+
</p>
59+
<p>
60+
The TREC Robust document collection is from TREC disks 4 and 5. Due to the
61+
copyrighted nature of the documents, this collection is for research use only, which requires
62+
agreements to be filed with NIST. See details <a href="https://trec.nist.gov/data/cd45/index.html">here</a>.
63+
</p>
64+
<ul>
65+
<li>Documents: News articles</li>
66+
<li>Queries: keyword queries, descriptions, narratives</li>
67+
<li>Relevance: Deep judgments</li>
68+
<li><a href="https://trec.nist.gov/pubs/trec13/papers/ROBUST.OVERVIEW.pdf">Task Overview Paper</a></li>
69+
<li>See also: <a class="ds-ref">aquaint/trec-robust-2005</a></li>
70+
</ul>'
71+
docs_instructions: *inst
72+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO']
73+
74+
nocr/trec-robust-2004/fold1:
75+
desc: '
76+
<p>Robust04 Fold 1 (Title) proposed by Huston &amp; Croft (2014) and used in numerous works</p>'
77+
docs_instructions: *inst
78+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO']
79+
80+
nocr/trec-robust-2004/fold2:
81+
desc: '
82+
<p>Robust04 Fold 2 (Title) proposed by Huston &amp; Croft (2014) and used in numerous works</p>'
83+
docs_instructions: *inst
84+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO']
85+
86+
nocr/trec-robust-2004/fold3:
87+
desc: '
88+
<p>Robust04 Fold 3 (Title) proposed by Huston &amp; Croft (2014) and used in numerous works</p>'
89+
docs_instructions: *inst
90+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO']
91+
92+
nocr/trec-robust-2004/fold4:
93+
desc: '
94+
<p>Robust04 Fold 4 (Title) proposed by Huston &amp; Croft (2014) and used in numerous works</p>'
95+
docs_instructions: *inst
96+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO']
97+
98+
nocr/trec-robust-2004/fold5:
99+
desc: '
100+
<p>Robust04 Fold 5 (Title) proposed by Huston &amp; Croft (2014) and used in numerous works</p>'
101+
docs_instructions: *inst
102+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees2004Robust', 'Huston2014ACO']
103+
104+
nocr/trec7:
105+
desc: '
106+
<p>
107+
The TREC 7 Adhoc Retrieval track.
108+
</p>
109+
<ul>
110+
<li><a href="https://trec.nist.gov/pubs/trec7/papers/overview_7.pdf.gz">Task Overview Paper</a></li>
111+
</ul>
112+
'
113+
docs_instructions: *inst
114+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees1998Trec7']
115+
116+
nocr/trec8:
117+
desc: '
118+
<p>
119+
The TREC 8 Adhoc Retrieval track.
120+
</p>
121+
<ul>
122+
<li><a href="https://trec.nist.gov/pubs/trec8/papers/overview_8.pdf">Task Overview Paper</a></li>
123+
</ul>
124+
'
125+
docs_instructions: *inst
126+
bibtex_ids: ['Voorhees1996Disks45', 'Voorhees1999Trec8']

ir_datasets/etc/downloads.json

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,6 +822,55 @@
822822
}
823823
},
824824

825+
"disks45": {
826+
"docs": {
827+
"instructions": "The TREC Robust document collection is from TREC disks 4 and 5. Due to the copyrighted nature of the documents, this collection is for research use only, which requires agreements to be filed with NIST. See details here: <https://trec.nist.gov/data/cd45/index.html>.\nMore details about the procedure can be found here: <https://ir-datasets.com/trec-robust04.html#DataAccess>.\nOnce completed, place the uncompressed source here: {path}\nThis should contain directories like NEWS_data/FBIS, NEWS_data/FR94, etc.",
828+
"cache_path": "corpus"
829+
},
830+
"robust04-queries": {
831+
"url": "https://trec.nist.gov/data/robust/04.testset.gz",
832+
"irds_mirror": true,
833+
"size_hint": 34293,
834+
"expected_md5": "5eac3d774a2f87da61c08a94f945beff",
835+
"cache_path": "04.testset.gz"
836+
},
837+
"robust04-qrels": {
838+
"url": "https://trec.nist.gov/data/robust/qrels.robust2004.txt",
839+
"irds_mirror": true,
840+
"size_hint": 6543541,
841+
"expected_md5": "123c2a0ba2ec31178cb1050995dcfdfa",
842+
"cache_path": "qrels.robust2004.txt"
843+
},
844+
"trec8-qrels": {
845+
"url": "https://trec.nist.gov/data/qrels_eng/qrels.trec8.adhoc.parts1-5.tar.gz",
846+
"irds_mirror": true,
847+
"size_hint": 325935,
848+
"expected_md5": "ce1cfa80b29746d2a5eeddab268d4f6a",
849+
"cache_path": "qrels.trec8.adhoc.parts1-5.tar.gz"
850+
},
851+
"trec8-queries": {
852+
"url": "https://trec.nist.gov/data/topics_eng/topics.401-450.gz",
853+
"irds_mirror": true,
854+
"size_hint": 6946,
855+
"expected_md5": "daaafb700eed76f61a6e9e4b0dcc40c8",
856+
"cache_path": "topics.401-450.gz"
857+
},
858+
"trec7-qrels": {
859+
"url": "https://trec.nist.gov/data/qrels_eng/qrels.trec7.adhoc.parts1-5.tar.gz",
860+
"irds_mirror": true,
861+
"size_hint": 307120,
862+
"expected_md5": "43def30d4f4b33a830ae67e3dce19023",
863+
"cache_path": "qrels.trec7.adhoc.parts1-5.tar.gz"
864+
},
865+
"trec7-queries": {
866+
"url": "https://trec.nist.gov/data/topics_eng/topics.351-400.gz",
867+
"irds_mirror": true,
868+
"size_hint": 7400,
869+
"expected_md5": "fdee3f7e37e173fd6fcdc00fbe1fc671",
870+
"cache_path": "topics.351-400.gz"
871+
}
872+
},
873+
825874
"dpr-w100": {
826875
"docs": {
827876
"url": "https://dl.fbaipublicfiles.com/dpr/wikipedia_split/psgs_w100.tsv.gz",

ir_datasets/etc/metadata.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,16 @@
151151
"cord19/trec-covid/round4": {"docs": {"count": 158274, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 45}, "qrels": {"count": 13262, "fields": {"relevance": {"counts_by_value": {"1": 2279, "0": 7438, "2": 3545}}}}},
152152
"cord19/trec-covid/round5": {"docs": {"_ref": "cord19"}, "queries": {"_ref": "cord19/trec-covid"}, "qrels": {"count": 23151, "fields": {"relevance": {"counts_by_value": {"2": 6677, "1": 4233, "0": 12239, "-1": 2}}}}},
153153
"cranfield": {"docs": {"count": 1400, "fields": {"doc_id": {"max_len": 4, "common_prefix": ""}}}, "queries": {"count": 225}, "qrels": {"count": 1837, "fields": {"relevance": {"counts_by_value": {"2": 387, "3": 734, "4": 363, "-1": 225, "1": 128}}}}},
154+
"disks45": {},
155+
"disks45/nocr": {"docs": {"count": 528155, "fields": {"doc_id": {"max_len": 16, "common_prefix": ""}}}},
156+
"disks45/nocr/trec-robust-2004": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 250}, "qrels": {"count": 311410, "fields": {"relevance": {"counts_by_value": {"1": 16381, "0": 293998, "2": 1031}}}}},
157+
"disks45/nocr/trec-robust-2004/fold1": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 62789, "fields": {"relevance": {"counts_by_value": {"0": 59765, "1": 2795, "2": 229}}}}},
158+
"disks45/nocr/trec-robust-2004/fold2": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 63917, "fields": {"relevance": {"counts_by_value": {"1": 3334, "0": 60246, "2": 337}}}}},
159+
"disks45/nocr/trec-robust-2004/fold3": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 62901, "fields": {"relevance": {"counts_by_value": {"0": 58859, "1": 3877, "2": 165}}}}},
160+
"disks45/nocr/trec-robust-2004/fold4": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 57962, "fields": {"relevance": {"counts_by_value": {"0": 55103, "1": 2707, "2": 152}}}}},
161+
"disks45/nocr/trec-robust-2004/fold5": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 63841, "fields": {"relevance": {"counts_by_value": {"0": 60025, "1": 3668, "2": 148}}}}},
162+
"disks45/nocr/trec7": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 80345, "fields": {"relevance": {"counts_by_value": {"0": 75671, "1": 4674}}}}},
163+
"disks45/nocr/trec8": {"docs": {"_ref": "disks45/nocr"}, "queries": {"count": 50}, "qrels": {"count": 86830, "fields": {"relevance": {"counts_by_value": {"0": 82102, "1": 4728}}}}},
154164
"dpr-w100": {"docs": {"count": 21015324, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
155165
"dpr-w100/natural-questions/dev": {"docs": {"_ref": "dpr-w100"}, "queries": {"count": 6515}, "qrels": {"count": 979893, "fields": {"relevance": {"counts_by_value": {"2": 6515, "1": 44736, "0": 602894, "-1": 325748}}}}},
156166
"dpr-w100/natural-questions/train": {"docs": {"_ref": "dpr-w100"}, "queries": {"count": 58880}, "qrels": {"count": 8856662, "fields": {"relevance": {"counts_by_value": {"2": 58880, "1": 405729, "0": 5448064, "-1": 2943989}}}}},

0 commit comments

Comments
 (0)