Skip to content

Commit 7508b75

Browse files
trec-dl-2022 topics and scoreddocs (#200)
1 parent 716cd64 commit 7508b75

File tree

8 files changed

+70
-0
lines changed

8 files changed

+70
-0
lines changed

ir_datasets/datasets/msmarco_document_v2.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,11 @@ def _init():
197197
FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
198198
subsets['trec-dl-2021'],
199199
)
200+
subsets['trec-dl-2022'] = Dataset(
201+
collection,
202+
TsvQueries(dlc['trec-dl-2022/queries'], namespace='msmarco', lang='en'),
203+
TrecScoredDocs(GzipExtract(dlc['trec-dl-2022/scoreddocs'])),
204+
)
200205

201206
subsets['anchor-text'] = Dataset(
202207
MsMarcoV2AnchorTextDocs(

ir_datasets/datasets/msmarco_passage_v2.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,11 @@ def _init():
272272
FilteredScoredDocs(subsets['trec-dl-2021'].scoreddocs_handler(), dl21_judged),
273273
subsets['trec-dl-2021'],
274274
)
275+
subsets['trec-dl-2022'] = Dataset(
276+
collection,
277+
TsvQueries(dlc['trec-dl-2022/queries'], namespace='msmarco', lang='en'),
278+
TrecScoredDocs(GzipExtract(dlc['trec-dl-2022/scoreddocs'])),
279+
)
275280

276281
ir_datasets.registry.register(NAME, Dataset(collection, documentation("_")))
277282
for s in sorted(subsets):

ir_datasets/docs/msmarco-document-v2.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ Note that at this time, this is only available to those with TREC active partici
106106
'
107107
official_measures: ['AP@100', 'nDCG@10', 'P@10', 'RR(rel=2)']
108108

109+
trec-dl-2022:
110+
desc: '
111+
<p>
112+
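Official topics for the TREC Deep Learning (DL) 2022 shared task. This subset provides the queries and the top-100 scoreddocs only; relevance judgments (qrels) are not included.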
113+
</p>
114+
'
115+
109116
anchor-text:
110117
pretty_name: "Anchor Text for version 2 of MS Marco"
111118
desc: '

ir_datasets/docs/msmarco-passage-v2.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,3 +77,10 @@ with qrels.
7777
</p>
7878
'
7979
official_measures: ['AP@100', 'nDCG@10', 'P(rel=2)@10', 'RR(rel=2)']
80+
81+
trec-dl-2022:
82+
desc: '
83+
<p>
84+
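Official topics for the TREC Deep Learning (DL) 2022 shared task. This subset provides the queries and the top-100 scoreddocs only; relevance judgments (qrels) are not included.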
85+
</p>
86+
'

ir_datasets/etc/downloads.json

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2735,6 +2735,18 @@
27352735
"size_hint": 751699569,
27362736
"expected_md5": "8b96dbaf4efcae08e0ee307e03f3434d",
27372737
"cache_path": "anchor-text-separate-v2.jsonl.gz"
2738+
},
2739+
"trec-dl-2022/queries": {
2740+
"url": "https://msmarco.blob.core.windows.net/msmarcoranking/2022_queries.tsv",
2741+
"size_hint": 21508,
2742+
"expected_md5": "f1bfd53d80e81e58207ce557fd2211a0",
2743+
"cache_path": "trec-dl-2022/queries.tsv"
2744+
},
2745+
"trec-dl-2022/scoreddocs": {
2746+
"url": "https://msmarco.blob.core.windows.net/msmarcoranking/2022_document_top100.txt.gz",
2747+
"size_hint": 642721,
2748+
"expected_md5": "93f70329ce1b9ce913a5f87008736ff2",
2749+
"cache_path": "trec-dl-2022/top100.txt.gz"
27382750
}
27392751
},
27402752

@@ -2931,6 +2943,18 @@
29312943
"size_hint": 433887,
29322944
"expected_md5": "c5b76ec95b589732edc9040302e22a2b",
29332945
"cache_path": "trec-dl-2021/qrels"
2946+
},
2947+
"trec-dl-2022/queries": {
2948+
"url": "https://msmarco.blob.core.windows.net/msmarcoranking/2022_queries.tsv",
2949+
"size_hint": 21508,
2950+
"expected_md5": "f1bfd53d80e81e58207ce557fd2211a0",
2951+
"cache_path": "trec-dl-2022/queries.tsv"
2952+
},
2953+
"trec-dl-2022/scoreddocs": {
2954+
"url": "https://msmarco.blob.core.windows.net/msmarcoranking/2022_passage_top100.txt.gz",
2955+
"size_hint": 630095,
2956+
"expected_md5": "36004dfad64826167aeecddff1d490a6",
2957+
"cache_path": "trec-dl-2022/top100.txt.gz"
29342958
}
29352959
},
29362960

ir_datasets/etc/metadata.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@
417417
"msmarco-document-v2/trec-dl-2020/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 45}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2020"}},
418418
"msmarco-document-v2/trec-dl-2021": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 477}, "scoreddocs": {"count": 47700}, "qrels": {"count": 13058, "fields": {"relevance": {"counts_by_value": {"2": 2769, "0": 4855, "3": 1256, "1": 4178}}}}},
419419
"msmarco-document-v2/trec-dl-2021/judged": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 57}, "qrels": {"_ref": "msmarco-document-v2/trec-dl-2021"}, "scoreddocs": {"count": 5700}},
420+
"msmarco-document-v2/trec-dl-2022": {"docs": {"_ref": "msmarco-document-v2"}, "queries": {"count": 500}, "scoreddocs": {"count": 50000}},
420421
"msmarco-document/anchor-text": {"docs": {"count": 1703834, "fields": {"doc_id": {"max_len": 8, "common_prefix": "D"}}}},
421422
"msmarco-document/dev": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 5193}, "qrels": {"count": 5193, "fields": {"relevance": {"counts_by_value": {"1": 5193}}}}, "scoreddocs": {"count": 519300}},
422423
"msmarco-document/eval": {"docs": {"_ref": "msmarco-document"}, "queries": {"count": 5793}, "scoreddocs": {"count": 579300}},
@@ -439,6 +440,7 @@
439440
"msmarco-passage-v2/train": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 277144}, "qrels": {"count": 284212, "fields": {"relevance": {"counts_by_value": {"1": 284212}}}}, "scoreddocs": {"count": 27713673}},
440441
"msmarco-passage-v2/trec-dl-2021": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 477}, "scoreddocs": {"count": 47700}, "qrels": {"count": 10828, "fields": {"relevance": {"counts_by_value": {"0": 4338, "3": 1086, "1": 3063, "2": 2341}}}}},
441442
"msmarco-passage-v2/trec-dl-2021/judged": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 53}, "qrels": {"_ref": "msmarco-passage-v2/trec-dl-2021"}, "scoreddocs": {"count": 5300}},
443+
"msmarco-passage-v2/trec-dl-2022": {"docs": {"_ref": "msmarco-passage-v2"}, "queries": {"count": 500}, "scoreddocs": {"count": 50000}},
442444
"msmarco-passage/dev": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 101093}, "qrels": {"count": 59273, "fields": {"relevance": {"counts_by_value": {"1": 59273}}}}},
443445
"msmarco-passage/dev/judged": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 55578}, "qrels": {"_ref": "msmarco-passage/dev"}},
444446
"msmarco-passage/dev/small": {"docs": {"_ref": "msmarco-passage"}, "queries": {"count": 6980}, "qrels": {"count": 7437, "fields": {"relevance": {"counts_by_value": {"1": 7437}}}}, "scoreddocs": {"count": 6668967}},

test/integration/msmarco_document_v2.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ def test_queries(self):
5959
9: GenericQuery('1104447', 'which kind of continental boundary is formed where two plates move horizontally past one another?'),
6060
56: GenericQuery('1040198', 'who is the final arbiter of florida law in instances where there is no federal authority?'),
6161
})
62+
self._test_queries('msmarco-document-v2/trec-dl-2022', count=500, items={
63+
0: GenericQuery('588', '1099 b cost basis i sell specific shares'),
64+
9: GenericQuery('77640', "can you get a master's degree in tefl"),
65+
499: GenericQuery('2056473', 'is a dairy farm considered as an agriculture'),
66+
})
6267

6368
def test_qrels(self):
6469
self._test_qrels('msmarco-document-v2/train', count=331956, items={
@@ -128,6 +133,11 @@ def test_scoreddocs(self):
128133
9: GenericScoredDoc('2082', 'msmarco_doc_01_1320056135', 14.554399),
129134
47699: GenericScoredDoc('1136769', 'msmarco_doc_57_1870160943', 15.2451),
130135
})
136+
self._test_scoreddocs('msmarco-document-v2/trec-dl-2022', count=50000, items={
137+
0: GenericScoredDoc('588', 'msmarco_doc_01_1675156368', 16.856501),
138+
9: GenericScoredDoc('588', 'msmarco_doc_44_970138133', 16.118099),
139+
49999: GenericScoredDoc('2056473', 'msmarco_doc_58_500974264', 10.868498),
140+
})
131141

132142
def test_anchor_text(self):
133143
self._test_docs("msmarco-document-v2/anchor-text", count=4821244, items={

test/integration/msmarco_passage_v2.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ def test_queries(self):
3939
9: GenericQuery('1107704', 'what was the main benefit of a single european currency?'),
4040
52: GenericQuery('1040198', 'who is the final arbiter of florida law in instances where there is no federal authority?'),
4141
})
42+
self._test_queries('msmarco-passage-v2/trec-dl-2022', count=500, items={
43+
0: GenericQuery('588', '1099 b cost basis i sell specific shares'),
44+
9: GenericQuery('77640', "can you get a master's degree in tefl"),
45+
499: GenericQuery('2056473', 'is a dairy farm considered as an agriculture'),
46+
})
4247

4348
def test_qrels(self):
4449
self._test_qrels('msmarco-passage-v2/train', count=284212, items={
@@ -88,6 +93,11 @@ def test_scoreddocs(self):
8893
9: GenericScoredDoc('2082', 'msmarco_passage_30_709623997', 17.350901),
8994
47699: GenericScoredDoc('1136769', 'msmarco_passage_06_68704200', 14.8941),
9095
})
96+
self._test_scoreddocs('msmarco-passage-v2/trec-dl-2022', count=50000, items={
97+
0: GenericScoredDoc('588', 'msmarco_passage_30_337959223', 18.762699),
98+
9: GenericScoredDoc('588', 'msmarco_passage_10_355039123', 17.7628),
99+
49999: GenericScoredDoc('2056473', 'msmarco_passage_17_225374709', 12.147499),
100+
})
91101

92102

93103
if __name__ == '__main__':

0 commit comments

Comments
 (0)