Skip to content

Commit f4377f9

Browse files
Merge pull request allenai#290 from mam10eks/trec-tot-2025
Add TREC tip-of-the-tongue 2025
2 parents d4a4186 + d00d8af commit f4377f9

File tree

9 files changed

+450
-1
lines changed

9 files changed

+450
-1
lines changed

ir_datasets/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,5 @@
5656
from . import hc4
5757
from . import neuclir # must be after hc4
5858
from . import sara
59+
from . import trec_tot_2025
60+
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from ir_datasets import registry
2+
from ir_datasets.datasets.base import Dataset, YamlDocumentation
3+
from ir_datasets.util.download import RequestsDownload
4+
from ir_datasets.formats.base import BaseDocs
5+
from ir_datasets.indices import Docstore
6+
from ir_datasets.util import ZipExtractCache, home_path, Cache, DownloadConfig
7+
from ir_datasets.formats import BaseDocs, TrecQrels, JsonlQueries
8+
from ir_datasets.indices import PickleLz4FullStore
9+
import os
10+
import gzip
11+
import json
12+
from tqdm import tqdm
13+
from typing import NamedTuple
14+
15+
# Registry prefix for the TREC Tip-of-the-Tongue datasets; dataset ids in this
# module are built from it as f"{NAME}/2025" and f"{NAME}/2025/<split>".
NAME = "trec-tot"
16+
17+
18+
class JsonlDocumentOffset(NamedTuple):
    """Location of one document inside the corpus file.

    The range is half-open: a reader seeks to ``offset_start`` and reads
    ``offset_end - offset_start`` bytes.
    """

    # Identifier of the document this byte range belongs to.
    doc_id: str
    # Position of the first byte of the document's record.
    offset_start: int
    # Position one past the last byte of the document's record.
    offset_end: int
22+
23+
24+
class TrecToT2025Doc(NamedTuple):
    """One document of the TREC Tip-of-the-Tongue 2025 corpus."""

    doc_id: str
    title: str
    url: str
    text: str

    @staticmethod
    def _from_json(json_doc):
        """Build a document from a parsed JSON record.

        Args:
            json_doc: Mapping with the keys ``id``, ``title``, ``url`` and
                ``text``.

        Returns:
            The corresponding ``TrecToT2025Doc``.

        Raises:
            KeyError: If one of the four keys is missing.
        """
        return TrecToT2025Doc(
            doc_id=json_doc["id"],
            title=json_doc["title"],
            url=json_doc["url"],
            text=json_doc["text"],
        )

    def default_text(self):
        """Return the title and the text joined by a single space."""
        return " ".join((self.title, self.text))
36+
37+
38+
class JsonlWithOffsetsDocsStore(Docstore):
    """Random-access document lookup backed by a byte-offset index.

    ``docs`` is the corpus file and ``offsets`` a gzip-compressed JSONL file
    holding one ``{"id", "offset_start", "offset_end"}`` record per document.
    Both are expected to expose ``path()``. Each indexed byte range of the
    corpus file is decompressed on its own, so every range has to be a
    complete gzip stream.
    """

    def __init__(self, docs, offsets):
        self.__docs = docs
        self.__offsets = offsets
        # Lazily created offset index, see docs_dict().
        self._docs_dict = None
        self._id_field = "doc_id"

    def offsets_iter(self):
        """Yield one ``JsonlDocumentOffset`` per line of the offsets file."""
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with gzip.open(self.__offsets.path(), "rt", encoding="utf-8") as offsets_file:
            for line in offsets_file:
                record = json.loads(line)
                yield JsonlDocumentOffset(
                    doc_id=record["id"],
                    offset_start=record["offset_start"],
                    offset_end=record["offset_end"],
                )

    def docs_dict(self):
        """Return the doc_id -> offset store, creating it on first use.

        The store lives next to the offsets file (suffix ``.pklz4``) and is
        populated from ``offsets_iter``.
        """
        if self._docs_dict is None:
            self._docs_dict = PickleLz4FullStore(
                path=str(self.__offsets.path()) + '.pklz4',
                init_iter_fn=self.offsets_iter,
                data_cls=JsonlDocumentOffset,
                lookup_field="doc_id",
                index_fields=("doc_id",),
            )
        return self._docs_dict

    def get_many_iter(self, doc_ids):
        """Yield the decompressed raw bytes of each requested document.

        Ids without an entry in the offset index are skipped, so a lookup of
        several ids yields only the documents that exist.
        """
        offsets = self.docs_dict()

        with open(self.__docs.path(), "rb") as corpus_file:
            for doc_id in doc_ids:
                try:
                    entry = offsets.get(doc_id)
                except KeyError:
                    # Unknown id: nothing to yield for it.
                    continue
                corpus_file.seek(entry.offset_start)
                compressed = corpus_file.read(entry.offset_end - entry.offset_start)
                yield gzip.decompress(compressed)
69+
70+
71+
class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore):
    """Offset-backed doc store that yields parsed ``TrecToT2025Doc`` records."""

    def get_many_iter(self, doc_ids):
        """Yield a ``TrecToT2025Doc`` for every raw record of the parent store."""
        for raw_record in super().get_many_iter(doc_ids):
            parsed = json.loads(raw_record)
            yield TrecToT2025Doc._from_json(parsed)
75+
76+
77+
class JsonlDocumentsWithOffsets(BaseDocs):
    """Docs handler for the gzip-compressed JSONL corpus of TREC ToT 2025.

    ``docs`` is the corpus file and ``offsets`` the byte-offset index used
    for random access; both are expected to expose ``path()``.
    """

    def __init__(self, docs, offsets):
        self.__docs = docs
        self.__offsets = offsets

    def docs_iter(self):
        """Yield every document of the corpus in file order."""
        # Lines are read as bytes; json.loads accepts bytes directly.
        with gzip.open(self.__docs.path()) as corpus_file:
            for line in corpus_file:
                yield TrecToT2025Doc._from_json(json.loads(line))

    def docs_cls(self):
        """Return the document type produced by this handler."""
        return TrecToT2025Doc

    def docs_store(self, field='doc_id'):
        """Return a doc store for lookups by ``doc_id``.

        ``field`` is accepted for interface compatibility and is not used.
        """
        return TrecToT2025DocsStore(self.__docs, self.__offsets)

    def docs_namespace(self):
        """Return the namespace of the document ids.

        This used to be a stub that raised ``ValueError``, which broke every
        caller that asks for the namespace. The id of the corpus dataset is
        used as the namespace.
        """
        return f"{NAME}/2025"

    def docs_count(self):
        """Return the fixed number of documents in the corpus."""
        return 6407814

    def docs_lang(self):
        """Return the language code of the documents."""
        return "en"
101+
102+
103+
class TrecToT2025Dataset(Dataset):
    """TREC ToT 2025 dataset: the corpus plus optional queries and qrels."""

    def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None, documentation=None):
        """Assemble the dataset from its downloadable parts.

        Args:
            docs_jsonl_file: Corpus file (gzip-compressed JSONL).
            offset_jsonl_file: Byte-offset index of the corpus file.
            queries: Optional JSONL queries file; left out when falsy.
            qrels: Optional TREC-format qrels file; left out when falsy.
            documentation: Optional documentation component.
        """
        docs_handler = JsonlDocumentsWithOffsets(docs_jsonl_file, offset_jsonl_file)

        queries_handler = queries
        if queries:
            # The queries file stores the query string under "query"; expose it as "text".
            queries_handler = JsonlQueries(queries, lang='en', mapping={"text": "query", "query_id": "query_id"})

        qrels_handler = qrels
        if qrels:
            qrels_handler = TrecQrels(qrels, {0: 'Not Relevant', 1: 'Relevant'})

        super().__init__(docs_handler, queries_handler, qrels_handler, documentation)
113+
114+
115+
def register_dataset():
    """Register the TREC ToT 2025 corpus and its query splits.

    Does nothing when the corpus id is already present in the registry, so
    the function is safe to call more than once.
    """
    corpus_id = f"{NAME}/2025"
    if corpus_id in registry:
        return

    dlc = DownloadConfig.context("trec-tot-2025", home_path() / NAME / "2025")

    documentation = YamlDocumentation(f'docs/{NAME}.yaml')
    doc_offsets = dlc['trec-tot-2025-offsets.jsonl.gz']
    doc_corpus = dlc['trec-tot-2025-corpus.jsonl.gz']

    # Corpus-only dataset without queries or qrels.
    registry.register(corpus_id, TrecToT2025Dataset(doc_corpus, doc_offsets, documentation=documentation("2025")))

    # Every split shares the corpus and adds its own queries and qrels.
    for split in ("train", "dev1", "dev2", "dev3"):
        qrels = dlc[split + "-2025-qrel.txt"]
        queries = dlc[split + "-2025-queries.jsonl"]
        registry.register(
            f"{NAME}/2025/{split}",
            TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels, documentation(f"2025/{split}")),
        )


register_dataset()
132+
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
_:
2+
pretty_name: 'TREC Tip-of-the-Tongue'
3+
desc: '
4+
<p>
5+
Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>.
6+
</p>
7+
'
8+
9+
2025:
10+
desc: '
11+
<p>
12+
Corpus for the TREC 2025 tip-of-the-tongue search track.
13+
</p>
14+
'
15+
16+
2025/train:
17+
desc: '
18+
<p>
19+
Train query set for TREC 2025 tip-of-the-tongue search track.
20+
</p>
21+
'
22+
23+
2025/dev1:
24+
desc: '
25+
<p>
26+
Dev-1 query set for TREC 2025 tip-of-the-tongue search track (the original 2023 dev set).
27+
</p>
28+
'
29+
30+
2025/dev2:
31+
desc: '
32+
<p>
33+
Dev-2 query set for TREC 2025 tip-of-the-tongue search track (the original 2023 test set).
34+
</p>
35+
'
36+
37+
2025/dev3:
38+
desc: '
39+
<p>
40+
Dev-3 query set for TREC 2025 tip-of-the-tongue search track (the original 2024 test set).
41+
</p>
42+
'

ir_datasets/etc/downloads.json

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6197,6 +6197,59 @@
61976197
"cache_path": "trec-tot-2024-queries.zip"
61986198
}
61996199
},
6200+
6201+
"trec-tot-2025": {
6202+
"trec-tot-2025-offsets.jsonl.gz": {
6203+
"url": "https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz",
6204+
"expected_md5": "00678e3155d962bb244e034e6401b79b",
6205+
"cache_path": "trec-tot-2025-offsets.jsonl.gz"
6206+
},
6207+
"trec-tot-2025-corpus.jsonl.gz": {
6208+
"url": "https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz",
6209+
"expected_md5": "a2c82398aa86df6a68c8706b9b462bf2",
6210+
"cache_path": "trec-tot-2025-corpus.jsonl.gz"
6211+
},
6212+
"train-2025-qrel.txt": {
6213+
"url": "https://zenodo.org/records/15356599/files/train-2025-qrel.txt",
6214+
"expected_md5": "10a3c727fc5806ec4510f7a071b57cd7",
6215+
"cache_path": "train-2025-qrel.txt"
6216+
},
6217+
"train-2025-queries.jsonl": {
6218+
"url": "https://zenodo.org/records/15356599/files/train-2025-queries.jsonl",
6219+
"expected_md5": "288b7707b4e897f7447aac2cc2f613be",
6220+
"cache_path": "train-2025-queries.jsonl"
6221+
},
6222+
"dev1-2025-qrel.txt": {
6223+
"url": "https://zenodo.org/records/15356599/files/dev1-2025-qrel.txt",
6224+
"expected_md5": "0c913ce8b5b287c73a6dfac662971e82",
6225+
"cache_path": "dev1-2025-qrel.txt"
6226+
},
6227+
"dev1-2025-queries.jsonl": {
6228+
"url": "https://zenodo.org/records/15356599/files/dev1-2025-queries.jsonl",
6229+
"expected_md5": "b87c2f51d058de844e258a69b02e70fc",
6230+
"cache_path": "dev1-2025-queries.jsonl"
6231+
},
6232+
"dev2-2025-qrel.txt": {
6233+
"url": "https://zenodo.org/records/15356599/files/dev2-2025-qrel.txt",
6234+
"expected_md5": "4548eb41e639905384aa017c69129bfc",
6235+
"cache_path": "dev2-2025-qrel.txt"
6236+
},
6237+
"dev2-2025-queries.jsonl": {
6238+
"url": "https://zenodo.org/records/15356599/files/dev2-2025-queries.jsonl",
6239+
"expected_md5": "b174a128a255e92d0d54b76465d596b5",
6240+
"cache_path": "dev2-2025-queries.jsonl"
6241+
},
6242+
"dev3-2025-qrel.txt": {
6243+
"url": "https://zenodo.org/records/15356599/files/dev3-2025-qrel.txt",
6244+
"expected_md5": "48ab0d24a5946861546e54064238477f",
6245+
"cache_path": "dev3-2025-qrel.txt"
6246+
},
6247+
"dev3-2025-queries.jsonl": {
6248+
"url": "https://zenodo.org/records/15356599/files/dev3-2025-queries.jsonl",
6249+
"expected_md5": "259c11645694a3c5230b66c7852d4d80",
6250+
"cache_path": "dev3-2025-queries.jsonl"
6251+
}
6252+
},
62006253

62016254
"tripclick": {
62026255
"benchmark": {

ir_datasets/etc/metadata.json

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@
578578
"nano-beir/quora": {"docs": {"count": 5046, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 70, "fields": {"relevance": {"counts_by_value": {"1": 70}}}}},
579579
"nano-beir/scidocs": {"docs": {"count": 2210, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 244, "fields": {"relevance": {"counts_by_value": {"1": 244}}}}},
580580
"nano-beir/scifact": {"docs": {"count": 2919, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 56, "fields": {"relevance": {"counts_by_value": {"1": 56}}}}},
581-
"nano-beir/webis-touche2020": {"docs": {"count": 5745 , "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}},
581+
"nano-beir/webis-touche2020": {"docs": {"count": 5745, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}},
582582
"natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}},
583583
"natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, "scoreddocs": {"count": 973480}},
584584
"natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 40374730}},
@@ -703,6 +703,13 @@
703703
"trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
704704
"trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
705705
"trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
706+
"trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
707+
"trec-tot/2024/test": {"docs": {"_ref": "trec-tot/2024"}, "queries": {"count": 600}},
708+
"trec-tot/2025": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
709+
"trec-tot/2025/dev1": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 142}, "qrels": {"count": 142, "fields": {"relevance": {"counts_by_value": {"1": 142}}}}},
710+
"trec-tot/2025/dev2": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}},
711+
"trec-tot/2025/dev3": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 536}, "qrels": {"count": 536, "fields": {"relevance": {"counts_by_value": {"1": 536}}}}},
712+
"trec-tot/2025/train": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}},
706713
"tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
707714
"tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}},
708715
"tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}},
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import unittest
2+
3+
def load_dataset():
    """Load the ``trec-tot/2025`` dataset."""
    # Imported here so that merely importing this test module does not import ir_datasets.
    import ir_datasets
    return ir_datasets.load("trec-tot/2025")
6+
7+
def load_doc_number(num):
    """Return the document at zero-based position ``num`` of ``docs_iter()``.

    Returns ``None`` when the corpus has ``num`` or fewer documents.
    """
    for position, doc in enumerate(load_dataset().docs_iter()):
        if position == num:
            return doc
    return None
13+
14+
class TestDocsIter(unittest.TestCase):
    """Checks documents obtained by iterating over the trec-tot/2025 corpus."""

    def test_dataset_can_be_loaded(self):
        actual = load_dataset()
        self.assertIsNotNone(actual)

    def test_first_doc(self):
        actual = load_doc_number(0)

        self.assertIsNotNone(actual)
        self.assertEqual("12", actual.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", actual.url)
        self.assertEqual("Anarchism", actual.title)
        self.assertIn("a political philosophy and movement that is skeptical", actual.text)
        self.assertIn("a political philosophy and movement that is skeptical", actual.default_text())
        # default_text() prepends the title, hence the repeated "Anarchism".
        self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", actual.default_text())

    def test_third_doc(self):
        # The position passed to load_doc_number is zero-based.
        actual = load_doc_number(3)

        self.assertIsNotNone(actual)
        self.assertEqual("303", actual.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Alabama", actual.url)
        self.assertEqual("Alabama", actual.title)
        self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.text)
        self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.default_text())
39+
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import unittest
2+
3+
def load_docs_store():
    """Return the doc store of the ``trec-tot/2025`` dataset."""
    # Imported here so that merely importing this test module does not import ir_datasets.
    import ir_datasets
    return ir_datasets.load("trec-tot/2025").docs_store()
6+
7+
class TestDocsStore(unittest.TestCase):
    """Checks random-access lookups in the trec-tot/2025 doc store."""

    def test_docs_store_can_be_loaded(self):
        actual = load_docs_store()
        self.assertIsNotNone(actual)

    def test_first_doc(self):
        actual = load_docs_store().get("12")

        self.assertIsNotNone(actual)
        self.assertEqual("12", actual.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", actual.url)
        self.assertEqual("Anarchism", actual.title)
        self.assertIn("a political philosophy and movement that is skeptical", actual.text)
        self.assertIn("a political philosophy and movement that is skeptical", actual.default_text())
        # default_text() prepends the title, hence the repeated "Anarchism".
        self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", actual.default_text())

    def test_third_doc(self):
        actual = load_docs_store().get("303")

        self.assertIsNotNone(actual)
        self.assertEqual("303", actual.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Alabama", actual.url)
        self.assertEqual("Alabama", actual.title)
        self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.text)
        self.assertIn("Alabama's economy in the 21st century is based on automotive", actual.default_text())

    def test_some_random_doc(self):
        actual = load_docs_store().get("6596604")

        self.assertIsNotNone(actual)
        self.assertEqual("6596604", actual.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Radio%20Reloj", actual.url)
        self.assertEqual("Radio Reloj", actual.title)
        self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", actual.text)
        self.assertIn("Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station", actual.default_text())
42+

0 commit comments

Comments
 (0)