Skip to content
2 changes: 2 additions & 0 deletions ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,5 @@
from . import hc4
from . import neuclir # must be after hc4
from . import sara
from . import trec_tot_2025

132 changes: 132 additions & 0 deletions ir_datasets/datasets/trec_tot_2025.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from ir_datasets import registry
from ir_datasets.datasets.base import Dataset, YamlDocumentation
from ir_datasets.util.download import RequestsDownload
from ir_datasets.formats.base import BaseDocs
from ir_datasets.indices import Docstore
from ir_datasets.util import ZipExtractCache, home_path, Cache, DownloadConfig
from ir_datasets.formats import BaseDocs, TrecQrels, JsonlQueries
from ir_datasets.indices import PickleLz4FullStore
import os
import gzip
import json
from tqdm import tqdm
from typing import NamedTuple

# Prefix of the dataset ids registered by this module (f"{NAME}/2025", f"{NAME}/2025/<split>").
NAME = "trec-tot"


class JsonlDocumentOffset(NamedTuple):
    """Byte range of one document inside the corpus file."""

    # Identifier of the document the range belongs to.
    doc_id: str
    # Position of the first byte of the document's compressed record.
    offset_start: int
    # Position just past the last byte of the document's compressed record.
    offset_end: int


class TrecToT2025Doc(NamedTuple):
    """A single document of the TREC ToT 2025 corpus."""

    doc_id: str
    title: str
    url: str
    text: str

    @staticmethod
    def _from_json(json_doc):
        """Build a document from a parsed JSON object.

        Args:
            json_doc: Mapping providing the keys "id", "title", "url" and
                "text".

        Returns:
            The corresponding ``TrecToT2025Doc``.
        """
        return TrecToT2025Doc(
            doc_id=json_doc["id"],
            title=json_doc["title"],
            url=json_doc["url"],
            text=json_doc["text"],
        )

    def default_text(self):
        """Return the title and the text joined by a single space."""
        return " ".join((self.title, self.text))


class JsonlWithOffsetsDocsStore(Docstore):
    """Random-access lookup of raw documents through a byte-offset index.

    For every requested id the byte range recorded in the offsets file is
    read from the corpus file and gzip-decompressed on its own, so the corpus
    never has to be decompressed from the beginning.
    """

    def __init__(self, docs, offsets):
        """Remember the two resources; nothing is opened or read here.

        Args:
            docs: Object whose ``path()`` locates the corpus file.
            offsets: Object whose ``path()`` locates the gzipped JSONL file
                holding one offset record per document.
        """
        # NOTE(review): Docstore.__init__ is not invoked (unchanged from the
        # original); only the attributes below are set up — confirm that no
        # further base-class state is required.
        self.__docs = docs
        self.__offsets = offsets
        # Lazily created offset lookup store, see docs_dict().
        self._docs_dict = None
        self._id_field = "doc_id"

    def offsets_iter(self):
        """Yield one ``JsonlDocumentOffset`` per line of the offsets file."""
        # JSON text is UTF-8; state it explicitly instead of relying on the
        # locale's default encoding.
        with gzip.open(self.__offsets.path(), "rt", encoding="utf-8") as lines:
            for line in lines:
                record = json.loads(line)
                yield JsonlDocumentOffset(
                    doc_id=record["id"],
                    offset_start=record["offset_start"],
                    offset_end=record["offset_end"],
                )

    def docs_dict(self):
        """Return the offset lookup store, creating it on first use.

        The store object is kept on the instance so that repeated lookups do
        not construct a new ``PickleLz4FullStore`` each time.
        """
        if self._docs_dict is None:
            self._docs_dict = PickleLz4FullStore(
                path=str(self.__offsets.path()) + '.pklz4',
                init_iter_fn=self.offsets_iter,
                data_cls=JsonlDocumentOffset,
                lookup_field="doc_id",
                index_fields=("doc_id",)
            )
        return self._docs_dict

    def get_many_iter(self, doc_ids):
        """Yield the decompressed raw bytes of each known document id.

        Args:
            doc_ids: Iterable of document ids to look up.

        Yields:
            The gzip-decompressed record of every id found in the offset
            store, in request order. Ids without an offset record are skipped.
        """
        offsets = self.docs_dict()

        with open(self.__docs.path(), "rb") as corpus:
            for doc_id in doc_ids:
                try:
                    entry = offsets.get(doc_id)
                except KeyError:
                    # NOTE(review): assumes the offset store reports an
                    # unknown id with KeyError — confirm. Skipping keeps one
                    # unknown id from aborting the whole batch.
                    continue
                corpus.seek(entry.offset_start)
                compressed = corpus.read(entry.offset_end - entry.offset_start)
                yield gzip.decompress(compressed)


class TrecToT2025DocsStore(JsonlWithOffsetsDocsStore):
    """Offset-based store that parses each raw record into a ``TrecToT2025Doc``."""

    def get_many_iter(self, doc_ids):
        """Yield a ``TrecToT2025Doc`` for every raw record the parent yields."""
        raw_records = super().get_many_iter(doc_ids)
        yield from (
            TrecToT2025Doc._from_json(json.loads(raw))
            for raw in raw_records
        )


class JsonlDocumentsWithOffsets(BaseDocs):
    """Documents handler for a gzipped JSONL corpus with a byte-offset index."""

    def __init__(self, docs, offsets):
        """Keep the corpus and offsets resources for later use.

        Args:
            docs: Object whose ``path()`` locates the gzipped JSONL corpus.
            offsets: Object whose ``path()`` locates the offsets file.
        """
        self.__docs = docs
        self.__offsets = offsets

    def docs_iter(self):
        """Yield every document of the corpus in file order."""
        with gzip.open(self.__docs.path()) as corpus:
            yield from (
                TrecToT2025Doc._from_json(json.loads(raw_line))
                for raw_line in corpus
            )

    def docs_cls(self):
        """Return the document type produced by this handler."""
        return TrecToT2025Doc

    def docs_store(self, field='doc_id'):
        """Return a new offset-based document store.

        The ``field`` argument is accepted but not used.
        """
        return TrecToT2025DocsStore(self.__docs, self.__offsets)

    def docs_namespace(self):
        """Not implemented yet; always raises ``ValueError``."""
        raise ValueError("ToDo: Implement this")

    def docs_count(self):
        """Return the hard-coded number of documents in the corpus."""
        return 6407814

    def docs_lang(self):
        """Return the language code of the documents."""
        return "en"


class TrecToT2025Dataset(Dataset):
    """Dataset bundling the 2025 corpus with optional queries and qrels."""

    def __init__(self, docs_jsonl_file, offset_jsonl_file, queries=None, qrels=None, documentation=None):
        """Wrap the given resources in their handlers and build the dataset.

        Args:
            docs_jsonl_file: Resource of the corpus file.
            offset_jsonl_file: Resource of the offsets file.
            queries: Optional resource of a JSONL queries file.
            qrels: Optional resource of a TREC-format qrels file.
            documentation: Optional documentation object passed through to
                ``Dataset``.
        """
        corpus = JsonlDocumentsWithOffsets(docs_jsonl_file, offset_jsonl_file)

        # A falsy queries/qrels argument is handed on unchanged.
        queries_handler = (
            JsonlQueries(queries, lang='en', mapping={"text": "query", "query_id": "query_id"})
            if queries
            else queries
        )
        qrels_handler = (
            TrecQrels(qrels, {0: 'Not Relevant', 1: 'Relevant'})
            if qrels
            else qrels
        )

        super().__init__(corpus, queries_handler, qrels_handler, documentation)


def register_dataset():
    """Register the 2025 corpus and its query splits in the registry.

    Does nothing when ``f"{NAME}/2025"`` is already registered, so calling it
    more than once is harmless.
    """
    corpus_id = f"{NAME}/2025"
    if corpus_id in registry:
        return

    dlc = DownloadConfig.context("trec-tot-2025", home_path() / NAME / "2025")

    # The descriptions for the ids registered below ("2025", "2025/train",
    # "2025/dev1", ...) are kept in docs/trec-tot-2025.yaml, so that file —
    # not docs/trec-tot.yaml — has to be loaded here.
    documentation = YamlDocumentation(f'docs/{NAME}-2025.yaml')
    doc_offsets = dlc['trec-tot-2025-offsets.jsonl.gz']
    doc_corpus = dlc['trec-tot-2025-corpus.jsonl.gz']

    # Corpus-only dataset without queries or qrels.
    registry.register(corpus_id, TrecToT2025Dataset(doc_corpus, doc_offsets, documentation=documentation("2025")))

    # One dataset per split, each sharing the same corpus resources.
    for split in ("train", "dev1", "dev2", "dev3"):
        qrels = dlc[split + "-2025-qrel.txt"]
        queries = dlc[split + "-2025-queries.jsonl"]
        registry.register(
            f"{NAME}/2025/{split}",
            TrecToT2025Dataset(doc_corpus, doc_offsets, queries, qrels, documentation(f"2025/{split}")),
        )


# Run the registration as a side effect of importing this module.
register_dataset()

42 changes: 42 additions & 0 deletions ir_datasets/docs/trec-tot-2025.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
_:
pretty_name: 'TREC Tip-of-the-Tongue'
desc: '
<p>
Tip of the tongue: The phenomenon of failing to retrieve something from memory, combined with partial recall and the feeling that retrieval is imminent. More details <a href="https://trec-tot.github.io/guidelines">are available on the official page for the TREC Tip-of-the-Tongue (ToT) Track</a>.
</p>
'

2025:
desc: '
<p>
Corpus for the TREC 2025 tip-of-the-tongue search track.
</p>
'

2025/train:
desc: '
<p>
Train query set for the TREC 2025 tip-of-the-tongue search track.
</p>
'

2025/dev1:
desc: '
<p>
Dev-1 query set for the TREC 2025 tip-of-the-tongue search track (the original 2023 dev set).
</p>
'

2025/dev2:
desc: '
<p>
Dev-2 query set for the TREC 2025 tip-of-the-tongue search track (the original 2023 test set).
</p>
'

2025/dev3:
desc: '
<p>
Dev-3 query set for the TREC 2025 tip-of-the-tongue search track (the original 2024 test set).
</p>
'
53 changes: 53 additions & 0 deletions ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -6186,6 +6186,59 @@
"cache_path": "trec-tot.zip"
}
},

"trec-tot-2025": {
"trec-tot-2025-offsets.jsonl.gz": {
"url": "https://zenodo.org/records/15356599/files/trec-tot-2025-offsets.jsonl.gz",
"expected_md5": "00678e3155d962bb244e034e6401b79b",
"cache_path": "trec-tot-2025-offsets.jsonl.gz"
},
"trec-tot-2025-corpus.jsonl.gz": {
"url": "https://zenodo.org/records/15356599/files/trec-tot-2025-corpus.jsonl.gz",
"expected_md5": "a2c82398aa86df6a68c8706b9b462bf2",
"cache_path": "trec-tot-2025-corpus.jsonl.gz"
},
"train-2025-qrel.txt": {
"url": "https://zenodo.org/records/15356599/files/train-2025-qrel.txt",
"expected_md5": "10a3c727fc5806ec4510f7a071b57cd7",
"cache_path": "train-2025-qrel.txt"
},
"train-2025-queries.jsonl": {
"url": "https://zenodo.org/records/15356599/files/train-2025-queries.jsonl",
"expected_md5": "288b7707b4e897f7447aac2cc2f613be",
"cache_path": "train-2025-queries.jsonl"
},
"dev1-2025-qrel.txt": {
"url": "https://zenodo.org/records/15356599/files/dev1-2025-qrel.txt",
"expected_md5": "0c913ce8b5b287c73a6dfac662971e82",
"cache_path": "dev1-2025-qrel.txt"
},
"dev1-2025-queries.jsonl": {
"url": "https://zenodo.org/records/15356599/files/dev1-2025-queries.jsonl",
"expected_md5": "b87c2f51d058de844e258a69b02e70fc",
"cache_path": "dev1-2025-queries.jsonl"
},
"dev2-2025-qrel.txt": {
"url": "https://zenodo.org/records/15356599/files/dev2-2025-qrel.txt",
"expected_md5": "4548eb41e639905384aa017c69129bfc",
"cache_path": "dev2-2025-qrel.txt"
},
"dev2-2025-queries.jsonl": {
"url": "https://zenodo.org/records/15356599/files/dev2-2025-queries.jsonl",
"expected_md5": "b174a128a255e92d0d54b76465d596b5",
"cache_path": "dev2-2025-queries.jsonl"
},
"dev3-2025-qrel.txt": {
"url": "https://zenodo.org/records/15356599/files/dev3-2025-qrel.txt",
"expected_md5": "48ab0d24a5946861546e54064238477f",
"cache_path": "dev3-2025-qrel.txt"
},
"dev3-2025-queries.jsonl": {
"url": "https://zenodo.org/records/15356599/files/dev3-2025-queries.jsonl",
"expected_md5": "259c11645694a3c5230b66c7852d4d80",
"cache_path": "dev3-2025-queries.jsonl"
}
},

"tripclick": {
"benchmark": {
Expand Down
9 changes: 8 additions & 1 deletion ir_datasets/etc/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@
"nano-beir/quora": {"docs": {"count": 5046, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 70, "fields": {"relevance": {"counts_by_value": {"1": 70}}}}},
"nano-beir/scidocs": {"docs": {"count": 2210, "fields": {"doc_id": {"max_len": 40, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 244, "fields": {"relevance": {"counts_by_value": {"1": 244}}}}},
"nano-beir/scifact": {"docs": {"count": 2919, "fields": {"doc_id": {"max_len": 9, "common_prefix": ""}}}, "queries": {"count": 50}, "qrels": {"count": 56, "fields": {"relevance": {"counts_by_value": {"1": 56}}}}},
"nano-beir/webis-touche2020": {"docs": {"count": 5745 , "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}},
"nano-beir/webis-touche2020": {"docs": {"count": 5745, "fields": {"doc_id": {"max_len": 39, "common_prefix": ""}}}, "queries": {"count": 49}, "qrels": {"count": 932, "fields": {"relevance": {"counts_by_value": {"1": 932}}}}},
"natural-questions": {"docs": {"count": 28390850, "fields": {"doc_id": {"max_len": 11, "common_prefix": ""}}}},
"natural-questions/dev": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 7830}, "qrels": {"count": 7695, "fields": {"relevance": {"counts_by_value": {"1": 7695}}}}, "scoreddocs": {"count": 973480}},
"natural-questions/train": {"docs": {"_ref": "natural-questions"}, "queries": {"count": 307373}, "qrels": {"count": 152148, "fields": {"relevance": {"counts_by_value": {"1": 152148}}}}, "scoreddocs": {"count": 40374730}},
Expand Down Expand Up @@ -702,6 +702,13 @@
"trec-tot/2023": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-tot/2023/dev": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
"trec-tot/2023/train": {"docs": {"count": 231852, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 150}, "qrels": {"count": 150, "fields": {"relevance": {"counts_by_value": {"1": 150}}}}},
"trec-tot/2024": {"docs": {"count": 3185450, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-tot/2024/test": {"docs": {"_ref": "trec-tot/2024"}, "queries": {"count": 600}},
"trec-tot/2025": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"trec-tot/2025/dev1": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 142}, "qrels": {"count": 142, "fields": {"relevance": {"counts_by_value": {"1": 142}}}}},
"trec-tot/2025/dev2": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}},
"trec-tot/2025/dev3": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 536}, "qrels": {"count": 536, "fields": {"relevance": {"counts_by_value": {"1": 536}}}}},
"trec-tot/2025/train": {"docs": {"count": 6407814, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "queries": {"count": 143}, "qrels": {"count": 143, "fields": {"relevance": {"counts_by_value": {"1": 143}}}}},
"tripclick": {"docs": {"count": 1523878, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"tripclick/logs": {"docs": {"count": 5196956, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}, "qlogs": {"count": 5317350}},
"tripclick/test": {"docs": {"_ref": "tripclick"}, "queries": {"count": 3525}, "scoreddocs": {"count": 3486402}},
Expand Down
39 changes: 39 additions & 0 deletions test/integration/trec_tot_2025/test_docs_iter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import unittest

def load_dataset():
    """Load the "trec-tot/2025" dataset; ir_datasets is imported on call."""
    from ir_datasets import load
    return load("trec-tot/2025")

def load_doc_number(num):
    """Return the document at zero-based position ``num`` of the corpus.

    Args:
        num: Zero-based position in iteration order.

    Returns:
        The document at that position, or ``None`` when the iteration ends
        before reaching it.
    """
    for position, doc in enumerate(load_dataset().docs_iter()):
        if position == num:
            return doc
    return None

class TestDocsIter(unittest.TestCase):
    """Integration checks for sequential iteration over trec-tot/2025."""

    def test_dataset_can_be_loaded(self):
        self.assertIsNotNone(load_dataset())

    def test_first_doc(self):
        doc = load_doc_number(0)
        self.assertIsNotNone(doc)

        snippet = "a political philosophy and movement that is skeptical"
        self.assertEqual("12", doc.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", doc.url)
        self.assertEqual("Anarchism", doc.title)
        self.assertIn(snippet, doc.text)
        self.assertIn(snippet, doc.default_text())
        # default_text() is expected to start the body right after the title.
        self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", doc.default_text())

    def test_third_doc(self):
        # Position 3 is zero-based, i.e. the fourth document yielded.
        doc = load_doc_number(3)
        self.assertIsNotNone(doc)

        snippet = "Alabama's economy in the 21st century is based on automotive"
        self.assertEqual("303", doc.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Alabama", doc.url)
        self.assertEqual("Alabama", doc.title)
        self.assertIn(snippet, doc.text)
        self.assertIn(snippet, doc.default_text())

42 changes: 42 additions & 0 deletions test/integration/trec_tot_2025/test_docs_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import unittest

def load_docs_store():
    """Return the document store of "trec-tot/2025"; ir_datasets is imported on call."""
    from ir_datasets import load
    dataset = load("trec-tot/2025")
    return dataset.docs_store()

class TestDocsStore(unittest.TestCase):
    """Integration checks for id-based lookups in the trec-tot/2025 store."""

    def test_docs_store_can_be_loaded(self):
        self.assertIsNotNone(load_docs_store())

    def test_first_doc(self):
        doc = load_docs_store().get("12")
        self.assertIsNotNone(doc)

        snippet = "a political philosophy and movement that is skeptical"
        self.assertEqual("12", doc.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Anarchism", doc.url)
        self.assertEqual("Anarchism", doc.title)
        self.assertIn(snippet, doc.text)
        self.assertIn(snippet, doc.default_text())
        # default_text() is expected to start the body right after the title.
        self.assertIn("Anarchism Anarchism is a political philosophy and movement that is skeptical", doc.default_text())

    def test_third_doc(self):
        doc = load_docs_store().get("303")
        self.assertIsNotNone(doc)

        snippet = "Alabama's economy in the 21st century is based on automotive"
        self.assertEqual("303", doc.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Alabama", doc.url)
        self.assertEqual("Alabama", doc.title)
        self.assertIn(snippet, doc.text)
        self.assertIn(snippet, doc.default_text())

    def test_some_random_doc(self):
        doc = load_docs_store().get("6596604")
        self.assertIsNotNone(doc)

        snippet = "Radio Reloj (Spanish for Radio Clock) is an internationally broadcast Spanish-language radio station"
        self.assertEqual("6596604", doc.doc_id)
        self.assertEqual("https://en.wikipedia.org/wiki/Radio%20Reloj", doc.url)
        self.assertEqual("Radio Reloj", doc.title)
        self.assertIn(snippet, doc.text)
        self.assertIn(snippet, doc.default_text())

Loading