allenai · seanmacavaney · Feb 25, 2022 · Feb 25, 2022
diff --git a/ir_datasets/datasets/__init__.py b/ir_datasets/datasets/__init__.py
@@ -17,6 +17,7 @@
 from . import gov
 from . import gov2
 from . import highwire
+from . import kilt
 from . import lotte
 from . import medline
 from . import mmarco

diff --git a/ir_datasets/datasets/kilt.py b/ir_datasets/datasets/kilt.py
@@ -0,0 +1,120 @@
+import json
+import codecs
+from typing import NamedTuple, Tuple
+import ir_datasets
+from ir_datasets.util import TarExtractAll, Cache, RelativePath, Lazy, Migrator
+from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
+from ir_datasets.formats import BaseDocs
+from ir_datasets.indices import PickleLz4FullStore
+
+# Module-level logger obtained from ir_datasets' logging helper.
+_logger = ir_datasets.log.easy()
+
+
+# Dataset identifier. In this module it serves as the registry key, the
+# docs namespace, the directory name under the ir_datasets home path, the
+# download-config context name and the stem of the documentation YAML file.
+NAME = 'kilt'
+
+
+class KiltDocAnchor(NamedTuple):
+ """One anchor entry attached to a KILT document.
+
+ Instances are created in ``KiltDocs.docs_iter`` from each element of a
+ record's ``anchors`` list; every field is copied from the JSON key of the
+ same name.
+ """
+ # Value of the anchor's 'text' key.
+ text: str
+ # Value of the anchor's 'href' key (presumably the link target -- confirm).
+ href: str
+ # Value of 'paragraph_id' (presumably an index into KiltDoc.text_pieces --
+ # TODO confirm against the KILT data format).
+ paragraph_id: int
+ # Values of 'start' / 'end' (presumably offsets of the anchor within that
+ # paragraph -- TODO confirm units and inclusivity).
+ start: int
+ end: int
+
+
+class KiltDoc(NamedTuple):
+ """One document of the KILT knowledge source.
+
+ Instances are created in ``KiltDocs.docs_iter`` from one JSON record per
+ input line.
+ """
+ # Record's 'wikipedia_id'.
+ doc_id: str
+ # Record's 'wikipedia_title'.
+ title: str
+ # All entries of the record's 'text' list, each passed through
+ # strip_markup() and concatenated without a separator.
+ text: str
+ # The record's 'text' list as a tuple, without any markup stripping.
+ text_pieces: Tuple[str, ...]
+ # One KiltDocAnchor per element of the record's 'anchors' list.
+ anchors: Tuple[KiltDocAnchor, ...]
+ # The record's 'categories' string split on ','.
+ categories: Tuple[str, ...]
+ # 'wikidata_id' from the record's 'wikidata_info'; '' when either is absent.
+ wikidata_id: str
+ # Entries of the record's 'history' object; revid, parentid and pageid are
+ # converted with str(), timestamp and url are stored as found.
+ history_revid: str
+ history_timestamp: str
+ history_parentid: str
+ history_pageid: str
+ history_url: str
+
+
+def strip_markup(text):
+    """Return ``text`` with KILT section/bullet markers rewritten.
+
+    * Text starting with ``Section::::`` has every ``Section::::`` removed
+      and every remaining ``:`` turned into a space.
+    * Text starting with ``BULLET::::-`` has every ``BULLET::::-`` replaced
+      by ``-``.
+    * Any other text is returned unchanged.
+    """
+    section_marker = 'Section::::'
+    bullet_marker = 'BULLET::::-'
+
+    if text.startswith(section_marker):
+        heading = text.replace(section_marker, '')
+        # Any colon left in a heading becomes a plain space.
+        return heading.replace(':', ' ')
+
+    if text.startswith(bullet_marker):
+        return text.replace(bullet_marker, '-')
+
+    return text
+
+
+class KiltDocs(BaseDocs):
+    """Document collection for the KILT knowledge source.
+
+    Every line read from the streamer is decoded as one JSON object and
+    converted into a ``KiltDoc``.
+    """
+
+    def __init__(self, streamer, count_hint=None):
+        """Remember where documents come from.
+
+        Args:
+            streamer: object whose ``stream()`` returns a context manager
+                yielding an iterable of JSON-encoded lines.
+            count_hint: optional value forwarded to the document store as
+                its ``count_hint``.
+        """
+        super().__init__()
+        self._count_hint = count_hint
+        self._streamer = streamer
+
+    # NOTE(review): use_docstore is defined elsewhere; presumably it serves
+    # documents from the built store when one exists -- confirm.
+    @ir_datasets.util.use_docstore
+    def docs_iter(self):
+        """Yield one ``KiltDoc`` per JSON line of the source stream."""
+        with self._streamer.stream() as lines:
+            for line in lines:
+                record = json.loads(line)
+
+                doc_id = record['wikipedia_id']
+                title = record['wikipedia_title']
+                # Flattened text has the markers rewritten; text_pieces
+                # keeps the paragraphs exactly as stored.
+                text = ''.join(map(strip_markup, record['text']))
+                text_pieces = tuple(record['text'])
+                anchors = tuple(
+                    KiltDocAnchor(
+                        text=anchor['text'],
+                        href=anchor['href'],
+                        paragraph_id=anchor['paragraph_id'],
+                        start=anchor['start'],
+                        end=anchor['end'],
+                    )
+                    for anchor in record['anchors']
+                )
+                categories = tuple(record['categories'].split(','))
+                # Records without wikidata info get an empty id.
+                wikidata = record.get('wikidata_info', {})
+                wikidata_id = wikidata.get('wikidata_id', '')
+                history = record['history']
+
+                yield KiltDoc(
+                    doc_id=doc_id,
+                    title=title,
+                    text=text,
+                    text_pieces=text_pieces,
+                    anchors=anchors,
+                    categories=categories,
+                    wikidata_id=wikidata_id,
+                    history_revid=str(history['revid']),
+                    history_timestamp=history['timestamp'],
+                    history_parentid=str(history['parentid']),
+                    history_pageid=str(history['pageid']),
+                    history_url=history['url'],
+                )
+
+    def docs_cls(self):
+        """Return the record type produced by ``docs_iter``."""
+        return KiltDoc
+
+    def docs_store(self, field='doc_id'):
+        """Return a ``PickleLz4FullStore`` over these documents.
+
+        Args:
+            field: passed to the store as ``lookup_field``.
+        """
+        store_path = f'{ir_datasets.util.home_path()/NAME}/docs.pklz4'
+        return PickleLz4FullStore(
+            path=store_path,
+            init_iter_fn=self.docs_iter,
+            data_cls=self.docs_cls(),
+            lookup_field=field,
+            index_fields=['doc_id'],
+            count_hint=self._count_hint,
+        )
+
+    def docs_count(self):
+        """Return the store's document count, or ``None`` if it is not built."""
+        if not self.docs_store().built():
+            return None
+        return self.docs_store().count()
+
+    def docs_namespace(self):
+        """Return the namespace of the document ids (the dataset name)."""
+        return NAME
+
+    def docs_lang(self):
+        """Return the language code of the documents."""
+        return 'en'
+
+
+def _init():
+    """Build the KILT dataset and register it with ir_datasets.
+
+    Returns:
+        A ``(base, subsets)`` pair: the base ``Dataset`` and a dict of named
+        subsets (currently empty).
+    """
+    home = ir_datasets.util.home_path()/NAME
+    downloads = ir_datasets.util.DownloadConfig.context(NAME, home)
+    docs_yaml = YamlDocumentation(f'docs/{NAME}.yaml')
+
+    # count_hint equals the docs count recorded for "kilt" in metadata.json.
+    corpus = KiltDocs(downloads['knowledgesource'], count_hint=5903530)
+    base = Dataset(corpus, docs_yaml('_'))
+
+    subsets = {}
+
+    ir_datasets.registry.register(NAME, base)
+    # Subsets are registered under "<NAME>/<subset>" in sorted key order.
+    for subset_name in sorted(subsets):
+        dataset_id = f'{NAME}/{subset_name}'
+        ir_datasets.registry.register(dataset_id, subsets[subset_name])
+
+    return base, subsets
+
+
+# Build and register the dataset when this module is imported, and expose
+# the results as module attributes.
+base, subsets = _init()
diff --git a/ir_datasets/docs/bibliography.bib b/ir_datasets/docs/bibliography.bib
@@ -847,6 +847,26 @@ @article{Santhanam2021ColBERTv2
  url = "https://arxiv.org/abs/2112.01488"
 }
 
+@inproceedings{petroni-etal-2021-kilt,
+ title = "{KILT}: a Benchmark for Knowledge Intensive Language Tasks",
+ author = {Petroni, Fabio and Piktus, Aleksandra and
+ Fan, Angela and Lewis, Patrick and
+ Yazdani, Majid and De Cao, Nicola and
+ Thorne, James and Jernite, Yacine and
+ Karpukhin, Vladimir and Maillard, Jean and
+ Plachouras, Vassilis and Rockt{\"a}schel, Tim and
+ Riedel, Sebastian},
+ booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association 
+ for Computational Linguistics: Human Language Technologies",
+ month = "jun",
+ year = "2021",
+ address = "Online",
+ publisher = "Association for Computational Linguistics",
+ url = "https://aclanthology.org/2021.naacl-main.200",
+ doi = "10.18653/v1/2021.naacl-main.200",
+ pages = "2523--2544",
+}
+
 @article{Lawrie2022HC4,
  author = {Dawn Lawrie and James Mayfield and Douglas W. Oard and Eugene Yang},
  title = {HC4: A New Suite of Test Collections for Ad Hoc CLIR},

diff --git a/ir_datasets/docs/kilt.yaml b/ir_datasets/docs/kilt.yaml
@@ -0,0 +1,14 @@
+_:
+ pretty_name: 'KILT'
+ desc: '
+<p>
+KILT is a benchmark for "knowledge intensive language tasks"; this dataset provides the Wikipedia corpus that serves as its knowledge source.
+</p>
+<ul>
+<li>Documents: Wikipedia articles</li>
+<li><a href="https://github.com/facebookresearch/KILT">Repository</a></li>
+<li><a href="https://arxiv.org/abs/2009.02252">Paper</a></li>
+<li><a href="https://ai.facebook.com/tools/kilt/">Leaderboard</a></li>
+</ul>
+'
+ bibtex_ids: ['petroni-etal-2021-kilt']
diff --git a/ir_datasets/etc/downloads.json b/ir_datasets/etc/downloads.json
@@ -1452,6 +1452,15 @@
  }
 },
 
+"kilt": {
+ "knowledgesource": {
+ "url": "http://dl.fbaipublicfiles.com/KILT/kilt_knowledgesource.json",
+ "size_hint": 37318876722,
+ "expected_md5": "d1dca62aa6ba889d2e842182e3114af5",
+ "cache_path": "kilt_knowledgesource.json"
+ }
+},
+
 "lotte": {
  "source": {
  "url": "https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/lotte.tar.gz",

diff --git a/ir_datasets/etc/metadata.json b/ir_datasets/etc/metadata.json
@@ -189,6 +189,7 @@
  "highwire": {"docs": {"count": 162259, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
  "highwire/trec-genomics-2006": {"docs": {"_ref": "highwire"}, "queries": {"count": 28}, "qrels": {"count": 27999, "fields": {"relevance": {"counts_by_value": {"0": 24934, "1": 1237, "2": 1828}}}}},
  "highwire/trec-genomics-2007": {"docs": {"_ref": "highwire"}, "queries": {"count": 36}, "qrels": {"count": 35996, "fields": {"relevance": {"counts_by_value": {"0": 31501, "1": 4495}}}}},
+ "kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
  "lotte": {},
  "lotte/lifestyle/dev": {"docs": {"count": 268893, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}},
  "lotte/lifestyle/dev/forum": {"docs": {"_ref": "lotte/lifestyle/dev"}, "queries": {"count": 2076}, "qrels": {"count": 12823, "fields": {"relevance": {"counts_by_value": {"1": 12823}}}}},

diff --git a/test/integration/kilt.py b/test/integration/kilt.py