Skip to content
Merged

kilt #161

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ir_datasets/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from . import gov
from . import gov2
from . import highwire
from . import kilt
from . import lotte
from . import medline
from . import mmarco
Expand Down
120 changes: 120 additions & 0 deletions ir_datasets/datasets/kilt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import json
import codecs
from typing import NamedTuple, Tuple
import ir_datasets
from ir_datasets.util import TarExtractAll, Cache, RelativePath, Lazy, Migrator
from ir_datasets.datasets.base import Dataset, YamlDocumentation, FilteredQueries
from ir_datasets.formats import BaseDocs
from ir_datasets.indices import PickleLz4FullStore

# Module-level logger; not referenced elsewhere in the visible part of this module.
_logger = ir_datasets.log.easy()


# Dataset identifier: used below for the on-disk directory name, the download
# config context, the docs namespace, and the registry key.
NAME = 'kilt'


class KiltDocAnchor(NamedTuple):
    """One entry of a KILT record's ``anchors`` list.

    The five fields are copied verbatim from the JSON keys of the same name.
    """
    # Anchor text as given in the record.
    text: str
    # Link target as given in the record.
    href: str
    # NOTE(review): presumably an index into the document's text pieces -- confirm.
    paragraph_id: int
    # NOTE(review): presumably character offsets of the anchor within that
    # paragraph -- confirm against the KILT knowledge-source format.
    start: int
    end: int


class KiltDoc(NamedTuple):
    """A single document of the KILT knowledge source.

    Field order is significant: ``KiltDocs.docs_iter`` builds instances
    positionally.
    """
    # Taken from the record's 'wikipedia_id'.
    doc_id: str
    # Taken from the record's 'wikipedia_title'.
    title: str
    # Concatenation of all text pieces after ``strip_markup`` is applied to each.
    text: str
    # The record's 'text' list, unmodified.
    text_pieces: Tuple[str, ...]
    # One KiltDocAnchor per entry of the record's 'anchors' list.
    anchors: Tuple[KiltDocAnchor, ...]
    # The record's 'categories' string split on ','.
    categories: Tuple[str, ...]
    # 'wikidata_info.wikidata_id' when present, otherwise ''.
    wikidata_id: str
    # The following come from the record's 'history' object; the three ids
    # are converted to str.
    history_revid: str
    history_timestamp: str
    history_parentid: str
    history_pageid: str
    history_url: str


def strip_markup(text):
    """Remove KILT structural markers from one text piece.

    Args:
        text: a single text piece (string).

    Returns:
        * If ``text`` starts with ``'Section::::'``: the text with every
          occurrence of that marker removed and every remaining ``':'``
          replaced by a space.
        * If ``text`` starts with ``'BULLET::::-'``: the text with every
          occurrence of that marker replaced by ``'-'``.
        * Otherwise: ``text`` unchanged.
    """
    section_marker = 'Section::::'
    bullet_marker = 'BULLET::::-'

    if text.startswith(section_marker):
        heading = text.replace(section_marker, '')
        # Remaining colons are turned into spaces as well.
        return heading.replace(':', ' ')
    if text.startswith(bullet_marker):
        return text.replace(bullet_marker, '-')
    return text


class KiltDocs(BaseDocs):
    """Documents handler for the KILT knowledge source.

    Every item produced by the streamer is parsed as one JSON object and
    converted into a ``KiltDoc``.
    """

    def __init__(self, streamer, count_hint=None):
        """
        Args:
            streamer: object whose ``stream()`` method returns a context
                manager yielding an iterable of JSON-encoded records.
            count_hint: optional document count, forwarded to the docstore.
        """
        super().__init__()
        self._streamer = streamer
        self._count_hint = count_hint

    @ir_datasets.util.use_docstore
    def docs_iter(self):
        """Yield one ``KiltDoc`` per record read from the streamer.

        Raises:
            KeyError: if a record lacks one of the required keys
                ('wikipedia_id', 'wikipedia_title', 'text', 'anchors',
                'categories', 'history' and its sub-keys).
        """
        with self._streamer.stream() as stream:
            for line in stream:
                record = json.loads(line)
                pieces = record['text']
                history = record['history']
                yield KiltDoc(
                    record['wikipedia_id'],
                    record['wikipedia_title'],
                    ''.join(strip_markup(piece) for piece in pieces),
                    tuple(pieces),
                    tuple(
                        KiltDocAnchor(
                            a['text'],
                            a['href'],
                            a['paragraph_id'],
                            a['start'],
                            a['end'],
                        )
                        for a in record['anchors']
                    ),
                    # NOTE(review): an empty 'categories' string yields ('',)
                    # rather than (); kept as-is so stored documents do not
                    # change -- confirm whether that is intended.
                    tuple(record['categories'].split(',')),
                    # 'wikidata_info' is the only optional part of a record.
                    record.get('wikidata_info', {}).get('wikidata_id', ''),
                    str(history['revid']),
                    history['timestamp'],
                    str(history['parentid']),
                    str(history['pageid']),
                    history['url'],
                )

    def docs_cls(self):
        """Return the document type produced by ``docs_iter``."""
        return KiltDoc

    def docs_store(self, field='doc_id'):
        """Return a ``PickleLz4FullStore`` over these documents.

        Args:
            field: passed to the store as ``lookup_field``.
        """
        return PickleLz4FullStore(
            path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
            init_iter_fn=self.docs_iter,
            data_cls=self.docs_cls(),
            lookup_field=field,
            index_fields=['doc_id'],
            count_hint=self._count_hint,
        )

    def docs_count(self):
        """Return the stored document count, or ``None`` if the store is not built."""
        # Build the store object once instead of once per call below.
        store = self.docs_store()
        if store.built():
            return store.count()
        return None

    def docs_namespace(self):
        """Return the namespace of the document ids (the dataset name)."""
        return NAME

    def docs_lang(self):
        """Return the language code of the documents."""
        return 'en'


def _init():
    """Construct the KILT dataset and register it with ``ir_datasets.registry``.

    Returns:
        A ``(base, subsets)`` tuple: ``base`` is the docs-only ``Dataset``
        registered under ``NAME``; ``subsets`` is a dict of sub-datasets
        keyed by name (currently empty).
    """
    base_path = ir_datasets.util.home_path()/NAME
    dlc = ir_datasets.util.DownloadConfig.context(NAME, base_path)
    documentation = YamlDocumentation(f'docs/{NAME}.yaml')

    base = Dataset(
        KiltDocs(dlc['knowledgesource'], count_hint=5903530),
        documentation('_'))

    # No sub-datasets are defined yet; the loop below registers any that
    # are added to this dict.
    subsets = {}

    ir_datasets.registry.register(NAME, base)
    for s in sorted(subsets):
        ir_datasets.registry.register(f'{NAME}/{s}', subsets[s])

    return base, subsets


# Build and register the dataset at import time; expose the results as module attributes.
base, subsets = _init()
20 changes: 20 additions & 0 deletions ir_datasets/docs/bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,26 @@ @article{Santhanam2021ColBERTv2
url = "https://arxiv.org/abs/2112.01488"
}

@inproceedings{petroni-etal-2021-kilt,
title = "{KILT}: a Benchmark for Knowledge Intensive Language Tasks",
author = {Petroni, Fabio and Piktus, Aleksandra and
Fan, Angela and Lewis, Patrick and
Yazdani, Majid and De Cao, Nicola and
Thorne, James and Jernite, Yacine and
Karpukhin, Vladimir and Maillard, Jean and
Plachouras, Vassilis and Rockt{\"a}schel, Tim and
Riedel, Sebastian},
booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association
for Computational Linguistics: Human Language Technologies",
month = "jun",
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.naacl-main.200",
doi = "10.18653/v1/2021.naacl-main.200",
pages = "2523--2544",
}

@article{Lawrie2022HC4,
author = {Dawn Lawrie and James Mayfield and Douglas W. Oard and Eugene Yang},
title = {HC4: A New Suite of Test Collections for Ad Hoc CLIR},
Expand Down
14 changes: 14 additions & 0 deletions ir_datasets/docs/kilt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
_:
pretty_name: 'KILT'
desc: '
<p>
KILT is a corpus used for various "knowledge intensive language tasks".
</p>
<ul>
<li>Documents: Wikipedia articles</li>
<li><a href="https://github.com/facebookresearch/KILT">Repository</a></li>
<li><a href="https://arxiv.org/abs/2009.02252">Paper</a></li>
<li><a href="https://ai.facebook.com/tools/kilt/">Leaderboard</a></li>
</ul>
'
bibtex_ids: ['petroni-etal-2021-kilt']
9 changes: 9 additions & 0 deletions ir_datasets/etc/downloads.json
Original file line number Diff line number Diff line change
Expand Up @@ -1452,6 +1452,15 @@
}
},

"kilt": {
"knowledgesource": {
"url": "http://dl.fbaipublicfiles.com/KILT/kilt_knowledgesource.json",
"size_hint": 37318876722,
"expected_md5": "d1dca62aa6ba889d2e842182e3114af5",
"cache_path": "kilt_knowledgesource.json"
}
},

"lotte": {
"source": {
"url": "https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/lotte.tar.gz",
Expand Down
1 change: 1 addition & 0 deletions ir_datasets/etc/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@
"highwire": {"docs": {"count": 162259, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"highwire/trec-genomics-2006": {"docs": {"_ref": "highwire"}, "queries": {"count": 28}, "qrels": {"count": 27999, "fields": {"relevance": {"counts_by_value": {"0": 24934, "1": 1237, "2": 1828}}}}},
"highwire/trec-genomics-2007": {"docs": {"_ref": "highwire"}, "queries": {"count": 36}, "qrels": {"count": 35996, "fields": {"relevance": {"counts_by_value": {"0": 31501, "1": 4495}}}}},
"kilt": {"docs": {"count": 5903530, "fields": {"doc_id": {"max_len": 8, "common_prefix": ""}}}},
"lotte": {},
"lotte/lifestyle/dev": {"docs": {"count": 268893, "fields": {"doc_id": {"max_len": 6, "common_prefix": ""}}}},
"lotte/lifestyle/dev/forum": {"docs": {"_ref": "lotte/lifestyle/dev"}, "queries": {"count": 2076}, "qrels": {"count": 12823, "fields": {"relevance": {"counts_by_value": {"1": 12823}}}}},
Expand Down
16 changes: 16 additions & 0 deletions test/integration/kilt.py

Large diffs are not rendered by default.