|
| 1 | +import ir_datasets |
| 2 | +from ir_datasets.util import GzipExtract, TarExtract, Lazy, DownloadConfig |
| 3 | +from ir_datasets.formats import TrecQrels, TrecDocs, TrecQueries |
| 4 | +from ir_datasets.datasets.base import Dataset, FilteredQueries, FilteredQrels, YamlDocumentation |
| 5 | + |
| 6 | + |
# Dataset identifier: used below as the registry prefix, the document/query
# namespace, the home sub-directory name and the documentation file stem.
NAME = 'disks45'


# Relevance grade -> label; handed to TrecQrels for the Robust 2004 judgments.
QREL_DEFS = {
    2: 'highly relevant',
    1: 'relevant',
    0: 'not relevant',
}

# Relevance grade -> label; handed to TrecQrels for the TREC-7 and TREC-8
# judgments, which only use two grades here.
QREL_DEFS_TREC78 = {
    1: 'relevant',
    0: 'not relevant',
}

# Data usage agreement prompt, passed as ``dua=`` to DownloadConfig.context().
DUA = (
    "Please confirm you agree to the TREC data usage agreement found at "
    "<https://trec.nist.gov/data/cd45/index.html>"
)
| 23 | + |
| 24 | + |
# Five-way split of the Robust 2004 query IDs, keyed by fold name.
# Folds from Huston & Croft 2014 <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.646.7749>
ROBUST04_FOLDS = {
    'fold1': {
        '302', '303', '309', '316', '317', '319', '323', '331', '336', '341',
        '356', '357', '370', '373', '378', '381', '383', '392', '394', '406',
        '410', '411', '414', '426', '428', '433', '447', '448', '601', '607',
        '608', '612', '617', '619', '635', '641', '642', '646', '647', '654',
        '656', '662', '665', '669', '670', '679', '684', '690', '692', '700',
    },
    'fold2': {
        '301', '308', '312', '322', '327', '328', '338', '343', '348', '349',
        '352', '360', '364', '365', '369', '371', '374', '386', '390', '397',
        '403', '419', '422', '423', '424', '432', '434', '440', '446', '602',
        '604', '611', '623', '624', '627', '632', '638', '643', '651', '652',
        '663', '674', '675', '678', '680', '683', '688', '689', '695', '698',
    },
    'fold3': {
        '306', '307', '313', '321', '324', '326', '334', '347', '351', '354',
        '358', '361', '362', '363', '376', '380', '382', '396', '404', '413',
        '415', '417', '427', '436', '437', '439', '444', '445', '449', '450',
        '603', '605', '606', '614', '620', '622', '626', '628', '631', '637',
        '644', '648', '661', '664', '666', '671', '677', '685', '687', '693',
    },
    'fold4': {
        '320', '325', '330', '332', '335', '337', '342', '344', '350', '355',
        '368', '377', '379', '387', '393', '398', '402', '405', '407', '408',
        '412', '420', '421', '425', '430', '431', '435', '438', '616', '618',
        '625', '630', '633', '636', '639', '649', '650', '653', '655', '657',
        '659', '667', '668', '672', '673', '676', '682', '686', '691', '697',
    },
    'fold5': {
        '304', '305', '310', '311', '314', '315', '318', '329', '333', '339',
        '340', '345', '346', '353', '359', '366', '367', '372', '375', '384',
        '385', '388', '389', '391', '395', '399', '400', '401', '409', '416',
        '418', '429', '441', '442', '443', '609', '610', '613', '615', '621',
        '629', '634', '640', '645', '658', '660', '681', '694', '696', '699',
    },
}
| 33 | + |
| 34 | + |
def _init():
    """Build every ``disks45`` dataset object and register it.

    Creates the shared document collection, the Robust 2004 queries/qrels
    (plus one filtered dataset per entry of ``ROBUST04_FOLDS``), and the
    TREC-8 and TREC-7 datasets, then registers the base dataset under
    ``NAME`` and each subset under ``f'{NAME}/{subset_id}'``.

    Returns:
        tuple: ``(base, subsets)`` where ``base`` is the documentation-only
        top-level ``Dataset`` and ``subsets`` is a dict mapping each subset
        ID (relative to ``NAME``) to its ``Dataset``.
    """
    yaml_docs = YamlDocumentation(f'docs/{NAME}.yaml')
    data_dir = ir_datasets.util.home_path()/NAME
    downloads = DownloadConfig.context(NAME, data_dir, dua=DUA)

    def english_topics(download_key):
        # Every topic file used here goes through the same
        # GzipExtract -> TrecQueries pipeline with identical options.
        return TrecQueries(GzipExtract(downloads[download_key]), namespace=NAME, lang='en')

    # One document collection is shared by all subsets below.
    nocr_docs = TrecDocs(
        downloads['docs'],
        path_globs=['**/FBIS/FB*', '**/FR94/??/FR*', '**/FT/*/FT*', '**/LATIMES/LA*'],
        namespace=NAME,
        lang='en',
        expected_file_count=2295,
        count_hint=ir_datasets.util.count_hint(NAME),
        parser='sax',
        docstore_path=data_dir/'corpus.nocr.pklz4',
    )

    robust_topics = english_topics('robust04-queries')
    robust_judgments = TrecQrels(downloads['robust04-qrels'], QREL_DEFS)

    base = Dataset(yaml_docs('_'))

    by_id = {}
    by_id['nocr'] = Dataset(nocr_docs, yaml_docs('nocr'))
    by_id['nocr/trec-robust-2004'] = Dataset(
        nocr_docs,
        robust_topics,
        robust_judgments,
        yaml_docs('nocr/trec-robust-2004'),
    )

    # Per-fold views restrict both queries and qrels to the same query IDs.
    for fold_name in ROBUST04_FOLDS:
        fold_qids = make_filter(fold_name)
        fold_id = f'nocr/trec-robust-2004/{fold_name}'
        by_id[fold_id] = Dataset(
            nocr_docs,
            FilteredQueries(robust_topics, fold_qids),
            FilteredQrels(robust_judgments, fold_qids),
            yaml_docs(fold_id),
        )

    by_id['nocr/trec8'] = Dataset(
        nocr_docs,
        TrecQrels(TarExtract(downloads['trec8-qrels'], 'qrels.trec8.adhoc.parts1-5'), QREL_DEFS_TREC78),
        english_topics('trec8-queries'),
        yaml_docs('nocr/trec8'),
    )

    # The TREC-7 judgments are read from five gzipped members of one tar
    # archive (part1 .. part5), passed to TrecQrels as a list.
    trec7_qrel_parts = [
        GzipExtract(TarExtract(downloads['trec7-qrels'], f'qrels.trec7.adhoc.part{part}.gz'))
        for part in range(1, 6)
    ]
    by_id['nocr/trec7'] = Dataset(
        nocr_docs,
        TrecQrels(trec7_qrel_parts, QREL_DEFS_TREC78),
        english_topics('trec7-queries'),
        yaml_docs('nocr/trec7'),
    )

    ir_datasets.registry.register(NAME, base)
    # Register in sorted ID order; keys are unique, so the datasets
    # themselves are never compared during the sort.
    for subset_id, dataset in sorted(by_id.items()):
        ir_datasets.registry.register(f'{NAME}/{subset_id}', dataset)

    return base, by_id
| 96 | + |
| 97 | + |
def make_filter(fold):
    """Return a ``Lazy`` wrapping a lookup of ``ROBUST04_FOLDS[fold]``.

    Building the callable inside its own function gives each call a separate
    closure over ``fold``, so filters created in a loop do not all end up
    referring to the loop's final value.

    Args:
        fold: A key of ``ROBUST04_FOLDS`` (e.g. ``'fold1'``).

    Returns:
        A ``Lazy`` object whose wrapped callable yields the set of query IDs
        for ``fold``. Presumably ``Lazy`` defers the call until first use;
        confirm against ``ir_datasets.util.Lazy``.
    """
    def _fold_query_ids():
        return ROBUST04_FOLDS[fold]

    return Lazy(_fold_query_ids)
| 100 | + |
| 101 | + |
# Build and register all datasets at import time; the resulting objects are
# also exposed as the module-level names ``base`` and ``subsets``.
base, subsets = _init()
0 commit comments