@@ -21,19 +21,19 @@ class JsonlDocumentOffset(NamedTuple):
21
21
offset_end : int
22
22
23
23
24
- class TrecToT2025Doc ():
25
- def __init__ (self , json_doc ):
26
- parsed_doc = json .loads (json_doc )
27
- self .doc_id = parsed_doc ["id" ]
28
- self .title = parsed_doc ["title" ]
29
- self .url = parsed_doc ["url" ]
30
- self .text = parsed_doc ["text" ]
24
+ class TrecToT2025Doc (NamedTuple ):
25
+ doc_id : str
26
+ title : str
27
+ url : str
28
+ text : str
29
+
30
+ @classmethod
31
+ def _from_json (cls , json_doc ): # alternate constructor: json_doc is an already-parsed JSON mapping with "id", "title", "url" and "text" keys
32
+ return cls (json_doc ["id" ], json_doc ["title" ], json_doc ["url" ], json_doc ["text" ])
31
33
32
34
def default_text (self ):
33
35
return self .title + " " + self .text
34
36
35
- def _asdict (self ):
36
- return {"doc_id" : self .doc_id , "text" : self .default_text ()}
37
37
38
38
class JsonlWithOffsetsDocsStore (Docstore ):
39
39
def __init__ (self , docs , offsets ):
@@ -71,7 +71,7 @@ def get_many_iter(self, doc_ids):
71
71
class TrecToT2025DocsStore (JsonlWithOffsetsDocsStore ):
72
72
def get_many_iter (self , doc_ids ):
73
73
for i in super ().get_many_iter (doc_ids ):
74
- yield TrecToT2025Doc (i )
74
+ yield TrecToT2025Doc . _from_json ( json . loads ( i ) ) # the removed constructor ran json.loads on this value, so parse it before handing it to _from_json
75
75
76
76
77
77
class JsonlDocumentsWithOffsets (BaseDocs ):
@@ -82,10 +82,10 @@ def __init__(self, docs, offsets):
82
82
def docs_iter (self ):
83
83
with gzip .open (self .__docs .path ()) as f :
84
84
for l in f :
85
- yield TrecToT2025Doc ( l )
85
+ yield TrecToT2025Doc . _from_json ( json . loads ( l ) )
86
86
87
87
def docs_cls (self ):
88
- return self . _cls
88
+ return TrecToT2025Doc
89
89
90
90
def docs_store (self , field = 'doc_id' ):
91
91
return TrecToT2025DocsStore (self .__docs , self .__offsets )
0 commit comments