Skip to content

Commit f1a6079

Browse files
authored
Merge pull request #97 from egpbos/custom_doi_field_csv
Custom DOI field in `load_csv`
2 parents fcb82e8 + b59a9ad commit f1a6079

File tree

3 files changed

+32
-2
lines changed

3 files changed

+32
-2
lines changed

litstudy/sources/csv.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def publication_date(self):
116116

117117
for fmt in formats:
118118
try:
119-
return datetime.strptime(text, fmt)
119+
return datetime.datetime.strptime(text, fmt)
120120
except Exception:
121121
pass
122122

@@ -167,6 +167,7 @@ def load_csv(
167167
citation_field: str = None,
168168
date_field: str = None,
169169
source_field: str = None,
170+
doi_field: str = None,
170171
filter=None,
171172
) -> DocumentSet:
172173
"""Load an arbitrary CSV file and parse its contents as a ``DocumentSet``
@@ -190,6 +191,8 @@ def load_csv(
190191
:param abstract_field: Field name for ``abstract``.
191192
:param citation_field: Field name for ``citation_count``.
192193
:param date_field: Field name for ``publication_date`` or
194+
:param source_field: Field name for ``source``.
195+
:param doi_field: Field name for ``doi``.
193196
:param filter: Optional function applied to each loaded record. This
194197
function can be used to, for example, add or delete fields.
195198
@@ -309,7 +312,8 @@ def load_csv(
309312
"pubmed id",
310313
],
311314
),
312-
doi=find_field(
315+
doi=doi_field
316+
or find_field(
313317
columns,
314318
[
315319
"doi",
tests/resources/retraction_watch.csv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,ArticleType,RetractionDate,RetractionDOI,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes
2+
4242,Reflections on Research Software,(B/T) Computer Science;(B/T) Data Science;(B/T) Technology;,"Netherlands fScience Center, Nieuw-Amsterdam, Netherlands",Journal of Prominent Things,Prominence Inc,Netherlands,Patrick Bos,,Fake Research Article;,7/31/2024 14:00,10.4242/2024/01,0,7/31/2024 13:59,10.4242/2024/00,0,Retraction,+Concerns/Issues About Reality;+Randomly Generated Content;,No,This is a made-up dummy entry.

tests/test_sources_csv.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,27 @@ def test_load_scopus_csv():
4848

4949
assert len(doc.authors) == 10
5050
assert doc.authors[0].name == "Phillips J.C."
51+
52+
def test_load_retraction_watch_csv():
53+
path = os.path.dirname(__file__) + "/resources/retraction_watch.csv"
54+
55+
# let's also go out of our way to make the date field work:
56+
def date_filter(d: dict) -> dict:
57+
import datetime
58+
try:
59+
d["date"] = datetime.datetime.strptime(d["OriginalPaperDate"], "%m/%d/%Y %H:%M").date().isoformat()
60+
print(d["date"])
61+
except ValueError:
62+
pass
63+
return d
64+
65+
docs = load_csv(path, doi_field="OriginalPaperDOI", source_field="Journal", filter=date_filter)
66+
doc = docs[0]
67+
68+
assert doc.title == "Reflections on Research Software"
69+
assert doc.publication_source == "Journal of Prominent Things"
70+
assert doc.language is None
71+
assert doc.publication_year == 2024
72+
73+
assert len(doc.authors) == 1
74+
assert doc.authors[0].name == "Patrick Bos"

0 commit comments

Comments
 (0)