Skip to content

Commit 5f66a7a

Browse files
committed
extract formulas of biomolecules from PubChem and write them as JSON
1 parent d16270f commit 5f66a7a

File tree

1 file changed

+18
-4
lines changed

1 file changed

+18
-4
lines changed

python/pubchem.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from rdkit.Chem.rdmolops import GetFormalCharge
1111
import gzip
1212
from collections import Counter
13+
import json, glob, os
1314

1415
def filter_pubchem(ms):
1516
ms_filtered = []
@@ -37,8 +38,21 @@ def filter_pubchem(ms):
3738
ms_filtered.append(CalcMolFormula(m))
3839
return ms_filtered
3940

41+
def write_json(ms, fname):
    """Serialize ``ms`` to the file ``fname`` as JSON.

    Args:
        ms: A JSON-serializable object; the caller in this script passes
            the list returned by ``filter_pubchem``.
        fname: Path of the output file. It is opened in text write mode,
            so any existing file at that path is overwritten.

    Raises:
        TypeError: If ``ms`` contains an object ``json`` cannot serialize.
        IOError/OSError: If ``fname`` cannot be opened for writing.
    """
    # The context manager closes the handle even when json.dump raises,
    # which the previous explicit open()/close() pair did not guarantee.
    with open(fname, 'w') as f:
        json.dump(ms, f)
45+
46+
47+
# Convert every gzipped SDF archive of the PubChem monthly dump into a JSON
# file holding the output of filter_pubchem for that archive.
sdf_path = 'F:/resources/isotope/pubchem/ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/2016-12-01/SDF/'
sdfs = glob.glob(sdf_path + '*.sdf.gz')
for i, sdf in enumerate(sdfs):
    # Progress: percentage of archives visited so far, then the current path.
    print('{:2.4f}% {}'.format((100 * float(i)/len(sdfs)), sdf))

    # Dropping the trailing 'sdf.gz' (6 chars) keeps the dot, so
    # 'name.sdf.gz' maps to 'name.json'.
    json_path = sdf[0:-6]+'json'

    # An existing JSON file marks the archive as already processed, which
    # lets an interrupted run be resumed without redoing finished work.
    if os.path.exists(json_path):
        continue

    # Close the gzip handle deterministically; previously one handle per
    # archive stayed open until garbage collection.
    with gzip.open(sdf) as sdf_gz:
        gzsuppl = Chem.ForwardSDMolSupplier(sdf_gz)
        # The list is built inside the `with` block, while the stream the
        # supplier reads from is still open. Entries that come back as None
        # are skipped.
        ms = [Chem.AddHs(x) for x in gzsuppl if x is not None]

    ms_filtered = filter_pubchem(ms)
    write_json(ms_filtered, json_path)
57+
4058

41-
inf = gzip.open('F:/resources/isotope/pubchem/ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/2016-12-01/SDF/Compound_000000001_000025000.sdf.gz')
42-
gzsuppl = Chem.ForwardSDMolSupplier(inf)
43-
ms = [Chem.AddHs(x) for x in gzsuppl if x is not None]
44-
ms_filtered = filter_pubchem(ms)

0 commit comments

Comments
 (0)