-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathstandardize_compounds.py
More file actions
executable file
·140 lines (123 loc) · 4.9 KB
/
standardize_compounds.py
File metadata and controls
executable file
·140 lines (123 loc) · 4.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from __future__ import print_function
import sys
import os
from rdkit import Chem
import glob
from molvs import tautomer
# from molvs import Standardizer
import json
import argparse
parser = argparse.ArgumentParser()
# To make the input integers
parser.add_argument('--mol_indices', nargs='+', type=int)
with open("/global/homes/b/bpb/repos/md2st/original_molecules.json", "r") as fid:
all_molecules = json.load(fid)
save_path = '/global/homes/b/bpb/repos/md2st/sanitized_molecules_2'
if not os.path.exists(save_path):
os.makedirs(save_path)
""" contribution from Hans de Winter """
def _InitialiseNeutralisationReactions():
patts= (
# Imidazoles
('[n+;H]','n'),
# Amines
('[N+;!H0]','N'),
# Carboxylic acids and alcohols
('[$([O-]);!$([O-][#7])]','O'),
# Thiols
('[S-;X1]','S'),
# Sulfonamides
('[$([N-;X2]S(=O)=O)]','N'),
# Enamines
('[$([N-;X2][C,N]=C)]','N'),
# Tetrazoles
('[n-]','[nH]'),
# Sulfoxides
('[$([S-]=O)]','S'),
# Amides
('[$([N-]C=O)]','N'),
)
return [(Chem.MolFromSmarts(x),Chem.MolFromSmiles(y)) for x,y in patts]
_reactions=None
def neutralise_charges(mol, reactions=None):
global _reactions
if reactions is None:
if _reactions is None:
_reactions=_InitialiseNeutralisationReactions()
reactions=_reactions
replaced = False
for i,(reactant, product) in enumerate(reactions):
while mol.HasSubstructMatch(reactant):
replaced = True
rms = Chem.AllChem.ReplaceSubstructs(mol, reactant, product, replaceAll=True)
mol = rms[0]
if replaced:
Chem.SanitizeMol(mol)
Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True)
return (mol, True)
else:
return (mol, False)
def desalt(mol):
#input is an rdkit mol
#returns an rdkit mol keeping the biggest component
#returns original mol if only one component
#returns a boolean indicated if cleaning was necessary
d = Chem.rdmolops.GetMolFrags(mol) #these are atom indices
if len(d) == 1: #If there are fragments or multiple molecules this will be greater than 1
return mol,False
my_smiles=Chem.MolToSmiles(mol,True)
parent_atom_count=0;
disconnected=my_smiles.split('.')
#With GetMolFrags, we've already established that there is more than one disconnected structure
status = False
for s in disconnected:
little_mol = Chem.MolFromSmiles(s,sanitize=True)
#Sanitize=True will fail for choline sulfate. Can't sanitize the radical.
if little_mol is not None:
count = little_mol.GetNumAtoms()
if count > parent_atom_count:
parent_atom_count = count
parent_mol = little_mol
status = True
return parent_mol,status
def force_to_unicode(text):
"If text is unicode, it is returned as is. If it's str, convert it to Unicode using UTF-8 encoding"
return text if isinstance(text, unicode) else text.decode('utf8')
def mol_to_neutral_desalted_canonical(mol_entry,filename):
"""
given an rdkit mol and filename
convert the mol into a standard form and
save it as an mdl mol file.
"""
canon = tautomer.TautomerCanonicalizer()
if mol_entry['original_smiles'] is not None:
mol = Chem.MolFromSmiles(mol_entry['original_smiles'])
# print('%s SMILES: %s'%(force_to_unicode(mol_entry['name']),mol_entry['original_smiles']))
print('SMILES: %s'%(mol_entry['original_smiles']))
if (not '*' in mol_entry['original_smiles']) and (mol is not None) and (mol.GetNumAtoms()>0):
Chem.SanitizeMol(mol)
Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True)
mol, status = neutralise_charges(mol)
mol, status = desalt(mol)
try:
mol = standardizer.standardize(mol)
Chem.SanitizeMol(mol)
Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True)
except:
Chem.SanitizeMol(mol)
Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True)
mol = canon.canonicalize(mol)
Chem.SanitizeMol(mol)
Chem.rdmolops.Kekulize(mol, clearAromaticFlags=True)
smiles = Chem.MolToSmiles(mol,isomericSmiles=True)
inchikey = Chem.InchiToInchiKey(Chem.MolToInchi(mol))
mol_entry['standardized_smiles'] = smiles
mol_entry['standardized_inchikey'] = inchikey
with open(filename, 'w') as fid:
json.dump(mol_entry, fid)
# Chem.MolToMolFile(mol,filename,includeStereo=True)
args = parser.parse_args()
for idx in args.mol_indices:
output_filename = os.path.join(save_path,'%d_%s.json'%(idx,all_molecules[idx]['unique_id']))
if not os.path.isfile(output_filename):
mol_to_neutral_desalted_canonical(all_molecules[idx],output_filename)