Skip to content

Sanitize SDF files #48

@proteneer

Description

@proteneer

The current SDF files contain about 40 molecules that are non-neutral. Here's a script that regenerates correct ones.

import csv
import os
from rdkit import Chem
from rdkit.Chem import AllChem

def is_neutral(mol):
    """Return True when the formal charges of all atoms in *mol* sum to zero.

    *mol* only needs to expose ``GetAtoms()``, whose items expose
    ``GetFormalCharge()``. A molecule with no atoms counts as neutral.
    """
    total_formal_charge = sum(
        atom.GetFormalCharge() for atom in mol.GetAtoms()
    )
    return total_formal_charge == 0

# Rebuild freesolv.sdf from database.txt: every data row becomes one molecule
# with explicit hydrogens and an embedded 3D conformer, tagged with its name
# and the dG / dG_err values read from the row.
mols = []

# Number of molecules for which MMFFOptimizeMolecule returned a non-zero
# status; these are still written out, the count is only reported.
mmff_fail_count = 0

with open('database.txt', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')
    for line, row in enumerate(spamreader):
        # The first three lines of the file are not data rows.
        if line <= 2:
            continue
        # csv.reader yields an empty list for a blank line; nothing to parse.
        if not row:
            continue

        name = row[0]
        smiles = row[1]

        # MolFromSmiles signals a parse failure by returning None, not by
        # raising, so check explicitly to get a message naming the bad row.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(
                "line {}: could not parse SMILES {!r} for {!r}".format(
                    line, smiles, name))
        mol = Chem.AddHs(mol)

        print(smiles)

        # Explicit raise instead of assert: the check must survive
        # `python -O`. It runs before the 3D steps because embedding and
        # optimization do not alter formal charges.
        if not is_neutral(mol):
            raise ValueError(
                "line {}: molecule {!r} ({!r}) is not neutral".format(
                    line, name, smiles))

        # EmbedMolecule returns 0 (the conformer id) on success.
        res = AllChem.EmbedMolecule(mol)
        if res != 0:
            raise RuntimeError(
                "line {}: 3D embedding failed for {!r} ({!r})".format(
                    line, name, smiles))

        # A non-zero status here is tolerated and only counted.
        res = AllChem.MMFFOptimizeMolecule(mol)
        if res != 0:
            mmff_fail_count += 1

        # Round-trip through float so malformed numbers fail here rather
        # than being written into the SDF verbatim.
        exp_dG = float(row[3])
        exp_dG_err = float(row[4])

        mol.SetProp('_Name', name)
        mol.SetProp('dG', str(exp_dG))
        mol.SetProp('dG_err', str(exp_dG_err))

        mols.append(mol)

print("mm_fail", mmff_fail_count)

w = Chem.SDWriter('freesolv.sdf')
try:
    for m in mols:
        w.write(m)
finally:
    # close() flushes pending output and releases the file handle, even if
    # writing one of the molecules fails.
    w.close()

print("wrote", len(mols), "mols")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions