Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Scripts to create pretraining dataset #65

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions pretraining/createDESMonomersXtb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import openmm
import openmm.app as app
import openmm.unit as unit
import h5py
import os
from utils import compute_xtb_for_conformers, save_to_file
from rdkit import Chem
from openff.units import unit as ffunit
from openff.toolkit.typing.engines.smirnoff import ForceField
from openff.toolkit.topology import Molecule, Topology
from concurrent.futures import ThreadPoolExecutor

def createConformations(topology, system, mol, name):
"""Generate the conformations for a molecule and compute the energies and forces."""
print(f'Generating {name}')
try:
system.addForce(openmm.CMMotionRemover())
integrator = openmm.LangevinMiddleIntegrator(500*unit.kelvin, 1/unit.picosecond, 0.001*unit.picosecond)
simulation = app.Simulation(topology, system, integrator, openmm.Platform.getPlatformByName('Reference'))

# Generate 10 diverse starting points. Run MD from each one to generate a total
# of 100 conformations at each temperature.

mol.generate_conformers(n_conformers=10, rms_cutoff=0*ffunit.nanometers)
assert len(mol.conformers) == 10
states = []
for temperature in [100, 300, 500, 1000]:
for pos in mol.conformers:
simulation.context.setPositions(pos.m_as(ffunit.nanometers))
simulation.minimizeEnergy()
simulation.context.setVelocitiesToTemperature(temperature*unit.kelvin)
integrator.setTemperature(temperature*unit.kelvin)
for i in range(10):
simulation.step(10000)
state = simulation.context.getState(getPositions=True, getEnergy=True)
if state.getPotentialEnergy() < 1e4*unit.kilojoules_per_mole:
states.append(state)

# Add the conformations to the Molecule and put the atoms in canonical order.

mol._conformers = None
for state in states:
mol.add_conformer(state.getPositions(asNumpy=True))
mol = mol.canonical_order_atoms()

# Compute energies and forces with XTB.

positions, energies, formation_energies, grads = compute_xtb_for_conformers(mol)
return positions, energies, formation_energies, grads, mol, name
except Exception as ex:
print(name, ex)
return None, None, None, None, mol, name

# Create the conformations for the molecules.

forcefield = ForceField('openff_unconstrained-2.0.0.offxml')
outputfile = h5py.File('des-monomers.hdf5', 'w')
futures = []
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
for filename in os.listdir('../des370k/SDFS'):
if not filename.endswith('.sdf'):
continue
smiles = filename[:-4]
supp = Chem.SDMolSupplier(f'../des370k/SDFS/{filename}', sanitize=False, removeHs=False)
rdmol = list(supp)[0]
if rdmol.GetNumAtoms() > 1:
try:
mol = Molecule.from_rdkit(rdmol, allow_undefined_stereo=True, hydrogens_are_explicit=True)
topology = Topology.from_molecules([mol])
mmTopology = topology.to_openmm()
system = forcefield.create_openmm_system(topology)
futures.append(executor.submit(createConformations, mmTopology, system, mol, smiles))
except:
print(' failed to parametrize')

# Save the results to the output file.

for future in futures:
positions, energies, formation_energies, grads, mol, name = future.result()
if positions != None:
save_to_file(outputfile, mol, positions, energies, formation_energies, grads, name, 'DES Monomers')
121 changes: 121 additions & 0 deletions pretraining/createDipeptidesXtb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import openmm
import openmm.app as app
import openmm.unit as unit
import h5py
import os
from utils import compute_xtb_for_conformers, convert_to_openff, save_to_file
from openff.units import unit as ffunit
from collections import namedtuple
from concurrent.futures import ThreadPoolExecutor

def createDipeptide(forcefield, res1, res2, var1, var2):
"""Use PDBFixer to create a dipeptide with specified residues and variants."""
import pdbfixer
fixer = pdbfixer.PDBFixer(filename='../dipeptides/ala_ala.pdb')
fixer.missingResidues = {}
fixer.applyMutations([f'ALA-2-{res1}', f'ALA-3-{res2}'], 'A')
fixer.findMissingAtoms()
fixer.addMissingAtoms()
modeller = app.Modeller(fixer.topology, fixer.positions)
modeller.addHydrogens(forcefield=forcefield, variants=[None, var1, var2, None])
return modeller.topology, modeller.positions

def createConformations(forcefield, topology, positions, name, charges):
"""Generate the conformations for a molecule and compute the energies and forces."""
print(f'Generating {name}')
system = forcefield.createSystem(topology, constraints=None, rigidWater=False)
integrator = openmm.LangevinMiddleIntegrator(500*unit.kelvin, 1/unit.picosecond, 0.001*unit.picosecond)
simulation = app.Simulation(topology, system, integrator, openmm.Platform.getPlatformByName('Reference'))
simulation.context.setPositions(positions)
simulation.minimizeEnergy()

# Generate 10 diverse starting points. Run MD from each one to generate a total
# of 100 conformations at each temperature.

mol = convert_to_openff(simulation, charges)
mol.generate_conformers(n_conformers=10, rms_cutoff=0*ffunit.nanometers)
assert len(mol.conformers) == 10
states = []
for temperature in [100, 300, 500, 1000]:
for pos in mol.conformers:
simulation.context.setPositions(pos.m_as(ffunit.nanometers))
simulation.minimizeEnergy()
simulation.context.setVelocitiesToTemperature(temperature*unit.kelvin)
integrator.setTemperature(temperature*unit.kelvin)
for i in range(10):
simulation.step(10000)
state = simulation.context.getState(getPositions=True, getEnergy=True)
if state.getPotentialEnergy() < 1e4*unit.kilojoules_per_mole:
states.append(state)

# Add the conformations to the Molecule and put the atoms in canonical order.

mol._conformers = None
for state in states:
mol.add_conformer(state.getPositions(asNumpy=True))
mol = mol.canonical_order_atoms()

# Compute energies and forces with XTB.

positions, energies, formation_energies, grads = compute_xtb_for_conformers(mol)
return positions, energies, formation_energies, grads, mol, name

# Define the residue variants we will include.

Residue = namedtuple('Residue', ['name', 'variant', 'charges'])

residues = [
Residue('ALA', None, {}),
Residue('ASN', None, {}),
Residue('CYS', 'CYS', {}),
Residue('CYS', 'CYX', {'SG':[-1,True]}),
Residue('GLU', 'GLH', {}),
Residue('GLU', 'GLU', {'OE1':[-1,False], 'OE2':[-1,False]}),
Residue('HIS', 'HID', {}),
Residue('HIS', 'HIE', {}),
Residue('HIS', 'HIP', {'ND1':[1,True]}),
Residue('LEU', None, {}),
Residue('MET', None, {}),
Residue('PRO', None, {}),
Residue('THR', None, {}),
Residue('TYR', None, {}),
Residue('ARG', None, {'NH1':[1,False], 'NH2':[1,False]}),
Residue('ASP', 'ASH', {}),
Residue('ASP', 'ASP', {'OD1':[-1,False], 'OD2':[-1,False]}),
Residue('GLN', None, {}),
Residue('GLY', None, {}),
Residue('ILE', None, {}),
Residue('LYS', 'LYN', {}),
Residue('LYS', 'LYS', {'NZ':[1,True]}),
Residue('PHE', None, {}),
Residue('SER', None, {}),
Residue('TRP', None, {}),
Residue('VAL', None, {})
]

# Create the conformations for the molecules.

forcefield = app.ForceField('amber14-all.xml')
pdb = app.PDBFile('../dipeptides/disulfide.pdb')
futures = []
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
futures.append(executor.submit(createConformations, forcefield, pdb.topology, pdb.positions, 'Disulfide', {}))
for res1 in residues:
name1 = res1.name if res1.variant is None else res1.variant
for res2 in residues:
name2 = res2.name if res2.variant is None else res2.variant
topology, positions = createDipeptide(forcefield, res1.name, res2.name, res1.variant, res2.variant)
charges = {}
for atom in topology.atoms():
if atom.residue.index == 1 and atom.name in res1.charges:
charges[atom.index] = res1.charges[atom.name]
elif atom.residue.index == 2 and atom.name in res2.charges:
charges[atom.index] = res2.charges[atom.name]
futures.append(executor.submit(createConformations, forcefield, topology, positions, f'{name1}-{name2}', charges))

# Save the results to the output file.

outputfile = h5py.File('dipeptides.hdf5', 'w')
for future in futures:
positions, energies, formation_energies, grads, mol, name = future.result()
save_to_file(outputfile, mol, positions, energies, formation_energies, grads, name, 'Dipeptides')
138 changes: 138 additions & 0 deletions pretraining/createSolvatedAminoAcidsXtb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import openmm
import openmm.app as app
import openmm.unit as unit
import numpy as np
import h5py
from utils import compute_xtb_for_conformers, convert_to_openff, save_to_file
from collections import namedtuple

def createModel(forcefield, res, var):
"""Create a capped amino acid of the specified type and build a water box around it."""
import pdbfixer
fixer = pdbfixer.PDBFixer(filename='../solvated-amino-acids/ace_ala_nme.pdb')
fixer.missingResidues = {}
fixer.applyMutations([f'ALA-2-{res}'], 'A')
fixer.findMissingAtoms()
fixer.addMissingAtoms()
modeller = app.Modeller(fixer.topology, fixer.positions)
modeller.addHydrogens(forcefield=forcefield, variants=[None, var, None])
modeller.addSolvent(forcefield, boxSize=(2.2, 2.2, 2.2)*unit.nanometer)
return modeller.topology, modeller.positions

def createConformations(outputfile, forcefield, topology, positions, name, charges):
"""Generate the conformations for a molecule and save them to disk."""
print(f'Generating {name}')
system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffPeriodic, constraints=None, rigidWater=False)
system.addForce(openmm.MonteCarloBarostat(1*unit.bar, 300*unit.kelvin))
integrator = openmm.LangevinMiddleIntegrator(300*unit.kelvin, 1/unit.picosecond, 0.001*unit.picosecond)
simulation = app.Simulation(topology, system, integrator)
simulation.context.setPositions(positions)

# Equilibrate for 100 ps.

simulation.minimizeEnergy()
simulation.context.setVelocitiesToTemperature(300*unit.kelvin)
simulation.step(100000)

# Simulate for 5 ns, saving a state every 10 ps.

import mdtraj
atoms = list(topology.atoms())
soluteAtomIndex = [atom.index for atom in atoms if atom.residue.chain.index == 0]
waterAtomIndex = [atom.index for atom in atoms if atom.residue.name == 'HOH' and atom.name == 'O']
from openmm.app.internal import compiled
conformations = []
for i in range(509):
simulation.step(10000)
state = simulation.context.getState(getPositions=True, getEnergy=True)
assert state.getPotentialEnergy() < 0*unit.kilojoules_per_mole

# Identify the 20 water molecules closest to the solute.

periodicDistance = compiled.periodicDistance(state.getPeriodicBoxVectors().value_in_unit(unit.nanometer))
pos = state.getPositions().value_in_unit(unit.nanometer)
distances = [min(periodicDistance(pos[i], pos[j]) for j in soluteAtomIndex) for i in waterAtomIndex]
sortedIndices = np.argsort(distances)
waterToKeep = set(atoms[waterAtomIndex[i]].residue.index for i in sortedIndices[:20])

# Remove everything else from the Topology and coordinates.

modeller = app.Modeller(topology, state.getPositions())
toDelete = [res for res in topology.residues() if res.chain.index > 0 and res.index not in waterToKeep]
modeller.delete(toDelete)

# Center the solute with the water around it.

mdtop = mdtraj.Topology.from_openmm(modeller.topology)
xyz = np.array([modeller.positions.value_in_unit(unit.nanometer)])
traj = mdtraj.Trajectory(xyz, mdtop)
traj.unitcell_vectors = np.array([state.getPeriodicBoxVectors().value_in_unit(unit.nanometer)])
traj = traj.image_molecules()
conformations.append(traj.xyz[0]*unit.nanometer)

# Create an OpenFF molecule.

system = forcefield.createSystem(modeller.topology, constraints=None, rigidWater=False)
integrator = openmm.LangevinMiddleIntegrator(300*unit.kelvin, 1/unit.picosecond, 0.001*unit.picosecond)
simulation = app.Simulation(modeller.topology, system, integrator)
simulation.context.setPositions(conformations[-1])
mol = convert_to_openff(simulation, charges)

# Add the conformations to the Molecule and put the atoms in canonical order.

mol._conformers = None
for conf in conformations:
mol.add_conformer(conf)
mol = mol.canonical_order_atoms()

# Compute energies and forces with XTB.

positions, energies, formation_energies, grads = compute_xtb_for_conformers(mol)
return positions, energies, formation_energies, grads, mol

# Define the residue variants we will include.

Residue = namedtuple('Residue', ['name', 'variant', 'charges'])

residues = [
Residue('ALA', None, {}),
Residue('ASN', None, {}),
Residue('CYS', 'CYS', {}),
Residue('CYS', 'CYX', {'SG':[-1,True]}),
Residue('GLU', 'GLH', {}),
Residue('GLU', 'GLU', {'OE1':[-1,False], 'OE2':[-1,False]}),
Residue('HIS', 'HID', {}),
Residue('HIS', 'HIE', {}),
Residue('HIS', 'HIP', {'ND1':[1,True]}),
Residue('LEU', None, {}),
Residue('MET', None, {}),
Residue('PRO', None, {}),
Residue('THR', None, {}),
Residue('TYR', None, {}),
Residue('ARG', None, {'NH1':[1,False], 'NH2':[1,False]}),
Residue('ASP', 'ASH', {}),
Residue('ASP', 'ASP', {'OD1':[-1,False], 'OD2':[-1,False]}),
Residue('GLN', None, {}),
Residue('GLY', None, {}),
Residue('ILE', None, {}),
Residue('LYS', 'LYN', {}),
Residue('LYS', 'LYS', {'NZ':[1,True]}),
Residue('PHE', None, {}),
Residue('SER', None, {}),
Residue('TRP', None, {}),
Residue('VAL', None, {})
]

# Create the conformations for the molecules.

forcefield = app.ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
outputfile = h5py.File('solvated-amino-acids.hdf5', 'w')
for res in residues:
name = res.name if res.variant is None else res.variant
topology, positions = createModel(forcefield, res.name, res.variant)
charges = {}
for atom in topology.atoms():
if atom.residue.index == 1 and atom.name in res.charges:
charges[atom.index] = res.charges[atom.name]
positions, energies, formation_energies, grads, mol = createConformations(outputfile, forcefield, topology, positions, name, charges)
save_to_file(outputfile, mol, positions, energies, formation_energies, grads, name, 'Solvated Amino Acids')
Loading