diff --git a/tutorials/3_tutorial_read_from_smiles.ipynb b/tutorials/3_tutorial_read_from_smiles.ipynb new file mode 100644 index 00000000..e9dbc523 --- /dev/null +++ b/tutorials/3_tutorial_read_from_smiles.ipynb @@ -0,0 +1,415 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9c4c7edd", + "metadata": {}, + "source": [ + "# FEgrow: An Open-Source Molecular Builder and Free Energy Preparation Workflow\n", + "\n", + "**Authors: Mateusz K Bieniek, Ben Cree, Rachael Pirie, Joshua T. Horton, Natalie J. Tatum, Daniel J. Cole**" + ] + }, + { + "cell_type": "markdown", + "id": "ba86bf5e", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "Building and scoring molecules can be further streamlined by employing our established protocol. Here we show how to quickly build a library and score the entire library. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10a1f1b0", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import prody\n", + "from rdkit import Chem\n", + "\n", + "import fegrow\n", + "from fegrow import ChemSpace\n", + "\n", + "from fegrow.testing import core_5R83_path, rec_5R83_path, data_5R83_path" + ] + }, + { + "cell_type": "markdown", + "id": "cd53ff25", + "metadata": {}, + "source": [ + "# Prepare the ligand template" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8c3547f", + "metadata": {}, + "outputs": [], + "source": [ + "scaffold = Chem.SDMolSupplier('sarscov2/mini.sdf')[0]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac6b40a8", + "metadata": {}, + "outputs": [], + "source": [ + "toview = fegrow.RMol(scaffold)\n", + "toview.rep2D(idx=True, size=(500, 500))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32fb23d9", + "metadata": {}, + "outputs": [], + "source": [ + "with open('sarscov2/SARS-smiles.txt') as f:\n", + " mols = f.read().splitlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "f9eadd91", + "metadata": {}, + "outputs": [], + "source": [ + "Chem.MolFromSmiles(mols[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6455b18e", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = scaffold\n", + "\n", + "for i in range(len(mols)):\n", + " mol = Chem.MolFromSmiles(mols[i])\n", + " if mol.HasSubstructMatch(pattern) == False:\n", + " print(i, mols[i])" + ] + }, + { + "cell_type": "markdown", + "id": "6f36f867", + "metadata": {}, + "source": [ + "Because the Smiles used here were prepared in advance and already contain the scaffold as a substructure, there is no need to set a growing vector." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "063a03d1", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.distributed import LocalCluster\n", + "lc = LocalCluster(processes=True, n_workers=None, threads_per_worker=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "538b49cb", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# create the chemical space\n", + "cs = ChemSpace(dask_cluster=lc)\n", + "cs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6d66d1e", + "metadata": {}, + "outputs": [], + "source": [ + "#cs._dask_cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c04a9ef", + "metadata": {}, + "outputs": [], + "source": [ + "# we're not growing the scaffold, we're superimposing bigger molecules on it\n", + "cs.add_scaffold(scaffold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "689d4e59", + "metadata": {}, + "outputs": [], + "source": [ + "# get the protein-ligand complex structure\n", + "!wget -nc https://files.rcsb.org/download/7L10.pdb\n", + "\n", + "# load the complex with the ligand\n", + "sys = prody.parsePDB('sarscov2/7l10.pdb')\n", + "\n", + "# remove any unwanted molecules\n", + "rec = sys.select('not (nucleic or hetatm or water)')\n", + "\n", + "# 
save the processed protein\n", + "prody.writePDB('rec.pdb', rec)\n", + "\n", + "# fix the receptor file (missing residues, protonation, etc)\n", + "fegrow.fix_receptor(\"rec.pdb\", \"rec_final.pdb\")\n", + "\n", + "# load back into prody\n", + "#rec_final = prody.parsePDB(\"rec_final.pdb\")\n", + "#rec_final = prody.parsePDB(\"out.pdb\")\n", + "\n", + "# fix the receptor file (missing residues, protonation, etc)\n", + "##fegrow.fix_receptor(\"7t79-H-prep.pdb\", \"rec_final.pdb\")\n", + "\n", + "# load back into prody\n", + "##rec_final = prody.parsePDB(\"rec_final.pdb\")\n", + "\n", + "#!grep \"ATOM\" ../structures/7t79-H.pdb > rec_final.pdb\n", + "#cs.add_protein(rec_5R83_path)\n", + "\n", + "cs.add_protein('rec_final.pdb')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ace77bec", + "metadata": {}, + "outputs": [], + "source": [ + "smiles = mols[0:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a428e2b", + "metadata": {}, + "outputs": [], + "source": [ + "print(smiles)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed6471a8", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# alternatively, load a larger set of Smiles (e.g. 50k) from a CSV file\n", + "#smiles = pd.read_csv('csv/arthor-hits-2024Mar26-0918.csv',\n", + "# names=[\"Smiles\", \"??\", \"db\"],\n", + "# index_col=0).Smiles\n", + "\n", + "#smiles = pd.read_csv('smiles.csv').Smiles.to_list()\n", + "\n", + "\n", + "# take all of the Smiles read from the text file\n", + "#smiles = smiles.apply(lambda r: r.split()[0])\n", + "smiles = mols[0:]\n", + "\n", + "# here we add Smiles which should already have been matched\n", + "# to the scaffold (rdkit Mol.HasSubstructMatch)\n", + "#cs.add_smiles(smiles.to_list(), protonate=True)\n", + "cs.add_smiles(smiles, protonate=True)\n", + "cs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d323742f", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "cs.evaluate(num_conf=500, gnina_gpu=False, 
penalty=0.0, al_ignore_penalty=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44faea11", + "metadata": {}, + "outputs": [], + "source": [ + "cs.df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa7ee3c8", + "metadata": {}, + "outputs": [], + "source": [ + "cs.to_sdf(\"cs_optimised_molecules.sdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bcefb75a", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range (len(cs)):\n", + " try:\n", + " cs[i].to_file(\"best_conformers_{0}.pdb\".format(i))\n", + " except AttributeError:\n", + " print(\"No conformer for molecule\", i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c08591f3", + "metadata": {}, + "outputs": [], + "source": [ + "cs.df.to_csv('SARS-out.csv', index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90c8bd41", + "metadata": {}, + "outputs": [], + "source": [ + "pattern = scaffold\n", + "mol = Chem.MolFromSmiles(smiles[0])\n", + "print(mol.HasSubstructMatch(pattern))\n", + "mol" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e642b2d", + "metadata": {}, + "outputs": [], + "source": [ + "cs.df.loc[cs.df['Success'] == True]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2a4748d", + "metadata": {}, + "outputs": [], + "source": [ + "# save the chemical space of built molecules:\n", + "\n", + "failed=False\n", + "unbuilt=False\n", + "\n", + "with Chem.SDWriter('notebook_chemspace.sdf') as SD:\n", + " columns = cs.df.columns.to_list()\n", + " columns.remove(\"Mol\")\n", + "\n", + " for i, row in cs.df.iterrows():\n", + "\n", + " # ignore this molecule because it failed during the build\n", + " if failed is False and row.Success is False:\n", + " continue\n", + "\n", + " # ignore this molecule because it was not built yet\n", + " if unbuilt is False and row.Success != True:\n", + " continue\n", + "\n", + " mol = 
row.Mol\n", + " mol.SetIntProp(\"index\", i)\n", + " for column in columns:\n", + " value = getattr(row, column)\n", + " mol.SetProp(column, str(value))\n", + "\n", + " mol.ClearProp(\"attachement_point\")\n", + " SD.write(mol)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b48faea", + "metadata": {}, + "outputs": [], + "source": [ + "# save the structures of the top `best_n` molecules in ranked order as an SDF file:\n", + "molecules = []\n", + "input_sdf = 'notebook_chemspace.sdf'\n", + "best_n = 100\n", + "\n", + "with Chem.SDMolSupplier(input_sdf) as SDF:\n", + " # for each mol\n", + " for mol in SDF:\n", + " if mol is None:\n", + " continue\n", + " if mol.GetPropsAsDict()['Success'] == 'True':\n", + " molecules.append(mol)\n", + "\n", + "# sort by the 'score' property, highest value first\n", + "sorted_molecules = sorted(molecules, key=lambda m: m.GetPropsAsDict()['score'], reverse=True)\n", + "\n", + "with Chem.SDWriter(f\"top_{best_n:d}_{input_sdf}\") as SDF_OUT:\n", + " for i, mol in enumerate(sorted_molecules):\n", + " if i == best_n:\n", + " break\n", + "\n", + " SDF_OUT.write(mol)\n", + "\n", + "print('Done')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fegrow", + "language": "python", + "name": "fegrow" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/sarscov2/SARS-smiles.txt b/tutorials/sarscov2/SARS-smiles.txt new file mode 100644 index 00000000..807873fa --- /dev/null +++ b/tutorials/sarscov2/SARS-smiles.txt @@ -0,0 +1,6 @@ +CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H]2CNc2ccncn2)c2cc(Cl)ccc2C1=O +CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H]2CNc2cnn(C)c2)c2cc(Cl)ccc2C1=O +Cc1cnc(CN2C[C@@]3(C(=O)N(c4cncc5ccccc45)C[C@@H]3C)c3cc(F)ccc3C2=O)cn1 
+CNC(=O)CN1C[C@@]2(C(=O)N(c3cncc4ccccc34)C[C@@H]2COC(C)C)c2cc(Cl)ccc2C1=O +C[C@H]1CN(c2cncc3ccccc23)C(=O)[C@@]12CN(Cc1nccn1C)C(=O)c1ccc(F)cc12 +C=C(CN(C(=O)[C@@H]1CN(CC(=O)NC)C(=O)c2ccc(Cl)cc21)c1cncc2ccccc12)C(=O)[O-]