Chemistry.AI | 基于循环神经网络（RNN）预测分子性质

Michelle ·

更新时间:2024-11-14

· 772 次阅读

Chemistry.AI | 基于卷积神经网络（CNN）预测分子特性

环境准备
- Python 版本：Python 3.6.8
- PyTorch 版本：PyTorch 1.1.0
- RDKit 版本：RDKit 2020.03.1

基于循环神经网络（RNN）预测分子性质
导入库
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from rdkit import Chem
from rdkit.Chem.Crippen import MolLogP
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when one is present, otherwise fall back to the CPU so the
# tutorial stays runnable on any machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# ---------------------------------------------------------------------------
# Load the data and build the character vocabulary
# ---------------------------------------------------------------------------
maxlen = 64
with open('smiles.txt') as f:
    # The SMILES string is read from the second whitespace-separated field of
    # each line; blank or single-field lines are skipped instead of raising
    # IndexError.
    fields = [line.split() for line in f]
smiles = [p[1] for p in fields if len(p) > 1 and len(p[1]) < maxlen]

# Characters of smiles
chars = sorted(set(''.join(smiles)))
c_to_i = {c: i for i, c in enumerate(chars)}
# The padding token takes the first index after the real characters, so the
# embedding table needs len(chars) + 1 rows. Deriving both from the data keeps
# them correct when the SMILES file (and therefore the vocabulary) changes.
pad_index = len(chars)
n_char = len(chars) + 1
print('Max len:', maxlen)
print('Number of chars:', len(chars))
print(chars)
print(c_to_i)
# Output shown in the original article:
# Max len: 64
# Number of chars: 45
# ['#', '(', ')', '+', '-', '.', '/', '1', '2', '3', '4', '5', '6', '7', '=',
#  '@', 'B', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'S', 'V',
#  'Z', '[', '\\', ']', 'a', 'c', 'e', 'g', 'i', 'l', 'n', 'o', 'r', 's', 'u']
# {'#': 0, '(': 1, ')': 2, '+': 3, '-': 4, '.': 5, '/': 6, '1': 7, '2': 8,
#  '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '=': 14, '@': 15, 'B': 16,
#  'C': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'K': 22, 'L': 23, 'M': 24,
#  'N': 25, 'O': 26, 'P': 27, 'S': 28, 'V': 29, 'Z': 30, '[': 31, '\\': 32,
#  ']': 33, 'a': 34, 'c': 35, 'e': 36, 'g': 37, 'i': 38, 'l': 39, 'n': 40,
#  'o': 41, 'r': 42, 's': 43, 'u': 44}


# ---------------------------------------------------------------------------
# Compute the regression target (Crippen logP) of every molecule
# ---------------------------------------------------------------------------
num_data = 20000
st = time.time()
valid_smiles, Y = [], []
for s in smiles[:num_data]:
    m = Chem.MolFromSmiles(s)
    if m is None:
        # RDKit returns None for SMILES it cannot parse; skip the molecule and
        # keep the SMILES list and the label list aligned.
        continue
    valid_smiles.append(s)
    Y.append(MolLogP(m))
# From here on `smiles[k]` and `Y[k]` always describe the same molecule.
smiles = valid_smiles
end = time.time()
print(f'Time:{(end-st):.3f}')


# ---------------------------------------------------------------------------
# Batching
# ---------------------------------------------------------------------------
#Dataset
class MolDataset(Dataset):
    """Dataset of SMILES strings paired with one scalar property each.

    Args:
        smiles: sequence of SMILES strings.
        properties: sequence of target values, aligned with ``smiles``.
        c_to_i: mapping from SMILES character to integer token index.
        maxlen: maximum SMILES length (stored, not used by the dataset itself).
    """

    def __init__(self, smiles, properties, c_to_i, maxlen):
        self.c_to_i = c_to_i
        self.maxlen = maxlen
        self.smiles = smiles
        self.properties = properties

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return a dict with token indices ``X``, length ``L`` and target ``Y``."""
        s = self.smiles[idx]
        # Use the mapping handed to the constructor, not the module-level one.
        tokens = torch.tensor([self.c_to_i[c] for c in s], dtype=torch.long)
        return {'X': tokens, 'L': len(s), 'Y': self.properties[idx]}


#Collate fn
def my_collate(batch):
    """Merge dataset samples into one padded batch.

    Args:
        batch: list of dicts as produced by ``MolDataset.__getitem__``.

    Returns:
        dict with ``X`` (batch, max_len) long token indices padded with
        ``pad_index``, ``Y`` (batch,) float targets and ``L`` (batch,) long
        unpadded lengths.
    """
    return {
        'X': torch.nn.utils.rnn.pad_sequence(
            [b['X'] for b in batch], batch_first=True, padding_value=pad_index),
        'Y': torch.tensor([b['Y'] for b in batch], dtype=torch.float),
        'L': torch.tensor([b['L'] for b in batch], dtype=torch.long),
    }


# ---------------------------------------------------------------------------
# Define the RNN model
# ---------------------------------------------------------------------------
#Model
class RNNRegressor(torch.nn.Module):
    """Embedding -> GRU -> mean over valid time steps -> linear regressor.

    Args:
        n_feature: embedding size, also used as the GRU hidden size.
        n_rnn_layer: number of stacked GRU layers.
        n_char: number of rows of the embedding table (vocabulary + padding).
    """

    def __init__(self, n_feature=128, n_rnn_layer = 1, n_char=46):
        super(RNNRegressor, self).__init__()
        self.rnn = nn.GRU(input_size=n_feature, hidden_size=n_feature,
                          num_layers=n_rnn_layer)
        self.fc = nn.Linear(n_feature, 1)
        self.embedding = nn.Embedding(n_char, n_feature)

    def forward(self, x, l):
        """Predict one value per sequence.

        Args:
            x: (batch, seq) long tensor of token indices.
            l: (batch,) tensor with the unpadded length of each sequence.

        Returns:
            (batch, 1) tensor of predictions.
        """
        x = self.embedding(x)        # (batch, seq, feature)
        x = x.permute((1, 0, 2))     # GRU default layout is (seq, batch, feature)
        output, _ = self.rnn(x)      # per-step outputs: (seq, batch, feature)
        # Average the per-step outputs over the real (unpadded) steps only.
        # The final hidden state `h` is indexed by layer, not by time step, so
        # slicing it with a sequence length would not mask the padding.
        pooled = torch.stack([
            output[:n, i].mean(0) for i, n in enumerate(l.tolist())
        ])
        return self.fc(pooled)


# ---------------------------------------------------------------------------
# Train the model
# ---------------------------------------------------------------------------
#Train model
lr = 1e-4
model = RNNRegressor(64, 1, n_char).to(device)

#Dataset
train_smiles = smiles[:19000]
test_smiles = smiles[19000:20000]
train_logp = Y[:19000]
test_logp = Y[19000:20000]
train_dataset = MolDataset(train_smiles, train_logp, c_to_i, maxlen)
test_dataset = MolDataset(test_smiles, test_logp, c_to_i, maxlen)

#Dataloader
# Training batches are reshuffled every epoch; evaluation order does not matter
# because predictions and targets are collected together.
train_dataloader = DataLoader(train_dataset, batch_size=128, num_workers=1,
                              collate_fn=my_collate, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=1,
                             collate_fn=my_collate)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.MSELoss()
loss_list = []
st = time.time()
model.train()
for epoch in range(20):
    epoch_loss = []
    for batch in train_dataloader:
        x = batch['X'].to(device).long()
        y = batch['Y'].to(device).float()
        l = batch['L'].to(device).long()
        # Gradients accumulate by default: clear them so every step uses only
        # the gradient of the current batch.
        optimizer.zero_grad()
        pred = model(x, l).squeeze(-1)
        loss = loss_fn(pred, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        loss_list.append(loss.item())
        epoch_loss.append(loss.item())
    print(epoch, np.mean(epoch_loss))
end = time.time()
print('Time:', end-st)
# Training log shown in the original article. It was produced by the original
# code, before the gradient-reset and pooling fixes above, so a fresh run will
# print different numbers:
# 0 5.3075666
# 1 3.9476352
# 2 3.7014515
# 3 3.0184703
# 4 2.3332028
# 5 1.8761007
# 6 1.3051379
# 7 0.9804973
# 8 0.7861054
# 9 0.65022063
# 10 0.5559791
# 11 0.4898691
# 12 0.44078177
# 13 0.40346482
# 14 0.37828985
# 15 0.3515944
# 16 0.33561063
# 17 0.32390222
# 18 0.3072901
# 19 0.2942933
# Time: 117.68075275421143


# ---------------------------------------------------------------------------
# Save the model
# ---------------------------------------------------------------------------
#Save model
fn = 'save.pt'
torch.save(model.state_dict(), fn)


# ---------------------------------------------------------------------------
# Load the model
# ---------------------------------------------------------------------------
#Load model
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
model.load_state_dict(torch.load(fn, map_location=device))


# ---------------------------------------------------------------------------
# Plot the loss curve
# ---------------------------------------------------------------------------
plt.figure()
plt.plot(loss_list)
plt.xlabel('Num iteration')
plt.ylabel('Loss')
plt.show()


# ---------------------------------------------------------------------------
# Test the model
# ---------------------------------------------------------------------------
def _evaluate(dataloader):
    """Return (predictions, targets) as 1-D NumPy arrays over ``dataloader``."""
    preds, trues = [], []
    for batch in dataloader:
        x = batch['X'].to(device).long()
        y = batch['Y'].to(device).float()
        l = batch['L'].to(device).long()
        preds.append(model(x, l).squeeze(-1).cpu().numpy())
        trues.append(y.cpu().numpy())
    return np.concatenate(preds, -1), np.concatenate(trues, -1)


#Test model
model.eval()
with torch.no_grad():
    pred_train, true_train = _evaluate(train_dataloader)
    pred_test, true_test = _evaluate(test_dataloader)

# Per-sample MSE over the whole split; averaging per-batch losses would
# over-weight the smaller final batch.
loss_train = np.mean((pred_train - true_train) ** 2)
loss_test = np.mean((pred_test - true_test) ** 2)
print('Train loss:', loss_train)
print('Test loss:', loss_test)
# Output shown in the original article (from the original, unfixed code):
# Train loss: 0.290517
# Test loss: 0.21602286

plt.figure()
plt.scatter(true_train, pred_train, s=1)
plt.scatter(true_test, pred_test, s=1)
plt.plot([-8,12], [-8,12])  # y = x reference line: perfect predictions lie on it
plt.xlabel('True')
plt.ylabel('Pred')
plt.show()

# Author: qq2648008726
# Tags: recurrent neural network, loop, rnn, neural network

1024 个赞编辑举报

需要登录后方可回复, 如果你还没有账号请注册新账号相关文章 HTML 5.1学习之新增的14项特性与应用示例 Ummi 2020-11-28 936 PowerShell批量安装msi后辍软件的方法 Sally 2020-12-13 907 HBASE 常用shell命令,增删改查方法 Heidi 2020-03-04 525 linux禁止普通用户切换至root用户的实例讲解 Connie 2020-01-15 762 Python利用keras接口实现深度神经网络回归 Tina 2023-02-18 418 Python基于TensorFlow接口实现深度学习神经网络回归 Lark 2023-02-18 364 C语言超详细讲解双向带头循环链表 Hoshi 2023-02-25 1503 Linux命令行循环执行shell命令 Heidi 2023-02-26 350 基于Matlab实现人工神经网络(ANN)回归的示例详解 Tallulah 2023-02-26 1622 Python嵌套循环的使用 Yelena 2023-02-26 563 python中的循环结构问题 Nora 2023-03-02 1770 PyTorch 之强大的 hub 模块和搭建神经网络进行气温预测 Ophelia 2023-03-21 1608 Bean的自动注入及循环依赖问题 Thalia 2023-03-23 323 C语言单双线性及循环链表与实例 Diane 2023-03-25 945 两个函数相互调用如何防止死循环 Antonia 2023-03-29 697 Python3中的循环语句示例详解 Pandora 2023-04-16 680 Android实现循环轮播跑马灯的效果 Crystal 2023-05-12 1676 循环神经网络TextRNN实现情感短文本分类任务 Kathy 2023-07-01 364 mysql双游标嵌套循环方式 Ida 2023-07-20 1450 mysql存储过程多层游标循环嵌套的写法分享 Irma 2023-07-20 1557

我要提问致谢帮助他人，成就自己。人生最大成功就是伸出热情而温暖的双手，尽自己所能去帮助身边的每一个人，只要无私的奉献，就会收获到美好的生活。 1024问感谢每一位朋友的帮助和支持。软件开发网提供编程的基础软件技术培训教程,软件开发编程实例讲解Go,Node,HTML,CSS,Javascript,Python,Java,Ruby,C,PHP,MySQL等软件开发编程语言以及数据开发的基础知识，也提供大量的软件开发在线实例、从入门到精通就在1024问。育儿网微养生全球行美食街育儿菜谱大全海南旅游女性养狗百科星座