200字范文,内容丰富有趣,生活中的好帮手!
200字范文 > 循环神经网络LSTM实现电影情感分类

循环神经网络LSTM实现电影情感分类

时间:2020-12-21 00:07:33

相关推荐

循环神经网络LSTM实现电影情感分类

循环神经网络LSTM实现电影情感分类

一.数据集:

为了对word embedding这种常用的文本向量化的方法进行巩固,这里我们会完成一个文本情感分类的案例

现在我们有一个经典的数据集IMDB数据集,地址:http://ai.stanford.edu/~amaas/data/sentiment/,这是一份包含了5万条流行电影的评论数据,其中训练集25000条,测试集25000条。数据格式如下:

下图左边为名称,其中名称包含两部分,分别是序号和情感评分(1-4为neg,7-10为pos,评分为5-6的中性评论未被收录),右边为评论内容

但本次实验从简设计只实现二分类,即实现积极消极的预测

二.实现流程

准备数据集

构建模型

模型训练

模型评估

三.数据集准备

数据集DataSet构建

import pickle
import torch
from torch.utils.data import DataLoader, Dataset
import os
import re

# Dataset preparation for the IMDB movie-review data: reviews are tokenized
# and then turned into index sequences with a previously fitted
# Word2Sequence vocabulary.

data_base_path = r"./aclImdb"

# Load the vocabulary used for text serialization.
# NOTE(security): pickle.load executes arbitrary code from the file; only load
# a ws.pkl that you generated yourself.
# NOTE(review): unpickling presumably requires the Word2Sequence class to be
# importable in this process - confirm against how ws.pkl was saved.
with open("./models/ws.pkl", "rb") as _ws_file:
    ws = pickle.load(_ws_file)

Max_Len = 40
train_batch_size = 512
test_batch_size = 1024

# Characters that are replaced by a space before splitting into tokens.
_FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\x97\x96”“'
# re.escape keeps regex metacharacters such as '$', '^', '\\' and ']' literal;
# joining the raw characters with '|' left those four unfiltered.
_FILTER_RE = re.compile("[" + re.escape(_FILTER_CHARS) + "]")
_HTML_TAG_RE = re.compile("<.*?>")


def tokenize(text):
    """Split a raw review into lower-cased word tokens.

    HTML-like tags (e.g. ``<br />``) and every character in ``_FILTER_CHARS``
    are replaced by a space, then the text is split on whitespace.

    :param text: raw review text
    :return: list of lower-cased tokens
    """
    text = _HTML_TAG_RE.sub(" ", text)
    text = _FILTER_RE.sub(" ", text)
    return [token.lower() for token in text.split()]


class ImdbDataset(Dataset):
    """Dataset over the review files under ``<data_base_path>/{train,test}/{pos,neg}``."""

    def __init__(self, train=True):
        """Collect the paths of all ``.txt`` review files.

        :param train: use the ``train`` split when True, otherwise ``test``
        """
        super(ImdbDataset, self).__init__()
        # os.path.join keeps the paths valid on every platform; a hard-coded
        # '\\' separator only works on Windows.
        self.train_data_path = os.path.join(data_base_path, 'train')
        self.test_data_path = os.path.join(data_base_path, 'test')
        self.data_path = self.train_data_path if train else self.test_data_path
        self.temp_data_path = [os.path.join(self.data_path, 'pos'),
                               os.path.join(self.data_path, 'neg')]
        # Paths of all review files.
        self.total_file_path_list = []
        for path in self.temp_data_path:
            self.total_file_path_list.extend(
                os.path.join(path, name)
                for name in sorted(os.listdir(path))  # sorted: stable index -> file mapping
                if name.endswith('.txt')
            )

    def __getitem__(self, index):
        """Return ``(tokens, label)`` for one review; label is 0 for neg, 1 for pos."""
        path = self.total_file_path_list[index]
        # The label is the name of the directory that contains the file.
        label_str = os.path.basename(os.path.dirname(path))
        label = 0 if label_str == 'neg' else 1
        with open(path, encoding='utf-8') as review_file:
            content = tokenize(review_file.read())
        return content, label

    def __len__(self):
        return len(self.total_file_path_list)


def collate_fn(batch):
    """Turn a list of ``(tokens, label)`` pairs into two LongTensors.

    Every token list is converted with ``ws.transform(..., max_len=Max_Len)``
    so that all rows have the same length.

    :param batch: list of tuples as returned by ``ImdbDataset.__getitem__``
    :return: ``(content, labels)`` tensors
    """
    content, labels = zip(*batch)
    content = torch.LongTensor([ws.transform(tokens, max_len=Max_Len) for tokens in content])
    labels = torch.LongTensor(labels)
    return content, labels


def get_dataloader(train=True, batch_size=train_batch_size):
    """Build a shuffling DataLoader over the selected split.

    :param train: use the training split when True, otherwise the test split
    :param batch_size: number of reviews per batch
    :return: DataLoader yielding ``(content, labels)`` batches
    """
    dataset = ImdbDataset(train)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    return dataloader


if __name__ == '__main__':
    # Inspect the first batch.
    for idx, (sentence, label) in enumerate(get_dataloader()):
        print("idx:", idx)
        print("sentence:", sentence)
        print("label:", label)
        break

文本序列化

import numpy as np

# Text serialization: maps words to integer indices and back.


class Word2Sequence():
    """Vocabulary that converts token lists to index lists and back.

    Usage: call ``fit`` once per sentence to count words, then ``build_vocab``
    to assign indices, then ``transform`` / ``inverse_transform``.
    """

    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        # word -> index
        self.dict = {self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD}
        # index -> word; initialised here so that to_word never hits a missing
        # attribute, and rebuilt by build_vocab.
        self.inversed_dict = {self.UNK: self.UNK_TAG, self.PAD: self.PAD_TAG}
        # Whether fit has been called at least once.
        self.fited = False
        # Whether build_vocab has been called.
        self.build_vocabd = False
        # Word frequency counts.
        self.count = {}

    def to_index(self, word):
        """Return the index of ``word``, or ``UNK`` when it is not in the vocabulary."""
        assert self.fited == True, "必须先进行fit操作"
        return self.dict.get(word, self.UNK)

    def to_word(self, index):
        """Return the word for ``index``, or ``UNK_TAG`` when the index is unknown."""
        assert self.fited, "必须先进行fit操作"
        return self.inversed_dict.get(index, self.UNK_TAG)

    def __len__(self):
        return len(self.dict)

    def fit(self, sentence):
        """Add the words of one sentence to the frequency counts.

        :param sentence: iterable of words
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1
        self.fited = True

    def build_vocab(self, min_count=5, max_count=None, max_features=None):
        """Assign an index to every counted word that passes the filters.

        :param min_count: drop words seen fewer times than this (None disables)
        :param max_count: drop words seen more times than this (None disables)
        :param max_features: keep only this many most frequent words (None disables)
        """
        if min_count is not None:
            self.count = {k: v for k, v in self.count.items() if v >= min_count}
        if max_count is not None:
            self.count = {k: v for k, v in self.count.items() if v <= max_count}
        if max_features is not None:
            # Sort by frequency, descending, and keep the first max_features.
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)
        for word in self.count:
            # Only number words that have no index yet; re-numbering an
            # existing word on a repeated call would map several words to the
            # same index.
            if word not in self.dict:
                self.dict[word] = len(self.dict)
        # Reverse mapping (index -> word).
        self.inversed_dict = dict(zip(self.dict.values(), self.dict.keys()))
        self.build_vocabd = True

    def transform(self, sentence, max_len=None):
        """Convert a list of words into a list of indices.

        :param sentence: list of words
        :param max_len: when given, pad with ``PAD_TAG`` or truncate to this length
        :return: list of indices; unknown words map to ``UNK``
        """
        assert self.fited, "必须先进行fit操作"
        assert self.build_vocabd, "必须先进行build_vocab操作"
        if max_len is not None:
            if max_len > len(sentence):
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))  # pad
            else:
                sentence = sentence[:max_len]  # truncate
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        """Convert a list of indices back into words.

        :param indices: iterable of indices
        :return: list of words; unknown indices map to ``UNK_TAG`` (same as ``to_word``)
        """
        assert self.fited, "必须先进行fit操作"
        assert self.build_vocabd, "必须先进行build_vocab操作"
        return [self.inversed_dict.get(idx, self.UNK_TAG) for idx in indices]


if __name__ == '__main__':
    w2s = Word2Sequence()
    w2s.fit(["你", "好", "么"])
    w2s.fit(["你", "好", "哦"])
    w2s.build_vocab(min_count=1)
    print(w2s.dict)
    print(w2s.fited)
    print(w2s.transform(["你", "好", "嘛"]))
    print(w2s.transform(["你好嘛"], max_len=10))
    print(w2s.inverse_transform([5, 2, 4]))
    print(len(w2s))

生成序列化模型

四.模型构建

import os
import torch
import numpy as np
from torch import nn, optim
from DataSet import get_dataloader, ws, Max_Len, test_batch_size
import torch.nn.functional as F
from tqdm import tqdm

# IMDB movie-review sentiment analysis (pos / neg).
# Pipeline: text -> indices -> embedding -> LSTM (final hidden state of the
# last layer) -> two fully connected layers -> log-softmax.


class IMDBLstmmodel(nn.Module):
    """Embedding + multi-layer (optionally bidirectional) LSTM + 2-layer classifier."""

    def __init__(self):
        super(IMDBLstmmodel, self).__init__()
        # Hyper-parameters; adjust as needed.
        self.hidden_size = 64  # hidden units per LSTM layer
        self.embedding_dim = 200  # length of each word vector
        self.num_layer = 2  # number of stacked LSTM layers
        self.bidriectional = True  # whether the LSTM is bidirectional
        self.bi_num = 2 if self.bidriectional else 1  # number of directions
        self.dropout = 0.5

        self.embedding = nn.Embedding(len(ws), self.embedding_dim, padding_idx=ws.PAD)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_layer,
            bidirectional=self.bidriectional,
            # Inter-layer dropout only applies when there is more than one
            # layer; passing it for a single layer only triggers a warning.
            dropout=self.dropout if self.num_layer > 1 else 0.0,
        )
        # Two fully connected layers with a ReLU in between.
        self.fc = nn.Linear(self.hidden_size * self.bi_num, 20)
        self.fc2 = nn.Linear(20, 2)

    def forward(self, x):
        """Classify a batch of index sequences.

        :param x: tensor of word indices, shape [batch_size, sequence_len]
        :return: log-probabilities, shape [batch_size, 2]
        """
        # [batch_size, sequence_len, embedding_dim]
        x = self.embedding(x)
        # nn.LSTM defaults to sequence-first input:
        # [sequence_len, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        # h_n: [num_layer * bi_num, batch_size, hidden_size]
        x, (h_n, c_n) = self.lstm(x)
        if self.bidriectional:
            # The last two entries of h_n are the final forward and backward
            # states of the top layer -> [batch_size, hidden_size * 2].
            out = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=-1)
        else:
            # Unidirectional: only the top layer's final state belongs here;
            # h_n[-2] would be a lower layer and break fc's input size.
            out = h_n[-1, :, :]
        out = self.fc(out)
        out = F.relu(out)
        out = self.fc2(out)
        return F.log_softmax(out, dim=-1)

五. 模型训练

# Instantiate the model.
model = IMDBLstmmodel()
# Instantiate the optimizer.
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Resume from an earlier run when checkpoints exist. Each file is checked on
# its own so that a missing optimizer checkpoint does not abort the start-up.
if os.path.exists('./models/lstm_model.pkl'):
    model.load_state_dict(torch.load('./models/lstm_model.pkl'))
    if os.path.exists('./models/lstm_optimizer.pkl'):
        optimizer.load_state_dict(torch.load('./models/lstm_optimizer.pkl'))


def train(epoch):
    """Run one training epoch and save model and optimizer checkpoints.

    :param epoch: epoch number, used only for progress and log output
    """
    # Make sure dropout is active even if an evaluation pass switched the
    # model to eval mode earlier.
    model.train()
    data_loader = get_dataloader()
    loss = None
    for idx, (input, label) in tqdm(enumerate(data_loader), total=len(data_loader), ascii=True,
                                    desc='第%d轮训练' % epoch):
        # Reset gradients.
        optimizer.zero_grad()
        # Forward pass.
        ouput = model(input)
        # The model returns log-probabilities, hence NLL loss.
        loss = F.nll_loss(ouput, label)
        # Back-propagate and update the parameters.
        loss.backward()
        optimizer.step()

    # Report the loss of the last batch and checkpoint once per epoch;
    # skipped when the loader produced no batches.
    if loss is not None:
        print('result: 第%d轮次训练,损失%f' % (epoch, loss.item()))
        torch.save(model.state_dict(), "./models/lstm_model.pkl")  # save model
        torch.save(optimizer.state_dict(), './models/lstm_optimizer.pkl')  # save optimizer

训练效果(这里我已经提前训练了,所以损失已经很低了)

六.模型评估

# Model evaluation.
def test():
    """Evaluate the model on the test split and print mean loss and accuracy.

    Loss and accuracy are accumulated per sample, so a smaller final batch
    does not skew the averages.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        data_loader = get_dataloader(train=False, batch_size=test_batch_size)
        for idx, (input, label) in tqdm(enumerate(data_loader), total=len(data_loader), ascii=True,
                                        desc='模型评估'):
            ouput = model(input)
            # Summed loss so it can be divided by the sample count afterwards.
            total_loss += F.nll_loss(ouput, label, reduction="sum").item()
            pred = ouput.max(dim=1)[1]  # predicted class per sample, shape [batch_size]
            # Compare predictions against the labels; comparing pred with
            # itself would always report 100% accuracy.
            total_correct += pred.eq(label).sum().item()
            total_samples += label.size(0)
    # Guard against an empty test set.
    mean_loss = total_loss / total_samples if total_samples else 0.0
    mean_acc = total_correct / total_samples if total_samples else 0.0
    print('模型损失%f,平均准确率%f' % (mean_loss, mean_acc))

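准确率在99% 以上(注意:该数值是在用 pred.eq(pred),即预测结果与自身比较的方式计算准确率时得到的,这种写法的结果恒为100%,并不能反映模型的真实效果;准确率应以 pred.eq(label) 的比较结果为准)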

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。