PyTorch is a Python-based deep learning library that is widely used for machine learning and deep learning tasks, including natural language processing (NLP). The following is an example of using PyTorch for natural language processing on Ubuntu.
PyTorch's nn.Embedding module implements the training and application of word vectors: it maps integer token ids to dense, trainable vectors. A minimal lookup sketch follows.
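As a quick illustration of what nn.Embedding provides, the sketch below builds an embedding table and looks up a few token ids. The vocabulary size of 10 and embedding dimension of 4 are arbitrary values chosen only for this demo:

import torch
import torch.nn as nn

# A hypothetical vocabulary of 10 tokens, each mapped to a 4-dimensional vector
embedding = nn.Embedding(num_embeddings=10, embedding_dim=4)

# Look up the vectors for token ids 1, 2, and 5
token_ids = torch.tensor([1, 2, 5])
vectors = embedding(token_ids)
print(vectors.shape)  # torch.Size([3, 4])

The embedding weights are ordinary parameters, so they are trained by backpropagation along with the rest of the model. Building on this, the following is a simple sentiment analysis example that shows how to use PyTorch and torchtext for a text classification task: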
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer
tokenizer = get_tokenizer('basic_english')

# Build the vocabulary from the training split
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

train_iter = IMDB(split='train')
vocab = build_vocab_from_iterator(yield_tokens(train_iter),
                                  specials=['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

# Pipelines: raw text -> token ids, raw label -> class index
def text_pipeline(text):
    return vocab(tokenizer(text))

def label_pipeline(label):
    # The label encoding depends on the torchtext version; here labels are 'pos'/'neg' strings
    return 1 if label == 'pos' else 0

def collate_batch(batch):
    label_list, text_list = [], []
    for label, text in batch:
        label_list.append(label_pipeline(label))
        text_list.append(torch.tensor(text_pipeline(text), dtype=torch.int64))
    # Pad every sequence in the batch to the length of the longest one
    padded = torch.nn.utils.rnn.pad_sequence(text_list, padding_value=vocab['<pad>'])
    return padded.to(device), torch.tensor(label_list).to(device)

# Materialize the train and test splits (IMDB has 25,000 examples in each)
train_dataset = list(IMDB(split='train'))
test_dataset = list(IMDB(split='test'))

# Data loaders
BATCH_SIZE = 64
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_batch)

# Model: embedding lookup, mean pooling over time, linear classifier
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx=vocab['<pad>'])
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        # text: (seq_len, batch_size) -> average token embeddings -> (batch_size, embedding_dim)
        embedded = self.embedding(text).mean(dim=0)
        return self.fc(embedded)

# Instantiate the model
EMBED_DIM = 100
NUM_CLASSES = 2
model = TextClassifier(len(vocab), EMBED_DIM, NUM_CLASSES).to(device)

# Loss function and optimizer (the large learning rate follows the original example)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)

# Training loop
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    for text, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(text)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for text, labels in test_dataloader:
        outputs = model(text)
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)
print("Accuracy: {:.2f}%".format(100 * correct / total))

The example above covers the basic workflow of a natural language processing task with PyTorch on Ubuntu: data loading, model definition, training, and evaluation. The same steps can be used to build and train a wide range of NLP models.
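Once training finishes, the same text pipeline can be reused for inference on new sentences. The predict helper below is a minimal sketch; the function name and the sample sentence are illustrative additions, not part of the original example:

def predict(model, sentence):
    # Reuse the training-time text pipeline on a single raw sentence
    model.eval()
    with torch.no_grad():
        ids = torch.tensor(text_pipeline(sentence), dtype=torch.int64)
        ids = ids.unsqueeze(1).to(device)  # shape (seq_len, 1): a batch of one
        logits = model(ids)
        return 'pos' if logits.argmax(dim=1).item() == 1 else 'neg'

print(predict(model, "This movie was surprisingly good."))

Note that mean-pooling the token embeddings discards word order; replacing the pooling step with an nn.LSTM, or using nn.EmbeddingBag for efficiency, are common next steps for this kind of classifier.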