
PyTorch official tutorial - Classifying Names with a Character-Level RNN

Table of Contents
  1. Data
  2. Loading the data
    1. Normalizing to ASCII letters
    2. Reading the files
    3. Converting strings to tensors
  3. Defining the network
  4. Training the model
    1. Randomly picking training examples
  5. Testing the results
  6. Saving the model
  7. Loading the model
  8. Some details

Original tutorial

Data download

Data

[medivh@medivh-pc data]$ wc -l names/*
  2000 names/Arabic.txt
   268 names/Chinese.txt
   519 names/Czech.txt
   297 names/Dutch.txt
  3668 names/English.txt
   277 names/French.txt
   724 names/German.txt
   203 names/Greek.txt
   232 names/Irish.txt
   709 names/Italian.txt
   991 names/Japanese.txt
    94 names/Korean.txt
   139 names/Polish.txt
    74 names/Portuguese.txt
  9408 names/Russian.txt
   100 names/Scottish.txt
   298 names/Spanish.txt
    73 names/Vietnamese.txt
 20074 total

There are 18 files in total, one per language, each a collection of names in that language.

[medivh@medivh-pc data]$ head names/Chinese.txt 
Ang
Au-Yong
Bai
Ban
Bao
Bei
Bian
Bui
Cai
Cao

Each file contains a list of names, one per line. They are mostly plain letters, but a few contain accented characters, e.g. Ślusàrski.

Loading the data

Normalizing to ASCII letters

import os
import glob
import string
import unicodedata


all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

Example:

print(unicodeToAscii('Ślusàrski')) # Slusarski

Reading the files

category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in glob.glob('names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
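
A quick sanity check of what was loaded (a sketch; the exact order of all_categories depends on the order glob returns the files):

print(n_categories)                   # 18
print(all_categories[:3])             # e.g. ['Arabic', 'Chinese', 'Czech']
print(category_lines['Chinese'][:5])  # ['Ang', 'Au-Yong', 'Bai', 'Ban', 'Bao']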

Converting strings to tensors

import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

print(letterToTensor('J'))

print(lineToTensor('Jones').size())

Defining the network

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

Training the model

Randomly picking training examples

import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    category_tensor = torch.tensor([all_categories.index(category)], dtype=torch.long)
    line_tensor = lineToTensor(line)
    return category, line, category_tensor, line_tensor

for i in range(10):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    print('category =', category, '/ line =', line)

criterion = nn.NLLLoss()
learning_rate = 0.005 # If you set this too high, it might explode. If too low, it might not learn

def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i


def train(category_tensor, line_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    # (keyword form of add_ avoids the deprecated two-argument overload)
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()
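
As a side note, the parameter update above is written by hand to make the SGD step explicit. The same update could be expressed with torch.optim; a minimal sketch (not part of the original tutorial, reusing rnn, criterion and learning_rate from above):

import torch.optim as optim

optimizer = optim.SGD(rnn.parameters(), lr=learning_rate)

def train_with_optimizer(category_tensor, line_tensor):
    hidden = rnn.initHidden()
    optimizer.zero_grad()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    loss = criterion(output, category_tensor)
    loss.backward()
    optimizer.step()  # replaces the manual p.data.add_(...) loop
    return output, loss.item()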
import time
import math

n_iters = 100000
print_every = 5000
plot_every = 1000



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(category_tensor, line_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = categoryFromOutput(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

Output:

5000 5% (0m 17s) 1.8489 Lieu / Vietnamese ✓
10000 10% (0m 33s) 1.5836 Ri / Chinese ✗ (Korean)
15000 15% (0m 50s) 1.0168 Qiao / Chinese ✓
20000 20% (1m 7s) 0.8561 Aswad / Arabic ✓
25000 25% (1m 23s) 2.8020 Aonghus / Greek ✗ (Irish)
30000 30% (1m 39s) 1.6942 Hajkova / Japanese ✗ (Czech)
35000 35% (1m 56s) 0.7851 Lykoshin / Russian ✓
40000 40% (2m 12s) 1.7396 Lewerentz / Spanish ✗ (German)
45000 45% (2m 28s) 1.5940 Felix / French ✓
50000 50% (2m 44s) 0.7750 Cheng / Chinese ✓
55000 55% (3m 1s) 0.7863 An / Vietnamese ✓
60000 60% (3m 18s) 0.3988 Mochan / Irish ✓
65000 65% (3m 34s) 0.6608 Kassis / Arabic ✓
70000 70% (3m 50s) 0.3470 Palmeiro / Portuguese ✓
75000 75% (4m 6s) 0.1303 Koruba / Japanese ✓
80000 80% (4m 23s) 1.3898 Smets / Scottish ✗ (Dutch)
85000 85% (4m 41s) 0.2163 Connolly / Irish ✓
90000 90% (5m 0s) 1.1642 Nie / Korean ✗ (Chinese)
95000 95% (5m 19s) 1.0036 Telis / Greek ✓
100000 100% (5m 36s) 1.1293 Fritsch / English ✗ (Czech)
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(all_losses)

[Figure: training loss curve (loss.png)]

Testing the results

def evaluate(line_tensor):
    hidden = rnn.initHidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output

def predict(input_line, n_predictions=3):
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(lineToTensor(input_line))

        # Get top N categories
        topv, topi = output.topk(n_predictions, 1, True)
        predictions = []

        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])

predict('Dovesky')
predict('Jackson')
predict('Satoshi')

Saving the model

torch.save(rnn, 'char-rnn-classification.pt')
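
Note that torch.save(rnn, ...) pickles the entire module, so loading it later needs the same RNN class definition available. A common alternative (a sketch, not used in this post; the file name here is made up) is to save only the parameters via state_dict:

# Sketch: save/load only the parameters; the model must be constructed first
torch.save(rnn.state_dict(), 'char-rnn-classification-state.pt')

rnn2 = RNN(n_letters, n_hidden, n_categories)
rnn2.load_state_dict(torch.load('char-rnn-classification-state.pt'))
rnn2.eval()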

Loading the model

rnn = torch.load('char-rnn-classification.pt')
import sys
import string
import torch
import torch.nn as nn


all_categories = ['Scottish', 'Korean', 'Chinese', 'Irish', 'Arabic', 'English', 'Greek', 'Italian', 'Russian', 'Portuguese', 'Czech', 'Dutch', 'Spanish', 'French', 'German', 'Japanese', 'Vietnamese', 'Polish']
n_categories = len(all_categories)
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

rnn = torch.load('char-rnn-classification.pt')

def evaluate(line_tensor):
    hidden = rnn.initHidden()
    for i in range(line_tensor.size()[0]):
        output, hidden = rnn(line_tensor[i], hidden)
    return output

def letterToIndex(letter):
    return all_letters.find(letter)

def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

def predict(line, n_predictions=3):
    output = evaluate(lineToTensor(line))

    # Get top N categories
    topv, topi = output.data.topk(n_predictions, 1, True)
    predictions = []

    for i in range(n_predictions):
        value = topv[0][i]
        category_index = topi[0][i]
        print('(%.2f) %s' % (value, all_categories[category_index]))
        predictions.append([value, all_categories[category_index]])

    return predictions

if __name__ == '__main__':
    predict(sys.argv[1])

Some details

For example, the name "Satoshi" has length 7 and our alphabet has 57 characters (n_letters = 57), so the tensor produced by lineToTensor for this name has shape (7, 1, 57).

The RNN's hidden size here is 128, so each time step feeds the network one (1, 57) input slice together with a (1, 128) hidden state, as the sketch below shows.
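
A minimal sketch to confirm these shapes, assuming the rnn, lineToTensor and the 18 categories defined above:

name_tensor = lineToTensor('Satoshi')
print(name_tensor.size())   # torch.Size([7, 1, 57])

hidden = rnn.initHidden()
print(hidden.size())        # torch.Size([1, 128])

# One time step: the first letter's (1, 57) slice plus the (1, 128) hidden state
output, hidden = rnn(name_tensor[0], hidden)
print(output.size())        # torch.Size([1, 18])  -- one log-probability per category
print(hidden.size())        # torch.Size([1, 128])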
