CAPTCHA Recognition Based on Convolutional Neural Network
该文本描述了一个使用PyTorch构建的深度学习模型,用于识别复杂CAPTCHA图像。项目包括以下关键步骤:
数据生成与预处理:使用captcha库生成CAPTCHA图像数据,定义字符池和图像尺寸,通过数据增强和预处理生成训练、验证和测试数据集。
模型设计:设计了一个基于卷积神经网络(CNN)的模型,包含多个卷积层、激活函数和池化操作,最终输出多标签预测结果。
训练与验证:使用Adam优化器进行训练,定义了损失函数(MultiLabelSoftMarginLoss)并设置了训练参数(如批量大小、学习率和训练轮数)。通过训练集和验证集的损失曲线进行模型监控。
测试与评估:在测试集上评估模型性能,计算准确率并输出结果。
项目在Google Colab上运行,使用了PyTorch框架。
当爬虫遇到图像 CAPTCHA 系统时,就需要一个 CAPTCHA 识别程序。本项目利用 PyTorch 构建了一个基于卷积神经网络(CNN)的深度学习模型,以识别由数字和大写字母组成的复杂 CAPTCHA 图像。我们利用 captcha 库自带的生成器生成了大量图像,并将其划分为训练集、验证集和测试集。随后利用 PyTorch 框架构建 CNN 模型,并在训练集上进行若干轮训练,最终期望实现相对较高的识别准确率。
文章目录
- 1. Pre-procession
  - Import Packages
  - Define Hyper-parameters
- 2. Generate Data
- 3. Load Data
  - One-hot Code
  - Define Data Loader
- 4. Design Model
- 5. Train Model
  - Training Cycles
  - Visualization
- 6. Test Model
1. Pre-procession
Import Packages
!pip install captcha # Install the CAPTCHA library and Pillow package

import concurrent.futures
import os
import random
import shutil
from pathlib import Path
import PIL
import captcha
import matplotlib
import matplotlib.pyplot as plt
import torch
import torchvision
from PIL import Image
from captcha.image import ImageCaptcha
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
%matplotlib inline
# Report the version of every third-party package this notebook relies on.
print("---Versions of Required Packages---")
for package_name, package in (
    ("torch", torch),
    ("torchvision", torchvision),
    ("pillow", PIL),
    ("captcha", captcha),
    ("matplotlib", matplotlib),
):
    print(f"{package_name}:", package.__version__)

Define Hyper-parameters
CHAR_NUMBER = 4  # Number of characters in each CAPTCHA image
IMG_WIDTH = 160  # Width of a generated CAPTCHA image
IMG_HEIGHT = 60  # Height of a generated CAPTCHA image
SEED = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"  # Character pool: digits followed by uppercase letters
TRAIN_SIZE = 30000  # Number of images generated for the training set
VALIDATION_SIZE = 10000  # Number of images generated for the validation set
TEST_SIZE = 10000  # Number of images generated for the test set
BATCH_SIZE = 60  # Number of images in a mini-batch
TOTAL_EPOCH = 25  # Number of passes over the training set
LEARNING_RATE = 1e-3  # Learning rate handed to the optimizer
device = "cuda" if torch.cuda.is_available() else "cpu"  # Run the model on the GPU when CUDA is available, otherwise on the CPU
torch.cuda.is_available()  # Bare expression: as the last line of a notebook cell it displays whether CUDA was detected

# Path format: "./{folder}" (run locally) or "/content/{folder}" (run on Google Colab).
# Local alternatives, kept for reference:
# train_set_path = "./data/train"
# validation_set_path = './data/validation'
# test_set_path = "./data/test"
# save_file_path = "./result/model.pth"
train_set_path = "/content/data/train"  # Directory holding the training images
validation_set_path = '/content/data/validation'  # Directory holding the validation images
test_set_path = "/content/data/test"  # Directory holding the test images
save_file_path = "/content/result/model.pth"  # File the trained state dict is written to and later loaded from
# Per-epoch history, filled by the training loop and read by the loss plots
epoch_list = []
train_loss_list = []
valid_loss_list = []
2. Generate Data
def captcha_generator(num, output_dir, thread_name=0):
    """Generate ``num`` random CAPTCHA images into ``output_dir``.

    Any existing ``output_dir`` is deleted first, so the directory ends up
    containing exactly the freshly generated images. Each file is named
    ``{index}-{characters}.png`` (index starting at 1); the characters are
    drawn uniformly from ``SEED`` and double as the ground-truth label.

    Args:
        num: Number of images to generate.
        output_dir: Directory to (re)create and fill with images.
        thread_name: Identifier used only to prefix the progress messages.
    """
    target_dir = Path(output_dir)
    if target_dir.exists():
        shutil.rmtree(target_dir)  # Start from a clean directory so stale images never mix in.
    # parents=True: on a fresh machine the parent folder (e.g. ".../data") does
    # not exist yet, and a plain mkdir() would raise FileNotFoundError.
    target_dir.mkdir(parents=True)

    # The generator object is reusable, so build it once instead of once per image.
    image_captcha = ImageCaptcha(width=IMG_WIDTH, height=IMG_HEIGHT)
    for i in range(num):
        # Draw CHAR_NUMBER independent characters from the character pool.
        chars = "".join(random.choice(SEED) for _ in range(CHAR_NUMBER))
        save_path = f"{output_dir}/{i + 1}-{chars}.png"  # The default output format is png.
        image_captcha.write(chars, save_path)
        print(f"Thread {thread_name}: {i + 1} CAPTCHA code{'s' if i > 0 else ''} ha{'ve' if i > 0 else 's'} been generated. ")
    print(f"Thread {thread_name}: Congrats! All CAPTCHA codes have been generated! ")
# Generate the three data sets concurrently, one worker thread per set.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    generation_futures = [
        executor.submit(captcha_generator, TRAIN_SIZE, train_set_path, 0),
        executor.submit(captcha_generator, TEST_SIZE, test_set_path, 1),
        executor.submit(captcha_generator, VALIDATION_SIZE, validation_set_path, 2),
    ]
# Leaving the "with" block waits for every worker. An exception raised inside a
# worker is stored on its future and would otherwise be dropped silently, so
# call result() to re-raise it here instead of training on missing data.
for generation_future in generation_futures:
    generation_future.result()

3. Load Data
One-hot Code
def one_hot_encode(chars):
    """Encode a CAPTCHA string as a flat one-hot vector.

    The result has ``CHAR_NUMBER * len(SEED)`` float32 entries: one group of
    ``len(SEED)`` slots per character position, with a single 1.0 marking
    where that character sits in ``SEED``.
    """
    pool_size = len(SEED)
    encoded = torch.zeros(CHAR_NUMBER, pool_size, dtype=torch.float32)
    for position, symbol in enumerate(chars):
        # Row = position inside the CAPTCHA, column = index in the character pool.
        encoded[position, SEED.index(symbol)] = 1.0
    # Lay the rows out one after another as a single row vector.
    return encoded.view(-1)
def one_hot_decode(code):
    """Turn a flat one-hot (or score) vector back into the CAPTCHA string."""
    # One row per character position, one column per symbol in the pool.
    per_position = code.view(-1, len(SEED))
    # dim=1 reduces along the columns: the best-scoring symbol for each row.
    winners = torch.argmax(per_position, dim=1).tolist()
    return "".join(SEED[column] for column in winners)
Define Data Loader
class ImageDataset(Dataset):
    """Dataset of CAPTCHA images stored as ``{index}-{characters}.png``.

    Each item is an ``(image, label)`` pair: the image converted to a
    grayscale tensor, and the one-hot encoding of the characters taken from
    the file name.
    """

    def __init__(self, dir_path):
        """Collect the paths of all PNG images found directly in ``dir_path``."""
        super().__init__()
        # Keep only PNG files: stray entries such as ".ipynb_checkpoints" would
        # otherwise crash __getitem__. Sorting makes the index -> file mapping
        # reproducible, since os.listdir() order is arbitrary.
        self.img_path_list = [
            f"{dir_path}/{filename}"
            for filename in sorted(os.listdir(dir_path))
            if filename.endswith(".png")
        ]
        self.transform = transforms.Compose([  # The transforms for processing images.
            transforms.ToTensor(),
            transforms.Grayscale(),
        ])

    def __getitem__(self, index):
        """Return the transformed image and its one-hot label at ``index``."""
        img_path = self.img_path_list[index]
        # The transform reads the pixel data inside the "with" block, so the
        # file handle is released deterministically afterwards.
        with Image.open(img_path) as img:
            image = self.transform(img)
        # "{index}-{characters}.png" -> "{characters}"
        label = Path(img_path).stem.split("-")[-1]
        return image, one_hot_encode(label)

    def __len__(self):
        """Return the number of images in the data set."""
        return len(self.img_path_list)
def get_dataloader(path):
    """Build a shuffling mini-batch loader over the CAPTCHA images in ``path``."""
    # Shuffling is kept for every split, although it is not strictly necessary.
    return DataLoader(ImageDataset(path), batch_size=BATCH_SIZE, shuffle=True)
# Sanity check: print the first mini-batch and the shapes of its tensors.
train_dataloader = get_dataloader(train_set_path)
test_dataloader = get_dataloader(test_set_path)
for inputs, targets in train_dataloader:
    for batch_tensor in (inputs, targets):
        print(batch_tensor, batch_tensor.shape)
    break  # One batch is enough to see the shapes.

4. Design Model
class NeuralNetWork(nn.Module):
    """VGG-style convolutional network for CAPTCHA recognition.

    Layout:
    1. Conv_1x64    -> ReLU -> MaxPool_2x2
    2. Conv_64x128  -> ReLU -> MaxPool_2x2
    3. Conv_128x256 -> ReLU -> MaxPool_2x2
    4. Conv_256x512 -> ReLU -> MaxPool_2x2
    5. Flatten -> FC -> Dropout -> ReLU -> FC

    The input is a batch of single-channel images of size
    ``IMG_HEIGHT x IMG_WIDTH``; the output holds ``CHAR_NUMBER * len(SEED)``
    scores per image (one group of ``len(SEED)`` scores per character).
    """

    def __init__(self):
        super().__init__()
        self.layer1 = self._conv_block(1, 64)
        self.layer2 = self._conv_block(64, 128)
        self.layer3 = self._conv_block(128, 256)
        self.layer4 = self._conv_block(256, 512)
        # The padded 3x3 convolutions keep the spatial size; each of the four
        # 2x2 max-pools floor-halves height and width, i.e. floor-divides them
        # by 16 overall. Deriving the flattened size from the hyper-parameters
        # (15360 for 60x160 images) keeps the network valid if they change.
        flat_features = 512 * (IMG_HEIGHT // 16) * (IMG_WIDTH // 16)
        self.layer5 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=flat_features, out_features=4096),
            nn.Dropout(0.5),
            nn.ReLU(),
            # The number of predictions must be the CAPTCHA character number
            # times the length of the character pool.
            nn.Linear(in_features=4096, out_features=CHAR_NUMBER * len(SEED)),
        )

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return a Conv3x3(padding=1) -> ReLU -> MaxPool2x2 stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def forward(self, x):
        """Run the four convolutional stages followed by the classifier head."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            x = stage(x)
        return x
5. Train Model
Training Cycles
def train(dataloader, model, loss_func, optimizer):
    """Train ``model`` for one epoch and return the summed batch losses.

    Args:
        dataloader: Iterable yielding ``(inputs, targets)`` mini-batches.
        model: Network to optimize; switched to training mode here.
        loss_func: Callable computing the loss from ``(outputs, targets)``.
        optimizer: Optimizer updating the parameters of ``model``.

    Returns:
        The sum over all mini-batches of the per-batch loss values.
    """
    model.train()
    # Integer interval of at least 1. The previous float expression
    # ``BATCH_SIZE / 10`` only matched when BATCH_SIZE was a multiple of 10
    # and raised ZeroDivisionError for BATCH_SIZE == 0.
    log_interval = max(1, BATCH_SIZE // 10)
    running_loss = 0.0
    for batch, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = loss_func(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()  # Read the scalar once; reused for the sum and the log line.
        running_loss += batch_loss
        if batch % log_interval == 0:
            print(f"Batch {batch + 1}: Loss = {batch_loss:>7f}")
    print(f"Total loss on Train Set is {running_loss:>7f}")
    return running_loss
def validate(dataloader, model, loss_func):
    """Evaluate ``model`` on ``dataloader`` and return the summed batch losses.

    The model is put in evaluation mode and no gradients are tracked.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch_inputs, batch_targets in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            predictions = model(batch_inputs)
            total_loss += loss_func(predictions, batch_targets).item()
    print(f"Total loss on Validation Set is {total_loss:>8f}")
    return total_loss
损失函数:Multi-Label Soft Margin Loss(多标签交叉熵损失):$$loss(x,y)=-\frac{1}{C}\sum\limits_{i}\left[y^{(i)}\log\frac{1}{1+\exp(-x^{(i)})}+(1-y^{(i)})\log\frac{\exp(-x^{(i)})}{1+\exp(-x^{(i)})}\right]$$ 其中 x 为输入张量,其形状为 (N, C)(N 表示批量大小,C 表示分类数量),y 为具有相同形状的标签,i 遍历全部 C 个类别。
# Build the model, the loss, the optimizer and the data loaders.
model = NeuralNetWork().to(device)
loss_func = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
train_dataloader = get_dataloader(train_set_path)
validation_dataloader = get_dataloader(validation_set_path)

# torch.save() does not create missing directories. Create the output folder
# before training so a bad path fails immediately rather than after the last
# epoch, when the trained weights would be lost.
Path(save_file_path).parent.mkdir(parents=True, exist_ok=True)

for epoch in range(TOTAL_EPOCH):
    print(f"--------------- Training Epoch {epoch + 1} ---------------")
    epoch_list.append(epoch + 1)
    train_loss = train(train_dataloader, model, loss_func, optimizer)
    train_loss_list.append(train_loss)
    valid_loss = validate(validation_dataloader, model, loss_func)
    valid_loss_list.append(valid_loss)
    print()

# Persist only the learned parameters (the state dict), not the whole module.
torch.save(model.state_dict(), save_file_path)
print(f"The training is complete and the model is saved at \"{save_file_path}\"")

Visualization
# Draw one figure per loss history: training first, then validation.
for loss_history, y_axis_label in (
    (train_loss_list, 'Loss on Training Set'),
    (valid_loss_list, 'Loss on Validation Set'),
):
    plt.plot(epoch_list, loss_history)
    plt.xlabel('Epoch')
    plt.ylabel(y_axis_label)
    plt.grid()
    plt.show()

6. Test Model
def predict(model, file_path):
    """Return the CAPTCHA string that ``model`` predicts for one image file.

    Args:
        model: Trained network. The caller is responsible for putting it in
            evaluation mode.
        file_path: Path of the image to recognize.

    Returns:
        The decoded characters, one per character position.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Grayscale(),
    ])
    # Close the file deterministically. Normalizing to RGB lets images with an
    # alpha channel or a palette go through the tensor Grayscale transform,
    # which only accepts 1- or 3-channel inputs.
    with Image.open(file_path) as img:
        image = transform(img.convert("RGB"))
    with torch.no_grad():
        # Add the batch dimension instead of hard-coding the image size, so
        # this stays in step with the IMG_WIDTH / IMG_HEIGHT hyper-parameters.
        inputs = image.unsqueeze(0).to(device)  # All tensors must live on the same device as the model.
        outputs = model(inputs)
    return one_hot_decode(outputs)
def recognize(model, file_path):
    """Predict one CAPTCHA image and compare the result with its file name.

    Works for any image whose name follows ``{index}-{characters}.{png}``,
    including custom paths outside the generated data sets.

    Returns:
        1 when the prediction matches the characters in the file name, else 0.
    """
    model.eval()
    # File name formation: {index}-{characters}.{file format}
    expected = file_path.split("-")[-1].replace(".png", "")
    predicted = predict(model, file_path)
    if predicted != expected:
        print(f"The prediction result of \"{file_path}\" is {predicted}. The prediction is WRONG!")
        return 0
    print(f"The prediction result of \"{file_path}\" is {predicted}. The prediction is CORRECT!")
    return 1
def model_test(model):
    """Run ``model`` on every PNG in ``test_set_path`` and print the accuracy.

    An image counts as correct only when all of its characters are
    predicted correctly. Nothing is returned; the result is printed.
    """
    # List the directory once and skip anything that is not a PNG image
    # (e.g. notebook checkpoint folders), which could not be opened anyway.
    filenames = [name for name in os.listdir(test_set_path) if name.endswith(".png")]
    total = len(filenames)
    if total == 0:
        # Avoid a ZeroDivisionError when the test set has not been generated.
        print(f"\nNo CAPTCHA images found in \"{test_set_path}\"; accuracy is undefined.")
        return
    correct = sum(recognize(model, f"{test_set_path}/{name}") for name in filenames)
    accuracy = f"{correct / total * 100:.8f}%"
    print("\nThe accuracy of the model is", accuracy)
# Rebuild the network, restore the saved weights onto the active device, then evaluate.
model = NeuralNetWork().to(device)
saved_state = torch.load(save_file_path, map_location=torch.device(device))
model.load_state_dict(saved_state)
model_test(model)

