Defeating font-obfuscation anti-scraping with deep learning
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "deobfs.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "MVkiRhBahw5s"
},
"source": [
"解混淆流程:\n", | |
"\n", | |
"\n", | |
"1. 训练模型,使模型能够从图片识别单字\n", | |
"2. 使用混淆过的字体渲染单字图像,使用模型识别真实汉字,建立映射表\n", | |
"3. 使用映射表批量替换\n", | |
"\n", | |
"\n", | |
"\n" | |
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_K4i6cuainWc"
},
"source": [
"切换当前目录到 Google Drive:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "yImYpO5NMYWm"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')\n",
"\n",
"import os\n",
"path = '/content/drive/My Drive/Colab Notebooks/deobfs_font'\n",
"if not os.path.exists(path):\n",
"    os.makedirs(path)\n",
"os.chdir(path)\n",
"print(\"Current Working Directory\", os.getcwd())"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "_KnO_jQPi14N"
},
"source": [
"安装依赖库,加载骨干模型:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "B8VW6dXMSda_"
},
"source": [
"!pip install efficientnet_pytorch livelossplot"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lDbZNobFMFF2"
},
"source": [
"from efficientnet_pytorch import EfficientNet\n",
"\n",
"# 加载预训练模型\n", | |
"net = EfficientNet.from_pretrained('efficientnet-b0', num_classes=20902)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "OSu__zx3i8Yf" | |
}, | |
"source": [ | |
"构造数据集加载器,动态渲染产生训练集: \n", | |
"\\#16L 需修改为你使用的字体名称,多多益善" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "fgmGp0_1PG1k"
},
"source": [
"import random\n",
"import numpy as np\n",
"from PIL import Image, ImageDraw, ImageFont\n",
"from torch.utils.data import DataLoader, Dataset\n",
"import torchvision.transforms as transforms\n",
"\n",
"class FontImgLoader(Dataset):\n",
"\n",
"    transform = transforms.Compose(\n",
"        [transforms.ToTensor(),\n",
"         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])\n",
"\n",
"    def __init__(self):\n",
"        self.loop = 10\n",
"        self.length = self.loop * 20902 # 0x4e00 -> 0x9fa6\n",
" self.fonts = ['true_font_lanting_0.ttf', 'true_font_lanting_1.ttf'] # 与目标字体相似的字体\n", | |
" \n", | |
" def __len__(self):\n", | |
" return self.length\n", | |
" \n", | |
" def __getitem__(self, index):\n", | |
" word = index%20902\n", | |
" # 随机化字体字号旋转角度\n", | |
" font = random.choice(self.fonts)\n", | |
" font_size = random.choice(range(185, 210))\n", | |
" angle = random.uniform(-5, 5)\n", | |
" # 渲染单字\n", | |
" img = Image.new('RGB', (224, 224), (255, 255, 255))\n", | |
" fnt = ImageFont.truetype(font, font_size)\n", | |
" draw = ImageDraw.Draw(img)\n", | |
" draw.text((12,-22), chr(word+0x4e00), font=fnt, fill=(0,0,0))\n", | |
" img = img.rotate(angle, expand=0, fillcolor=(255, 255, 255))\n", | |
" return self.transform(img), word\n" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
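{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check (a minimal sketch, assuming the `.ttf` files listed in `self.fonts` are present in the working directory): render one sample and inspect its tensor shape and label."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sanity check: render a single training sample and inspect it\n",
"# (assumes the font files listed in FontImgLoader.fonts exist in the working directory)\n",
"sample_loader = FontImgLoader()\n",
"img_tensor, label = sample_loader[0]\n",
"print(img_tensor.shape, label, chr(0x4e00 + label))"
],
"execution_count": null,
"outputs": []
},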
{
"cell_type": "markdown",
"metadata": {
"id": "m5MN0FLKjPJ4"
},
"source": [
"训练模型: \n", | |
"注意修改 batch_size、epoch_num、lr 确保训练能够进行 \n", | |
"通常 loss 低于 0.2 时认为模型已经收敛,模型的训练参数保存在 b0.pth 文件中" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "BqrLeRbHQEqt"
},
"source": [
"import torch\n",
"import torch.optim as optim\n",
"from livelossplot import PlotLosses\n",
"import numpy as np\n",
"import os\n",
"\n",
"batch_size = 128\n",
"epoch_num = 20\n",
"trainloader = DataLoader(\n",
"    FontImgLoader(), batch_size=batch_size,\n",
"    shuffle=True, num_workers=2)\n",
"\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(net.parameters(), lr=1e-4) # 初始化学习率,非初次训练时需降低数值\n", | |
"\n", | |
"if os.path.exists('b0.pth'):\n", | |
" print('load state dict')\n", | |
" net.load_state_dict(torch.load('b0.pth'))\n", | |
"\n", | |
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", | |
"net.to(device)\n", | |
"\n", | |
"liveloss = PlotLosses()\n", | |
"\n", | |
"for epoch in range(epoch_num): # loop over the dataset multiple times\n", | |
"\n", | |
" running_loss = 0.0\n", | |
" for i, data in enumerate(trainloader, 0):\n", | |
" # get the inputs; data is a list of [inputs, labels]\n", | |
" inputs, labels = data\n", | |
" # zero the parameter gradients\n", | |
" optimizer.zero_grad()\n", | |
"\n", | |
" # forward + backward + optimize\n", | |
" outputs = net(inputs.to(device))\n", | |
" loss = criterion(outputs, labels.to(device))\n", | |
" loss.backward()\n", | |
" optimizer.step()\n", | |
"\n", | |
" # print statistics\n", | |
" running_loss += loss.item()\n", | |
" if i % 10 == 9: # print every 2000 mini-batches\n", | |
" print('[%d, %5d] loss: %.3f' %\n", | |
" (epoch + 1, i + 1, running_loss / 10))\n", | |
" liveloss.update({'loss': running_loss / 10})\n", | |
" liveloss.send()\n", | |
" running_loss = 0.0\n", | |
" \n", | |
" if i%500 == 499:\n", | |
" torch.save(net.state_dict(), 'b0.pth')\n", | |
" \n", | |
" torch.save(net.state_dict(), 'b0.pth')\n", | |
"\n", | |
"torch.save(net.state_dict(), 'b0.pth')\n", | |
"print('Finished Training')\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "cbnEAeKOjsYH" | |
}, | |
"source": [ | |
"使用训练完成的模型:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "tT1hrrZ-1WlM"
},
"source": [
"# 验证\n", | |
"import json\n", | |
"from PIL import Image, ImageDraw, ImageFont\n", | |
"from tqdm import tqdm\n", | |
"import numpy as np\n", | |
"import torch\n", | |
"import torchvision.transforms as transforms\n", | |
"from efficientnet_pytorch import EfficientNet\n", | |
"\n", | |
"\n", | |
"# 加载模型\n", | |
"net = EfficientNet.from_pretrained('efficientnet-b0', num_classes=20902)\n", | |
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", | |
"net.load_state_dict(torch.load('b0.pth', map_location=torch.device(device)))\n", | |
"net.to(device)\n", | |
"net.eval()\n", | |
"transform = transforms.Compose(\n", | |
" [transforms.ToTensor(),\n", | |
" transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])\n", | |
"\n", | |
"# 复刻渲染流程\n", | |
"def gen_img(k, angle=0):\n", | |
" img = Image.new('RGB', (224, 224), (255, 255, 255))\n", | |
" fnt = ImageFont.truetype('fake.ttf', 200)\n", | |
" draw = ImageDraw.Draw(img)\n", | |
" draw.text((12,-22), chr(k), font=fnt, fill=(0,0,0))\n", | |
" img = img.rotate(angle, expand=0, fillcolor=(255, 255, 255))\n", | |
" return transform(img)\n", | |
"\n", | |
"# 解混淆测试\n", | |
"import re\n", | |
"import os\n", | |
"\n", | |
"map_path = 'map.json'\n", | |
"map = {}\n", | |
"test_text = open('test.html', 'r').read()\n", | |
"f_text = re.findall(r'\\\\u([abcdef\\d]{4})', test_text)\n", | |
"\n", | |
"# 尝试加载已有映射表\n", | |
"if os.path.exists(map_path):\n", | |
" map = json.loads(open(map_path, 'rb').read())\n", | |
"\n", | |
"# 产生映射表\n", | |
"for i in tqdm(f_text):\n", | |
" idx = int(i, 16)\n", | |
" if idx in map:\n", | |
" continue\n", | |
" img = torch.unsqueeze(gen_img(idx), dim=0)\n", | |
" raw = net(img.to(device))\n", | |
" outputs = torch.nn.functional.softmax(raw, dim=1)\n", | |
" value, preds = torch.max(outputs, 1)\n", | |
" map[i] = chr(offset+preds[0]) if value > 0.5 else chr(idx)\n", | |
"\n", | |
"# 输出解码结果\n", | |
"for i, k in map.items():\n", | |
" test_text = test_text.replace(f'\\\\u{i}', k)\n", | |
"print(test_text)\n", | |
"\n", | |
"# 保存映射表\n", | |
"with open(map_path, 'wb') as wp:\n", | |
" wp.write(json.dumps(map).encode('UTF-8')) " | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
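{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional alternative sketch: if the obfuscated font file is at hand, the candidate codepoints can be read directly from its cmap table with fontTools (`pip install fonttools`) instead of regexing the HTML for `\\u` escapes. The file name `fake.ttf` is the same assumption `gen_img` already makes, and the cell reuses `net`, `device`, `map`, and `offset` from the previous cell."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch: enumerate obfuscated codepoints straight from the font's cmap table\n",
"# (assumes fontTools is installed and fake.ttf is the obfuscated font)\n",
"from fontTools.ttLib import TTFont\n",
"\n",
"obf_font = TTFont('fake.ttf')\n",
"codepoints = sorted(obf_font['cmap'].getBestCmap().keys())\n",
"print(f'{len(codepoints)} mapped codepoints found in fake.ttf')\n",
"\n",
"for idx in tqdm(codepoints):\n",
"    key = f'{idx:04x}'\n",
"    if key in map:\n",
"        continue\n",
"    img = torch.unsqueeze(gen_img(idx), dim=0)\n",
"    outputs = torch.nn.functional.softmax(net(img.to(device)), dim=1)\n",
"    value, preds = torch.max(outputs, 1)\n",
"    map[key] = chr(offset + preds[0]) if value > 0.5 else chr(idx)"
],
"execution_count": null,
"outputs": []
},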
{
"cell_type": "markdown",
"metadata": {
"id": "6WUuyjU8kiqC"
},
"source": [
"测试预测性能:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "M6v2ZjJsj75k"
},
"source": [
"# ground truth 测试\n", | |
"gt = json.loads(open('gt.user.js').read())\n", | |
"offset = 0x4e00\n", | |
"total = 0\n", | |
"count = 0\n", | |
"\n", | |
"for k, v in tqdm(gt.items()):\n", | |
" # 使用不同角度预测结果增强准确度\n", | |
" img_0 = torch.unsqueeze(gen_img(ord(k)), dim=0)\n", | |
" raw_0 = net(img_0.to(device))\n", | |
" img_1 = torch.unsqueeze(gen_img(ord(k), -3), dim=0)\n", | |
" raw_1 = net(img_1.to(device))\n", | |
" img_2 = torch.unsqueeze(gen_img(ord(k), 3), dim=0)\n", | |
" raw_2 = net(img_2.to(device))\n", | |
" outputs = torch.nn.functional.softmax(raw_0+raw_1+raw_2, dim=1)\n", | |
" value, preds = torch.max(outputs, 1)\n", | |
" if ord(v) == (offset+preds[0]):\n", | |
" count += 1\n", | |
" else:\n", | |
" print(f'\\ntruth: {v} {ord(v)} pred: {chr(offset+preds[0])} {offset+preds[0]} {value[0]:.3f} %')\n", | |
" pass\n", | |
" total += 1\n", | |
" # if total > 500:\n", | |
" # break\n", | |
"\n", | |
"print(f'\\nacc: {count/total:.3f}')\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |