Defeating font-obfuscation anti-scraping with deep learning
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "deobfs.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "MVkiRhBahw5s"
},
"source": [
"解混淆流程:\n", | |
"\n", | |
"\n", | |
"1. 训练模型,使模型能够从图片识别单字\n", | |
"2. 使用混淆过的字体渲染单字图像,使用模型识别真实汉字,建立映射表\n", | |
"3. 使用映射表批量替换\n", | |
"\n", | |
"\n", | |
"\n" | |
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_K4i6cuainWc"
},
"source": [
"切换当前目录到 Google Drive:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "yImYpO5NMYWm"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')\n",
"\n",
"import os\n",
"path = '/content/drive/My Drive/Colab Notebooks/deobfs_font'\n",
"if not os.path.exists(path):\n",
"    os.makedirs(path)\n",
"os.chdir(path)\n",
"print(\"Current Working Directory\", os.getcwd())"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "_KnO_jQPi14N"
},
"source": [
"安装依赖库,加载骨干模型:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "B8VW6dXMSda_"
},
"source": [
"!pip install efficientnet_pytorch livelossplot"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "lDbZNobFMFF2"
},
"source": [
"from efficientnet_pytorch import EfficientNet\n",
"\n",
"# 加载预训练模型\n", | |
"net = EfficientNet.from_pretrained('efficientnet-b0', num_classes=20902)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "OSu__zx3i8Yf" | |
}, | |
"source": [ | |
"构造数据集加载器,动态渲染产生训练集: \n", | |
"\\#16L 需修改为你使用的字体名称,多多益善" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "fgmGp0_1PG1k"
},
"source": [
"import random\n",
"import numpy as np\n",
"from PIL import Image, ImageDraw, ImageFont\n",
"from torch.utils.data import DataLoader, Dataset\n",
"import torchvision.transforms as transforms\n",
"\n",
"class FontImgLoader(Dataset):\n",
"\n",
"    transform = transforms.Compose(\n",
"        [transforms.ToTensor(),\n",
"         transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])\n",
"\n",
"    def __init__(self):\n",
"        self.loop = 10\n",
"        self.length = self.loop * 20902 # 0x4e00 -> 0x9fa6\n",
" self.fonts = ['true_font_lanting_0.ttf', 'true_font_lanting_1.ttf'] # 与目标字体相似的字体\n", | |
" \n", | |
" def __len__(self):\n", | |
" return self.length\n", | |
" \n", | |
" def __getitem__(self, index):\n", | |
" word = index%20902\n", | |
" # 随机化字体字号旋转角度\n", | |
" font = random.choice(self.fonts)\n", | |
" font_size = random.choice(range(185, 210))\n", | |
" angle = random.uniform(-5, 5)\n", | |
" # 渲染单字\n", | |
" img = Image.new('RGB', (224, 224), (255, 255, 255))\n", | |
" fnt = ImageFont.truetype(font, font_size)\n", | |
" draw = ImageDraw.Draw(img)\n", | |
" draw.text((12,-22), chr(word+0x4e00), font=fnt, fill=(0,0,0))\n", | |
" img = img.rotate(angle, expand=0, fillcolor=(255, 255, 255))\n", | |
" return self.transform(img), word\n" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
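{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check (a minimal sketch, assuming the `.ttf` files listed in `self.fonts` are present in the working directory): render one sample and inspect its tensor shape and label."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sanity check: render a single training sample and inspect it\n",
"# (assumes the font files listed in FontImgLoader.fonts exist in the working directory)\n",
"sample_loader = FontImgLoader()\n",
"img_tensor, label = sample_loader[0]\n",
"print(img_tensor.shape, label, chr(0x4e00 + label))"
],
"execution_count": null,
"outputs": []
},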
{
"cell_type": "markdown",
"metadata": {
"id": "m5MN0FLKjPJ4"
},
"source": [
"训练模型: \n", | |
"注意修改 batch_size、epoch_num、lr 确保训练能够进行 \n", | |
"通常 loss 低于 0.2 时认为模型已经收敛,模型的训练参数保存在 b0.pth 文件中" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "BqrLeRbHQEqt"
},
"source": [
"import torch\n",
"import torch.optim as optim\n",
"from livelossplot import PlotLosses\n",
"import numpy as np\n",
"import os\n",
"\n",
"batch_size = 128\n",
"epoch_num = 20\n",
"trainloader = DataLoader(\n",
"    FontImgLoader(), batch_size=batch_size,\n",
"    shuffle=True, num_workers=2)\n",
"\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(net.parameters(), lr=1e-4) # 初始化学习率,非初次训练时需降低数值\n", | |
"\n", | |
"if os.path.exists('b0.pth'):\n", | |
" print('load state dict')\n", | |
" net.load_state_dict(torch.load('b0.pth'))\n", | |
"\n", | |
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", | |
"net.to(device)\n", | |
"\n", | |
"liveloss = PlotLosses()\n", | |
"\n", | |
"for epoch in range(epoch_num): # loop over the dataset multiple times\n", | |
"\n", | |
" running_loss = 0.0\n", | |
" for i, data in enumerate(trainloader, 0):\n", | |
" # get the inputs; data is a list of [inputs, labels]\n", | |
" inputs, labels = data\n", | |
" # zero the parameter gradients\n", | |
" optimizer.zero_grad()\n", | |
"\n", | |
" # forward + backward + optimize\n", | |
" outputs = net(inputs.to(device))\n", | |
" loss = criterion(outputs, labels.to(device))\n", | |
" loss.backward()\n", | |
" optimizer.step()\n", | |
"\n", | |
" # print statistics\n", | |
" running_loss += loss.item()\n", | |
" if i % 10 == 9: # print every 2000 mini-batches\n", | |
" print('[%d, %5d] loss: %.3f' %\n", | |
" (epoch + 1, i + 1, running_loss / 10))\n", | |
" liveloss.update({'loss': running_loss / 10})\n", | |
" liveloss.send()\n", | |
" running_loss = 0.0\n", | |
" \n", | |
" if i%500 == 499:\n", | |
" torch.save(net.state_dict(), 'b0.pth')\n", | |
" \n", | |
" torch.save(net.state_dict(), 'b0.pth')\n", | |
"\n", | |
"torch.save(net.state_dict(), 'b0.pth')\n", | |
"print('Finished Training')\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "cbnEAeKOjsYH" | |
}, | |
"source": [ | |
"使用训练完成的模型:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "tT1hrrZ-1WlM"
},
"source": [
"# 验证\n", | |
"import json\n", | |
"from PIL import Image, ImageDraw, ImageFont\n", | |
"from tqdm import tqdm\n", | |
"import numpy as np\n", | |
"import torch\n", | |
"import torchvision.transforms as transforms\n", | |
"from efficientnet_pytorch import EfficientNet\n", | |
"\n", | |
"\n", | |
"# 加载模型\n", | |
"net = EfficientNet.from_pretrained('efficientnet-b0', num_classes=20902)\n", | |
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", | |
"net.load_state_dict(torch.load('b0.pth', map_location=torch.device(device)))\n", | |
"net.to(device)\n", | |
"net.eval()\n", | |
"transform = transforms.Compose(\n", | |
" [transforms.ToTensor(),\n", | |
" transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])\n", | |
"\n", | |
"# 复刻渲染流程\n", | |
"def gen_img(k, angle=0):\n", | |
" img = Image.new('RGB', (224, 224), (255, 255, 255))\n", | |
" fnt = ImageFont.truetype('fake.ttf', 200)\n", | |
" draw = ImageDraw.Draw(img)\n", | |
" draw.text((12,-22), chr(k), font=fnt, fill=(0,0,0))\n", | |
" img = img.rotate(angle, expand=0, fillcolor=(255, 255, 255))\n", | |
" return transform(img)\n", | |
"\n", | |
"# 解混淆测试\n", | |
"import re\n", | |
"import os\n", | |
"\n", | |
"map_path = 'map.json'\n", | |
"map = {}\n", | |
"test_text = open('test.html', 'r').read()\n", | |
"f_text = re.findall(r'\\\\u([abcdef\\d]{4})', test_text)\n", | |
"\n", | |
"# 尝试加载已有映射表\n", | |
"if os.path.exists(map_path):\n", | |
" map = json.loads(open(map_path, 'rb').read())\n", | |
"\n", | |
"# 产生映射表\n", | |
"for i in tqdm(f_text):\n", | |
" idx = int(i, 16)\n", | |
" if idx in map:\n", | |
" continue\n", | |
" img = torch.unsqueeze(gen_img(idx), dim=0)\n", | |
" raw = net(img.to(device))\n", | |
" outputs = torch.nn.functional.softmax(raw, dim=1)\n", | |
" value, preds = torch.max(outputs, 1)\n", | |
" map[i] = chr(offset+preds[0]) if value > 0.5 else chr(idx)\n", | |
"\n", | |
"# 输出解码结果\n", | |
"for i, k in map.items():\n", | |
" test_text = test_text.replace(f'\\\\u{i}', k)\n", | |
"print(test_text)\n", | |
"\n", | |
"# 保存映射表\n", | |
"with open(map_path, 'wb') as wp:\n", | |
" wp.write(json.dumps(map).encode('UTF-8')) " | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
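{
"cell_type": "markdown",
"metadata": {},
"source": [
"An optional alternative sketch: if the obfuscated font file is at hand, the candidate codepoints can be read directly from its cmap table with fontTools (`pip install fonttools`) instead of regexing the HTML for `\\u` escapes. The file name `fake.ttf` is the same assumption `gen_img` already makes, and the cell reuses `net`, `device`, `map`, and `offset` from the previous cell."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch: enumerate obfuscated codepoints straight from the font's cmap table\n",
"# (assumes fontTools is installed and fake.ttf is the obfuscated font)\n",
"from fontTools.ttLib import TTFont\n",
"\n",
"obf_font = TTFont('fake.ttf')\n",
"codepoints = sorted(obf_font['cmap'].getBestCmap().keys())\n",
"print(f'{len(codepoints)} mapped codepoints found in fake.ttf')\n",
"\n",
"for idx in tqdm(codepoints):\n",
"    key = f'{idx:04x}'\n",
"    if key in map:\n",
"        continue\n",
"    img = torch.unsqueeze(gen_img(idx), dim=0)\n",
"    outputs = torch.nn.functional.softmax(net(img.to(device)), dim=1)\n",
"    value, preds = torch.max(outputs, 1)\n",
"    map[key] = chr(offset + preds[0]) if value > 0.5 else chr(idx)"
],
"execution_count": null,
"outputs": []
},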
{
"cell_type": "markdown",
"metadata": {
"id": "6WUuyjU8kiqC"
},
"source": [
"测试预测性能:" | |
]
},
{
"cell_type": "code",
"metadata": {
"id": "M6v2ZjJsj75k"
},
"source": [
"# ground truth 测试\n", | |
"gt = json.loads(open('gt.user.js').read())\n", | |
"offset = 0x4e00\n", | |
"total = 0\n", | |
"count = 0\n", | |
"\n", | |
"for k, v in tqdm(gt.items()):\n", | |
" # 使用不同角度预测结果增强准确度\n", | |
" img_0 = torch.unsqueeze(gen_img(ord(k)), dim=0)\n", | |
" raw_0 = net(img_0.to(device))\n", | |
" img_1 = torch.unsqueeze(gen_img(ord(k), -3), dim=0)\n", | |
" raw_1 = net(img_1.to(device))\n", | |
" img_2 = torch.unsqueeze(gen_img(ord(k), 3), dim=0)\n", | |
" raw_2 = net(img_2.to(device))\n", | |
" outputs = torch.nn.functional.softmax(raw_0+raw_1+raw_2, dim=1)\n", | |
" value, preds = torch.max(outputs, 1)\n", | |
" if ord(v) == (offset+preds[0]):\n", | |
" count += 1\n", | |
" else:\n", | |
" print(f'\\ntruth: {v} {ord(v)} pred: {chr(offset+preds[0])} {offset+preds[0]} {value[0]:.3f} %')\n", | |
" pass\n", | |
" total += 1\n", | |
" # if total > 500:\n", | |
" # break\n", | |
"\n", | |
"print(f'\\nacc: {count/total:.3f}')\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |