pr0nstar/all.your.files.belong.to.us.ipynb

## all.your.files.belong.to.us.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import os\n",
    "import re\n",
    "import sys\n",
    "import glob\n",
    "import tqdm\n",
    "\n",
    "ORIG_PATH = os.getcwd()\n",
    "BASE_PATH = '../gits/deep-text-recognition-benchmark/'\n",
    "sys.path.append(BASE_PATH)\n",
    "os.chdir(BASE_PATH)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import urllib.request\n",
    "\n",
    "from PIL import Image\n",
    "\n",
    "import torch\n",
    "import torch.backends.cudnn as cudnn\n",
    "import torch.utils.data\n",
    "import torch.nn.functional as F\n",
    "\n",
    "from utils import AttnLabelConverter\n",
    "from dataset import RawDataset, AlignCollate\n",
    "from model import Model\n",
    "\n",
    "os.chdir(ORIG_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_image(file_name, threshold=290):\n",
    "    \"\"\"Open a captcha image, whiten its light pixels and return it in grayscale.\n",
    "\n",
    "    Args:\n",
    "        file_name: Path or binary file-like object accepted by ``Image.open``.\n",
    "        threshold: A pixel whose R + G + B sum exceeds this value is painted\n",
    "            pure white before the grayscale conversion.\n",
    "\n",
    "    Returns:\n",
    "        The cleaned picture as a PIL image in mode 'L'.\n",
    "    \"\"\"\n",
    "    # Force three channels first: the former reshape(h, w, 3) raised on\n",
    "    # palette, grayscale or RGBA inputs.\n",
    "    image = Image.open(file_name).convert('RGB')\n",
    "    image_arr = np.array(image)\n",
    "    # np.sum accumulates uint8 data in a wider integer type, so three bright\n",
    "    # channels cannot wrap around before the comparison.\n",
    "    image_arr[np.sum(image_arr, axis=2) > threshold] = [255, 255, 255]\n",
    "\n",
    "    return Image.fromarray(image_arr).convert('L')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/clovaai/deep-text-recognition-benchmark\n",
    "# Checkpoint file; load_model() appends it to BASE_PATH.\n",
    "MODEL = 'models/TPS-ResNet-BiLSTM-Attn.pth'\n",
    "# Alphabet handed to AttnLabelConverter in break_captcha().\n",
    "CHARS = '0123456789abcdefghijklmnopqrstuvwxyz'\n",
    "# The model and every tensor are placed on the CPU.\n",
    "DEVICE = torch.device('cpu')\n",
    "\n",
    "class OPT(object):\n",
    "    \"\"\"Option namespace handed to ``Model`` (see load_model).\"\"\"\n",
    "\n",
    "    # Stage names; together they spell the 'TPS-ResNet-BiLSTM-Attn' in MODEL.\n",
    "    Transformation = 'TPS'\n",
    "    FeatureExtraction = 'ResNet'\n",
    "    SequenceModeling = 'BiLSTM'\n",
    "    Prediction = 'Attn'\n",
    "\n",
    "    num_fiducial = 20\n",
    "    # Also used by break_captcha() as the AlignCollate target size.\n",
    "    imgH = 32\n",
    "    imgW = 100\n",
    "    input_channel = 1\n",
    "    output_channel = 512\n",
    "    hidden_size = 256\n",
    "\n",
    "    # Derived instead of hard-coded so it cannot drift from CHARS. The two\n",
    "    # extra classes are presumably the '[GO]' and '[s]' tokens that the\n",
    "    # upstream AttnLabelConverter prepends -- verify against utils.py.\n",
    "    # 36 + 2 equals the 38 that used to be written by hand.\n",
    "    num_class = len(CHARS) + 2\n",
    "    # break_captcha() decodes batch_max_length + 1 steps.\n",
    "    batch_max_length = 5\n",
    "\n",
    "opt = OPT()\n",
    "\n",
    "def load_model():\n",
    "    \"\"\"Build the network described by ``opt`` and load the MODEL checkpoint.\n",
    "\n",
    "    Returns:\n",
    "        The ``torch.nn.DataParallel``-wrapped model on DEVICE, in eval mode.\n",
    "    \"\"\"\n",
    "    # The wrapper is applied before the weights are loaded; presumably the\n",
    "    # checkpoint was saved from a DataParallel model -- TODO confirm.\n",
    "    net = torch.nn.DataParallel(Model(opt)).to(DEVICE)\n",
    "    # NOTE(review): torch.load unpickles the file; use trusted checkpoints only.\n",
    "    weights = torch.load(BASE_PATH + MODEL, map_location=DEVICE)\n",
    "    net.load_state_dict(weights)\n",
    "    net.eval()\n",
    "\n",
    "    return net\n",
    "\n",
    "def break_captcha(model, image):\n",
    "    \"\"\"Run the recognizer on one image and return its text and a confidence.\n",
    "\n",
    "    Args:\n",
    "        model: Network to query (``load_model()`` output in this notebook).\n",
    "        image: Picture handed to ``AlignCollate``.\n",
    "\n",
    "    Returns:\n",
    "        ``(preds_str, confidence_score)``: whatever ``converter.decode``\n",
    "        returns for the one-image batch, and the last element of the\n",
    "        cumulative product of the per-step maximum softmax probabilities.\n",
    "    \"\"\"\n",
    "    converter = AttnLabelConverter(CHARS)\n",
    "    collate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=True)\n",
    "    max_len = opt.batch_max_length\n",
    "\n",
    "    with torch.no_grad():\n",
    "        # The second item returned by the collate call is discarded, so an\n",
    "        # empty string is passed alongside the image.\n",
    "        batch, _ = collate([(image, '')])\n",
    "        batch = batch.to(DEVICE)\n",
    "        n_images = batch.size(0)\n",
    "\n",
    "        lengths = torch.IntTensor([max_len] * n_images).to(DEVICE)\n",
    "        decoder_input = torch.LongTensor(n_images, max_len + 1).fill_(0).to(DEVICE)\n",
    "\n",
    "        logits = model(batch, decoder_input, is_train=False)\n",
    "        pred_indices = logits.max(2)[1]\n",
    "        preds_str = converter.decode(pred_indices, lengths)\n",
    "\n",
    "        step_confidence = F.softmax(logits, dim=2).max(dim=2)[0]\n",
    "        # NOTE(review): every step is multiplied in, including any that\n",
    "        # follow an '[s]' token in the decoded string.\n",
    "        confidence_score = step_confidence[0].cumprod(dim=0)[-1]\n",
    "\n",
    "    return preds_str, confidence_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Host and path that do_download() combines into the request URLs.\n",
    "DOMAIN = 'www.sicoes.gob.bo'\n",
    "URI = '/portal/contrataciones/ficha/'\n",
    "# Attempt limit read by do_download() and by the statistics loop.\n",
    "MAX_TRY = 3\n",
    "\n",
    "# Built once here; do_download() reads this module-level instance.\n",
    "model = load_model()\n",
    "\n",
    "class NoRedirect(urllib.request.HTTPRedirectHandler):\n",
    "    \"\"\"Redirect handler that never supplies a follow-up request.\"\"\"\n",
    "\n",
    "    def redirect_request(self, req, fp, code, msg, headers, newurl):\n",
    "        # No replacement request is offered, so the redirect is not followed;\n",
    "        # do_request() then sees it as an HTTPError.\n",
    "        return None\n",
    "\n",
    "class ErrorRequest(object):\n",
    "    \"\"\"Stand-in returned by do_request() in place of a response on HTTP errors.\n",
    "\n",
    "    It only stores a status code and exposes it through ``getcode()``.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, code=302):\n",
    "        self.code = code\n",
    "\n",
    "    def getcode(self):\n",
    "        \"\"\"Return the status code given to the constructor.\"\"\"\n",
    "        return self.code\n",
    "\n",
    "# Installed globally, so every urllib.request.urlopen() call in this\n",
    "# notebook goes through the NoRedirect handler.\n",
    "opener = urllib.request.build_opener(NoRedirect)\n",
    "urllib.request.install_opener(opener)\n",
    "\n",
    "def do_request(URL, data=None, cookie=None, timeout=60):\n",
    "    \"\"\"Send one HTTP request and return the response or an error stand-in.\n",
    "\n",
    "    Args:\n",
    "        URL: Absolute URL to fetch.\n",
    "        data: Optional request body; a dict is form-encoded to bytes first.\n",
    "        cookie: Optional value for the ``Cookie`` request header.\n",
    "        timeout: Seconds passed to ``urlopen`` so a stalled server cannot\n",
    "            block the crawl forever.\n",
    "\n",
    "    Returns:\n",
    "        Whatever ``urlopen`` returns, or an ``ErrorRequest`` carrying the\n",
    "        status code when ``urlopen`` raises ``HTTPError``.\n",
    "    \"\"\"\n",
    "    headers = {\n",
    "        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'\n",
    "    }\n",
    "    if cookie:\n",
    "        headers['Cookie'] = cookie\n",
    "\n",
    "    # isinstance also accepts dict subclasses such as OrderedDict.\n",
    "    if isinstance(data, dict):\n",
    "        data = urllib.parse.urlencode(data).encode()\n",
    "\n",
    "    req = urllib.request.Request(URL, data=data, headers=headers)\n",
    "\n",
    "    try:\n",
    "        return urllib.request.urlopen(req, timeout=timeout)\n",
    "    except urllib.error.HTTPError as err:\n",
    "        # Report the real status instead of labelling every failure as 302.\n",
    "        return ErrorRequest(err.code)\n",
    "    \n",
    "def _captcha_text(decoded):\n",
    "    \"\"\"Return ``decoded`` cut at its first '[s]' token, or whole if there is none.\"\"\"\n",
    "    # str.find() yields -1 when the token is missing, and using that as a\n",
    "    # slice bound silently dropped the last character; partition() does not.\n",
    "    return decoded.partition('[s]')[0]\n",
    "\n",
    "def do_download(cuce, _try=0, min_score=.4):\n",
    "    \"\"\"Download the PDF for one CUCE, solving a fresh captcha on every attempt.\n",
    "\n",
    "    Args:\n",
    "        cuce: Value posted as the 'cp' form field and used as the file name.\n",
    "        _try: Index of the first attempt; attempts run while it is <= MAX_TRY.\n",
    "        min_score: Captcha guesses scoring below this are not submitted.\n",
    "\n",
    "    Returns:\n",
    "        The index of the attempt that succeeded (0 for the first one), or\n",
    "        None when every attempt failed.\n",
    "    \"\"\"\n",
    "    # Strip the slashes of URI so the URL has no empty path segments.\n",
    "    base_url = 'https://{}/{}'.format(DOMAIN, URI.strip('/'))\n",
    "\n",
    "    for attempt in range(_try, MAX_TRY + 1):\n",
    "        req = do_request('{}/{}'.format(base_url, 'captchaFicha.php'))\n",
    "        if req.getcode() != 200:\n",
    "            # An ErrorRequest has neither headers nor a body to read.\n",
    "            continue\n",
    "\n",
    "        # Read each Set-Cookie header on its own: splitting a joined header\n",
    "        # on ',' breaks on the comma inside an 'expires=' date.\n",
    "        set_cookies = req.headers.get_all('Set-Cookie') or []\n",
    "        cookies = [header.split(';')[0].strip() for header in set_cookies]\n",
    "\n",
    "        image = load_image(io.BytesIO(req.read()))\n",
    "        resolved, score = break_captcha(model, image)\n",
    "        if score < min_score:\n",
    "            continue\n",
    "\n",
    "        req = do_request(\n",
    "            '{}/{}'.format(base_url, 'descargaFicha.php'),\n",
    "            data={\n",
    "                'inpCaptcha': _captcha_text(resolved[0]),\n",
    "                'cp': cuce,\n",
    "                'B903A6B7': 'DEADBEEF'\n",
    "            },\n",
    "            cookie='; '.join(cookies)\n",
    "        )\n",
    "        if req.getcode() != 200:\n",
    "            continue\n",
    "\n",
    "        # Create the output folder on first use instead of failing in open().\n",
    "        os.makedirs('../res', exist_ok=True)\n",
    "        # NOTE(review): the body is stored as-is; nothing checks it is a PDF.\n",
    "        with open('../res/{}.pdf'.format(cuce), 'wb') as f:\n",
    "            f.write(req.read())\n",
    "\n",
    "        return attempt\n",
    "\n",
    "    return None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tender listing; only rows whose contract type is 'Bienes' are kept.\n",
    "df = pd.read_csv('../../convocatorias/convocatorias.csv')\n",
    "# Highest index label first; the download loop iterates in this order.\n",
    "cuces = df.loc[df['tipo_de_contratacion'] == 'Bienes', 'cuce'].sort_index(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|          | 94/8761 [04:10<5:43:49,  2.38s/it] "
     ]
    }
   ],
   "source": [
    "# Totals: downloads that succeeded or failed, and failed attempts overall.\n",
    "stats = {'error': 0, 'fail': 0, 'success': 0}\n",
    "for cuce in tqdm.tqdm(cuces):\n",
    "    try:\n",
    "        res = do_download(cuce)\n",
    "    except Exception:\n",
    "        # Best effort: one broken download must not stop the crawl, but it\n",
    "        # has to count as a failure. Without this reset `res` kept the value\n",
    "        # of the previous iteration (or was undefined on the first pass).\n",
    "        res = None\n",
    "\n",
    "    if res is None:\n",
    "        stats['fail'] += 1\n",
    "        # do_download() tries attempt indexes 0..MAX_TRY before giving up,\n",
    "        # which is MAX_TRY + 1 failed attempts.\n",
    "        stats['error'] += MAX_TRY + 1\n",
    "    else:\n",
    "        stats['success'] += 1\n",
    "        # `res` is the index of the winning attempt, i.e. the number of\n",
    "        # failed attempts that preceded it.\n",
    "        stats['error'] += res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "stats"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"import io\n",
	"import os\n",
	"import re\n",
	"import sys\n",
	"import glob\n",
	"import tqdm\n",
	"\n",
	"ORIG_PATH = os.getcwd()\n",
	"BASE_PATH = '../gits/deep-text-recognition-benchmark/'\n",
	"sys.path.append(BASE_PATH)\n",
	"os.chdir(BASE_PATH)\n",
	"\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"import urllib.request\n",
	"\n",
	"from PIL import Image\n",
	"\n",
	"import torch\n",
	"import torch.backends.cudnn as cudnn\n",
	"import torch.utils.data\n",
	"import torch.nn.functional as F\n",
	"\n",
	"from utils import AttnLabelConverter\n",
	"from dataset import RawDataset, AlignCollate\n",
	"from model import Model\n",
	"\n",
	"os.chdir(ORIG_PATH)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def load_image(file_name, threshold=290):\n",
	"    \"\"\"Open a captcha image, whiten its light pixels and return it in grayscale.\n",
	"\n",
	"    Args:\n",
	"        file_name: Path or binary file-like object accepted by ``Image.open``.\n",
	"        threshold: A pixel whose R + G + B sum exceeds this value is painted\n",
	"            pure white before the grayscale conversion.\n",
	"\n",
	"    Returns:\n",
	"        The cleaned picture as a PIL image in mode 'L'.\n",
	"    \"\"\"\n",
	"    # Force three channels first: the former reshape(h, w, 3) raised on\n",
	"    # palette, grayscale or RGBA inputs.\n",
	"    image = Image.open(file_name).convert('RGB')\n",
	"    image_arr = np.array(image)\n",
	"    # np.sum accumulates uint8 data in a wider integer type, so three bright\n",
	"    # channels cannot wrap around before the comparison.\n",
	"    image_arr[np.sum(image_arr, axis=2) > threshold] = [255, 255, 255]\n",
	"\n",
	"    return Image.fromarray(image_arr).convert('L')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"# https://github.com/clovaai/deep-text-recognition-benchmark\n",
	"# Checkpoint file; load_model() appends it to BASE_PATH.\n",
	"MODEL = 'models/TPS-ResNet-BiLSTM-Attn.pth'\n",
	"# Alphabet handed to AttnLabelConverter in break_captcha().\n",
	"CHARS = '0123456789abcdefghijklmnopqrstuvwxyz'\n",
	"# The model and every tensor are placed on the CPU.\n",
	"DEVICE = torch.device('cpu')\n",
	"\n",
	"class OPT(object):\n",
	"    \"\"\"Option namespace handed to ``Model`` (see load_model).\"\"\"\n",
	"\n",
	"    # Stage names; together they spell the 'TPS-ResNet-BiLSTM-Attn' in MODEL.\n",
	"    Transformation = 'TPS'\n",
	"    FeatureExtraction = 'ResNet'\n",
	"    SequenceModeling = 'BiLSTM'\n",
	"    Prediction = 'Attn'\n",
	"\n",
	"    num_fiducial = 20\n",
	"    # Also used by break_captcha() as the AlignCollate target size.\n",
	"    imgH = 32\n",
	"    imgW = 100\n",
	"    input_channel = 1\n",
	"    output_channel = 512\n",
	"    hidden_size = 256\n",
	"\n",
	"    # Derived instead of hard-coded so it cannot drift from CHARS. The two\n",
	"    # extra classes are presumably the '[GO]' and '[s]' tokens that the\n",
	"    # upstream AttnLabelConverter prepends -- verify against utils.py.\n",
	"    # 36 + 2 equals the 38 that used to be written by hand.\n",
	"    num_class = len(CHARS) + 2\n",
	"    # break_captcha() decodes batch_max_length + 1 steps.\n",
	"    batch_max_length = 5\n",
	"\n",
	"opt = OPT()\n",
	"\n",
	"def load_model():\n",
	"    \"\"\"Build the network described by ``opt`` and load the MODEL checkpoint.\n",
	"\n",
	"    Returns:\n",
	"        The ``torch.nn.DataParallel``-wrapped model on DEVICE, in eval mode.\n",
	"    \"\"\"\n",
	"    # The wrapper is applied before the weights are loaded; presumably the\n",
	"    # checkpoint was saved from a DataParallel model -- TODO confirm.\n",
	"    net = torch.nn.DataParallel(Model(opt)).to(DEVICE)\n",
	"    # NOTE(review): torch.load unpickles the file; use trusted checkpoints only.\n",
	"    weights = torch.load(BASE_PATH + MODEL, map_location=DEVICE)\n",
	"    net.load_state_dict(weights)\n",
	"    net.eval()\n",
	"\n",
	"    return net\n",
	"\n",
	"def break_captcha(model, image):\n",
	"    \"\"\"Run the recognizer on one image and return its text and a confidence.\n",
	"\n",
	"    Args:\n",
	"        model: Network to query (``load_model()`` output in this notebook).\n",
	"        image: Picture handed to ``AlignCollate``.\n",
	"\n",
	"    Returns:\n",
	"        ``(preds_str, confidence_score)``: whatever ``converter.decode``\n",
	"        returns for the one-image batch, and the last element of the\n",
	"        cumulative product of the per-step maximum softmax probabilities.\n",
	"    \"\"\"\n",
	"    converter = AttnLabelConverter(CHARS)\n",
	"    collate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=True)\n",
	"    max_len = opt.batch_max_length\n",
	"\n",
	"    with torch.no_grad():\n",
	"        # The second item returned by the collate call is discarded, so an\n",
	"        # empty string is passed alongside the image.\n",
	"        batch, _ = collate([(image, '')])\n",
	"        batch = batch.to(DEVICE)\n",
	"        n_images = batch.size(0)\n",
	"\n",
	"        lengths = torch.IntTensor([max_len] * n_images).to(DEVICE)\n",
	"        decoder_input = torch.LongTensor(n_images, max_len + 1).fill_(0).to(DEVICE)\n",
	"\n",
	"        logits = model(batch, decoder_input, is_train=False)\n",
	"        pred_indices = logits.max(2)[1]\n",
	"        preds_str = converter.decode(pred_indices, lengths)\n",
	"\n",
	"        step_confidence = F.softmax(logits, dim=2).max(dim=2)[0]\n",
	"        # NOTE(review): every step is multiplied in, including any that\n",
	"        # follow an '[s]' token in the decoded string.\n",
	"        confidence_score = step_confidence[0].cumprod(dim=0)[-1]\n",
	"\n",
	"    return preds_str, confidence_score"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Host and path that do_download() combines into the request URLs.\n",
	"DOMAIN = 'www.sicoes.gob.bo'\n",
	"URI = '/portal/contrataciones/ficha/'\n",
	"# Attempt limit read by do_download() and by the statistics loop.\n",
	"MAX_TRY = 3\n",
	"\n",
	"# Built once here; do_download() reads this module-level instance.\n",
	"model = load_model()\n",
	"\n",
	"class NoRedirect(urllib.request.HTTPRedirectHandler):\n",
	"    \"\"\"Redirect handler that never supplies a follow-up request.\"\"\"\n",
	"\n",
	"    def redirect_request(self, req, fp, code, msg, headers, newurl):\n",
	"        # No replacement request is offered, so the redirect is not followed;\n",
	"        # do_request() then sees it as an HTTPError.\n",
	"        return None\n",
	"\n",
	"class ErrorRequest(object):\n",
	"    \"\"\"Stand-in returned by do_request() in place of a response on HTTP errors.\n",
	"\n",
	"    It only stores a status code and exposes it through ``getcode()``.\n",
	"    \"\"\"\n",
	"\n",
	"    def __init__(self, code=302):\n",
	"        self.code = code\n",
	"\n",
	"    def getcode(self):\n",
	"        \"\"\"Return the status code given to the constructor.\"\"\"\n",
	"        return self.code\n",
	"\n",
	"# Installed globally, so every urllib.request.urlopen() call in this\n",
	"# notebook goes through the NoRedirect handler.\n",
	"opener = urllib.request.build_opener(NoRedirect)\n",
	"urllib.request.install_opener(opener)\n",
	"\n",
	"def do_request(URL, data=None, cookie=None, timeout=60):\n",
	"    \"\"\"Send one HTTP request and return the response or an error stand-in.\n",
	"\n",
	"    Args:\n",
	"        URL: Absolute URL to fetch.\n",
	"        data: Optional request body; a dict is form-encoded to bytes first.\n",
	"        cookie: Optional value for the ``Cookie`` request header.\n",
	"        timeout: Seconds passed to ``urlopen`` so a stalled server cannot\n",
	"            block the crawl forever.\n",
	"\n",
	"    Returns:\n",
	"        Whatever ``urlopen`` returns, or an ``ErrorRequest`` carrying the\n",
	"        status code when ``urlopen`` raises ``HTTPError``.\n",
	"    \"\"\"\n",
	"    headers = {\n",
	"        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'\n",
	"    }\n",
	"    if cookie:\n",
	"        headers['Cookie'] = cookie\n",
	"\n",
	"    # isinstance also accepts dict subclasses such as OrderedDict.\n",
	"    if isinstance(data, dict):\n",
	"        data = urllib.parse.urlencode(data).encode()\n",
	"\n",
	"    req = urllib.request.Request(URL, data=data, headers=headers)\n",
	"\n",
	"    try:\n",
	"        return urllib.request.urlopen(req, timeout=timeout)\n",
	"    except urllib.error.HTTPError as err:\n",
	"        # Report the real status instead of labelling every failure as 302.\n",
	"        return ErrorRequest(err.code)\n",
	" \n",
	"def _captcha_text(decoded):\n",
	"    \"\"\"Return ``decoded`` cut at its first '[s]' token, or whole if there is none.\"\"\"\n",
	"    # str.find() yields -1 when the token is missing, and using that as a\n",
	"    # slice bound silently dropped the last character; partition() does not.\n",
	"    return decoded.partition('[s]')[0]\n",
	"\n",
	"def do_download(cuce, _try=0, min_score=.4):\n",
	"    \"\"\"Download the PDF for one CUCE, solving a fresh captcha on every attempt.\n",
	"\n",
	"    Args:\n",
	"        cuce: Value posted as the 'cp' form field and used as the file name.\n",
	"        _try: Index of the first attempt; attempts run while it is <= MAX_TRY.\n",
	"        min_score: Captcha guesses scoring below this are not submitted.\n",
	"\n",
	"    Returns:\n",
	"        The index of the attempt that succeeded (0 for the first one), or\n",
	"        None when every attempt failed.\n",
	"    \"\"\"\n",
	"    # Strip the slashes of URI so the URL has no empty path segments.\n",
	"    base_url = 'https://{}/{}'.format(DOMAIN, URI.strip('/'))\n",
	"\n",
	"    for attempt in range(_try, MAX_TRY + 1):\n",
	"        req = do_request('{}/{}'.format(base_url, 'captchaFicha.php'))\n",
	"        if req.getcode() != 200:\n",
	"            # An ErrorRequest has neither headers nor a body to read.\n",
	"            continue\n",
	"\n",
	"        # Read each Set-Cookie header on its own: splitting a joined header\n",
	"        # on ',' breaks on the comma inside an 'expires=' date.\n",
	"        set_cookies = req.headers.get_all('Set-Cookie') or []\n",
	"        cookies = [header.split(';')[0].strip() for header in set_cookies]\n",
	"\n",
	"        image = load_image(io.BytesIO(req.read()))\n",
	"        resolved, score = break_captcha(model, image)\n",
	"        if score < min_score:\n",
	"            continue\n",
	"\n",
	"        req = do_request(\n",
	"            '{}/{}'.format(base_url, 'descargaFicha.php'),\n",
	"            data={\n",
	"                'inpCaptcha': _captcha_text(resolved[0]),\n",
	"                'cp': cuce,\n",
	"                'B903A6B7': 'DEADBEEF'\n",
	"            },\n",
	"            cookie='; '.join(cookies)\n",
	"        )\n",
	"        if req.getcode() != 200:\n",
	"            continue\n",
	"\n",
	"        # Create the output folder on first use instead of failing in open().\n",
	"        os.makedirs('../res', exist_ok=True)\n",
	"        # NOTE(review): the body is stored as-is; nothing checks it is a PDF.\n",
	"        with open('../res/{}.pdf'.format(cuce), 'wb') as f:\n",
	"            f.write(req.read())\n",
	"\n",
	"        return attempt\n",
	"\n",
	"    return None"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Tender listing; only rows whose contract type is 'Bienes' are kept.\n",
	"df = pd.read_csv('../../convocatorias/convocatorias.csv')\n",
	"# Highest index label first; the download loop iterates in this order.\n",
	"cuces = df.loc[df['tipo_de_contratacion'] == 'Bienes', 'cuce'].sort_index(ascending=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"  1%|          | 94/8761 [04:10<5:43:49,  2.38s/it] "
	]
	}
	],
	"source": [
	"# Totals: downloads that succeeded or failed, and failed attempts overall.\n",
	"stats = {'error': 0, 'fail': 0, 'success': 0}\n",
	"for cuce in tqdm.tqdm(cuces):\n",
	"    try:\n",
	"        res = do_download(cuce)\n",
	"    except Exception:\n",
	"        # Best effort: one broken download must not stop the crawl, but it\n",
	"        # has to count as a failure. Without this reset `res` kept the value\n",
	"        # of the previous iteration (or was undefined on the first pass).\n",
	"        res = None\n",
	"\n",
	"    if res is None:\n",
	"        stats['fail'] += 1\n",
	"        # do_download() tries attempt indexes 0..MAX_TRY before giving up,\n",
	"        # which is MAX_TRY + 1 failed attempts.\n",
	"        stats['error'] += MAX_TRY + 1\n",
	"    else:\n",
	"        stats['success'] += 1\n",
	"        # `res` is the index of the winning attempt, i.e. the number of\n",
	"        # failed attempts that preceded it.\n",
	"        stats['error'] += res"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"stats"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.5"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}