Skip to content

Instantly share code, notes, and snippets.

@pr0nstar
Last active August 21, 2020 01:52
Show Gist options
  • Save pr0nstar/297addcbf22caa7b47fa485b1e391fd4 to your computer and use it in GitHub Desktop.
Save pr0nstar/297addcbf22caa7b47fa485b1e391fd4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"import io\n",
"import os\n",
"import re\n",
"import sys\n",
"import glob\n",
"import tqdm\n",
"\n",
"ORIG_PATH = os.getcwd()\n",
"BASE_PATH = '../gits/deep-text-recognition-benchmark/'\n",
"sys.path.append(BASE_PATH)\n",
"os.chdir(BASE_PATH)\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import urllib.request\n",
"\n",
"from PIL import Image\n",
"\n",
"import torch\n",
"import torch.backends.cudnn as cudnn\n",
"import torch.utils.data\n",
"import torch.nn.functional as F\n",
"\n",
"from utils import AttnLabelConverter\n",
"from dataset import RawDataset, AlignCollate\n",
"from model import Model\n",
"\n",
"os.chdir(ORIG_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def load_image(file_name):\n",
"    \"\"\"Load a captcha image and return it cleaned up as grayscale.\n",
"\n",
"    Every pixel whose R + G + B sum is above 290 is painted pure white\n",
"    before the image is converted to single-channel ('L') mode.\n",
"\n",
"    Args:\n",
"        file_name: Path or binary file-like object accepted by ``Image.open``.\n",
"\n",
"    Returns:\n",
"        A ``PIL.Image.Image`` in mode 'L'.\n",
"    \"\"\"\n",
"    with Image.open(file_name) as image:\n",
"        # Force three channels up front: a palette, grayscale or RGBA image\n",
"        # does not fit the (height, width, 3) layout the masking relies on.\n",
"        image_arr = np.array(image.convert('RGB'))\n",
"\n",
"    image_arr[np.sum(image_arr, axis=2) > 290] = [255, 255, 255]\n",
"\n",
"    return Image.fromarray(image_arr).convert('L')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# https://github.com/clovaai/deep-text-recognition-benchmark\n",
"MODEL = 'models/TPS-ResNet-BiLSTM-Attn.pth'  # appended to BASE_PATH by load_model\n",
"CHARS = '0123456789abcdefghijklmnopqrstuvwxyz'  # given to AttnLabelConverter\n",
"DEVICE = torch.device('cpu')\n",
"\n",
"\n",
"class OPT:\n",
"    \"\"\"Option values passed to ``Model``; the stage names mirror the MODEL file name.\"\"\"\n",
"\n",
"    # Stage selection.\n",
"    Transformation = 'TPS'\n",
"    FeatureExtraction = 'ResNet'\n",
"    SequenceModeling = 'BiLSTM'\n",
"    Prediction = 'Attn'\n",
"\n",
"    # Sizes. imgH / imgW are also given to AlignCollate in break_captcha.\n",
"    num_fiducial = 20\n",
"    imgH = 32\n",
"    imgW = 100\n",
"    input_channel = 1\n",
"    output_channel = 512\n",
"    hidden_size = 256\n",
"\n",
"    # NOTE(review): 38 is presumably the 36 CHARS plus two special tokens\n",
"    # added by AttnLabelConverter -- confirm against utils.py.\n",
"    num_class = 38\n",
"    # break_captcha decodes batch_max_length + 1 positions per image.\n",
"    batch_max_length = 5\n",
"\n",
"\n",
"opt = OPT()\n",
"\n",
"def load_model():\n",
"    \"\"\"Build the network described by ``opt`` and load its weights from MODEL.\n",
"\n",
"    Returns:\n",
"        The ``DataParallel``-wrapped model, placed on DEVICE and switched\n",
"        to eval mode.\n",
"    \"\"\"\n",
"    net = torch.nn.DataParallel(Model(opt)).to(DEVICE)\n",
"\n",
"    checkpoint_path = BASE_PATH + MODEL\n",
"    state_dict = torch.load(checkpoint_path, map_location=DEVICE)\n",
"    net.load_state_dict(state_dict)\n",
"    net.eval()\n",
"\n",
"    return net\n",
"\n",
"def break_captcha(model, image):\n",
"    \"\"\"Run the recognizer on a single image.\n",
"\n",
"    Args:\n",
"        model: Network to call, e.g. the one returned by load_model().\n",
"        image: Image handed to AlignCollate (do_download passes the\n",
"            result of load_image).\n",
"\n",
"    Returns:\n",
"        A tuple ``(preds_str, confidence_score)``: the output of\n",
"        ``converter.decode`` for the one-image batch, and a 0-dim tensor\n",
"        holding the product of the per-position maximum softmax\n",
"        probabilities of the first sample.\n",
"    \"\"\"\n",
"    converter = AttnLabelConverter(CHARS)\n",
"    collate = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=True)\n",
"    max_len = opt.batch_max_length\n",
"\n",
"    with torch.no_grad():\n",
"        # Batch of one image with an empty label.\n",
"        batch, _ = collate([(image, '')])\n",
"        batch = batch.to(DEVICE)\n",
"        n_samples = batch.size(0)\n",
"\n",
"        lengths = torch.IntTensor([max_len] * n_samples).to(DEVICE)\n",
"        decoder_input = torch.zeros(n_samples, max_len + 1, dtype=torch.long).to(DEVICE)\n",
"\n",
"        logits = model(batch, decoder_input, is_train=False)\n",
"        best_index = logits.max(2)[1]\n",
"        decoded = converter.decode(best_index, lengths)\n",
"\n",
"        best_prob = F.softmax(logits, dim=2).max(dim=2)[0]\n",
"        # Last element of the running product == product over all positions.\n",
"        confidence = best_prob[0].cumprod(dim=0)[-1]\n",
"\n",
"    return decoded, confidence"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Host and endpoint directory used to build the request URLs.\n",
"DOMAIN = 'www.sicoes.gob.bo'\n",
"URI = '/portal/contrataciones/ficha/'\n",
"# Attempt limit checked by do_download.\n",
"MAX_TRY = 3\n",
"\n",
"# Loaded once here and reused by every do_download call.\n",
"model = load_model()\n",
"\n",
"class NoRedirect(urllib.request.HTTPRedirectHandler):\n",
"    \"\"\"Redirect handler that never follows a redirect.\"\"\"\n",
"\n",
"    def redirect_request(self, req, fp, code, msg, headers, newurl):\n",
"        # Returning None declines the redirect, so the 3xx response reaches\n",
"        # the caller as an HTTPError (which do_request catches).\n",
"        return None\n",
"\n",
"class ErrorRequest:\n",
"    \"\"\"Minimal response stand-in returned by do_request on an HTTPError.\n",
"\n",
"    It only offers ``getcode()``, mirroring the method of the same name\n",
"    on real urllib responses.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, code=302):\n",
"        self.code = code\n",
"\n",
"    def getcode(self):\n",
"        \"\"\"Return the stored status code.\"\"\"\n",
"        return self.code\n",
"\n",
"# Install the non-redirecting opener globally: from here on every\n",
"# urllib.request.urlopen call in this notebook goes through it.\n",
"opener = urllib.request.build_opener(NoRedirect)\n",
"urllib.request.install_opener(opener)\n",
"\n",
"def do_request(URL, data=None, cookie=None):\n",
"    \"\"\"Send one HTTP request and return the response.\n",
"\n",
"    Args:\n",
"        URL: Address to request.\n",
"        data: Optional request body. A dict is form-urlencoded first;\n",
"            anything else is handed to urllib unchanged.\n",
"        cookie: Optional value for the ``Cookie`` header.\n",
"\n",
"    Returns:\n",
"        The response object from ``urlopen``, or an ``ErrorRequest``\n",
"        carrying the HTTP status code when urllib raises ``HTTPError``.\n",
"    \"\"\"\n",
"    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'}\n",
"    if cookie:\n",
"        headers['Cookie'] = cookie\n",
"\n",
"    if isinstance(data, dict):\n",
"        data = urllib.parse.urlencode(data).encode()\n",
"\n",
"    req = urllib.request.Request(URL, data=data, headers=headers)\n",
"\n",
"    try:\n",
"        return urllib.request.urlopen(req)\n",
"    except urllib.error.HTTPError as err:\n",
"        # Report the real status instead of always claiming 302, so a\n",
"        # declined redirect can be told apart from e.g. a 404 or a 500.\n",
"        return ErrorRequest(err.code)\n",
" \n",
"def do_download(cuce, _try=0):\n",
"    \"\"\"Solve a captcha and download the ficha PDF for one CUCE.\n",
"\n",
"    Each attempt fetches a fresh captcha image, reads it with\n",
"    break_captcha and, when the confidence is at least 0.4, posts the\n",
"    answer together with the cookies of the captcha response. Attempts\n",
"    are numbered from ``_try`` up to and including MAX_TRY.\n",
"\n",
"    Args:\n",
"        cuce: Value sent as the ``cp`` form field; also names the output\n",
"            file ``../res/<cuce>.pdf``.\n",
"        _try: Number of the first attempt (retry bookkeeping).\n",
"\n",
"    Returns:\n",
"        The number of the attempt that succeeded, i.e. how many attempts\n",
"        failed before it, or None when every attempt failed.\n",
"    \"\"\"\n",
"    # Strip the slashes around URI so the URL has no doubled '/'.\n",
"    base_url = 'https://{}/{}'.format(DOMAIN, URI.strip('/'))\n",
"\n",
"    for attempt in range(_try, MAX_TRY + 1):\n",
"        captcha = do_request('{}/captchaFicha.php'.format(base_url))\n",
"        if captcha.getcode() != 200:\n",
"            # An ErrorRequest has neither headers nor a body to read.\n",
"            continue\n",
"\n",
"        with captcha:\n",
"            # One entry per Set-Cookie header. Splitting the combined\n",
"            # header on ',' would break on the comma of an ``expires`` date.\n",
"            set_cookies = captcha.headers.get_all('Set-Cookie') or []\n",
"            image = load_image(io.BytesIO(captcha.read()))\n",
"        # Keep only the name=value part of each cookie.\n",
"        cookies = [c.split(';', 1)[0].strip() for c in set_cookies]\n",
"\n",
"        resolved, score = break_captcha(model, image)\n",
"        if score < .4:\n",
"            continue\n",
"\n",
"        # Keep the text before the '[s]' marker. str.find() returns -1 when\n",
"        # the marker is missing, which used to drop the last character.\n",
"        answer = resolved[0].split('[s]', 1)[0]\n",
"\n",
"        reply = do_request(\n",
"            '{}/descargaFicha.php'.format(base_url),\n",
"            data={\n",
"                'inpCaptcha': answer,\n",
"                'cp': cuce,\n",
"                'B903A6B7': 'DEADBEEF',\n",
"            },\n",
"            cookie='; '.join(cookies),\n",
"        )\n",
"        if reply.getcode() != 200:\n",
"            continue\n",
"\n",
"        with reply:\n",
"            pdf = reply.read()\n",
"        with open('../res/{}.pdf'.format(cuce), 'wb') as f:\n",
"            f.write(pdf)\n",
"\n",
"        return attempt\n",
"\n",
"    return None\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Keep the 'cuce' column of the rows whose contract type is 'Bienes',\n",
"# ordered by descending row index.\n",
"df = pd.read_csv('../../convocatorias/convocatorias.csv')\n",
"cuces = df.loc[df['tipo_de_contratacion'] == 'Bienes', 'cuce'].sort_index(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 1%| | 94/8761 [04:10<5:43:49, 2.38s/it] "
]
}
],
"source": [
"# Tally of the run: 'success' / 'fail' count CUCEs, 'error' accumulates\n",
"# the failed attempts reported by do_download (MAX_TRY for a failed CUCE).\n",
"stats = {'error': 0, 'fail': 0, 'success': 0}\n",
"for cuce in tqdm.tqdm(cuces):\n",
"    try:\n",
"        res = do_download(cuce)\n",
"    except Exception as exc:\n",
"        # Count a crashed download as a failure. Without this reset ``res``\n",
"        # would keep the previous iteration's value (or be undefined on\n",
"        # the first one) and the tally below would be wrong.\n",
"        res = None\n",
"        tqdm.tqdm.write('{}: {!r}'.format(cuce, exc))\n",
"\n",
"    if res is None:\n",
"        stats['fail'] += 1\n",
"        stats['error'] += MAX_TRY\n",
"    else:\n",
"        stats['success'] += 1\n",
"        stats['error'] += res"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"stats"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment