Created
October 4, 2017 02:35
-
-
Save efaisal/06b39c814f87b260f61d33faf9edc115 to your computer and use it in GitHub Desktop.
POC automating Semakan Pemilih Isi Rumah
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Dependencies: requests, lxml, beautifulsoup4 [simply pip install those]
# Tested with Python 3.6.2 on Linux
import logging | |
from time import sleep | |
import requests | |
from bs4 import BeautifulSoup | |
# Landing page of the lookup, and the ASP handler that the form posts to.
URL = 'http://pengundi.spr.gov.my/isirumah2/'
FORM = URL + 'semakandm_baru2.asp'

# Headers sent with every request (a desktop Chrome User-Agent string).
UA = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/60.0.3112.113 Safari/537.36'
    ),
}

# Module logger: emits through a StreamHandler with everything from DEBUG up.
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)
class SemakanError(Exception):
    """Error raised by this module when a lookup cannot be carried out."""
def get_captcha(html):
    """Extract the CAPTCHA text from the raw page source.

    The region between the 2nd and the 5th occurrence of
    ``document.write`` in *html* is cut out, the ``document.write('`` /
    ``');`` wrappers are stripped, and the remainder is parsed as HTML.
    The CAPTCHA is the concatenated text of every ``<span>`` found there.

    Args:
        html: Page source as a string.

    Returns:
        The CAPTCHA value as a string.

    Raises:
        SemakanError: If fewer than two ``document.write`` markers are
            present, or if no CAPTCHA text could be extracted.
    """
    log.info('Finding CAPTCHA value')
    marker = 'document.write'

    # Collect the offsets of (at most) the first five markers.
    offsets = []
    pos = html.find(marker)
    while pos != -1 and len(offsets) < 5:
        offsets.append(pos)
        pos = html.find(marker, pos + 1)

    # str.find() returns -1 on a miss; slicing with that would silently
    # pick an unrelated region, so fail loudly instead.
    if len(offsets) < 2:
        log.warning('CAPTCHA markers not found in page')
        raise SemakanError('Unable to locate CAPTCHA in page')

    start = offsets[1]
    # Without a 5th marker to terminate the region, read to the end of the page.
    end = offsets[4] if len(offsets) == 5 else len(html)

    fragment = html[start:end].strip()
    fragment = fragment.replace("document.write('", '').replace("');", '')
    soup = BeautifulSoup(fragment, 'lxml')

    # get_text() instead of .string: .string is None for an empty span or a
    # span with several children, which would break string concatenation.
    captcha = ''.join(span.get_text() for span in soup.find_all('span'))
    if not captcha:
        log.warning('CAPTCHA region contains no text')
        raise SemakanError('Unable to locate CAPTCHA in page')

    log.info('Found CAPTCHA value: {captcha}'.format(captcha=captcha))
    return captcha
def get_data(html):
    """Parse the result page into a list of voter records.

    Locates ``<table class="jawapan">`` and converts every row after the
    first one (presumably the header row) into a dict. The parsing expects,
    for each row:

    * second cell: two text nodes, the KP and the name. The KP is either a
      single number, or ``<new> (<old>)`` separated by one space.
    * third cell: five text nodes, a ``(par/dun/dm/lok)`` code group followed
      by four ``label: value`` entries for parlimen, DUN, daerah and
      lokaliti. Anything from the first ``(`` onward in the lokaliti value
      is discarded.

    Args:
        html: Result page source as a string.

    Returns:
        A list of dicts with the keys ``nama``, ``no_kp_baru``,
        ``no_kp_lama`` (``None`` when absent), ``kod_parlimen``,
        ``parlimen``, ``kod_dun``, ``dun``, ``kod_daerah``, ``daerah``,
        ``kod_lokaliti`` and ``lokaliti``.

    Raises:
        SemakanError: If the result table is not present in *html*.
        ValueError, IndexError: If a row does not have the layout above.
    """
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', class_='jawapan')
    if table is None:
        # Previously this surfaced as an AttributeError on None.
        log.warning('Result table not found in response')
        raise SemakanError('Result table not found in response')

    records = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')

        kp, nama = cells[1].get_text(separator='|', strip=True).split('|')
        if '(' in kp:
            kp_baru, kp_lama = kp.split(' ')
            kp_lama = kp_lama.replace('(', '').replace(')', '')
        else:
            kp_baru, kp_lama = kp, None

        kod, par, dun, dm, lok = (
            cells[2].get_text(separator='|', strip=True).split('|')
        )
        kod_par, kod_dun, kod_dm, kod_lok = (
            kod.replace('(', '').replace(')', '').split('/')
        )
        # Each entry is "label: value"; keep only the value part.
        par = par.split(':')[1].strip()
        dun = dun.split(':')[1].strip()
        dm = dm.split(':')[1].strip()
        lok = lok.split(':')[1].split('(')[0].strip()

        records.append({
            'nama': nama, 'no_kp_baru': kp_baru, 'no_kp_lama': kp_lama,
            'kod_parlimen': kod_par, 'parlimen': par, 'kod_dun': kod_dun,
            # kod_daerah carries the parsed code, not the daerah name.
            'dun': dun, 'kod_daerah': kod_dm, 'daerah': dm,
            'kod_lokaliti': kod_lok, 'lokaliti': lok,
        })
    return records
def _ensure_ok(response):
    """Raise unless *response* came back with HTTP status 200."""
    response.raise_for_status()
    # raise_for_status() only covers 4xx/5xx; reject any other non-200 too.
    if response.status_code != 200:
        log.warning('Receive HTTP Error: {errno}'.format(errno=response.status_code))
        raise SemakanError('HTTP Error: {}'.format(response.status_code))


def semak(kp):
    """Look up the records for up to eight KP values in one session.

    Steps performed:

    1. GET the landing page (``URL``).
    2. If more than three KP values are given, POST the form with the
       ``Tambah`` button once per extra value so that additional KP fields
       are added; the CAPTCHA is read from the last response received.
    3. POST the form with ``SEMAK`` and the CAPTCHA extracted by
       ``get_captcha``, then parse the reply with ``get_data``.

    The function sleeps between requests (2 s before each extra-field
    request, 4 s before the final submission).

    Args:
        kp: A list or tuple of at most eight KP values. With fewer than
            three values the remaining form fields are sent empty.

    Returns:
        The list of record dicts produced by ``get_data``.

    Raises:
        ValueError: If *kp* is not a list or tuple.
        SemakanError: If more than eight values are given or a response
            does not have status 200.
        requests.HTTPError: Raised by ``raise_for_status`` on 4xx/5xx.
    """
    if not isinstance(kp, (list, tuple)):
        log.warning('KP value is not a list or a tuple')
        raise ValueError('Expecting list or tuple argument')
    if len(kp) > 8:
        log.warning('There are more than 8 KP values')
        raise SemakanError('No more that 8 kad pengenalan at a time')

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        # SPR website can be slow and lead to no response
        log.info('Load the isi rumah page')
        r = session.get(URL, headers=UA, timeout=10.0)
        _ensure_ok(r)

        # Add KP field if more than 3, CAPTCHA value will also change
        payload = {'Tambah': 'Tambah IC', 'CaptchaBox': ''}
        kp_len = len(kp)
        if kp_len > 3:
            # Every KP value is placed in the payload up front; each POST
            # below asks the form for one more input field.
            for idx, value in enumerate(kp, start=1):
                payload['dfnokp{}'.format(idx)] = value
            for _ in range(kp_len - 3):
                sleep(2.0)
                log.info('Adding additional KP field')
                r = session.post(FORM, headers=UA, data=payload, timeout=10.0)
                _ensure_ok(r)
        else:
            # Always submit the three default fields, padding with blanks.
            for idx in range(3):
                payload['dfnokp{}'.format(idx + 1)] = kp[idx] if idx < kp_len else ''

        # Switch the submit button from "add field" to "check".
        del payload['Tambah']
        payload['SEMAK'] = 'SEMAK'
        payload['CaptchaBox'] = get_captcha(r.text)

        sleep(4.0)
        log.info('Fetch isi rumah data')
        d = session.post(FORM, headers=UA, data=payload, timeout=60.0)
        # Validate the result response itself (not the earlier one).
        _ensure_ok(d)
        return get_data(d.text)
if __name__ == '__main__':
    # kp1 and kp2 are placeholders: replace them with actual kp values
    # before running the script.
    kp = [kp1, kp2]
    # Print one record per line.
    for record in semak(kp):
        print(record)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment