Skip to content

Instantly share code, notes, and snippets.

@efaisal
Created October 4, 2017 02:35
Show Gist options
  • Save efaisal/06b39c814f87b260f61d33faf9edc115 to your computer and use it in GitHub Desktop.
Save efaisal/06b39c814f87b260f61d33faf9edc115 to your computer and use it in GitHub Desktop.
POC automating Semakan Pemilih Isi Rumah
#!/usr/bin/env python
# Dependencies: requests, lxml, beautifulsoup4 [simply pip install those]
# Tested with Python 3.6.2 on Linux
import logging
from time import sleep
import requests
from bs4 import BeautifulSoup
# Landing page that semak() loads first (logged there as the "isi rumah" page).
URL = 'http://pengundi.spr.gov.my/isirumah2/'
# Endpoint that every form POST in semak() is sent to.
FORM = URL + 'semakandm_baru2.asp'
# Browser-like User-Agent header sent with every request.
UA = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/60.0.3112.113 Safari/537.36'
    ),
}
# Module logger: emits everything from DEBUG upwards through a stream handler.
log = logging.getLogger(__name__)
log.addHandler(logging.StreamHandler())
log.setLevel(logging.DEBUG)
class SemakanError(Exception):
    """Error raised by semak() when a lookup cannot proceed.

    In this module it signals either too many KP values in one call or an
    HTTP response whose status code is not 200.
    """
def get_captcha(html):
    """Extract the CAPTCHA text from the form page's inline scripts.

    The relevant markup is located purely by position: the slice starts at
    the second ``document.write`` occurrence in *html* and ends just before
    the fifth one.  The ``document.write('`` / ``');`` wrappers are stripped
    from that slice, the remainder is parsed as HTML, and the text of every
    ``<span>`` found is concatenated.

    Args:
        html: Source of the form page as a string.

    Returns:
        The concatenated text of all ``<span>`` elements in the slice.

    Raises:
        SemakanError: If no CAPTCHA characters could be extracted (for
            example because the expected scripts are not in the page).
    """
    log.info('Finding CAPTCHA value')
    marker = 'document.write'
    # Second occurrence of the marker opens the slice ...
    start = html.find(marker)
    start = html.find(marker, start + 1)
    # ... and the third occurrence after it (fifth overall) closes it.
    end = start
    for _ in range(3):
        end = html.find(marker, end + 1)
    fragment = html[start:end].strip()
    fragment = fragment.replace("document.write('", '').replace("');", '')
    soup = BeautifulSoup(fragment, 'lxml')
    # get_text() never returns None, unlike .string on a span that is empty
    # or has nested children, so the join cannot raise TypeError.
    captcha = ''.join(span.get_text() for span in soup.find_all('span'))
    if not captcha:
        log.warning('CAPTCHA value not found')
        raise SemakanError('CAPTCHA value not found in page')
    log.info('Found CAPTCHA value: %s', captcha)
    return captcha
def get_data(html):
    """Parse the result page into a list of per-person records.

    Rows are read from the ``<table class="jawapan">`` element; the first
    row is skipped as a header.  In each row the second cell must hold two
    text pieces (KP, name) and the third cell five text pieces (a
    slash-separated code group in parentheses followed by four
    ``label: value`` entries).

    Args:
        html: Source of the result page as a string.

    Returns:
        A list of dicts with the keys ``nama``, ``no_kp_baru``,
        ``no_kp_lama`` (``None`` when the KP has no parenthesised part),
        ``kod_parlimen``, ``parlimen``, ``kod_dun``, ``dun``,
        ``kod_daerah``, ``daerah``, ``kod_lokaliti`` and ``lokaliti``.

    Raises:
        SemakanError: If the result table is not present in *html*.
        ValueError: If a row does not have the expected number of pieces.
    """
    soup = BeautifulSoup(html, 'lxml')
    tbl = soup.find('table', class_='jawapan')
    if tbl is None:
        # Without this guard the loop below fails with an opaque
        # AttributeError on None.
        log.warning('Result table not found')
        raise SemakanError('Result table not found in response')
    data = []
    for row in tbl.find_all('tr')[1:]:
        td = row.find_all('td')
        kp, nama = td[1].get_text(separator='|', strip=True).split('|')
        if '(' in kp:
            # Form "<new> (<old>)": split on the space, drop the parentheses.
            kp_baru, kp_lama = kp.split(' ')
            kp_lama = kp_lama.replace('(', '').replace(')', '')
        else:
            kp_baru, kp_lama = kp, None
        kod, par, dun, dm, lok = td[2].get_text(separator='|', strip=True).split('|')
        kod_par, kod_dun, kod_dm, kod_lok = kod.replace('(', '').replace(')', '').split('/')
        # Each entry is "label: value"; keep only the value.
        par = par.split(':')[1].strip()
        dun = dun.split(':')[1].strip()
        dm = dm.split(':')[1].strip()
        # The locality value may carry a trailing "(...)" suffix; drop it.
        lok = lok.split(':')[1].split('(')[0].strip()
        data.append({
            'nama': nama, 'no_kp_baru': kp_baru, 'no_kp_lama': kp_lama,
            'kod_parlimen': kod_par, 'parlimen': par, 'kod_dun': kod_dun,
            # kod_daerah carries the code (kod_dm), not the name.
            'dun': dun, 'kod_daerah': kod_dm, 'daerah': dm,
            'kod_lokaliti': kod_lok, 'lokaliti': lok,
        })
    return data
def _check_response(resp):
    """Raise unless *resp* came back with HTTP status 200."""
    resp.raise_for_status()
    if resp.status_code != 200:
        log.warning('Receive HTTP Error: {errno}'.format(errno=resp.status_code))
        raise SemakanError('HTTP Error: {}'.format(resp.status_code))


def semak(kp):
    """Look up the records for up to eight KP values in one session.

    The function loads the landing page, posts the "Tambah IC" action once
    for every KP beyond the third, reads the CAPTCHA from the most recent
    response, submits the query and parses the result page.

    Args:
        kp: A list or tuple of KP values (at most 8).

    Returns:
        The list of record dicts produced by ``get_data``.

    Raises:
        ValueError: If *kp* is not a list or tuple.
        SemakanError: If more than 8 values are given or a response status
            is not 200.
        requests.HTTPError: Propagated from ``raise_for_status()``.
    """
    if not isinstance(kp, (list, tuple)):
        log.warning('KP value is not a list or a tuple')
        raise ValueError('Expecting list or tuple argument')
    if len(kp) > 8:
        log.warning('There are more than 8 KP values')
        raise SemakanError('No more that 8 kad pengenalan at a time')

    # The context manager releases the session's connections on every exit
    # path, including exceptions.
    with requests.Session() as session:
        # SPR website can be slow and lead to no response
        log.info('Load the isi rumah page')
        r = session.get(URL, headers=UA, timeout=10.0)
        _check_response(r)

        # Add a KP field for each value beyond the third; the CAPTCHA value
        # changes with every such request, so only the last response counts.
        payload = {'Tambah': 'Tambah IC', 'CaptchaBox': ''}
        kp_len = len(kp)
        if kp_len > 3:
            # All values go into the payload up front, so nothing has to be
            # re-assigned between the "Tambah" posts.
            for i, value in enumerate(kp, start=1):
                payload['dfnokp{}'.format(i)] = value
            for _ in range(kp_len - 3):
                # NOTE(review): pause presumably throttles requests to the
                # server -- confirm.
                sleep(2.0)
                log.info('Adding additional KP field')
                r = session.post(FORM, headers=UA, data=payload, timeout=10.0)
                _check_response(r)
        else:
            # The default form has three KP fields; unused ones are sent empty.
            for i in range(3):
                payload['dfnokp{}'.format(i + 1)] = kp[i] if i < kp_len else ''

        # Switch the payload from the "add field" action to the query action.
        del payload['Tambah']
        payload['SEMAK'] = 'SEMAK'
        payload['CaptchaBox'] = get_captcha(r.text)
        sleep(4.0)
        log.info('Fetch isi rumah data')
        d = session.post(FORM, headers=UA, data=payload, timeout=60.0)
        # Validate the result response itself, not the earlier form response.
        _check_response(d)
        return get_data(d.text)
if __name__ == '__main__':
    import sys

    # KP values come from the command line, so the script runs without
    # editing placeholder names into the source.
    kp = sys.argv[1:]
    if not kp:
        sys.exit('Usage: {} KP [KP ...]'.format(sys.argv[0]))
    data = semak(kp)
    for d in data:
        print(d)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment