Skip to content

Instantly share code, notes, and snippets.

@kkprakasa
Created February 18, 2019 16:34
Show Gist options
  • Save kkprakasa/c00dc73e54e9b207bdc040e91f185ad6 to your computer and use it in GitHub Desktop.
Save kkprakasa/c00dc73e54e9b207bdc040e91f185ad6 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup
from time import sleep
import re
from pymongo import MongoClient
# http://blog.pengyifan.com/how-to-fix-python-ssl-certificate_verify_failed/
import os, ssl
# SECURITY: this turns off HTTPS certificate verification for every default
# HTTPS context created in this process. It only takes effect when the
# PYTHONHTTPSVERIFY environment variable is unset or empty and the running
# `ssl` module exposes `_create_unverified_context`. Set PYTHONHTTPSVERIFY to
# any non-empty value to keep the interpreter's default behaviour.
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
        getattr(ssl, '_create_unverified_context', None)):
    ssl._create_default_https_context = ssl._create_unverified_context
# MongoDB handles: scraped records go into collection `jdih` of database
# `jdihn` on the server at localhost:27017.
client = MongoClient(host='localhost', port=27017)
db = client['jdihn']
jdih = db['jdih']

# Request headers sent with every HTTP request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36',
}

# Examples of the detail-page URLs this script visits:
# http://jdihn.bphn.go.id/penelusuran/www/index.php/web/result/92816/detail
# http://jdihn.bphn.go.id/penelusuran/www/index.php/web/result/92817/detail

# Empty lists kept from the original script; nothing in the visible code
# reads or appends to them.
urlist = list()
data = list()
def _fetch_soup(url):
    """Download ``url`` using the module-level ``headers`` and parse the body.

    The HTTP response is closed before returning, including when reading
    the body raises.
    """
    response = urllib2.urlopen(urllib2.Request(url, None, headers))
    try:
        html = response.read()
    finally:
        response.close()
    # NOTE(review): no parser is named, so bs4 uses whichever parser is
    # installed on this machine. Pin one (e.g. 'html.parser') once it is
    # confirmed to yield the same fields.
    return BeautifulSoup(html)


def _scrape_detail(detail_url):
    """Return the record (a dict) built from one detail page.

    Keys are inserted in this order: ``'tautanPeraturan'`` (the page URL),
    every title/field pair of the left-hand cards, the attachment entry,
    ``'Judul'``, and every title/field pair of the centre description rows.

    Raises AttributeError when an expected element is missing from the page,
    as the original inline code did.
    """
    record = {'tautanPeraturan': detail_url}
    page = _fetch_soup(detail_url)
    detail = page.find('div', {'class': 'detail'})

    for card in detail.findAll('div', {'class': 'detail__left__card__content'}):
        label = card.find('div', {'class': 'title'}).text
        record[label] = card.find('div', {'class': 'field'}).text

    # Attachment link: store its href, or '-' when the page has no
    # <a class="lampiran"> or that link carries no href attribute.
    attachment_label = page.find(
        'div', {'class': 'detail__left__detail__title'}).text
    attachment = page.find('a', {'class': 'lampiran'})
    if attachment is None:
        record[attachment_label] = '-'
    else:
        record[attachment_label] = attachment.get('href', '-')

    record['Judul'] = page.find('div', {'class': 'detail__center__title'}).text

    for row in detail.findAll('div', {'class': 'detail__center__desc'}):
        # Rows carrying any CSS class besides 'detail__center__desc' are
        # skipped.
        if len(row['class']) > 1:
            continue
        label = row.find('div', {'class': 'detail__center__desc__title'}).text
        record[label] = row.find(
            'div', {'class': 'detail__center__desc__field'}).text

    return record


# Walk the search result listing by stepping the `skip` query parameter from
# 0 to 91660 in increments of 10, and store one MongoDB document per result
# link found on each listing page.
for i in range(0, 91661, 10):
    # Formatting happens inside the call so the line behaves the same whether
    # `print` is a statement or a function.
    print('memproses halaman ke %s' % i)
    url0 = 'http://jdihn.bphn.go.id/penelusuran/www/index.php/web/result?q=&jenis_peraturan=1&nomor_peraturan=&tahun_terbit=&skip=' + str(i)
    listing = _fetch_soup(url0)
    for title_div in listing.findAll('div', {'class': 'result__content__item__title'}):
        d = _scrape_detail(title_div.find('a')['href'])
        jdih.insert_one(d)
        print('selesai memproses %s' % d['tautanPeraturan'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment