Skip to content

Instantly share code, notes, and snippets.

@kkprakasa
Last active October 2, 2020 08:17
Show Gist options
  • Save kkprakasa/23d527b9a5a9065a9b83faf67b3e8342 to your computer and use it in GitHub Desktop.
Save kkprakasa/23d527b9a5a9065a9b83faf67b3e8342 to your computer and use it in GitHub Desktop.
#IDM
#!/bin/python3
import csv
import hashlib
import json
import ssl
import sys
from time import sleep
from urllib.parse import urlencode

import requests
import urllib3
from bs4 import BeautifulSoup
from bson.objectid import ObjectId
from pymongo import MongoClient
from tqdm import tqdm
# MongoDB handles shared by the rest of the script: the client, the
# "lokadata" database, and the two collections the script works with.
rem = MongoClient('mongodb://...')
dbrem = rem['lokadata']

# Helper/index collection: receives the records returned by the list endpoint.
idmy = dbrem['idm_y20_30092020']

# Destination collection: receives the rows scraped from the detail pages.
idm = dbrem['idm_jateng_19']

# Other collection bindings that appear disabled in this script:
# idm = dbrem.idm_2020_07082020_2
# idmmod = dbrem.idm_2020
# idm11 = dbrem.idm_20
def ambilJson(url, timeout=120):
    """Fetch *url* with a GET request and return the body decoded as JSON.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (connect, and between
            received bytes) before giving up. Without a timeout a stalled
            connection blocks the script forever.

    Returns:
        The decoded JSON document.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within *timeout*.
        ValueError: The response body is not valid JSON.
    """
    # TLS certificate verification is disabled for this request (verify=False).
    req = requests.get(url, verify=False, timeout=timeout)
    # Fail with the HTTP status instead of a confusing JSON decode error.
    req.raise_for_status()
    return json.loads(req.content)
def ambil(url, timeout=120, parser=None):
    """Fetch *url* with a GET request and return the body as a BeautifulSoup tree.

    The HTTP status is not checked: an error page is parsed like any other
    response, so callers notice it through the elements that are missing.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server (connect, and between
            received bytes) before giving up. Without a timeout a stalled
            connection blocks the script forever.
        parser: Optional BeautifulSoup parser name (for example
            ``'html.parser'`` or ``'lxml'``). ``None`` lets BeautifulSoup
            pick the best parser that is installed.

    Returns:
        The parsed document.

    Raises:
        requests.Timeout: The server did not respond within *timeout*.
    """
    # TLS certificate verification is disabled for this request (verify=False).
    req = requests.get(url, verify=False, timeout=timeout)
    dsoup = BeautifulSoup(req.content, parser)
    return dsoup
root = 'http://idm.kemendesa.go.id'
url = root+'/idm_data?id_prov=33&id_kabupaten=3321&id_kecamatan=332110&id_desa=3321102014&tahun=2020'


def list_idm_url(year, tahun_filter=''):
    """Return the DataTables-style query URL of the village list for *year*.

    The query string is assembled from a structured parameter list instead of
    one very long literal, so it cannot be broken by line wrapping and the
    year is no longer duplicated by hand.

    Args:
        year: Year used as suffix of the index columns (``iks_<year>``,
            ``ike_<year>``, ``ikl_<year>``, ``idm_<year>``,
            ``idm_status_<year>``).
        tahun_filter: Search value sent for the ``tahun`` column; empty
            means no filter on that column.

    Returns:
        The absolute URL, percent-encoded.
    """
    column_names = ['tahun', 'id_prov', 'id_kabupaten', 'id_kecamatan', 'id_desa']
    column_names += [
        '{}_{}'.format(prefix, year)
        for prefix in ('iks', 'ike', 'ikl', 'idm', 'idm_status')
    ]
    column_names.append('detail')

    params = [('draw', '1')]
    for index, name in enumerate(column_names):
        column = 'columns[{}]'.format(index)
        params += [
            (column + '[data]', name),
            (column + '[name]', ''),
            (column + '[searchable]', 'true'),
            # Only the "detail" column is sent as not orderable.
            (column + '[orderable]', 'false' if name == 'detail' else 'true'),
            (column + '[search][value]', tahun_filter if name == 'tahun' else ''),
            (column + '[search][regex]', 'false'),
        ]
    params += [
        ('order[0][column]', '0'),
        ('order[0][dir]', 'asc'),
        ('start', '0'),
        ('length', '0'),
        ('search[value]', ''),
        ('search[regex]', 'false'),
        ('_', '1568821375769'),
    ]
    # urlencode percent-encodes the square brackets as %5B / %5D.
    return root + '/users/list_idm?' + urlencode(params)


# 2020
jurl = list_idm_url(2020)
# 2019
# jurl = list_idm_url(2019, tahun_filter='2019')
# BUILD THE HELPER TABLE USED BY THE FOLLOWING STEPS
# Step 1: store every record of the list endpoint in the helper collection.
y = ambilJson(jurl)['data']
for i in y:
    idmy.insert_one(i)

# Step 2: split each administrative-level field on '|' into a code (kept under
# the original key) and a name (stored under a new key).
_LEVEL_KEYS = (
    ('id_prov', 'prov'),
    ('id_kabupaten', 'kabupaten'),
    ('id_kecamatan', 'kecamatan'),
    ('id_desa', 'desa'),
)
for u in idmy.find({}, {'_id': 1, 'id_kabupaten': 1, 'id_prov': 1, 'id_desa': 1, 'id_kecamatan': 1, 'detail': 1}):
    try:
        # Split every field once; a value without '|' raises IndexError below.
        pieces = {id_key: u[id_key].split('|') for id_key, _ in _LEVEL_KEYS}
        fields = {id_key: pieces[id_key][0].strip() for id_key, _ in _LEVEL_KEYS}
        for id_key, name_key in _LEVEL_KEYS:
            fields[name_key] = pieces[id_key][1].strip()
    except (KeyError, IndexError, AttributeError):
        # Best effort: a record that lacks a field, holds a non-string value,
        # or has no '|' separator is left unchanged.
        continue
    # Outside the try block so that a failed database write is not hidden.
    idmy.update_one({'_id': u['_id']}, {'$set': fields})
# EXECUTE: FETCH THE DETAIL PAGES
# Column names given, in order, to the cells of every scraped table row.
key0 = ['Indeks_komposit','Dimensi','skor','Nilai','Perangkat_Indikator','skor_1','Nilai_1','Indikator_peritem','skor_2','kosong','eksisting_dari_indikator_umum','Kewenangan_pusat','Kewenangan_provinsi','Kewenangan_kabupaten','Kewenangan_desa','Kewenangan_sosial_alami']


def _page_fields(soup, page_url):
    """Return the page-level fields of one detail page as a dict.

    Collected, in this order: the query parameters of *page_url*, the URL
    itself under ``'url'``, the ``h6`` entries of every ``box-footer`` div,
    the ``p``/``h4`` pairs of every ``inner`` div, and the series/point pairs
    embedded in the seventh ``script`` element.

    Raises whatever the lookups raise (IndexError, AttributeError, TypeError,
    ValueError, ...) when the page does not have the expected structure.
    """
    fields = {}
    for pair in page_url.split('?')[1].split('&'):
        fields[pair.split('=')[0]] = str(pair.split('=')[1])
    fields['url'] = page_url
    for footer in soup.find_all('div', {'class': 'box-footer'}):
        for heading in footer.find_all('h6'):
            # The heading text is split on ' &nbsp' into label and value.
            fields[heading.string.split(' &nbsp')[0]] = heading.string.split(' &nbsp')[1]
    for box in soup.find_all('div', {'class': 'inner'}):
        fields[box.find('p').string] = box.find('h4').string
    # The series data is cut out of the script source by position: the comma
    # separated pieces [-9:-2] are re-joined, newlines and the last two
    # characters are dropped and the '"data": ' prefix is removed, which must
    # leave a JSON list of objects with 'series' and 'point' keys.
    script_text = soup.find_all('script')[6].string
    chart_json = ','.join(script_text.split(',')[-9:-2]).replace('\n', '')[:-2].replace('"data": ', '')
    for point in json.loads(chart_json):
        fields[point['series']] = str(point['point'])
    return fields


def _table_rows(soup):
    """Return the rows of the first table of a detail page as lists of strings.

    The first two ``tr`` elements are skipped. Each row holds the text of its
    ``td`` cells without the last five, followed by the file name (without
    ``.png``) of every image in the row. The text of a cell carrying a
    ``rowspan`` attribute is appended to each following row it spans. Finally
    every row is rotated to the right until its first cell contains a
    newline; a row without such a cell, or an empty row, is returned in its
    original order instead of rotating forever.
    """
    all_rows = soup.find_all('table')[0].find_all('tr')[2:]
    results = [
        [cell.get_text() for cell in row.find_all('td')][:-5]
        + [img['src'].split('/')[-1].replace('.png', '') for img in row.find_all('img')]
        for row in all_rows
    ]
    for row_no, tr in enumerate(all_rows):
        for cell in tr.find_all('td'):
            if cell.has_attr('rowspan'):
                # The spanning cell only exists in its first row; copy its
                # text to the end of the rows below it.
                for offset in range(1, int(cell['rowspan'])):
                    results[row_no + offset].append(cell.get_text())
    for row in results:
        # At most len(row) rotations: one full turn restores the row.
        for _ in range(len(row)):
            if '\n' in row[0]:
                break
            row.insert(0, row.pop())
    return results


for u in tqdm(idmy.find({'id_prov': "35"}, {'_id': 0})):
    # NOTE(review): u was just read from idmy, so this lookup looks like it
    # always succeeds; possibly it was meant to test the destination
    # collection for already scraped pages - confirm before changing.
    if idmy.find_one({'detail': u['detail']}) is None:
        continue
    # The link is the first double-quoted part of the 'detail' value; the year
    # in the query string is forced to 2020.
    url0 = root + u['detail'].split('"')[1].replace('=2019', '=2020')
    try:
        n = ambil(url0)
        x = _page_fields(n, url0)
        records = []
        for result in _table_rows(n):
            # Page-level fields first, then the row cells named by key0.
            data = dict(x)
            data.update(zip(key0, result))
            records.append(data)
    except Exception as err:
        # Best effort: a page that cannot be fetched or parsed is stored as
        # its index record (under the record's own keys) plus the URL, so it
        # can be found and retried later.
        print('detail page failed, storing index record only: {} ({!r})'.format(url0, err))
        fallback = dict(u)
        fallback['url'] = url0
        records = [fallback]
    for data in records:
        idm.insert_one(data)
# pipline = [{"$unwind" : "$prov"},{"$group":{"_id":"$prov","jumlah":{"$sum":1}}}]
# fix the variable (column) positions
# ll=[]
# for i in idm.find({},{'_id':0}):
# try:
# l = list(i.values())[-16:-9]
# while cint(l[-1]) == False:
# l.insert(0,l.pop())
# if re.search("(IKS|IKE|IKL)",l[:-3][0]):
# ll.append(list(i.values())[:15]+l+list(i.values())[-9:])
# elif str(l[:-3][1]).isupper():
# ll.append(list(i.values())[:15]+ll[-1][15:16]+l[1:]+list(i.values())[-9:])
# #ll.append(l[:-3][-2:])
# else:
# ll.append(list(i.values())[:15]+ll[-1][-16:-12]+l[-3:]+list(i.values())[-9:])
# except:
# pass
# lll20=[]
# # k = list(data20[1].keys())
# for i in ll:
# lll20.append(dict(zip(key1,i)))
# pipline = [{"$group":{"_id":"$desa", "desa":{"$sum":1}}},{"$match": { "desa":{"$gt":47}}}]
# [ i for i in list(idmy.distinct('id_desa')) if i not in list(idm.distinct('id_desa'))]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment