Last active
October 2, 2020 08:17
-
-
Save kkprakasa/23d527b9a5a9065a9b83faf67b3e8342 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#IDM | |
#!/bin/python3 | |
import urllib3 | |
from bs4 import BeautifulSoup | |
from time import sleep | |
import csv | |
import sys | |
import json | |
import ssl | |
import hashlib | |
from tqdm import tqdm | |
import requests | |
from bson.objectid import ObjectId | |
from pymongo import MongoClient | |
# MongoDB handles shared by the rest of the script.
# NOTE(review): the connection string is a placeholder; supply the real URI.
rem = MongoClient('mongodb://...')
dbrem = rem.lokadata

# Collections used by earlier runs, kept for reference:
# idm = dbrem.idm_2020_07082020_2
# idmmod = dbrem.idm_2020
# idm11 = dbrem.idm_20

# Helper table: filled from the list endpoint, then read back to drive the
# detail scrape.
idmy = dbrem.idm_y20_30092020
# Destination collection for the rows scraped from the detail pages.
idm = dbrem.idm_jateng_19
def ambilJson(url, timeout=60):
    """Fetch *url* with a GET request and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The object produced by decoding the response body as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within *timeout*.
        ValueError: If the body is not valid JSON.
    """
    # verify=False is carried over from the original script: certificate
    # checks stay disabled.
    req = requests.get(url, verify=False, timeout=timeout)
    # Fail with the HTTP status instead of a confusing JSON decode error
    # when the server returns an error page.
    req.raise_for_status()
    return json.loads(req.content)
def ambil(url, timeout=60):
    """Fetch *url* with a GET request and return it as a BeautifulSoup tree.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.Timeout: If the server does not answer within *timeout*.
    """
    # verify=False is carried over from the original script: certificate
    # checks stay disabled.
    req = requests.get(url, verify=False, timeout=timeout)
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    return BeautifulSoup(req.content, 'html.parser')
root = 'http://idm.kemendesa.go.id'
url = root+'/idm_data?id_prov=33&id_kabupaten=3321&id_kecamatan=332110&id_desa=3321102014&tahun=2020'


def build_list_url(year, tahun_filter='', base=root):
    """Build the ``/users/list_idm`` query URL for one year of data.

    The query string has the same parameters, in the same order, as the
    hand-encoded URL it replaces: ``draw``, six ``columns[i][...]`` entries
    for each of eleven columns, then the ordering/paging/search tail.

    Args:
        year: Year used as suffix of the ``iks_``, ``ike_``, ``ikl_``,
            ``idm_`` and ``idm_status_`` column names.
        tahun_filter: Search value sent for the ``tahun`` column. Empty by
            default, as in the 2020 request.
        base: Scheme and host the path is appended to.

    Returns:
        The complete URL as a string.
    """
    # Imported here so the block works without touching the file's imports.
    from urllib.parse import urlencode

    fields = ['tahun', 'id_prov', 'id_kabupaten', 'id_kecamatan', 'id_desa']
    fields += ['{}_{}'.format(prefix, year)
               for prefix in ('iks', 'ike', 'ikl', 'idm', 'idm_status')]
    fields.append('detail')

    params = [('draw', '1')]
    for idx, field in enumerate(fields):
        col = 'columns[{}]'.format(idx)
        params += [
            (col + '[data]', field),
            (col + '[name]', ''),
            (col + '[searchable]', 'true'),
            # Only the "detail" column is marked as not orderable.
            (col + '[orderable]', 'false' if field == 'detail' else 'true'),
            (col + '[search][value]', tahun_filter if field == 'tahun' else ''),
            (col + '[search][regex]', 'false'),
        ]
    params += [
        ('order[0][column]', '0'),
        ('order[0][dir]', 'asc'),
        ('start', '0'),
        ('length', '0'),
        ('search[value]', ''),
        ('search[regex]', 'false'),
        # Constant copied from the original request.
        # NOTE(review): presumably a cache-busting timestamp - confirm.
        ('_', '1568821375769'),
    ]
    # urlencode percent-escapes the square brackets (%5B / %5D).
    return base + '/users/list_idm?' + urlencode(params)


# 2020
jurl = build_list_url(2020)
# 2019 (the original request also filtered the "tahun" column on 2019)
# jurl = build_list_url(2019, tahun_filter='2019')
# BUILD THE HELPER TABLE USED BY THE FOLLOWING STEPS
y = ambilJson(jurl)['data']
if y:
    # One bulk write instead of one round trip per row; insert_many rejects
    # an empty list, hence the guard.
    idmy.insert_many(y)


def _split_code_name(value):
    """Split ``value`` on '|' and return its first two parts, stripped."""
    parts = value.split('|')
    return parts[0].strip(), parts[1].strip()


# Replace each combined region field by the part before the '|' and store
# the part after it under a separate name field.
for u in idmy.find({}, {'_id': 1, 'id_kabupaten': 1, 'id_prov': 1, 'id_desa': 1, 'id_kecamatan': 1, 'detail': 1}):
    try:
        id_prov, prov = _split_code_name(u['id_prov'])
        id_kabupaten, kabupaten = _split_code_name(u['id_kabupaten'])
        id_kecamatan, kecamatan = _split_code_name(u['id_kecamatan'])
        id_desa, desa = _split_code_name(u['id_desa'])
    except (KeyError, IndexError, AttributeError):
        # A field is missing, is not a string, or holds no '|' (for example
        # because the row was already rewritten by a previous run): leave the
        # row untouched, as the original best-effort loop did.
        continue
    idmy.update_one(
        {'_id': u['_id']},
        {'$set': {
            'id_prov': id_prov,
            'id_kabupaten': id_kabupaten,
            'id_kecamatan': id_kecamatan,
            'id_desa': id_desa,
            'prov': prov,
            'kabupaten': kabupaten,
            'kecamatan': kecamatan,
            'desa': desa,
        }},
    )
# RUN THE DETAIL SCRAPE
# Names given to the values taken from each row of the detail table.
# NOTE(review): assumes every table row yields these 16 values in this order
# - confirm against the page.
key0 = ['Indeks_komposit','Dimensi','skor','Nilai','Perangkat_Indikator','skor_1','Nilai_1','Indikator_peritem','skor_2','kosong','eksisting_dari_indikator_umum','Kewenangan_pusat','Kewenangan_provinsi','Kewenangan_kabupaten','Kewenangan_desa','Kewenangan_sosial_alami']


def _rotate_until_newline_first(row):
    """Rotate *row* right, in place, until its first item contains a newline.

    Gives up after one full turn, which leaves the row in its starting order,
    so a row without any newline cannot loop forever.
    """
    for _ in range(len(row)):
        if "\n" in row[0]:
            return
        row.insert(0, row.pop())


def _parse_detail(page, url0):
    """Return one dict per indicator-table row of a parsed detail page.

    Args:
        page: BeautifulSoup tree of the detail page.
        url0: URL the page was fetched from; its query parameters and the
            URL itself are copied into every returned dict.

    Raises:
        Exception: Any lookup or parsing error caused by a page that does
            not have the expected layout; the caller handles it.
    """
    x = {}
    for pair in url0.split('?')[1].split('&'):
        x[pair.split('=')[0]] = str(pair.split('=')[1])
    x['url'] = url0

    # <h6> texts inside the "box-footer" divs are split into label and value.
    # NOTE(review): the separator is copied as it appears in the original
    # source (two blanks) - confirm it matches the page markup.
    sep = '  '
    for box in page.find_all('div', {'class': 'box-footer'}):
        for h6 in box.find_all('h6'):
            x[h6.string.split(sep)[0]] = h6.string.split(sep)[1]

    # "inner" divs: the <p> text is the label, the <h4> text is the value.
    for inner in page.find_all('div', {'class': 'inner'}):
        x[inner.find('p').string] = inner.find('h4').string

    # The seventh <script> tag is sliced down to a JSON array whose items
    # carry a 'series' and a 'point' entry.
    for item in json.loads(','.join(page.find_all('script')[6].string.split(',')[-9:-2]).replace('\n','')[:-2].replace('"data": ','')):
        x[item['series']] = str(item['point'])

    table = page.find_all('table')[0]
    # Skip the first two rows (presumably headers - TODO confirm).
    allRows = table.find_all('tr')[2:]
    # Per row: the cell texts without the last five cells, followed by the
    # file name (minus ".png") of every image in the row.
    results = [
        [cell.get_text() for cell in row.find_all('td')][:-5]
        + [img['src'].split('/')[-1].replace('.png', '') for img in row.find_all('img')]
        for row in allRows
    ]

    # Collect the cells that span several rows ...
    rowspan = []
    for no, tr in enumerate(allRows):
        for td_no, cell in enumerate(tr.find_all('td')):
            # has_attr replaces the deprecated Tag.has_key.
            if cell.has_attr("rowspan"):
                rowspan.append((no, td_no, int(cell["rowspan"]), cell.get_text()))
    # ... and append their text to each following row they cover (the first
    # row already contains it).
    for no, _td_no, span, text in rowspan:
        for offset in range(1, span):
            results[no + offset].append(text)

    # The original called re.search here although "re" is never imported, so
    # the resulting NameError sent every page with table rows into the
    # fallback path. A plain substring test needs no import.
    for row in results:
        _rotate_until_newline_first(row)

    key1 = list(x.keys()) + key0  # combine both lists to build the keys
    return [dict(zip(key1, list(x.values()) + row)) for row in results]


for u in tqdm(idmy.find({'id_prov':"35"},{'_id':0})):
    # NOTE(review): u was just read from idmy, so this lookup looks redundant;
    # possibly idm was meant (skip pages already scraped) - confirm.
    if idmy.find_one({'detail':u['detail']}) is None:
        continue
    url0 = root+u['detail'].split('"')[1].replace('=2019','=2020')
    n = ambil(url0)
    try:
        records = _parse_detail(n, url0)
    except Exception as err:
        # Best effort, as in the original: keep the helper-table row so the
        # village is not lost. It is stored under its own field names rather
        # than being zipped against the detail-page keys.
        tqdm.write('detail parse failed for {}: {!r}'.format(url0, err))
        records = [dict(u)]
    for data in records:
        idm.insert_one(data)
# pipline = [{"$unwind" : "$prov"},{"$group":{"_id":"$prov","jumlah":{"$sum":1}}}] | |
# perbaiki posisi variabel | |
# ll=[] | |
# for i in idm.find({},{'_id':0}): | |
# try: | |
# l = list(i.values())[-16:-9] | |
# while cint(l[-1]) == False: | |
# l.insert(0,l.pop()) | |
# if re.search("(IKS|IKE|IKL)",l[:-3][0]): | |
# ll.append(list(i.values())[:15]+l+list(i.values())[-9:]) | |
# elif str(l[:-3][1]).isupper(): | |
# ll.append(list(i.values())[:15]+ll[-1][15:16]+l[1:]+list(i.values())[-9:]) | |
# #ll.append(l[:-3][-2:]) | |
# else: | |
# ll.append(list(i.values())[:15]+ll[-1][-16:-12]+l[-3:]+list(i.values())[-9:]) | |
# except: | |
# pass | |
# lll20=[] | |
# # k = list(data20[1].keys()) | |
# for i in ll: | |
# lll20.append(dict(zip(key1,i))) | |
# pipline = [{"$group":{"_id":"$desa", "desa":{"$sum":1}}},{"$match": { "desa":{"$gt":47}}}] | |
# [ i for i in list(idmy.distinct('id_desa')) if i not in list(idm.distinct('id_desa'))] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment