Scrape data for Omen: two scripts that pull school data from Kemdikbud sites and write the results to CSV.
import requests
import csv
from bs4 import BeautifulSoup


def scrap(page=1):
    """Fetch one page of TK (kindergarten) search results and return the result containers."""
    url = "http://sekolah.data.kemdikbud.go.id/chome/pagingpencarian"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'Referer': 'http://sekolah.data.kemdikbud.go.id/chome/pagingpencarian'
    }
    form_data = {
        'page': page,
        'bentuk': 'TK',
        'status': 'semua',
        'akreditasi': 'semua',
        # __ncforminfo appears to be a session-bound form token; it may need to be
        # refreshed from the search page if the server rejects the request
        '__ncforminfo': 'rD0vS5N8uly3rXAMERg-Bx1HkTPw7CJFqJHnRcD0Dkm2s1pXlo4SkNpJf0X45LBXmUEPhmDo7lcWXF9SfJmkBIgw2xajZxqd6KrPT6ea029IYJqOlIoLlQ=='
    }
    response = requests.post(url, data=form_data, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # each school result is rendered inside one of these column divs
    tksContainers = soup.find_all('div', {'class': "col-xs-12 col-md-6"})
    return tksContainers
datas = []
# max page 23561 (total data / 4 results per page)
for idx in range(1, 100):
    tksContainers = scrap(idx)
    for tksContainer in tksContainers:
        tksInfo = tksContainer.ul
        nama = tksInfo.a.get_text()
        infos = []
        for info in tksInfo.find_all('li', {'class': "list-group-item text-muted"}):
            infos.append(info.get_text().strip())
        jalan = ""
        kecamatan = ""
        kabupaten = ""
        # only fill in the address fields when all three list items are present,
        # so incomplete listings cannot raise an IndexError
        if len(infos) >= 3:
            jalan = infos[0]
            kecamatan = infos[1]
            kabupaten = infos[2]
        datas.append([nama, jalan, kecamatan, kabupaten])

with open('data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    headers = ['Nama', 'Alamat', 'Kec', 'Kab']
    writer.writerow(headers)
    for data in datas:
        writer.writerow(data)

print(datas)
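The pagination loop above fires roughly a hundred POST requests back to back with no delay or error handling, so one dropped connection aborts the whole run. A minimal sketch of a more defensive driver, reusing the scrap() function above; scrap_all and its delay and retry counts are placeholders of my own, not part of the original gist:

import time

def scrap_all(last_page=100, delay=1.0, retries=3):
    """Collect result containers from pages 1..last_page-1, pausing between requests."""
    collected = []
    for page in range(1, last_page):
        for attempt in range(retries):
            try:
                collected.extend(scrap(page))
                break
            except requests.RequestException:
                time.sleep(delay * (attempt + 1))  # back off before retrying
        time.sleep(delay)  # stay polite between pages
    return collected

Swapping the hard-coded range(1, 100) loop for scrap_all() keeps partial results when a single page fails.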
import requests
import csv
from bs4 import BeautifulSoup


def ambilDaerah(path):
    """Fetch a region listing page and return (link, name) pairs for its sub-regions."""
    response = requests.get(url="https://referensi.data.kemdikbud.go.id/" + path)
    soup = BeautifulSoup(response.text, 'html.parser')
    tabel = soup.find(id="box-table-a")
    rows = tabel.find_all('tr')
    data = []
    for row in rows:
        cells = row.findChildren('td')
        if len(cells) > 1:
            linkTag = cells[1].a
            if linkTag is not None:
                link = linkTag.get('href')
                text = linkTag.get_text()
                data.append((link, text))
    return data


def scrap(path):
    """Scrape the school table (id="example") on a kecamatan page into rows of cell text."""
    response = requests.get(url="https://referensi.data.kemdikbud.go.id/" + path)
    soup = BeautifulSoup(response.text, 'html.parser')
    tabel = soup.find(id="example")
    rows = tabel.find_all('tr')
    data = []
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        if len(cols) > 1:
            data.append([ele for ele in cols if ele])  # drop empty cells
    return data
def saveCSV(datas, filename='data.csv'):
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        headers = ['No', 'NPSN', 'Nama Satuan Pendidikan', 'Alamat', 'Kelurahan',
                   'Status', 'Provinsi', 'Kabupaten', 'Kecamatan']
        writer.writerow(headers)
        for data in datas:
            writer.writerow(data)


# walk the region tree: provinsi (province) -> kabupaten (regency) -> kecamatan (district)
daftarProvinsi = ambilDaerah("index11.php")
data = []
for provinsi in daftarProvinsi:
    print("===============================")
    print("PROVINSI :", provinsi[1])
    print("===============================")
    daftarKabupaten = ambilDaerah(provinsi[0])
    for kabupaten in daftarKabupaten:
        print("KABUPATEN :", kabupaten[1])
        print("===============================")
        daftarKecamatan = ambilDaerah(kabupaten[0])
        for kecamatan in daftarKecamatan:
            print("KECAMATAN :", kecamatan[1])
            print("===============================")
            # tag each school row with its province, regency, and district names
            dt = [k + [provinsi[1], kabupaten[1], kecamatan[1]] for k in scrap(kecamatan[0])]
            data = data + dt

saveCSV(data)
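Because saveCSV() only runs after the entire province -> regency -> district tree has been crawled, a failure late in the walk loses everything collected so far. A hedged sketch of an incremental alternative; appendCSV() is a hypothetical helper of my own, while ambilDaerah() and scrap() are the gist's functions as defined above:

def appendCSV(rows, filename='data.csv'):
    """Append scraped rows immediately instead of buffering the whole crawl in memory."""
    with open(filename, 'a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)


# write the header once, then flush rows after each kecamatan
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerow(['No', 'NPSN', 'Nama Satuan Pendidikan', 'Alamat',
                            'Kelurahan', 'Status', 'Provinsi', 'Kabupaten', 'Kecamatan'])

for provinsi in ambilDaerah("index11.php"):
    for kabupaten in ambilDaerah(provinsi[0]):
        for kecamatan in ambilDaerah(kabupaten[0]):
            appendCSV([k + [provinsi[1], kabupaten[1], kecamatan[1]]
                       for k in scrap(kecamatan[0])])

With this shape, rows already written survive a crash, and the script can be restarted from a later region by skipping links already covered in the CSV.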