Skip to content

Instantly share code, notes, and snippets.

@debuti
Last active September 10, 2018 15:04
Show Gist options
  • Save debuti/4c0700ed9a8ec5ba689cc549f9208389 to your computer and use it in GitHub Desktop.
Save debuti/4c0700ed9a8ec5ba689cc549f9208389 to your computer and use it in GitHub Desktop.
Scraping script for public job openings in Spanish city halls
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
import re
import json
import datetime
# Query the advanced public-employment search on administracion.gob.es,
# save the raw HTML response to a timestamped file, and print a short
# summary (reference, title, required degree, body, places, deadline, URL)
# for every result found in the page.

urlbase = 'https://administracion.gob.es/pagFront/empleoBecas/empleo'
url = urlbase + '/buscadorEmpleoAvanzado.htm'
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'user-agent': 'wreck',
}
# Search-form fields posted to the site; trailing comments give the meaning
# of each numeric code.
data = {
    'busquedaSimple': 'Buscar',
    'idAmbProvincia': '28',  # Madrid
    'idConvocante': '3',  # Local administration
    'idPlazo': '1',  # Application period open
    # Use 'idVia': '1' instead to search for internal promotion.
    'idVia': '2',  # Open access
    'tipoVista': 'Avanzado',
}

# Raw-string patterns (avoids invalid "\d" / "\|" escapes in plain string
# literals), compiled once instead of on every loop iteration.
ref_pattern = re.compile(r"span>(\d+)<span")
titulo_pattern = re.compile(r"\| </span>(.*?)</h3>")

response = requests.post(url, timeout=5, data=data, headers=headers)

# Keep a snapshot of exactly what the server returned, whatever the status,
# so that parsing problems can be investigated afterwards.
snapshot_name = ("results."
                 + datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                 + ".html")
with open(snapshot_name, "wb") as snapshot:
    snapshot.write(response.content)

# Stop here on an HTTP error instead of parsing an error page as results.
response.raise_for_status()

# parse html
page_content = BeautifulSoup(response.content, "html.parser")
items = page_content.find_all(
    'form',
    {'name': 'detalleEmpleo', 'action': re.compile("buscadorEmpleo.htm.*")},
)

for item in items:
    # Flatten the heading markup to one line so the patterns can match
    # across what were originally line breaks.
    heading = str(item.h3).replace('\n', ' ')

    # Reset per item: otherwise a heading without a reference would either
    # raise NameError below or reuse the previous item's reference.
    ref = None
    m = ref_pattern.search(heading)
    if m:
        ref = m.group(1)
        print("Ref: " + ref)

    m = titulo_pattern.search(heading)
    if m:
        titulo = m.group(1)
        print("Titulo: " + titulo)

    # The detail fields are read by position from the <div> elements nested
    # under the form's second <div>; an unexpected page layout will raise
    # IndexError / AttributeError here.
    seconddiv = item.select('div')[1]
    secondblock = seconddiv.div.div
    blocks = secondblock.select('div')

    titulacion = blocks[0].p.text
    print("Titulacion: " + titulacion)

    organoblock = blocks[1]
    organo = organoblock.p.text
    localidad = organoblock.span.text
    print("Organo: " + organo + " - " + localidad)

    plazas = blocks[2].p.text
    print("Plazas: " + plazas)

    plazo = blocks[3].p.text
    print("Plazo: " + plazo)

    # Only build the detail URL when a reference was actually extracted.
    if ref is not None:
        print("URL: " + urlbase + "/buscadorEmpleo.htm?idRegistro=" + ref)

    # NOTE(review): each result also holds <input name="jsonDetalle"> under
    # seconddiv; its value appears to be JSON with '#' standing in for '"'
    # (based on a decoder that was previously commented out here) - confirm
    # before relying on it. It is not used by this script.
    print('###################################################################')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment