Skip to content

Instantly share code, notes, and snippets.

@sergiolucero
Created September 24, 2021 18:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergiolucero/f6cafaa4d46d70e3488735d2ae5f5f52 to your computer and use it in GitHub Desktop.
Save sergiolucero/f6cafaa4d46d70e3488735d2ae5f5f52 to your computer and use it in GitHub Desktop.
scraping GEOCGR
import os
import subprocess
import time

import pandas as pd
# Output folder name (not referenced anywhere else in the visible script).
carpeta_salida = "geocgr"

# Commune code table (CUT 2018). Source:
# http://www.subdere.gov.cl/sites/default/files/documentos/cut_2018_v03.xls
# NOTE: this table should really be served by an API.
comunas = pd.read_excel("s3://corporacionciudades/cut_2018_v03.xls")
print("TOTAL comunas: {}".format(len(comunas)))

# GEOCGR endpoint template; the commune code is formatted as five digits,
# zero-padded on the left.
url_base = "https://www.contraloria.cl/opencgrapp/geocgr/api/comunas/%05d/newobras"

# Reference instant for the elapsed-time figures printed during the scrape.
t0 = time.time()
# Download the GEOCGR "newobras" records of every commune and stack them into
# a single DataFrame, `sdf`, tagged with commune / province / region columns.
#
# The per-commune frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it inside the
# loop copied the whole accumulated frame on every iteration.
frames = []
n_rows = 0  # running total of downloaded rows, shown in the progress line
for ix, row in comunas.iterrows():  # could this be an async for?
    codigo = row['Código Comuna 2017']
    udf = pd.read_json(url_base % codigo)
    # A commune with no records yields a frame without any columns, so the
    # 'CLASIFICACION' lookup below would raise KeyError; skip such communes.
    if not udf.empty:
        udf['comuna'] = row['Nombre Comuna']
        udf['provincia'] = row['Nombre Provincia']
        udf['código_comuna'] = codigo
        udf['región'] = row['Código Región']
        # Keep only the text before the first ';'. Non-string values (e.g.
        # missing data) are passed through unchanged instead of raising.
        udf['CLASIFICACION'] = udf['CLASIFICACION'].apply(
            lambda n: n.split(';')[0] if isinstance(n, str) else n
        )
        frames.append(udf)
        n_rows += len(udf)
    if ix % 20 == 10:
        # Progress report: position, region code, rows so far, elapsed seconds.
        print(f'[{ix}/{len(comunas)}]', 'Region=', row['Código Región'], n_rows, round(time.time() - t0, 2))
# pd.concat rejects an empty list, so fall back to an empty frame.
sdf = pd.concat(frames, sort=False) if frames else pd.DataFrame()
# Persist the scraped data as a date-stamped parquet file and upload it to S3.
filename = f'base_geocgr_{time.strftime("%Y%m%d")}.pq'
sdf.to_parquet(filename)
# Report only after the file has actually been written.
print(f'{len(sdf)} contratos grabados en {filename}')
# Argument list instead of a shell string: no shell parsing of the file name,
# and check=True raises CalledProcessError if the upload fails rather than
# silently ignoring the exit status.
cmd = ['aws', 's3', 'cp', filename, 's3://quantcldata/CLIENTES/CIUDADES/']
subprocess.run(cmd, check=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment