@LCPallares, created December 17, 2019
Examples of web scraping with Python, using the sample site http://example.webscraping.com
# Example 1: scrape the country list and print each country's capital.
import requests
from bs4 import BeautifulSoup

URL_BASE = 'http://example.webscraping.com'

peticion = requests.get(URL_BASE)
sopa = BeautifulSoup(peticion.content, 'html.parser')

# The second "span12" column holds the results block with the country links.
paises = sopa.find_all(class_="span12")[1].find(id="results")
for pais in paises.find_all('a'):
    nom = pais.text
    elx = URL_BASE + pais["href"]
    # Fetch the country's detail page and pull out two labelled rows.
    peticion2 = requests.get(elx)
    sopa2 = BeautifulSoup(peticion2.content, 'html.parser')
    cont = sopa2.find_all(class_="span12")[1]
    paiz = cont.find(id="places_country__row").find(class_="w2p_fw").text
    capital = cont.find(id="places_capital__row").find(class_="w2p_fw").text
    print(f'{capital} - {paiz}')
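A note on robustness: each iteration above issues a bare requests.get with no timeout or status check, so a slow or failing response crashes the run. Below is a minimal hardening sketch assuming the same page structure; the fetch helper and the TIMEOUT value are illustrative, not part of the original gist.

import requests
from bs4 import BeautifulSoup

URL_BASE = 'http://example.webscraping.com'
TIMEOUT = 10  # seconds; illustrative value, not from the original gist

sesion = requests.Session()  # reuse one TCP connection across all requests

def fetch(url):
    """Fetch a URL with a timeout, raise on HTTP errors, return parsed soup."""
    respuesta = sesion.get(url, timeout=TIMEOUT)
    respuesta.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    return BeautifulSoup(respuesta.content, 'html.parser')

sopa = fetch(URL_BASE)
resultados = sopa.find_all(class_="span12")[1].find(id="results")
for pais in resultados.find_all('a'):
    detalle = fetch(URL_BASE + pais["href"])
    fila = detalle.find(id="places_capital__row")
    if fila is not None:  # guard: skip pages without a capital row
        print(pais.text, '-', fila.find(class_="w2p_fw").text)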
# Example 2: the same listing scrape, wrapped in a class.
import requests
from bs4 import BeautifulSoup

class ClaseRaspador:
    def __init__(self):
        self.URL_BASE = 'http://example.webscraping.com'
        self.sopa = BeautifulSoup(requests.get(self.URL_BASE).content, 'html.parser')
        self.enlaces = []

    def raspar(self):
        """Collect the name and detail-page link of every listed country."""
        Datos = []
        paises = self.sopa.find_all(class_="span12")[1].find(id="results")
        for pais in paises.find_all('a'):
            nom = pais.text
            elx = self.URL_BASE + pais["href"]
            Datos.append({"NOMBRE": nom, "ENLACE": elx})
            self.enlaces.append(elx)
        return Datos

r = ClaseRaspador()
print(r.raspar())
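The class gathers the name/link pairs in memory but never persists them. A small follow-up sketch, assuming ClaseRaspador from the snippet above is in scope; the filename paises.json is illustrative.

import json

raspador = ClaseRaspador()
datos = raspador.raspar()

# Persist the scraped name/link pairs as UTF-8 JSON for later use.
with open('paises.json', 'w', encoding='utf-8') as archivo:
    json.dump(datos, archivo, ensure_ascii=False, indent=2)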
# Example 3: paginate through the full country index and save the results.
# coding: utf-8
import json
from time import sleep

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://example.webscraping.com'
URL = 'http://example.webscraping.com/places/default/index/'

def make_soup(url):
    """Download a page and return its parsed BeautifulSoup tree."""
    peticion = requests.get(url)
    return BeautifulSoup(peticion.content, 'html.parser')

def obtener_enlaces(sopa):
    """Return the detail-page link of every country listed on one index page."""
    enlaces = []
    paises = sopa.find_all(class_="span12")[1].find(id="results")
    for pais in paises.find_all('a'):
        enlaces.append(BASE_URL + pais["href"])
    return enlaces

def obtener_info(sopa):
    """Extract the country name and capital from a country detail page."""
    cont = sopa.find_all(class_="span12")[1]
    paiz = cont.find(id="places_country__row").find(class_="w2p_fw").text
    capital = cont.find(id="places_capital__row").find(class_="w2p_fw").text
    return {"Pais": paiz, "Capital": capital}

def guardar(resultado, archivo):
    """Write one element per line to a plain-text file."""
    for elemento in resultado:
        archivo.write(f'{elemento}\n')

def guardarjson(resultado, archivo):
    """Serialize the scraped results to a JSON file."""
    json.dump(resultado, archivo, ensure_ascii=False, indent=2)

def main():
    Datos = []
    n = 0
    while True:
        print('---------------------------------------------------------')
        print(f'page: {n}')
        print('---------------------------------------------------------')
        enlaces = obtener_enlaces(make_soup(f'{URL}{n}'))
        if not enlaces:  # stop once a page past the last one yields no links
            break
        for enlace in enlaces:
            Datos.append(obtener_info(make_soup(enlace)))
            sleep(1)  # be polite: pause between requests
        n += 1
    with open("capitales.json", "w", encoding="utf-8") as archivo:
        guardarjson(Datos, archivo)
    return Datos

main()
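If a spreadsheet-friendly export is preferred over JSON, here is a minimal pandas sketch. It assumes the list of {"Pais": ..., "Capital": ...} dicts returned by main() above; the guardar_csv helper and the filename capitales.csv are illustrative, not part of the original gist.

import pandas as pd

def guardar_csv(datos, ruta='capitales.csv'):
    """Write the country/capital pairs to a CSV file with two named columns."""
    df = pd.DataFrame(datos, columns=['Pais', 'Capital'])
    df.to_csv(ruta, index=False, encoding='utf-8')
    return df

# Usage (hypothetical; re-runs the scrape): guardar_csv(main())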