@mathigatti
Scraper to download news articles from pagina12
from bs4 import BeautifulSoup
import requests
import os
import sys

def descargar_noticia(url):
    """Download a single article and save its plain text under noticias/."""
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    noticia = ""
    for div in soup.find_all("div", {"class": "article-text"}):
        for p in div.find_all("p"):
            text = str(p)
            # Replace the HTML tags with newlines so only plain text is left
            for tag in ["</br>", "<br>", "<br/>", "<p>", "</p>"]:
                text = text.replace(tag, "\n")
            noticia += text
    # From the full URL "https://www.pagina12.com.ar/271789-el-corazon-perdido-de-las-cosas"
    # keep only the last part, "271789-el-corazon-perdido-de-las-cosas",
    # which is used to name the txt file
    name = url.split("/")[-1]
    print(f"Downloaded article '{name}'")
    with open(f"noticias/{name}.txt", 'w') as f:
        f.write(noticia)

def noticias(author_url):
    """Download every article linked from a pagina12 author page."""
    page = requests.get(author_url)
    if not os.path.exists("noticias"):
        os.mkdir("noticias")
    soup = BeautifulSoup(page.content, 'html.parser')
    articles_list = soup.find("ul", {"class": "article-list"})
    for a in articles_list.find_all("a"):
        descargar_noticia(a["href"])

if __name__ == "__main__":
    author_url = sys.argv[1]
    noticias(author_url)
# Usage example:
#   python scrape_pagina12.py https://www.pagina12.com.ar/autores/861-juan-forn
# This should download all of Juan Forn's articles into a folder called noticias/
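# Below is a minimal hedged sketch, not part of the original gist: it assumes
# the author page might also serve relative hrefs (the code above treats them
# as absolute URLs, like the example in descargar_noticia). urljoin resolves
# both forms against the author page URL; the function name and the timeout
# value are illustrative choices, not anything confirmed by the original.
from urllib.parse import urljoin

def noticias_con_urljoin(author_url):
    page = requests.get(author_url, timeout=10)
    if not os.path.exists("noticias"):
        os.mkdir("noticias")
    soup = BeautifulSoup(page.content, 'html.parser')
    articles_list = soup.find("ul", {"class": "article-list"})
    for a in articles_list.find_all("a"):
        # Both "/271789-..." and "https://www.pagina12.com.ar/271789-..."
        # resolve to the same absolute URL here
        descargar_noticia(urljoin(author_url, a["href"]))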