Scraper to download news articles from pagina12
from bs4 import BeautifulSoup
import requests
import os
import sys


def descargar_noticia(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect the article body: every <p> inside the "article-text" divs,
    # replacing break/paragraph tags with newlines.
    noticia = ""
    for div in soup.findAll("div", {"class": "article-text"}):
        for p in div.findAll("p"):
            text = str(p)
            for tag in ["</br>", "<br>", "<br/>", "<p>", "</p>"]:
                text = text.replace(tag, "\n")
            noticia += text

    # From the full URL, e.g. "https://www.pagina12.com.ar/271789-el-corazon-perdido-de-las-cosas",
    # keep only the last part, "271789-el-corazon-perdido-de-las-cosas",
    # and use it to name the txt file.
    name = url.split("/")[-1]
    print(f"Downloaded the article '{name}'")

    # utf-8 so accented Spanish characters are written correctly on any platform.
    with open(f"noticias/{name}.txt", 'w', encoding='utf-8') as f:
        f.write(noticia)


def noticias(author_url):
    page = requests.get(author_url)

    # Create the output folder if it does not exist yet.
    if not os.path.exists("noticias"):
        os.mkdir("noticias")

    # Follow every article link listed on the author page.
    soup = BeautifulSoup(page.content, 'html.parser')
    articles_list = soup.find("ul", {"class": "article-list"})
    for a in articles_list.findAll("a"):
        descargar_noticia(a["href"])


if __name__ == "__main__":
    author_url = sys.argv[1]
    noticias(author_url)

# Usage example:
#   python scrape_pagina12.py https://www.pagina12.com.ar/autores/861-juan-forn
# It should download all of Juan Forn's articles into a folder called "noticias".
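# A minimal sketch of downloading a single article from the interactive
# interpreter instead of scraping a whole author page. It reuses
# descargar_noticia and assumes this file is saved as scrape_pagina12.py
# (as in the usage example above); the URL is the illustrative one from the
# comment inside descargar_noticia. The output folder is created first,
# since descargar_noticia expects "noticias/" to already exist.
#
#   >>> import os
#   >>> from scrape_pagina12 import descargar_noticia
#   >>> os.makedirs("noticias", exist_ok=True)
#   >>> descargar_noticia("https://www.pagina12.com.ar/271789-el-corazon-perdido-de-las-cosas")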