Scraper to download news articles from pagina12
from bs4 import BeautifulSoup
import requests
import os
import sys


def descargar_noticia(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect the article body: every <p> inside the "article-text" divs,
    # replacing break/paragraph tags with newlines.
    noticia = ""
    for div in soup.findAll("div", {"class": "article-text"}):
        for p in div.findAll("p"):
            text = str(p)
            for tag in ["</br>", "<br>", "<br/>", "<p>", "</p>"]:
                text = text.replace(tag, "\n")
            noticia += text

    # From the full URL, e.g. "https://www.pagina12.com.ar/271789-el-corazon-perdido-de-las-cosas",
    # keep only the last part, "271789-el-corazon-perdido-de-las-cosas",
    # and use it to name the txt file.
    name = url.split("/")[-1]
    print(f"Downloaded the article '{name}'")

    # utf-8 so accented Spanish characters are written correctly on any platform.
    with open(f"noticias/{name}.txt", 'w', encoding='utf-8') as f:
        f.write(noticia)


def noticias(author_url):
    page = requests.get(author_url)

    # Create the output folder if it does not exist yet.
    if not os.path.exists("noticias"):
        os.mkdir("noticias")

    # Follow every article link listed on the author page.
    soup = BeautifulSoup(page.content, 'html.parser')
    articles_list = soup.find("ul", {"class": "article-list"})
    for a in articles_list.findAll("a"):
        descargar_noticia(a["href"])


if __name__ == "__main__":
    author_url = sys.argv[1]
    noticias(author_url)

# Usage example:
#   python scrape_pagina12.py https://www.pagina12.com.ar/autores/861-juan-forn
# It should download all of Juan Forn's articles into a folder called "noticias".
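# A minimal sketch of downloading a single article from the interactive
# interpreter instead of scraping a whole author page. It reuses
# descargar_noticia and assumes this file is saved as scrape_pagina12.py
# (as in the usage example above); the URL is the illustrative one from the
# comment inside descargar_noticia. The output folder is created first,
# since descargar_noticia expects "noticias/" to already exist.
#
#   >>> import os
#   >>> from scrape_pagina12 import descargar_noticia
#   >>> os.makedirs("noticias", exist_ok=True)
#   >>> descargar_noticia("https://www.pagina12.com.ar/271789-el-corazon-perdido-de-las-cosas")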