Complete tutorial on scraping French news from le monde ❤️
# Author: Xiaoou Wang, Master's student in NLP (currently in Paris) looking for a PhD position / CIFRE contract. [linkedin](https://www.linkedin.com/in/xiaoou-wang)/[email](mailto:xiaoouwangfrance@gmail.com)
# https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b
# Have a look at https://soshace.com/responsible-web-scraping-gathering-data-ethically-and-legally/ before using the code.
import os  # helper functions such as checking whether a file exists
import datetime  # for automatic file naming
import requests  # this and the following imports are a common web scraping bundle
from urllib.request import urlopen  # standard python module
from bs4 import BeautifulSoup
from urllib.error import HTTPError, URLError
from collections import defaultdict
import re
from tqdm import tqdm
import pickle
import bz2
import _pickle as cPickle
import pandas as pd
def extract_theme(link):
    # extract the theme (section) from a lemonde.fr URL, e.g. ".../politique/..." -> "politique"
    try:
        theme_text = re.findall(r'\.fr/.*?/', link)[0]
    except IndexError:
        return None
    else:
        return theme_text[4:-1]
def list_themes(links):
    themes = []
    for link in links:
        theme = extract_theme(link)
        if theme is not None:
            themes.append(theme)
    return themes
def write_links(path, links, year_fn):
    # write one year's article links to e.g. corpus_links/lemonde_2006_links.txt
    with open(os.path.join(path, "lemonde_" + str(year_fn) + "_links.txt"), 'w') as f:
        for link in links:
            f.write(link + "\n")
def write_to_file(filename, content):
    if os.path.exists(filename):
        with open(filename, 'a+') as f:
            f.write(str(content))
    else:
        with open(filename, 'w') as f:
            f.write(str(content))
def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    # build archive URLs of the form https://www.lemonde.fr/archives-du-monde/dd-mm-yyyy/
    # for every requested day, grouped by year
    archive_links = {}
    for y in range(year_start, year_end + 1):
        dates = [str(d).zfill(2) + "-" + str(m).zfill(2) + "-" + str(y)
                 for m in range(month_start, month_end + 1)
                 for d in range(day_start, day_end + 1)]
        archive_links[y] = [
            "https://www.lemonde.fr/archives-du-monde/" + date + "/" for date in dates]
    return archive_links
def get_articles_links(archive_links):
    # collect the links of freely accessible articles from a list of archive pages
    links_non_abonne = []
    for link in archive_links:
        try:
            html = urlopen(link)
        except HTTPError:
            print("url not valid", link)
        else:
            soup = BeautifulSoup(html, "html.parser")
            news = soup.find_all(class_="teaser")
            # keep an article only if it has no span icon__premium (subscriber-only content)
            for item in news:
                if not item.find('span', {'class': 'icon__premium'}):
                    l_article = item.find('a')['href']
                    # 'en-direct' pages are live feeds/videos, skip them
                    if 'en-direct' not in l_article:
                        links_non_abonne.append(l_article)
    return links_non_abonne
def classify_links(theme_list, link_list):
    # group article links by theme using the URL pattern lemonde.fr/<theme>/article/
    dict_links = defaultdict(list)
    for theme in theme_list:
        theme_link = 'https://www.lemonde.fr/' + theme + '/article/'
        for link in link_list:
            if theme_link in link:
                dict_links[theme].append(link)
    return dict_links
def get_single_page(url):
    # download one article page and return (title tag, body tags), or None if the URL fails
    try:
        html = urlopen(url)
    except HTTPError:
        print("url not valid", url)
    else:
        soup = BeautifulSoup(html, "html.parser")
        text_title = soup.find('h1')
        text_body = soup.article.find_all(["p", "h2"], recursive=False)
        return (text_title, text_body)
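# NOTE: extract_fn is called below but is not defined in this gist; the following is a
# minimal sketch, assuming the file name should simply be the article slug taken from
# the end of the URL (e.g. .../article/2020/01/31/some-title_6027892_823448.html).
def extract_fn(link):
    # keep only the last path component and drop the ".html" extension
    return link.rstrip("/").split("/")[-1].replace(".html", "")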
def scrape_articles(dict_links):
    # download every article of every theme into corpus/<theme>/<article>.txt
    create_folder('corpus')  # the parent folder must exist before the per-theme folders
    themes = dict_links.keys()
    for theme in themes:
        create_folder(os.path.join('corpus', theme))
        print("processing:", theme)
        for i in tqdm(range(len(dict_links[theme]))):
            link = dict_links[theme][i]
            fn = extract_fn(link)
            single_page = get_single_page(link)
            if single_page is not None:
                with open(os.path.join('corpus', theme, fn + '.txt'), 'w') as f:
                    # f.write(dict_links[theme][i] + "\n" * 2)
                    f.write(single_page[0].get_text() + "\n" * 2)  # article title
                    for line in single_page[1]:  # body paragraphs and subheadings
                        f.write(line.get_text() + "\n" * 2)
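# NOTE: read_file is called below but is not defined in this gist; a minimal sketch,
# assuming it just returns the full text content of a file.
def read_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()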
def cr_corpus_dict(path_corpus, n_files=1000):
    # build a {"label": [...], "text": [...]} dict with at most n_files articles per theme
    dict_corpus = defaultdict(list)
    themes = os.listdir(path_corpus)
    for theme in themes:
        counter = 0
        if not theme.startswith('.'):  # skip hidden files such as .DS_Store
            theme_directory = os.path.join(path_corpus, theme)
            for file in os.listdir(theme_directory):
                if counter < n_files:
                    path_file = os.path.join(theme_directory, file)
                    text = read_file(path_file)
                    dict_corpus["label"].append(theme)
                    dict_corpus["text"].append(text)
                    counter += 1
    return dict_corpus
def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
    else:
        print("folder exists already")
# generate archive links for every day from 2006 to 2020
# (non-existent dates such as 31-02 simply give invalid URLs that are skipped later)
archive_links = create_archive_links(2006, 2020, 1, 12, 1, 31)

# collect the links of freely accessible articles, year by year, and save them to disk
corpus_path = os.path.join(os.getcwd(), "corpus_links")
create_folder(corpus_path)
article_links = {}
for year, links in archive_links.items():
    print("processing:", year)
    article_links_list = get_articles_links(links)
    article_links[year] = article_links_list
    write_links(corpus_path, article_links_list, year)

# count the themes and keep those with more than 700 articles
themes = []
for link_list in article_links.values():
    themes.extend(list_themes(link_list))
from collections import Counter
theme_stat = Counter(themes)
theme_top = []
for k, v in sorted(theme_stat.items(), key=lambda x: x[1], reverse=True):
    if v > 700:
        theme_top.append((k, v))
print(theme_top)

# classify all links by theme and scrape the five biggest themes
all_links = []
for link_list in article_links.values():
    all_links.extend(link_list)
themes_top_five = [x[0] for x in theme_top[:5]]
themes_top_five_links = classify_links(themes_top_five, all_links)
scrape_articles(themes_top_five_links)

# load the scraped corpus into a label/text dictionary
path = 'corpus'
dico_corpus = cr_corpus_dict(path, 1000)
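# The pickle/bz2/cPickle/pandas imports above are not used in this snippet; a minimal
# sketch of how the corpus could be turned into a DataFrame and saved as a compressed
# pickle (the file name below is an illustrative assumption):
df_corpus = pd.DataFrame(dico_corpus)
with bz2.BZ2File("lemonde_corpus_" + str(datetime.date.today()) + ".pbz2", "wb") as f:
    cPickle.dump(df_corpus, f)  # write the DataFrame as a bz2-compressed pickle
# to reload later: df_corpus = cPickle.load(bz2.BZ2File(path_to_pbz2, "rb"))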