Complete tutorial on scraping French news from le monde ❤️
# Author: Xiaoou Wang, Master's student in NLP (currently in Paris) looking for a PhD position / CIFRE contract. [linkedin](https://www.linkedin.com/in/xiaoou-wang)/[email](mailto:xiaoouwangfrance@gmail.com)
# https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b
# Have a look at https://soshace.com/responsible-web-scraping-gathering-data-ethically-and-legally/ before using the code.
import os  # helper functions such as checking whether a file exists
import datetime  # for automatic file naming
import requests  # this and the following imports are a common web scraping bundle
from urllib.request import urlopen  # standard python module
from bs4 import BeautifulSoup
from urllib.error import HTTPError, URLError
from collections import defaultdict
import re
from tqdm import tqdm
import pickle
import bz2
import _pickle as cPickle
import pandas as pd
def extract_theme(link):
    # extract the theme (section) from a lemonde.fr URL, e.g. ".../politique/..." -> "politique"
    try:
        theme_text = re.findall(r'\.fr/.*?/', link)[0]
    except IndexError:
        return None
    else:
        return theme_text[4:-1]
def list_themes(links):
    themes = []
    for link in links:
        theme = extract_theme(link)
        if theme is not None:
            themes.append(theme)
    return themes
def write_links(path, links, year_fn):
    # write one year's article links to e.g. corpus_links/lemonde_2006_links.txt
    with open(os.path.join(path, "lemonde_" + str(year_fn) + "_links.txt"), 'w') as f:
        for link in links:
            f.write(link + "\n")
def write_to_file(filename, content):
    if os.path.exists(filename):
        with open(filename, 'a+') as f:
            f.write(str(content))
    else:
        with open(filename, 'w') as f:
            f.write(str(content))
def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    # build archive URLs of the form https://www.lemonde.fr/archives-du-monde/dd-mm-yyyy/
    # for every requested day, grouped by year
    archive_links = {}
    for y in range(year_start, year_end + 1):
        dates = [str(d).zfill(2) + "-" + str(m).zfill(2) + "-" + str(y)
                 for m in range(month_start, month_end + 1)
                 for d in range(day_start, day_end + 1)]
        archive_links[y] = [
            "https://www.lemonde.fr/archives-du-monde/" + date + "/" for date in dates]
    return archive_links
def get_articles_links(archive_links):
    # collect the links of freely accessible articles from a list of archive pages
    links_non_abonne = []
    for link in archive_links:
        try:
            html = urlopen(link)
        except HTTPError:
            print("url not valid", link)
        else:
            soup = BeautifulSoup(html, "html.parser")
            news = soup.find_all(class_="teaser")
            # keep an article only if it has no span icon__premium (subscriber-only content)
            for item in news:
                if not item.find('span', {'class': 'icon__premium'}):
                    l_article = item.find('a')['href']
                    # 'en-direct' pages are live feeds/videos, skip them
                    if 'en-direct' not in l_article:
                        links_non_abonne.append(l_article)
    return links_non_abonne
def classify_links(theme_list, link_list):
    # group article links by theme using the URL pattern lemonde.fr/<theme>/article/
    dict_links = defaultdict(list)
    for theme in theme_list:
        theme_link = 'https://www.lemonde.fr/' + theme + '/article/'
        for link in link_list:
            if theme_link in link:
                dict_links[theme].append(link)
    return dict_links
def get_single_page(url):
    # download one article page and return (title tag, body tags), or None if the URL fails
    try:
        html = urlopen(url)
    except HTTPError:
        print("url not valid", url)
    else:
        soup = BeautifulSoup(html, "html.parser")
        text_title = soup.find('h1')
        text_body = soup.article.find_all(["p", "h2"], recursive=False)
        return (text_title, text_body)
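# NOTE: extract_fn is called below but is not defined in this gist; the following is a
# minimal sketch, assuming the file name should simply be the article slug taken from
# the end of the URL (e.g. .../article/2020/01/31/some-title_6027892_823448.html).
def extract_fn(link):
    # keep only the last path component and drop the ".html" extension
    return link.rstrip("/").split("/")[-1].replace(".html", "")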
def scrape_articles(dict_links):
    # download every article of every theme into corpus/<theme>/<article>.txt
    create_folder('corpus')  # the parent folder must exist before the per-theme folders
    themes = dict_links.keys()
    for theme in themes:
        create_folder(os.path.join('corpus', theme))
        print("processing:", theme)
        for i in tqdm(range(len(dict_links[theme]))):
            link = dict_links[theme][i]
            fn = extract_fn(link)
            single_page = get_single_page(link)
            if single_page is not None:
                with open(os.path.join('corpus', theme, fn + '.txt'), 'w') as f:
                    # f.write(dict_links[theme][i] + "\n" * 2)
                    f.write(single_page[0].get_text() + "\n" * 2)  # article title
                    for line in single_page[1]:  # body paragraphs and subheadings
                        f.write(line.get_text() + "\n" * 2)
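# NOTE: read_file is called below but is not defined in this gist; a minimal sketch,
# assuming it just returns the full text content of a file.
def read_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()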
def cr_corpus_dict(path_corpus, n_files=1000):
    # build a {"label": [...], "text": [...]} dict with at most n_files articles per theme
    dict_corpus = defaultdict(list)
    themes = os.listdir(path_corpus)
    for theme in themes:
        counter = 0
        if not theme.startswith('.'):  # skip hidden files such as .DS_Store
            theme_directory = os.path.join(path_corpus, theme)
            for file in os.listdir(theme_directory):
                if counter < n_files:
                    path_file = os.path.join(theme_directory, file)
                    text = read_file(path_file)
                    dict_corpus["label"].append(theme)
                    dict_corpus["text"].append(text)
                    counter += 1
    return dict_corpus
def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
    else:
        print("folder exists already")
# generate archive links for every day from 2006 to 2020
# (non-existent dates such as 31-02 simply give invalid URLs that are skipped later)
archive_links = create_archive_links(2006, 2020, 1, 12, 1, 31)

# collect the links of freely accessible articles, year by year, and save them to disk
corpus_path = os.path.join(os.getcwd(), "corpus_links")
create_folder(corpus_path)
article_links = {}
for year, links in archive_links.items():
    print("processing:", year)
    article_links_list = get_articles_links(links)
    article_links[year] = article_links_list
    write_links(corpus_path, article_links_list, year)

# count the themes and keep those with more than 700 articles
themes = []
for link_list in article_links.values():
    themes.extend(list_themes(link_list))
from collections import Counter
theme_stat = Counter(themes)
theme_top = []
for k, v in sorted(theme_stat.items(), key=lambda x: x[1], reverse=True):
    if v > 700:
        theme_top.append((k, v))
print(theme_top)

# classify all links by theme and scrape the five biggest themes
all_links = []
for link_list in article_links.values():
    all_links.extend(link_list)
themes_top_five = [x[0] for x in theme_top[:5]]
themes_top_five_links = classify_links(themes_top_five, all_links)
scrape_articles(themes_top_five_links)

# load the scraped corpus into a label/text dictionary
path = 'corpus'
dico_corpus = cr_corpus_dict(path, 1000)
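# The pickle/bz2/cPickle/pandas imports above are not used in this snippet; a minimal
# sketch of how the corpus could be turned into a DataFrame and saved as a compressed
# pickle (the file name below is an illustrative assumption):
df_corpus = pd.DataFrame(dico_corpus)
with bz2.BZ2File("lemonde_corpus_" + str(datetime.date.today()) + ".pbz2", "wb") as f:
    cPickle.dump(df_corpus, f)  # write the DataFrame as a bz2-compressed pickle
# to reload later: df_corpus = cPickle.load(bz2.BZ2File(path_to_pbz2, "rb"))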