@xiaoouwang
Last active January 18, 2024 13:23
Complete tutorial on scraping French news from le monde ❤️
# Author: Xiaoou Wang [linkedin](https://www.linkedin.com/in/xiaoou-wang)/[email](mailto:xiaoouwangfrance@gmail.com)
# https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b
# Have a look at https://soshace.com/responsible-web-scraping-gathering-data-ethically-and-legally/ before using the code.
import os  # file/path helpers (e.g. checking whether a file exists)
import datetime  # automatic file naming
import requests  # the following imports are a common web-scraping bundle
from urllib.request import urlopen  # standard python module
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from collections import defaultdict
import re
from urllib.error import URLError
from tqdm import tqdm
import pickle
import bz2
import _pickle as cPickle
import pandas as pd
def extract_theme(link):
    try:
        theme_text = re.findall(r'\.fr/.*?/', link)[0]
    except IndexError:
        return None
    else:
        return theme_text[4:-1]
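# Illustrative example (not part of the original gist):
# extract_theme("https://www.lemonde.fr/politique/article/2020/01/01/titre.html")
# matches ".fr/politique/" and returns "politique"; links with no theme segment
# return None.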
def list_themes(links):
    themes = []
    for link in links:
        theme = extract_theme(link)
        if theme is not None:
            themes.append(theme)
    return themes
def write_links(path, links, year_fn):
    with open(os.path.join(path, "lemonde_" + str(year_fn) + "_links.txt"), 'w') as f:
        for link in links:
            f.write(link + "\n")
def write_to_file(filename, content):
    if os.path.exists(filename):
        with open(filename, 'a+') as f:
            f.write(str(content))
    else:
        with open(filename, 'w') as f:
            f.write(str(content))
def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    archive_links = {}
    for y in range(year_start, year_end + 1):
        dates = [str(d).zfill(2) + "-" + str(m).zfill(2) + "-" + str(y)
                 for m in range(month_start, month_end + 1)
                 for d in range(day_start, day_end + 1)]
        archive_links[y] = [
            "https://www.lemonde.fr/archives-du-monde/" + date + "/" for date in dates]
    return archive_links
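# Illustrative example (not part of the original gist):
# create_archive_links(2006, 2006, 1, 1, 1, 2) returns
# {2006: ['https://www.lemonde.fr/archives-du-monde/01-01-2006/',
#         'https://www.lemonde.fr/archives-du-monde/02-01-2006/']}
# Impossible dates (e.g. 31-02-2006) are generated too; the resulting URLs are
# expected to fail and be skipped by get_articles_links' HTTPError handler.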
def get_articles_links(archive_links):
    links_non_abonne = []
    for link in archive_links:
        try:
            html = urlopen(link)
        except HTTPError:
            print("url not valid", link)
        else:
            soup = BeautifulSoup(html, "html.parser")
            news = soup.find_all(class_="teaser")
            # keep only teasers without the span icon__premium (subscriber-only articles)
            for item in news:
                if not item.find('span', {'class': 'icon__premium'}):
                    l_article = item.find('a')['href']
                    # 'en-direct' pages are live/video coverage, skip them
                    if 'en-direct' not in l_article:
                        links_non_abonne.append(l_article)
    return links_non_abonne
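# Hypothetical usage (variable names are illustrative, not from the gist):
# links_2006 = get_articles_links(create_archive_links(2006, 2006, 1, 12, 1, 31)[2006])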
def classify_links(theme_list, link_list):
    dict_links = defaultdict(list)
    for theme in theme_list:
        theme_link = 'https://www.lemonde.fr/' + theme + '/article/'
        for link in link_list:
            if theme_link in link:
                dict_links[theme].append(link)
    return dict_links
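# Illustrative example: with theme_list = ["politique", "sport"], every link
# containing "https://www.lemonde.fr/politique/article/" is grouped under
# dict_links["politique"]; links matching no theme are dropped.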
def get_single_page(url):
    try:
        html = urlopen(url)
    except HTTPError:
        print("url not valid", url)
    else:
        soup = BeautifulSoup(html, "html.parser")
        text_title = soup.find('h1')
        text_body = soup.article.find_all(["p", "h2"], recursive=False)
        return (text_title, text_body)
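# get_single_page returns a (title_tag, body_tags) pair of BeautifulSoup
# elements, or None when urlopen raises an HTTPError (the caller checks for
# None before writing the article to disk).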
def scrape_articles(dict_links):
    themes = dict_links.keys()
    for theme in themes:
        create_folder(os.path.join('corpus', theme))
        print("processing:", theme)
        for i in tqdm(range(len(dict_links[theme]))):
            link = dict_links[theme][i]
            fn = extract_fn(link)
            single_page = get_single_page(link)
            if single_page is not None:
                with open(os.path.join('corpus', theme, fn + '.txt'), 'w') as f:
                    # f.write(dict_links[theme][i] + "\n" * 2)
                    f.write(single_page[0].get_text() + "\n" * 2)
                    for line in single_page[1]:
                        f.write(line.get_text() + "\n" * 2)
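# Each scraped article is written to corpus/<theme>/<file name>.txt with the
# headline first, followed by one paragraph or subheading per block, separated
# by blank lines.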
def cr_corpus_dict(path_corpus, n_files=1000):
    dict_corpus = defaultdict(list)
    themes = os.listdir(path_corpus)
    for theme in themes:
        counter = 0
        if not theme.startswith('.'):
            theme_directory = os.path.join(path_corpus, theme)
            for file in os.listdir(theme_directory):
                if counter < n_files:
                    path_file = os.path.join(theme_directory, file)
                    text = read_file(path_file)
                    dict_corpus["label"].append(theme)
                    dict_corpus["text"].append(text)
                    counter += 1
    return dict_corpus
def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
    else:
        print("folder exists already")
archive_links = create_archive_links(2006, 2020, 1, 12, 1, 31)
corpus_path = os.path.join(os.getcwd(), "corpus_links")
create_folder(corpus_path)
article_links = {}
for year, links in archive_links.items():
    print("processing: ", year)
    article_links_list = get_articles_links(links)
    article_links[year] = article_links_list
    write_links(corpus_path, article_links_list, year)
themes = []
for link_list in article_links.values():
    themes.extend(list_themes(link_list))

from collections import Counter
theme_stat = Counter(themes)
theme_top = []
for k, v in sorted(theme_stat.items(), key=lambda x: x[1], reverse=True):
    if v > 700:
        theme_top.append((k, v))
print(theme_top)
all_links = []
for link_list in article_links.values():
    all_links.extend(link_list)

themes_top_five = [x[0] for x in theme_top[:5]]
themes_top_five_links = classify_links(themes_top_five, all_links)
scrape_articles(themes_top_five_links)

path = 'corpus'
dico_corpus = cr_corpus_dict(path, 1000)
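# Possible next step (an assumption, not shown in the original gist): the unused
# pandas, bz2 and cPickle imports suggest turning the corpus into a DataFrame
# and saving it as a compressed pickle, e.g.:
df_corpus = pd.DataFrame(dico_corpus)
with bz2.BZ2File("corpus_lemonde.pbz2", "wb") as f:
    cPickle.dump(df_corpus, f)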
@bici-sancta commented Dec 20, 2022

Hi Xiaoou, this is an interesting gist. I would like to try it out for a personal project I am working on to scrape some content from LeMonde.
I forked the gist and started to look at the code.
There are references to 3 functions which are not included in the gist:

  • line 154 ... archive_links = scraper.create_archive_links(2006, 2020, 1, 12, 1, 31) - scraper does not exist
  • line 140 ... text = read_file(path_file) - read_file does not exist
  • line 120 ... fn = extract_fn(link) - extract_fn does not exist

Are there some locations where those functions are already defined, or can you give me some info on what they are intended to do?
Thanks much if you can help !!
If you prefer to respond in French ... OK for me, I can read French just fine.
Rgds,
Patrick
Also: e-mail: patrick@caffeinatedsolution.com

@xiaoouwang (Author)


Hello Patrick, the create_arch... function can be found here:
https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b

read_file reads the file and extract_fn extracts the file name from the link.

Happy holidays

@patrick-mcdevitt


Hello again Xiaoou,
It works!! With a few modifications, I made a version that works for me. Thanks a lot!
