-
-
Save xiaoouwang/0f054840af560488f2162c110b6045b5 to your computer and use it in GitHub Desktop.
# Author: Xiaoou Wang [linkedin](https://www.linkedin.com/in/xiaoou-wang)/[email](mailto:xiaoouwangfrance@gmail.com) | |
# https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b | |
# Have a look at https://soshace.com/responsible-web-scraping-gathering-data-ethically-and-legally/ before using the code. | |
# Standard library
import bz2
import datetime  # automatic file name
import os  # helper functions like check file exists
import pickle
import _pickle as cPickle
import re
from collections import Counter, defaultdict
from urllib.error import HTTPError, URLError
from urllib.request import urlopen  # standard python module

# Third-party (common web scraping bundle)
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def extract_theme(link):
    """Return the theme segment of a Le Monde article URL, or None.

    The theme is the path component right after the domain, e.g.
    "https://www.lemonde.fr/politique/article/..." -> "politique".
    Returns None when the URL contains no such segment.
    """
    # Fix: the original pattern '.fr/.*?/' left the dot unescaped (matching
    # any character) and used a bare `except: pass` that hid real errors.
    match = re.search(r'\.fr/(.*?)/', link)
    if match is None:
        return None
    return match.group(1)
def list_themes(links):
    """Collect the theme of every link, dropping links with no theme."""
    extracted = (extract_theme(url) for url in links)
    return [theme for theme in extracted if theme is not None]
def write_links(path, links, year_fn):
    """Write one link per line to <path>/lemonde_<year_fn>_links.txt.

    path: destination directory; links: iterable of URL strings;
    year_fn: year used in the output file name.
    """
    # Fix: the original concatenated the path with "/" and handed a single
    # string to os.path.join, defeating its portable path handling.
    out_path = os.path.join(path, "lemonde_" + str(year_fn) + "_links.txt")
    with open(out_path, 'w') as f:
        for link in links:
            f.write(link + "\n")
def write_to_file(filename, content):
    """Append str(content) to filename, creating the file if needed."""
    # The original branched on os.path.exists, but mode "a" already creates
    # a missing file and appends to an existing one — the check (and the
    # unused read capability of "a+") was redundant and race-prone.
    with open(filename, 'a') as f:
        f.write(str(content))
def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    """Build {year: [archive-page URLs]} for Le Monde's daily archives.

    Dates are rendered dd-mm-yyyy. No calendar validation is performed, so
    impossible dates (e.g. 31-02) are also emitted; the archive site simply
    returns an error page for those and the caller skips them.
    """
    base = "https://www.lemonde.fr/archives-du-monde/"
    links_by_year = {}
    for year in range(year_start, year_end + 1):
        urls = []
        for month in range(month_start, month_end + 1):
            for day in range(day_start, day_end + 1):
                urls.append(f"{base}{day:02d}-{month:02d}-{year}/")
        links_by_year[year] = urls
    return links_by_year
def get_articles_links(archive_links):
    """Fetch each archive page and collect links to freely readable articles.

    Skips paywalled teasers (marked by span.icon__premium) and "en-direct"
    URLs (live/video coverage). Pages that fail to load are reported and
    skipped, never fatal.
    """
    free_links = []
    for link in archive_links:
        try:
            html = urlopen(link)
        # Fix: also catch URLError (DNS/connection failures) — it was
        # imported at the top of the file but never handled, so a network
        # hiccup crashed the whole multi-year run.
        except (HTTPError, URLError):
            print("url not valid", link)
            continue
        soup = BeautifulSoup(html, "html.parser")
        for teaser in soup.find_all(class_="teaser"):
            # span.icon__premium marks subscriber-only (abonnés) articles.
            if teaser.find('span', {'class': 'icon__premium'}):
                continue
            href = teaser.find('a')['href']
            # "en-direct" pages are live feeds/videos, not articles.
            if 'en-direct' not in href:
                free_links.append(href)
    return free_links
def classify_links(theme_list, link_list):
    """Group article links by theme, matching on the theme's URL prefix.

    Only themes that match at least one link appear as keys, mirroring the
    lazy key creation of the original append-on-match loop.
    """
    grouped = defaultdict(list)
    for theme in theme_list:
        prefix = 'https://www.lemonde.fr/' + theme + '/article/'
        matches = [link for link in link_list if prefix in link]
        if matches:
            grouped[theme].extend(matches)
    return grouped
def get_single_page(url):
    """Fetch an article page and return (title_tag, body_tags), or None.

    Returns None when the URL cannot be fetched or the page carries no
    <article> element; callers already check for None before writing.
    """
    try:
        html = urlopen(url)
    # Fix: also catch URLError (network failures), and return None
    # explicitly instead of falling off the end of the function.
    except (HTTPError, URLError):
        print("url not valid", url)
        return None
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find('h1')
    article = soup.article
    if article is None:
        # Fix: soup.article can be None (layout change, non-article page);
        # the original raised AttributeError here.
        print("no article body", url)
        return None
    # Only direct <p>/<h2> children: avoids nested boilerplate markup.
    body = article.find_all(["p", "h2"], recursive=False)
    return (title, body)
def scrape_articles(dict_links):
    """Download every article in dict_links into corpus/<theme>/<name>.txt.

    dict_links maps theme -> list of article URLs. Articles that cannot be
    fetched (get_single_page returns None) are silently skipped.
    """
    for theme, links in dict_links.items():
        create_folder(os.path.join('corpus', theme))
        print("processing:", theme)
        for link in tqdm(links):
            # Fix: the original called extract_fn(), which was never defined
            # in this gist (NameError at runtime). Derive the file name from
            # the last URL segment instead, e.g.
            # .../article/2020/01/01/some-slug_123.html -> some-slug_123
            fn = link.rstrip('/').split('/')[-1].replace('.html', '')
            page = get_single_page(link)
            if page is None:
                continue
            title, body = page
            with open(os.path.join('corpus', theme, fn + '.txt'), 'w') as f:
                f.write(title.get_text() + "\n" * 2)
                for tag in body:
                    f.write(tag.get_text() + "\n" * 2)
def cr_corpus_dict(path_corpus, n_files=1000):
    """Build {"label": [...], "text": [...]} from up to n_files files per theme.

    Each non-hidden subdirectory of path_corpus is one theme (label); the
    contents of each file become one "text" entry. Suitable for feeding
    straight into pandas.DataFrame.
    """
    dict_corpus = defaultdict(list)
    for theme in os.listdir(path_corpus):
        if theme.startswith('.'):
            continue  # skip .DS_Store and other hidden entries
        theme_directory = os.path.join(path_corpus, theme)
        counter = 0
        for file in os.listdir(theme_directory):
            if counter >= n_files:
                break  # per-theme cap reached; no need to scan the rest
            path_file = os.path.join(theme_directory, file)
            # Fix: read_file() was never defined in this gist; inline the read.
            with open(path_file) as f:
                text = f.read()
            dict_corpus["label"].append(theme)
            dict_corpus["text"].append(text)
            counter += 1
    return dict_corpus
def create_folder(path):
    """Create the directory at path; report if it already exists.

    Missing parent directories are created too — the original os.mkdir
    raised FileNotFoundError on nested paths like corpus/<theme> when
    'corpus' did not yet exist.
    """
    if os.path.exists(path):
        print("folder exists already")
    else:
        os.makedirs(path)
# --- Driver: scrape the five most frequent themes from Le Monde, 2006-2020 ---
# Fix: the original called scraper.create_archive_links(...), but no
# `scraper` module exists — the function is defined above in this file.
archive_links = create_archive_links(2006, 2020, 1, 12, 1, 31)

# One links file per year under ./corpus_links/.
corpus_path = os.path.join(os.getcwd(), "corpus_links")
create_folder(corpus_path)

article_links = {}
for year, links in archive_links.items():
    print("processing: ", year)
    year_links = get_articles_links(links)
    article_links[year] = year_links
    write_links(corpus_path, year_links, year)

# Count how many articles each theme has across all years.
themes = []
for link_list in article_links.values():
    themes.extend(list_themes(link_list))

theme_stat = Counter(themes)
# Keep themes with more than 700 articles, most frequent first.
theme_top = [(theme, count)
             for theme, count in sorted(theme_stat.items(), key=lambda item: item[1], reverse=True)
             if count > 700]
print(theme_top)

all_links = []
for link_list in article_links.values():
    all_links.extend(link_list)

# Scrape only the five most frequent themes into ./corpus/<theme>/.
themes_top_five = [theme for theme, _ in theme_top[:5]]
themes_top_five_links = classify_links(themes_top_five, all_links)
scrape_articles(themes_top_five_links)

path = 'corpus'
dico_corpus = cr_corpus_dict(path, 1000)
Hi Xiaoou, this is an interesting gist. I would like to try it out for a personal project I am working on to scrape some content from Le Monde. I forked the gist and started to look at the code. There are references to 3 functions which are not included in the gist:
- line 154 ... archive_links = scraper.create_archive_links(2006, 2020, 1, 12, 1, 31) - scraper does not exist
- line 140 ... text = read_file(path_file) - read_file does not exist
- line 120 ... fn = extract_fn(link) - extract_fn does not exist
Are there some locations where those functions are already defined, or can you give me some info on what they are intended to do? Thanks much if you can help !! If you prefer to respond in French ... OK for me, I can read French just fine. Rgds, Patrick also : e-mail : patrick@caffeinatedsolution.com
Hello Patrick, la fonction create_arch... peut être retrouvée ici
https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b
read_file lit le fichier et extract_fn extrait le nom du fichier à partir du lien
Bonne fête
Hi Xiaoou, this is an interesting gist. I would like to try it out for a personal project I am working on to scrape some content from Le Monde. I forked the gist and started to look at the code. There are references to 3 functions which are not included in the gist:
- line 154 ... archive_links = scraper.create_archive_links(2006, 2020, 1, 12, 1, 31) - scraper does not exist
- line 140 ... text = read_file(path_file) - read_file does not exist
- line 120 ... fn = extract_fn(link) - extract_fn does not exist
Are there some locations where those functions are already defined, or can you give me some info on what they are intended to do? Thanks much if you can help !! If you prefer to respond in French ... OK for me, I can read French just fine. Rgds, Patrick also : e-mail : patrick@caffeinatedsolution.com
Hello Patrick, la fonction create_arch... peut être retrouvée ici https://xiaoouwang.medium.com/complete-tutorial-on-scraping-french-news-from-le-monde-%EF%B8%8F-4fa92bc0a07b
read_file lit le fichier et extract_fn extrait le nom du fichier à partir du lien
Bonne fête
Re-bonjour Xiaoou
Ça marche !! Avec qq modifications, j'ai fait une version qui marche pour moi. Merci bien!
Hi Xiaoou, This is interesting gist. I would like to try it out for a personal project that I am working to scrape some content from LeMonde.
I forked the gist, and started to look at the code.
There are references to 3 functions which are not included in the gist :
Are there some locations where those functions are already defined, or can you give me some info on what they are intended to do?
Thanks much if you can help !!
If you prefer to respond in French ... OK for me, I can read French just fine.
Rgds,
Patrick
also : e-mail : patrick@caffeinatedsolution.com