Created
December 14, 2018 14:36
-
-
Save lobstrio/21dc537ff165071f126bbf030df4f030 to your computer and use it in GitHub Desktop.
Extract headlines from French Media website lemonde.fr with Python3, Requests, and lxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# coding: utf-8 | |
import requests | |
from lxml import html | |
import re | |
import csv | |
from collections import Counter | |
class LeMondeScraper:
    """
    Export all headlines from lemonde.fr, from the "actualites en continu" category.

    The scraper walks the paginated listing, reads the first text node of
    every ``<article>``'s ``h3/a`` link, normalises it and splits it into
    words of at least two word characters.
    """

    # Paginated listing URL; ``{}`` receives the page number.
    BASE_URL = 'https://www.lemonde.fr/actualite-en-continu/{}.html'

    # Characters deleted from a headline before tokenising: line breaks,
    # non-breaking spaces and French quotation marks.
    _STRIP_TABLE = str.maketrans('', '', '\r\n\xa0«»')

    # A word is a run of two or more word characters, so single-character
    # tokens are dropped. Compiled once instead of on every headline.
    _WORD_RE = re.compile(r'\w[\w]+')

    def __init__(self, timeout=10):
        """
        Create the HTTP session used for every request.

        :param timeout: seconds to wait for each response before
            ``requests`` gives up; without it a stalled connection would
            block the scraper forever.
        """
        # create the session, similar to launching a browser
        self.s = requests.session()
        self.timeout = timeout

    @classmethod
    def _clean_headline(cls, raw):
        """Return *raw* lower-cased with line breaks, NBSPs and « » removed."""
        return raw.lower().translate(cls._STRIP_TABLE)

    @classmethod
    def _extract_words(cls, raw):
        """Return the list of words (two or more word characters) found in *raw*."""
        return cls._WORD_RE.findall(cls._clean_headline(raw))

    def parse(self, first_page=1, last_page=25):
        """
        Download the listing pages and collect every headline word.

        :param first_page: first page number to fetch (inclusive).
        :param last_page: last page number to fetch (inclusive).
        :return: flat list of lower-cased words, in page and article order,
            duplicates included.
        :raises requests.HTTPError: if a page does not answer with HTTP 200.
        """
        final_list_words = []

        # request each listing page of lemonde.fr
        for i in range(first_page, last_page + 1):
            url = self.BASE_URL.format(i)
            response = self.s.get(url, timeout=self.timeout)
            # Explicit check instead of ``assert``, which is removed when
            # Python runs with -O.
            if response.status_code != 200:
                raise requests.HTTPError(
                    'unexpected HTTP status {} for {}'.format(
                        response.status_code, url),
                    response=response,
                )
            print('{} {}'.format('PAGE', i))

            # parse the page with lxml
            page = html.fromstring(response.text)

            for article in page.xpath("//article"):
                # extract the headline; an <article> without an h3/a text
                # node is skipped rather than aborting the whole run
                titles = article.xpath(".//h3/a/text()")
                if not titles:
                    continue
                headline = self._clean_headline(titles[0])
                print(headline)

                # collect the words
                final_list_words.extend(self._WORD_RE.findall(headline))

        return final_list_words
def main(words=None, output_path='words.csv'):
    """
    Count headline words and export the counts to a tab-separated CSV.

    :param words: list of words to count. When ``None`` (the default) the
        words are scraped with ``LeMondeScraper().parse()``.
    :param output_path: destination file; defaults to ``words.csv`` in the
        current directory.
    :raises TypeError: if the word collection is not a list.
    :raises ValueError: if the word list is empty.
    """
    if words is None:
        words = LeMondeScraper().parse()

    # Explicit checks instead of ``assert``, which is removed under -O.
    if not isinstance(words, list):
        raise TypeError(
            'expected a list of words, got {}'.format(type(words).__name__))
    if not words:
        raise ValueError('no words to count')

    # count how many times each word appears
    dict_count = Counter(words)

    # print the counts to the console
    print(dict_count)

    # save the result to a .csv file, most frequent words first.
    # newline='' lets the csv module control line endings (otherwise rows
    # end in \r\r\n on Windows); utf-8 keeps accented words intact whatever
    # the locale's default encoding is.
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f, fieldnames=['mot', 'apparition'], delimiter='\t')
        writer.writeheader()
        writer.writerows(
            {'mot': word, 'apparition': count}
            for word, count in dict_count.most_common()
        )
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment