Skip to content

Instantly share code, notes, and snippets.

@lobstrio
Created December 14, 2018 14:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lobstrio/21dc537ff165071f126bbf030df4f030 to your computer and use it in GitHub Desktop.
Save lobstrio/21dc537ff165071f126bbf030df4f030 to your computer and use it in GitHub Desktop.
Extract headlines from the French media website lemonde.fr with Python 3, Requests, and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
from lxml import html
import re
import csv
from collections import Counter
class LeMondeScraper:
    """Collect the words used in lemonde.fr headlines.

    Headlines are read from the paginated "actualite-en-continu" listing,
    normalised (lower-cased, guillemets removed, whitespace collapsed) and
    split into words of at least two word characters.
    """

    # Listing page URL; the placeholder receives the 1-based page number.
    URL_TEMPLATE = 'https://www.lemonde.fr/actualite-en-continu/{}.html'
    # Seconds to wait for the server before giving up on a request.
    TIMEOUT = 30
    # A word is a run of at least two word characters (compiled once).
    _WORD_RE = re.compile(r'\w[\w]+')

    def __init__(self):
        # Create the session (much like opening a browser) so that every
        # page request goes through the same session object.
        self.s = requests.Session()

    @staticmethod
    def _clean_headline(raw):
        """Return *raw* lower-cased, without guillemets, whitespace collapsed.

        Line breaks and non-breaking spaces become ordinary single spaces
        instead of being deleted, so that neighbouring words are never glued
        together (e.g. ``'200\\xa0millions'`` stays two words).
        """
        text = raw.lower().replace('«', ' ').replace('»', ' ')
        # str.split() with no argument splits on any Unicode whitespace,
        # which covers '\r', '\n' and '\xa0'.
        return ' '.join(text.split())

    def parse(self, max_pages=25):
        """Fetch the listing pages and return every headline word found.

        Args:
            max_pages: Number of listing pages to fetch, starting at page 1.
                Defaults to 25, the range the scraper originally covered.

        Returns:
            A list of words (duplicates kept) in the order they were read.

        Raises:
            requests.HTTPError: If a page answers with a status other
                than 200.
        """
        final_list_words = []
        for i in range(1, max_pages + 1):
            url = self.URL_TEMPLATE.format(i)
            response = self.s.get(url, timeout=self.TIMEOUT)
            # An explicit check instead of ``assert``: assertions are
            # removed when Python runs with -O.
            if response.status_code != 200:
                raise requests.HTTPError(
                    'unexpected status {} for {}'.format(
                        response.status_code, url),
                    response=response,
                )
            print('{} {}'.format('PAGE', i))
            # Parse the page with lxml.
            page = html.fromstring(response.text)
            for article in page.xpath("//article"):
                titles = article.xpath(".//h3/a/text()")
                if not titles:
                    # Some <article> elements carry no linked <h3> headline;
                    # skip them rather than failing on an empty result.
                    continue
                headline = self._clean_headline(titles[0])
                print(headline)
                # Collect the words of this headline.
                final_list_words.extend(self._WORD_RE.findall(headline))
        return final_list_words
def main():
    """Scrape the headlines, count the words and export the counts.

    The word counts are printed to the console and written to
    ``words.csv`` (tab-separated, columns ``mot`` and ``apparition``),
    most frequent word first.

    Raises:
        TypeError: If the scraper does not return a list.
        RuntimeError: If the scraper returns no word at all.
    """
    my = LeMondeScraper()
    list_words = my.parse()
    # Explicit checks instead of ``assert``: assertions are removed when
    # Python runs with -O.
    if not isinstance(list_words, list):
        raise TypeError(
            'expected a list of words, got {}'.format(
                type(list_words).__name__))
    if not list_words:
        raise RuntimeError('no headline words were scraped')

    # Count how many times each word appears.
    dict_count = Counter(list_words)
    # Print the counts to the console.
    print(dict_count)

    # Save the result to a .csv file. newline='' is what the csv module
    # expects (it avoids blank rows on Windows); the explicit encoding keeps
    # accented words independent of the platform default.
    with open('words.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f, fieldnames=['mot', 'apparition'], delimiter='\t')
        writer.writeheader()
        # most_common() yields (word, count) pairs sorted by decreasing
        # count, ties kept in first-seen order.
        writer.writerows(
            {'mot': word, 'apparition': count}
            for word, count in dict_count.most_common()
        )


# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment