# -*- coding: utf-8 -*-
import sys, re, time, logging, requests
import random
import unicodecsv as csv
from threading import Thread, Lock, current_thread
from queue import Queue, Empty
from lxml import html
lock = Lock()
# ql feeds page numbers to the fetcher threads; qr collects their results (not consumed yet)
ql = Queue()
qr = Queue()
num_threads = 10
k = " "
crawl_list = []
isCrawling = True
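
# Overall flow: the main thread discovers how many result pages the query
# has, enqueues a randomized sample of page numbers on ql, and num_threads
# fetcher threads pull page numbers off the queue, scrape each results page,
# and append the parsed rows to the shared `results` list (guarded by lock);
# the main thread finally writes `results` to a CSV file.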
def fetch_links(i, ql, qr):
    print("Start fetcher %d" % i)
    # Stay alive until the main thread has stopped enqueueing work and the
    # input queue has been drained
    while not ql.empty() or isCrawling:
        try:
            qvalue = ql.get(timeout=1)
        except Empty:
            continue
        try:
            url = ('https://www.lemonde.fr/recherche/?keywords=' + str(k)
                   + '&page_num=' + str(qvalue)
                   + '&operator=and&exclude_keywords=&qt=recherche_texte_titre'
                   + '&author=&period=since_1944&start_day=01&start_month=01'
                   + '&start_year=1944&end_day=30&end_month=10&end_year=2018'
                   + '&sort=asc')
            r = requests.get(url)
            print(r.status_code)
            print(i, " ", qvalue)
            ### Fetch information
            if r.status_code == 200:
                page = html.fromstring(r.content)
                ### Get data
                titles = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/h3/a/text()')
                links = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/h3/a/@href')
                abstracts = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/p/text()')
                footers = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/span/text()')
                dates = []
                pagenums = []
                for f in footers:
                    pagenums.append(qvalue)
                    # The date is the part of the footer after the "|" separator;
                    # append an empty string on no match so dates stays aligned
                    # with footers (otherwise zip() below silently drops rows)
                    match = re.search(r'\| .+$', f)
                    dates.append(match.group() if match else "")
                # Materialize the rows into a list: a bare zip object is an
                # iterator and would be exhausted by whichever of qr/results
                # consumed it first
                pageindex = list(zip(titles, links, abstracts, footers, dates, pagenums))  # what if there is a missing value?
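                # On the "missing value" question above: if the XPath lists
                # come back with different lengths (an article missing its
                # abstract, say), zip() silently truncates to the shortest
                # list. One defensive option, sketched here with
                # itertools.zip_longest (an assumption, not part of the
                # original script), would pad short lists instead:
                #   from itertools import zip_longest
                #   pageindex = list(zip_longest(titles, links, abstracts,
                #                                footers, dates, pagenums,
                #                                fillvalue=""))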
                qr.put(pageindex)
                lock.acquire()
                results.append(pageindex)
                lock.release()
            else:
                # Record the failed status code so the page number is not
                # silently lost
                pageindex = [[str(r.status_code), "", "", "", "", str(qvalue)]]
                lock.acquire()
                results.append(pageindex)
                lock.release()
        except Exception as e:
            print(e)
        ql.task_done()
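
# The setup comment notes that qr is filled but never consumed. A minimal
# sketch of a dedicated writer thread that drains it (hypothetical, not part
# of the original script; it assumes each queue item is the list of 6-tuples
# that fetch_links produces) could look like:
#
#   def drain_results(qr, wr):
#       while True:
#           batch = qr.get()
#           if batch is None:  # sentinel pushed by main once fetchers finish
#               break
#           for x in batch:
#               wr.writerow({"title": x[0], "link": x[1], "abstract": x[2],
#                            "footer": x[3], "date": x[4], "pagenum": x[5]})
#           qr.task_done()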
if __name__ == '__main__':
    try:
        ### Generate the list of page numbers to crawl
        startpage = 1
        ## Get endpage from the pagination of the first results page
        url0 = ('https://www.lemonde.fr/recherche/?keywords=' + str(k)
                + '&page_num=1'
                + '&operator=and&exclude_keywords=&qt=recherche_texte_titre'
                + '&author=&period=since_1944&start_day=01&start_month=01'
                + '&start_year=1944&end_day=30&end_month=10&end_year=2018'
                + '&sort=asc')
        r0 = requests.get(url0)
        print("First request: " + str(r0.status_code))
        endpage_f = html.fromstring(r0.content)
        endpage = endpage_f.xpath("//*[@id='habillagepub']/div[5]/div/div[1]/section/div/ul/li[@class='adroite']/a/text()")
        print(str(endpage[0]) + " pages found")
        # Sampling the full range without replacement yields a randomized crawling order
        crawl_list = random.sample(range(1, int(endpage[0]) + 1), int(endpage[0]))
        crawl_list_try = crawl_list[:10]  # small sample for testing
    except Exception as e:
        ### If the first request fails, fall back to empty crawl lists so the
        ### rest of the script runs (and produces an empty CSV) instead of
        ### crashing on an undefined name
        print(e)
        crawl_list = []
        crawl_list_try = []
    print(crawl_list_try)

    # Set up the worker threads; the input queue is still empty at this point
    threads_list = []
    results = []
    for i in range(num_threads):
        th = Thread(name="thread_%d" % i,
                    target=fetch_links,
                    args=(i, ql, qr))
        th.daemon = True  # Thread.setDaemon() is deprecated in favor of this
        threads_list.append(th)
        th.start()

    # Populate the input queue with the page numbers that compose the URLs
    for url_id in crawl_list_try:
        ql.put(url_id)

    # All work is enqueued: clear the flag so workers exit once the queue is
    # empty, then wait for every worker thread to finish
    isCrawling = False
    print('*** Main thread waiting')
    main_thread = current_thread()
    for th in threads_list:
        if th is not main_thread:
            th.join()
    print('*** Done')
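
    # Note: since fetch_links calls ql.task_done() for every item it pulls,
    # a common alternative under these assumptions would be to block on the
    # queue itself instead of joining the daemon threads:
    #   ql.join()  # returns once every enqueued page number has been processed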

    # Write everything collected by the workers to CSV (the filename embeds
    # the keyword, so with k = " " it is " _results.csv")
    with open(k + '_results.csv', 'wb') as outcsv:
        wr = csv.DictWriter(outcsv, fieldnames=["title", "link", "abstract", "footer", "date", "pagenum"])
        wr.writeheader()
        for result in results:
            for x in result:
                wr.writerow({
                    #"keyword": str(keyword),
                    "title": x[0],
                    "link": x[1],
                    "abstract": x[2],
                    "footer": x[3],
                    "date": x[4],
                    "pagenum": x[5],
                })