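A multithreaded scraper for Le Monde's search archive (https://www.lemonde.fr/recherche/). Worker threads pull result-page numbers from a shared queue, fetch each search page, extract titles, links, abstracts, and publication dates with XPath, and the main thread writes the collected rows to a CSV named after the search keyword.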
# -*- coding: utf-8 -*-
import re
import random
import requests
import unicodecsv as csv
from threading import Thread, Lock, current_thread
from queue import Queue, Empty
from lxml import html

lock = Lock()

# ql as input queue, qr as response queue (qr still not consumed anywhere)
ql = Queue()
qr = Queue()

num_threads = 10
k = " "  # search keyword; also used to name the output CSV
crawl_list = []
results = []  # shared list of parsed rows, guarded by lock
isCrawling = True
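# Overall flow: the main block discovers how many result pages exist, queues a
# sample of page numbers on ql, and starts num_threads copies of fetch_links();
# each worker drains ql, parses one page per queue item, and the rows are
# merged into `results` for the CSV writer at the bottom.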
def fetch_links(i, ql, qr):
    print("Start fetcher %d" % i)
    # Stay alive until the input queue is empty and crawling has stopped
    while not ql.empty() or isCrawling:
        try:
            qvalue = ql.get(timeout=1)
        except Empty:
            continue
        try:
            url = ('https://www.lemonde.fr/recherche/?keywords=' + str(k)
                   + '&page_num=' + str(qvalue)
                   + '&operator=and&exclude_keywords=&qt=recherche_texte_titre'
                   + '&author=&period=since_1944&start_day=01&start_month=01'
                   + '&start_year=1944&end_day=30&end_month=10&end_year=2018'
                   + '&sort=asc')
            r = requests.get(url)
            print(r.status_code)
            print(i, " ", qvalue)
            ### Fetch information
            if r.status_code == 200:
                page = html.fromstring(r.content)
                ### Get data
                titles = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/h3/a/text()')
                links = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/h3/a/@href')
                abstracts = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/p/text()')
                footers = page.xpath('//*[@id="habillagepub"]/div[5]/div/div[1]/section/article/div/div/span/text()')
                dates = []
                pagenums = []
                for f in footers:
                    pagenums.append(qvalue)
                    match = re.search(r'\| .+$', f)
                    # Keep the columns aligned: store an empty date when the
                    # footer carries none, so zip() does not silently drop rows
                    dates.append(match.group() if match else "")
                # Materialize the rows: a bare zip() is a one-shot iterator and
                # would be exhausted by the first consumer
                pageindex = list(zip(titles, links, abstracts, footers, dates, pagenums))
                qr.put(pageindex)
                lock.acquire()
                results.append(pageindex)
                lock.release()
            else:
                # Record the failed status code so the page number is not lost
                pageindex = [[str(r.status_code), "", "", "", "", str(qvalue)]]
                lock.acquire()
                results.append(pageindex)
                lock.release()
        except Exception as e:
            print(e)
        ql.task_done()
if __name__ == '__main__':
    try:
        ### Work out the range of result pages to crawl
        ## Fetch page 1 once and read the last page number from the pager
        url0 = ('https://www.lemonde.fr/recherche/?keywords=' + str(k)
                + '&page_num=1&operator=and&exclude_keywords=&qt=recherche_texte_titre'
                + '&author=&period=since_1944&start_day=01&start_month=01'
                + '&start_year=1944&end_day=30&end_month=10&end_year=2018&sort=asc')
        r0 = requests.get(url0)
        print("First request: " + str(r0.status_code))
        endpage_f = html.fromstring(r0.content)
        endpage = endpage_f.xpath("//*[@id='habillagepub']/div[5]/div/div[1]/section/div/ul/li[@class='adroite']/a/text()")
        print(str(endpage[0]) + " pages found")
        # Randomize the crawling order over pages 1..endpage
        crawl_list = random.sample(range(1, int(endpage[0]) + 1), int(endpage[0]))
        crawl_list_try = crawl_list[:10]  # small slice for testing
    except Exception as e:
        # If the first request fails, fall back to empty crawl lists
        print(e)
        crawl_list = []
        crawl_list_try = []
    print(crawl_list_try)
    # Set up the worker threads; the input queue is still empty at this point
    threads_list = []
    for i in range(num_threads):
        th = Thread(name="thread_%d" % i,
                    target=fetch_links,
                    args=(i, ql, qr))
        th.daemon = True
        threads_list.append(th)
        th.start()
    # Populate the input queue with the page numbers that compose the URLs
    for url_id in crawl_list_try:
        ql.put(url_id)
    # Signal the workers that nothing more will be queued; each thread exits
    # once the queue is drained
    isCrawling = False
    print('*** Main thread waiting')
    main_thread = current_thread()
    for th in threads_list:
        if th is not main_thread:
            th.join()
    print('*** Done')
    with open(k + '_results.csv', 'wb') as outcsv:
        wr = csv.DictWriter(outcsv, fieldnames=["title", "link", "abstract", "footer", "date", "pagenum"])
        wr.writeheader()
        for result in results:
            for x in result:
                wr.writerow({
                    "title": x[0],
                    "link": x[1],
                    "abstract": x[2],
                    "footer": x[3],
                    "date": x[4],
                    "pagenum": x[5],
                })
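To run it, set `k` to a search keyword and start the script with Python 3 (it needs `requests`, `lxml`, and `unicodecsv`); the rows land in `<keyword>_results.csv`. One caveat: `k` is spliced into the query string as-is, so a multi-word keyword would need URL-encoding first. A minimal sketch of that, using only the standard library and a hypothetical keyword:

    from urllib.parse import quote_plus

    k = "gilets jaunes"      # hypothetical multi-word keyword
    encoded = quote_plus(k)  # -> "gilets+jaunes", safe inside the query string
    url = 'https://www.lemonde.fr/recherche/?keywords=' + encoded + '&page_num=1'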