# Script for scraping Chronicling America (chroniclingamerica.loc.gov)
# Forked from mdweaver/chronicling_america-scraper.py
import csv
import os
import re
from datetime import date, timedelta
from time import sleep

import requests
from bs4 import BeautifulSoup


def chronicling_america_scraper(search_terms, start_date, end_date, filepath):
    """Scrape Chronicling America search results into a CSV.

    search_terms is a string of words separated by spaces; the search
    matches articles that mention any word in the string.
    start_date and end_date are date objects created by the datetime module.
    filepath is a string containing the path where the resulting CSV
    should be written.
    """
    # Set base URLs.
    url = "http://chroniclingamerica.loc.gov/search/pages/results/"
    stub = "http://chroniclingamerica.loc.gov"
    # Scraper functions
    # Date generator: yields each date from start (inclusive) to end (exclusive).
    def perdelta(start, end, delta):
        curr = start
        while curr < end:
            yield curr
            curr += delta
    # Build the advanced-search query for a single day; 'ortext' matches
    # any of the given words, and 'rows' caps the results per page.
    def make_search_query(search_terms, day, count):
        query_vals = {
            'dateFilterType': 'range', 'date1': '', 'date2': '',
            'language': '', 'ortext': '', 'andtext': '', 'phrasetext': '',
            'proxtext': '', 'proxdistance': 5, 'rows': count,
            'searchType': 'advanced',
        }
        date_1 = f"{day.month}/{day.day}/{day.year}"
        query_vals.update({'ortext': search_terms, 'date1': date_1, 'date2': date_1})
        return query_vals
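    # For illustration (example values, not from the original gist): searching
    # for "liberty" on June 3, 1912 yields a GET request roughly like
    #   http://chroniclingamerica.loc.gov/search/pages/results/
    #       ?dateFilterType=range&date1=6/3/1912&date2=6/3/1912
    #       &ortext=liberty&rows=500&searchType=advanced&proxdistance=5
    # with the remaining parameters sent empty.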
    # Pull the fields for one search hit; returns None if none of the
    # search terms actually appear in the hit's matched words.
    def get_article_data(article, day):
        # The 'words' input holds the OCR terms the search matched.
        matches = article.find('input', {'name': 'words'})['value'].lower()
        check = [val for val in search_terms.split() if val in matches.split()]
        if len(check) > 0:
            line = {}
            line['text'] = matches
            # The citation line sits a few siblings past the result link.
            info = article.a.next_sibling.next_sibling.next_sibling.get_text()
            # Newspaper name runs up to the first period.
            m = re.search(r'^[^.]+', info)
            line['newspaper'] = m.group(0)
            # Date like "June 03, 1912", preceded by "), " and followed by ",".
            d = re.search(r'(?<=\),\s)[a-zA-Z]+\s\d\d,\s\d\d\d\d(?=,)', info)
            line['date'] = d.group(0)
            p = re.search(r'(?<=Page\s)\d+', info)
            i = re.search(r'(?<=Image\s)\d+', info)
            # Prefer the page number; fall back to the image number.
            line['page'] = (p if p else i).group(0)
            line['href'] = stub + article.a['href']
            line['lc_id'] = article.a['href'].split('/')[2]
            line['db'] = 'chroniclingamerica'
            line['date.search'] = day
            return line
        else:
            return None
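    # For illustration (reconstructed from the regexes above, not copied from
    # the site): the citation line is assumed to look roughly like
    #   "The evening times. (Washington, D.C.), June 03, 1912, Page 3, Image 3"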
    # GET a URL, retrying with exponential backoff until it succeeds.
    def fetch(target, params=None):
        wait = 0
        while True:
            try:
                return requests.get(target, params=params, timeout=(1, 60)).text
            except requests.exceptions.RequestException:
                print("... trying again ...")
                sleep(1.5 ** wait)
                wait += 1

    # Run one day's search and walk every page of results.
    def scrape(search_terms, day):
        print(day)
        # Search
        start_page = fetch(url, params=make_search_query(search_terms, day, 500))
        page = BeautifulSoup(start_page, 'html.parser')
        # A red warning span means the search failed or returned nothing.
        if page.find('span', style='color:#900') is not None:
            return None
        lines = []
        while True:
            # Select articles.
            results = page.find('table')
            articles = results.find_all('div', class_='highlite') if results else []
            # Extract information on articles.
            for article in articles:
                data = get_article_data(article, day)
                if data is not None:
                    lines.append(data)
            # Find the URL for the next page of results; stop when there is none.
            next_tag = page.find('a', class_='next')
            if next_tag is None:
                break
            next_link = url + next_tag['href']
            page = BeautifulSoup(fetch(next_link), 'html.parser')
        return lines
    # Complete scraper
    last_date = end_date - timedelta(days=1)
    # The original script caps searches at 1922, the public-domain cutoff
    # for Chronicling America's full-text coverage at the time.
    if last_date.year < 1923:
        # Name the CSV after the date range it covers.
        timeperiod = str(start_date) + "to" + str(last_date)
        filename = "chronicling_america-" + timeperiod + ".csv"
        fields = ["newspaper", "date", "lc_id", "text", "page", "href", "db", "date.search"]
        with open(os.path.join(filepath, filename), "w", newline="", encoding="utf-8") as w:
            writer = csv.DictWriter(w, fieldnames=fields)
            writer.writeheader()
            # Loop over days, one search per day.
            for day in perdelta(start_date, end_date, timedelta(days=1)):
                results = scrape(search_terms, day)
                if results is not None:
                    writer.writerows(results)
    else:
        print("Cannot search dates after 1922.")