# The scrapy framework has a helper command to generate a skeleton spider inside a project, e.g.
scrapy genspider somesiteSpider somesite.com
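# (genspider is run from inside a project skeleton; assuming the project is named somesite, to match the imports below, it would be created first with:)
scrapy startproject somesite
cd somesite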
# Sometimes patterns in the URLs make data acquisition a breeze. A text file containing a list of URLs is created:
outfile = open('urls.txt', 'w')
for id in range(1, 17723):
    url = "https://somesite.com/getDetail.do?someId=" + str(id)
    outfile.write(url + '\n')
outfile.close()
# The data being captured is defined in items.py:
# items.py
import scrapy


class SomesiteItem(scrapy.Item):
    # Only the URL and the raw HTML are captured; parsing happens later.
    url = scrapy.Field()
    html = scrapy.Field()
# The scrapy spider fetches each URL and yields an item:
# spider.py
import scrapy

from somesite.items import SomesiteItem


class SomesiteSpider(scrapy.Spider):
    name = "somesiteSpider"
    allowed_domains = ["somesite.com"]

    # The start URLs come from the file generated above.
    with open('urls.txt', 'r') as inputfile:
        start_urls = [line.strip() for line in inputfile]

    def parse(self, response):
        item = SomesiteItem()
        item['url'] = response.url
        item['html'] = response.text  # the page body, decoded using the response's encoding
        yield item
# The user-agent and delay between page requests are configured in settings.py
# settings.py
USER_AGENT = '[your user agent]'
DOWNLOAD_DELAY = 5
ITEM_PIPELINES = {
    'somesite.pipelines.JsonFilePipeline': 300,
}
# The URL and HTML get written out, one JSON object per line, using a pipeline:
# pipelines.py
import json


class JsonFilePipeline(object):
    def __init__(self):
        self.file = open('output.json', 'w')

    def process_item(self, item, spider):
        # One JSON object per line.
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
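# Each line of output.json then holds one standalone JSON object, roughly of the form (values shortened for illustration):
# {"url": "https://somesite.com/getDetail.do?someId=42", "html": "<html>...</html>"}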
# The crawl is kicked off:
nohup scrapy crawl somesiteSpider &
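# With nohup's defaults, scrapy's log output lands in nohup.out, so progress can be followed with:
tail -f nohup.out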
# ... and, eventually, the raw data (in JSON format) can be retrieved from output.json.
# This can be parsed using a map-only mrjob job:
from mrjob.job import MRJob
from mrjob.protocol import RawValueProtocol
import json
import re

from bs4 import BeautifulSoup


class ParseHTML(MRJob):
    # Emit plain delimited lines rather than JSON key/value pairs.
    OUTPUT_PROTOCOL = RawValueProtocol

    idPattern = re.compile(r'https://somesite\.com/getDetail\.do\?someId=([0-9]+)')

    def mapper(self, _, line):
        record = json.loads(line)
        if record['url'] != 'https://somesite.com/notAnInteresingPage.do':
            try:
                url = record['url']
                html = record['html']
                soup = BeautifulSoup(html, 'html.parser')
                id = self.idPattern.findall(url)[0]
                name = soup.find(text=' Name ').parent.nextSibling.nextSibling.getText()
                subject = soup.find(text='Subject').parent.nextSibling.nextSibling.getText().strip()
                currentAge = soup.find(text='Current Age ').parent.nextSibling.nextSibling.getText().strip()
                inCustody = soup.find(text='In Custody').parent.nextSibling.nextSibling.getText().strip()
                race, sex, height, weight, hair, eyes = [elem.getText().split()[0] for elem in soup.findAll('td', {'class': 'tableOne'})]
                # Fields are joined with \001, which the Hive table below uses as its delimiter.
                yield None, chr(1).join([id, name, subject, currentAge, inCustody, race, sex, height, weight, hair, eyes])
            except Exception:
                # Pages that don't match the expected layout are skipped.
                pass


if __name__ == '__main__':
    ParseHTML.run()
# The records can then be parsed into a Hive-friendly format:
python parseHTML.py output.json > somesite_person
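# (If output.json were first copied up to HDFS, the same job could also run on the cluster via mrjob's Hadoop runner; the HDFS path here is only illustrative:)
python parseHTML.py -r hadoop hdfs:///tmp/output.json > somesite_person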
# Then loaded into HDFS:
hdfs dfs -copyFromLocal somesite_person /user/hive/warehouse/somesite_person
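# A quick listing confirms the file landed where the Hive table will look for it:
hdfs dfs -ls /user/hive/warehouse/somesite_person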
# and then schema-ified by creating a Hive table:
CREATE TABLE somesite_person
(
    offenderId INT,
    name STRING,
    subject STRING,
    currentAge STRING,
    inCustody STRING,
    race STRING,
    sex STRING,
    height STRING,
    weight INT,
    hair STRING,
    eyes STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
LOCATION '/user/hive/warehouse/somesite_person';
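# A quick sanity check (assuming the hive CLI is available) confirms the rows are queryable:
hive -e "SELECT COUNT(*) FROM somesite_person;"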