Last active
August 29, 2015 14:06
-
-
Save alexwoolford/996f186c539f05ce1589 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The scrapy framework has a helper function to create a skeleton scraper project, e.g.
scrapy genspider somesiteSpider somesite.com
# Sometimes patterns in the URL's make data acquisition a breeze.  A text file
# containing one detail-page URL per line is created.
# The with-block closes the file even if a write fails (the original left the
# handle open on error), and the loop variable no longer shadows the built-in
# ``id``.
with open('urls.txt', 'w') as outfile:
    for page_id in range(1, 17723):
        outfile.write('https://somesite.com/getDetail.do?someId=%d\n' % page_id)
# The data being captured is defined in the items.py
# items.py
import scrapy


class SomesiteItem(scrapy.Item):
    """One scraped page: the URL that was fetched and its raw HTML body."""

    # NOTE(review): scrapy does not apply Field metadata such as ``default``
    # automatically; it is kept here purely as documentation of intent.
    url = scrapy.Field(default="")
    html = scrapy.Field(default="")
# The scrapy spider gets each of the items.
# spider.py
import scrapy
from somesite.items import SomesiteItem


class SomesiteSpider(scrapy.Spider):
    """Fetches every URL listed in urls.txt and captures the raw page body."""

    name = "somesiteSpider"
    allowed_domains = ["somesite.com"]

    # Seed URLs are read once, at class-definition time.  The with-block
    # closes the handle even if reading fails (the original closed it only
    # on the happy path).
    with open('urls.txt') as _urlfile:
        start_urls = [line.strip() for line in _urlfile]
    del _urlfile

    def parse(self, response):
        """Yield one item per fetched page, recording its URL and raw body.

        The original wrapped this in a bare ``except: pass`` around
        ``response.body.encode('utf-8')``; on any page where the encode
        failed, an *empty* item was silently yielded and the page lost.
        Storing the body as-is keeps every page and lets real errors
        surface in scrapy's log.
        """
        item = SomesiteItem()
        item['url'] = response.url
        item['html'] = response.body
        yield item
# The user-agent and delay between page requests are configured in settings.py
# settings.py

# Identify the crawler to the target site; fill in a real UA string.
USER_AGENT = '[your user agent]'

# Be polite: wait five seconds between successive page requests.
DOWNLOAD_DELAY = 5

# Route every scraped item through the JSON-file pipeline (priority 300).
ITEM_PIPELINES = {'somesite.pipelines.JsonFilePipeline': 300}
# The HTML gets written to a JSON file using a pipeline:
# pipelines.py
import json


class JsonFilePipeline(object):
    """Writes each scraped item to output.json, one JSON object per line."""

    def __init__(self):
        # Text mode: json.dumps returns str, so the original 'wb' mode
        # raises TypeError on Python 3.
        self.file = open('output.json', 'w')

    def process_item(self, item, spider):
        """Serialize *item* as a single JSON line, then pass it through."""
        self.file.write(json.dumps(dict(item)) + "\n")
        return item

    def close_spider(self, spider):
        """Scrapy hook: flush and close the output file when the crawl ends.

        The original never closed the handle, so a crash could truncate
        buffered output.
        """
        self.file.close()
# The crawl is kicked off:
nohup scrapy crawl somesiteSpider &
# .... and, eventually, the raw data (in JSON format), can be retrieved from output.json.
# This can be parsed using a map-only Mr.Job job:
from mrjob.job import MRJob
import json
from bs4 import BeautifulSoup
import re

# Compiled once at import time instead of once per input line.  The '.' and
# '?' are now escaped: unescaped, the '?' made the preceding 'o' optional and
# the pattern matched no real URL, so findall()[0] raised IndexError and the
# bare except silently dropped every record.
ID_PATTERN = re.compile(r'https://somesite\.com/getDetail\.do\?someId=([0-9]+)')


class ParseHTML(MRJob):
    """Map-only MRJob: one JSON line of scraped HTML in, one chr(1)-delimited record out."""

    def mapper(self, _, line):
        """Extract person fields from one {'url': ..., 'html': ...} JSON record."""
        record = json.loads(line)
        # Skip the site's catch-all page ('Interesing' sic -- must match the
        # URL string the spider actually recorded).
        if record['url'] == 'https://somesite.com/notAnInteresingPage.do':
            return
        try:
            page_url = record['url']
            soup = BeautifulSoup(record['html'])
            record_id = ID_PATTERN.findall(page_url)[0]
            name = soup.find(text=' Name ').parent.nextSibling.nextSibling.getText()
            subject = soup.find(text='Subject').parent.nextSibling.nextSibling.getText().strip()
            current_age = soup.find(text='Current Age ').parent.nextSibling.nextSibling.getText().strip()
            in_custody = soup.find(text='In Custody').parent.nextSibling.nextSibling.getText().strip()
            # The six single-valued cells appear in this fixed order on the
            # page; unpacking raises ValueError if the layout differs.
            race, sex, height, weight, hair, eyes = [
                cell.getText().split()[0]
                for cell in soup.findAll('td', {'class': 'tableOne'})
            ]
            # \x01 == chr(1), Hive's default field delimiter.  Single-arg
            # parenthesized print works identically on Python 2 and 3.
            print('\x01'.join([record_id, name, subject, current_age, in_custody,
                               race, sex, height, weight, hair, eyes]))
        except (AttributeError, IndexError, ValueError):
            # A page missing an expected label is skipped, but -- unlike the
            # original bare except -- genuine bugs still propagate.
            pass


if __name__ == '__main__':
    ParseHTML.run()
# The records can then be parsed into a Hive-friendly format:
python parseHTML.py output.json > somesite_person
# Then loaded into HDFS:
hdfs dfs -copyFromLocal somesite_person /user/hive/warehouse/somesite_person
-- and then schema-ified by creating a Hive table over the delimited records:
CREATE TABLE somesite_person
(
    offenderId INT,
    name       STRING,
    subject    STRING,
    currentAge STRING,
    inCustody  STRING,
    race       STRING,
    sex        STRING,
    height     STRING,
    weight     INT,    -- was lowercase 'int'; NOTE(review): the parser emits
                       -- this field as text, so confirm values are numeric
    hair       STRING,
    eyes       STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
LOCATION '/user/hive/warehouse/somesite_person'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment