# The scrapy framework has a helper command to generate a skeleton spider inside a project, e.g.
scrapy genspider somesiteSpider somesite.com
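# (genspider is run from inside a project skeleton; assuming the project is named somesite, to match the imports below, it would be created first with:)
scrapy startproject somesite
cd somesite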
# Sometimes patterns in the URLs make data acquisition a breeze. A text file containing a list of URLs is created:
outfile = open('urls.txt', 'w')
for id in range(1, 17723):
    url = "https://somesite.com/getDetail.do?someId=" + str(id)
    outfile.write(url + '\n')
outfile.close()
# The data being captured is defined in items.py:
# items.py
import scrapy


class SomesiteItem(scrapy.Item):
    # Only the URL and the raw HTML are captured; parsing happens later.
    url = scrapy.Field()
    html = scrapy.Field()
# The scrapy spider fetches each URL and yields an item:
# spider.py
import scrapy

from somesite.items import SomesiteItem


class SomesiteSpider(scrapy.Spider):
    name = "somesiteSpider"
    allowed_domains = ["somesite.com"]

    # The start URLs come from the file generated above.
    with open('urls.txt', 'r') as inputfile:
        start_urls = [line.strip() for line in inputfile]

    def parse(self, response):
        item = SomesiteItem()
        item['url'] = response.url
        item['html'] = response.text  # the page body, decoded using the response's encoding
        yield item
# The user-agent and delay between page requests are configured in settings.py
# settings.py
USER_AGENT = '[your user agent]'
DOWNLOAD_DELAY = 5
ITEM_PIPELINES = {
    'somesite.pipelines.JsonFilePipeline': 300,
}
# The URL and HTML get written out, one JSON object per line, using a pipeline:
# pipelines.py
import json


class JsonFilePipeline(object):
    def __init__(self):
        self.file = open('output.json', 'w')

    def process_item(self, item, spider):
        # One JSON object per line.
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
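# Each line of output.json then holds one standalone JSON object, roughly of the form (values shortened for illustration):
# {"url": "https://somesite.com/getDetail.do?someId=42", "html": "<html>...</html>"}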
# The crawl is kicked off:
nohup scrapy crawl somesiteSpider &
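# With nohup's defaults, scrapy's log output lands in nohup.out, so progress can be followed with:
tail -f nohup.out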
# ... and, eventually, the raw data (in JSON format) can be retrieved from output.json.
# This can be parsed using a map-only mrjob job:
from mrjob.job import MRJob
from mrjob.protocol import RawValueProtocol
import json
import re

from bs4 import BeautifulSoup


class ParseHTML(MRJob):
    # Emit plain delimited lines rather than JSON key/value pairs.
    OUTPUT_PROTOCOL = RawValueProtocol

    idPattern = re.compile(r'https://somesite\.com/getDetail\.do\?someId=([0-9]+)')

    def mapper(self, _, line):
        record = json.loads(line)
        if record['url'] != 'https://somesite.com/notAnInteresingPage.do':
            try:
                url = record['url']
                html = record['html']
                soup = BeautifulSoup(html, 'html.parser')
                id = self.idPattern.findall(url)[0]
                name = soup.find(text=' Name ').parent.nextSibling.nextSibling.getText()
                subject = soup.find(text='Subject').parent.nextSibling.nextSibling.getText().strip()
                currentAge = soup.find(text='Current Age ').parent.nextSibling.nextSibling.getText().strip()
                inCustody = soup.find(text='In Custody').parent.nextSibling.nextSibling.getText().strip()
                race, sex, height, weight, hair, eyes = [elem.getText().split()[0] for elem in soup.findAll('td', {'class': 'tableOne'})]
                # Fields are joined with \001, which the Hive table below uses as its delimiter.
                yield None, chr(1).join([id, name, subject, currentAge, inCustody, race, sex, height, weight, hair, eyes])
            except Exception:
                # Pages that don't match the expected layout are skipped.
                pass


if __name__ == '__main__':
    ParseHTML.run()
# The records can then be parsed into a Hive-friendly format:
python parseHTML.py output.json > somesite_person
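# (If output.json were first copied up to HDFS, the same job could also run on the cluster via mrjob's Hadoop runner; the HDFS path here is only illustrative:)
python parseHTML.py -r hadoop hdfs:///tmp/output.json > somesite_person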
# Then loaded into HDFS:
hdfs dfs -copyFromLocal somesite_person /user/hive/warehouse/somesite_person
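# A quick listing confirms the file landed where the Hive table will look for it:
hdfs dfs -ls /user/hive/warehouse/somesite_person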
# and then schema-ified by creating a Hive table:
CREATE TABLE somesite_person
(
    offenderId INT,
    name STRING,
    subject STRING,
    currentAge STRING,
    inCustody STRING,
    race STRING,
    sex STRING,
    height STRING,
    weight INT,
    hair STRING,
    eyes STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
LOCATION '/user/hive/warehouse/somesite_person';
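# A quick sanity check (assuming the hive CLI is available) confirms the rows are queryable:
hive -e "SELECT COUNT(*) FROM somesite_person;"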