Skip to content

Instantly share code, notes, and snippets.

@abelsonlive
Last active January 3, 2016 21:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abelsonlive/8522564 to your computer and use it in GitHub Desktop.
sample img crawler
from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol, JSONProtocol, RawProtocol
import json
from urlparse import urlparse
img_formats = [
'jpg',
'gif',
'png'
]
def test_link(l):
if l is None:
return None, False
elif l =='':
return None, False
else:
for format in img_formats:
if l.endswith("." + format):
return format, True
else:
continue
# return none if everything fails
return None, False
def parse_tld(l):
    """Return the host of URL *l*, with any leading 'www.' stripped.

    Note: despite the name, this returns the full network location
    (e.g. 'sub.example.com'), not just the top-level domain.
    """
    host = urlparse(l).netloc
    # Drop the common 'www.' prefix so equivalent hosts compare equal.
    return host[4:] if host.startswith('www.') else host
class ImgCrawler(MRJob):
    """MapReduce job that finds image links in crawl metadata.

    The mapper emits one record per image link discovered in a page's
    link list; the reducer groups records by image URL and counts the
    in-degree (number of pages linking to that image).
    """

    # Input is a Hadoop SequenceFile read back as text key/value pairs.
    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.SequenceFileAsTextInputFormat'
    # Emit bare JSON values (no key) in the final output.
    OUTPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, key, line):
        """
        Step through links, detect image links, emit metadata.

        Yields (image_url, metadata_dict) for every link whose href
        ends in a known image extension.
        """
        # skip empty / missing lines
        if not line:
            return
        # split into key/value pairs; split on the FIRST tab only so a
        # tab inside the JSON payload cannot break the unpacking
        source, raw_meta = line.strip().split("\t", 1)
        # get tld of source
        tld_source = parse_tld(source)
        # parse meta_data (decode assumes Python 2 byte strings)
        meta_data = json.loads(raw_meta.decode('utf-8', "ignore"))
        # tolerate records missing the 'content' / 'links' keys
        for link in meta_data.get('content', {}).get('links', []):
            # detect image links
            l = link.get('href', '')
            fmt, is_img = test_link(l)
            # only emit when we found an image file
            if is_img:
                yield l, {
                    'source': source,
                    'file': l,
                    'tld_source': tld_source,
                    'tld_target': parse_tld(l),
                    'format': fmt
                }

    def reducer(self, data_link, links_to_data):
        """
        reduce output and count incoming links

        Yields (image_url, summary) where summary carries the list of
        incoming link records and their count (in_degree).
        """
        links = list(links_to_data)
        yield data_link, {
            'file': data_link,
            # all records for one image share the same extension
            'format': links[0]['format'],
            'in_degree': len(links),
            'links': links
        }
# Script entry point: hand control to mrjob's command-line runner.
if __name__ == '__main__':
    ImgCrawler.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment