# abelsonlive/img_crawler.py

## img_crawler.py
from mrjob.job import MRJob
from mrjob.protocol import JSONValueProtocol, JSONProtocol, RawProtocol
import json
from urlparse import urlparse

# file extensions (without the leading dot) that test_link treats as images
img_formats = ['jpg', 'gif', 'png']

def test_link(l):
  """
  Check whether a link ends with one of the extensions in img_formats.

  l: the link target (href) to inspect; may be None or empty.

  Returns a (format, found) pair. When `l` ends with "." followed by an
  entry of img_formats, the pair is (that entry, True). The comparison
  ignores case, so "photo.JPG" is reported as 'jpg'. Otherwise the pair is
  (None, False).
  """
  # None and '' can never name a file
  if not l:
    return None, False

  # extensions are often written in upper or mixed case, so compare lowered
  lowered = l.lower()
  for fmt in img_formats:
    if lowered.endswith("." + fmt.lower()):
      return fmt, True

  # return none if everything fails
  return None, False

def parse_tld(l):
  """
  Return the network location of url `l` with a leading 'www.' removed.

  Despite the name, the whole netloc reported by urlparse is returned, not
  only the top-level domain. A url without a netloc yields ''.
  """
  prefix = 'www.'
  host = urlparse(l).netloc
  return host[len(prefix):] if host.startswith(prefix) else host

class ImgCrawler(MRJob):
  """
  MapReduce job that finds links to image files and counts, for each image
  url, how many links point at it.

  Every input line is expected to hold a source url and a JSON metadata
  document separated by a tab. Lines without that shape, or whose metadata
  is not valid JSON, are counted and skipped so that a single bad record
  cannot abort the whole job.
  """
  HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.SequenceFileAsTextInputFormat'
  OUTPUT_PROTOCOL = JSONValueProtocol

  # counter group under which skipped records are reported
  _COUNTER_GROUP = 'ImgCrawler'

  def mapper(self, key, line):
    """
    Step through links, detect data sources, store metadata

    key: not used.
    line: one input record, "<source url><TAB><JSON metadata>"; may be None.

    Yields one (image url, row) pair for every entry of
    meta_data['content']['links'] whose 'href' is accepted by test_link.
    The row holds the source url, the image url, the result of parse_tld
    for both, and the detected format.
    """
    # check if line exists
    if line is None:
      return

    # split into key/value pairs. Only the first tab separates them, so a
    # blank line or a missing/extra tab no longer raises while unpacking.
    source, tab, payload = line.strip().partition("\t")
    if not tab:
      self.increment_counter(self._COUNTER_GROUP, 'malformed_line', 1)
      return

    # parse meta_data; byte strings are decoded first, dropping bad bytes
    if isinstance(payload, bytes):
      payload = payload.decode('utf-8', "ignore")
    try:
      meta_data = json.loads(payload)
    except ValueError:
      self.increment_counter(self._COUNTER_GROUP, 'invalid_json', 1)
      return

    # check for existence of necessary keys
    content = meta_data.get('content') if isinstance(meta_data, dict) else None
    if not isinstance(content, dict):
      return

    # get tld of source
    tld_source = parse_tld(source)

    # loop through links ('links' may be missing or null)
    for link in content.get('links') or []:
      if not isinstance(link, dict):
        continue

      # detect data
      href = link.get('href', '')
      fmt, is_data = test_link(href)

      # only links to data files produce a row
      if not is_data:
        continue

      # emit row
      yield href, {
        'source': source,
        'file': href,
        'tld_source': tld_source,
        'tld_target': parse_tld(href),
        'format': fmt
      }

  def reducer(self, data_link, links_to_data):
    """
    reduce output and count incoming links

    data_link: the image url the rows were grouped by.
    links_to_data: iterable of the rows emitted by the mapper for that url.

    Yields a single (data_link, summary) pair. The summary carries the url,
    the format taken from the first row, the number of rows as 'in_degree',
    and the rows themselves.
    """
    # materialise the rows: they are counted and also stored in the output
    links = list(links_to_data)
    yield data_link, {
      'file': data_link,
      'format': links[0]['format'],
      'in_degree': len(links),
      'links': links
    }

# run the job only when this file is executed as a script
if __name__ == '__main__':
  ImgCrawler.run()
	from mrjob.job import MRJob
	from mrjob.protocol import JSONValueProtocol, JSONProtocol, RawProtocol
	import json
	from urlparse import urlparse

	# file extensions (without the leading dot) that test_link treats as images
	img_formats = ['jpg', 'gif', 'png']

	def test_link(l):
	  """
	  Check whether a link ends with one of the extensions in img_formats.

	  l: the link target (href) to inspect; may be None or empty.

	  Returns a (format, found) pair. When `l` ends with "." followed by an
	  entry of img_formats, the pair is (that entry, True). The comparison
	  ignores case, so "photo.JPG" is reported as 'jpg'. Otherwise the pair is
	  (None, False).
	  """
	  # None and '' can never name a file
	  if not l:
	    return None, False

	  # extensions are often written in upper or mixed case, so compare lowered
	  lowered = l.lower()
	  for fmt in img_formats:
	    if lowered.endswith("." + fmt.lower()):
	      return fmt, True

	  # return none if everything fails
	  return None, False

	def parse_tld(l):
	  """
	  Return the network location of url `l` with a leading 'www.' removed.

	  Despite the name, the whole netloc reported by urlparse is returned, not
	  only the top-level domain. A url without a netloc yields ''.
	  """
	  link = urlparse(l).netloc
	  if link.startswith('www.'):
	    # drop the four characters of the 'www.' prefix
	    return link[4:]
	  return link

	class ImgCrawler(MRJob):
	  """
	  MapReduce job that finds links to image files and counts, for each image
	  url, how many links point at it.

	  Every input line is expected to hold a source url and a JSON metadata
	  document separated by a tab. Lines without that shape, or whose metadata
	  is not valid JSON, are counted and skipped so that a single bad record
	  cannot abort the whole job.
	  """
	  HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.SequenceFileAsTextInputFormat'
	  OUTPUT_PROTOCOL = JSONValueProtocol

	  # counter group under which skipped records are reported
	  _COUNTER_GROUP = 'ImgCrawler'

	  def mapper(self, key, line):
	    """
	    Step through links, detect data sources, store metadata

	    key: not used.
	    line: one input record, "<source url><TAB><JSON metadata>"; may be None.

	    Yields one (image url, row) pair for every entry of
	    meta_data['content']['links'] whose 'href' is accepted by test_link.
	    The row holds the source url, the image url, the result of parse_tld
	    for both, and the detected format.
	    """
	    # check if line exists
	    if line is None:
	      return

	    # split into key/value pairs. Only the first tab separates them, so a
	    # blank line or a missing/extra tab no longer raises while unpacking.
	    source, tab, payload = line.strip().partition("\t")
	    if not tab:
	      self.increment_counter(self._COUNTER_GROUP, 'malformed_line', 1)
	      return

	    # parse meta_data; byte strings are decoded first, dropping bad bytes
	    if isinstance(payload, bytes):
	      payload = payload.decode('utf-8', "ignore")
	    try:
	      meta_data = json.loads(payload)
	    except ValueError:
	      self.increment_counter(self._COUNTER_GROUP, 'invalid_json', 1)
	      return

	    # check for existence of necessary keys
	    content = meta_data.get('content') if isinstance(meta_data, dict) else None
	    if not isinstance(content, dict):
	      return

	    # get tld of source
	    tld_source = parse_tld(source)

	    # loop through links ('links' may be missing or null)
	    for link in content.get('links') or []:
	      if not isinstance(link, dict):
	        continue

	      # detect data
	      href = link.get('href', '')
	      fmt, is_data = test_link(href)

	      # only links to data files produce a row
	      if not is_data:
	        continue

	      # emit row
	      yield href, {
	        'source': source,
	        'file': href,
	        'tld_source': tld_source,
	        'tld_target': parse_tld(href),
	        'format': fmt
	      }

	  def reducer(self, data_link, links_to_data):
	    """
	    reduce output and count incoming links

	    data_link: the image url the rows were grouped by.
	    links_to_data: iterable of the rows emitted by the mapper for that url.

	    Yields a single (data_link, summary) pair. The summary carries the url,
	    the format taken from the first row, the number of rows as 'in_degree',
	    and the rows themselves.
	    """
	    # materialise the rows: they are counted and also stored in the output
	    links = list(links_to_data)
	    yield data_link, {
	      'file': data_link,
	      'format': links[0]['format'],
	      'in_degree': len(links),
	      'links': links
	    }

	# run the job only when this file is executed as a script
	if __name__ == '__main__':
	  ImgCrawler.run()