Skip to content

Instantly share code, notes, and snippets.

@mpenkov
Last active June 9, 2016 16:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mpenkov/401f62731709cd05ca3de860d8ee7d17 to your computer and use it in GitHub Desktop.
Save mpenkov/401f62731709cd05ca3de860d8ee7d17 to your computer and use it in GitHub Desktop.
"""Check whether each line contains tab-separated decodable JSON."""
import sys
import json
import logging
# Only problems are reported; lines that validate produce no output.
logging.basicConfig(level=logging.ERROR)

# Each input line is expected to be "<json>\t<json>".  Line numbers are
# 1-based so they match what a text editor shows.
for lineno, raw in enumerate(sys.stdin, 1):
    try:
        # Unpacking raises ValueError when the line contains no tab.
        key, value = raw.split("\t", 1)
        for blob in (key, value):
            json.loads(blob)
    except ValueError as err:
        logging.error("badness on line %d", lineno)
        logging.error("<line>")
        logging.error("%r", raw)
        logging.error("</line>")
        logging.exception(err)
import urlparse
import urllib
import lxml.html
import lxml.html.clean
import lxml.etree
import itertools
import re
import logging
import gzip
import boto
import warc
import ssl
import time
import socket
import sys
import traceback
from boto.s3.key import Key
from gzipstream import GzipStreamFile
from mrjob.job import MRJob
# Module-level logger, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
#
# Echo log records to stdout.  This will cause logging messages to appear
# twice during local runs, but on EMR runs, they will appear only once.
#
logger.addHandler(logging.StreamHandler(sys.stdout))

# The max number of contact links to keep per domain.
MAX_NUM_KEEP = 5
# Multiplied by the attempt number to get the pause (in seconds) before the
# mapper retries after a network error.
SLEEP_TIME = 5
# Default for the mapper's ``num_retries`` parameter.
NUM_RETRIES = 5
# process_record only handles non-homepage records whose score is below this.
WORST_SCORE = 100
def read_from_s3(line):
    """Open the WARC file stored under S3 key ``line``.

    Used when running on EC2 or a Hadoop cluster.  The key is looked up in
    the ``aws-publicdatasets`` bucket using anonymous credentials, and the
    returned ``warc.WARCFile`` reads from it through ``GzipStreamFile``.
    """
    logger.info('Reading from Amazon S3')
    # anon=True: no AWS credentials are sent with the request.
    bucket = boto.connect_s3(anon=True).get_bucket('aws-publicdatasets')
    warc_key = Key(bucket, line)
    return warc.WARCFile(fileobj=GzipStreamFile(warc_key))
def read_from_local(line):
    """Open the gzipped WARC file at local path ``line``.

    Used for local runs; returns a ``warc.WARCFile`` wrapping the
    ``gzip.open`` stream.
    """
    logger.info('Loading local file {}'.format(line))
    stream = gzip.open(line)
    return warc.WARCFile(fileobj=stream)
class CCJob(MRJob):
    """Base job whose mapper opens one WARC file per input line.

    Subclasses provide the per-record logic by overriding ``process_record``.
    """

    def process_record(self, record):
        """Turn one WARC record into an iterable of ``(key, value)`` pairs.

        Override process_record with your mapper; the base implementation
        always raises ``NotImplementedError``.
        """
        raise NotImplementedError('Process record needs to be customized')

    def mapper(self, _, fpath, num_retries=NUM_RETRIES):
        """Yield every ``(key, value)`` pair produced for the records of a file.

        ``fpath`` is an S3 key, or a local path when it starts with the
        prefix ``"local "``.  ``num_retries`` is the maximum number of
        attempts made at reading the file.

        We're loading a gzipped WARC file from S3, and several things could
        go wrong.

        We could trip over a network error.  In that case, we sleep for a
        while and retry.  The sleep interval increases after each failure to
        reduce the stress on the network.  Since we retry the entire file,
        pairs yielded before the failure are yielded again, so duplicates
        have to be handled later, in the reducer.

        We could also trip over a malformed WARC file.  We can't do anything
        here, so give up on the remainder of the file completely.  Any other
        exception is re-raised.
        """
        if fpath.startswith("local "):
            fpath = fpath.split(" ", 1)[1]
            read = read_from_local
        else:
            read = read_from_s3

        try:
            for attempt in xrange(1, num_retries + 1):
                try:
                    for record in read(fpath):
                        for key, value in self.process_record(record):
                            yield key, value
                        self.increment_counter(
                            'commoncrawl', 'processed_records', 1
                        )
                    # The whole file was read: no further attempts needed.
                    break
                except (ssl.SSLError, socket.error):
                    #
                    # ssl.SSLError: The read operation timed out
                    # socket.error: [Errno 104] Connection reset by peer
                    #
                    if attempt == num_retries:
                        # Out of attempts: give up right away instead of
                        # sleeping before a retry that will never happen.
                        self.increment_counter(
                            'commoncrawl', 'failed_downloads', 1
                        )
                        logger.error(
                            "failed to download %s after %d attempts",
                            fpath, num_retries
                        )
                    else:
                        logger.error("encountered network error, retrying")
                        time.sleep(attempt * SLEEP_TIME)
        except Exception:
            #
            # The WARC parser raises IOError if it encounters a problem.
            # If our IOError comes from the WARC parser, then give up on the
            # file.  If it comes from somewhere else, we have a different
            # problem to deal with.
            #
            # http://stackoverflow.com/questions/1095601/find-module-name-of-the-originating-exception-in-python
            #
            exc_tb = sys.exc_info()[2]
            # File name of the innermost frame, i.e. where the exception
            # was actually raised.
            filename = traceback.extract_tb(exc_tb)[-1][0]
            if "/warc/" in filename:
                logger.error("Malformed WARC file: %s, giving up", fpath)
                self.increment_counter('commoncrawl', 'bad_warc_files', 1)
            else:
                #
                # This isn't the exception you're looking for.
                #
                raise
def squash(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    mrjob splits data on newlines, and then decodes each line according to
    the specified protocol.  If the data contains newlines, it will be
    over-segmented and cause problems for the protocol parsers.  Therefore,
    we must make sure our data doesn't contain newlines.

    The pattern matches runs of length one as well, so that a lone newline
    (or tab) is replaced too, not only runs of two or more characters.
    """
    return re.sub(r"\s+", " ", text)
class MrStageA(CCJob):
    """Job stage that extracts title, headings and text from HTML responses."""

    def process_record(self, record):
        """Yield ``(domain, (score, output_dict))`` for an HTML response record.

        Records that are not HTTP responses, or whose headers do not declare
        ``Content-Type: text/html``, yield nothing.  A response payload
        without the blank line separating headers from body is logged,
        counted and skipped instead of raising.
        """
        if record['Content-Type'] != 'application/http; msgtype=response':
            return

        uri = record["WARC-Target-URI"]
        parsed = urlparse.urlparse(uri)
        domain = parsed.netloc
        # The score is constant here, so the check below always passes.
        score = 0
        is_homepage = parsed.path in ["", "/"]
        if not (is_homepage or score < WORST_SCORE):
            return

        payload = record.payload.read()
        #
        # The HTTP response is defined by a specification: first
        # part is headers (metadata) and then following two CRLFs
        # (newlines) has the data for the response
        #
        # partition() never fails: a missing separator shows up as an empty
        # ``separator`` rather than as an unpacking ValueError.
        headers, separator, body = payload.partition('\r\n\r\n')
        if not separator:
            logger.error("No header/body separator in response: %s", uri)
            self.increment_counter('commoncrawl', 'malformed_responses', 1)
            return

        if 'Content-Type: text/html' in headers:
            try:
                uri = to_utf8(uri)
                output_dict = split_html(uri, body)
                key, value = domain, (score, output_dict)
                yield key, value
            except (UnicodeEncodeError, UnicodeDecodeError):
                logger.error("Unicode encode/decode error: %s", uri)
            # NOTE(review): CCJob.mapper increments this same counter once
            # per record -- confirm that counting here as well is intended.
            self.increment_counter('commoncrawl', 'processed_records', 1)
def clean_html(html):
    """Return ``html`` with the parts unnecessary for processing removed.

    The input is converted to a unicode string before cleaning; byte strings
    that cannot be converted implicitly are decoded as UTF-8, with
    undecodable bytes replaced.
    """
    cleaner = lxml.html.clean.Cleaner(
        scripts=True, javascript=True, comments=True, style=True,
        links=True, meta=True, page_structure=False,
        processing_instructions=True, embedded=True, frames=False,
        forms=False, annoying_tags=True,
        kill_tags=["map", "base", "iframe", "select", "noscript"]
    )
    #
    # Given a unicode string, the cleaner always returns a unicode string.
    # Given a byte string, its output is a utf8-encoded byte string or an
    # ascii string, depending on what was in the input.  Always handing it
    # unicode keeps things simple.
    #
    try:
        markup = unicode(html)
    except UnicodeDecodeError:
        markup = html.decode("utf-8", "replace")
    return cleaner.clean_html(markup)
def to_utf8(s):
    """Return ``s`` as a UTF-8 encoded byte string.

    Makes sure values are safely encoded to UTF-8 to prevent crashes later
    on down the line.

    ``s`` may be a unicode string or a byte string.  For a byte string we
    can't know its encoding for sure, so we just assume it's UTF-8; in the
    worst case, the bytes that are not valid UTF-8 end up being replaced.

    TODO: This shouldn't be necessary since we originally decode from UTF-8.
    """
    # Test the type explicitly instead of waiting for the implicit ASCII
    # decode inside str.encode to fail: the result is the same, and it also
    # works where byte strings have no encode() method.
    if isinstance(s, bytes):
        s = s.decode("utf-8", "replace")
    return s.encode("utf-8", "replace")
def split_html(url, html):
    """Break an HTML document into title, headings and text.

    The pieces are meant for feeding into ElasticSearch.  On success the
    result has the keys ``url``, ``title``, ``headings`` and ``text``.  When
    the document cannot be parsed, the result has only ``url`` and
    ``error``.  Exceptions that are neither a ``UnicodeDecodeError`` nor
    raised from inside lxml are propagated.
    """
    try:
        root = lxml.html.document_fromstring(clean_html(html))

        # Stays an empty list when the document has no <title> text.
        html_title = root.xpath("//title/text()")
        if html_title:
            first_title = html_title[0].strip()
            html_title = squash(to_utf8(urllib.unquote(first_title)))

        # All h1 elements first, then all h2, and so on.
        heading_texts = [
            to_utf8(elt.text.strip())
            for tag in ("h1", "h2", "h3", "h4")
            for elt in root.iterfind(".//" + tag)
            if elt.text
        ]
        html_headings = [squash(h) for h in heading_texts if h]

        parts = root.xpath(".//text()")
        if root.tail:
            parts.append(root.tail)
        joined = squash(" ".join(parts)).strip()
        html_text = to_utf8(urllib.unquote(joined))

        return {
            "url": url,
            "title": html_title,
            "headings": html_headings,
            "text": html_text
        }
    except UnicodeDecodeError:
        logger.error("Bad html string from url: %s" % url)
        return {"url": to_utf8(url), "error": "unable to parse html"}
    except Exception:
        # Look at the innermost traceback frame to see where the exception
        # was raised.
        last_frame = traceback.extract_tb(sys.exc_info()[2])[-1]
        if "/lxml/" not in last_frame[0]:
            #
            # This isn't the exception you're looking for.
            #
            raise
        logger.error("Bad html string from url: %s" % url)
        return {"url": to_utf8(url), "error": "unable to parse html"}
# Launch the job only when this file is executed as a script.
if __name__ == "__main__":
    MrStageA.run()
# Runner configuration for launching the job on EMR.
# NOTE(review): no nesting indentation is visible in this copy; the keys below
# presumably belong under runners -> emr -- confirm before use.
runners:
emr:
aws_region: us-west-2
# Either set the environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
# or set the two variables below
# aws_access_key_id:
# aws_secret_access_key:
# For more control, it's highly recommended to add your key pair
ec2_key_pair: your-key-pair
ec2_key_pair_file: your-key-file
#ssh_tunnel_to_job_tracker: true
ec2_instance_type: m1.large
ec2_master_instance_type: m1.large
# ec2_master_instance_bid_price: '0.1'
# ec2_core_instance_bid_price: '0.1'
# EMR allows a max of 20 EC2 instances per AWS account, including master
num_ec2_instances: 19
# EMR comes with Python 2.6 by default -- installing Python 2.7 takes a while but might be necessary
# We also install packages needed for streaming compressed files from S3 or reading WARC files
# There's a newer AMI version but it has issues with the released stable mrjob
ami_version: 3.0.4
interpreter: python2.7
bootstrap:
# NOTE(review): the first yum command passes -y twice; the duplicate looks redundant.
- sudo yum -y --releasever=2014.09 install -y python27 python27-devel gcc-c++
- sudo yum -y --releasever=2014.09 install libxml2 libxml2-devel libxslt libxslt-devel
# NOTE(review): get-pip.py is fetched over plain http with certificate checks
# disabled and is then executed via sudo -- confirm this is acceptable.
- wget --no-check-certificate http://bootstrap.pypa.io/get-pip.py
- sudo python2.7 get-pip.py
- sudo pip2.7 install boto mrjob simplejson warc lxml certifi nose
- sudo pip2.7 install https://github.com/commoncrawl/gzipstream/archive/master.zip
# Placeholder bucket: replace with an S3 location you own.
s3_tmp_dir: s3://your-bucket/tmp
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00000-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00001-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00002-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00003-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00004-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00005-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00006-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00007-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00008-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00009-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00010-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00011-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00012-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00013-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00014-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00015-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00016-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00017-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00018-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00019-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00020-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00021-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00022-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00023-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00024-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00025-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00026-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00027-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00028-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00029-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00030-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00031-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00032-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00033-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00034-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00035-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00036-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00037-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00038-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00039-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00040-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00041-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00042-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00043-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00044-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00045-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00046-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00047-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00048-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00049-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00050-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00051-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00052-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00053-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00054-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00055-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00056-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00057-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00058-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00059-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00060-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00061-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00062-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00063-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00064-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00065-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00066-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00067-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00068-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00069-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00070-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00071-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00072-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00073-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00074-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00075-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00076-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00077-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00078-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00079-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00080-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00081-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00082-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00083-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00084-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00085-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00086-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00087-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00088-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00089-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00090-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00091-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00092-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00093-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00094-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00095-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00096-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00097-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00098-ip-10-180-136-8.ec2.internal.warc.gz
common-crawl/crawl-data/CC-MAIN-2014-35/segments/1408500800168.29/warc/CC-MAIN-20140820021320-00099-ip-10-180-136-8.ec2.internal.warc.gz
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment