Stephen Merity Smerity

## babi_rnn.py
from __future__ import absolute_import
from __future__ import print_function
from functools import reduce
import re
import tarfile

import numpy as np
np.random.seed(1337)  # for reproducibility

# As such, I agree strongly with you that this won't make a good test dataset for testing various RNN architectures.
from keras.callbacks import EarlyStopping

## fetch_page.py
import gzip
import json
import requests
# Python 2 compatibility shim: use cStringIO when it is importable and fall
# back to the StringIO module otherwise.
try:
    from cStringIO import StringIO
except ImportError:
    # Only a missing module should trigger the fallback; a bare `except:`
    # would also swallow KeyboardInterrupt/SystemExit and unrelated errors.
    from StringIO import StringIO

# Let's fetch the Common Crawl FAQ using the CC index
# The query string asks the CC-MAIN-2015-27 index for the URL-encoded page
# http://commoncrawl.org/faqs/ and requests JSON output.
resp = requests.get('http://index.commoncrawl.org/CC-MAIN-2015-27-index?url=http%3A%2F%2Fcommoncrawl.org%2Ffaqs%2F&output=json')

## uniq_tasks_10k.txt
Unique samples in tasks_1-20_v1-2/en-10k/qa10_indefinite-knowledge_{}.txt
Train length: 9989
Test length: 1000
Intersection: 0
Unique samples in tasks_1-20_v1-2/en-10k/qa11_basic-coreference_{}.txt
Train length: 9827
Test length: 997
Intersection: 25
Unique samples in tasks_1-20_v1-2/en-10k/qa12_conjunction_{}.txt
Train length: 9991

## index.html
<!DOCTYPE html>
<!-- saved from url=(0072)https://np.reddit.com/r/pics/comments/3byu3f/rpics_is_no_longer_private/ -->
<html xmlns="http://www.w3.org/1999/xhtml" lang="np" xml:lang="np" class=" js cssanimations csstransforms res-parents-down res-commentBoxes res-commentBoxes-rounded res res-v430 res-navTop"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>/r/pics is no longer private : pics</title><meta name="keywords" content=" reddit, reddit.com, vote, comment, submit "><meta name="description" content="Hey guys --- #Woah, you aren;t private anymore Yep, not an excuse to shitpost though #Why are you open The admins have opened a line of..."><meta name="referrer" content="always"><link rel="alternate" media="only screen and (max-width: 640px)" href="https://m.reddit.com/r/pics/comments/3byu3f/rpics_is_no_longer_private/"><meta name="viewport" content="width=1024"><link rel="shorturl" href="http://redd.it/3byu3f"><meta property="og:image" content="https://www.redditstatic.com/ic

## get_all_urls.py
import requests

# URL templates for the CC-MAIN-2015-18 index. Both take a {query} URL pattern
# and request JSON output; `show_pages` adds showNumPages=true, while
# `get_page` selects a single result page via {page}.
show_pages = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&showNumPages=true'
get_page = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&page={page}'

# URL pattern to look up; the trailing /* is a wildcard in the query.
query = 'nytimes.com/*'
# Ask the index about paging for this query and read the 'pages' field from
# the JSON response.
show = requests.get(show_pages.format(query=query))
pages = show.json()['pages']

# Starts empty; presumably filled by the rest of the script (not visible here).
results = set()

## gist:3ad8f76bc78b65ef3090
{
   "Container" : {
      "Offset" : "1001602245",
      "Filename" : "CC-MAIN-20140728011800-00009-ip-10-146-231-18.ec2.internal.warc.gz",
      "Compressed" : true,
      "Gzip-Metadata" : {
         "Header-Length" : "10",
         "Footer-Length" : "8",
         "Inflated-CRC" : "-645434788",
         "Inflated-Length" : "26275",

## gist:e3ad94df81f43e07b538
{
   "Envelope" : {
      "Format" : "WARC",
      "WARC-Header-Metadata" : {
         "WARC-Record-ID" : "<urn:uuid:9b05aa37-7e35-40b8-9c4f-20095499bf4a>",
         "WARC-Payload-Digest" : "sha1:3WQOSKRIGPS6SKU3T6JVERTORDS3JRJP",
         "Content-Type" : "application/http; msgtype=response",
         "WARC-Concurrent-To" : "<urn:uuid:ef72d54d-3b9e-4db2-87d4-ded219612ce4>",
         "Content-Length" : "21248",
         "WARC-Type" : "response",

## save-my-sd.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                Smerity
                / save-my-sd.md
            
            
              Last active
              August 29, 2015 14:10
            
              
                Saving the photos on my SD card
              
          
    The SD card with all my Barcelona photos is dying, horrifically. This is my hacky attempt to save them.
The I/O errors appear to be transient: retrying the read on a failed file eventually works. When copying via `cp IMG_62*.CR2 /dest/ 2> /tmp/bad` I end up with:
cp: failed to extend ‘/home/smerity/Pictures/Photos/FailingSDBarcelona/raw/IMG_6275.CR2’: Input/output error
cp: error reading ‘IMG_6279.CR2’: Input/output error
cp: failed to extend ‘/home/smerity/Pictures/Photos/FailingSDBarcelona/raw/IMG_6279.CR2’: Input/output error
cp: error reading ‘IMG_6280.CR2’: Input/output error
cp: failed to extend ‘/home/smerity/Pictures/Photos/FailingSDBarcelona/raw/IMG_6280.CR2’: Input/output error

cp: error reading ‘IMG_6281.CR2’: Input/output error

  
## bbc.pretty.wat
{
   "Envelope" : {
      "WARC-Header-Length" : "578",
      "Block-Digest" : "sha1:YHKQUSBOS4CLYFEKQDVGJ457OAPD6IJO",
      "Format" : "WARC",
      "Actual-Content-Length" : "43428",
      "WARC-Header-Metadata" : {
         "WARC-Record-ID" : "<urn:uuid:ffbfb0c0-6456-42b0-af03-3867be6fc09f>",
         "WARC-Warcinfo-ID" : "<urn:uuid:3169ca8e-39a6-42e9-a4e3-9f001f067bdf>",
         "Content-Length" : "43428",

## gist:00d32fe628a352412704
{
   "Envelope" : {
      "WARC-Header-Metadata" : {
         "WARC-Target-URI" : "http://11870.com/lalangosta/contacts",
         "WARC-Block-Digest" : "sha1:IVGBI6YRWEWT5XAYLRYUC6Z6JOVC32LI",
         "WARC-Type" : "response",
         "WARC-Concurrent-To" : "<urn:uuid:7282548d-aceb-4966-aef7-2f2e7f8a15e8>",
         "WARC-Warcinfo-ID" : "<urn:uuid:3169ca8e-39a6-42e9-a4e3-9f001f067bdf>",
         "WARC-Payload-Digest" : "sha1:TXA7ZK4KTJ3THARCNUBT4HLU2XHBF7VF",
         "WARC-Date" : "2014-08-02T08:36:05Z",
	from __future__ import absolute_import
	from __future__ import print_function
	from functools import reduce
	import re
	import tarfile

	import numpy as np
	np.random.seed(1337) # for reproducibility

	bAs such, I agree strongly with you that this won't make a good test dataset for testing various RNN architectures.from keras.callbacks import EarlyStopping
	import gzip
	import json
	import requests
	try:
	from cStringIO import StringIO
	except:
	from StringIO import StringIO

	# Let's fetch the Common Crawl FAQ using the CC index
	resp = requests.get('http://index.commoncrawl.org/CC-MAIN-2015-27-index?url=http%3A%2F%2Fcommoncrawl.org%2Ffaqs%2F&output=json')
	Unique samples in tasks_1-20_v1-2/en-10k/qa10_indefinite-knowledge_{}.txt
	Train length: 9989
	Test length: 1000
	Intersection: 0
	Unique samples in tasks_1-20_v1-2/en-10k/qa11_basic-coreference_{}.txt
	Train length: 9827
	Test length: 997
	Intersection: 25
	Unique samples in tasks_1-20_v1-2/en-10k/qa12_conjunction_{}.txt
	Train length: 9991
	<!DOCTYPE html>
	<!-- saved from url=(0072)https://np.reddit.com/r/pics/comments/3byu3f/rpics_is_no_longer_private/ -->
	<html xmlns="http://www.w3.org/1999/xhtml" lang="np" xml:lang="np" class=" js cssanimations csstransforms res-parents-down res-commentBoxes res-commentBoxes-rounded res res-v430 res-navTop"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>/r/pics is no longer private : pics</title><meta name="keywords" content=" reddit, reddit.com, vote, comment, submit "><meta name="description" content="Hey guys --- #Woah, you aren;t private anymore Yep, not an excuse to shitpost though #Why are you open The admins have opened a line of..."><meta name="referrer" content="always"><link rel="alternate" media="only screen and (max-width: 640px)" href="https://m.reddit.com/r/pics/comments/3byu3f/rpics_is_no_longer_private/"><meta name="viewport" content="width=1024"><link rel="shorturl" href="http://redd.it/3byu3f"><meta property="og:image" content="https://www.redditstatic.com/ic
	import requests

	show_pages = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&showNumPages=true'
	get_page = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&page={page}'

	query = 'nytimes.com/*'
	show = requests.get(show_pages.format(query=query))
	pages = show.json()['pages']

	results = set()
	{
	"Container" : {
	"Offset" : "1001602245",
	"Filename" : "CC-MAIN-20140728011800-00009-ip-10-146-231-18.ec2.internal.warc.gz",
	"Compressed" : true,
	"Gzip-Metadata" : {
	"Header-Length" : "10",
	"Footer-Length" : "8",
	"Inflated-CRC" : "-645434788",
	"Inflated-Length" : "26275",
	{
	"Envelope" : {
	"Format" : "WARC",
	"WARC-Header-Metadata" : {
	"WARC-Record-ID" : "<urn:uuid:9b05aa37-7e35-40b8-9c4f-20095499bf4a>",
	"WARC-Payload-Digest" : "sha1:3WQOSKRIGPS6SKU3T6JVERTORDS3JRJP",
	"Content-Type" : "application/http; msgtype=response",
	"WARC-Concurrent-To" : "<urn:uuid:ef72d54d-3b9e-4db2-87d4-ded219612ce4>",
	"Content-Length" : "21248",
	"WARC-Type" : "response",
	{
	"Envelope" : {
	"WARC-Header-Length" : "578",
	"Block-Digest" : "sha1:YHKQUSBOS4CLYFEKQDVGJ457OAPD6IJO",
	"Format" : "WARC",
	"Actual-Content-Length" : "43428",
	"WARC-Header-Metadata" : {
	"WARC-Record-ID" : "<urn:uuid:ffbfb0c0-6456-42b0-af03-3867be6fc09f>",
	"WARC-Warcinfo-ID" : "<urn:uuid:3169ca8e-39a6-42e9-a4e3-9f001f067bdf>",
	"Content-Length" : "43428",
	{
	"Envelope" : {
	"WARC-Header-Metadata" : {
	"WARC-Target-URI" : "http://11870.com/lalangosta/contacts",
	"WARC-Block-Digest" : "sha1:IVGBI6YRWEWT5XAYLRYUC6Z6JOVC32LI",
	"WARC-Type" : "response",
	"WARC-Concurrent-To" : "<urn:uuid:7282548d-aceb-4966-aef7-2f2e7f8a15e8>",
	"WARC-Warcinfo-ID" : "<urn:uuid:3169ca8e-39a6-42e9-a4e3-9f001f067bdf>",
	"WARC-Payload-Digest" : "sha1:TXA7ZK4KTJ3THARCNUBT4HLU2XHBF7VF",
	"WARC-Date" : "2014-08-02T08:36:05Z",