
Stephen Merity (Smerity)

@Smerity
Smerity / babi_rnn.py
Created August 17, 2015 11:32
Epoch tuning through early stopping for a bAbI RNN in Keras
from __future__ import absolute_import
from __future__ import print_function
from functools import reduce
import re
import tarfile
import numpy as np
np.random.seed(1337) # for reproducibility
from keras.callbacks import EarlyStopping
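
The preview above cuts off at the imports; below is a minimal sketch of how EarlyStopping is typically wired into training. It uses current Keras call signatures and a placeholder dense model on random data, not the gist's actual bAbI RNN.

# Sketch only: placeholder model and random data, not the gist's bAbI setup.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

X_train = np.random.random((1000, 100))          # stand-in for vectorised stories
y_train = np.random.randint(2, size=(1000, 1))   # stand-in for answers

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(100,)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')

# Stop once validation loss stops improving instead of hand-tuning the epoch count.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
model.fit(X_train, y_train, epochs=40, validation_split=0.1,
          callbacks=[early_stopping])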
@Smerity
Smerity / knn.cpp
Created April 7, 2014 04:37
KNN C++ implementation for Kaggle LSHTC
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <sstream>
#include <unordered_map>
#include <vector>
@Smerity
Smerity / failed_logins
Created June 6, 2017 22:09
List of failed SSH logins produced by `egrep -o "invalid user ([^ ]+?) " /var/log/auth.log | cut -d ' ' -f 3 | sort | uniq -c | sort -nk 1`
1 .+?
1 [^
2 0000
2 010101
2 1111
2 1234
2 12345
2 666666
2 adm
2 anna
@Smerity
Smerity / cartpole.py
Last active May 26, 2017 13:47
Script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and rejection sampling of historical memories
''' Script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and rejection sampling of historical memories '''
import gym
import numpy as np
import chainer
from chainer import optimizers
from chainer import ChainList, Variable
import chainer.functions as F
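
Only the imports survive in this preview; the following is a rough sketch of what a two-layer MLP policy with dropout might look like in Chainer. The layer sizes, dropout ratio, and use of chainer.links are assumptions, not the gist's actual network.

# Sketch only: a two-layer MLP policy for CartPole (4 observations -> 2 actions).
import numpy as np
import chainer.functions as F
import chainer.links as L
from chainer import ChainList

class MLPPolicy(ChainList):
    def __init__(self, n_hidden=64):
        super(MLPPolicy, self).__init__(
            L.Linear(4, n_hidden),
            L.Linear(n_hidden, 2),
        )

    def __call__(self, x):
        # ReLU hidden layer with dropout, softmax over the two actions
        h = F.dropout(F.relu(self[0](x)))
        return F.softmax(self[1](h))

policy = MLPPolicy()
obs = np.zeros((1, 4), dtype=np.float32)   # placeholder observation from gym
probs = policy(obs).data[0]
action = np.random.choice(2, p=probs / probs.sum())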
@Smerity
Smerity / get_all_urls.py
Created June 23, 2015 01:05
Collect all URLs for NYTimes in the Common Crawl URL Index
import requests
show_pages = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&showNumPages=true'
get_page = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&page={page}'
query = 'nytimes.com/*'
show = requests.get(show_pages.format(query=query))
pages = show.json()['pages']
results = set()
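
The preview stops before the pagination loop. A plausible continuation of the snippet above (reusing its requests import and the pages, get_page, query, and results variables), assuming the index returns one JSON record per line containing a 'url' field:

import json

# Continuation sketch: walk every result page and collect the URLs.
for page in range(pages):
    resp = requests.get(get_page.format(query=query, page=page))
    for line in resp.text.splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        results.add(record['url'])

print('Collected', len(results), 'unique URLs')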
@Smerity
Smerity / gist:2704d3d65aa191ff5f27
Last active May 1, 2017 19:45
About the data

Data Location

The Common Crawl dataset lives on Amazon S3 as part of the Amazon Public Datasets program. Downloading it is free from any instance on Amazon EC2, via either S3 or HTTP.

As the Common Crawl Foundation has evolved over the years, so have the format and metadata that accompany the crawls themselves.

  • [ARC] Archived Crawl #1 - s3://aws-publicdatasets/common-crawl/crawl-001/ - crawl data from 2008/2010
  • [ARC] Archived Crawl #2 - s3://aws-publicdatasets/common-crawl/crawl-002/ - crawl data from 2009/2010
  • [ARC] Archived Crawl #3 - s3://aws-publicdatasets/common-crawl/parse-output/ - crawl data from 2012
  • [WARC] s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/
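
Because the bucket is public, any of the s3:// prefixes above can also be fetched over plain HTTP by rewriting the path. A small sketch, assuming the standard warc.paths.gz listing file sits at the crawl prefix (bucket names and layouts have changed over time, so treat the exact key as an example):

import requests

def s3_to_http(s3_path):
    # s3://bucket/key -> https://bucket.s3.amazonaws.com/key
    bucket, _, key = s3_path[len('s3://'):].partition('/')
    return 'https://{}.s3.amazonaws.com/{}'.format(bucket, key)

# Example key only: the per-crawl listing of WARC files for CC-MAIN-2013-20.
url = s3_to_http('s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/warc.paths.gz')
resp = requests.get(url, stream=True)
print(resp.status_code, resp.headers.get('Content-Length'))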
@Smerity
Smerity / count_wikitext.py
Created February 9, 2017 23:00
Count the number of unique tokens in WikiText-2 and/or WikiText-103
vocab = set()
for i, line in enumerate(open('wiki.train.tokens')):
    words = [x for x in line.split(' ') if x]
    [vocab.add(word) for word in words]
    if i < 10: print(words)
print('Vocab size:', len(vocab))
@Smerity
Smerity / part-r-00000
Created April 6, 2014 23:38
Output from the Common Crawl HTML tag frequency count run over a single compressed 859MB WARC file
0 48
0000 6
0l 1
0xdc00 13
1 69
10 11
100 3
1001 1
100154 1
1004 1
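
The Hadoop job that produced this output isn't included in the gist; a rough single-machine sketch of the same idea, counting start tags across the HTML responses in a WARC file, is below. The warcio and html.parser libraries and the input filename are assumptions, not what the original job used.

# Sketch only: count HTML start tags in one WARC file on a single machine.
from collections import Counter
from html.parser import HTMLParser
from warcio.archiveiterator import ArchiveIterator

class TagCounter(HTMLParser):
    def __init__(self):
        super().__init__()
        self.counts = Counter()

    def handle_starttag(self, tag, attrs):
        self.counts[tag] += 1

counter = TagCounter()
with open('example.warc.gz', 'rb') as stream:    # hypothetical input file
    for record in ArchiveIterator(stream):
        if record.rec_type == 'response':
            payload = record.content_stream().read()
            counter.feed(payload.decode('utf-8', 'replace'))

for tag, count in counter.counts.most_common(20):
    print(tag, count)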
@Smerity
Smerity / buggy_cartpole.py
Last active September 2, 2016 00:09
Buggy (but preserved for posterity) script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories
""" Quick script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories """
import gym
import numpy as np
import chainer
from chainer import optimizers
from chainer import ChainList, Variable
import chainer.functions as F
@Smerity
Smerity / README
Created September 5, 2013 15:26
Instructions to install the required Python packages for CS109 on Ubuntu using virtualenv
#!/bin/bash
# If you'd like, you can actually run this file
# It likely makes more sense to read it, understand it, and run the instructions yourself
# Create the virtual environment
virtualenv env
# Enter into the virtual environment
source ./env/bin/activate