View count_wikitext.py
vocab = set()
for i, line in enumerate(open('wiki.train.tokens')):
    # split() rather than split(' ') so the trailing newline and any repeated
    # spaces don't become spurious vocabulary entries
    words = line.split()
    vocab.update(words)
    if i < 10:
        print(words)
print('Vocab size:', len(vocab))
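As a rough sanity check on the count (reference figures are from the WikiText paper, and assume wiki.train.tokens is an unmodified released training split):

# Expected vocabulary sizes if the tokenisation above matches the released splits:
#   WikiText-2 train:   33,278 types
#   WikiText-103 train: 267,735 types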
View cartpole.py
''' Script for CartPole using policy gradient via Chainer: a two-layer MLP, dropout, and rejection sampling of historical memories '''
import gym
import numpy as np
import chainer
from chainer import optimizers
from chainer import ChainList, Variable
import chainer.functions as F
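The preview stops at the imports, so here is a minimal sketch of the two-layer MLP policy the docstring describes, written against the Chainer 1.x API of the era and reusing the imports above (layer sizes, the dropout ratio, and the softmax output are assumptions, not taken from the file):

import chainer.links as L

class Policy(ChainList):
    ''' Two-layer MLP mapping a CartPole observation (4 floats) to action probabilities (2 actions) '''
    def __init__(self, n_in=4, n_hidden=64, n_out=2):
        super(Policy, self).__init__(
            L.Linear(n_in, n_hidden),
            L.Linear(n_hidden, n_out))

    def __call__(self, x, train=True):
        # Dropout on the hidden layer, as the docstring mentions
        h = F.dropout(F.relu(self[0](x)), ratio=0.5, train=train)
        return F.softmax(self[1](h))

policy = Policy()
optimizer = optimizers.Adam()
optimizer.setup(policy)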
View buggy_cartpole.py
""" Quick script for Cartpole using policy gradient via Chainer, two layer MLP, dropout, and vaguely rejection sampling of historical memories """
import gym
import numpy as np
import chainer
from chainer import optimizers
from chainer import ChainList, Variable
import chainer.functions as F
View time_dist.py
from __future__ import print_function
import numpy as np
np.random.seed(1337)
import sys
from keras.utils.test_utils import get_test_data
from keras.models import Sequential
from keras.layers.core import Dense, TimeDistributedDense
from keras.layers.recurrent import GRU
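The preview again ends at the imports; below is a hedged sketch of the per-timestep model these imports point to, in the Keras 0.x era API that the TimeDistributedDense import implies (all shapes, layer sizes, and the mse loss are assumptions):

# Synthetic data shaped (samples, timesteps, features), one target vector per timestep
(X_train, y_train), (X_test, y_test) = get_test_data(
    nb_train=200, nb_test=20, input_shape=(3, 5),
    output_shape=(3, 2), classification=False)

model = Sequential()
# return_sequences=True keeps a GRU output per timestep so that
# TimeDistributedDense can apply the same projection at every step
model.add(GRU(16, return_sequences=True, input_shape=(3, 5)))
model.add(TimeDistributedDense(2))
model.compile(loss='mse', optimizer='adam')
model.fit(X_train, y_train, nb_epoch=5, validation_data=(X_test, y_test))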
View babi_rnn.py
from __future__ import absolute_import
from __future__ import print_function
from functools import reduce
import re
import tarfile
import numpy as np
np.random.seed(1337) # for reproducibility
from keras.callbacks import EarlyStopping
View fetch_page.py
import gzip
import json
import requests
try:
    # cStringIO is the faster C implementation available on Python 2
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# Let's fetch the Common Crawl FAQ using the CC index
resp = requests.get('http://index.commoncrawl.org/CC-MAIN-2015-27-index?url=http%3A%2F%2Fcommoncrawl.org%2Ffaqs%2F&output=json')
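The index responds with newline-delimited JSON, one record per capture. A minimal sketch of finishing the fetch with an HTTP byte-range request (the data.commoncrawl.org host is an assumption, as the bucket URL has changed over the years; the filename/offset/length fields are the index API's documented output):

# Take the first capture the index returned
record = json.loads(resp.text.strip().split('\n')[0])
offset, length = int(record['offset']), int(record['length'])
warc_url = 'https://data.commoncrawl.org/' + record['filename']
# Each record is an independently gzipped WARC member, so a byte-range
# request plus a single gunzip recovers just this page
r = requests.get(warc_url,
                 headers={'Range': 'bytes={}-{}'.format(offset, offset + length - 1)})
print(gzip.GzipFile(fileobj=StringIO(r.content)).read()[:500])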
View uniq_tasks_10k.txt
Unique samples in tasks_1-20_v1-2/en-10k/qa10_indefinite-knowledge_{}.txt
Train length: 9989
Test length: 1000
Intersection: 0
Unique samples in tasks_1-20_v1-2/en-10k/qa11_basic-coreference_{}.txt
Train length: 9827
Test length: 997
Intersection: 25
Unique samples in tasks_1-20_v1-2/en-10k/qa12_conjunction_{}.txt
Train length: 9991
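These counts look like set-based deduplication of the bAbI task files; a hedged sketch of how they could be produced (the story parsing rule, splitting whenever the line ID resets to 1, follows the documented bAbI format):

def unique_samples(fname):
    samples, story = set(), []
    for line in open(fname):
        nid = int(line.split(' ', 1)[0])
        if nid == 1 and story:
            # A line ID of 1 marks the start of a new story
            samples.add('\n'.join(story))
            story = []
        story.append(line.strip())
    if story:
        samples.add('\n'.join(story))
    return samples

path = 'tasks_1-20_v1-2/en-10k/qa12_conjunction_{}.txt'
train, test = unique_samples(path.format('train')), unique_samples(path.format('test'))
print('Train length:', len(train))
print('Test length:', len(test))
print('Intersection:', len(train & test))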
View index.html
<!DOCTYPE html>
<!-- saved from url=(0072)https://np.reddit.com/r/pics/comments/3byu3f/rpics_is_no_longer_private/ -->
<html xmlns="http://www.w3.org/1999/xhtml" lang="np" xml:lang="np" class=" js cssanimations csstransforms res-parents-down res-commentBoxes res-commentBoxes-rounded res res-v430 res-navTop"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"><title>/r/pics is no longer private : pics</title><meta name="keywords" content=" reddit, reddit.com, vote, comment, submit "><meta name="description" content="Hey guys --- #Woah, you aren;t private anymore Yep, not an excuse to shitpost though #Why are you open The admins have opened a line of..."><meta name="referrer" content="always"><link rel="alternate" media="only screen and (max-width: 640px)" href="https://m.reddit.com/r/pics/comments/3byu3f/rpics_is_no_longer_private/"><meta name="viewport" content="width=1024"><link rel="shorturl" href="http://redd.it/3byu3f"><meta property="og:image" content="https://www.redditstatic.com/ic
View get_all_urls.py
import requests
show_pages = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&showNumPages=true'
get_page = 'http://index.commoncrawl.org/CC-MAIN-2015-18-index?url={query}&output=json&page={page}'
query = 'nytimes.com/*'
show = requests.get(show_pages.format(query=query))
pages = show.json()['pages']
results = set()
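The preview cuts off before the paging loop; a sketch of the likely continuation (the 'url' field name comes from the index API's documented JSON output, everything else reuses the variables above):

import json

for page in range(pages):
    resp = requests.get(get_page.format(query=query, page=page))
    # One JSON record per line of the response; collect the distinct URLs
    for line in resp.text.strip().split('\n'):
        results.add(json.loads(line)['url'])
print('Unique URLs for {}:'.format(query), len(results))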
View gist:3ad8f76bc78b65ef3090
{
  "Container" : {
    "Offset" : "1001602245",
    "Filename" : "CC-MAIN-20140728011800-00009-ip-10-146-231-18.ec2.internal.warc.gz",
    "Compressed" : true,
    "Gzip-Metadata" : {
      "Header-Length" : "10",
      "Footer-Length" : "8",
      "Inflated-CRC" : "-645434788",
      "Inflated-Length" : "26275",