I hereby claim:
- I am smerity on github.
- I am smerity (https://keybase.io/smerity) on keybase.
- I have a public key whose fingerprint is 56A2 5996 3078 B205 1053 883A 6615 0186 B74F 858B
To claim this, I am signing this object:
0 48 | |
0000 6 | |
0l 1 | |
0xdc00 13 | |
1 69 | |
10 11 | |
100 3 | |
1001 1 | |
100154 1 | |
1004 1 |
#include <algorithm> | |
#include <fstream> | |
#include <iostream> | |
#include <iterator> | |
#include <map> | |
#include <set> | |
#include <sstream> | |
#include <unordered_map> | |
#include <vector> |
package main | |
import ( | |
"encoding/gob" | |
"fmt" | |
"log" | |
"net" | |
"net/rpc" | |
) |
smerity@pegasus:~/Coding/montelight/python$ time ~/Coding/Reference/pypy-2.2.1-linux64/bin/pypy -m cProfile minilight.py roomfront-n-1000.ml.txt | |
MiniLight 1.6 Python - http://www.hxa.name/minilight | |
iteration: 3^C | |
interrupted | |
1155613811 function calls (1062023566 primitive calls) in 89.591 seconds | |
Ordered by: standard name |
# To run: python just_text.py > text | |
### | |
from glob import glob | |
# | |
import warc | |
# List any of the WARC files found in the data folder | |
warc_files = glob('data/*.wet.gz') | |
# Process each of the WARC files we found |
I hereby claim:
To claim this, I am signing this object:
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00000-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00001-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00002-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00003-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00004-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00005-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/CC-MAIN-20140416005201-00006-ip-10-147-4-33.ec2.internal.warc.gz | |
common-crawl/crawl-data/CC-MAIN-2014-15/segments/1397609521512.15/warc/C |
import re | |
# | |
from collections import Counter | |
from glob import glob | |
from urlparse import urlparse | |
# | |
import warc | |
# Extract the names and total usage count of all the opening HTML tags in the document |
import boto | |
from boto.s3.key import Key | |
import zlib | |
def stream_decompress_multi(stream): | |
dec = zlib.decompressobj(16 + zlib.MAX_WBITS) | |
while True: | |
chunk = stream.read(1024 * 8) | |
if not chunk: |