adam mittenchops

## count_hist.sh
# Frequency table of the second whitespace-separated field of file.json:
# count each distinct value, order by count (highest first), show the top 10.
awk '{ print $2 }' file.json | sort | uniq -c | sort -rn | head

## rsplit.R
#' Return the last piece of each string after splitting on a pattern.
#'
#' @param mydf A character vector (for example a data-frame column such as
#'   `df$long_url`). Non-character input such as a factor is converted with
#'   `as.character()` before splitting.
#' @param chr The separator, handed to `strsplit()` as its `split` argument
#'   (so it is interpreted as a regular expression).
#' @return A character vector with one element per element of `mydf`, holding
#'   the final piece of each split. An element that splits into nothing
#'   (e.g. `""`) yields `""`; empty input yields `character(0)`.
rsplit <- function(mydf, chr) {
  # Convert only when needed so names on a character vector are preserved.
  if (!is.character(mydf)) {
    mydf <- as.character(mydf)
  }
  pieces <- strsplit(mydf, chr)
  # vapply guarantees a character vector; sapply would fall back to a list
  # whenever one element produced zero pieces or the input was empty.
  vapply(
    pieces,
    function(x) if (length(x) > 0L) x[[length(x)]] else "",
    character(1)
  )
}

# USAGE
# val <- rsplit(df$long_url,"/")

## htmlclean.py
import requests
from lxml.html.clean import Cleaner

# Download a Wikipedia article and configure an lxml Cleaner for stripping markup.
url = "http://en.wikipedia.org/wiki/Zipf%27s_law"
# Network call: fetches the page and keeps the decoded response body.
html = requests.get(url).text
# NOTE(review): remove_tags is given '<div>' / '</div>' with angle brackets;
# lxml documents remove_tags as a list of tag names, so these entries
# presumably never match anything — confirm and consider 'div' instead.
cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False, remove_tags=['<div>','</div>'])
# NOTE(review): these four switches look like they repeat lxml's defaults — confirm.
cleaner.scripts = True
cleaner.page_structure = True
cleaner.javascript = True
cleaner.style = True
# NOTE(review): the cleaner is configured but never applied to `html` in this snippet.

## alldigits.py
# It took 15 tries.
# Successive attempts at extracting every number from `cleaned_text`.
# NOTE(review): `re` is not imported and `cleaned_text` is not defined in this
# snippet; both have to be provided before any of these lines can run.
# Keep in mind that re.findall returns the captured group(s) instead of the
# whole match as soon as the pattern contains capturing parentheses.
# Attempt 1: no groups — returns runs of digits, so "3.14" yields "3" and "14".
digits = re.findall(r'[0-9]+', cleaned_text)
# Attempt 2: one capturing group — only the optional ".dd" part is returned.
digits2 = re.findall(r'[0-9]\d*(\.\d+)?', cleaned_text)
# Attempt 3: NOTE(review): three "(" but only two ")" — the pattern is
# unbalanced and is expected to fail to compile.
digits3 = re.findall(r'[0-9]+((\.([0-9]+))?', cleaned_text)
# Attempt 4: two capturing groups — each result is a (dot, fraction) tuple.
digits4 = re.findall(r'[0-9]+(\.)?([0-9]+)?', cleaned_text)
# Attempt 5: no groups, so whole matches come back, but the trailing lazy
# `\d*?` matches as little as possible and leaves the fraction digits behind.
digits5 = re.findall(r'\d+\.?\d*?', cleaned_text)
# Attempts 6-9: a single capturing group around the fractional part, so only
# that part (possibly an empty string) is returned for each number.
digits6 = re.findall(r'\d+(\.\d*)?', cleaned_text)
digits7 = re.findall(r'\d+(\.?\d*)?', cleaned_text)
digits8 = re.findall(r'\d+(\.?\d*)', cleaned_text)
digits9 = re.findall(r'\d+(\.{1}\d*)?', cleaned_text)

## histmaker.py
mylist = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

## pylist2rlist.py
# Render a Python list as an R vector literal: format the value into 'c(...)'
# and strip every '[' and ']' character, e.g. [1, 2, 3] -> 'c(1, 2, 3)'.
# NOTE(review): nested lists are flattened and brackets inside string elements
# are removed as well; the lambda is also not bound to a name here.
lambda x: 'c({})'.format(x).replace("[","").replace("]","")

## TrueOCR.sh
# https://launchpad.net/~gezakovacs/+archive/pdfocr
# Run pdfocr on "$file", writing the result to a scratch PDF, then extract that
# PDF's text into "<file name without .pdf>.txt" in the current directory.
pdfocr -i "$file" -o /tmp/tmp.pdf
pdftotext /tmp/tmp.pdf "$(basename "$file" .pdf).txt"

## learn.py
import pandas
import numpy as np
import string
import random
import matplotlib.pyplot as plt
from pandas import DataFrame
#import statsmodels.formula.api as sm

# Build a 10x3 frame of standard-normal draws.
df = DataFrame(np.random.randn(10,3))
# Add a column labelled with the string '3' holding 10 distinct random letters.
# string.ascii_letters replaces string.letters, which only exists in Python 2;
# ascii_letters is available in both Python 2 and Python 3.
df['3'] = random.sample(string.ascii_letters,10)

## mongo agg
> db.coll.aggregate( [ { $group : {_id:0, minS : {$min: "$variabletomin"}, maxS : {$max : "$variabletomax"} } } ] )

## groupby.py
from itertools import groupby, islice
from operator import itemgetter
from pprint import pprint

>>> gb = groupby(sorted(xrange(0,11),key=iseven),iseven)
>>> [','.join(map(str,k)) for g,k in gb]
['1,3,5,7,9', '0,2,4,6,8,10']

>>> sent = "This is a long sentence where I want to group words of similar length using the python groupby function"
>>> gb = groupby(sorted(sent.split(),key=len),len)
	#' Return the last piece of each string after splitting on a pattern.
	#'
	#' @param mydf A character vector (for example a data-frame column such as
	#'   `df$long_url`). Non-character input such as a factor is converted with
	#'   `as.character()` before splitting.
	#' @param chr The separator, handed to `strsplit()` as its `split` argument
	#'   (so it is interpreted as a regular expression).
	#' @return A character vector with one element per element of `mydf`, holding
	#'   the final piece of each split. An element that splits into nothing
	#'   (e.g. `""`) yields `""`; empty input yields `character(0)`.
	rsplit <- function(mydf, chr) {
	  # Convert only when needed so names on a character vector are preserved.
	  if (!is.character(mydf)) {
	    mydf <- as.character(mydf)
	  }
	  pieces <- strsplit(mydf, chr)
	  # vapply guarantees a character vector; sapply would fall back to a list
	  # whenever one element produced zero pieces or the input was empty.
	  vapply(
	    pieces,
	    function(x) if (length(x) > 0L) x[[length(x)]] else "",
	    character(1)
	  )
	}

	# USAGE
	# val <- rsplit(df$long_url,"/")
	import requests
	from lxml.html.clean import Cleaner

	# Download a Wikipedia article and configure an lxml Cleaner for stripping markup.
	url = "http://en.wikipedia.org/wiki/Zipf%27s_law"
	# Network call: fetches the page and keeps the decoded response body.
	html = requests.get(url).text
	# NOTE(review): remove_tags is given '<div>' / '</div>' with angle brackets;
	# lxml documents remove_tags as a list of tag names, so these entries
	# presumably never match anything — confirm and consider 'div' instead.
	cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False, remove_tags=['<div>','</div>'])
	# NOTE(review): these four switches look like they repeat lxml's defaults — confirm.
	cleaner.scripts = True
	cleaner.page_structure = True
	cleaner.javascript = True
	cleaner.style = True
	# NOTE(review): the cleaner is configured but never applied to `html` in this snippet.
	# It took 15 tries.
	# Successive attempts at extracting every number from `cleaned_text`.
	# NOTE(review): `re` is not imported and `cleaned_text` is not defined in this
	# snippet; both have to be provided before any of these lines can run.
	# Keep in mind that re.findall returns the captured group(s) instead of the
	# whole match as soon as the pattern contains capturing parentheses.
	# Attempt 1: no groups — returns runs of digits, so "3.14" yields "3" and "14".
	digits = re.findall(r'[0-9]+', cleaned_text)
	# Attempt 2: one capturing group — only the optional ".dd" part is returned.
	digits2 = re.findall(r'[0-9]\d*(\.\d+)?', cleaned_text)
	# Attempt 3: NOTE(review): three "(" but only two ")" — the pattern is
	# unbalanced and is expected to fail to compile.
	digits3 = re.findall(r'[0-9]+((\.([0-9]+))?', cleaned_text)
	# Attempt 4: two capturing groups — each result is a (dot, fraction) tuple.
	digits4 = re.findall(r'[0-9]+(\.)?([0-9]+)?', cleaned_text)
	# Attempt 5: no groups, so whole matches come back, but the trailing lazy
	# `\d*?` matches as little as possible and leaves the fraction digits behind.
	digits5 = re.findall(r'\d+\.?\d*?', cleaned_text)
	# Attempts 6-9: a single capturing group around the fractional part, so only
	# that part (possibly an empty string) is returned for each number.
	digits6 = re.findall(r'\d+(\.\d*)?', cleaned_text)
	digits7 = re.findall(r'\d+(\.?\d*)?', cleaned_text)
	digits8 = re.findall(r'\d+(\.?\d*)', cleaned_text)
	digits9 = re.findall(r'\d+(\.{1}\d*)?', cleaned_text)
	# https://launchpad.net/~gezakovacs/+archive/pdfocr
	# Run pdfocr on "$file", writing the result to a scratch PDF, then extract that
	# PDF's text into "<file name without .pdf>.txt" in the current directory.
	pdfocr -i "$file" -o /tmp/tmp.pdf
	pdftotext /tmp/tmp.pdf "$(basename "$file" .pdf).txt"
	import pandas
	import numpy as np
	import string
	import random
	import matplotlib.pyplot as plt
	from pandas import DataFrame
	#import statsmodels.formula.api as sm

	# Build a 10x3 frame of standard-normal draws.
	df = DataFrame(np.random.randn(10,3))
	# Add a column labelled with the string '3' holding 10 distinct random letters.
	# string.ascii_letters replaces string.letters, which only exists in Python 2;
	# ascii_letters is available in both Python 2 and Python 3.
	df['3'] = random.sample(string.ascii_letters,10)
	from itertools import groupby, islice
	from operator import itemgetter
	from pprint import pprint

	>>> gb = groupby(sorted(xrange(0,11),key=iseven),iseven)
	>>> [','.join(map(str,k)) for g,k in gb]
	['1,3,5,7,9', '0,2,4,6,8,10']

	>>> sent = "This is a long sentence where I want to group words of similar length using the python groupby function"
	>>> gb = groupby(sorted(sent.split(),key=len),len)