This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tally the second whitespace-separated field of every line in file.json and
# show the ten most frequent values, highest count first.
# awk reads the file directly, so no separate `cat` process is needed.
awk '{print $2}' file.json | sort | uniq -c | sort -rn | head
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Return the last piece of each element of `mydf` after splitting on `chr`.
#
# mydf: vector (character or factor) of strings to split.
# chr:  separator, interpreted by strsplit() as a regular expression.
#
# Returns an unnamed character vector with one entry per input element.
# Elements are coerced with as.character() first because strsplit() rejects
# factors. An empty input string yields "" (strsplit() gives zero pieces for
# it), so the result is always a character vector rather than a list.
rsplit <- function(mydf, chr) {
  pieces <- strsplit(as.character(mydf), chr)
  vapply(
    pieces,
    function(parts) {
      if (length(parts) == 0L) "" else parts[length(parts)]
    },
    character(1),
    USE.NAMES = FALSE
  )
}
# USAGE
# val <- rsplit(df$long_url,"/")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests
from lxml.html.clean import Cleaner

# Download the raw HTML of the page that is going to be cleaned.
url = "http://en.wikipedia.org/wiki/Zipf%27s_law"
# A timeout keeps a stalled connection from hanging the script indefinitely.
html = requests.get(url, timeout=30).text

# allow_tags=[''] matches no real element, so the cleaner drops every tag;
# remove_unknown_tags must be False because lxml refuses to combine it with
# allow_tags. remove_tags takes bare tag names ('div'), not markup.
cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False, remove_tags=['div'])
# Set after construction, exactly as before, so no constructor-time defaults
# derived from these flags are affected.
cleaner.scripts = True
cleaner.page_structure = True
cleaner.javascript = True
cleaner.style = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# It took 15 tries.
# Successive attempts at pulling numbers (optionally with a decimal part) out
# of cleaned_text. Why the results differ: re.findall returns the whole match
# only when the pattern has no capturing group. With one group it returns that
# group's text; with several groups it returns one tuple per match.

# No group: whole matches, but "3.14" comes back as '3' and '14'.
digits = re.findall(r'[0-9]+', cleaned_text)
# One group: only the fractional part ('.14'), or '' for an integer.
digits2 = re.findall(r'[0-9]\d*(\.\d+)?', cleaned_text)
# Two groups: tuples such as ('.14', '14'). The original pattern had an
# unbalanced "(" and raised re.error; the stray parenthesis is removed.
digits3 = re.findall(r'[0-9]+(\.([0-9]+))?', cleaned_text)
# Two groups: tuples such as ('.', '14').
digits4 = re.findall(r'[0-9]+(\.)?([0-9]+)?', cleaned_text)
# No group, but the lazy \d*? takes as few digits as possible, so "3.14"
# splits into '3.' and '14'.
digits5 = re.findall(r'\d+\.?\d*?', cleaned_text)
# One group each: all four return only the captured tail ('.14' or '').
digits6 = re.findall(r'\d+(\.\d*)?', cleaned_text)
digits7 = re.findall(r'\d+(\.?\d*)?', cleaned_text)
digits8 = re.findall(r'\d+(\.?\d*)', cleaned_text)
digits9 = re.findall(r'\d+(\.{1}\d*)?', cleaned_text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
mylist = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def to_r_vector(x):
    """Render *x* as the text of an R ``c(...)`` vector.

    *x* is formatted with ``str.format`` and every ``[`` and ``]`` character
    is then stripped from the result. For a flat list this turns
    ``[1, 2, 3]`` into ``'c(1, 2, 3)'``. Nested lists are flattened as a side
    effect, and brackets inside string elements are removed as well.
    """
    return 'c({})'.format(x).replace("[", "").replace("]", "")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://launchpad.net/~gezakovacs/+archive/pdfocr
# OCR the PDF named by "$file" into a scratch PDF, then extract its text into
# "<basename of $file without .pdf>.txt" in the current directory.
# A private temporary directory replaces the shared, predictable /tmp/tmp.pdf,
# and text extraction only runs when the OCR step succeeded.
tmpdir=$(mktemp -d) &&
    pdfocr -i "$file" -o "$tmpdir/ocr.pdf" &&
    pdftotext "$tmpdir/ocr.pdf" "$(basename -- "$file" .pdf).txt"
# Clean up the scratch directory whether or not the steps above succeeded.
[ -n "$tmpdir" ] && rm -rf -- "$tmpdir"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas
import numpy as np
import string
import random
import matplotlib.pyplot as plt
from pandas import DataFrame
#import statsmodels.formula.api as sm

# Demo frame: 10 rows by 3 columns of standard-normal draws, with the default
# integer column labels.
df = DataFrame(np.random.randn(10, 3))
# Add a column of ten distinct random letters under the *string* label '3'.
# string.ascii_letters exists on both Python 2 and 3 and is locale-independent;
# string.letters was removed in Python 3.
df['3'] = random.sample(string.ascii_letters, 10)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// mongo shell: put every document of `coll` into a single group (constant _id)
// and report the minimum of one field and the maximum of another.
db.coll.aggregate([
  { $group: { _id: 0, minS: { $min: "$variabletomin" }, maxS: { $max: "$variabletomax" } } }
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from itertools import groupby, islice | |
from operator import itemgetter | |
from pprint import pprint | |
>>> gb = groupby(sorted(xrange(0,11),key=iseven),iseven) | |
>>> [','.join(map(str,k)) for g,k in gb] | |
['1,3,5,7,9', '0,2,4,6,8,10'] | |
>>> sent = "This is a long sentence where I want to group words of similar length using the python groupby function" | |
>>> gb = groupby(sorted(sent.split(),key=len),len) |
OlderNewer