snipped_201712
# List every blob in the repository's history with human-readable sizes, to find large files/commits.
#https://stackoverflow.com/questions/10622179/how-to-find-identify-large-files-commits-in-git-history
git rev-list --objects --all \
| git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' \
| awk '/^blob/ {print substr($0,6)}' \
| sort --numeric-sort --key=2 \
| cut --complement --characters=13-40 \
| numfmt --field=2 --to=iec-i --suffix=B --padding=7 --round=nearest
#fabfile_listdir.py
from fabric.api import env, run, cd, hide

env.hosts = ["localhost"]

def list_dir1(dir=None):
    """Return the entries of a directory (dir) on the remote host."""
    dir = dir or env.cwd
    string = run("for i in %s*; do echo $i; done" % dir)
    files = string.replace("\r", "").split("\n")
    return files

def list_dir2(dir_=None):
    """Return a list of files in a directory (dir_) as absolute paths."""
    with hide('output'):
        if dir_ is not None and not dir_.endswith("/"):
            dir_ += "/"
        dir_ = dir_ or env.cwd
        string_ = run("for i in %s*; do echo $i; done" % dir_)
        files = string_.replace("\r", "").split("\n")
    return files
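# A hypothetical usage sketch (not part of the original gist): expose the helper as a
# Fabric 1.x task, e.g. run with "fab print_remote_dir". The "/tmp/" path is just an example.
from fabric.api import task

@task
def print_remote_dir():
    for path in list_dir2("/tmp/"):
        print(path)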
# Keras callback that reports the running loss every `display` samples
from keras.callbacks import Callback

class NBatchLogger(Callback):
    def __init__(self, display=100):
        '''
        display: Number of batches to wait before outputting loss
        '''
        self.seen = 0
        self.display = display

    def on_batch_end(self, batch, logs={}):
        self.seen += logs.get('size', 0)
        if self.seen % self.display == 0:
            print('\n{0}/{1} - Batch Loss: {2}'.format(self.seen,
                                                       self.params['nb_sample'],
                                                       logs.get('loss')))
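# A minimal usage sketch (hypothetical model and random data; assumes Keras 1.x, where
# self.params carries 'nb_sample' -- under Keras 2 that key is 'samples'):
import numpy as np
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([Dense(1, input_dim=4, activation='sigmoid')])
model.compile(optimizer='sgd', loss='binary_crossentropy')
x = np.random.rand(1000, 4)
y = np.random.randint(0, 2, size=(1000, 1))
# log the running loss every 200 samples (pick a batch size that divides `display`)
model.fit(x, y, batch_size=50, nb_epoch=1, verbose=0, callbacks=[NBatchLogger(display=200)])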
import nltk.stem
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
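# Usage sketch with made-up documents; the learned vocabulary comes out stemmed
# (e.g. "running" -> "run"):
docs = ["the runner was running quickly", "runners run in races"]
stem_vect = StemmedCountVectorizer()
counts = stem_vect.fit_transform(docs)
print(stem_vect.get_feature_names())   # use get_feature_names_out() on scikit-learn >= 1.0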
#
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

vect = CountVectorizer(tokenizer=LemmaTokenizer())
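# Usage sketch (hypothetical document; needs the NLTK 'punkt' and 'wordnet' data):
docs = ["The cats are sitting on the mats"]
vect.fit(docs)
print(vect.get_feature_names())   # lemmatized vocabulary, e.g. "cats" -> "cat"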
#
import re

def to_british(tokens):
    for t in tokens:
        t = re.sub(r"(...)our$", r"\1or", t)
        t = re.sub(r"([bt])re$", r"\1er", t)
        t = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", t)
        t = re.sub(r"ogue$", "og", t)
        yield t

class CustomVectorizer(CountVectorizer):
    def build_tokenizer(self):
        tokenize = super(CustomVectorizer, self).build_tokenizer()
        return lambda doc: list(to_british(tokenize(doc)))

print(CustomVectorizer().build_analyzer()(u"color colour"))
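# A quick check (this snippet appears to be adapted from the scikit-learn text-feature docs):
# both spellings should collapse to the American form.
assert CustomVectorizer().build_analyzer()(u"color colour") == ['color', 'color']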
#
def number_normalizer(tokens):
    """ Map all numeric tokens to a placeholder.

    For many applications, tokens that begin with a number are not directly
    useful, but the fact that such a token exists can be relevant. By applying
    this form of dimensionality reduction, some methods may perform better.
    """
    return ("#NUMBER" if token[0].isdigit() else token for token in tokens)

class NumberNormalizingVectorizer(TfidfVectorizer):
    def build_tokenizer(self):
        tokenize = super(NumberNormalizingVectorizer, self).build_tokenizer()
        return lambda doc: list(number_normalizer(tokenize(doc)))
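# Usage sketch (made-up documents): any token that starts with a digit becomes '#NUMBER'.
docs = ["room 101 has 24 beds", "flight 370 departs at 9am"]
tfidf = NumberNormalizingVectorizer()
X = tfidf.fit_transform(docs)
print(tfidf.get_feature_names())   # use get_feature_names_out() on scikit-learn >= 1.0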
# https://stackoverflow.com/questions/45196312/spacy-and-scikit-learn-vectorizer
import string
import pandas as pd
import spacy
# 'stopwords' was left undefined in the original snippet; spaCy's built-in list is one
# reasonable choice here
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

nlp = spacy.load('en_core_web_sm')

# Clean text before feeding it to spaCy
punctuations = string.punctuation

# Define function to clean up text by removing personal pronouns, stopwords, and punctuation
def cleanup_text(docs, logging=False):
    texts = []
    counter = 1
    for doc in docs:
        if counter % 1000 == 0 and logging:
            print("Processed %d out of %d documents." % (counter, len(docs)))
        counter += 1
        doc = nlp(doc, disable=['parser', 'ner'])
        tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
        tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
        tokens = ' '.join(tokens)
        texts.append(tokens)
    return pd.Series(texts)
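# Usage sketch (hypothetical DataFrame column; assumes the spaCy model above is installed):
df = pd.DataFrame({'review': ["I really loved this movie!", "It was not worth the price."]})
print(cleanup_text(df['review']).head())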
#Punctuation
#https://stackoverflow.com/questions/39782418/remove-punctuations-in-pandas
df["new_column"] = df['review'].str.replace(r'[^\w\s]', '')   # pass regex=True on newer pandas
#Remove non-letters
review_text = re.sub("[^a-zA-Z]", " ", review_text)
#Stopwords (NLTK); wordList and text6 are assumed to be existing lists of tokens
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')
wordList = [word for word in wordList if word not in eng_stopwords]
normalized = [w for w in text6 if w.lower() not in stopwords.words('english')]
#Word Tokenizer
import nltk
tokenization_pattern = r'''(?x)        # set flag to allow verbose regexps
      (?:[A-Z]\.)+                     # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*                     # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?               # currency and percentages, e.g. $12.40, 82%
    | \w+[\x90-\xff]                   # these are escaped emojis
    | [][.,;"'?():-_`]                 # these are separate tokens
    '''
# groups are non-capturing so that RegexpTokenizer (which uses re.findall) returns
# whole matches rather than group contents
word_tokenizer = nltk.tokenize.regexp.RegexpTokenizer(tokenization_pattern)
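# Usage sketch with a made-up sentence:
print(word_tokenizer.tokenize("The U.S.A. budget rose to $12.40 (up 82%) last year."))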
#Stemming
pstemmer = nltk.PorterStemmer()
lstemmer = nltk.LancasterStemmer()
wnlemmatizer = nltk.WordNetLemmatizer()
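# Quick comparison sketch on one sample word (the three approaches give different results):
print(pstemmer.stem("running"), lstemmer.stem("running"), wnlemmatizer.lemmatize("running", pos="v"))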