Pham Quang Nhat Minh minhpqn

## useful_pandas_snippets.py
# List the unique values in a DataFrame column.
# Series.unique() does this directly; the older pd.unique(...ravel()) form
# relies on Series.ravel, which is deprecated in recent pandas releases.
df['column_name'].unique()

# Convert a Series to numeric; values that cannot be parsed become NaN.
# convert_objects() has been removed from pandas; pd.to_numeric with
# errors='coerce' is the supported way to get the same coercion.
df['col'] = pd.to_numeric(df['col'].astype(str), errors='coerce')

# Keep only the DataFrame rows whose column value is in a given list.
# (The list was previously defined as `valuelist` but used as `value_list`,
# which raised NameError.)
value_list = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]

## .screenrc
# the following two lines give a two-line status, with the current window highlighted
hardstatus alwayslastline
hardstatus string '%{= kG}[%{G}%H%? %1`%?%{g}][%= %{= kw}%-w%{+b yk} %n*%t%?(%u)%? %{-}%+w %=%{g}][%{B}%m/%d %{W}%C%A%{g}]'

# huge scrollback buffer
defscrollback 5000

# no welcome message
startup_message off

## no_accent_vietnamese.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Program that converts accented Vietnamese text to unaccented Vietnamese.

Adapted from source code by NamNT:
http://www.vithon.org/2009/06/14/x%E1%BB%AD-ly-ti%E1%BA%BFng-vi%E1%BB%87t-trong-python
"""

import re

# Vietnamese letters carrying diacritics: the lowercase set first, followed by
# the same letters in uppercase.
# NOTE(review): presumably the input alphabet of a character mapping; the
# matching unaccented table is not visible in this excerpt -- confirm.
INTAB = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"

## print_cm.py
from sklearn.metrics import confusion_matrix

def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Pretty-print the header row of a confusion-matrix table.

    The code shown here emits only the header: a blank corner cell followed
    by every label, each right-aligned to a common column width and followed
    by a single space. No newline is written.

    Args:
        cm: Confusion matrix. Not read by the code shown here.
        labels: Sequence of label strings, one per column.
        hide_zeroes: Not read by the code shown here.
        hide_diagonal: Not read by the code shown here.
        hide_threshold: Not read by the code shown here.
    """
    # Columns are as wide as the longest label, but never narrower than 5
    # characters (the width reserved for a cell value).
    columnwidth = max([len(label) for label in labels] + [5])
    empty_cell = " " * columnwidth

    # Header row. `end=" "` is the Python 3 spelling of the Python 2
    # trailing-comma print: stay on the same line, separated by one space.
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")

## mini_sequence_labeler.py
"""
PyTorch implementation of a sequence labeler (POS tagger).

Basic architecture:
 - take words
 - run through a bidirectional GRU
 - predict labels one word at a time (left to right), using a recurrent neural network "decoder"

The decoder updates hidden state based on:
 - most recent word

## log.py
import logging
import sys
from logging.handlers import TimedRotatingFileHandler

FORMATTER = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
LOG_FILE = "my_app.log"

def get_console_handler():
    """Build a log handler that writes to standard output.

    The handler is formatted with the module-level ``FORMATTER``.

    Returns:
        logging.StreamHandler: the configured handler.
    """
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(FORMATTER)
    # Hand the configured handler back to the caller; without this the
    # function fell off the end and returned None.
    return console_handler

## rnn_minibatch.py
"""
How to do minibatches for RNNs in pytorch

Assume we feed characters to the model and predict the language of the words.
"""


def prepare_batch(x, y):
    """Find the longest sequence in a batch, as the first step of zero padding.

    Only the length computation is present in the code shown here; nothing is
    padded or returned yet, and ``y`` is not read.

    Args:
        x: Iterable of objects exposing ``.shape``; the first dimension is
            taken as the sequence length.
        y: Not used by the code shown here.

    Raises:
        ValueError: if ``x`` is empty.
    """
    # Length of every sequence along its first dimension.
    lengths = [item.shape[0] for item in x]
    # The longest one is the padding target for the whole batch.
    n_max = max(lengths)

## download_glue_data.py
''' Script for downloading all GLUE data.

Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).

mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC

## regex-japanese.txt
Regex for matching ALL Japanese common & uncommon Kanji (4e00 – 9fcf) ~ The Big Kahuna!
([一-龯])

Regex for matching Hiragana or Katakana
([ぁ-んァ-ン])

Regex for matching anything that is neither Hiragana nor Katakana
([^ぁ-んァ-ン])

Regex for matching Hiragana or Katakana or basic punctuation (、。’)

## pep8_cheatsheet.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""This module's docstring summary line.

This is a multi-line docstring. Paragraphs are separated with blank lines.
Lines conform to 79-column limit.

Module and package names should be short, lower_case_with_underscores.
Notice that this is not PEP8-cheatsheet.py
	#List unique values in a DataFrame column
	pd.unique(df.column_name.ravel())

	#Convert Series datatype to numeric, getting rid of any non-numeric values
	df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

	#Grab DataFrame rows where column has certain values
	value_list = ['value1', 'value2', 'value3']
	df = df[df.column.isin(value_list)]
	# the following two lines give a two-line status, with the current window highlighted
	hardstatus alwayslastline
	hardstatus string '%{= kG}[%{G}%H%? %1`%?%{g}][%= %{= kw}%-w%{+b yk} %n*%t%?(%u)%? %{-}%+w %=%{g}][%{B}%m/%d %{W}%C%A%{g}]'

	# huge scrollback buffer
	defscrollback 5000

	# no welcome message
	startup_message off
	#!/usr/bin/env python
	# -- coding: utf-8 --
	"""Chương trình chuyển đổi từ Tiếng Việt có dấu sang Tiếng Việt không dấu
	Chỉnh sửa từ mã nguồn của anh NamNT
	http://www.vithon.org/2009/06/14/x%E1%BB%AD-ly-ti%E1%BA%BFng-vi%E1%BB%87t-trong-python
	"""

	import re

	INTAB = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
	from sklearn.metrics import confusion_matrix

	def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
	"""pretty print for confusion matrixes"""
	columnwidth = max([len(x) for x in labels]+[5]) # 5 is value length
	empty_cell = " " * columnwidth
	# Print header
	print " " + empty_cell,
	for label in labels:
	print "%{0}s".format(columnwidth) % label,
	"""
	PyTorch implementation of a sequence labeler (POS tagger).

	Basic architecture:
	- take words
	- run through a bidirectional GRU
	- predict labels one word at a time (left to right), using a recurrent neural network "decoder"

	The decoder updates hidden state based on:
	- most recent word
	import logging
	import sys
	from logging.handlers import TimedRotatingFileHandler

	FORMATTER = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
	LOG_FILE = "my_app.log"

	def get_console_handler():
	console_handler = logging.StreamHandler(sys.stdout)
	console_handler.setFormatter(FORMATTER)
	"""
	How to do minibatches for RNNs in pytorch

	Assume we feed characters to the model and predict the language of the words.
	"""


	def prepare_batch(x, y):
	# determine the maximum word length per batch and zero pad the tensors
	n_max = max([a.shape[0] for a in x])
	''' Script for downloading all GLUE data.

	Note: for legal reasons, we are unable to host MRPC.
	You can either use the version hosted by the SentEval team, which is already tokenized,
	or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
	For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
	You should then rename and place specific files in a folder (see below for an example).

	mkdir MRPC
	cabextract MSRParaphraseCorpus.msi -d MRPC
	Regex for matching ALL Japanese common & uncommon Kanji (4e00 – 9fcf) ~ The Big Kahuna!
	([一-龯])

	Regex for matching Hiragana or Katakana
	([ぁ-んァ-ン])

	Regex for matching anything that is neither Hiragana nor Katakana
	([^ぁ-んァ-ン])

	Regex for matching Hiragana or Katakana or basic punctuation (、。’)
	#! /usr/bin/env python
	# -- coding: utf-8 --
	"""This module's docstring summary line.

	This is a multi-line docstring. Paragraphs are separated with blank lines.
	Lines conform to 79-column limit.

	Module and package names should be short, lower_case_with_underscores.
	Notice that this is not PEP8-cheatsheet.py