Chu-Yu Hsu ChuyuHsu

## 词性标记.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                ChuyuHsu
                / 词性标记.md
            
            
              Created
              July 21, 2016 09:26
                — forked from luw2007/词性标记.md
            
              
                词性标记： 包含 ICTPOS3.0词性标记集、ICTCLAS 汉语词性标注集、jieba 字典中出现的词性、simhash 中可以忽略的部分词性
              
          
    词的分类


实词：名词、动词、形容词、状态词、区别词、数词、量词、代词
虚词：副词、介词、连词、助词、拟声词、叹词。

ICTPOS3.0词性标记集

n 名词

nr 人名

  
## marisa_count_vectorizer.py
import numpy as np
import marisa_trie
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import six

class MarisaCountVectorizer(CountVectorizer):

    # ``CountVectorizer.fit`` method calls ``fit_transform`` so
    # ``fit`` is not provided
    def fit_transform(self, raw_documents, y=None):

## init.coffee
# Your init script
#
# Atom will evaluate this file each time a new window is opened. It is run
# after packages are loaded/activated and after the previous editor state
# has been restored.
#
# An example hack to log to the console when each text editor is saved.
#
# atom.workspace.observeTextEditors (editor) ->
#   editor.onDidSave ->

## tensorflow rnn variable seq length
import tensorflow as tf
import numpy as np

if __name__ == '__main__':
  np.random.seed(1)
  # the size of the hidden state for the lstm (notice the lstm uses 2x of this amount so actually lstm will have state of size 2)
  size = 1
  # 2 different sequences total
  batch_size= 2
  # the maximum steps for both sequences is 10

## one-hot.py
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer

def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.

    Modified from: https://gist.github.com/kljensen/5452382


## spark_parallel_boost.py
from pyspark import SparkContext

import numpy as np

from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

def run(sc):

## pptpd.sh
# Automaticlly install pptpd on Amazon EC2 Amazon Linux
#
# Ripped from http://blog.diahosting.com/linux-tutorial/pptpd/
# pptpd source rpm packing by it's authors
#
# WARNING:
# first ms-dns setting to 172.16.0.23, 172.16.0.23 was showing on my
# /etc/resolv.conf, I'm not sure this is the same on all Amazon AWS zones.
#
# You need to adjust your "Security Groups" which you are using too.

## apply_df_by_multiprocessing.py
import multiprocessing
import pandas as pd
import numpy as np

def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    workers = kwargs.pop('workers')

## useful_pandas_snippets.py
#List unique values in a DataFrame column
pd.unique(df.column_name.ravel())

#Convert Series datatype to numeric, getting rid of any non-numeric values
df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

#Grab DataFrame rows where column has certain values
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(value_list)]
	import numpy as np
	import marisa_trie
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.externals import six

	class MarisaCountVectorizer(CountVectorizer):

	# ``CountVectorizer.fit`` method calls ``fit_transform`` so
	# ``fit`` is not provided
	def fit_transform(self, raw_documents, y=None):
	# Your init script
	#
	# Atom will evaluate this file each time a new window is opened. It is run
	# after packages are loaded/activated and after the previous editor state
	# has been restored.
	#
	# An example hack to log to the console when each text editor is saved.
	#
	# atom.workspace.observeTextEditors (editor) ->
	# editor.onDidSave ->
	import tensorflow as tf
	import numpy as np

	if __name__ == '__main__':
	np.random.seed(1)
	# the size of the hidden state for the lstm (notice the lstm uses 2x of this amount so actually lstm will have state of size 2)
	size = 1
	# 2 different sequences total
	batch_size= 2
	# the maximum steps for both sequences is 10
	import pandas as pd
	import numpy as np
	from sklearn.feature_extraction import DictVectorizer

	def encode_onehot(df, cols):
	"""
	One-hot encoding is applied to columns specified in a pandas DataFrame.

	Modified from: https://gist.github.com/kljensen/5452382
	from pyspark import SparkContext

	import numpy as np

	from sklearn.cross_validation import train_test_split, Bootstrap
	from sklearn.datasets import make_classification
	from sklearn.metrics import accuracy_score
	from sklearn.tree import DecisionTreeClassifier

	def run(sc):
	# Automaticlly install pptpd on Amazon EC2 Amazon Linux
	#
	# Ripped from http://blog.diahosting.com/linux-tutorial/pptpd/
	# pptpd source rpm packing by it's authors
	#
	# WARNING:
	# first ms-dns setting to 172.16.0.23, 172.16.0.23 was showing on my
	# /etc/resolv.conf, I'm not sure this is the same on all Amazon AWS zones.
	#
	# You need to adjust your "Security Groups" which you are using too.
	import multiprocessing
	import pandas as pd
	import numpy as np

	def _apply_df(args):
	df, func, kwargs = args
	return df.apply(func, **kwargs)

	def apply_by_multiprocessing(df, func, **kwargs):
	workers = kwargs.pop('workers')
	#List unique values in a DataFrame column
	pd.unique(df.column_name.ravel())

	#Convert Series datatype to numeric, getting rid of any non-numeric values
	df['col'] = df['col'].astype(str).convert_objects(convert_numeric=True)

	#Grab DataFrame rows where column has certain values
	valuelist = ['value1', 'value2', 'value3']
	df = df[df.column.isin(value_list)]