{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import nltk\nimport codecs\nimport unicodedata\nimport re\nfrom copy import deepcopy\nfrom pyUtil import easyPickle as pickle\nfrom pyUtil import flattenList as flatten\nimport random\nimport pandas as pd\nfrom nltk.corpus import wordnet as wn",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#read in file\nf = codecs.open('../Data/yahoo_train.txt',\n                encoding=\"utf-8\")\ntrain_text = f.readlines()\n\n#normalize unicode\ndef norm_unicode(text):\n    '''this function takes in a list of strings, and \n    normalizes each word in each string from unicode\n    characters to equivalent (or closest) ascii \n    characters'''\n    text_ascii = []\n    for doc in text:\n        re_combine = []\n        for word in doc.split():\n            word = unicodedata.normalize('NFKD', word).encode('ascii','ignore')\n            re_combine.append(word)\n        text_ascii.append(' '.join(re_combine))\n    return text_ascii\n\ntrain_text = norm_unicode(train_text)\n\n#tokenize sentences\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\n\ndef sent_token(text):\n    sentence_groups = []\n    for sent_group in text:\n        sentence_groups.append(sent_tokenizer.tokenize(sent_group))\n    return sentence_groups\n\ntrain_text_sents = sent_token(train_text)\n\n#tokenize words\n#pattern for tokenizing\npattern = r'''(?x)    # set flag to allow verbose regexps\n      ([A-Z]\\.)+        # abbreviations, e.g. U.S.A\n    | \\w+([-‘]\\w+)*        # words with optional internal hyphens\n    | \\$?\\d+(\\.\\d+)?%?  # currency and percentages, e.g. $12.40, 82%\n    | \\.\\.\\.            # ellipsis... \n    | [][.,;\"'?():\\-_`]+  # these are separate tokens\n    '''\n\ndef doc_token(text):\n    result = []\n    for doc in text:\n        doc_text = []\n        for sent in doc:\n            doc_text.append(nltk.regexp_tokenize(sent, pattern))\n        result.append(doc_text)\n    return result\n\ntrain_text_docs_token = doc_token(train_text_sents)",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
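{
"metadata": {},
"cell_type": "markdown",
"source": "A minimal sanity check of the `pattern` tokenizer, added for illustration; the sample sentence is made up and was not part of the original run."
},
{
"metadata": {},
"cell_type": "code",
"input": "#illustrative only: exercise each branch of the regexp pattern\nsample = \"The U.S.A. spent $12.40 (82%) on low-cost tokenizers...\"\nprint nltk.regexp_tokenize(sample, pattern)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},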
{
"metadata": {},
"cell_type": "heading",
"source": "Load tagged data",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "train_text_docs_tagged = pickle.open_object('yahoo_train_corpus_pos_tagged.pkl')",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Lemmatize (stem) data - didn't work",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "# from nltk.stem.wordnet import WordNetLemmatizer as lemma\n# lemm = lemma()\n\n# def lem(doc):\n#     new_doc = []\n#     for sent in doc:\n#         new_sent = []\n#         for word in sent:\n#             new_sent.append(lemm.lemmatize(word))\n#         new_doc.append(new_sent)\n#     return new_doc\n\n# #for tagged data\n# def lem(doc):\n#     new_doc = []\n#     for sent in doc:\n#         new_sent = []\n#         for word in sent:\n#             new_sent.append((lemm.lemmatize(word[0]), word[1]))\n#         new_doc.append(new_sent)\n#     return new_doc",
"prompt_number": 111,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# train_text_docs_tagged_lem = []\n\n# for doc in train_text_docs_tagged:\n#     train_text_docs_tagged_lem.append(lem(doc))",
"prompt_number": 112,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# train_text_docs_token_lem = []\n\n# for doc in train_text_docs_token:\n#     train_text_docs_token_lem.append(lem(doc))",
"prompt_number": 90,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
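{
"metadata": {},
"cell_type": "markdown",
"source": "A sketch of one way the commented-out lemmatizer above could be made to work: map Penn Treebank tags to WordNet POS constants before calling the lemmatizer. The helpers `penn_to_wn` and `lem_tagged` are names introduced here for illustration, not part of the original notebook."
},
{
"metadata": {},
"cell_type": "code",
"input": "#sketch: lemmatize (word, tag) pairs by mapping Penn tags to WordNet POS\nfrom nltk.stem.wordnet import WordNetLemmatizer\n\nlemm = WordNetLemmatizer()\n\ndef penn_to_wn(tag):\n    #default to NOUN, which is also the lemmatizer's default POS\n    if tag.startswith('V'):\n        return wn.VERB\n    elif tag.startswith('J'):\n        return wn.ADJ\n    elif tag.startswith('R'):\n        return wn.ADV\n    return wn.NOUN\n\ndef lem_tagged(doc):\n    return [[(lemm.lemmatize(word, penn_to_wn(tag)), tag)\n             for word, tag in sent]\n            for sent in doc]",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},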
{
"metadata": {},
"cell_type": "heading",
"source": "Create Training Set",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#training set for tagged data\ndef get_doc_tuple_tag(text, num):\n    result = []\n    for doc in text:\n        if doc[0][0][0] == str(num):\n            doc[0][0] = ('', '')\n            result.append((doc, str(num)))\n    return result\n\ndef createDataSet_tag(text):\n    #gather the documents for each of the seven categories\n    all_cats = []\n    for num in range(1, 8):\n        all_cats += get_doc_tuple_tag(text, num)\n    \n    #shuffle to make sure random\n    random.shuffle(all_cats)\n    return all_cats\n\ntrain_text_docs_token_c = deepcopy(train_text_docs_tagged)\ncat_data_tag = createDataSet_tag(train_text_docs_token_c)",
"prompt_number": 4,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#training set for untagged data\ndef get_doc_tuple(text, num):\n    result = []\n    for doc in text:\n        if doc[0][0] == str(num):\n            doc[0][0] = ''\n            result.append((doc, str(num)))\n    return result\n\ndef createDataSet(text):\n    #gather the documents for each of the seven categories\n    all_cats = []\n    for num in range(1, 8):\n        all_cats += get_doc_tuple(text, num)\n    \n    #shuffle to make sure random\n    random.shuffle(all_cats)\n    return all_cats\n\ntrain_text_docs_token_c = deepcopy(train_text_docs_token)\ncat_data = createDataSet(train_text_docs_token_c)",
"prompt_number": 5,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
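{
"metadata": {},
"cell_type": "markdown",
"source": "An illustrative check, not in the original run, that the shuffled training set is balanced across the seven category labels:"
},
{
"metadata": {},
"cell_type": "code",
"input": "#illustrative: dataset size and label distribution after shuffling\nfrom collections import Counter\nprint len(cat_data_tag), Counter(label for doc, label in cat_data_tag)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},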
{
"metadata": {},
"cell_type": "heading",
"source": "TF-IDF",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#get tfidf data\ndef get_doc_tuple_tf(text, num):\n    result = []\n    for doc in text:\n        if doc[0][0] == str(num):\n            doc[0][0] = ''\n            result.append(doc)\n    return result\n\ndef createTFIDFDataSet(text):\n    #one bucket of documents per category\n    all_tfidf_cats = [get_doc_tuple_tf(text, num) for num in range(1, 8)]\n    #flatten each category bucket into a single document string\n    flat_docs = []\n    for doc in all_tfidf_cats:\n        flat_docs.append(' '.join(flatten.flatten(flatten.flatten(doc))))\n    \n    return flat_docs",
"prompt_number": 6,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "train_text_docs_token_copy = deepcopy(train_text_docs_token)\ntfidf_text = createTFIDFDataSet(train_text_docs_token_copy)\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nvectorizer = TfidfVectorizer(min_df=2, stop_words='english') #, ngram_range=(1,3)\nX = vectorizer.fit_transform(tfidf_text)\nidf = vectorizer._tfidf.idf_\ntfidf_dict = dict(zip(vectorizer.get_feature_names(), idf))\ntfidf_list = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)\ntfidf_word_list = [word[0] for word in tfidf_list]",
"prompt_number": 7,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
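{
"metadata": {},
"cell_type": "markdown",
"source": "For illustration (not part of the original run): the terms with the highest IDF, i.e. the rarest across the seven category documents, sit at the front of `tfidf_word_list`."
},
{
"metadata": {},
"cell_type": "code",
"input": "#illustrative: peek at the ten rarest (highest-IDF) feature terms\nprint tfidf_word_list[:10]",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},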
{
"metadata": {},
"cell_type": "heading",
"source": "Features and Classification",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#functions for getting features\ndef check_for_num(doc):\n    if len([letter for letter in ' '.join(doc)\n            if letter in ['1','2','3','4','5','6','7','8','9','0']]) > 0:\n        return True\n    else:\n        return False\n    \ndef check_for_long_words(doc):\n    for word in doc:\n        if len(word) > 4:\n            return True\n    return False\n\ndef pos_count(tag, tag_list):\n    count = 0\n    for t in tag_list:\n        if tag in t:\n            count+=1\n    return count\n\ndef pos_repeat(tag, tag_list):\n    for n in xrange(len(tag_list)-1):\n        if tag in tag_list[n] and tag in tag_list[n+1]:\n            return True\n    return False\n\ndef pos_count_binary(tag, tag_list):\n    for t in tag_list:\n        if tag in t:\n            return True\n    return False\n\ndef avg_word_len(document_words):\n    count = 0\n    word_len = 0\n    for word in document_words:\n        word_len += len(word)\n        count += 1\n    return word_len/float(count)\n\ndef get_hypernym(term):\n    if len(term[1]) == 0:\n        return False\n    if term[1][0] == 'V':\n        POS = wn.VERB\n    elif term[1][0] == 'J':\n        POS = wn.ADJ\n    elif term[1][0] == 'R':\n        POS = wn.ADV\n    else:\n        POS = wn.NOUN\n    # get its nominal synsets\n    s = wn.synsets(term[0].lower(), POS)\n    \n    for syn in s:\n        if len(syn.hypernyms()) == 0:\n            continue\n        elif len(syn.hypernyms()[0].hypernyms()) != 0:\n            if len(syn.hypernyms()[0].hypernyms()[0].hypernyms()) != 0:\n                return str(syn.hypernyms()[0].hypernyms()[0].hypernyms()[0].name())\n            return False #str(syn.hypernyms()[0].hypernyms()[0].name())\n        else: \n            return False #str(syn.hypernyms()[0].name())\n    return False\n    #for hyp in syn.hypernyms():\n        ",
"prompt_number": 8,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "tag_group_list = ['NN', 'JJ', 'PRP', 'CD', 'RB', 'VB']",
"prompt_number": 9,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#add other features to feature weight dict\nfor pos in tag_group_list:\n    tfidf_dict[pos] = 1.5\ntfidf_dict['$'] = 1.5\ntfidf_dict['??'] = 1.5",
"prompt_number": 10,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#convert weights to a 0-1 scale\n#http://stackoverflow.com/questions/929103/convert-a-number-range-to-another-range-maintaining-ratio\noldmax = max(tfidf_dict.values())\noldmin = min(tfidf_dict.values())\noldrange = (oldmax-oldmin)\nnewmin = 0.001\nnewmax = 1\nnewrange = (newmax - newmin)\n\nfor key, value in tfidf_dict.items():\n    tfidf_dict[key] = (((value - oldmin) * newrange) / oldrange) + newmin",
"prompt_number": 11,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
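{
"metadata": {},
"cell_type": "markdown",
"source": "A quick illustrative check, not part of the original run, that the min-max rescaling mapped the feature weights onto [0.001, 1]:"
},
{
"metadata": {},
"cell_type": "code",
"input": "#illustrative: rescaled weights should now span exactly [newmin, newmax]\nprint min(tfidf_dict.values()), max(tfidf_dict.values())",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},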
{
"metadata": {},
"cell_type": "code",
"input": "#add important words to tfidf\n# informative_words = pickle.open_object('informative_words_features.pkl')\n# all_imp_words = informative_words + tfidf_word_list",
"prompt_number": 12,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.linear_model import SGDClassifier\nfrom nltk.classify import SklearnClassifier\nfrom sklearn.svm import SVC\n\n#main classifier\ndef category_features3(doc):\n    features = {}\n    document_words = [word[0] for word in list(set(flatten.flatten(doc)))]\n    document_tags = [word[1] for word in list(set(flatten.flatten(doc)))]\n    #document_all = [word for word in list(set(flatten.flatten(doc)))]\n    \n#     document_bigrams = [' '.join(bi) for bi in list(nltk.bigrams(document_words))]\n#     document_all= document_words + document_bigrams\n    \n    for word in tfidf_word_list:\n        features['%s' % word] = (word in document_words) # contains(%s)\n    for tag in tag_group_list:\n        features['%s' % tag] = pos_count(tag, document_tags) #%s count\n    features['$'] = '$' in ' '.join(document_words)\n    features['??'] = '??' in ' '.join(document_words)\n    features['contains_num'] = check_for_num(document_words)\n    features['contains_long_word'] = check_for_long_words(document_words)\n    features['avg_word_leng'] = avg_word_len(document_words)\n\n    \n#     for term in document_all:\n#         features['hypernym'] = get_hypernym(term)    \n#     features['first_word'] = document_words[0]\n#     features['last_word'] = document_words[-1]\n#     for tag in tag_group_list:\n#         features['%s count' % tag] = pos_repeat(tag, document_tags)\n\n\n    return features\n\ndef create_training_sets (feature_function, items):\n    # Create the feature sets by calling the function that was passed in,\n    # then hold out the first tenth of the items as the test set.\n    featuresets = [(feature_function(key), value) for (key, value) in items]\n    \n    cutoff = len(featuresets) // 10\n    train_features, test_features = featuresets[cutoff:], featuresets[:cutoff]\n    train_items, test_items = items[cutoff:], items[:cutoff]\n    return train_features, test_features, train_items, test_items\n\ntrain_features, test_features, train_items, test_items = create_training_sets(category_features3, cat_data_tag)",
"prompt_number": 13,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
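{
"metadata": {},
"cell_type": "markdown",
"source": "Illustrative, not in the original run: inspect one training item's label and a few of its feature names to confirm the feature extraction looks sane."
},
{
"metadata": {},
"cell_type": "code",
"input": "#illustrative: one feature dict and its category label\nexample_feats, example_label = train_features[0]\nprint example_label, len(example_feats), sorted(example_feats.keys())[:5]",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},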
{
"metadata": {},
"cell_type": "markdown",
"source": "Testing different algorithms"
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.linear_model import SGDClassifier\nfrom nltk.classify import SklearnClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.svm import LinearSVC\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.linear_model import RidgeClassifierCV\nfrom sklearn.linear_model import RidgeCV\nfrom sklearn.multiclass import OneVsRestClassifier",
"prompt_number": 14,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#got ideas from here: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html\n#svc = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5)\n#cl4 = svc.fit(train_features, sparse=False)\n#cl4 = SklearnClassifier(SGDClassifier(loss='hinge', penalty='l1', alpha=1e-3, n_iter=5), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(LinearSVC(loss='l2', penalty='l1', dual=False, tol=1e-3), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(Perceptron(n_iter=50), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(RidgeClassifier(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, solver='auto', tol=0.001), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, scoring=None, normalize=False), sparse=False).train(train_features)\n#cl4 = nltk.NaiveBayesClassifier.train(train_features)\n#cl4 = SklearnClassifier(OneVsRestClassifier(SGDClassifier(class_weight='auto',loss='hinge', penalty='l1', alpha=1e-3, n_iter=5)), sparse=False).train(train_features)\n\n#BEST\n#cl4 = SklearnClassifier(RidgeClassifier(tol=1e-2, solver=\"lsqr\"), sparse=False).train(train_features)\ncl4 = SklearnClassifier(RidgeClassifierCV(), sparse=False).train(train_features) # normalize=True\nprint \"%.3f\" % nltk.classify.accuracy(cl4, test_features)",
"prompt_number": 15,
"outputs": [
{
"output_type": "stream",
"text": "0.528\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#accuracy from previous runs: 0.520, 0.506, 0.498",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "cl4.show_most_informative_features(20)",
"prompt_number": 129,
"outputs": [
{
"output_type": "stream",
"text": "Most Informative Features\n            hypernym = \"Synset('feeling.n.01')\"     4 : 1      =     15.5 : 1.0\n            hypernym = \"Synset('creation.n.02')\"     3 : 1      =      8.3 : 1.0\n            hypernym = \"Synset('change.v.02')\"     6 : 3      =      6.4 : 1.0\n            hypernym = \"Synset('event.n.01')\"     3 : 1      =      6.1 : 1.0\n            hypernym = \"Synset('think.v.03')\"     5 : 1      =      5.8 : 1.0\n            hypernym = \"Synset('condition.n.01')\"     6 : 1      =      5.7 : 1.0\n            hypernym = 'change.v.01'              1 : 3      =      5.0 : 1.0\n            hypernym = \"Synset('food.n.01')\"      2 : 1      =      4.7 : 1.0\n            hypernym = \"Synset('causal_agent.n.01')\"     4 : 2      =      4.3 : 1.0\n            hypernym = \"Synset('measure.n.02')\"     3 : 5      =      4.2 : 1.0\n            hypernym = \"Synset('make.v.03')\"      5 : 1      =      4.1 : 1.0\n            hypernym = \"Synset('organ.n.01')\"     5 : 1      =      4.1 : 1.0\n            hypernym = \"Synset('discipline.n.01')\"     3 : 2      =      3.9 : 1.0\n            hypernym = \"Synset('integer.n.01')\"     2 : 5      =      3.7 : 1.0\n            hypernym = \"Synset('abstraction.n.06')\"     7 : 1      =      3.4 : 1.0\n            hypernym = 'be.v.01'                  4 : 1      =      3.4 : 1.0\n            hypernym = \"Synset('join.v.01')\"      4 : 1      =      3.1 : 1.0\n            hypernym = \"Synset('give.v.03')\"      4 : 1      =      3.1 : 1.0\n            hypernym = \"Synset('mean.v.01')\"      1 : 2      =      2.9 : 1.0\n            hypernym = \"Synset('compass_point.n.01')\"     3 : 1      =      2.8 : 1.0\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Test a group of classifiers"
},
{
"metadata": {},
"cell_type": "code",
"input": "#got ideas from here: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html\nfor clf, name in (\n        (SklearnClassifier(RidgeClassifier(tol=1e-2, solver=\"lsqr\"), sparse=False).train(train_features), \"Ridge Classifier\"),\n        (SklearnClassifier(Perceptron(n_iter=50), sparse=False).train(train_features), \"Perceptron\"),\n        (SklearnClassifier(PassiveAggressiveClassifier(n_iter=50), sparse=False).train(train_features), \"Passive-Aggressive\"),\n        (SklearnClassifier(KNeighborsClassifier(n_neighbors=10), sparse=False).train(train_features), \"kNN\"),\n        (SklearnClassifier(NearestCentroid(), sparse=False).train(train_features), \"Centroid\")):\n    print name, \"%.3f\" % nltk.classify.accuracy(clf, test_features)",
"prompt_number": 31,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Ridge Classifier "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.502\nPerceptron "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.420\nPassive-Aggressive "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.435\nkNN "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.316\nCentroid "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.175\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#cross validation for testing and making sure not overfitting\nfrom sklearn import cross_validation\ncv = cross_validation.KFold(len(train_features), n_folds=10, indices=True, shuffle=False, random_state=None)\n\nfor traincv, evalcv in cv:\n    #index by the fold's actual indices; a contiguous slice of traincv\n    #would leak the held-out fold into the training data\n    fold_train = [train_features[i] for i in traincv]\n    fold_eval = [train_features[i] for i in evalcv]\n    classifier = nltk.NaiveBayesClassifier.train(fold_train)\n    #classifier = SklearnClassifier(RidgeClassifier(tol=1e-2, solver=\"lsqr\"), sparse=False).train(fold_train)\n    print 'accuracy: %.3f' % nltk.classify.util.accuracy(classifier, fold_eval)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Running real test for turn in",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#normalizing ascii characters\ndef norm_unicode(text):\n    '''this function takes in a list of strings, and \n    normalizes each word in each string from unicode\n    characters to equivalent (or closest) ascii \n    characters'''\n    text_ascii = []\n    for doc in text:\n        re_combine = []\n        for word in doc.split():\n            word = unicodedata.normalize('NFKD', word).encode('ascii','ignore')\n            re_combine.append(word)\n        text_ascii.append(' '.join(re_combine))\n    return text_ascii\n\n#sentence tokenizer\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\ndef sent_token(text):\n    sentence_groups = []\n    for sent_group in text:\n        sentence_groups.append(sent_tokenizer.tokenize(sent_group))\n    return sentence_groups\n\n#pattern for tokenizing words\npattern = r'''(?x)    # set flag to allow verbose regexps\n      ([A-Z]\\.)+        # abbreviations, e.g. U.S.A\n    | \\w+([-‘]\\w+)*        # words with optional internal hyphens\n    | \\$?\\d+(\\.\\d+)?%?  # currency and percentages, e.g. $12.40, 82%\n    | \\.\\.\\.            # ellipsis... \n    | [][.,;\"'?():\\-_`]+  # these are separate tokens\n    '''\n#tokenize all the words in the documents\ndef doc_token(text):\n    result = []\n    for doc in text:\n        doc_text = []\n        for sent in doc:\n            doc_text.append(nltk.regexp_tokenize(sent, pattern))\n        result.append(doc_text)\n    return result\n\n#function for tagging text\ndef doc_tagger_pos(text):\n    result = []\n    for doc in text:\n        doc_text = []\n        for sent in doc:\n            doc_text.append(nltk.pos_tag(sent))\n        result.append(doc_text)\n    return result\n\n#import yahoo test data\nf = codecs.open('../Data/yahoo_test.csv',\n                encoding=\"utf-8\")\ntest_text = f.readlines()\n\n#normalize ascii\ntest_text = norm_unicode(test_text)\n\n#tokenize sentences\ntest_text_sents = sent_token(test_text)\n\n#tokenize all the words in the documents\ntest_text_docs_token = doc_token(test_text_sents)\n\n#remove first line\ntest_text_docs_token = test_text_docs_token[1:]\n\n#split out numbering\nindex_list = []\nfor ix, doc in enumerate(test_text_docs_token):\n    index_list.append(doc[0][0])\n    test_text_docs_token[ix][0] = doc[0][2:]\n\n#tag test data\ntest_text_docs_tagged = doc_tagger_pos(test_text_docs_token)\n\n#Save tagged data\npickle.save_object(test_text_docs_tagged,\n                   'yahoo_test_corpus_pos_tagged.pkl')",
"prompt_number": 16,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#load tagged data\ntest_text_docs_tagged = pickle.open_object('yahoo_test_corpus_pos_tagged.pkl')",
"prompt_number": 17,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#create the training and test sets\ndef create_training_sets_test (feature_function, train, test):\n    # Create the feature sets. Call the function that was passed in.\n    featuresets_train = [(feature_function(key), value) for (key, value) in train]\n    featuresets_test = [feature_function(key) for key in test]\n    \n    train_features, test_features = featuresets_train, featuresets_test\n    train_items, test_items = train, test\n    return train_features, test_features, train_items, test_items",
"prompt_number": 18,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "train_features, test_features, train_items, test_items = create_training_sets_test(category_features3,\n                                                                                   cat_data_tag, test_text_docs_tagged)\ncl4 = SklearnClassifier(RidgeClassifierCV(), sparse=False).train(train_features)",
"prompt_number": 19,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def classify_test(test):\n    result = []\n    for doc in test:\n        result.append(cl4.classify(category_features3(doc)))\n    return result",
"prompt_number": 20,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "result = classify_test(test_text_docs_tagged)\n#create file for turn in\ndf_turnIn = pd.DataFrame([map(int,index_list),map(int,result)], index=['Id','Category']).T\ndf_turnIn.to_csv('../Data/turn_in4.csv', index=False)",
"prompt_number": 21,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Pipeline",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.feature_extraction.text import TfidfTransformer\ntext_pipeline = Pipeline([('tfidf', TfidfTransformer()),\n                          ('chi2', SelectKBest(chi2, k=2000)),\n                          ('RC', RidgeClassifierCV())])\npipecl = SklearnClassifier(text_pipeline)",
"prompt_number": 168,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pipecl.train(train_features)",
"prompt_number": 169,
"outputs": [
{
"text": "<SklearnClassifier(Pipeline(steps=[('tfidf', TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,\n         use_idf=True)), ('chi2', SelectKBest(k=2000, score_func=<function chi2 at 0x1149d88c0>)), ('RC', RidgeClassifierCV(alphas=array([  0.1,   1. ,  10. ]), class_weight=None,\n          cv=None, fit_intercept=True, loss_func=None, normalize=False,\n          score_func=None, scoring=None))]))>",
"output_type": "pyout",
"metadata": {},
"prompt_number": 169
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print \"%.3f\" % nltk.classify.accuracy(pipecl, test_features)",
"prompt_number": 170,
"outputs": [
{
"output_type": "stream",
"text": "0.487\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Averaging algorithm",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#use to combine classifiers\n#http://stackoverflow.com/questions/21506128/best-way-to-combine-probabilistic-classifiers-in-scikit-learn\nimport numpy as np\nfrom sklearn.base import BaseEstimator, ClassifierMixin\n\nclass EnsembleClassifier(BaseEstimator, ClassifierMixin):\n    def __init__(self, classifiers=None):\n        self.classifiers = classifiers\n\n    def fit(self, X, y):\n        for classifier in self.classifiers:\n            classifier.fit(X, y)\n        return self\n\n    def predict_proba(self, X):\n        #average the predicted class probabilities across classifiers\n        self.predictions_ = list()\n        for classifier in self.classifiers:\n            self.predictions_.append(classifier.predict_proba(X))\n        return np.mean(self.predictions_, axis=0)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
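{
"metadata": {},
"cell_type": "markdown",
"source": "A hedged usage sketch for `EnsembleClassifier`: the two wrapped estimators are chosen here purely for illustration (any estimators with `predict_proba` would do), and the commented `X_train`/`y_train` names are placeholders, not variables from the original notebook."
},
{
"metadata": {},
"cell_type": "code",
"input": "#illustrative usage: average probabilities from two probabilistic classifiers\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.naive_bayes import MultinomialNB\n\nensemble = EnsembleClassifier(classifiers=[LogisticRegression(), MultinomialNB()])\n#ensemble.fit(X_train, y_train)\n#avg_proba = ensemble.predict_proba(X_test)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},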
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:fab0e8662ab94d59f4c3bcc4f00450ace910c36206daadf66df28b00d74a8da3"
},
"nbformat": 3
} |