{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import nltk\nimport codecs\nimport unicodedata\nimport re\nfrom copy import deepcopy\nfrom pyUtil import easyPickle as pickle\nfrom pyUtil import flattenList as flatten\nimport random\nimport pandas as pd\nfrom nltk.corpus import wordnet as wn",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#read in file\nf = codecs.open('../Data/yahoo_train.txt',\n encoding=\"utf-8\")\ntrain_text = f.readlines()\n\n#normalize unicode\ndef norm_unicode(text):\n '''this function takes in a list of strings, and \n normalizes each word in each string from unicode\n characters to equivalent (or closest) ascii \n characters'''\n text_ascii = []\n for doc in text:\n re_combine = []\n for word in doc.split():\n word = unicodedata.normalize('NFKD', word).encode('ascii','ignore')\n re_combine.append(word)\n text_ascii.append(' '.join(re_combine))\n return text_ascii\n\ntrain_text = norm_unicode(train_text)\n\n#tokenize sentences\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\n\ndef sent_token(text):\n sentence_groups = []\n for sent_group in text:\n sentence_groups.append(sent_tokenizer.tokenize(sent_group))\n return sentence_groups\n\ntrain_text_sents = sent_token(train_text)\n\n#tokenize words\n#patter for tokenizing\npattern = r'''(?x) # set flag to allow verbose regexps\n ([A-Z]\\.)+ # abbreviations, e.g. U.S.A\n | \\w+([-‘]\\w+)* # words with optional internal hyphens\n | \\$?\\d+(\\.\\d+)?%? # currency and percentages, e.g. $12.40, 82%\n | \\.\\.\\. # ellipsis... \n | [][.,;\"'?():\\-_`]+ # these are separate tokens\n '''\n\ndef doc_token(text):\n result = []\n for doc in text:\n doc_text = []\n for sent in doc:\n doc_text.append(nltk.regexp_tokenize(sent, pattern))\n result.append(doc_text)\n return result\n\ntrain_text_docs_token = doc_token(train_text_sents)",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
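{
"metadata": {},
"cell_type": "markdown",
"source": "A quick sanity check of the preprocessing helpers above on a made-up string (the sample text is hypothetical; assumes the previous cell has been run)."
},
{
"metadata": {},
"cell_type": "code",
"input": "#toy example: unicode normalization, then sentence and word tokenization\nsample = [u'What\\u2019s the best caf\\u00e9 in the U.S.A.? It costs $12.40.']\nprint norm_unicode(sample)\nprint doc_token(sent_token(norm_unicode(sample)))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},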
{
"metadata": {},
"cell_type": "heading",
"source": "Load tagged data",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "train_text_docs_tagged = pickle.open_object('yahoo_train_corpus_pos_tagged.pkl')",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Lematize (stem) data - didn't work",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "# from nltk.stem.wordnet import WordNetLemmatizer as lemma\n# lemm = lemma()\n\n# def lem(doc):\n# new_doc = []\n# for sent in doc:\n# new_sent = []\n# for word in sent:\n# new_sent.append(lemm.lemmatize(word))\n# new_doc.append(new_sent)\n# return new_doc\n\n# #for tagged data\n# def lem(doc):\n# new_doc = []\n# for sent in doc:\n# new_sent = []\n# for word in sent:\n# new_sent.append((lemm.lemmatize(word[0]), word[1]))\n# new_doc.append(new_sent)\n# return new_doc",
"prompt_number": 111,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# train_text_docs_tagged_lem = []\n\n# for doc in train_text_docs_tagged:\n# train_text_docs_tagged_lem.append(lem(doc))",
"prompt_number": 112,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# train_text_docs_token_lem = []\n\n# for doc in train_text_docs_token:\n# train_text_docs_token_lem.append(lem(doc))",
"prompt_number": 90,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Create Training Set",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#training set for tagged data\ndef get_doc_tuple_tag(text, num):\n result = []\n for doc in text:\n if doc[0][0][0] == str(num):\n doc[0][0] = ('', '')\n result.append((doc, str(num)))\n return result\n\ndef createDataSet_tag(text):\n cat_1 = get_doc_tuple_tag(text, 1)\n cat_2 = get_doc_tuple_tag(text, 2)\n cat_3 = get_doc_tuple_tag(text, 3)\n cat_4 = get_doc_tuple_tag(text, 4)\n cat_5 = get_doc_tuple_tag(text, 5)\n cat_6 = get_doc_tuple_tag(text, 6)\n cat_7 = get_doc_tuple_tag(text, 7)\n all_cats = cat_1+cat_2+cat_3+cat_4+cat_5+cat_6+cat_7\n \n #shuffle to make sure random\n random.shuffle(all_cats)\n return all_cats\n\ntrain_text_docs_token_c = deepcopy(train_text_docs_tagged)\ncat_data_tag = createDataSet_tag(train_text_docs_token_c)",
"prompt_number": 4,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#training set for untagged data\ndef get_doc_tuple(text, num):\n result = []\n for doc in text:\n if doc[0][0] == str(num):\n doc[0][0] = ''\n result.append((doc, str(num)))\n return result\n\ndef createDataSet(text):\n cat_1 = get_doc_tuple(text, 1)\n cat_2 = get_doc_tuple(text, 2)\n cat_3 = get_doc_tuple(text, 3)\n cat_4 = get_doc_tuple(text, 4)\n cat_5 = get_doc_tuple(text, 5)\n cat_6 = get_doc_tuple(text, 6)\n cat_7 = get_doc_tuple(text, 7)\n all_cats = cat_1+cat_2+cat_3+cat_4+cat_5+cat_6+cat_7\n \n #shuffle to make sure random\n random.shuffle(all_cats)\n return all_cats\n\ntrain_text_docs_token_c = deepcopy(train_text_docs_token)\ncat_data = createDataSet(train_text_docs_token_c)",
"prompt_number": 5,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "TF-IDF",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#get tfidf data\ndef get_doc_tuple_tf(text, num):\n result = []\n for doc in text:\n if doc[0][0] == str(num):\n doc[0][0] = ''\n result.append(doc)\n return result\n\ndef createTFIDFDataSet(text):\n cat_1 = get_doc_tuple_tf(text, 1)\n cat_2 = get_doc_tuple_tf(text, 2)\n cat_3 = get_doc_tuple_tf(text, 3)\n cat_4 = get_doc_tuple_tf(text, 4)\n cat_5 = get_doc_tuple_tf(text, 5)\n cat_6 = get_doc_tuple_tf(text, 6)\n cat_7 = get_doc_tuple_tf(text, 7)\n all_tfidf_cats = [cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7]\n #flatten each document\n flat_docs = []\n for doc in all_tfidf_cats:\n flat_docs.append(' '.join(flatten.flatten(flatten.flatten(doc))))\n \n \n return flat_docs",
"prompt_number": 6,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "train_text_docs_token_copy = deepcopy(train_text_docs_token)\ntfidf_text = createTFIDFDataSet(train_text_docs_token_copy)\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nvectorizer = TfidfVectorizer(min_df=2, stop_words='english') #, ngram_range=(1,3)\nX = vectorizer.fit_transform(tfidf_text)\nidf = vectorizer._tfidf.idf_\ntfidf_dict = dict(zip(vectorizer.get_feature_names(), idf))\ntfidf_list = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)\ntfidf_word_list = [word[0] for word in tfidf_list]",
"prompt_number": 7,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
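{
"metadata": {},
"cell_type": "markdown",
"source": "Optional sanity check on the TF-IDF weights built above: `tfidf_list` is sorted by descending IDF, so the head holds the rarest, most category-specific terms and the tail the terms shared across categories (assumes the previous cell has been run)."
},
{
"metadata": {},
"cell_type": "code",
"input": "#inspect the highest- and lowest-weighted terms from the cell above\nprint 'vocabulary size:', len(tfidf_dict)\nprint 'highest idf:', tfidf_list[:10]\nprint 'lowest idf:', tfidf_list[-10:]",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},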
{
"metadata": {},
"cell_type": "heading",
"source": "Features and Classification",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#functions for getting features\ndef check_for_num(doc):\n if len([letter for letter in ' '.join(doc)\n if letter in ['1','2','3','4','5','6','7','8','9','0']]) > 0:\n return True\n else:\n return False\n \ndef check_for_long_words(doc):\n for word in doc:\n if len(word) > 4:\n return True\n return False\n\ndef pos_count(tag, tag_list):\n count = 0\n for t in tag_list:\n if tag in t:\n count+=1\n return count\n\ndef pos_repeat(tag, tag_list):\n for n in xrange(len(tag_list)-1):\n if tag in tag_list[n] and tag in tag_list[n+1]:\n return True\n return False\n\ndef pos_count_binary(tag, tag_list):\n for t in tag_list:\n if tag in t:\n return True\n return False\n\ndef avg_word_len(document_words):\n count = 0\n word_len = 0\n for word in document_words:\n word_len += len(word)\n count += 1\n return word_len/float(count)\n\ndef get_hypernym(term):\n if len(term[1]) == 0:\n return False\n if term[1][0] == 'V':\n POS = wn.VERB\n elif term[1][0] == 'J':\n POS = wn.ADJ\n elif term[1][0] == 'R':\n POS = wn.ADV\n else:\n POS = wn.NOUN\n # get its nominal synsets\n s = wn.synsets(term[0].lower(), POS)\n \n for syn in s:\n if len(syn.hypernyms()) == 0:\n continue\n elif len(syn.hypernyms()[0].hypernyms()) != 0:\n if len(syn.hypernyms()[0].hypernyms()[0].hypernyms()) != 0:\n return str(syn.hypernyms()[0].hypernyms()[0].hypernyms()[0].name())\n return False #str(syn.hypernyms()[0].hypernyms()[0].name())\n else: \n return False #str(syn.hypernyms()[0].name())\n return False\n #for hyp in syn.hypernyms():\n ",
"prompt_number": 8,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
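{
"metadata": {},
"cell_type": "markdown",
"source": "A quick, illustrative check of the feature helpers defined above on made-up inputs (the sample words, tags, and the (dog, NN) pair are hypothetical examples, not data from the corpus; assumes the previous cells have been run and the WordNet corpus is installed)."
},
{
"metadata": {},
"cell_type": "code",
"input": "#exercise a few of the helper functions on toy inputs\nsample_words = ['what', 'is', 'the', 'cheapest', 'flight', 'to', 'NYC', 'in', '2014', '?']\nsample_tags = ['WP', 'VBZ', 'DT', 'JJS', 'NN', 'TO', 'NNP', 'IN', 'CD', '.']\nprint 'contains number:', check_for_num(sample_words)\nprint 'has long word:', check_for_long_words(sample_words)\nprint 'NN count:', pos_count('NN', sample_tags)\nprint 'avg word length: %.2f' % avg_word_len(sample_words)\nprint 'hypernym of (dog, NN):', get_hypernym(('dog', 'NN'))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},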
{
"metadata": {},
"cell_type": "code",
"input": "tag_group_list = ['NN', 'JJ', 'PRP', 'CD', 'RB', 'VB']",
"prompt_number": 9,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#add other features to feature weight dict\nfor pos in tag_group_list:\n tfidf_dict[pos] = 1.5\ntfidf_dict['$'] = 1.5\ntfidf_dict['??'] = 1.5",
"prompt_number": 10,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#convert weights to a 0-1 scale\n#http://stackoverflow.com/questions/929103/convert-a-number-range-to-another-range-maintaining-ratio\noldmax = max(tfidf_dict.values())\noldmin = min(tfidf_dict.values())\noldrange = (oldmax-oldmin)\nnewmin = 0.001\nnewmax = 1\nnewrange = (newmax - newmin)\n\nfor key, value in tfidf_dict.items():\n tfidf_dict[key] = (((value - oldmin) * newrange) / oldrange) + newmin",
"prompt_number": 11,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
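{
"metadata": {},
"cell_type": "markdown",
"source": "Optional check (not in the original flow) that the rescaled feature weights now span [newmin, newmax] after the min-max mapping above."
},
{
"metadata": {},
"cell_type": "code",
"input": "#verify the rescaled weight range\nprint 'min weight: %.3f' % min(tfidf_dict.values())\nprint 'max weight: %.3f' % max(tfidf_dict.values())",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},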
{
"metadata": {},
"cell_type": "code",
"input": "#add important words to tfidf\n# informative_words = pickle.open_object('informative_words_features.pkl')\n# all_imp_words = informative_words + tfidf_word_list",
"prompt_number": 12,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.linear_model import SGDClassifier\nfrom nltk.classify import SklearnClassifier\nfrom sklearn.svm import SVC\n\n#main classifier\ndef category_features3(doc):\n features = {}\n document_words = [word[0] for word in list(set(flatten.flatten(doc)))]\n document_tags = [word[1] for word in list(set(flatten.flatten(doc)))]\n #document_all = [word for word in list(set(flatten.flatten(doc)))]\n \n# document_bigrams = [' '.join(bi) for bi in list(nltk.bigrams(document_words))]\n# document_all= document_words + document_bigrams\n \n for word in tfidf_word_list:\n features['%s' % word] = (word in document_words) # contains(%s)\n for tag in tag_group_list:\n features['%s' % tag] = pos_count(tag, document_tags) #%s count\n features['$'] = '$' in ' '.join(document_words)\n features['??'] = '??' in ' '.join(document_words)\n features['contains_num'] = check_for_num(document_words)\n features['contains_long_word'] = check_for_long_words(document_words)\n features['avg_word_leng'] = avg_word_len(document_words)\n\n \n# for term in document_all:\n# features['hypernym'] = get_hypernym(term) \n# features['first_word'] = document_words[0]\n# features['last_word'] = document_words[-1]\n# for tag in tag_group_list:\n# features['%s count' % tag] = pos_repeat(tag, document_tags)\n\n\n return features\n\ndef create_training_sets (feature_function, items):\n # Create the features sets. Call the function that was passed in.\n featuresets = [(feature_function(key), value) for (key, value) in items]\n \n halfsize = int(float(len(featuresets)) / 10.0)\n train_features, test_features = featuresets[halfsize:], featuresets[:halfsize]\n train_items, test_items = items[halfsize:], items[:halfsize]\n return train_features, test_features, train_items, test_items\n\ntrain_features, test_features, train_items, test_items = create_training_sets(category_features3, cat_data_tag)",
"prompt_number": 13,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
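{
"metadata": {},
"cell_type": "markdown",
"source": "Peek at the feature dictionary produced for a single tagged document, as a quick sanity check on category_features3 (assumes the cells above have been run; the keys listed are just the non-TF-IDF features defined there)."
},
{
"metadata": {},
"cell_type": "code",
"input": "#inspect the features built for one document\nsample_doc, sample_label = cat_data_tag[0]\nsample_feats = category_features3(sample_doc)\nprint 'label:', sample_label\nprint 'total features:', len(sample_feats)\nfor k in ['$', '??', 'contains_num', 'contains_long_word', 'avg_word_leng']:\n    print k, '=', sample_feats[k]",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},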
{
"metadata": {},
"cell_type": "markdown",
"source": "Testing different algorithms"
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.linear_model import SGDClassifier\nfrom nltk.classify import SklearnClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.svm import LinearSVC\nfrom sklearn.neighbors import KNeighborsClassifier\nfrom sklearn.neighbors import NearestCentroid\nfrom sklearn.linear_model import Perceptron\nfrom sklearn.linear_model import PassiveAggressiveClassifier\nfrom sklearn.linear_model import RidgeClassifier\nfrom sklearn.linear_model import RidgeClassifierCV\nfrom sklearn.linear_model import RidgeCV\nfrom sklearn.multiclass import OneVsRestClassifier",
"prompt_number": 14,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#got ideas from here: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html\n#svc = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5)\n#cl4 = svc.fit(train_features, sparse=False)\n#cl4 = SklearnClassifier(SGDClassifier(loss='hinge', penalty='l1', alpha=1e-3, n_iter=5), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(LinearSVC(loss='l2', penalty='l1', dual=False, tol=1e-3), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(Perceptron(n_iter=50), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(RidgeClassifier(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, solver='auto', tol=0.001), sparse=False).train(train_features)\n#cl4 = SklearnClassifier(RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, scoring=None, normalize=False), sparse=False).train(train_features)\n#cl4 = nltk.NaiveBayesClassifier.train(train_features)\n#cl4 = SklearnClassifier(OneVsRestClassifier(SGDClassifier(class_weight='auto',loss='hinge', penalty='l1', alpha=1e-3, n_iter=5)), sparse=False).train(train_features)\n\n#BEST\n#cl4 = SklearnClassifier(RidgeClassifier(tol=1e-2, solver=\"lsqr\"), sparse=False).train(train_features)\ncl4 = SklearnClassifier(RidgeClassifierCV(), sparse=False).train(train_features) # normalize=True\nprint \"%.3f\" % nltk.classify.accuracy(cl4, test_features)",
"prompt_number": 15,
"outputs": [
{
"output_type": "stream",
"text": "0.528\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "0.520 0.506 .498",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "cl4.show_most_informative_features(20)",
"prompt_number": 129,
"outputs": [
{
"output_type": "stream",
"text": "Most Informative Features\n hypernym = \"Synset('feeling.n.01')\" 4 : 1 = 15.5 : 1.0\n hypernym = \"Synset('creation.n.02')\" 3 : 1 = 8.3 : 1.0\n hypernym = \"Synset('change.v.02')\" 6 : 3 = 6.4 : 1.0\n hypernym = \"Synset('event.n.01')\" 3 : 1 = 6.1 : 1.0\n hypernym = \"Synset('think.v.03')\" 5 : 1 = 5.8 : 1.0\n hypernym = \"Synset('condition.n.01')\" 6 : 1 = 5.7 : 1.0\n hypernym = 'change.v.01' 1 : 3 = 5.0 : 1.0\n hypernym = \"Synset('food.n.01')\" 2 : 1 = 4.7 : 1.0\n hypernym = \"Synset('causal_agent.n.01')\" 4 : 2 = 4.3 : 1.0\n hypernym = \"Synset('measure.n.02')\" 3 : 5 = 4.2 : 1.0\n hypernym = \"Synset('make.v.03')\" 5 : 1 = 4.1 : 1.0\n hypernym = \"Synset('organ.n.01')\" 5 : 1 = 4.1 : 1.0\n hypernym = \"Synset('discipline.n.01')\" 3 : 2 = 3.9 : 1.0\n hypernym = \"Synset('integer.n.01')\" 2 : 5 = 3.7 : 1.0\n hypernym = \"Synset('abstraction.n.06')\" 7 : 1 = 3.4 : 1.0\n hypernym = 'be.v.01' 4 : 1 = 3.4 : 1.0\n hypernym = \"Synset('join.v.01')\" 4 : 1 = 3.1 : 1.0\n hypernym = \"Synset('give.v.03')\" 4 : 1 = 3.1 : 1.0\n hypernym = \"Synset('mean.v.01')\" 1 : 2 = 2.9 : 1.0\n hypernym = \"Synset('compass_point.n.01')\" 3 : 1 = 2.8 : 1.0\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Test a group of classifiers"
},
{
"metadata": {},
"cell_type": "code",
"input": "#got ideas from here: http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html\nfor clf, name in (\n (SklearnClassifier(RidgeClassifier(tol=1e-2, solver=\"lsqr\"), sparse=False).train(train_features), \"Ridge Classifier\"),\n (SklearnClassifier(Perceptron(n_iter=50), sparse=False).train(train_features), \"Perceptron\"),\n (SklearnClassifier(PassiveAggressiveClassifier(n_iter=50), sparse=False).train(train_features), \"Passive-Aggressive\"),\n (SklearnClassifier(KNeighborsClassifier(n_neighbors=10), sparse=False).train(train_features), \"kNN\"),\n (SklearnClassifier(NearestCentroid(), sparse=False).train(train_features), \"Centriod\")):\n print name, \"%.3f\" % nltk.classify.accuracy(clf, test_features)",
"prompt_number": 31,
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Ridge Classifier "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.502\nPerceptron "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.420\nPassive-Aggressive "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.435\nkNN "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.316\nCentriod "
},
{
"output_type": "stream",
"stream": "stdout",
"text": "0.175\n"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#cross validation for testing and making sure not overfitting\nfrom sklearn import cross_validation\ncv = cross_validation.KFold(len(train_features), n_folds=10, indices=True, shuffle=False, random_state=None)\n\nfor traincv, evalcv in cv:\n classifier = nltk.NaiveBayesClassifier.train(train_features[traincv[0]:traincv[-1]])\n #classifier = SklearnClassifier(RidgeClassifier(tol=1e-2, solver=\"lsqr\"), sparse=False).train(train_features[traincv[0]:traincv[-1]])\n print 'accuracy: %.3f' % nltk.classify.util.accuracy(classifier, train_features[evalcv[0]:evalcv[-1]])",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Running real test for turn in",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#normalizing ascii chartacters\ndef norm_unicode(text):\n '''this function takes in a list of strings, and \n normalizes each word in each string from unicode\n characters to equivalent (or closest) ascii \n characters'''\n text_ascii = []\n for doc in text:\n re_combine = []\n for word in doc.split():\n word = unicodedata.normalize('NFKD', word).encode('ascii','ignore')\n re_combine.append(word)\n text_ascii.append(' '.join(re_combine))\n return text_ascii\n\n#sentence tokenizer\nsent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')\ndef sent_token(text):\n sentence_groups = []\n for sent_group in text:\n sentence_groups.append(sent_tokenizer.tokenize(sent_group))\n return sentence_groups\n\n#patter for tokenizing words\npattern = r'''(?x) # set flag to allow verbose regexps\n ([A-Z]\\.)+ # abbreviations, e.g. U.S.A\n | \\w+([-‘]\\w+)* # words with optional internal hyphens\n | \\$?\\d+(\\.\\d+)?%? # currency and percentages, e.g. $12.40, 82%\n | \\.\\.\\. # ellipsis... \n | [][.,;\"'?():\\-_`]+ # these are separate tokens\n '''\n#tokenize all the words in the documents\ndef doc_token(text):\n result = []\n for doc in text:\n doc_text = []\n for sent in doc:\n doc_text.append(nltk.regexp_tokenize(sent, pattern))\n result.append(doc_text)\n return result\n\n#function for tagging text\ndef doc_tagger_pos(text):\n result = []\n for doc in text:\n doc_text = []\n for sent in doc:\n doc_text.append(nltk.pos_tag(sent))\n result.append(doc_text)\n return result\n\n#import yahoo test data\nf = codecs.open('../Data/yahoo_test.csv',\n encoding=\"utf-8\")\ntest_text = f.readlines()\n\n#normalize ascii\ntest_text = norm_unicode(test_text)\n\n#tokenize sentences\ntest_text_sents = sent_token(test_text)\n\n#tokenize all the words in the documents\ntest_text_docs_token = doc_token(test_text_sents)\n\n#remove first line\ntest_text_docs_token = test_text_docs_token[1:]\n\n#split out numbering\nindex_list = []\nfor ix, doc in enumerate(test_text_docs_token):\n index_list.append(doc[0][0])\n test_text_docs_token[ix][0] = doc[0][2:]\n\n#tag test data\ntest_text_docs_tagged = doc_tagger_pos(test_text_docs_token)\n\n#Save tagged data\npickle.save_object(test_text_docs_tagged,\n 'yahoo_test_corpus_pos_tagged.pkl')",
"prompt_number": 16,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#load tagged data\ntest_text_docs_tagged = pickle.open_object('yahoo_test_corpus_pos_tagged.pkl')",
"prompt_number": 17,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#create the training and test sets\ndef create_training_sets_test (feature_function, train, test):\n # Create the features sets. Call the function that was passed in.\n featuresets_train = [(feature_function(key), value) for (key, value) in train]\n featuresets_test = [(feature_function(key)) for (key) in test]\n \n train_features, test_features = featuresets_train, featuresets_test\n train_items, test_items = train, test\n return train_features, test_features, train_items, test_items",
"prompt_number": 18,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "train_features, test_features, train_items, test_items = create_training_sets_test(category_features3,\n cat_data_tag, test_text_docs_tagged)\ncl4 = SklearnClassifier(RidgeClassifierCV(), sparse=False).train(train_features)",
"prompt_number": 19,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def classify_test(test):\n result = []\n for doc in test:\n result.append(cl4.classify(category_features3(doc)))\n return result",
"prompt_number": 20,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "result = classify_test(test_text_docs_tagged)\n#create file for turn in\ndf_turnIn = pd.DataFrame([map(int,index_list),map(int,result)], index=['Id','Category']).T\ndf_turnIn.to_csv('../Data/turn_in4.csv', index=False)",
"prompt_number": 21,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Pipeline",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "from sklearn.pipeline import Pipeline\nfrom sklearn.linear_model import SGDClassifier\nfrom sklearn.feature_selection import SelectKBest, chi2\nfrom sklearn.feature_extraction.text import TfidfTransformer\ntext_pipeline = Pipeline([('tfidf', TfidfTransformer()),\n ('chi2', SelectKBest(chi2, k=2000)),\n ('RC', RidgeClassifierCV())])\npipecl = SklearnClassifier(text_pipeline)",
"prompt_number": 168,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "pipecl.train(train_features)",
"prompt_number": 169,
"outputs": [
{
"text": "<SklearnClassifier(Pipeline(steps=[('tfidf', TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,\n use_idf=True)), ('chi2', SelectKBest(k=2000, score_func=<function chi2 at 0x1149d88c0>)), ('RC', RidgeClassifierCV(alphas=array([ 0.1, 1. , 10. ]), class_weight=None,\n cv=None, fit_intercept=True, loss_func=None, normalize=False,\n score_func=None, scoring=None))]))>",
"output_type": "pyout",
"metadata": {},
"prompt_number": 169
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print \"%.3f\" % nltk.classify.accuracy(pipecl, test_features)",
"prompt_number": 170,
"outputs": [
{
"output_type": "stream",
"text": "0.487\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "heading",
"source": "Averaging algorith",
"level": 3
},
{
"metadata": {},
"cell_type": "code",
"input": "#use to combine classifiers\n#http://stackoverflow.com/questions/21506128/best-way-to-combine-probabilistic-classifiers-in-scikit-learn\nclass EnsembleClassifier(BaseEstimator, ClassifierMixin):\n def __init__(self, classifiers=None):\n self.classifiers = classifiers\n\n def fit(self, X, y):\n for classifier in self.classifiers:\n classifier.fit(X, y)\n\n def predict_proba(self, X):\n self.predictions_ = list()\n for classifier in self.classifiers:\n self.predictions_.append(classifier.predict_proba(X))\n return np.mean(self.predictions_, axis=0)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
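{
"metadata": {},
"cell_type": "markdown",
"source": "The cell above only defines EnsembleClassifier and never uses it; below is a minimal, hypothetical usage sketch on tiny synthetic data (X_demo/y_demo are made up for illustration and are not part of the yahoo pipeline). It assumes probabilistic scikit-learn classifiers, since the averaging is done on predict_proba."
},
{
"metadata": {},
"cell_type": "code",
"input": "#minimal synthetic demo of EnsembleClassifier (hypothetical data, not the yahoo features)\nimport numpy as np\nfrom sklearn.naive_bayes import MultinomialNB\nfrom sklearn.linear_model import LogisticRegression\n\nX_demo = np.array([[2, 0, 1], [0, 3, 0], [1, 1, 4], [0, 2, 1]])\ny_demo = np.array([1, 2, 1, 2])\n\nensemble = EnsembleClassifier(classifiers=[MultinomialNB(), LogisticRegression()])\nensemble.fit(X_demo, y_demo)\navg_proba = ensemble.predict_proba(X_demo)  #averaged class probabilities, one row per sample\nprint np.round(avg_proba, 3)\nprint ensemble.classifiers[0].classes_[np.argmax(avg_proba, axis=1)]  #map the argmax column back to a class label",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},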
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:fab0e8662ab94d59f4c3bcc4f00450ace910c36206daadf66df28b00d74a8da3"
},
"nbformat": 3
}