# rjurney/convert_to_scikit.py

## convert_to_scikit.py
import sys, os
import numpy as np
from collections import defaultdict
from operator import itemgetter
from sklearn.naive_bayes import GaussianNB

# live    1       classic pop and rock
# onli    2       classic pop and rock
# tri     1       classic pop and rock
# keep    3       classic pop and rock
# dream   2       classic pop and rock

# Build a training set from tab-separated "token<TAB>count<TAB>genre" rows
# and fit a Gaussian naive Bayes classifier on it.
#
# genre_tokens maps genre -> token -> (count, genre); all_keys records every
# token seen, in first-seen order.
genre_tokens = defaultdict(lambda : defaultdict(dict))
X = []
y = []
all_keys = {}
# The context manager closes the file even if a row fails to parse.
with open('/tmp/genre_lyrics.txt/part-r-00000') as f:
  for line in f:
    # Strip only the newline: slicing off the last character would truncate
    # the genre when the final line has no trailing newline.
    record = line.rstrip('\n')
    if not record:
      # Skip blank lines instead of failing on the 3-way unpack below.
      continue
    token, count, genre = record.split('\t')
    all_keys[token] = 1
    genre_tokens[genre][token] = (float(count), genre)

# The genre order is the same for every token, so sort once up front.
genres = sorted(genre_tokens)

# One sample per (token, genre) pair: a single feature (the count) labelled
# with the genre.
for key in all_keys:
  for genre in genres:
    tokens_for_genre = genre_tokens[genre]
    if key in tokens_for_genre:
      X.append([tokens_for_genre[key][0]])
    else:
      # Token never seen for this genre: use a zero count. The original note
      # ("Laplace here") presumably marks where smoothing would be added;
      # none is applied.
      X.append([0.0])
    y.append(genre)

# Fit on the full data set and predict on the same samples.
gnb = GaussianNB()
y_pred = gnb.fit(X, y).predict(X)
	import sys, os
	import numpy as np
	from collections import defaultdict
	from operator import itemgetter
	from sklearn.naive_bayes import GaussianNB

	# live 1 classic pop and rock
	# onli 2 classic pop and rock
	# tri 1 classic pop and rock
	# keep 3 classic pop and rock
	# dream 2 classic pop and rock

	# Build a training set from tab-separated "token<TAB>count<TAB>genre" rows
	# and fit a Gaussian naive Bayes classifier on it.
	#
	# genre_tokens maps genre -> token -> (count, genre); all_keys records every
	# token seen, in first-seen order.
	genre_tokens = defaultdict(lambda : defaultdict(dict))
	X = []
	y = []
	all_keys = {}
	# The context manager closes the file even if a row fails to parse.
	with open('/tmp/genre_lyrics.txt/part-r-00000') as f:
		for line in f:
			# Strip only the newline: slicing off the last character would
			# truncate the genre when the final line has no trailing newline.
			record = line.rstrip('\n')
			if not record:
				# Skip blank lines instead of failing on the 3-way unpack below.
				continue
			token, count, genre = record.split('\t')
			all_keys[token] = 1
			genre_tokens[genre][token] = (float(count), genre)

	# The genre order is the same for every token, so sort once up front.
	genres = sorted(genre_tokens)

	# One sample per (token, genre) pair: a single feature (the count) labelled
	# with the genre.
	for key in all_keys:
		for genre in genres:
			tokens_for_genre = genre_tokens[genre]
			if key in tokens_for_genre:
				X.append([tokens_for_genre[key][0]])
			else:
				# Token never seen for this genre: use a zero count. The original
				# note ("Laplace here") presumably marks where smoothing would be
				# added; none is applied.
				X.append([0.0])
			y.append(genre)

	# Fit on the full data set and predict on the same samples.
	gnb = GaussianNB()
	y_pred = gnb.fit(X, y).predict(X)