Skip to content

Instantly share code, notes, and snippets.

@vmesel
Created November 9, 2016 00:42
Show Gist options
  • Save vmesel/4ed20f62f550091e29c3cd1205eaf1e6 to your computer and use it in GitHub Desktop.
Save vmesel/4ed20f62f550091e29c3cd1205eaf1e6 to your computer and use it in GitHub Desktop.
import pandas as pd
from sklearn.naive_bayes import GaussianNB
def monta_df(arquivo):
    """Build a DataFrame with one row per line of a text file.

    Each line of the file is treated as one tweet.

    Args:
        arquivo: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with a single ``TWEETS`` column holding every
        line of the file verbatim (the trailing newline is kept).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: relying on the platform's locale encoding
    # mangles or rejects accented characters on systems whose default
    # encoding is not UTF-8.
    with open(arquivo, "r", encoding="utf-8") as f:
        tweets = list(f)
    return pd.DataFrame({"TWEETS": tweets})
def limpa_dataframe_de_spam(arquivo):
    """Load tweets from a file and drop the ones flagged as spam.

    A tweet is flagged as spam when its text contains the substring
    ``"http"``.

    Args:
        arquivo: Path of the text file, forwarded to ``monta_df``.

    Returns:
        A new ``pandas.DataFrame`` holding only the non-spam rows. It keeps
        the ``TWEETS`` column plus a ``SPAM`` column, which is 0 for every
        remaining row. The row labels of the unfiltered frame are preserved.
    """
    df_orig = monta_df(arquivo)
    df_orig["SPAM"] = [1 if "http" in item else 0 for item in df_orig["TWEETS"]]
    # Return an independent copy instead of a filtered slice of df_orig, so
    # callers can add columns to the result without pandas raising
    # SettingWithCopyWarning.
    return df_orig[df_orig["SPAM"] == 0].copy()
def faz_predicao(frases_pos, frases_negs):
    """Train a Gaussian naive Bayes classifier on positive and negative samples.

    Positive samples are labelled 1 and negative samples are labelled 0.

    Args:
        frases_pos: Iterable of training samples for the positive class.
        frases_negs: Iterable of training samples for the negative class.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    # Materialize both inputs so any iterable (not only lists) can be
    # concatenated and counted.
    positivas = list(frases_pos)
    negativas = list(frases_negs)
    amostras = positivas + negativas
    # One label per sample, in the same order as the concatenated samples.
    resultados = [1] * len(positivas) + [0] * len(negativas)

    # NOTE(review): GaussianNB.fit expects numeric feature vectors of shape
    # (n_samples, n_features). If the "frases" are raw text they presumably
    # still need to be vectorized by the caller -- confirm.
    classificador = GaussianNB()
    classificador.fit(amostras, resultados)
    return classificador
def classifica_frases(arquivo, frases_pos, frases_negs):
    """Classify every non-spam tweet of a file as positive (1) or negative (0).

    A classifier is trained with ``faz_predicao`` on the given samples and
    then applied to the tweets returned by ``limpa_dataframe_de_spam``.

    Args:
        arquivo: Path of the text file holding the tweets.
        frases_pos: Training samples for the positive class.
        frases_negs: Training samples for the negative class.

    Returns:
        The spam-filtered ``pandas.DataFrame`` with an extra
        ``CLASSIFICACOES`` column holding one predicted label per row.
    """
    classificador = faz_predicao(frases_pos, frases_negs)
    # Work on a private copy so that adding a column never writes into a
    # filtered slice of another frame (SettingWithCopyWarning).
    df = limpa_dataframe_de_spam(arquivo).copy()
    tweets = list(df["TWEETS"])

    if tweets:
        # predict() takes a batch of samples and returns one label per
        # sample, so classify all tweets in a single call.
        # NOTE(review): the tweets are passed as raw text; GaussianNB needs
        # numeric features, so a vectorization step is presumably still
        # missing here and in the training data -- confirm.
        df["CLASSIFICACOES"] = classificador.predict(tweets)
    else:
        # scikit-learn estimators reject a zero-sample batch; nothing to label.
        df["CLASSIFICACOES"] = []
    return df
# Script entry: runs the spam filter over the tweets stored in "saida.txt".
# NOTE(review): the DataFrame returned by this call is neither stored nor
# printed, and the call also runs whenever this module is imported.
limpa_dataframe_de_spam("saida.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment