Created
November 9, 2016 00:42
-
-
Save vmesel/4ed20f62f550091e29c3cd1205eaf1e6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.naive_bayes import GaussianNB | |
def monta_df(arquivo):
    """Build a DataFrame with one row per line of a text file.

    Args:
        arquivo: Path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with a single ``TWEETS`` column holding the
        lines of the file in order. Every line is stored exactly as read,
        including its trailing newline. An empty file yields an empty
        frame that still has the ``TWEETS`` column.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(arquivo, "r", encoding="utf-8") as f:
        tweets = list(f)
    return pd.DataFrame({"TWEETS": tweets})
def limpa_dataframe_de_spam(arquivo):
    """Load the lines of a file and drop the ones that contain a link.

    A line is flagged as spam when the substring ``"http"`` occurs in it.

    Args:
        arquivo: Path of the text file, forwarded to ``monta_df``.

    Returns:
        A new DataFrame holding only the rows that were not flagged. It has
        the ``TWEETS`` column plus a ``SPAM`` column (0 on every remaining
        row) and keeps the row labels of the unfiltered frame.
    """
    df_orig = monta_df(arquivo)
    df_orig["SPAM"] = [1 if "http" in item else 0 for item in df_orig["TWEETS"]]
    # Return an independent copy instead of a boolean-mask selection, so
    # callers can add columns without pandas' SettingWithCopyWarning.
    return df_orig[df_orig["SPAM"] == 0].copy()
def faz_predicao(frases_pos, frases_negs):
    """Train a Gaussian naive Bayes classifier on labelled examples.

    Samples from ``frases_pos`` are labelled 1 and samples from
    ``frases_negs`` are labelled 0.

    Args:
        frases_pos: Iterable of training samples for the positive class.
        frases_negs: Iterable of training samples for the negative class.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    # Materialise both inputs so any iterable works, and so "+" always
    # concatenates (on NumPy arrays it would add element-wise instead).
    positivos = list(frases_pos)
    negativos = list(frases_negs)

    # One label per sample, in the same order as the concatenated samples.
    resultados = [1] * len(positivos) + [0] * len(negativos)

    # NOTE(review): GaussianNB.fit takes a numeric array-like of shape
    # (n_samples, n_features); presumably the samples are already feature
    # vectors rather than raw text -- confirm against the callers.
    classificador = GaussianNB()
    classificador.fit(positivos + negativos, resultados)
    return classificador
def classifica_frases(arquivo, frases_pos, frases_negs):
    """Classify every non-spam line of a file with a freshly trained model.

    Args:
        arquivo: Path of the text file, forwarded to
            ``limpa_dataframe_de_spam``.
        frases_pos: Positive training samples, forwarded to ``faz_predicao``.
        frases_negs: Negative training samples, forwarded to ``faz_predicao``.

    Returns:
        The filtered DataFrame with an added ``CLASSIFICACOES`` column that
        holds one predicted label per row.
    """
    classificador = faz_predicao(frases_pos, frases_negs)

    # Work on an independent copy so adding a column never writes into a
    # selection of another frame.
    df = limpa_dataframe_de_spam(arquivo).copy()
    tweets = df["TWEETS"].tolist()

    # predict() takes a batch of samples, so classify all rows in one call
    # instead of passing bare samples one by one. Skip the call when there
    # is nothing to classify, since estimators reject an empty batch.
    # NOTE(review): the rows are raw text lines; the classifier returned by
    # faz_predicao must be able to consume them -- TODO confirm.
    classes = list(classificador.predict(tweets)) if tweets else []

    df["CLASSIFICACOES"] = classes
    return df
# Script entry: load "saida.txt" and filter out the lines containing a link.
# NOTE(review): the returned DataFrame is discarded here.
limpa_dataframe_de_spam("saida.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment