Skip to content

Instantly share code, notes, and snippets.

@susanli2016
Created September 23, 2018 18:25
Show Gist options
  • Save susanli2016/4bb8adc4daf4fe313824fb0af1413eaa to your computer and use it in GitHub Desktop.
Save susanli2016/4bb8adc4daf4fe313824fb0af1413eaa to your computer and use it in GitHub Desktop.
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline
# Load the Stack Overflow posts and keep only the rows whose 'tags' value
# is present (not null).
df = pd.read_csv('stack-overflow-data.csv')
df = df.loc[df['tags'].notna()]

# Preview the first ten remaining rows.
print(df.head(10))

# Total number of single-space-separated tokens across every 'post' value.
print(sum(len(text.split(' ')) for text in df['post']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment