Created
September 23, 2018 18:25
-
-
Save susanli2016/4bb8adc4daf4fe313824fb0af1413eaa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import pandas as pd | |
import numpy as np | |
from numpy import random | |
import gensim | |
import nltk | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.metrics import accuracy_score, confusion_matrix | |
import matplotlib.pyplot as plt | |
from nltk.corpus import stopwords | |
import re | |
from bs4 import BeautifulSoup | |
%matplotlib inline | |
# Load the dataset; the code below relies on it having 'post' and 'tags' columns.
df = pd.read_csv('stack-overflow-data.csv')

# Keep only the rows that actually carry a tag.
df = df[df['tags'].notna()]

# Quick look at the first rows.
print(df.head(10))

# Total number of space-separated tokens across all posts.
# Only 'tags' was filtered for nulls above, so a row may still have a missing
# 'post' (NaN is a float and has no .split); skip those instead of crashing.
total_words = sum(len(text.split(' ')) for text in df['post'].dropna())
print(total_words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment