Skip to content

Instantly share code, notes, and snippets.

View s-j's full-sized avatar
😀
Better than ever

Simon Jonassen s-j

😀
Better than ever
View GitHub Profile
@bbengfort
bbengfort / sentiment.py
Last active December 27, 2022 05:17
An end-to-end demonstration of a Scikit-Learn SVM classifier trained on the positive and negative movie reviews corpus in NLTK.
import os
import time
import string
import pickle
from operator import itemgetter
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
@miguelmalvarez
miguelmalvarez / run.py
Created March 20, 2015 09:32
Represent Reuters21578
from nltk import word_tokenize
from nltk.corpus import reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
def tokenize(text):
@herrfz
herrfz / Reuters.py
Last active October 18, 2021 19:27
Reuters-21578 keyword extraction
# Reuters-21578 dataset downloader and parser
#
# Author: Eustache Diemert <eustache@diemert.fr>
# http://scikit-learn.org/stable/auto_examples/applications/plot_out_of_core_classification.html
#
# Modified by @herrfz, get pandas DataFrame from the orig SGML
# License: BSD 3 clause
from __future__ import print_function