@qqueue
Created March 7, 2014 07:20
k-means on 4chan
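The script below loads a saved dump of thread state, strips each post body down to plain text, vectorizes every thread with the hashing trick plus a tf-idf reweighting, and buckets the threads into 25 k-means clusters, printed as JSON keyed by cluster label.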
#!/usr/bin/env python3
# scikit-based thread clustering for 4chan.
import html
import json
import re

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
BR_RE = re.compile(r'<br>')
TAG_RE = re.compile(r'<[^>]+>')

def text_content(markup):
    """Strip a post body to plain text: <br> becomes a newline, other
    tags are dropped, and HTML entities are unescaped."""
    # html.unescape replaces the old HTMLParser().unescape, which was
    # removed in Python 3.9.
    return html.unescape(TAG_RE.sub('', BR_RE.sub('\n', markup)))
# Load a saved thread dump and collect the plain text of every post
# in each thread.
with open("/tmp/org.hakase.fountain.a.json") as f:
    state = json.load(f)

threads = []
texts = []
for tno, thread in state['threads'].items():
    text = []
    for post in thread['posts']:
        if 'com' in post:  # 'com' is the post's HTML body
            text.append(text_content(post['com']))
    threads.append(thread)
    texts.append('\n'.join(text))
# Vectorize each thread with the hashing trick, then reweight by tf-idf.
# n_features=5 is a very small hashed space and will collide heavily;
# kept as in the original, but see the note after the script.
hasher = HashingVectorizer(n_features=5,
                           stop_words='english',
                           alternate_sign=False,  # 'non_negative=True' in pre-0.21 scikit-learn
                           norm=None, binary=False)
vectorizer = Pipeline([
    ('hasher', hasher),
    ('tf_idf', TfidfTransformer()),
])
X = vectorizer.fit_transform(texts)

# Partition the thread vectors into 25 k-means clusters.
km = KMeans(n_clusters=25, init='k-means++', max_iter=100, n_init=1)
km.fit(X)
# Group the threads by assigned cluster label and dump as JSON.
clusters = {}
for thread, label in zip(threads, km.labels_):
    clusters.setdefault(str(label), []).append(thread)
print(json.dumps(clusters))
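The dump at /tmp/org.hakase.fountain.a.json isn't included with the gist; the loop above only assumes a top-level 'threads' mapping whose values carry a 'posts' list of 4chan-API-style posts with an optional HTML 'com' body. A minimal sketch, with entirely made-up data matching that inferred shape, is enough to exercise the script:

import json

# Hypothetical stand-in for the real dump. KMeans(n_clusters=25)
# needs at least 25 samples, so fabricate 30 tiny threads.
state = {
    "threads": {
        str(tno): {
            "posts": [
                {"no": 1, "com": "thread %d op<br>with a <b>tag</b>" % tno},
                {"no": 2},  # no 'com' key: skipped by the clustering loop
            ]
        }
        for tno in range(30)
    }
}

with open("/tmp/org.hakase.fountain.a.json", "w") as f:
    json.dump(state, f)

With real data you would likely also want n_features far above 5 (HashingVectorizer defaults to 2**20) so that hash collisions don't dominate the clusters.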