@qqueue
Created March 7, 2014 07:20
k-means on 4chan
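The script below loads a saved dump of thread state, strips each post body down to plain text, vectorizes every thread with the hashing trick plus a tf-idf reweighting, and buckets the threads into 25 k-means clusters, printed as JSON keyed by cluster label.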
#!/usr/bin/env python3
# scikit-based thread clustering for 4chan.
import html
import json
import re

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
BR_RE = re.compile(r'<br>')
TAG_RE = re.compile(r'<[^>]+>')

def text_content(markup):
    """Strip a post body to plain text: <br> becomes a newline, other
    tags are dropped, and HTML entities are unescaped."""
    # html.unescape replaces the old HTMLParser().unescape, which was
    # removed in Python 3.9.
    return html.unescape(TAG_RE.sub('', BR_RE.sub('\n', markup)))
# Load a saved thread dump and collect the plain text of every post
# in each thread.
with open("/tmp/org.hakase.fountain.a.json") as f:
    state = json.load(f)

threads = []
texts = []
for tno, thread in state['threads'].items():
    text = []
    for post in thread['posts']:
        if 'com' in post:  # 'com' is the post's HTML body
            text.append(text_content(post['com']))
    threads.append(thread)
    texts.append('\n'.join(text))
# Vectorize each thread with the hashing trick, then reweight by tf-idf.
# n_features=5 is a very small hashed space and will collide heavily;
# kept as in the original, but see the note after the script.
hasher = HashingVectorizer(n_features=5,
                           stop_words='english',
                           alternate_sign=False,  # 'non_negative=True' in pre-0.21 scikit-learn
                           norm=None, binary=False)
vectorizer = Pipeline([
    ('hasher', hasher),
    ('tf_idf', TfidfTransformer()),
])
X = vectorizer.fit_transform(texts)

# Partition the thread vectors into 25 k-means clusters.
km = KMeans(n_clusters=25, init='k-means++', max_iter=100, n_init=1)
km.fit(X)
# Group the threads by assigned cluster label and dump as JSON.
clusters = {}
for thread, label in zip(threads, km.labels_):
    clusters.setdefault(str(label), []).append(thread)
print(json.dumps(clusters))
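The dump at /tmp/org.hakase.fountain.a.json isn't included with the gist; the loop above only assumes a top-level 'threads' mapping whose values carry a 'posts' list of 4chan-API-style posts with an optional HTML 'com' body. A minimal sketch, with entirely made-up data matching that inferred shape, is enough to exercise the script:

import json

# Hypothetical stand-in for the real dump. KMeans(n_clusters=25)
# needs at least 25 samples, so fabricate 30 tiny threads.
state = {
    "threads": {
        str(tno): {
            "posts": [
                {"no": 1, "com": "thread %d op<br>with a <b>tag</b>" % tno},
                {"no": 2},  # no 'com' key: skipped by the clustering loop
            ]
        }
        for tno in range(30)
    }
}

with open("/tmp/org.hakase.fountain.a.json", "w") as f:
    json.dump(state, f)

With real data you would likely also want n_features far above 5 (HashingVectorizer defaults to 2**20) so that hash collisions don't dominate the clusters.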