Last active
September 14, 2015 18:51
-
-
Save wiseman/2b8781d81078ce8179e5 to your computer and use it in GitHub Desktop.
MetaFilter username classification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Classifies usernames according to which corpus in dariusk's | |
collection of small corpora they seem to fit into best. | |
John Wiseman <jjwiseman@gmail.com> | |
First, each username is segmented using Google's trillion-word corpus.
For example, "mathemagician" becomes ["ma", "the", "magician"].

Then we look for corpora that match each segment.

  ma -> Slovak stop words
  the -> English stop words, Slovak stop words
  magician -> Common character archetypes

We use a TF-IDF model to assign information values to each corpus over
the entire set of usernames. E.g., if almost every username gets a
classification as "English stop words", "English stop words" will get
a low score.

Finally, for each username we find the highest-scoring matching
corpora.
Usage: | |
$ git clone https://github.com/dariusk/corpora.git | |
$ cd corpora | |
$ curl -O 'http://mefi.us/infodump/usernames.txt.zip' | |
$ unzip usernames.txt.zip | |
$ pip install wordsegment python-gflags | |
$ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep cannabis | head | |
-5.25749537203 a little headband I put around my throat ~350 popular strains of cannabis | |
-5.25749537203 a shrill fucking shitstripe ~350 popular strains of cannabis | |
-5.25749537203 A-Train ~350 popular strains of cannabis | |
-5.25749537203 AdapterIce ~350 popular strains of cannabis | |
-5.25749537203 ambrosia ~350 popular strains of cannabis | |
-5.25749537203 Ambrosia Voyeur ~350 popular strains of cannabis | |
-5.25749537203 American Christmas Devil ~350 popular strains of cannabis | |
-5.25749537203 AMSBoethius ~350 popular strains of cannabis | |
-5.25749537203 androx ~350 popular strains of cannabis | |
-5.25749537203 anonymice ~350 popular strains of cannabis | |
$ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep "little headband" | |
-5.25749537203 a little headband I put around my throat ~350 popular strains of cannabis | |
-5.39816270152 a little headband I put around my throat A list of NSA project code names. | |
-7.43070708255 a little headband I put around my throat Danish stop words | |
-7.48155570191 a little headband I put around my throat Suffixes taken from a form on an airline website. | |
-7.63867982388 a little headband I put around my throat Norwegian stop words | |
-8.04494704962 a little headband I put around my throat Swedish stop words | |
-8.42901750051 a little headband I put around my throat Portuguese stop words | |
-8.62137301033 a little headband I put around my throat Spanish stop words | |
-9.20883924585 a little headband I put around my throat data/science/elements.json | |
-16.9385253153 a little headband I put around my throat Polish stop words | |
""" | |
import codecs | |
import collections | |
import json | |
import logging | |
import math | |
import os | |
from os import path | |
import sys | |
import types | |
import gflags | |
import wordsegment | |
# Module-level logger; its level is configured later by logging.basicConfig.
logger = logging.getLogger(__name__)

# Command-line flags are declared at import time; FLAGS is the shared
# registry that holds their parsed values.
FLAGS = gflags.FLAGS
gflags.DEFINE_bool('dump_idf', False, 'Dump the category scores.')

# Wrap stdout in a UTF-8 stream writer so unicode strings written to it are
# encoded as UTF-8.
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
def json_files(root='data'):
    """Return the paths of all JSON files under a directory tree.

    Args:
        root: Directory to search recursively.  Defaults to 'data', the
            directory (relative to the current working directory) that this
            function has always searched.

    Returns:
        A sorted list of paths, each prefixed with ``root``, for every file
        whose name ends in '.json'.  An empty list is returned when ``root``
        does not exist or holds no JSON files.
    """
    jsons = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.json'):
                jsons.append(path.join(dirpath, filename))
    # os.walk yields entries in whatever order the filesystem lists them;
    # sorting makes the result reproducible from run to run.
    jsons.sort()
    return jsons
def corpus_values(corpus):
    """Collect every leaf value of a corpus, ignoring its metadata.

    The top-level 'description' and 'source' entries are skipped; every
    other entry is flattened with values() and the results are concatenated
    into a single list.
    """
    metadata_keys = ('description', 'source')
    collected = []
    for name in corpus:
        if name in metadata_keys:
            continue
        collected.extend(values(corpus[name]))
    return collected
def values(data):
    """Flatten a decoded JSON value into a flat list of its leaf values.

    Dicts contribute only their values (keys are discarded); lists and
    tuples contribute their elements.  Both are flattened recursively.
    Anything else -- strings, numbers, booleans, None -- is a leaf.

    Args:
        data: Any value, typically the result of json.loads.

    Returns:
        A list of the leaf values found in ``data``; ``[data]`` when
        ``data`` is itself a leaf.
    """
    # isinstance (rather than an exact type comparison) also recognises
    # subclasses of dict, list and tuple as containers.
    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, (list, tuple)):
        children = data
    else:
        return [data]
    flattened = []
    for child in children:
        flattened += values(child)
    return flattened
def read_corpus(path):
    """Load the corpus file at ``path`` and return its decoded JSON content."""
    with open(path, 'rb') as corpus_file:
        return json.load(corpus_file)
def build_db():
    """Build an index from corpus value to the corpora that contain it.

    Every JSON file returned by json_files() is read and flattened.  Each
    truthy value is converted to a lowercase unicode string and mapped to
    the corpus's 'description'; when a corpus has no description its file
    path is used instead (and a warning is logged).

    Returns:
        A defaultdict(list) mapping lowercase value -> list of corpus
        descriptions.  A description appears once per occurrence of the
        value, so the lists may contain duplicates.
    """
    logger.info('Loading corpora...')
    db = collections.defaultdict(list)
    for filename in json_files():
        logger.info('Reading %s', filename)
        corpus = read_corpus(filename)
        desc = corpus.get('description', None)
        if not desc:
            # logger.warning is the documented name; warn is a deprecated
            # alias.
            logger.warning('No description: %s', filename)
            desc = filename
        for value in corpus_values(corpus):
            # Skip empty/falsy values: they cannot match a username segment.
            if value:
                # Index values as (lowercase) strings.
                db[unicode(value).lower()].append(desc)
    logger.info('Done loading corpora')
    return db
def unique(seq):
    """Remove duplicates from a sequence, keeping first-seen order.

    Args:
        seq: An iterable of hashable items.

    Returns:
        A new list holding each distinct item of ``seq`` exactly once, in
        the order of its first occurrence.  Preserving order (instead of
        relying on set iteration order) keeps results reproducible.
    """
    seen = set()
    distinct = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            distinct.append(item)
    return distinct
def main(argv):
    """Classify the usernames read from stdin and print the results.

    One username is read per line of stdin (decoded as UTF-8).  Each
    username is split into words with wordsegment; the words, plus the
    whole username, are looked up in the corpus database to collect the
    descriptions of matching corpora.

    Each corpus is then given the score ``log(1 / n)``, where ``n`` is the
    number of matches recorded for it across all usernames, so corpora that
    match often score lower.

    Output, written to stdout:
      * with --dump_idf: one '<score> <corpus>' line per corpus, lowest
        score first;
      * otherwise: one tab-separated '<score>, <username>, <corpus>' line
        per match, grouped by username (sorted case-insensitively) with the
        highest-scoring corpus first.  A username's score for a corpus is
        the corpus score summed once per match.

    Args:
        argv: The full command line, including the program name; it is
            parsed by gflags.
    """
    argv = FLAGS(argv)
    logging.basicConfig(level=logging.INFO)
    db = build_db()

    # username -> descriptions of the corpora matched by any of its words.
    index = collections.defaultdict(list)
    for line in sys.stdin:
        line = unicode(line, 'utf8').strip()
        # Blank lines carry no username, and a username that was already
        # processed must not have its matches counted a second time.
        if not line or line in index:
            continue
        # Touching index[line] registers the username even if nothing
        # matches, which is what makes the duplicate check above work.
        matches = index[line]
        words = set(wordsegment.segment(line) + [line])
        for word in words:
            matches.extend(unique(db.get(word.lower(), [])))

    # First count the matches per corpus, then turn each count into a score.
    idf = collections.defaultdict(int)
    for cats in index.values():
        for cat in cats:
            idf[cat] += 1
    for cat in idf:
        idf[cat] = math.log(1.0 / idf[cat])

    if FLAGS.dump_idf:
        for cat, score in sorted(idf.items(), key=lambda item: item[1]):
            # Written via sys.stdout.write for consistency with the branch
            # below.
            sys.stdout.write('%f %s\n' % (score, cat))
    else:
        for user in sorted(index, key=lambda name: name.lower()):
            scored_cats = collections.defaultdict(float)
            for cat in index[user]:
                scored_cats[cat] += idf[cat]
            ranked = sorted(scored_cats.items(),
                            key=lambda item: item[1], reverse=True)
            # Output this in a unix CLI-friendly way.
            for desc, score in ranked:
                sys.stdout.write('%s\t%s\t%s\n' % (score, user, desc))


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment