Skip to content

Instantly share code, notes, and snippets.

@wiseman
Last active September 14, 2015 18:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wiseman/2b8781d81078ce8179e5 to your computer and use it in GitHub Desktop.
Save wiseman/2b8781d81078ce8179e5 to your computer and use it in GitHub Desktop.
metafiter username classification
"""Classifies usernames according to which corpus in dariusk's
collection of small corpora they seem to fit into best.
John Wiseman <jjwiseman@gmail.com>
First each username is segmented using Google's trillion-word corpus.
For example, "mathemagician" becomes ["ma", "the", "magician"].
Then we look for corpora that match each segment.
ma -> Slovak stop words
the -> English stop words, Slovak stop words
magician -> Common character archetypes
We use a TF-IDF model to assign information values to each corpus over
the entire set of usernames. E.g. if almost every username gets a
classification as "English stop words", "English stop words" will get
a low score.
Finally for each username we find the highest scoring matching
corpora.
Usage:
$ git clone https://github.com/dariusk/corpora.git
$ cd corpora
$ curl -O 'http://mefi.us/infodump/usernames.txt.zip'
$ unzip usernames.txt.zip
$ pip install wordsegment python-gflags
$ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep cannabis | head
-5.25749537203 a little headband I put around my throat ~350 popular strains of cannabis
-5.25749537203 a shrill fucking shitstripe ~350 popular strains of cannabis
-5.25749537203 A-Train ~350 popular strains of cannabis
-5.25749537203 AdapterIce ~350 popular strains of cannabis
-5.25749537203 ambrosia ~350 popular strains of cannabis
-5.25749537203 Ambrosia Voyeur ~350 popular strains of cannabis
-5.25749537203 American Christmas Devil ~350 popular strains of cannabis
-5.25749537203 AMSBoethius ~350 popular strains of cannabis
-5.25749537203 androx ~350 popular strains of cannabis
-5.25749537203 anonymice ~350 popular strains of cannabis
$ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep "little headband"
-5.25749537203 a little headband I put around my throat ~350 popular strains of cannabis
-5.39816270152 a little headband I put around my throat A list of NSA project code names.
-7.43070708255 a little headband I put around my throat Danish stop words
-7.48155570191 a little headband I put around my throat Suffixes taken from a form on an airline website.
-7.63867982388 a little headband I put around my throat Norwegian stop words
-8.04494704962 a little headband I put around my throat Swedish stop words
-8.42901750051 a little headband I put around my throat Portuguese stop words
-8.62137301033 a little headband I put around my throat Spanish stop words
-9.20883924585 a little headband I put around my throat data/science/elements.json
-16.9385253153 a little headband I put around my throat Polish stop words
"""
import codecs
import collections
import json
import logging
import math
import os
from os import path
import sys
import types
import gflags
import wordsegment
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
FLAGS = gflags.FLAGS
logger = logging.getLogger(__name__)
gflags.DEFINE_bool(
'dump_idf', False,
'Dump the category scores.')
def json_files():
"Returns a list of all JSON files under the data directory."
jsons = []
for root, dirs, files in os.walk('data'):
for f in files:
if f.endswith('.json'):
jsons.append(path.join(root, f))
return jsons
def corpus_values(corpus):
"Returns all values in a corpus."
vs = []
for key in corpus:
if key not in ('description', 'source'):
vs += values(corpus[key])
return vs
def values(data):
"Flattens a JSON object into a single list of values."
if isinstance(data, dict):
# For dicts, we just want the values.
vs = []
for key in data:
vs += values(data[key])
return vs
if type(data) in (types.ListType, types.TupleType):
vs = []
for datum in data:
vs += values(datum)
return vs
return [data]
def read_corpus(path):
"Reads a corpus file. Returns JSON."
with open(path, 'rb') as f:
return json.loads(f.read())
def build_db():
"Builds a corpus -> values database."
logger.info('Loading corpora...')
db = collections.defaultdict(list)
for f in json_files():
logger.info('Reading %s', f)
corpus = read_corpus(f)
desc = corpus.get('description', None)
if not desc:
logger.warn('No description: %s', f)
desc = f
values = corpus_values(corpus)
for v in values:
if v:
# Index values as (lowercase) strings.
db[unicode(v).lower()].append(desc)
logger.info('Done loading corpora')
return db
def unique(seq):
"Removes duplicates from a sequence."
return list(set(seq))
def main(argv):
argv = FLAGS(argv)
logging.basicConfig(level=logging.INFO)
db = build_db()
index = collections.defaultdict(list)
for line in sys.stdin:
line = unicode(line, 'utf8')
line = line.strip()
words = set(wordsegment.segment(line) + [line])
for word in words:
index[line] = index[line] + unique(db.get(word.lower(), []))
idf = collections.defaultdict(int)
for user in index:
for cat in index[user]:
idf[cat] += 1
for cat in idf:
idf[cat] = math.log(1.0 / idf[cat])
if FLAGS.dump_idf:
cat_score = [(cat, idf[cat]) for cat in idf]
cat_score = sorted(cat_score, key=lambda e: e[1])
for cat, score in cat_score:
print '%f %s' % (score, cat)
else:
sorted_users = sorted(index.keys(), key=lambda x: x.lower())
for user in sorted_users:
scored_cats = collections.defaultdict(float)
for cat in index[user]:
scored_cats[cat] += idf[cat]
cats = [(scored_cats[cat], cat) for cat in scored_cats]
cats = sorted(cats, key=lambda e: e[0], reverse=True)
# Output this in a unix CLI-friendly way.
for score, desc in cats:
sys.stdout.write('%s\t%s\t%s\n' % (
score, user, desc))
if __name__ == '__main__':
main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment