wiseman/user.py

## user.py
"""Classifies usernames according to which corpus in dariusk's
collection of small corpora they seem to fit into best.

John Wiseman <jjwiseman@gmail.com>

First each username is segmented using Google's trillion-word corpus.
For example, "mathemagician" becomes ["ma", "the", "magician"].

Then we look for corpora that match each segment.

  ma -> Slovak stop words
  the -> English stop words, Slovak stop words
  magician -> Common character archetypes

We use a TF-IDF model to assign information values to each corpus over
the entire set of usernames. E.g. if almost every username gets a
classification as "English stop words", "English stop words" will get
a low score.

Finally for each username we find the highest scoring matching
corpora.

Usage:
  $ git clone https://github.com/dariusk/corpora.git
  $ cd corpora
  $ curl -O 'http://mefi.us/infodump/usernames.txt.zip'
  $ unzip usernames.txt.zip
  $ pip install wordsegment python-gflags
  $ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep cannabis | head
  -5.25749537203	a little headband I put around my throat	~350 popular strains of cannabis
  -5.25749537203	a shrill fucking shitstripe	~350 popular strains of cannabis
  -5.25749537203	A-Train	~350 popular strains of cannabis
  -5.25749537203	AdapterIce	~350 popular strains of cannabis
  -5.25749537203	ambrosia	~350 popular strains of cannabis
  -5.25749537203	Ambrosia Voyeur	~350 popular strains of cannabis
  -5.25749537203	American Christmas Devil	~350 popular strains of cannabis
  -5.25749537203	AMSBoethius	~350 popular strains of cannabis
  -5.25749537203	androx	~350 popular strains of cannabis
  -5.25749537203	anonymice	~350 popular strains of cannabis
  $ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep "little headband"
  -5.25749537203	a little headband I put around my throat	~350 popular strains of cannabis
  -5.39816270152	a little headband I put around my throat	A list of NSA project code names.
  -7.43070708255	a little headband I put around my throat	Danish stop words
  -7.48155570191	a little headband I put around my throat	Suffixes taken from a form on an airline website.
  -7.63867982388	a little headband I put around my throat	Norwegian stop words
  -8.04494704962	a little headband I put around my throat	Swedish stop words
  -8.42901750051	a little headband I put around my throat	Portuguese stop words
  -8.62137301033	a little headband I put around my throat	Spanish stop words
  -9.20883924585	a little headband I put around my throat	data/science/elements.json
  -16.9385253153	a little headband I put around my throat	Polish stop words
"""

import codecs
import collections
import json
import logging
import math
import os
from os import path
import sys
import types

import gflags
import wordsegment

# Wrap stdout in a UTF-8 stream writer so unicode strings written to it
# are encoded as UTF-8.
sys.stdout = codecs.getwriter('utf8')(sys.stdout)

# Global flag registry; main() parses the command line through it.
FLAGS = gflags.FLAGS
logger = logging.getLogger(__name__)


# --dump_idf: when set, main() prints the per-category scores instead of
# the per-username classifications.
gflags.DEFINE_bool(
    'dump_idf', False,
    'Dump the category scores.')


def json_files(root='data'):
    """Returns a sorted list of all JSON files under a directory.

    Args:
      root: Directory to search recursively. Defaults to 'data',
        resolved relative to the current working directory.

    Returns:
      The paths (each prefixed with root) of every file whose name ends
      in '.json'. The list is sorted so the result does not depend on
      the order in which the filesystem lists directory entries. A
      missing root yields an empty list.
    """
    jsons = []
    for dirpath, _, filenames in os.walk(root):
        jsons.extend(
            path.join(dirpath, name)
            for name in filenames
            if name.endswith('.json'))
    return sorted(jsons)


def corpus_values(corpus):
    "Collects every value in a corpus, ignoring its metadata fields."
    metadata_keys = ('description', 'source')
    flattened = []
    for name in corpus:
        if name in metadata_keys:
            continue
        flattened.extend(values(corpus[name]))
    return flattened


def values(data):
    """Flattens a JSON object into a single list of values.

    Dicts contribute their (recursively flattened) values, never their
    keys. Lists and tuples, including subclasses, contribute their
    flattened elements. Anything else is returned as a one-element
    list.
    """
    if isinstance(data, dict):
        # For dicts, we just want the values.
        children = data.values()
    elif isinstance(data, (list, tuple)):
        children = data
    else:
        return [data]
    flat = []
    for child in children:
        flat.extend(values(child))
    return flat


def read_corpus(path):
    "Loads the corpus file at the given path and returns the parsed JSON."
    with open(path, 'rb') as corpus_file:
        return json.load(corpus_file)


def build_db():
    """Builds a value -> corpus descriptions database.

    Reads every corpus file returned by json_files() and indexes each
    truthy value it contains, as a lowercased unicode string, against
    the corpus's description. A corpus without a description is indexed
    under its file path instead.

    Returns:
      A defaultdict(list) mapping each lowercased value to the
      descriptions of the corpora containing it (one entry per
      occurrence).
    """
    logger.info('Loading corpora...')
    db = collections.defaultdict(list)
    for f in json_files():
        logger.info('Reading %s', f)
        corpus = read_corpus(f)
        desc = corpus.get('description')
        if not desc:
            logger.warning('No description: %s', f)
            desc = f
        for v in corpus_values(corpus):
            # Falsy values (empty strings, None, 0, False) are not indexed.
            if v:
                # Index values as (lowercase) strings.
                db[unicode(v).lower()].append(desc)
    logger.info('Done loading corpora')
    return db


def unique(seq):
    "Returns the distinct items of a sequence as a list, in no set order."
    distinct = set(seq)
    return list(distinct)


def main(argv):
    """Classifies the usernames read from stdin and prints the results.

    Each stdin line is decoded as UTF-8 and stripped to give a username.
    The username is split into segments with wordsegment, and every
    segment (plus the whole username) is looked up in the corpus
    database to collect matching categories. Each category is then
    scored as log(1 / n), where n is the number of times it was matched
    across all usernames.

    With --dump_idf, prints one "score category" line per category,
    lowest score first. Otherwise prints one tab-separated
    "score, username, category" line per match, with usernames in
    case-insensitive order and each username's categories from highest
    to lowest summed score.

    Args:
      argv: The full command line, including the program name.
    """
    # Called for its side effect of parsing the flags.
    FLAGS(argv)
    logging.basicConfig(level=logging.INFO)
    db = build_db()

    index = {}
    for line in sys.stdin:
        user = unicode(line, 'utf8').strip()
        # Skip blank lines, and usernames already seen: re-adding a
        # repeated username would append its categories a second time
        # and inflate both the category counts and its scores.
        if not user or user in index:
            continue
        words = set(wordsegment.segment(user) + [user])
        cats = []
        for word in words:
            cats.extend(unique(db.get(word.lower(), [])))
        index[user] = cats

    idf = collections.defaultdict(int)
    for cats in index.values():
        for cat in cats:
            idf[cat] += 1
    for cat in idf:
        idf[cat] = math.log(1.0 / idf[cat])

    if FLAGS.dump_idf:
        for cat, score in sorted(idf.items(), key=lambda e: e[1]):
            sys.stdout.write('%f %s\n' % (score, cat))
        return

    for user in sorted(index, key=lambda u: u.lower()):
        # A category matched by several segments accumulates its score
        # once per match.
        scored_cats = collections.defaultdict(float)
        for cat in index[user]:
            scored_cats[cat] += idf[cat]
        ranked = sorted(
            scored_cats.items(), key=lambda e: e[1], reverse=True)
        # Output this in a unix CLI-friendly way.
        for desc, score in ranked:
            sys.stdout.write('%s\t%s\t%s\n' % (score, user, desc))

# Run the classifier only when executed as a script, not on import.
if __name__ == '__main__':
    main(sys.argv)
	"""Classifies usernames according to which corpus in dariusk's
	collection of small corpora they seem to fit into best.

	John Wiseman <jjwiseman@gmail.com>

	First each username is segmented using Google's trillion-word corpus.
	For example, "mathemagician" becomes ["ma", "the", "magician"].

	Then we look for corpora that match each segment.

	ma -> Slovak stop words
	the -> English stop words, Slovak stop words
	magician -> Common character archetypes

	We use a TF-IDF model to assign information values to each corpus over
	the entire set of usernames. E.g. if almost every username gets a
	classification as "English stop words", "English stop words" will get
	a low score.

	Finally for each username we find the highest scoring matching
	corpora.

	Usage:
	$ git clone https://github.com/dariusk/corpora.git
	$ cd corpora
	$ curl -O 'http://mefi.us/infodump/usernames.txt.zip'
	$ unzip usernames.txt.zip
	$ pip install wordsegment python-gflags
	$ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep cannabis | head
	-5.25749537203 a little headband I put around my throat ~350 popular strains of cannabis
	-5.25749537203 a shrill fucking shitstripe ~350 popular strains of cannabis
	-5.25749537203 A-Train ~350 popular strains of cannabis
	-5.25749537203 AdapterIce ~350 popular strains of cannabis
	-5.25749537203 ambrosia ~350 popular strains of cannabis
	-5.25749537203 Ambrosia Voyeur ~350 popular strains of cannabis
	-5.25749537203 American Christmas Devil ~350 popular strains of cannabis
	-5.25749537203 AMSBoethius ~350 popular strains of cannabis
	-5.25749537203 androx ~350 popular strains of cannabis
	-5.25749537203 anonymice ~350 popular strains of cannabis
	$ cat usernames.txt | cut -d $'\t' -f 3 | python user.py | grep "little headband"
	-5.25749537203 a little headband I put around my throat ~350 popular strains of cannabis
	-5.39816270152 a little headband I put around my throat A list of NSA project code names.
	-7.43070708255 a little headband I put around my throat Danish stop words
	-7.48155570191 a little headband I put around my throat Suffixes taken from a form on an airline website.
	-7.63867982388 a little headband I put around my throat Norwegian stop words
	-8.04494704962 a little headband I put around my throat Swedish stop words
	-8.42901750051 a little headband I put around my throat Portuguese stop words
	-8.62137301033 a little headband I put around my throat Spanish stop words
	-9.20883924585 a little headband I put around my throat data/science/elements.json
	-16.9385253153 a little headband I put around my throat Polish stop words
	"""

	import codecs
	import collections
	import json
	import logging
	import math
	import os
	from os import path
	import sys
	import types

	import gflags
	import wordsegment

	# Wrap stdout in a UTF-8 stream writer so unicode strings written to it
	# are encoded as UTF-8.
	sys.stdout = codecs.getwriter('utf8')(sys.stdout)

	# Global flag registry; main() parses the command line through it.
	FLAGS = gflags.FLAGS
	logger = logging.getLogger(__name__)


	# --dump_idf: when set, main() prints the per-category scores instead of
	# the per-username classifications.
	gflags.DEFINE_bool(
	'dump_idf', False,
	'Dump the category scores.')


	def json_files(root='data'):
		"""Returns a sorted list of all JSON files under a directory.

		Args:
		  root: Directory to search recursively. Defaults to 'data',
		    resolved relative to the current working directory.

		Returns:
		  The paths (each prefixed with root) of every file whose name
		  ends in '.json', sorted so the result does not depend on the
		  order in which the filesystem lists directory entries. A
		  missing root yields an empty list.
		"""
		jsons = []
		for dirpath, _, filenames in os.walk(root):
			jsons.extend(
				path.join(dirpath, name)
				for name in filenames
				if name.endswith('.json'))
		return sorted(jsons)


	def corpus_values(corpus):
		"Returns all values in a corpus, ignoring its metadata fields."
		vs = []
		for key in corpus:
			# 'description' and 'source' describe the corpus; they are
			# not part of its contents.
			if key not in ('description', 'source'):
				vs += values(corpus[key])
		return vs


	def values(data):
		"""Flattens a JSON object into a single list of values.

		Dicts contribute their (recursively flattened) values, never
		their keys. Lists and tuples, including subclasses, contribute
		their flattened elements. Anything else is returned as a
		one-element list.
		"""
		if isinstance(data, dict):
			# For dicts, we just want the values.
			children = data.values()
		elif isinstance(data, (list, tuple)):
			children = data
		else:
			return [data]
		flat = []
		for child in children:
			flat.extend(values(child))
		return flat


	def read_corpus(path):
		"Reads the corpus file at the given path. Returns the parsed JSON."
		with open(path, 'rb') as f:
			return json.loads(f.read())


	def build_db():
		"""Builds a value -> corpus descriptions database.

		Reads every corpus file returned by json_files() and indexes
		each truthy value it contains, as a lowercased unicode string,
		against the corpus's description. A corpus without a
		description is indexed under its file path instead.

		Returns:
		  A defaultdict(list) mapping each lowercased value to the
		  descriptions of the corpora containing it (one entry per
		  occurrence).
		"""
		logger.info('Loading corpora...')
		db = collections.defaultdict(list)
		for f in json_files():
			logger.info('Reading %s', f)
			corpus = read_corpus(f)
			desc = corpus.get('description')
			if not desc:
				logger.warning('No description: %s', f)
				desc = f
			for v in corpus_values(corpus):
				# Falsy values (empty strings, None, 0, False) are not
				# indexed.
				if v:
					# Index values as (lowercase) strings.
					db[unicode(v).lower()].append(desc)
		logger.info('Done loading corpora')
		return db


	def unique(seq):
		"Removes duplicates from a sequence; the result order is not defined."
		return list(set(seq))


	def main(argv):
		"""Classifies the usernames read from stdin and prints the results.

		Each stdin line is decoded as UTF-8 and stripped to give a
		username. The username is split into segments with wordsegment,
		and every segment (plus the whole username) is looked up in the
		corpus database to collect matching categories. Each category is
		then scored as log(1 / n), where n is the number of times it was
		matched across all usernames.

		With --dump_idf, prints one "score category" line per category,
		lowest score first. Otherwise prints one tab-separated
		"score, username, category" line per match, with usernames in
		case-insensitive order and each username's categories from
		highest to lowest summed score.

		Args:
		  argv: The full command line, including the program name.
		"""
		# Called for its side effect of parsing the flags.
		FLAGS(argv)
		logging.basicConfig(level=logging.INFO)
		db = build_db()

		index = {}
		for line in sys.stdin:
			user = unicode(line, 'utf8').strip()
			# Skip blank lines, and usernames already seen: re-adding a
			# repeated username would append its categories a second
			# time and inflate both the category counts and its scores.
			if not user or user in index:
				continue
			words = set(wordsegment.segment(user) + [user])
			cats = []
			for word in words:
				cats.extend(unique(db.get(word.lower(), [])))
			index[user] = cats

		idf = collections.defaultdict(int)
		for cats in index.values():
			for cat in cats:
				idf[cat] += 1
		for cat in idf:
			idf[cat] = math.log(1.0 / idf[cat])

		if FLAGS.dump_idf:
			for cat, score in sorted(idf.items(), key=lambda e: e[1]):
				sys.stdout.write('%f %s\n' % (score, cat))
			return

		for user in sorted(index, key=lambda u: u.lower()):
			# A category matched by several segments accumulates its
			# score once per match.
			scored_cats = collections.defaultdict(float)
			for cat in index[user]:
				scored_cats[cat] += idf[cat]
			ranked = sorted(
				scored_cats.items(), key=lambda e: e[1], reverse=True)
			# Output this in a unix CLI-friendly way.
			for desc, score in ranked:
				sys.stdout.write('%s\t%s\t%s\n' % (score, user, desc))


	# Run the classifier only when executed as a script, not on import.
	if __name__ == '__main__':
		main(sys.argv)