Skip to content

Instantly share code, notes, and snippets.

@ekingery
Last active January 5, 2017 17:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ekingery/1bb6865e24fccb40c1cd1368c04f4d31 to your computer and use it in GitHub Desktop.
Save ekingery/1bb6865e24fccb40c1cd1368c04f4d31 to your computer and use it in GitHub Desktop.
Parse RCV1 topics into a tree structure
# This script parses the RCV1 topics into a tree structure,
# which can then be exported to JSON or dotfile format.
# For more info on RCV1, see
# http://jmlr.csail.mit.edu/papers/volume5/lewis04a/lewis04a.pdf
import re
from treelib import Tree
from treelib.plugins import export_to_dot
# Rows read from the flat file; each row is [parent, child, description].
topics = []
# The topic tree built from those rows; node identifiers are the topic codes.
ttree = Tree()

# Columns in the source file are separated by runs of two or more whitespace
# characters. Compiled once here instead of on every line of the file.
_COLUMN_SEP = re.compile(r'\s{2,}')

# curl http://jmlr.csail.mit.edu/papers/volume5/lewis04a/a03-expanded-topics-hierarchy/rcv1.topics.hier.expanded > rcv1.topics.hier.expanded.txt # noqa
with open('rcv1.topics.hier.expanded.txt', 'r') as f:
    for line in f:
        # A blank (e.g. trailing) line yields a single empty column, which
        # would raise IndexError when the rows are turned into nodes below.
        if not line.strip():
            continue
        cols = _COLUMN_SEP.split(line.rstrip('\n'))
        # Strip the field labels so only the values remain.
        cols = [c.replace('parent: ', '').
                replace('child: ', '').
                replace('child-description: ', '') for c in cols]
        topics.append(cols)

# Loop once to set the root node and its children
# (the source data is in level-order after the 2nd level).
for t in topics:
    if 'None' == t[0]:
        # The row whose parent is 'None' is the root: created without a parent.
        ttree.create_node(t[1] + " - " + t[2], t[1])
    elif 'Root' == t[0]:
        ttree.create_node(t[1] + " - " + t[2], t[1], parent=t[0])

# Loop again to add all other nodes; their parents were created either in
# the first pass or earlier in this one.
for t in topics:
    if ('None' != t[0] and 'Root' != t[0]):
        ttree.create_node(t[1] + " - " + t[2], t[1], parent=t[0])

# Uncomment to print the whole tree or export it as a single dotfile:
# print(ttree)
# export_to_dot(ttree, 'topics-tree.dot')

# Export one dotfile per main category subtree.
for cat in ['CCAT', 'ECAT', 'GCAT', 'MCAT']:
    export_to_dot(ttree.subtree(cat), cat + '-topics-tree.dot')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment