#!/usr/bin/env python
"""
This script aims to make it a little easier to work with ConceptNet
relationships using the raw data available. It extracts all of the semantic
relationships for English terms, and saves the results to multiple JSON files
-- one for each letter of the alphabet, plus a '0' file for terms that don't
start with a letter.

As of Jan 2015, this will require about 9 GB of disk space:

* conceptnet5_flat_json_5.3.tar.bz2 is 644 MB compressed,
  7.3 GB uncompressed.
* Using the file above, the output of this script is 747 MB uncompressed.

To run:

1. Download the 'flat json' file from
   http://conceptnet5.media.mit.edu/downloads/current/

2. Extract the file, and then change to the data directory. For example:

       tar xvjf conceptnet5_flat_json_5.3.tar.bz2
       cd data

3. Run this script to parse the data files. The results will be saved to
   a directory named 'parsed':

       python convert_rels.py
       ls -l parsed/
       du -sh parsed/
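
Each line in the assertions files is a standalone JSON object describing one
assertion; only its 'start', 'rel', and 'end' fields are used here. An
illustrative (abridged) line might look like:

    {"start": "/c/en/apple", "rel": "/r/IsA", "end": "/c/en/fruit", ...}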

Sample output (Jan 2015):

    $ python convert_rels.py
    Reading...
     assertions/part_00.jsons (1065963 items)
     assertions/part_01.jsons (1066531 items)
     assertions/part_02.jsons (1065944 items)
     assertions/part_03.jsons (1066361 items)
     assertions/part_04.jsons (1066461 items)
     assertions/part_05.jsons (1067644 items)
     assertions/part_06.jsons (1066130 items)
     assertions/part_07.jsons (1067429 items)
    Writing...
     relationship types
     0 (47110 items)
     a (169995 items)
     b (166081 items)
     c (194629 items)
     d (117090 items)
     e (80352 items)
     f (83671 items)
     g (100831 items)
     h (102779 items)
     i (61977 items)
     j (78533 items)
     k (78580 items)
     l (110009 items)
     m (177304 items)
     n (89463 items)
     o (55490 items)
     p (162473 items)
     q (9865 items)
     r (107025 items)
     s (242414 items)
     t (119849 items)
     u (52904 items)
     v (43502 items)
     w (80200 items)
     x (5250 items)
     y (16658 items)
     z (14671 items)
    Done
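
Reading the results back (a minimal sketch; 'apple' is just an example key,
present only if the data contains assertions that start from that term):

    import json
    with open('parsed/terms_a.json') as fp:
        terms = json.load(fp)
    for start, rel, end in terms.get('apple', []):
        print start, rel, end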
"""
import codecs
import json
import os
import string
import sys
from collections import defaultdict
from glob import glob


def main():
    try:
        os.mkdir('parsed')
    except OSError:
        pass

    # get statistics on the types of relationships used, in the format:
    # { '/r/SomeRelationship': number_of_entries, ... }
    rel_types = defaultdict(int)

    # get relationship triplets for each term, in the format:
    # {
    #     'a': {
    #         'a word': [
    #             (start_term, relationship, end_term),
    #             ...
    #         ],
    #         ...
    #     },
    #     ...
    # }
    rel_entries = {}
    for letter in (string.lowercase + '0'):
        rel_entries[letter] = defaultdict(list)

    # prefix used for English terms
    en_term_prefix = '/c/en/'
    en_term_prefix_len = len(en_term_prefix)

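    # single pass over the flat dump, keeping only assertions whose start
    # and end terms are both English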
    print 'Reading...'
    for fn in glob('assertions/*.jsons'):
        sys.stdout.write(' %s ' % fn)
        added = 0
        with codecs.open(fn, encoding='utf-8') as fp:
            # parse each line as a json object;
            # only parse English terms
            for line in fp:
                if en_term_prefix not in line:
                    continue
                item = json.loads(line)
                start, rel, end = item['start'], item['rel'], item['end']
                if not (start.startswith(en_term_prefix) and
                        end.startswith(en_term_prefix)):
                    continue
                # remove the prefix
                start = start[en_term_prefix_len:]
                end = end[en_term_prefix_len:]
                letter = start[0].lower()
                if letter not in string.lowercase:
                    letter = '0'
                # add the relationship
                rel_types[rel] += 1
                rel_entries[letter][start].append((start, rel, end))
                added += 1
        sys.stdout.write('(%d items)\n' % added)

    print 'Writing...'
    with codecs.open('parsed/rel_types.txt', 'w', encoding='utf-8') as fp:
        print ' relationship types'
        for rel, count in sorted(rel_types.items()):
            fp.write('%7d %s\n' % (count, rel))

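    # write each letter's {term: [(start, rel, end), ...]} map to its own
    # JSON file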
    for letter in sorted(rel_entries.keys()):
        entries = rel_entries[letter]
        print ' %s (%d items)' % (letter, len(entries))
        with codecs.open('parsed/terms_%s.json' % letter, 'w',
                         encoding='utf-8') as fp:
            json.dump(entries, fp, indent=2)

    print 'Done'


if __name__ == '__main__':
    try:
        retval = main()
    except KeyboardInterrupt:
        print('')
        retval = 1
    sys.exit(retval)