# roberto-arista/hapaxator.py

## hapaxator.py
# Copyright (c) 2013, Federico Antonini e Roberto Arista
# MIT License
# Hapax script

### Opening libraries
import io
import os
import re
import string
import sys
from operator import itemgetter

def occurDict(items):
    """
    Counts how many times each element of an iterable occurs.
    ['a','a','a','b', 'b', 'c'] --> {'a': 3, 'b': 2, 'c': 1}

    items: any iterable whose elements can be used as dictionary keys.
    Returns a plain dictionary mapping every distinct element to its count
    (an empty dictionary for an empty iterable).

    """
    counts = {}

    for element in items:
        # Elements not seen yet start from zero
        counts[element] = counts.get(element, 0) + 1

    return counts


def _order_by_occurrences(counts, decreasing):
    """
    Returns the (item, occurrences) pairs of a dictionary sorted by occurrences.

    counts: dictionary mapping each item to its number of occurrences.
    decreasing: if true the most frequent items come first, otherwise last.

    Items with the same number of occurrences are sorted by item (descending
    when increasing, ascending when decreasing), so the result does not
    depend on the iteration order of the dictionary.

    """
    # Two stable sorts: the first settles the ties, the second orders by count
    pairs = sorted(counts.items(), key=itemgetter(0), reverse=True)
    pairs.sort(key=itemgetter(1))

    if decreasing:
        pairs.reverse()

    return pairs


def _repeat_items(pairs):
    """
    Returns a unicode string where each item is written as many times as it
    occurs, every copy followed by a space.
    [(u'a', 2), (u'b', 1)] --> u'a a b '

    """
    return u''.join((item + u' ') * occurrences for item, occurrences in pairs)


def main():
    """
    This script is able to reorder by occurrences a txt file.
    Usage: python hapaxator.py input.txt

    The input file is read as UTF-8. The user is asked for the order:
    'd' means decreasing, any other answer means increasing.
    Two UTF-8 files are written next to the input file:
    <input name>_HAPAXATOR_words.txt and <input name>_HAPAXATOR_punctation.txt

    Exits with the usage message if the input file argument is missing.

    """

    # Checking the arguments before asking anything to the user
    if len(sys.argv) < 2:
        sys.exit('Usage: python hapaxator.py input.txt')
    input_path = sys.argv[1]

    ### Variables
    try:
        ask = raw_input     # Python 2
    except NameError:
        ask = input         # Python 3
    order = ask('What order do you prefer? Increasing or decreasing? [i/d] ')
    decreasing = order.strip().lower() == 'd'

    # Opening text file (decoded once while reading, closed right after)
    with io.open(input_path, 'r', encoding='utf8') as input_file:
        input_read = input_file.read()

    # Separating punctuation from the rest of the text
    punctuation_marks = set(string.punctuation)
    input_string_words = u''.join(each_ch for each_ch in input_read
                                  if each_ch not in punctuation_marks)
    punctuation_list = [each_ch for each_ch in input_read
                        if each_ch in punctuation_marks]

    # Splitting into words: any run of whitespace is a separator, so words on
    # consecutive lines are not glued together and repeated spaces do not
    # produce empty words
    words_list = input_string_words.split()

    # Occurrences dictionaries, ordered as requested
    words_occurrences = _order_by_occurrences(occurDict(words_list), decreasing)
    punctuation_occurrences = _order_by_occurrences(occurDict(punctuation_list), decreasing)

    # output!
    # splitext drops the extension whatever its length is
    output_base = os.path.splitext(input_path)[0]

    with io.open(output_base + '_HAPAXATOR_words.txt', 'w', encoding='utf8') as output_words:
        output_words.write(_repeat_items(words_occurrences))

    with io.open(output_base + '_HAPAXATOR_punctation.txt', 'w', encoding='utf8') as output_punctuation:
        output_punctuation.write(_repeat_items(punctuation_occurrences))

# Script entry point: runs the reordering and reports completion.
if __name__ == "__main__":
    main()
    # The single-argument call form prints the same text on Python 2 and 3
    print('done!')