Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
This script reorders the contents of a text file by number of occurrences. It writes two separate text files, one for words and one for punctuation. The ordering direction can be chosen: increasing or decreasing.
# Copyright (c) 2013, Federico Antonini e Roberto Arista
# MIT License
# Hapax script
### Opening libraries
import io
import os
import re
import string
import sys
from operator import itemgetter
def occurDict(items):
    """
    Return a dictionary mapping each element of an iterable to the number
    of times it occurs.

    ['a', 'a', 'a', 'b', 'b', 'c'] --> {'a': 3, 'b': 2, 'c': 1}

    An empty iterable yields an empty dictionary. Elements must be hashable,
    since they are used as dictionary keys.
    """
    dictionary = {}
    for each_item in items:
        # .get() supplies 0 for an unseen element, so a single lookup
        # handles both the first and the subsequent occurrences.
        dictionary[each_item] = dictionary.get(each_item, 0) + 1
    return dictionary
def _rank_by_count(counts, decreasing):
    """Return the (item, count) pairs of *counts* ordered by count."""
    # Two stable passes: the first fixes the order among items with equal
    # counts (reverse alphabetical), the second orders by count while
    # keeping that tie order intact.
    ranked = sorted(counts.items(), key=itemgetter(0), reverse=True)
    ranked.sort(key=itemgetter(1))
    if decreasing:
        # Reversing the whole list also flips ties to alphabetical order.
        ranked.reverse()
    return ranked


def _spell_out(ranked):
    """Return every item repeated *count* times, each followed by a space."""
    return u''.join((item + u' ') * count for item, count in ranked)


def main():
    """
    Reorder the contents of a txt file by occurrences.

    Reads the UTF-8 text file given as first command-line argument, asks
    for the ordering direction, and writes two UTF-8 files next to the
    input: '<name>_HAPAXATOR_words.txt' and '<name>_HAPAXATOR_punctation.txt'.

    Usage: python hapaxator.py input.txt

    Raises SystemExit with the usage message when no input file is given.
    """
    if len(sys.argv) < 2:
        raise SystemExit('Usage: python hapaxator.py input.txt')
    source_path = sys.argv[1]

    ### Variables
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3
    order = ask('What order do you prefer? Increasing or decreasing? [i/d] ')
    # Anything other than 'd' (ignoring case and surrounding blanks) means
    # increasing order.
    decreasing = order.strip().lower() == 'd'

    # Opening text file; the context manager closes it even on error
    with io.open(source_path, 'r', encoding='utf8') as source:
        text = source.read()

    # Stripping punctuation
    punctuation = set(string.punctuation)
    text_without_punctuation = u''.join(
        each_ch for each_ch in text if each_ch not in punctuation)
    punctuation_list = [each_ch for each_ch in text if each_ch in punctuation]

    # Splitting into words on any whitespace run, so that words on
    # consecutive lines are not glued together and repeated spaces do not
    # produce empty "words"
    words_list = text_without_punctuation.split()

    # Occurrences dictionaries
    words_dict = occurDict(words_list)
    punctuation_dict = occurDict(punctuation_list)

    # Ordering the dictionaries (same rule for words and punctuation)
    words_occurrences = _rank_by_count(words_dict, decreasing)
    punctuation_occurrences = _rank_by_count(punctuation_dict, decreasing)

    # Creating the output unicode strings
    string_output_words = _spell_out(words_occurrences)
    string_output_punctuation = _spell_out(punctuation_occurrences)

    # output! splitext drops the extension whatever its length.
    base_path = os.path.splitext(source_path)[0]
    with io.open(base_path + '_HAPAXATOR_words.txt', 'w',
                 encoding='utf8') as output_words:
        output_words.write(string_output_words)
    # NOTE: the 'punctation' spelling is kept so the output file name
    # stays the same as in earlier versions of the script.
    with io.open(base_path + '_HAPAXATOR_punctation.txt', 'w',
                 encoding='utf8') as output_punctuation:
        output_punctuation.write(string_output_punctuation)
if __name__ == "__main__":
    # Run the script only when executed directly, not when imported.
    main()
    # The parenthesized form prints the same text on Python 2 and Python 3.
    print('done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment