Last active
August 29, 2015 14:09
-
-
Save roberto-arista/cecea49df3de2a3870e4 to your computer and use it in GitHub Desktop.
This script reorders the contents of a txt file by number of occurrences. It writes two separate txt files, one for words and one for punctuation. The direction of the ordering can be chosen: increasing or decreasing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2013, Federico Antonini e Roberto Arista
# MIT License
# Hapax script
### Opening libraries
import io
import os
import re
import string
import sys
from collections import Counter
from operator import itemgetter
def occurDict(items):
    """
    Return a dictionary mapping each element of an iterable to its number of occurrences.

    ['a', 'a', 'a', 'b', 'b', 'c'] --> {'a': 3, 'b': 2, 'c': 1}

    An empty iterable yields an empty dictionary. Elements must be hashable,
    since they are used as dictionary keys.
    """
    # Counter does the tallying; convert back to a plain dict so callers keep
    # receiving exactly the type they always got.
    return dict(Counter(items))
def _prompt_order():
    """Ask the user for the ordering direction and return the raw answer."""
    try:
        ask = raw_input  # Python 2: plain-text prompt
    except NameError:
        ask = input  # Python 3: raw_input was renamed to input
    return ask('What order do you prefer? Increasing or decreasing? [i/d] ')


def _tokenize(text):
    """Split *text* into a list of words and a list of punctuation marks."""
    punctuation = set(string.punctuation)
    # Drop punctuation first, then split on runs of spaces, tabs and line
    # breaks. Splitting (rather than deleting the line breaks) keeps the last
    # word of a line separate from the first word of the next one, and the
    # filter discards the empty tokens produced by leading/trailing whitespace.
    words_text = u''.join(ch for ch in text if ch not in punctuation)
    words = [word for word in re.split(r'[ \t\r\n]+', words_text) if word]
    marks = [ch for ch in text if ch in punctuation]
    return words, marks


def _render(occurrences):
    """Repeat every item once per occurrence, each followed by a space."""
    return u''.join((item + u' ') * count for item, count in occurrences)


def main():
    """
    Reorder the contents of a txt file by number of occurrences.

    Usage: python hapaxator.py input.txt

    The user is asked for a direction: answering 'd' gives decreasing order,
    any other answer gives increasing order. Two UTF-8 files are written next
    to the input file, named after it with its extension removed:
    '<name>_HAPAXATOR_words.txt' and '<name>_HAPAXATOR_punctation.txt'.

    Exits with a usage message when no input file is given on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit('Usage: python hapaxator.py input.txt')
    source_path = sys.argv[1]

    ### Variables
    order = _prompt_order()

    # Opening text file (decoded once, at the I/O boundary)
    with io.open(source_path, 'r', encoding='utf8') as source:
        text = source.read()

    # Splitting into words and punctuation marks
    words_list, punctuation_list = _tokenize(text)

    # Occurrences dictionaries
    words_dict = occurDict(words_list)
    punctuation_dict = occurDict(punctuation_list)

    # Ordering: the first pass sorts words in reverse alphabetical order, the
    # second (stable) pass sorts by count, so ties keep the first-pass order.
    words_occurrences = sorted(words_dict.items(), key=itemgetter(0), reverse=True)
    words_occurrences = sorted(words_occurrences, key=itemgetter(1))
    punctuation_occurrences = sorted(punctuation_dict.items(), key=itemgetter(1))

    # Reversing the lists for decreasing order
    if order == 'd':
        words_occurrences.reverse()
        punctuation_occurrences.reverse()

    # output!
    # splitext removes whatever extension the input has, not just 3-letter ones.
    base_path = os.path.splitext(source_path)[0]
    with io.open(base_path + '_HAPAXATOR_words.txt', 'w', encoding='utf8') as output_words:
        output_words.write(_render(words_occurrences))
    with io.open(base_path + '_HAPAXATOR_punctation.txt', 'w', encoding='utf8') as output_punctuation:
        output_punctuation.write(_render(punctuation_occurrences))
# Run the script only when executed directly, not when imported.
if __name__ == "__main__":
    main()
    # Parenthesised so the line parses on both Python 2 and Python 3.
    print('done!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment