Created
November 1, 2017 09:16
-
-
Save anonymous/addaac6423d4cd954ac8ba01d4af7397 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# In[1]:
# Default locations of the text to analyse and of the resulting CSV report.
input_file = 'input.txt'
output_file = 'output.csv'
# TODO: for a console application, the file names should be read from the
# command-line parameters instead of being hard-coded here.
# In[11]:
def read_text(filename):
    """Read a whole text file into a single string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete file contents, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # expected input is Cyrillic text.
    with open(filename, encoding="utf-8") as source:
        return source.read()
# In[3]:
def clear_symbols(raw_string):
    """Strip punctuation and redundant whitespace from a text.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a space, so words separated only by punctuation ("a,b") do not get
    glued together. Runs of whitespace are then collapsed to a single space
    and leading/trailing whitespace is dropped.

    Args:
        raw_string: Text as read from the input.

    Returns:
        The words of the text separated by single spaces; an empty string
        when the input contains no alphanumeric characters.
    """
    without_punctuation = "".join(
        char if char.isalnum() or char.isspace() else " "
        for char in raw_string
    )
    # split() without arguments both collapses whitespace runs and trims ends.
    return " ".join(without_punctuation.split())
# In[4]:
def tokenize(clear_string):
    """Split a punctuation-free text into word tokens.

    Args:
        clear_string: Text whose words are separated by whitespace.

    Returns:
        A list of the words in their original order; an empty list for an
        empty or whitespace-only string.
    """
    return clear_string.split()
# In[5]:
def normalize(tokens):
    """Lower-case every token so that counting is case-insensitive.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list with each token converted to lower case, in the same
        order as the input.
    """
    return [token.lower() for token in tokens]
# In[6]:
def build_dictionary(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of (already normalized) word strings.

    Returns:
        A plain dict mapping each distinct token to its number of
        occurrences; an empty dict for empty input.
    """
    # Imported locally because the file has no top-level import section.
    from collections import Counter

    # Convert back to a plain dict so callers get the same type as before.
    return dict(Counter(tokens))
# In[7]:
# Minimal built-in list of Russian function words (conjunctions, prepositions,
# particles). It is deliberately small; pass ``stop_words`` explicitly to
# filter against a different or larger list.
_DEFAULT_STOP_WORDS = frozenset({
    "и", "а", "но", "или", "да",
    "в", "во", "на", "с", "со", "к", "ко", "по", "за", "из", "у",
    "о", "об", "от", "до", "для",
    "не", "ни", "же", "бы", "ли",
})


def remove_stop_words(frequency_dict, stop_words=None):
    """Return a copy of the frequency dictionary without stop words.

    Keys are compared as-is, so the dictionary is expected to contain
    lower-cased words when the default stop-word list is used.

    Args:
        frequency_dict: Mapping of word -> occurrence count.
        stop_words: Optional collection of words to drop. When omitted, a
            small built-in list of Russian function words is used.

    Returns:
        A new dict with the stop words removed; the input mapping is left
        unmodified.
    """
    excluded = _DEFAULT_STOP_WORDS if stop_words is None else stop_words
    return {
        word: count
        for word, count in frequency_dict.items()
        if word not in excluded
    }
# In[9]:
def write_dict_to_csv(frequency_dict, csv_filename):
    """Write the frequency dictionary to a CSV file, most frequent first.

    The file starts with the header row ``word,count`` followed by one row
    per word, ordered by descending count. Words with equal counts are
    ordered alphabetically so the output is deterministic. Example:

        word,count
        мама,2
        вот,1
        раму,1

    Args:
        frequency_dict: Mapping of word -> occurrence count.
        csv_filename: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Imported locally because the file has no top-level import section.
    import csv

    rows = sorted(
        frequency_dict.items(),
        key=lambda item: (-item[1], item[0]),
    )
    # newline="" lets the csv module control line endings itself.
    with open(csv_filename, "w", encoding="utf-8", newline="") as target:
        writer = csv.writer(target)
        writer.writerow(["word", "count"])
        writer.writerows(rows)
# In[12]:
# Run the whole pipeline: read -> clean -> tokenize/normalize -> count ->
# drop stop words -> report.
input_text = read_text(input_file)
clear_text = clear_symbols(input_text)
clear_tokens = normalize(tokenize(clear_text))
frequency_dict = remove_stop_words(build_dictionary(clear_tokens))
print("Formated dictionary:", frequency_dict)
write_dict_to_csv(frequency_dict, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment