Last active
February 6, 2020 06:15
-
-
Save zilista/f5a891fff912bc45125c5238697c9427 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
# import ssl | |
# | |
# try: | |
# _create_unverified_https_context = ssl._create_unverified_context | |
# except AttributeError: | |
# pass | |
# else: | |
# ssl._create_default_https_context = _create_unverified_https_context | |
# | |
# nltk.download() | |
# English sample paragraph. None of the live statements visible in this
# script read it; only the commented-out English experiments mention English
# words. The literal contains the paragraph text only (no table-cell residue),
# so tokenising it yields no stray "|" tokens.
text = """
Obviously, you have already come across lots of essays and now understand how
significant the writing skills are. Moreover, essay writing is a usual and crucial part of
the education process, so your spirit should not be daunted by the nearest assignment.
Normally, as a good exercise in creative writing, essays help the student grasp researching,
planning and drafting skills, every one of which belongs to a ‘must-have’ category
and is highly evaluated by everyone who wants to finish the program successfully.
Therefore, request any help from the teacher in case of emergency, because you can be sure he or she does not
long to see you suffer. The only thing you ought not to do is to moan and complain about
the volume of work; also, it would not be reasonable to discuss with them your impending
deadline, because, most likely, the real problem will appear to be your poor time management.
"""
# Russian sample paragraph: the corpus that the sentence splitting, stem
# frequency and bigram steps below operate on. The literal contains the
# paragraph text only (no table-cell residue), so tokenising it yields no
# stray "|" tokens.
text_ru = """
Курсовые работы – большинство студентов боятся их, как огня. Ведь, не сдав вовремя курсовую работу
или получив неудовлетворительную оценку, вы рискуете не попасть на следующий курс обучения.
Никому не хочется проходить через всю эту волокиту с беготней за преподавателем, бесконечными правками, пересдачами
и т.д. Чтобы написать качественную курсовую, необходимо перелопатить уйму теоретического материала,
провести собственное исследование, да еще и оформить все это грамотно, с учетом действующих нормативов.
И хорошо, если подготовкой курсовой вы начали заниматься заблаговременно и с данной дисциплиной у вас все ладится.
А если нет? Если вы по болезни или по другим причинам пропустили много занятий? Если вам приходится совмещать учебу
и работу, так что свободного времени и сил катастрофически не хватает?
Отличное решение для таких случаев – заказать курсовую работу у профессионалов в данной сфере.
"""
# Split the Russian sample into sentences. The tokenizer is asked for its
# Russian model explicitly; without the argument NLTK falls back to the
# English one, which is the wrong model for this text.
sentences = nltk.sent_tokenize(text_ru, language='russian')
# First sentence of the sample; empty string if the tokenizer returned nothing.
sentences_0 = sentences[0] if sentences else ''
# words = nltk.word_tokenize(sentences_0) | |
# | |
# word = 'Obviously' | |
# ps = PorterStemmer() | |
# base = ps.stem(word) | |
# print(base) | |
# rus_word = 'Курсовые' | |
# stemmer = SnowballStemmer('russian') | |
# rus_stem = stemmer.stem(rus_word) | |
# print(rus_stem) | |
# print(words) | |
# Lemmatisation demo: run one Russian word through NLTK's WordNet lemmatizer
# and print whatever it returns.
# NOTE(review): WordNet is an English lexical database, so the word is most
# likely printed back unchanged -- confirm, and consider a Russian
# morphological analyser if real lemmas are needed.
lemma = WordNetLemmatizer()
rus_word = 'Курсовые'
lword = lemma.lemmatize(rus_word)
print(lword)
# Relative frequency of every stem in ``text_ru``.
#
# Produces:
#   result_dict  -- {stem: share of all counted words, in percent (2 decimals)}
#   sorted_words -- the same pairs as a list, most frequent stem first
stemmer = SnowballStemmer('russian')
swords = stopwords.words('russian')  # load the Russian stop-word list
# Frozen copy for O(1) membership tests; ``swords`` itself stays a list.
_stopword_set = frozenset(swords)

stem_counts = Counter()
for token in nltk.word_tokenize(text_ru, language='russian'):
    # Compare case-insensitively so that capitalised, sentence-initial
    # stop words are filtered out as well.
    lowered = token.lower()
    if lowered in _stopword_set:
        continue
    # Drop tokens that contain no letter or digit at all. This also covers
    # non-ASCII punctuation such as the en dash, which ``string.punctuation``
    # does not contain.
    if not any(ch.isalnum() for ch in lowered):
        continue
    stem_counts[stemmer.stem(lowered)] += 1

# Normalise by the number of counted words, not by the number of distinct
# stems, so the shares add up to roughly 100 %.
total_words = sum(stem_counts.values())
result_dict = {
    stem: round(count / total_words * 100, 2)
    for stem, count in stem_counts.items()
}
sorted_words = sorted(result_dict.items(), key=lambda item: item[1], reverse=True)
print(sorted_words)
# Bigrams over all tokens of ``text_ru`` (punctuation and stop words included).
# ``nltk.ngrams`` yields its tuples lazily and can be consumed only once, so
# the result is materialised into a list up front; otherwise printing it would
# leave nothing for the loop below to iterate over.
bigrams = list(nltk.ngrams(nltk.word_tokenize(text_ru, language='russian'), 2))
print(bigrams)
for word_tupl in bigrams:
    # Space-joined form of the bigram. Nothing in the visible code consumes
    # it yet; after the loop ``word`` holds the last bigram.
    word = ' '.join(word_tupl)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment