Skip to content

Instantly share code, notes, and snippets.

@MBoaretto25
Created January 31, 2019 17:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MBoaretto25/3f58a7dd2d171b15d12f5d94b6379428 to your computer and use it in GitHub Desktop.
Save MBoaretto25/3f58a7dd2d171b15d12f5d94b6379428 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 31 15:41:46 2019
@author: boaretto
"""
from string import digits
from nltk import ngrams, FreqDist
from nltk.corpus import stopwords
from collections import Counter
def words_frequency_pt(text):
    """Count how often each non-stop-word occurs in a Portuguese text.

    The text is cleaned by turning the characters ``-``, ``.``, ``,``,
    ``#`` and newlines into spaces, deleting ASCII digits, and lower-casing
    the result. It is then split on whitespace, and every word found in
    NLTK's Portuguese stop-word list (extended with ``'é'`` and ``'ter'``)
    is discarded.

    Args:
        text: The string to analyse.

    Returns:
        The result of ``Counter.most_common()`` over the remaining words:
        a list of ``(word, count)`` tuples ordered from most to least
        frequent.
    """
    # Clean punctuation and digits in a single translation pass:
    # each punctuation character maps to a space, each digit is deleted.
    punctuation = '-.,\n#'
    cleanup_table = str.maketrans(punctuation, ' ' * len(punctuation), digits)
    words_list = text.translate(cleanup_table).lower().split()

    # Remove stop words. A set makes each membership test O(1) instead of
    # scanning the whole stop-word list once per word.
    stop_words = set(stopwords.words('portuguese'))
    stop_words.update(('é', 'ter'))

    return Counter(w for w in words_list if w not in stop_words).most_common()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment