Created
June 2, 2020 20:56
-
-
Save mathigatti/8209e7d220ea918fa47b4fd00eebf310 to your computer and use it in GitHub Desktop.
Sort the words in a text file by their relevance.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from es_lemmatizer import lemmatize | |
import spacy | |
from nltk.tokenize import word_tokenize | |
from nltk import ngrams, FreqDist | |
from tqdm import tqdm | |
from collections import defaultdict | |
import unidecode | |
import sys | |
import os | |
# Load the small Spanish spaCy pipeline and attach the es_lemmatizer
# component right after the POS tagger, so nlp(...) tokens expose
# Spanish lemmas via .lemma_ (used by lemma() below).
nlp = spacy.load("es_core_news_sm")
nlp.add_pipe(lemmatize, after="tagger")
def remove_accents(word):
    """Return *word* transliterated to its closest plain-ASCII form."""
    ascii_form = unidecode.unidecode(word)
    return ascii_form
def lemma(word):
    """Return the lemma of the first token of *word*, per the spaCy pipeline."""
    doc = nlp(word)
    return doc[0].lemma_
def likelihood(rank):
    """Approximate Zipf-law probability of the word at position *rank*.

    Uses the classic 1 / (rank * ln(1.78 * R)) estimate, where R is the
    module-level size of the frequency-ranked vocabulary.
    """
    normalizer = math.log(1.78 * R)
    return 1 / (rank * normalizer)
# IDF-style rarity weighting; background:
# https://datascience.stackexchange.com/questions/25725/idf-values-of-english-words
def weight(word):
    """Return an importance weight for *word*: the inverse of its Zipf likelihood.

    Words absent from the rank table are treated as rank R + 1, i.e. rarer
    than every known word. Relies on module-level `word2rank` and `R`.
    """
    # Single dict lookup with a default instead of `in` + subscript.
    rank = word2rank.get(word, R + 1)
    return 1 / likelihood(rank)
def tokenize(text):
    """Lower-case *text* and return its purely-alphabetic word tokens.

    Returns a list rather than the lazy `filter` iterator the original
    produced, so callers can safely iterate the result more than once.
    """
    words = word_tokenize(text.lower())
    return [w for w in words if w.isalpha()]
def words_distribution(tokens):
    """Count token frequencies, merging counts under each token's lemma.

    Returns a dict mapping lemma -> total frequency of all surface forms
    that share that lemma.
    """
    print("Computing words distribution")
    # FreqDist counts hashable items directly; wrapping tokens in
    # ngrams(tokens, 1) only produced 1-tuples that had to be unpacked.
    counts = FreqDist(tokens)
    counts_lemmatized = defaultdict(int)
    for word, freq in tqdm(counts.items()):
        counts_lemmatized[lemma(word)] += freq
    return counts_lemmatized
def weight_by_importance(words_distribution):
    """Scale each word's frequency by its rarity weight.

    Takes a word -> frequency mapping and returns a word -> score dict,
    where score = frequency * weight(word).
    """
    print("Computing words importance")
    return {w: f * weight(w) for w, f in tqdm(words_distribution.items())}
# --- Script entry point ---------------------------------------------------
# Usage: python palabras_comunes.py <text_file>
text_file = sys.argv[1]
with open(text_file, 'r') as f:
    text = f.read()

# Frequency-ranked Spanish word list, one word per line, most common first:
# https://github.com/mazyvan/most-common-spanish-words/blob/master/most-common-spanish-words-v5.txt
with open("most-common-spanish-words.txt", 'r') as f:
    lines = f.readlines()

rank = [w.strip() for w in lines]
word2rank = {w.strip(): (i + 1) for i, w in enumerate(lines)}
R = len(rank)  # vocabulary size; read by likelihood()/weight()

tokens = tokenize(text)
words_by_importance = weight_by_importance(words_distribution(tokens))
# Highest score first.
words_by_importance_sorted = sorted(words_by_importance.items(), key=lambda x: -x[1])

# Opening in 'w' mode truncates any previous run's output; no need to
# check for and delete the old file before appending.
with open("output.txt", 'w') as f:
    for word, score in words_by_importance_sorted:
        f.write(f"{word},{round(score)}\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
python palabras_comunes.py diario_intimo.txt