@mathigatti (created June 2, 2020)
Sorts the words in a text file by their relevance.
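Note: the script expects spaCy with the es_core_news_sm model, es_lemmatizer, NLTK (word_tokenize needs the punkt tokenizer data), tqdm and unidecode to be installed, plus a frequency-ranked word list saved as most-common-spanish-words.txt in the working directory; the source for that list is linked in the code below.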
import math
import os
import sys
from collections import defaultdict

import spacy
import unidecode
from es_lemmatizer import lemmatize
from nltk import ngrams, FreqDist
from nltk.tokenize import word_tokenize
from tqdm import tqdm

# Load the small Spanish spaCy model and attach the es_lemmatizer
# component so tokens get Spanish lemmas.
nlp = spacy.load("es_core_news_sm")
nlp.add_pipe(lemmatize, after="tagger")
def remove_accents(word):
    return unidecode.unidecode(word)

def lemma(word):
    #word = remove_accents(word)
    return nlp(word)[0].lemma_
# Zipf's law approximation of a word's probability given its frequency
# rank, see:
# https://datascience.stackexchange.com/questions/25725/idf-values-of-english-words
def likelihood(rank):
    return 1/(rank*math.log(1.78*R))

def weight(word):
    # Words missing from the ranked list are treated as rarer than the
    # last ranked word.
    if word in word2rank:
        rank = word2rank[word]
    else:
        rank = R+1
    # The rarer the word, the higher its weight.
    return 1/likelihood(rank)
def tokenize(text):
    # Lowercase the text and keep only purely alphabetic tokens.
    text = text.lower()
    words = word_tokenize(text)
    return [word for word in words if word.isalpha()]
def words_distribution(tokens):
    print("Computing words distribution")
    # FreqDist over unigrams: keys are 1-tuples, hence word[0] below.
    counts = FreqDist(ngrams(tokens, 1))
    # Merge the counts of inflected forms under their common lemma.
    counts_lemmatized = defaultdict(int)
    for word, freq in tqdm(counts.items()):
        word_lemmatized = lemma(word[0])
        counts_lemmatized[word_lemmatized] += freq
    return counts_lemmatized
def weight_by_importance(distribution):
    print("Computing words importance")
    # Importance = how often the word appears in this text, scaled by
    # how rare it is in general Spanish.
    words_by_importance = {}
    for word, freq in tqdm(distribution.items()):
        words_by_importance[word] = freq*weight(word)
    return words_by_importance
text_file = sys.argv[1]
with open(text_file, 'r') as f:
    text = f.read()

# Frequency-ranked list of common Spanish words, taken from
# https://github.com/mazyvan/most-common-spanish-words/blob/master/most-common-spanish-words-v5.txt
with open("most-common-spanish-words.txt", 'r') as f:
    lines = f.readlines()
rank = [w.strip() for w in lines]
word2rank = {w.strip(): (i+1) for i, w in enumerate(lines)}
R = len(rank)

tokens = tokenize(text)
words_by_importance = weight_by_importance(words_distribution(tokens))
words_by_importance_sorted = sorted(words_by_importance.items(), key=lambda x: -x[1])

# Write one "word,score" line per lemma, most important first.
if os.path.exists("output.txt"):
    os.remove("output.txt")
with open("output.txt", 'a') as f:
    for word, score in words_by_importance_sorted:
        f.write(f"{word},{round(score)}\n")
mathigatti commented Jun 2, 2020:
Usage:

python palabras_comunes.py diario_intimo.txt
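Once it finishes, output.txt contains one lemma,score pair per line, highest scores first. A quick sketch for inspecting the top entries, assuming the output.txt produced above:

# Print the ten highest-scoring lemmas from output.txt.
with open("output.txt") as f:
    for line in f.readlines()[:10]:
        word, score = line.strip().split(",")
        print(f"{word}: {score}")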
