Created
June 2, 2020 20:56
-
-
Save mathigatti/8209e7d220ea918fa47b4fd00eebf310 to your computer and use it in GitHub Desktop.
Sort the words in a text file by their relevance.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from es_lemmatizer import lemmatize | |
import spacy | |
from nltk.tokenize import word_tokenize | |
from nltk import ngrams, FreqDist | |
from tqdm import tqdm | |
from collections import defaultdict | |
import unidecode | |
import sys | |
import os | |
# Load the small Spanish spaCy pipeline and attach the es_lemmatizer
# component right after the POS tagger, so nlp(...) tokens expose
# Spanish lemmas via .lemma_ (used by lemma() below).
nlp = spacy.load("es_core_news_sm")
nlp.add_pipe(lemmatize, after="tagger")
def remove_accents(word):
    """Return *word* transliterated to its closest plain-ASCII form."""
    ascii_form = unidecode.unidecode(word)
    return ascii_form
def lemma(word):
    """Return the lemma of the first token of *word*, per the spaCy pipeline."""
    doc = nlp(word)
    return doc[0].lemma_
def likelihood(rank):
    """Approximate Zipf-law probability of the word at position *rank*.

    Uses the classic 1 / (rank * ln(1.78 * R)) estimate, where R is the
    module-level size of the frequency-ranked vocabulary.
    """
    normalizer = math.log(1.78 * R)
    return 1 / (rank * normalizer)
# IDF-style rarity weighting; background:
# https://datascience.stackexchange.com/questions/25725/idf-values-of-english-words
def weight(word):
    """Return an importance weight for *word*: the inverse of its Zipf likelihood.

    Words absent from the rank table are treated as rank R + 1, i.e. rarer
    than every known word. Relies on module-level `word2rank` and `R`.
    """
    # Single dict lookup with a default instead of `in` + subscript.
    rank = word2rank.get(word, R + 1)
    return 1 / likelihood(rank)
def tokenize(text):
    """Lower-case *text* and return its purely-alphabetic word tokens.

    Returns a list rather than the lazy `filter` iterator the original
    produced, so callers can safely iterate the result more than once.
    """
    words = word_tokenize(text.lower())
    return [w for w in words if w.isalpha()]
def words_distribution(tokens):
    """Count token frequencies, merging counts under each token's lemma.

    Returns a dict mapping lemma -> total frequency of all surface forms
    that share that lemma.
    """
    print("Computing words distribution")
    # FreqDist counts hashable items directly; wrapping tokens in
    # ngrams(tokens, 1) only produced 1-tuples that had to be unpacked.
    counts = FreqDist(tokens)
    counts_lemmatized = defaultdict(int)
    for word, freq in tqdm(counts.items()):
        counts_lemmatized[lemma(word)] += freq
    return counts_lemmatized
def weight_by_importance(words_distribution):
    """Scale each word's frequency by its rarity weight.

    Takes a word -> frequency mapping and returns a word -> score dict,
    where score = frequency * weight(word).
    """
    print("Computing words importance")
    return {w: f * weight(w) for w, f in tqdm(words_distribution.items())}
# --- Script entry point ---------------------------------------------------
# Usage: python palabras_comunes.py <text_file>
text_file = sys.argv[1]
with open(text_file, 'r') as f:
    text = f.read()

# Frequency-ranked Spanish word list, one word per line, most common first:
# https://github.com/mazyvan/most-common-spanish-words/blob/master/most-common-spanish-words-v5.txt
with open("most-common-spanish-words.txt", 'r') as f:
    lines = f.readlines()

rank = [w.strip() for w in lines]
word2rank = {w.strip(): (i + 1) for i, w in enumerate(lines)}
R = len(rank)  # vocabulary size; read by likelihood()/weight()

tokens = tokenize(text)
words_by_importance = weight_by_importance(words_distribution(tokens))
# Highest score first.
words_by_importance_sorted = sorted(words_by_importance.items(), key=lambda x: -x[1])

# Opening in 'w' mode truncates any previous run's output; no need to
# check for and delete the old file before appending.
with open("output.txt", 'w') as f:
    for word, score in words_by_importance_sorted:
        f.write(f"{word},{round(score)}\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
python palabras_comunes.py diario_intimo.txt