# singerng/kindle.py

## kindle.py
from bs4 import BeautifulSoup
from nltk.stem import SnowballStemmer
from nltk.corpus import cess_esp as corpus
from nltk import FreqDist
import csv

# Build a vocabulary list from a Kindle notebook export:
#   1. read every highlighted word from "kindle.html",
#   2. stem each word with the Spanish Snowball stemmer,
#   3. count how often each stem occurs in the cess_esp reference corpus,
#   4. write "word, stem, corpus frequency" rows to "kindle_words.csv".

# Deletes the punctuation that may trail a highlighted word.
_PUNCTUATION = str.maketrans("", "", ",.;")

words = []

# Open in binary mode so Beautiful Soup detects the document encoding itself
# instead of relying on the platform default, which can mangle accented
# characters.
with open("kindle.html", "rb") as f:
	soup = BeautifulSoup(f, 'html.parser')

for hit in soup.find_all('div', {'class': 'noteText'}):
	# Only the first child node of the highlight container is used. Skip
	# containers that are empty or whose first child is a tag, not text.
	first = hit.contents[0] if hit.contents else None
	if not isinstance(first, str):
		continue

	# Strip surrounding whitespace so it neither reaches the stemmer nor
	# ends up in the CSV; drop highlights that contain nothing else.
	word = first.translate(_PUNCTUATION).strip()
	if word:
		words.append(word)

print("Total words: {}".format(len(words)))

stemmer = SnowballStemmer('spanish')

# Maps each stem to the last highlighted word that produced it. Insertion
# order follows the first appearance of each stem in the notebook.
stems_to_words = {}

for word in words:
	stems_to_words[stemmer.stem(word)] = word

stems = set(stems_to_words)

print("Unique stems: {}".format(len(stems)))

# Stem the whole reference corpus so frequencies are comparable by stem.
corpus_stems = [stemmer.stem(token) for token in corpus.words()]

print("Unique stems (reference): {}".format(len(set(corpus_stems))))

fd = FreqDist(corpus_stems)

# newline='' is what the csv module expects; UTF-8 keeps accented characters
# intact regardless of the platform default encoding.
with open("kindle_words.csv", 'w', newline='', encoding="utf-8") as outfile:
	outwriter = csv.writer(outfile, delimiter=',')

	# Iterate the dict rather than the set so row order is the same on
	# every run.
	for stem, word in stems_to_words.items():
		# Stems absent from the reference corpus get an empty frequency cell.
		outwriter.writerow([word, stem, fd[stem] if stem in fd else ""])