# mmmayo13/better_text_summarization.py

## better_text_summarization.py
from collections import Counter
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words
import spacy

def count_words(tokens):
    """Count how often each content-bearing token occurs.

    A token is skipped when it is in ``stop_words``, when it occurs inside
    ``string.punctuation`` (single punctuation characters and the empty
    string both qualify), or when it is a bare newline. Matching is
    case-sensitive, so callers wanting case-insensitive counts should
    lower-case the tokens before calling.

    Args:
        tokens: Iterable of token strings.

    Returns:
        dict mapping each retained token to its number of occurrences.
    """
    word_counts = {}
    for token in tokens:
        # Compare the newline by value: `is not '\n'` tested object identity,
        # which depends on interpreter string caching and raises a
        # SyntaxWarning on Python 3.8+.
        if token in stop_words or token in punctuation or token == '\n':
            continue
        word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts

def word_freq_distribution(word_counts):
    """Normalise raw word counts by the largest count.

    Args:
        word_counts: Mapping of word -> occurrence count.

    Returns:
        dict mapping each word to ``count / max_count``, so the most
        frequent word maps to 1.0. An empty mapping yields an empty dict.
    """
    # max() raises ValueError on an empty sequence; there is nothing to
    # normalise in that case.
    if not word_counts:
        return {}
    max_freq = max(word_counts.values())
    return {word: count / max_freq for word, count in word_counts.items()}

def score_sentences(sents, freq_dist, max_len=40):
    """Score each sentence by summing the frequencies of the words it contains.

    Args:
        sents: Iterable of hashable sentence objects exposing a ``.text``
            string.
        freq_dist: Mapping of word -> frequency; words are looked up in
            lower case.
        max_len: Sentences with this many or more space-separated words
            are ignored.

    Returns:
        dict mapping sentence -> summed frequency. Sentences that are too
        long, or that contain no word found in ``freq_dist``, are absent.
    """
    sent_scores = {}
    for sent in sents:
        words = sent.text.split(' ')
        # The length filter does not depend on the individual word, so test
        # it once per sentence rather than once per word.
        if len(words) >= max_len:
            continue
        for word in words:
            key = word.lower()
            if key not in freq_dist:
                # Splitting on spaces leaves punctuation glued to words
                # ("end." / "(word"), so retry with it stripped off.
                key = key.strip(punctuation)
                if key not in freq_dist:
                    continue
            sent_scores[sent] = sent_scores.get(sent, 0) + freq_dist[key]
    return sent_scores

def summarize(sent_scores, k):
    """Build a summary from the ``k`` highest-scoring sentences.

    Args:
        sent_scores: Mapping of sentence object (exposing ``.text``) -> score.
        k: Number of sentences to keep; passed to ``Counter.most_common``.

    Returns:
        Tuple ``(summary, scores)``. ``summary`` is the text of the selected
        sentences in descending score order, separated by single spaces.
        ``scores`` is a list of ``(score, sentence)`` tuples in that order.
    """
    top = Counter(sent_scores).most_common(k)
    # Join with a space: concatenating the texts directly ran consecutive
    # sentences together ("...end.Next...").
    summary = ' '.join(sent.text for sent, _ in top)
    scores = [(score, sent) for sent, score in top]
    return summary, scores

def main(text, k=3):
    """Summarize ``text`` and print the summary followed by its sentence scores.

    Args:
        text: Raw document text to summarize.
        k: Number of top-scoring sentences to include in the summary.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    # Tokenize with spaCy. Tokens are lower-cased because score_sentences
    # looks words up in lower case; mixed-case tokens would never match.
    tokens = [token.text.lower() for token in doc]
    sents = list(doc.sents)

    # Get word counts
    word_counts = count_words(tokens)

    # Get word frequency distribution
    freq_dist = word_freq_distribution(word_counts)

    # Score sentences
    sent_scores = score_sentences(sents, freq_dist)

    # Summarize text
    summary, summary_sent_scores = summarize(sent_scores, k)
    print(summary)

    # Print summary sentence scores
    print(summary_sent_scores)


if __name__ == "__main__":
    # The document text is read from the file named on the command line,
    # or from stdin when no path is given.
    import sys

    if len(sys.argv) > 1:
        with open(sys.argv[1], encoding="utf-8") as source:
            main(source.read())
    else:
        main(sys.stdin.read())
	from collections import Counter
	from string import punctuation
	from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words
	import spacy

	# NOTE(review): a definition with the same name appears earlier in this
	# file -- consider removing one of the two copies.
	def count_words(tokens):
	    """Count how often each content-bearing token occurs.

	    A token is skipped when it is in ``stop_words``, when it occurs inside
	    ``string.punctuation`` (single punctuation characters and the empty
	    string both qualify), or when it is a bare newline. Matching is
	    case-sensitive, so callers wanting case-insensitive counts should
	    lower-case the tokens before calling.

	    Args:
	        tokens: Iterable of token strings.

	    Returns:
	        dict mapping each retained token to its number of occurrences.
	    """
	    word_counts = {}
	    for token in tokens:
	        # Compare the newline by value: `is not '\n'` tested object
	        # identity, which depends on interpreter string caching and
	        # raises a SyntaxWarning on Python 3.8+.
	        if token in stop_words or token in punctuation or token == '\n':
	            continue
	        word_counts[token] = word_counts.get(token, 0) + 1
	    return word_counts

	# NOTE(review): a definition with the same name appears earlier in this
	# file -- consider removing one of the two copies.
	def word_freq_distribution(word_counts):
	    """Normalise raw word counts by the largest count.

	    Args:
	        word_counts: Mapping of word -> occurrence count.

	    Returns:
	        dict mapping each word to ``count / max_count``, so the most
	        frequent word maps to 1.0. An empty mapping yields an empty dict.
	    """
	    # max() raises ValueError on an empty sequence; there is nothing to
	    # normalise in that case.
	    if not word_counts:
	        return {}
	    max_freq = max(word_counts.values())
	    return {word: count / max_freq for word, count in word_counts.items()}

	# NOTE(review): a definition with the same name appears earlier in this
	# file -- consider removing one of the two copies.
	def score_sentences(sents, freq_dist, max_len=40):
	    """Score each sentence by summing the frequencies of the words it contains.

	    Args:
	        sents: Iterable of hashable sentence objects exposing a ``.text``
	            string.
	        freq_dist: Mapping of word -> frequency; words are looked up in
	            lower case.
	        max_len: Sentences with this many or more space-separated words
	            are ignored.

	    Returns:
	        dict mapping sentence -> summed frequency. Sentences that are too
	        long, or that contain no word found in ``freq_dist``, are absent.
	    """
	    sent_scores = {}
	    for sent in sents:
	        words = sent.text.split(' ')
	        # The length filter does not depend on the individual word, so
	        # test it once per sentence rather than once per word.
	        if len(words) >= max_len:
	            continue
	        for word in words:
	            key = word.lower()
	            if key not in freq_dist:
	                # Splitting on spaces leaves punctuation glued to words
	                # ("end." / "(word"), so retry with it stripped off.
	                key = key.strip(punctuation)
	                if key not in freq_dist:
	                    continue
	            sent_scores[sent] = sent_scores.get(sent, 0) + freq_dist[key]
	    return sent_scores

	# NOTE(review): a definition with the same name appears earlier in this
	# file -- consider removing one of the two copies.
	def summarize(sent_scores, k):
	    """Build a summary from the ``k`` highest-scoring sentences.

	    Args:
	        sent_scores: Mapping of sentence object (exposing ``.text``) -> score.
	        k: Number of sentences to keep; passed to ``Counter.most_common``.

	    Returns:
	        Tuple ``(summary, scores)``. ``summary`` is the text of the
	        selected sentences in descending score order, separated by single
	        spaces. ``scores`` is a list of ``(score, sentence)`` tuples in
	        that order.
	    """
	    top = Counter(sent_scores).most_common(k)
	    # Join with a space: concatenating the texts directly ran consecutive
	    # sentences together ("...end.Next...").
	    summary = ' '.join(sent.text for sent, _ in top)
	    scores = [(score, sent) for sent, score in top]
	    return summary, scores

	# NOTE(review): the same pipeline appears earlier in this file --
	# consider removing one of the two copies.
	def main(text, k=3):
	    """Summarize ``text`` and print the summary followed by its sentence scores.

	    Args:
	        text: Raw document text to summarize.
	        k: Number of top-scoring sentences to include in the summary.
	    """
	    nlp = spacy.load("en_core_web_sm")
	    doc = nlp(text)

	    # Tokenize with spaCy. Tokens are lower-cased because score_sentences
	    # looks words up in lower case; mixed-case tokens would never match.
	    tokens = [token.text.lower() for token in doc]
	    sents = list(doc.sents)

	    # Get word counts
	    word_counts = count_words(tokens)

	    # Get word frequency distribution
	    freq_dist = word_freq_distribution(word_counts)

	    # Score sentences
	    sent_scores = score_sentences(sents, freq_dist)

	    # Summarize text
	    summary, summary_sent_scores = summarize(sent_scores, k)
	    print(summary)

	    # Print summary sentence scores
	    print(summary_sent_scores)


	if __name__ == "__main__":
	    # The document text is read from the file named on the command line,
	    # or from stdin when no path is given.
	    import sys

	    if len(sys.argv) > 1:
	        with open(sys.argv[1], encoding="utf-8") as source:
	            main(source.read())
	    else:
	        main(sys.stdin.read())