Skip to content

Instantly share code, notes, and snippets.

@mmmayo13
Last active September 14, 2021 13:55
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mmmayo13/86b6ce75a3acc6f8ba2ddadc0f7fecb2 to your computer and use it in GitHub Desktop.
Save mmmayo13/86b6ce75a3acc6f8ba2ddadc0f7fecb2 to your computer and use it in GitHub Desktop.
from collections import Counter
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words
import spacy
def count_words(tokens, stopwords=None):
    """Count how often each meaningful token occurs.

    A token is ignored when it is a stop word, when ``token in punctuation``
    holds (a substring test against ``string.punctuation``), or when it is a
    bare newline. Tokens are compared exactly as given; no case folding is
    applied.

    Args:
        tokens: Iterable of token strings.
        stopwords: Optional container of words to ignore. When ``None`` the
            module-level ``stop_words`` collection is used.

    Returns:
        A dict mapping each kept token to its number of occurrences, in
        first-seen order.
    """
    if stopwords is None:
        stopwords = stop_words
    word_counts = {}
    for token in tokens:
        # Compare the newline by value: identity checks against a string
        # literal depend on interpreter interning and raise a SyntaxWarning.
        if token in stopwords or token in punctuation or token == '\n':
            continue
        word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts
def word_freq_distribution(word_counts):
    """Scale raw word counts relative to the most frequent word.

    Args:
        word_counts: Mapping of word to its occurrence count.

    Returns:
        A dict mapping each word to ``count / max_count``, so the most
        frequent word scores 1.0. An empty input yields an empty dict
        instead of raising ``ValueError`` from ``max()``.
    """
    if not word_counts:
        return {}
    max_freq = max(word_counts.values())
    return {word: count / max_freq for word, count in word_counts.items()}
def score_sentences(sents, freq_dist, max_len=40):
    """Score sentences by summing the frequencies of the words they contain.

    Each sentence's ``.text`` is split on single spaces; every piece whose
    lowercase form is a key of ``freq_dist`` adds that frequency to the
    sentence's score. Pieces are not stripped of attached punctuation.

    Args:
        sents: Iterable of hashable sentence objects exposing a ``.text``
            string attribute.
        freq_dist: Mapping of lowercase word to its relative frequency.
        max_len: Sentences with this many space-separated pieces or more are
            skipped entirely.

    Returns:
        A dict mapping sentence object to its accumulated score. Sentences
        that are too long or contain no scored word are absent.
    """
    sent_scores = {}
    for sent in sents:
        words = sent.text.split(' ')
        # The length limit does not depend on the word, so test it once per
        # sentence rather than once per word.
        if len(words) >= max_len:
            continue
        for word in words:
            key = word.lower()
            if key in freq_dist:
                sent_scores[sent] = sent_scores.get(sent, 0) + freq_dist[key]
    return sent_scores
def summarize(sent_scores, k):
    """Build a summary from the ``k`` highest-scoring sentences.

    Args:
        sent_scores: Mapping of sentence object (exposing ``.text``) to its
            numeric score.
        k: Number of top sentences to keep.

    Returns:
        A ``(summary, scores)`` tuple. ``summary`` is the text of the chosen
        sentences in descending score order; ``scores`` is a list of
        ``(score, sentence)`` pairs in the same order.
    """
    top = Counter(sent_scores).most_common(k)
    # Join with a single space so adjacent sentences do not run together;
    # plain concatenation glued the end of one sentence to the start of the
    # next whenever ``.text`` carried no trailing whitespace.
    summary = ' '.join(sent.text for sent, _ in top)
    scores = [(score, sent) for sent, score in top]
    return summary, scores
# Driver script: parse the input with spaCy, score its sentences by word
# frequency, and print a three-sentence summary along with the scores.
# NOTE(review): `text` is not defined anywhere in the visible source; it must
# be assigned before this point or `nlp(text)` raises NameError -- presumably
# a string holding the document to summarize; confirm against the caller.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
# Tokenize with spaCy
tokens = [token.text for token in doc]
sents = [sentence for sentence in doc.sents]
# Get word counts
word_counts = count_words(tokens)
# Get word frequency distribution
freq_dist = word_freq_distribution(word_counts)
# Score sentences
sent_scores = score_sentences(sents, freq_dist)
# Summarize text (keep the 3 highest-scoring sentences)
summary, summary_sent_scores = summarize(sent_scores, 3)
print(summary)
# Print summary sentence scores
print(summary_sent_scores)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment