Last active
April 2, 2022 15:30
-
-
Save dualyticalchemy/3426a9b6a0ebbc0c0b51daa4c2d6e87a to your computer and use it in GitHub Desktop.
Text summarization in Python with NLTK. Before running, download the 'punkt' and 'stopwords' resources: start `python` in a terminal, then run `import nltk; nltk.download('punkt'); nltk.download('stopwords')`. Install the dependencies with `pip install -r requirements.txt`, preferably inside a virtualenv so you don't clutter your global Python packages directory.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3.8.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' | |
@description text summarization with python | |
''' | |
import bs4 as bs | |
import urllib.request | |
import re | |
import nltk | |
import heapq | |
import sys | |
def help_menu():
    """Print the command-line usage text for this script to stdout."""
    usage_text = '''USAGE:
python __init__.py [FILE_PATH] [SUMMARY_LENGTH]
python __init__.py ./some_file.txt 1
python __init__.py 'https://en.wikipedia.org/wiki/Brute_fact' 5'''
    print(usage_text)
# ---- Input selection: load article text from a URL, a local file, or show help.
# First CLI argument: a file path or an http(s) URL.
input_data = sys.argv[1] if len(sys.argv) > 1 else ''
# Second CLI argument: number of sentences in the summary (default 1).
summary_length = int(sys.argv[2]) if len(sys.argv) > 2 else 1
article_text = ""
if input_data.startswith(('http://', 'https://')):
    # Remote URL: download the page and collect the text of every <p> tag.
    # (Checking the scheme prefix, not `'http' in input_data`, so a local
    # filename that merely contains "http" is not misrouted here.)
    scraped_data = urllib.request.urlopen(input_data)
    article = scraped_data.read()
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    paragraphs = parsed_article.find_all('p')
    article_text = [paragraph.text for paragraph in paragraphs]
elif input_data:
    # Local file: read all lines. `with` guarantees the handle is closed
    # (the original leaked it), and read-only mode is all we need.
    with open(input_data, 'r') as opened_file:
        article_text = [line for line in opened_file]
else:
    # No input given: print usage and stop.
    help_menu()
    sys.exit(0)
if __name__ == "__main__":
    # Normalise the collected text: flatten to one string, strip citation
    # markers like "[12]", and collapse runs of whitespace.
    article_text = "".join(article_text)
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
    article_text = re.sub(r'\s+', ' ', article_text)

    # Letters-only, lower-cased copy used purely for word frequencies.
    # Lower-casing is essential: the scoring loop below tokenises
    # sent.lower(), so case-preserved frequency keys would never match,
    # and capitalized stopwords ("The", "And") would slip past the filter.
    formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text).lower()
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

    sentence_list = nltk.sent_tokenize(article_text)
    stopwords = nltk.corpus.stopwords.words('english')

    # Frequency table of every non-stopword token.
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_article_text):
        if word not in stopwords:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # Empty or stopword-only input: nothing to summarise.
        # (The original crashed here: max() on an empty sequence.)
        print('')
        sys.exit(0)

    # Normalise frequencies to [0, 1] relative to the most common word.
    maximum_freq = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_freq

    # Score each reasonably short sentence (< 30 words) by summing the
    # normalised frequencies of the known words it contains.
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies and len(sent.split(' ')) < 30:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

    # Emit the top-scoring sentences as the summary.
    summary_sentences = heapq.nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    print(summary)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
appdirs==1.4.4
appnope==0.1.0
attrs==19.3.0
backcall==0.1.0
beautifulsoup4==4.9.1
bleach==3.1.5
click==7.1.2
decorator==4.4.2
defusedxml==0.6.0
distlib==0.3.0
entrypoints==0.3
filelock==3.0.12
ipykernel==5.3.0
ipython==7.14.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.0
Jinja2==2.11.2
joblib==0.15.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.3
jupyter-console==6.1.0
jupyter-core==4.6.3
lxml==4.5.1
MarkupSafe==1.1.1
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.6
nltk==3.5
notebook==6.0.3
numpy==1.18.4
packaging==20.4
pandocfilters==1.4.2
parso==0.7.0
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.8.0
prompt-toolkit==3.0.5
ptyprocess==0.6.0
Pygments==2.6.1
pyparsing==2.4.7
pyrsistent==0.16.0
python-dateutil==2.8.1
pyzmq==19.0.1
qtconsole==4.7.4
QtPy==1.9.0
regex==2020.6.8
Send2Trash==1.5.0
six==1.15.0
soupsieve==2.0.1
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
tqdm==4.46.1
traitlets==4.3.3
virtualenv==20.0.23
wcwidth==0.1.9
webencodings==0.5.1
widgetsnbextension==3.5.1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment