@dualyticalchemy
Last active April 2, 2022 15:30
Text summarization in Python with NLTK. Install the requirements with `pip install -r requirements.txt` (use a virtualenv if you don't want to clutter up your global Python packages directory). Then download the 'punkt' and 'stopwords' data: run `$ python` on the command line, `import nltk`, and call `nltk.download('punkt')` and `nltk.download('stopwords')`.
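For reference, a sketch of those setup steps as one possible shell session (the environment name `venv` is just an example):

$ python -m venv venv            # or: virtualenv venv
$ source venv/bin/activate
$ pip install -r requirements.txt
$ python
>>> import nltk
>>> nltk.download('punkt')
>>> nltk.download('stopwords')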
#!/usr/bin/env python
'''
@description text summarization with python
'''
import bs4 as bs
import urllib.request
import re
import nltk
import heapq
import sys


def help_menu():
    print('''USAGE:
    python __init__.py [FILE_PATH] [SUMMARY_LENGTH]
    python __init__.py ./some_file.txt 1
    python __init__.py 'https://en.wikipedia.org/wiki/Brute_fact' 5''')


if __name__ == "__main__":
    # Input is either a local file path or a URL; the second argument is
    # the number of sentences to include in the summary.
    input_data = sys.argv[1] if len(sys.argv) > 1 else ''
    summary_length = int(sys.argv[2]) if len(sys.argv) > 2 else 1

    article_text = ""
    if input_data and 'http' not in input_data:
        # Read the article from a local text file.
        with open(input_data, 'r') as opened_file:
            article_text = [line for line in opened_file]
    elif 'http' in input_data:
        # Fetch the page and keep the text of every <p> element.
        scraped_data = urllib.request.urlopen(input_data)
        article = scraped_data.read()
        parsed_article = bs.BeautifulSoup(article, 'lxml')
        paragraphs = parsed_article.find_all('p')
        article_text = [paragraph.text for paragraph in paragraphs]
    else:
        help_menu()
        sys.exit(0)

    # Clean the raw text: strip citation markers like [12] and collapse whitespace.
    article_text = "".join(article_text)
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
    article_text = re.sub(r'\s+', ' ', article_text)

    # A lower-cased, letters-only copy is used for word-frequency counting;
    # the original text is kept for sentence tokenization.
    formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text).lower()
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

    sentence_list = nltk.sent_tokenize(article_text)
    stopwords = nltk.corpus.stopwords.words('english')

    # Count how often each non-stopword occurs.
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_article_text):
        if word not in stopwords:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    # Normalize counts to weighted frequencies in (0, 1].
    maximum_freq = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_freq

    # Score each sentence shorter than 30 words by summing the weighted
    # frequencies of the words it contains.
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    # The highest-scoring sentences form the summary.
    summary_sentences = heapq.nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    print(summary)
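As a quick sanity check, here is a minimal sketch of the same weighted-frequency scoring applied to a made-up three-sentence text (it assumes the 'punkt' and 'stopwords' data are already downloaded; the sample text and variable names are illustrative only):

import heapq
import re

import nltk

text = ("NLTK provides tokenizers, stemmers, and corpora for natural language work. "
        "Extractive summarization scores sentences by the frequencies of their words. "
        "Short filler here.")

# Build normalized word frequencies from a lower-cased, letters-only copy.
stop = set(nltk.corpus.stopwords.words('english'))
freq = {}
for w in nltk.word_tokenize(re.sub('[^a-zA-Z]', ' ', text).lower()):
    if w not in stop:
        freq[w] = freq.get(w, 0) + 1
peak = max(freq.values())
freq = {w: f / peak for w, f in freq.items()}

# Score each sentence by the summed weights of its words.
scores = {}
for sent in nltk.sent_tokenize(text):
    for w in nltk.word_tokenize(sent.lower()):
        if w in freq and len(sent.split(' ')) < 30:
            scores[sent] = scores.get(sent, 0) + freq[w]

# Print the single best-scoring sentence as a one-line "summary".
print(heapq.nlargest(1, scores, key=scores.get)[0])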
requirements.txt
appdirs==1.4.4
appnope==0.1.0
attrs==19.3.0
backcall==0.1.0
beautifulsoup4==4.9.1
bleach==3.1.5
click==7.1.2
decorator==4.4.2
defusedxml==0.6.0
distlib==0.3.0
entrypoints==0.3
filelock==3.0.12
ipykernel==5.3.0
ipython==7.14.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.0
Jinja2==2.11.2
joblib==0.15.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.3
jupyter-console==6.1.0
jupyter-core==4.6.3
lxml==4.5.1
MarkupSafe==1.1.1
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.6
nltk==3.5
notebook==6.0.3
numpy==1.18.4
packaging==20.4
pandocfilters==1.4.2
parso==0.7.0
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.8.0
prompt-toolkit==3.0.5
ptyprocess==0.6.0
Pygments==2.6.1
pyparsing==2.4.7
pyrsistent==0.16.0
python-dateutil==2.8.1
pyzmq==19.0.1
qtconsole==4.7.4
QtPy==1.9.0
regex==2020.6.8
Send2Trash==1.5.0
six==1.15.0
soupsieve==2.0.1
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
tqdm==4.46.1
traitlets==4.3.3
virtualenv==20.0.23
wcwidth==0.1.9
webencodings==0.5.1
widgetsnbextension==3.5.1