@dualyticalchemy
Last active April 2, 2022 15:30
Text summarization in Python with NLTK. Install the requirements with `pip install -r requirements.txt` (use a virtualenv if you don't want to clutter up your global Python packages directory). Then download the 'punkt' and 'stopwords' data: run `$ python` on the command line, `import nltk`, and call `nltk.download('punkt')` and `nltk.download('stopwords')`.
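For reference, a sketch of those setup steps as one possible shell session (the environment name `venv` is just an example):

$ python -m venv venv            # or: virtualenv venv
$ source venv/bin/activate
$ pip install -r requirements.txt
$ python
>>> import nltk
>>> nltk.download('punkt')
>>> nltk.download('stopwords')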
#!/usr/bin/env python
'''
@description text summarization with python
'''
import bs4 as bs
import urllib.request
import re
import nltk
import heapq
import sys


def help_menu():
    print('''USAGE:
    python __init__.py [FILE_PATH] [SUMMARY_LENGTH]
    python __init__.py ./some_file.txt 1
    python __init__.py 'https://en.wikipedia.org/wiki/Brute_fact' 5''')


if __name__ == "__main__":
    # Input is either a local file path or a URL; the second argument is
    # the number of sentences to include in the summary.
    input_data = sys.argv[1] if len(sys.argv) > 1 else ''
    summary_length = int(sys.argv[2]) if len(sys.argv) > 2 else 1

    article_text = ""
    if input_data and 'http' not in input_data:
        # Read the article from a local text file.
        with open(input_data, 'r') as opened_file:
            article_text = [line for line in opened_file]
    elif 'http' in input_data:
        # Fetch the page and keep the text of every <p> element.
        scraped_data = urllib.request.urlopen(input_data)
        article = scraped_data.read()
        parsed_article = bs.BeautifulSoup(article, 'lxml')
        paragraphs = parsed_article.find_all('p')
        article_text = [paragraph.text for paragraph in paragraphs]
    else:
        help_menu()
        sys.exit(0)

    # Clean the raw text: strip citation markers like [12] and collapse whitespace.
    article_text = "".join(article_text)
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
    article_text = re.sub(r'\s+', ' ', article_text)

    # A lower-cased, letters-only copy is used for word-frequency counting;
    # the original text is kept for sentence tokenization.
    formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text).lower()
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

    sentence_list = nltk.sent_tokenize(article_text)
    stopwords = nltk.corpus.stopwords.words('english')

    # Count how often each non-stopword occurs.
    word_frequencies = {}
    for word in nltk.word_tokenize(formatted_article_text):
        if word not in stopwords:
            if word not in word_frequencies:
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    # Normalize counts to weighted frequencies in (0, 1].
    maximum_freq = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] = word_frequencies[word] / maximum_freq

    # Score each sentence shorter than 30 words by summing the weighted
    # frequencies of the words it contains.
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    # The highest-scoring sentences form the summary.
    summary_sentences = heapq.nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    print(summary)
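As a quick sanity check, here is a minimal sketch of the same weighted-frequency scoring applied to a made-up three-sentence text (it assumes the 'punkt' and 'stopwords' data are already downloaded; the sample text and variable names are illustrative only):

import heapq
import re

import nltk

text = ("NLTK provides tokenizers, stemmers, and corpora for natural language work. "
        "Extractive summarization scores sentences by the frequencies of their words. "
        "Short filler here.")

# Build normalized word frequencies from a lower-cased, letters-only copy.
stop = set(nltk.corpus.stopwords.words('english'))
freq = {}
for w in nltk.word_tokenize(re.sub('[^a-zA-Z]', ' ', text).lower()):
    if w not in stop:
        freq[w] = freq.get(w, 0) + 1
peak = max(freq.values())
freq = {w: f / peak for w, f in freq.items()}

# Score each sentence by the summed weights of its words.
scores = {}
for sent in nltk.sent_tokenize(text):
    for w in nltk.word_tokenize(sent.lower()):
        if w in freq and len(sent.split(' ')) < 30:
            scores[sent] = scores.get(sent, 0) + freq[w]

# Print the single best-scoring sentence as a one-line "summary".
print(heapq.nlargest(1, scores, key=scores.get)[0])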
requirements.txt
appdirs==1.4.4
appnope==0.1.0
attrs==19.3.0
backcall==0.1.0
beautifulsoup4==4.9.1
bleach==3.1.5
click==7.1.2
decorator==4.4.2
defusedxml==0.6.0
distlib==0.3.0
entrypoints==0.3
filelock==3.0.12
ipykernel==5.3.0
ipython==7.14.0
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.0
Jinja2==2.11.2
joblib==0.15.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.3
jupyter-console==6.1.0
jupyter-core==4.6.3
lxml==4.5.1
MarkupSafe==1.1.1
mistune==0.8.4
nbconvert==5.6.1
nbformat==5.0.6
nltk==3.5
notebook==6.0.3
numpy==1.18.4
packaging==20.4
pandocfilters==1.4.2
parso==0.7.0
pexpect==4.8.0
pickleshare==0.7.5
prometheus-client==0.8.0
prompt-toolkit==3.0.5
ptyprocess==0.6.0
Pygments==2.6.1
pyparsing==2.4.7
pyrsistent==0.16.0
python-dateutil==2.8.1
pyzmq==19.0.1
qtconsole==4.7.4
QtPy==1.9.0
regex==2020.6.8
Send2Trash==1.5.0
six==1.15.0
soupsieve==2.0.1
terminado==0.8.3
testpath==0.4.4
tornado==6.0.4
tqdm==4.46.1
traitlets==4.3.3
virtualenv==20.0.23
wcwidth==0.1.9
webencodings==0.5.1
widgetsnbextension==3.5.1