Skip to content

Instantly share code, notes, and snippets.

@pshapiro
Created June 6, 2018 22:19
  • Star 19 You must be signed in to star a gist
  • Fork 16 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save pshapiro/fe8b0c9cfd57481dfb8e247aacd06c18 to your computer and use it in GitHub Desktop.
Use Text Summarization Algorithms to Help Aid the Writing of Meta Descriptions
import csv
import os
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.text_rank import TextRankSummarizer as TxtRank
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL
from sumy.summarizers.edmundson import EdmundsonSummarizer as Edmundson
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
LANGUAGE = "english"
SENTENCES_COUNT = 1

# Resolve the input URL list and the output CSV relative to this script's
# own directory, so it works regardless of the current working directory.
base_dir = os.path.dirname(__file__)
urlinput = os.path.join(base_dir, input('Enter input text file: '))
outputcsv = os.path.join(base_dir, input('Enter a filename (minus file extension): ') + '.csv')

# Build every summarizer exactly once. They are loop-invariant, so
# constructing all six per URL (as the original draft did) is wasted work.
stemmer = Stemmer(LANGUAGE)
stop_words = get_stop_words(LANGUAGE)
summarizers = []
for summarizer_cls, label in [
    (Lsa, "LSA"),
    (Luhn, "Luhn"),
    (LexRank, "LexRank"),
    (TxtRank, "TextRank"),
    (SumBasic, "SumBasic"),
    (KL, "KL-Sum"),
]:
    summarizer = summarizer_cls(stemmer)
    summarizer.stop_words = stop_words
    summarizers.append((summarizer, label))

# newline="" is what the csv module documents for writer file handles;
# the context managers guarantee both files are closed even on error.
with open(urlinput, "r") as urls, \
        open(outputcsv, "w+", newline="", encoding="utf-8") as outfile:
    f = csv.writer(outfile)
    f.writerow(["URL", "Copy", "Summarization Algorithm"])
    for line in urls:
        # Strip the trailing newline: passing a raw readline() result to
        # requests raises InvalidSchema ("No connection adapters were
        # found for 'http://...\n'"). Skip blank lines entirely.
        url = line.strip()
        if not url:
            continue
        # Fetch and tokenize the page once, then run each algorithm on
        # the same parsed document.
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        for summarizer, label in summarizers:
            print("Summarizing URL via " + label + ": " + url)
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                print(sentence)
                f.writerow([url, sentence, label])

print("Writing to " + outputcsv + " complete.")
@cyberandy
Copy link

cyberandy commented Jun 28, 2018

@pshapiro I get the following error after entering the input text file (Sumy is working fine).

File "metadesc.py", line 19, in <module> urlinput = os.path.join(os.path.dirname(__file__), input('Enter input text file: ')) File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/posixpath.py", line 68, in join if b.startswith('/'): AttributeError: 'builtin_function_or_method' object has no attribute 'startswith'

I tried with and without the extension - for testing I have a csv with two URLs. Many thanks in advance!

@cyberandy
Copy link

All good - it was a missing module 👍

@oeonurer
Copy link

All good - it was a missing module

Hi.

I also get the following error after entering the input text file (Sumy is working fine).

Help me?

@venrine
Copy link

venrine commented Jan 11, 2019

Is there anything I am missing? I get this error. Thanks in advance.
Traceback (most recent call last):
File "sample.py", line 49, in
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/sumy/parsers/html.py", line 34, in from_url
data = fetch_url(url)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/sumy/utils.py", line 23, in fetch_url
with closing(requests.get(url, headers=_HTTP_HEADERS)) as response:
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/sessions.py", line 640, in send
adapter = self.get_adapter(url=request.url)
File "/Users/venrine/Documents/htmls/env/lib/python3.5/site-packages/requests/sessions.py", line 731, in get_adapter
raise InvalidSchema("No connection adapters were found for '%s'" % url)
requests.exceptions.InvalidSchema: No connection adapters were found for

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment