Skip to content

Instantly share code, notes, and snippets.

@cyberandy
Forked from pshapiro/metadesc.py
Last active August 19, 2023 18:53
Show Gist options
  • Save cyberandy/e290d567157c7e4bf4aac26b2016999a to your computer and use it in GitHub Desktop.
Use sumy summarizer to extract summary from HTML pages that can be used for meta descriptions.
import csv
import os
import requests, sys
import pandas as pd
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Lsa
from sumy.summarizers.luhn import LuhnSummarizer as Luhn
from sumy.summarizers.text_rank import TextRankSummarizer as TxtRank
from sumy.summarizers.lex_rank import LexRankSummarizer as LexRank
from sumy.summarizers.sum_basic import SumBasicSummarizer as SumBasic
from sumy.summarizers.kl import KLSummarizer as KL
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
# Number of sentences each summarizer keeps per page (one sentence makes
# a usable meta-description candidate).
SENTENCES_COUNT = 1

# CLI usage: generate-md.py <input.csv> <output.csv> [language]
#   input.csv  - list of URLs to analyze (first column)
#   output.csv - where the generated meta descriptions are written
#   language   - optional; "english" is used when it is omitted
urlinput = sys.argv[1]
outputcsv = sys.argv[2]
print("csv to analyze: ", urlinput)
print("output csv name: ", outputcsv)
# Check if language has been set
def get_lan():
    """Return the summary language from the CLI, defaulting to English.

    The language is the optional third argument (``sys.argv[3]``); when it
    is not supplied, ``'english'`` is returned.
    """
    # Length check instead of catching IndexError: the argument is either
    # present or absent, so there is nothing exceptional to handle.
    if len(sys.argv) > 3:
        return sys.argv[3]
    return 'english'

LANGUAGE = get_lan()
print("language set to: ", LANGUAGE)
# Open the CSV file with the list of URLs to analyze.
# Only the first column is read later, so the URLs are expected there.
df = pd.read_csv(urlinput)
print("Number of rows in csv", len(df))
# Accumulator for the per-URL results: one dict per URL, one key per
# summarizer, written out as a CSV at the end of the script.
data_x = []
# For each URL in the input CSV run the analysis and store the results in the list
# Build the stemmer and all six summarizers ONCE: they depend only on
# LANGUAGE, so rebuilding them for every URL (as the original did) is
# loop-invariant work. The dict keys double as both the output-CSV column
# names and the labels printed below.
stemmer = Stemmer(LANGUAGE)
summarizers = {
    "LSA": Lsa(stemmer),
    "Luhn": Luhn(stemmer),
    "LexRank": LexRank(stemmer),
    "TextRank": TxtRank(stemmer),
    "SumBasic": SumBasic(stemmer),
    "KL-Sum": KL(stemmer),
}
for summarizer in summarizers.values():
    summarizer.stop_words = get_stop_words(LANGUAGE)

# For each URL in the input CSV run every summarizer and store the results.
for line in df.iloc[:, 0]:
    print(line)
    # Fetch and parse the page. On any fetch/parse failure skip this URL:
    # the original bare `except:` fell through, which raised NameError on
    # the first URL and silently reused the PREVIOUS page's parser on
    # later ones.
    try:
        parser = HtmlParser.from_url(line, Tokenizer(LANGUAGE))
    except Exception:
        print('error while fetching', line)
        continue
    row = {"url": line}
    for name, summarizer in summarizers.items():
        # SENTENCES_COUNT is 1, so the loop leaves exactly one sentence
        # bound; with a larger count only the LAST sentence is kept, which
        # matches the original behavior.
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
        print("Summarizing URL via " + name + ": " + line)
        row[name] = sentence
    # Store this URL's summaries for the final CSV.
    data_x.append(row)
# Save results to the output CSV: one row per summarized URL, one column
# per summarizer (the explicit column list fixes the column order).
df = pd.DataFrame(data_x, columns=["url", "LSA", "Luhn", "LexRank", "TextRank", "SumBasic", "KL-Sum"])
# utf-8 so non-ASCII summaries round-trip; no pandas index column in the file.
df.to_csv(outputcsv, encoding='utf-8', index=False)
print("Results saved on", outputcsv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment