Skip to content

Instantly share code, notes, and snippets.

@Abhayparashar31
Created October 17, 2022 16:07
Show Gist options
Save Abhayparashar31/21ab032b06f52a86674c8e05f8120106 to your computer and use it in GitHub Desktop.
import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import requests
from bs4 import BeautifulSoup
# Fetch the Wikipedia article and gather the text of every paragraph.
url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
res = requests.get(url, timeout=30)  # timeout: don't hang forever on a dead connection
soup = BeautifulSoup(res.text, 'html.parser')

# Query the <p> elements once and build the list in a single pass.
# The original re-ran soup.select('p') on every loop iteration
# (len(...)+1 full DOM queries — accidentally quadratic).
articles = [p.getText().strip() for p in soup.select('p')]

# One flat string of all paragraph text, paragraphs separated by spaces.
raw_data = " ".join(articles)
import re
def clean_data(data):
    """Normalize scraped article text for summarization.

    Removes Wikipedia-style citation markers (e.g. ``[12]``), lowercases
    the text, replaces commas with spaces, and collapses all runs of
    whitespace into single spaces.

    Parameters
    ----------
    data : str
        Raw concatenated paragraph text.

    Returns
    -------
    str
        Cleaned, lowercased, single-spaced text with no leading or
        trailing whitespace.
    """
    text = re.sub(r"\[[0-9]*\]", " ", data)  # strip citation markers like [42]
    text = text.lower()
    text = re.sub(r",", " ", text)  # commas -> spaces *before* collapsing
    # Collapse whitespace last so the spaces introduced above are merged too
    # (the original collapsed first, leaving double spaces around commas).
    text = re.sub(r"\s+", " ", text)
    return text.strip()
cleaned_article_content = clean_data(raw_data)

# Build a sumy document directly from the cleaned string (no file needed).
parser = PlaintextParser.from_string(cleaned_article_content, Tokenizer("english"))
summarizer = LexRankSummarizer()

# Condense the document down to its 2 most central sentences.
summary = summarizer(parser.document, 2)
for sentence in summary:
    # Original had this line unindented (copy-paste mangling) — it must
    # sit inside the loop to print each selected sentence.
    print(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment