# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @Abhayparashar31
# Last active October 17, 2022 16:11
# Show Gist options
# Save Abhayparashar31/b3a2edf416e1f6cdd6547e5f60e8c13a to your computer and use it in GitHub Desktop.
import gensim
import re
from gensim.summarization.summarizer import summarize
import requests
from bs4 import BeautifulSoup
# Download the article and gather the text of every <p> element into one string.
url = 'https://en.wikipedia.org/wiki/Python_(programming_language)'
# A timeout keeps the script from hanging indefinitely on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of going on to summarize an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')
# Run the selector once and iterate its result directly; re-selecting inside
# an index loop re-scans the whole document for every paragraph.
extracted_rows_content = [
    paragraph.getText().strip() for paragraph in soup.select('p')
]
raw_data = " ".join(extracted_rows_content)
import re
def clean_data(data: str) -> str:
    """Normalize scraped article text.

    Steps, in order:
      1. Replace bracketed numeric markers such as ``[12]`` (and empty
         ``[]``) with a space.
      2. Lower-case the text.
      3. Replace commas with spaces.
      4. Collapse every run of whitespace into a single space.

    Commas are removed *before* whitespace is collapsed so that ``"a, b"``
    becomes ``"a b"`` rather than ``"a  b"`` (double space).

    Args:
        data: Raw text to clean.

    Returns:
        The cleaned, lower-cased text. Leading/trailing whitespace is
        collapsed to at most one space but not stripped.
    """
    text = re.sub(r"\[[0-9]*\]", " ", data)
    text = text.lower()
    text = text.replace(",", " ")
    # Collapse last, so spaces introduced by the substitutions above are merged.
    text = re.sub(r"\s+", " ", text)
    return text
# Normalize the scraped text, then summarize it and print the result.
cleaned_article_content = clean_data(raw_data)
# NOTE(review): the gensim.summarization module is not available in
# gensim >= 4.0 -- confirm the installed gensim version is 3.x.
# ratio is the fraction of the original sentences kept in the summary.
summary = summarize(cleaned_article_content, ratio=0.01)
# Drop any remaining bracketed fragments from the summary. The pattern is a
# raw string so the backslashes are not treated as (invalid) string escapes.
summary = re.sub(r'\[[^\]]*\]', '', summary)
print(summary)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment