Skip to content

Instantly share code, notes, and snippets.

@imShakil
Created August 14, 2020 00:14
Show Gist options
  • Save imShakil/58648d30a3fc8216465126df91f5b705 to your computer and use it in GitHub Desktop.
Save imShakil/58648d30a3fc8216465126df91f5b705 to your computer and use it in GitHub Desktop.
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
def scrape_url(url, headers=None, timeout=30):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
        The document is parsed, pretty-printed, and parsed again, so the
        returned tree reflects the prettified markup. If the request fails
        for any reason (HTTP error status, connection problem, timeout,
        ...), the error is printed and an empty ``BeautifulSoup`` object is
        returned, so callers can always call ``find_all`` on the result.
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string),
        # so a positional argument would never be sent as HTTP headers.
        page = requests.get(url, headers=headers, timeout=timeout)
        page.raise_for_status()
    except HTTPError as e:
        # Non-2xx response reported by ``raise_for_status``.
        print(e)
    except Exception as err:
        # Deliberate best-effort: any other failure is reported, not raised.
        print(err)
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        soup = BeautifulSoup(soup.prettify(), 'html.parser')
        return soup
    # Failure path: hand back an empty document instead of ``None``.
    return BeautifulSoup("", 'html.parser')
if __name__ == '__main__':
    # Print every link on the page whose text contains one of the keywords.
    soup = scrape_url("https://somoynews.tv")
    all_links = soup.find_all('a')
    training_samples = ['করোনা', 'কোভিড-১৯', 'উহান-ভাইরাস']
    # Build the keyword set once instead of on every loop iteration.
    keywords = set(training_samples)
    for link in all_links:
        # Extract the link text a single time; it is used both for the
        # keyword match and for the printed title.
        title = link.get_text().strip()
        # A link matches when at least one whitespace-separated word of
        # its text is exactly equal to one of the keywords.
        if not keywords.isdisjoint(title.split()):
            print("Title: {}, Link: {}".format(title, link.get('href')))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment