Skip to content

Instantly share code, notes, and snippets.

@kishanpython
Created May 18, 2022 11:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kishanpython/cf3597a24895532907c012cd1e193d45 to your computer and use it in GitHub Desktop.
Save kishanpython/cf3597a24895532907c012cd1e193d45 to your computer and use it in GitHub Desktop.
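# Install BeautifulSoup. `%pip` is a notebook magic (Databricks/IPython), not Python syntax; remove the next line when running this as a plain script.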
%pip install BeautifulSoup4
#import necessary libs
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Inshorts category pages to scrape. The last path segment of each URL is
# used as the article's category label, so every URL must appear only once:
# a repeated entry would be fetched twice and duplicate its rows.
seed_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world',
    'https://inshorts.com/en/read/politics',
    'https://inshorts.com/en/read/entertainment',
    'https://inshorts.com/en/read/automobile',
    'https://inshorts.com/en/read/science',
]
# Storage list: one dict per scraped article, accumulated across all categories.
news_data = []

# Upper bound (in seconds) on each HTTP request, so that a single stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT_SECONDS = 10


def _tag_text(tag):
    """Return ``tag.string`` as a plain ``str``, or ``None`` if unavailable.

    ``None`` is returned when ``tag`` itself is ``None`` (the lookup found
    nothing) or when ``tag.string`` is ``None``, instead of raising
    ``AttributeError``.
    """
    if tag is None or tag.string is None:
        return None
    # Store a plain str rather than the parser's own string object.
    return str(tag.string)


# Scraping data: fetch each category page and collect its news cards.
for url in seed_urls:
    # The category label is the last path segment of the URL.
    news_category = url.rstrip('/').split('/')[-1]
    try:
        data = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Do not parse an HTTP error page as if it were a news listing.
        data.raise_for_status()
    except requests.RequestException as err:
        # One failing category should not discard what was already scraped.
        print(f"Skipping {url}: {err}")
        continue

    soup = BeautifulSoup(data.content, 'html.parser')
    headlines = soup.find_all('div', class_=["news-card-title news-right-box"])
    articles = soup.find_all('div', class_=["news-card-content news-right-box"])

    # Title boxes and content boxes are paired by position on the page.
    news_data.extend(
        {
            'news_headline': _tag_text(
                headline.find('span', attrs={"itemprop": "headline"})),
            'news_article': _tag_text(
                article.find('div', attrs={"itemprop": "articleBody"})),
            'news_category': news_category,
        }
        for headline, article in zip(headlines, articles)
    )
# Creating the dataframe from the list of per-article records.
# Columns are named explicitly so the frame keeps the same three columns
# even when nothing was scraped (an empty list alone gives no columns).
inshorts_df = pd.DataFrame(
    news_data,
    columns=['news_headline', 'news_article', 'news_category'],
)
# Displaying the first rows (rendered as the notebook cell's output).
inshorts_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment