Skip to content

Instantly share code, notes, and snippets.

@himlohiya
Created June 28, 2018 18:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save himlohiya/eaebb643eb6feb3b6d86368f0d37b5c7 to your computer and use it in GitHub Desktop.
Save himlohiya/eaebb643eb6feb3b6d86368f0d37b5c7 to your computer and use it in GitHub Desktop.
# Inshorts category pages to scrape. The final path segment of each URL
# (e.g. "technology") is what build_dataset records as the news category.
seed_urls = [
    "https://inshorts.com/en/read/technology",
    "https://inshorts.com/en/read/sports",
    "https://inshorts.com/en/read/world",
]
def build_dataset(seed_urls, *, timeout=10):
    """Scrape news category pages into a DataFrame of headlines and articles.

    Each URL is downloaded and parsed with BeautifulSoup. Headline cards and
    content cards are paired positionally with ``zip``, and the category label
    is the last ``/``-separated segment of the URL.

    Args:
        seed_urls: Iterable of page URLs to scrape.
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up. Passed straight to ``requests.get``.

    Returns:
        A pandas DataFrame with the columns ``news_headline``,
        ``news_article`` and ``news_category``, in that order. The frame is
        empty (but still has these columns) when no articles were found.

    Raises:
        AttributeError: If a matched card lacks the expected ``itemprop``
            element, since ``find`` then yields ``None``.
        requests.RequestException: On network failure or timeout.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        # Without an explicit timeout, requests waits indefinitely on a
        # server that accepts the connection but never answers.
        data = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(data.content, 'html.parser')
        headline_cards = soup.find_all(
            'div', class_=["news-card-title news-right-box"])
        content_cards = soup.find_all(
            'div', class_=["news-card-content news-right-box"])
        # zip() pairs the i-th headline card with the i-th content card and
        # stops at the shorter list.
        news_articles = [
            {
                'news_headline': headline.find(
                    'span', attrs={"itemprop": "headline"}).string,
                'news_article': article.find(
                    'div', attrs={"itemprop": "articleBody"}).string,
                'news_category': news_category,
            }
            for headline, article in zip(headline_cards, content_cards)
        ]
        news_data.extend(news_articles)
    # Passing columns= fixes the column order and keeps the columns present
    # even when news_data is empty; indexing an empty frame by these names
    # would raise KeyError.
    return pd.DataFrame(news_data, columns=columns)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment