Skip to content

Instantly share code, notes, and snippets.

View CharlesRajendran's full-sized avatar
:octocat:
Living

Charles's Labs CharlesRajendran

:octocat:
Living
View GitHub Profile
import pandas as pd
import matplotlib.pyplot as plt
# Load the contents of 'Data.csv' (resolved against the current working directory) into a pandas DataFrame.
data = pd.read_csv('Data.csv')
import bs4
from urllib.request import urlopen
def read_html(url):
    """Download a news article page and return its headline and body text.

    Args:
        url: Address of the article page, handed straight to ``urlopen``.

    Returns:
        A string made of the headline (the first ``<h1>`` with class
        ``nws-dtl-hdln``) followed by the stripped text of every
        ``<section>`` with class ``cb-nws-dtl-itms`` and
        ``itemprop="articleBody"``; each piece ends with a blank line.
        The headline piece is left out when the page has no matching
        ``<h1>``.
    """
    # get the source html; the with-block closes the connection even when
    # read() raises
    with urlopen(url) as response:
        html = response.read()

    # create beautiful soup object and parse the html so we can use bs methods
    bs = bs4.BeautifulSoup(html, 'html.parser')

    # content will have heading and the content
    pieces = []
    head_line = bs.find_all('h1', class_='nws-dtl-hdln')
    if head_line:
        # the headline text is kept as-is (not stripped)
        pieces.append(head_line[0].getText())

    # extracting each article-body section and concatenating it into content
    pieces.extend(
        section.getText().strip()
        for section in bs.find_all(
            'section', class_='cb-nws-dtl-itms', itemprop='articleBody'
        )
    )

    # every piece is terminated by a blank line
    return ''.join(piece + '\n\n' for piece in pieces)
# `bs.a` is the first <a> tag in the parsed document; .get('href') reads its href attribute.
# The value is not stored here - the bare expression only displays it in an interactive session.
bs.a.get('href')
# output - https://plus.google.com/104502282508811467249 (the href of the first <a> tag)
# collect every <img> tag whose height attribute equals '30'
img_list = bs.find_all('img', height='30')
'''
output - [<img alt="Cricbuzz Logo" height="30" itemprop="image" src="//i.cricketcb.com/statics/site/images/cbz-logo.png"
style="bottom: -4px; position: relative;" title="Cricbuzz Logo" width="101"/>]
'''
# `class` is a reserved keyword in Python, so it cannot be used as a keyword-argument name;
# to filter elements by their class attribute, Beautiful Soup accepts class_ instead
p_list = bs.find_all('p', class_ = 'cb-nws-para')
# collect every <p> tag in the document, with no attribute filter
p_list = bs.find_all('p')
# output - [<p class="cb-nws-para"> .....
# extract only the text inside the <title> tag, without the markup
# (assumes `title` has already been assigned from bs.title)
title_text = title.getText()
# output - Destiny's child, Raza treats triumphs and failures the same | Cricbuzz.com
# grab the document's <title> element as a tag object (markup included, not just its text)
title = bs.title
# output - <title itemprop="name">Destiny's child, Raza treats triumphs and failures the same | Cricbuzz.com</title>
# create beautiful soup object and parse the html so we can use bs methods
# (`html` must already hold the downloaded page source; 'html.parser' selects
# Python's built-in HTML parser, so no extra parser package is needed)
import bs4
bs = bs4.BeautifulSoup(html, 'html.parser')
from urllib.request import urlopen
# read the page; the with-block closes the connection even if read() raises,
# and `url` / `html` remain available afterwards just as before
with urlopen('http://www.cricbuzz.com/cricket-news/100707/destinys-child-zimbabwes-middle-order-batsman-sikandar-raza-treats-triumphs-and-failures-the-same') as url:
    html = url.read()