Charles's Labs CharlesRajendran

## kmean-data-import.py
import pandas as pd
import matplotlib.pyplot as plt

# Load Data.csv (a path relative to the current working directory) with pandas.
data = pd.read_csv('Data.csv')

## full_scrape.py
import bs4
from urllib.request import urlopen

def read_html(url):
    """Fetch the raw HTML source of the page at *url*.

    The visible part of this snippet only downloads the page; the parsing
    step announced by the trailing comment is not included here, so the
    function currently returns nothing.

    Raises whatever ``urlopen`` raises (e.g. ``urllib.error.URLError``)
    when the page cannot be opened.
    """
    # get the source html; the with-block closes the connection even if
    # read() raises, and the url parameter is no longer overwritten
    with urlopen(url) as response:
        html = response.read()

    # create beautiful soup object and parse the html so we can use bs methods

## scrape.py
# find the article headline: <h1> tags carrying the 'nws-dtl-hdln' class
head_line = bs.find_all('h1', class_='nws-dtl-hdln')

# start with the heading text, then add the stripped text of every
# article-body section
text_blocks = [head_line[0].getText()]
text_blocks.extend(
    section.getText().strip()
    for section in bs.find_all('section', class_='cb-nws-dtl-itms', itemprop='articleBody')
)

# content holds the heading and the body, each block followed by a blank line
content = '\n\n'.join(text_blocks) + '\n\n'

## get-attribute.py
# Read the href attribute of the first <a> tag in the document.
bs.a.get('href')
# output - https://plus.google.com/104502282508811467249 (the href of the first <a> tag)

## filtering-list.py
# Keep only the <img> tags whose height attribute is '30'.
img_list = bs.find_all('img', height='30')
'''
output - [<img alt="Cricbuzz Logo" height="30" itemprop="image" src="//i.cricketcb.com/statics/site/images/cbz-logo.png"
style="bottom: -4px; position: relative;" title="Cricbuzz Logo" width="101"/>]
'''
# `class` is a reserved keyword in Python, so it cannot be used as a keyword-argument name;
# to filter elements by their class attribute, BeautifulSoup accepts class_ instead.
p_list = bs.find_all('p', class_ = 'cb-nws-para')

## element_list.py
# Collect every <p> tag in the document; no attribute filter is applied.
p_list = bs.find_all('p')
# output - [<p class="cb-nws-para"> .....

## element-text.py
# Extract only the text inside the tag held in `title`, without the markup.
title_text = title.getText()
# output - Destiny's child, Raza treats triumphs and failures the same | Cricbuzz.com

## element-access.py
# Access the document's <title> tag as an attribute of the soup object.
title = bs.title
# output - <title itemprop="name">Destiny's child, Raza treats triumphs and failures the same | Cricbuzz.com</title>

## beautifulsoupobject.py
# Create a BeautifulSoup object by parsing the html with the 'html.parser'
# parser, so the bs methods (find_all, getText, ...) can be used on it.
import bs4
bs = bs4.BeautifulSoup(html, 'html.parser')

## read-the-html.py
from urllib.request import urlopen
# read the page
# The with-block closes the connection even if read() raises; `url` and
# `html` remain bound afterwards.
with urlopen('http://www.cricbuzz.com/cricket-news/100707/destinys-child-zimbabwes-middle-order-batsman-sikandar-raza-treats-triumphs-and-failures-the-same') as url:
    html = url.read()
	# Combined listing of the snippets, kept in their original order.
	# NOTE: in this order `bs`, `title` and `html` are used before the
	# statements further down that define them.
	import pandas as pd
	import matplotlib.pyplot as plt

	data = pd.read_csv('Data.csv')
	import bs4
	from urllib.request import urlopen

	def read_html(url):
	    # get the source html
	    url = urlopen(url)
	    html = url.read()
	    url.close()

	    # create beautiful soup object and parse the html so we can use bs methods

	head_line = bs.find_all('h1', class_='nws-dtl-hdln')

	# content will have heading and the content
	content = head_line[0].getText() + '\n\n'

	# extracting each paragraph and concatenating it into content
	for p in bs.find_all('section', class_='cb-nws-dtl-itms', itemprop='articleBody'):
	    content = content + p.getText().strip() + '\n\n'
	bs.a.get('href')
	# output - https://plus.google.com/104502282508811467249 (the href of the first <a> tag)
	img_list = bs.find_all('img', height='30')
	'''
	output - [<img alt="Cricbuzz Logo" height="30" itemprop="image" src="//i.cricketcb.com/statics/site/images/cbz-logo.png"
	style="bottom: -4px; position: relative;" title="Cricbuzz Logo" width="101"/>]
	'''
	# `class` is a reserved keyword in Python, so it cannot be used as a keyword-argument name;
	# to filter elements by their class attribute, BeautifulSoup accepts class_ instead.
	p_list = bs.find_all('p', class_ = 'cb-nws-para')
	p_list = bs.find_all('p')
	# output - [<p class="cb-nws-para"> .....
	title_text = title.getText()
	# output - Destiny's child, Raza treats triumphs and failures the same | Cricbuzz.com
	title = bs.title
	# output - <title itemprop="name">Destiny's child, Raza treats triumphs and failures the same | Cricbuzz.com</title>
	# create beautiful soup object and parse the html so we can use bs methods
	import bs4
	bs = bs4.BeautifulSoup(html, 'html.parser')
	from urllib.request import urlopen
	# read the page
	url = urlopen('http://www.cricbuzz.com/cricket-news/100707/destinys-child-zimbabwes-middle-order-batsman-sikandar-raza-treats-triumphs-and-failures-the-same')
	html = url.read()
	url.close()