# CHARITH1995/webscrapping-full.py

## webscrapping-full.py
import csv
import io
import time
from datetime import datetime
from urllib.parse import urljoin
from urllib.request import urlopen

import matplotlib.pyplot as plt
import MySQLdb
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver

'exec(%matplotlib inline)'

# Scrape the hot-news index page and save every linked article to news.txt.
#
# Flow: connect to MySQL, load the index page in Firefox, collect the article
# links found under div.news-story > div.story-text > h2 > a, then visit each
# article and write its date, title and body to the output file.
#
# NOTE: this file used to carry a second, indentation-stripped copy of this
# same script directly below the first one. That copy could not be parsed
# (IndentationError) and duplicated the first statement for statement, so the
# two copies are merged into the single script below.

url = "http://sinhala.adaderana.lk/sinhala-hot-news.php"

# Site root that the (relative) article hrefs are resolved against.
BASE_URL = "http://sinhala.adaderana.lk/"
# NOTE(review): newer Selenium releases replace `executable_path` with a
# Service object -- confirm against the installed Selenium version.
GECKODRIVER_PATH = r'D:\apps\anaconda\geckodriver.exe'
# Fixed pause after each navigation before reading page_source
# (presumably to let the page finish rendering).
PAGE_LOAD_WAIT_SECONDS = 5
OUTPUT_PATH = "news.txt"

db = MySQLdb.connect(host="localhost",
                     user="root",
                     passwd="",
                     db="research",
                     charset='utf8',
                     use_unicode=True)

try:
    # create a Cursor object (only needed by the optional DB insert below)
    cur = db.cursor()

    driver = webdriver.Firefox(executable_path=GECKODRIVER_PATH)
    try:
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT_SECONDS)
        htmlSource = driver.page_source
        soup = BeautifulSoup(htmlSource, 'lxml')
        all_div = soup.find_all("div", {"class": "news-story"})

        # Walk the same nesting the page uses and collect every article URL.
        # Anchors without an href are skipped; urljoin also copes with
        # absolute or root-relative hrefs, which plain concatenation did not.
        article_urls = [
            urljoin(BASE_URL, a_tag.get("href"))
            for story in all_div
            for story_text in story.find_all("div", {"class": "story-text"})
            for h2_tag in story_text.find_all("h2")
            for a_tag in h2_tag.find_all("a")
            if a_tag.get("href")
        ]

        # One [title, date, contents] row per successfully scraped article.
        courses_list = []

        # The file is opened once for the whole run. Opening it in "w" mode
        # per article (as before) truncated it every time, so only the last
        # article was ever kept.
        with open(OUTPUT_PATH, "w", encoding="utf-8") as file:
            for url_ind in article_urls:
                # Reuse the one browser instead of launching (and leaking) a
                # new Firefox process for every article.
                driver.get(url_ind)
                time.sleep(PAGE_LOAD_WAIT_SECONDS)
                article_soup = BeautifulSoup(driver.page_source, 'lxml')

                heading = article_soup.find("h1", {"class": "news-heading"})
                datestamp = article_soup.find(
                    "p", {"class": "news-datestamp english-font"})
                content = article_soup.find("div", {"class": "news-content"})
                if heading is None or datestamp is None or content is None:
                    # Not an article page in the expected layout; skip it
                    # rather than abort the whole run with an IndexError.
                    print("Skipping " + url_ind + ": expected article markup not found")
                    continue

                # Keep everything as str; the file itself is UTF-8 encoded,
                # so no manual encode/decode round trip is needed.
                title = heading.get_text().strip()
                date = datestamp.get_text().strip()
                contents = content.get_text().strip()

                course = [title, date, contents]
                courses_list.append(course)

                # One field per line, blank line between articles, so the
                # entries stay distinguishable in the output file.
                file.write(date + "\n")
                file.write(title + "\n")
                file.write(contents + "\n\n")

                # Optional: save the article to the database instead.
                # sql = "INSERT INTO `newsarticles` (`source`, `sid`, `time`, `title`, `body`) VALUES (%s, %s, %s, %s, %s);"
                # try:
                #     cur.execute(sql, ('adaderana', 0, date, title, contents))
                #     db.commit()
                # except (MySQLdb.Error, MySQLdb.Warning) as e:
                #     print("SQL Error")
                #     raise e

        # Optional: save the collected rows as CSV.
        # with open('news.csv', 'w', encoding="utf-8") as csv_file:
        #     csv.writer(csv_file).writerows(courses_list)
    finally:
        # Always shut the browser down, even if scraping failed part-way.
        driver.quit()
finally:
    db.close()