web scraping full code
import time
import csv  # used by the commented-out CSV export below

import MySQLdb
from bs4 import BeautifulSoup
from selenium import webdriver
url = "http://sinhala.adaderana.lk/sinhala-hot-news.php"
db = MySQLdb.connect(host="localhost",
user="root",
passwd="",
db="research",
charset='utf8',
use_unicode=True)
# create a Cursor object
cur = db.cursor()
driver = webdriver.Firefox(executable_path=r'D:\apps\anaconda\geckodriver.exe')
driver.get(url)
time.sleep(5)  # give the page time to render before grabbing its source
htmlSource = driver.page_source
soup = BeautifulSoup(htmlSource, 'lxml')

# Each story on the listing page sits in a <div class="news-story">.
all_div = soup.find_all("div", {"class": "news-story"})
# Collect [title, date, contents] for every article linked from the listing page.
courses_list = []
for link in all_div:
    news_container = link.find_all("div", {"class": "story-text"})
    for news in news_container:
        h2_tags = news.find_all("h2")
        for h2 in h2_tags:
            a_tags = h2.find_all('a')
            for end_point in a_tags:
                # print(end_point.get("href"))
                url_ind = "http://sinhala.adaderana.lk/" + end_point.get("href")
                # Open the article in its own browser session, grab the HTML, then close it.
                driver_ind = webdriver.Firefox(executable_path=r'D:\apps\anaconda\geckodriver.exe')
                driver_ind.get(url_ind)
                time.sleep(5)
                html = driver_ind.page_source
                driver_ind.quit()

                soup_ind = BeautifulSoup(html, 'lxml')
                heading = soup_ind.find_all("h1", {"class": "news-heading"})
                title = heading[0].get_text().strip()
                date = soup_ind.find_all("p", {"class": "news-datestamp english-font"})
                date = str(date[0].get_text().strip())
                content = soup_ind.find_all("div", {"class": "news-content"})
                contents = content[0].get_text().strip()

                course = [title, date, contents]
                courses_list.append(course)

                # Save content to a file.csv
                # with open('news.csv', 'w', encoding="utf-8") as file:
                #     writer = csv.writer(file)
                #     writer.writerow(course)

                # Save content to a file.txt (append mode keeps earlier articles).
                with open("news.txt", "a", encoding="utf-8") as file:
                    file.write(date)
                    file.write(title)
                    file.write(contents)

                # Save content to db
                # sql = "INSERT INTO `newsarticles` (`source`, `sid`, `time`, `title`, `body`) VALUES (%s, %s, %s, %s, %s);"
                # try:
                #     cur.execute(sql, ('adaderana', 0, date, title, contents))
                #     db.commit()
                # except (MySQLdb.Error, MySQLdb.Warning) as e:
                #     print("SQL Error")
                #     raise e

# Done scraping: close the listing-page browser session.
driver.quit()
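The database insert above is left commented out. Below is a minimal, self-contained sketch of that step, assuming the `newsarticles` table has the columns named in the commented SQL (`source`, `sid`, `time`, `title`, `body`); the `save_article` helper name is hypothetical, and the connection details and schema would need to match your own setup.

import MySQLdb

def save_article(db, source, sid, time_str, title, body):
    # Hypothetical helper: parameterized insert, so the driver handles quoting/escaping.
    sql = ("INSERT INTO `newsarticles` (`source`, `sid`, `time`, `title`, `body`) "
           "VALUES (%s, %s, %s, %s, %s);")
    cur = db.cursor()
    try:
        cur.execute(sql, (source, sid, time_str, title, body))
        db.commit()
    except (MySQLdb.Error, MySQLdb.Warning):
        db.rollback()  # undo the partial transaction on failure
        raise
    finally:
        cur.close()

# Usage with the fields scraped in the loop above:
# save_article(db, 'adaderana', 0, date, title, contents)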