Skip to content

Instantly share code, notes, and snippets.

@steven0608
Last active December 17, 2018 05:33
Show Gist options
  • Save steven0608/1dc598bdf793d36b76338dfda6bed214 to your computer and use it in GitHub Desktop.
Save steven0608/1dc598bdf793d36b76338dfda6bed214 to your computer and use it in GitHub Desktop.
web scrape my own site
import requests
# To make the get request
from bs4 import BeautifulSoup
# to pull data out of HTML
import pandas
# Use a pandas DataFrame to display the scraped data as a table in a Jupyter notebook
class GetListOfBooks:
    """Scrape a paginated HTML book table into a list of dictionaries.

    After ``getData()`` has run, ``self.list`` holds one dict per two-cell
    table row, using the keys ``"Book Name"`` and ``"Author Name"``.
    """

    # Prefix for pages 2..N of the listing; the page number is appended to it.
    # Kept as a class attribute so it can be overridden for another site.
    PAGE_URL_PREFIX = "https://thawing-ridge-65567.herokuapp.com/books?page="

    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot hang the scraper forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Remember the first-page ``url`` and start with an empty result list."""
        self.url = url
        self.list = []

    def getPageNum(self):
        """Fetch ``self.url`` and return the number of pages it advertises.

        Returns 1 when the page has no usable pagination block.
        """
        return self._pageCount(self.parseUrl(self.url))

    def parseUrl(self, url):
        """Download ``url`` and return its body parsed with ``html.parser``.

        Raises:
            requests.RequestException: on timeouts, connection problems or an
                HTTP error status, instead of silently parsing an error page.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")

    def getPageContent(self, url):
        """Download ``url`` and append every book row found to ``self.list``."""
        self._collectRows(self.parseUrl(url))

    def getData(self):
        """Scrape the first page and every following page into ``self.list``."""
        # Parse the first page once and reuse it for both the rows and the
        # page count, rather than downloading it twice.
        firstPage = self.parseUrl(self.url)
        self._collectRows(firstPage)
        for num in range(2, self._pageCount(firstPage) + 1):
            self.getPageContent(self.PAGE_URL_PREFIX + str(num))

    @staticmethod
    def _pageCount(soup):
        """Return the page total read from the ``div.pagination`` links of ``soup``."""
        pagination = soup.find("div", {"class": "pagination"})
        if pagination is None:
            # No pagination block at all: treat the listing as a single page.
            return 1
        links = pagination.find_all("a")
        if len(links) < 2:
            return 1
        # The second-to-last link is taken as the highest page number
        # (presumably the last link is a "next" control - verify against the site).
        return int(links[-2].text)

    def _collectRows(self, soup):
        """Append one dict per two-cell ``<tr>`` in ``soup`` to ``self.list``."""
        for row in soup.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) != 2:
                # Rows without exactly two <td> cells are skipped.
                continue
            record = {}
            for cell in cells:
                content = cell.find("div", {"class": "content"})
                if content is not None:
                    # A cell wrapping a div.content holds the book title.
                    record["Book Name"] = content.text
                else:
                    # Any other cell is treated as the author cell.
                    record["Author Name"] = cell.text
            self.list.append(record)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment