Skip to content

Instantly share code, notes, and snippets.

@andisugandi
Last active November 10, 2021 04:55
Show Gist options
  • Save andisugandi/14f1d57ec1e0bde6f691799399ea3424 to your computer and use it in GitHub Desktop.
Download Film Data the-numbers.com
"""Scrape movie budget data from the-numbers.com into a pipe-delimited CSV.

Writes theNumbersScraper.csv in the current working directory with columns:
ID | Release Date | Movie | Production Budget | Domestic Gross | Worldwide Gross.
"""
import csv
import os

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen, URLError
from selenium import webdriver  # NOTE(review): imported but never used here — confirm before removing

counter = 0  # running movie ID written to the first CSV column
filename = os.path.join(os.getcwd(), "theNumbersScraper.csv")
pagecount = 1  # the site paginates in steps of 100: /1, /101, ..., /6001

headers = [
    'ID',
    'Release Date',
    'Movie',
    'Production Budget',
    'Domestic Gross',
    'Worldwide Gross',
]

# `with` closes the file even if a fetch raises mid-run; no explicit close needed.
with open(filename, 'w', newline='\n', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter='|')
    writer.writerow(headers)

    # Movie entries run from .../movie/budgets/all/1 through .../all/6101,
    # 100 entries per page, so we fetch offsets 1, 101, ..., 6001.
    while pagecount < 6101:
        request = Request(
            "http://www.the-numbers.com/movie/budgets/all/" + str(pagecount))
        request.add_header('User-agent', 'wswp')  # site blocks the default UA
        website = urlopen(request).read().strip()
        soup = BeautifulSoup(website, 'lxml')

        all_tr = soup.find_all("tr")
        # Index 0 is the table header row; every remaining <tr> is one movie.
        for movie in range(1, len(all_tr)):
            counter += 1
            row = [counter]
            td = all_tr[movie].find_all("td")
            # Column 0 holds the site's own rank number, superseded by our counter.
            for colIndex in range(1, len(td)):
                row.append(td[colIndex].string)
            writer.writerow(row)

        pagecount += 100
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment