Skip to content

Instantly share code, notes, and snippets.

@isaurssaurav
Last active April 3, 2020 11:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save isaurssaurav/7802cbcf37d65f338692f1f02e21c005 to your computer and use it in GitHub Desktop.
Save isaurssaurav/7802cbcf37d65f338692f1f02e21c005 to your computer and use it in GitHub Desktop.
web_scrap
# pip install requests
# pip install beautifulsoup4
import csv

import requests
from bs4 import BeautifulSoup
def get_HTML_text(url, headers=None, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra HTTP request headers. If a site
            answers 403, passing a browser-like ``user-agent`` (and, if
            needed, a ``cookie``) header here is the usual workaround.
            Keep real cookies out of source control.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may wait indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with status 200.

    Raises:
        Exception: If the server answers 403 ("try after minute again") or
            any other non-200 status ("oops something went wrong").
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failures or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    status = response.status_code
    # Echo the status code so a failed run shows what the server answered.
    print(status)
    if status == 200:
        return response.text
    if status == 403:
        raise Exception("try after minute again")
    raise Exception("oops something went wrong")
# Download the HTML of the IMDb box-office chart page.
response_page = get_HTML_text("https://www.imdb.com/chart/boxoffice/?ref_=nv_ch_cht")
# Parse the HTML.
response_page_soup = BeautifulSoup(response_page, "html.parser")
# Select the chart table, then its <tbody>; either may be missing if the
# page layout changes, so fail with a clear message instead of an
# AttributeError on None.
container = response_page_soup.find("table", {"class": "chart full-width"})
tbody = container.find("tbody") if container is not None else None
if tbody is None:
    raise RuntimeError("chart table not found; the page layout may have changed")
# Every <tr> of the table body is one movie.
trs = tbody.find_all("tr")
# Write one CSV row per movie: title and poster image URL.
try:
    # The context manager closes the file even if a row fails to parse.
    # newline="" lets the csv writer control line endings itself.
    with open("movies.csv", "w", newline="", encoding="utf-8") as f:
        # csv.writer quotes titles containing commas or quotes, which the
        # previous hand-built rows did not.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["name", " image "])  # headers, emitted as "name, image "
        for tr in trs:
            # attrs must be a dict; a set literal would also match a CSS
            # class literally named "class".
            title = tr.find("td", {"class": "titleColumn"}).a.text
            image = tr.find("td", {"class": "posterColumn"}).a.img["src"]
            writer.writerow([title, image])
except (OSError, AttributeError, KeyError, TypeError) as e:
    # OSError: the file could not be opened or written.
    # AttributeError / KeyError / TypeError: a row lacks the expected
    # cell, link, image, or src attribute.
    print("oops", e)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment