# isaurssaurav/web_scrap.py

## web_scrap.py
# pip install requests
# pip install beautifulsoup4
import csv

import requests
from bs4 import BeautifulSoup


def get_HTML_text(url, headers=None, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        headers: Optional dict of HTTP request headers. If a site answers
            403, retry with a browser-like "user-agent" (and, if needed, a
            "cookie") header, e.g.
            {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) ..."}.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout unless one is passed, so a stalled server
            would otherwise block the script indefinitely.

    Returns:
        The decoded body of the response when the status code is 200.

    Raises:
        Exception: If the status code is 403 ("try after minute again") or
            any other non-200 value ("oops something went wrong").
        requests.exceptions.RequestException: Propagated from requests on
            connection problems or when the timeout expires.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response_status = response.status_code

    # Echo the status so a failed run shows which code the server returned.
    print(response_status)
    if response_status == 200:
        # Decode the body only once we know the request succeeded.
        return response.text
    if response_status == 403:
        raise Exception("try after minute again")
    raise Exception("oops something went wrong")


# Download the HTML of the IMDb box-office chart.
response_page = get_HTML_text("https://www.imdb.com/chart/boxoffice/?ref_=nv_ch_cht")
# Parse the HTML.
response_page_soup = BeautifulSoup(response_page, "html.parser")
# Select the chart table. find() returns None when nothing matches, so check
# before dereferencing instead of failing with an opaque AttributeError.
container = response_page_soup.find("table", {"class": "chart full-width"})
tbody = container.find("tbody") if container is not None else None
if tbody is None:
    raise RuntimeError("chart table not found in the downloaded page")
# Select every row of the table body.
trs = tbody.find_all("tr")

# Write one "title,image" row per table row to movies.csv.
try:
    # `with` closes the file even when a row fails part-way through;
    # newline="" lets the csv writer control line endings itself.
    with open("movies.csv", "w", newline="", encoding="utf-8") as f:
        f.write("name, image \n")  # headers
        # csv.writer quotes any field containing commas or quotes, so titles
        # are escaped too (the hand-built row only quoted the image URL).
        writer = csv.writer(f, lineterminator="\n")

        for tr in trs:
            # attrs must be a dict mapping attribute name to value.
            title = tr.find("td", {"class": "titleColumn"}).a.text
            image = tr.find("td", {"class": "posterColumn"}).a.img["src"]
            writer.writerow([title, image])
except Exception as e:
    # Best-effort export: report the problem instead of crashing the script.
    print("oops", e)
	# pip install requests
	# pip install beautifulsoup4
	import requests
	from bs4 import BeautifulSoup


	def get_HTML_text(url, headers=None, timeout=10):
	    """Download *url* and return the response body as text.

	    Args:
	        url: Address of the page to fetch.
	        headers: Optional dict of HTTP request headers. If a site answers
	            403, retry with a browser-like "user-agent" (and, if needed, a
	            "cookie") header, e.g.
	            {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) ..."}.
	        timeout: Seconds to wait for the server before giving up. requests
	            applies no timeout unless one is passed, so a stalled server
	            would otherwise block the script indefinitely.

	    Returns:
	        The decoded body of the response when the status code is 200.

	    Raises:
	        Exception: If the status code is 403 ("try after minute again") or
	            any other non-200 value ("oops something went wrong").
	        requests.exceptions.RequestException: Propagated from requests on
	            connection problems or when the timeout expires.
	    """
	    response = requests.get(url, headers=headers, timeout=timeout)
	    response_status = response.status_code

	    # Echo the status so a failed run shows which code the server returned.
	    print(response_status)
	    if response_status == 200:
	        # Decode the body only once we know the request succeeded.
	        return response.text
	    if response_status == 403:
	        raise Exception("try after minute again")
	    raise Exception("oops something went wrong")



	# Download the HTML of the IMDb box-office chart.
	response_page = get_HTML_text("https://www.imdb.com/chart/boxoffice/?ref_=nv_ch_cht")
	# Parse the HTML.
	response_page_soup = BeautifulSoup(response_page, "html.parser")
	# Select the chart table. find() returns None when nothing matches, so check
	# before dereferencing instead of failing with an opaque AttributeError.
	container = response_page_soup.find("table", {"class": "chart full-width"})
	tbody = container.find("tbody") if container is not None else None
	if tbody is None:
	    raise RuntimeError("chart table not found in the downloaded page")
	# Select every row of the table body.
	trs = tbody.find_all("tr")

	# Write one "title,image" row per table row to movies.csv.
	try:
	    # `with` closes the file even when a row fails part-way through;
	    # newline="" lets the csv writer control line endings itself.
	    with open("movies.csv", "w", newline="", encoding="utf-8") as f:
	        f.write("name, image \n")  # headers
	        # csv.writer quotes any field containing commas or quotes, so titles
	        # are escaped too (the hand-built row only quoted the image URL).
	        writer = csv.writer(f, lineterminator="\n")

	        for tr in trs:
	            # attrs must be a dict mapping attribute name to value.
	            title = tr.find("td", {"class": "titleColumn"}).a.text
	            image = tr.find("td", {"class": "posterColumn"}).a.img["src"]
	            writer.writerow([title, image])
	except Exception as e:
	    # Best-effort export: report the problem instead of crashing the script.
	    print("oops", e)