Jekaterina Kokatjuhha (jkokatjuhha)

import requests

# route the request through a proxy (placeholder addresses)
proxies = {'http': 'http://10.10.0.0:0000',
           'https': 'http://120.10.0.0:0000'}
page_response = requests.get(page_link, proxies=proxies, timeout=5)
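
The snippet above pins every request to a single proxy; a minimal sketch of cycling through a small pool instead (the pool contents are placeholders, and page_link is assumed from the snippets below):

import itertools
import requests

# hypothetical proxy pool; cycle() hands the entries out round-robin
proxy_pool = itertools.cycle(['http://10.10.0.0:0000', 'http://120.10.0.0:0000'])
proxy = next(proxy_pool)
page_response = requests.get(page_link, proxies={'http': proxy, 'https': proxy}, timeout=5)
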
from bs4 import BeautifulSoup
import requests

page_link = 'https://www.website_to_crawl.com'
# fetch the content from the url
page_response = requests.get(page_link, timeout=5)
# parse the html
page_content = BeautifulSoup(page_response.content, "html.parser")
# extract all html elements where the price is stored
prices = page_content.find_all(class_='main_price')
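
find_all returns bs4 Tag objects rather than strings; a short follow-up sketch of pulling the visible text out of each match (the main_price class comes from the snippet above):

# collect the text content of every matched element
prices_text = [price.get_text(strip=True) for price in prices]
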
import requests
from bs4 import BeautifulSoup

try:
    page_response = requests.get(page_link, timeout=5)
    if page_response.status_code == 200:
        # extract
        page_content = BeautifulSoup(page_response.content, "html.parser")
    else:
        # notify, try again
        print(page_response.status_code)
except requests.Timeout as e:
    print("It is time to timeout")
    print(str(e))
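
The "notify, try again" comment suggests retrying the request; a minimal sketch of one way to do that, assuming up to three attempts with a short pause in between:

import time
import requests
from bs4 import BeautifulSoup

page_content = None
for attempt in range(3):
    try:
        page_response = requests.get(page_link, timeout=5)
        if page_response.status_code == 200:
            page_content = BeautifulSoup(page_response.content, "html.parser")
            break
    except requests.Timeout:
        pass  # fall through and retry
    time.sleep(2)  # wait a little before the next attempt
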
import numpy as np
import multiprocessing as multi

def chunks(n, page_list):
    """Splits the list into n chunks"""
    return np.array_split(page_list, n)

cpus = multi.cpu_count()
workers = []
page_list = ['www.website.com/page1.html', 'www.website.com/page2.html']
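
The snippet ends before the chunks are handed to the workers; a minimal sketch of one way to fan them out, where scrape_pages is a hypothetical function that processes one chunk of urls:

def scrape_pages(urls):
    # hypothetical worker body: fetch and parse every url in this chunk
    for url in urls:
        pass

if __name__ == '__main__':
    # start one process per chunk, up to one per cpu
    for chunk in chunks(cpus, page_list):
        worker = multi.Process(target=scrape_pages, args=(chunk,))
        worker.start()
        workers.append(worker)
    # wait for every worker to finish
    for worker in workers:
        worker.join()
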
# timeout is set to 5 seconds
page_response = requests.get(page_link, timeout=5, headers=headers)
# library to generate user agents
from user_agent import generate_user_agent

# generate a random user agent for a desktop mac or linux machine
headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
# or hardcode one instead:
# headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686 on x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.63 Safari/537.36'}
page_response = requests.get(page_link, timeout=5, headers=headers)
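
Since generate_user_agent returns a fresh random string on every call, the header can be regenerated per request; a small sketch, assuming the page_list from the multiprocessing snippet above:

# give every request its own randomly generated user agent
for link in page_list:
    headers = {'User-Agent': generate_user_agent(device_type="desktop", os=('mac', 'linux'))}
    page_response = requests.get(link, timeout=5, headers=headers)
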
price = page_content.find(id='listings_prices')
# check if the element with such an id exists or not
if price is None:
    # NOTIFY! LOG IT, COUNT IT
    print("price element not found")
else:
    # do something with the extracted price
    print(price.get_text(strip=True))
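
A minimal sketch of the "count it" idea, tallying how many pages are missing the element (page_list is assumed from the earlier snippet):

import requests
from bs4 import BeautifulSoup

missing = 0
for link in page_list:
    page_response = requests.get(link, timeout=5)
    page_content = BeautifulSoup(page_response.content, "html.parser")
    # count pages where the price element is absent
    if page_content.find(id='listings_prices') is None:
        missing += 1
print(str(missing) + " pages without a price element")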