Skip to content

Instantly share code, notes, and snippets.

@ian-whitestone
Created March 30, 2020 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ian-whitestone/c4986b6dabd521c76b3b5ebbaab2a09b to your computer and use it in GitHub Desktop.
import json
import logging
import pickle
import random
import time
# Endpoint of the AWS API Gateway / Lambda proxy used to route requests.
PROXY_URL = 'https://z1h4spb3u7.execute-api.us-west-1.amazonaws.com/proxy_us_west_1'


def proxy_request(url):
    """Fetch ``url`` through the AWS proxy and return the unpickled response.

    Parameters
    ----------
    url : str
        Target URL the proxy should fetch on our behalf.

    Returns
    -------
    The object the proxy pickled — presumably a ``requests.Response``-like
    object, since callers use ``.raise_for_status()`` and ``.text`` on it
    (TODO confirm against the proxy Lambda's code).

    Raises
    ------
    Exception
        If the proxy endpoint itself returns a non-2xx status code.
    """
    proxy_response = requests.post(
        PROXY_URL,
        data={'url': url},
        # Fix: without a timeout, a wedged proxy hangs the worker forever.
        timeout=60,
    )
    if not proxy_response.ok:
        raise Exception(
            "Proxy request not successful. Status code: "
            f"{proxy_response.status_code}\n{proxy_response.text}"
        )
    # SECURITY: pickle.loads executes arbitrary code embedded in the payload.
    # This is only acceptable because we control the proxy Lambda; if that
    # endpoint were ever compromised, this line is remote code execution.
    # Consider a data-only format (e.g. JSON) instead.
    resp = pickle.loads(proxy_response.content)
    return resp
def get_page_info(url_and_proxy):
    """Return property count, page count and total properties under a given URL.

    NOTE(review): this function is truncated in this gist ("# .... other code"
    below) — the parsing and return logic is not visible here.

    ``url_and_proxy`` is a ``(url, proxy)`` tuple; the ``proxy`` element is
    apparently unused now that requests go through ``proxy_request`` (see the
    commented-out session code) — presumably kept for interface compatibility
    with the pool that maps over these tuples. TODO confirm against callers.
    """
    url, proxy = url_and_proxy
    # Random jitter (0–10 s) so many concurrent workers don't hit the target
    # site in lockstep.
    time.sleep(random.random() * 10)
    # session = requests.Session()
    total_properties, num_pages, properties_per_page = None, None, None
    try:
        # resp = session.get(url, headers=HEADER, proxies=proxy)
        resp = proxy_request(url)
        # Works because the proxy returns a pickled Response-like object.
        resp.raise_for_status()
        # .... other code
def scrape_page(url_proxy):
    """Scrape one page and return ``(url, json_string_of_details)``.

    Pulls the page through the proxy, collects every
    ``<script type="application/ld+json">`` payload, and returns the parsed
    payloads re-serialized as a single JSON string. Any failure is logged and
    an empty list is returned for that page (best-effort scraping).
    """
    # Random jitter (0–16 s) to spread out concurrent workers.
    time.sleep(random.random() * 16)
    details = []
    try:
        url, proxy = url_proxy
        # session = requests.Session()
        # resp = session.get(url, headers=HEADER, proxies=proxy)
        resp = proxy_request(url)
        soup = BeautifulSoup(resp.text, 'lxml')
        ld_json_tags = soup.find_all('script', type='application/ld+json')
        details = [json.loads(tag.text) for tag in ld_json_tags]
    except Exception as e:
        LOGGER.exception('failed for url {}, proxy {}'.format(url, proxy))
    return url, json.dumps(details)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment