Skip to content

Instantly share code, notes, and snippets.

Last active June 1, 2021 23:51
Show Gist options
  • Save palewire/0dded073b8f9aa9202ca2f364e664568 to your computer and use it in GitHub Desktop.
Save palewire/0dded073b8f9aa9202ca2f364e664568 to your computer and use it in GitHub Desktop.
Rotating proxy scraper example
Display the source blob
Display the rendered blob
"cells": [
"cell_type": "markdown",
"metadata": {},
"source": [
"# Rotating proxy scraper example\n",
"By Ben Welsh"
"cell_type": "markdown",
"metadata": {},
"source": [
"An example of how to scrape a list of available proxies and use them to make web requests. Helpful when scraping sites that employ measures to restrict access."
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import itertools\n",
"from bs4 import BeautifulSoup"
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get proxy list from the web"
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_proxies(url='', limit=75, timeout=10):\n",
"    \"\"\"\n",
"    Fetch a set of proxy addresses from the web.\n",
"\n",
"    Scans up to `limit` rows of the first table body on the page and keeps\n",
"    each row whose markup contains 'yes'.\n",
"\n",
"    url: address of the page that lists the proxies.\n",
"    limit: maximum number of table rows to inspect (default 75).\n",
"    timeout: seconds to wait for the page before giving up (default 10).\n",
"\n",
"    Returns a set of 'ip:port' strings.\n",
"    Raises requests.exceptions.RequestException if the page cannot be fetched\n",
"    and ValueError if the page has no table body to parse.\n",
"    \"\"\"\n",
"    # NOTE(review): the default URL is empty in this copy of the notebook;\n",
"    # pass the proxy list page explicitly or restore the default before running.\n",
"    # The timeout keeps a dead host from hanging the notebook forever.\n",
"    r = requests.get(url, timeout=timeout)\n",
"    # Fail loudly on an error page instead of trying to parse it\n",
"    r.raise_for_status()\n",
"    # Set it up in BeautifulSoup for parsing\n",
"    soup = BeautifulSoup(r.text, \"html.parser\")\n",
"    table_body = soup.find(\"tbody\")\n",
"    if table_body is None:\n",
"        raise ValueError(\"No <tbody> found in the proxy list page\")\n",
"    # Initialize an empty set so duplicate addresses collapse\n",
"    proxies = set()\n",
"    # Loop through the rows in the table we want to scrape\n",
"    for row in table_body.find_all('tr')[:limit]:\n",
"        # If it is listed as a working proxy ...\n",
"        # NOTE(review): this matches 'yes' anywhere in the row markup, not one specific column\n",
"        if 'yes' not in str(row):\n",
"            continue\n",
"        # ... parse out the IP and port\n",
"        cell_list = row.find_all(\"td\")\n",
"        # Rows without both an IP cell and a port cell cannot yield an address\n",
"        if len(cell_list) < 2:\n",
"            continue\n",
"        # get_text never returns None, unlike .string on a cell with nested tags\n",
"        ip = cell_list[0].get_text(strip=True)\n",
"        port = cell_list[1].get_text(strip=True)\n",
"        # Skip rows with a blank cell rather than adding a bogus address\n",
"        if ip and port:\n",
"            proxies.add(\"{}:{}\".format(ip, port))\n",
"    # Return the set\n",
"    return proxies"
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"proxy_list = get_proxies()"
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" '',\n",
" ''}"
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert it into a pool that will cycle through the items forever"
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"proxy_pool = itertools.cycle(proxy_list)"
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a similar pool of user agents"
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"useragent_list = [\n",
" # Chrome\n",
" 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',\n",
" 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',\n",
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',\n",
" # Internet Explorer\n",
" 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',\n",
" 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',\n",
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',\n",
" 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',\n",
" 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',\n",
" 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',\n",
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',\n",
" 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',\n",
" 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',\n",
" 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',\n",
" 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',\n",
" 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',\n",
" 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'\n",
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"useragent_pool = itertools.cycle(useragent_list)"
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'"
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"<itertools.cycle at 0x7f8acc3d01b8>"
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'"
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "markdown",
"metadata": {},
"source": [
"### Request a URL using the next proxy and user agent from each pool"
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def get_url(url, retries=3, timeout=10):\n",
"    \"\"\"\n",
"    Returns the response from a URL, retrying with the next proxy if the request fails.\n",
"\n",
"    Each attempt takes the next proxy from proxy_pool and the next user agent\n",
"    from useragent_pool.\n",
"\n",
"    url: the address to request.\n",
"    retries: how many extra attempts to make after the first one fails (default 3).\n",
"    timeout: seconds to wait on each attempt before treating it as failed (default 10).\n",
"\n",
"    Returns the requests response from the first attempt that completes.\n",
"    A response with an HTTP error status is returned as-is, not retried.\n",
"    Raises the last requests.exceptions.RequestException if every attempt fails.\n",
"    \"\"\"\n",
"    last_error = None\n",
"    # Always make at least one attempt, even if retries is zero or negative\n",
"    for attempt in range(max(retries, 0) + 1):\n",
"        # Get the proxy\n",
"        proxy = next(proxy_pool)\n",
"\n",
"        # Get the user agent\n",
"        useragent = next(useragent_pool)\n",
"\n",
"        # Log\n",
"        print(\"Making a GET request for {} with proxy {} and user agent {}\".format(url, proxy, useragent))\n",
"\n",
"        # Go get it\n",
"        try:\n",
"            return requests.get(\n",
"                url,\n",
"                proxies={\"http\": proxy, \"https\": proxy},\n",
"                headers={'User-Agent': useragent},\n",
"                timeout=timeout\n",
"            )\n",
"        except requests.exceptions.RequestException as e:\n",
"            # Remember the failure and rotate to the next proxy\n",
"            last_error = e\n",
"\n",
"    # Every attempt failed, so surface the most recent error\n",
"    raise last_error"
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
"name": "stdout",
"output_type": "stream",
"text": [
"Making a GET request for with proxy and user agent Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36\n"
"source": [
"r = get_url(\"\")"
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
"source": [
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
"data": {
"text/plain": [
"u'<html style=\"height:100%\"><head><META NAME=\"ROBOTS\" CONTENT=\"NOINDEX, NOFOLLOW\"><meta name=\"format-detection\" content=\"telephone=no\"><meta name=\"viewport\" content=\"initial-scale=1.0\"><meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\"><script type=\"text/javascript\" src=\"/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3\"></script></head><body style=\"margin:0px;height:100%\"><iframe src=\"/_Incapsula_Resource?CWUDNSAI=1&xinfo=10-85779505-0%200NNN%20RT%281532808702515%207%29%20q%280%20-1%20-1%200%29%20r%280%20-1%29%20B16%284%2c312%2c0%29%20U18&incident_id=539032060125820800-325878285682329706&edet=16&cinfo=04000000\" frameborder=0 width=\"100%\" height=\"100%\" marginheight=\"0px\" marginwidth=\"0px\">Request unsuccessful. Incapsula incident ID: 539032060125820800-325878285682329706</iframe></body></html>'"
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
"source": [
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
"nbformat": 4,
"nbformat_minor": 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment