Skip to content

Instantly share code, notes, and snippets.

@asanakoy
Created October 12, 2018 19:11
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save asanakoy/97bb8362e5e88de3e1894d814bc08561 to your computer and use it in GitHub Desktop.
Save asanakoy/97bb8362e5e88de3e1894d814bc08561 to your computer and use it in GitHub Desktop.
How to scrape data from a website through proxies
# Proxy list grabber
# https://github.com/abdallahelsokary/Proxy-Collector-/blob/master/Proxy_Collector.py
import urllib.request
import urllib.error
import time
def proxy_list():
    """Scrape a list of 'ip:port' proxy strings from free-proxy-list.net.

    Returns:
        list[str]: proxies as 'ip:port'. Empty list on any fetch/parse
        failure (best-effort behavior, kept from the original).
    """
    proxies = []
    try:
        time.sleep(1)  # be polite: throttle hits on the source site
        url = "https://free-proxy-list.net/"  # the source
        req = urllib.request.Request(
            url,
            headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'},
        )
        # Close the connection deterministically instead of leaking it.
        with urllib.request.urlopen(req) as open_url:
            data = open_url.read()
        soup = BeautifulSoup(data, 'html.parser')
        # Each proxy lives in a <tr>: first <td> is the IP, second the port.
        # Filtering on <td> cells skips the header/footer rows, which the
        # original stripped by removing the first and last list entries.
        for row in soup.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) >= 2:
                proxies.append('{0}:{1}'.format(cells[0].text, cells[1].text))
        print('Found', len(proxies), 'proxies')
    except (urllib.error.URLError, AttributeError, IndexError):
        # Best effort: a network or parse failure yields an empty list
        # instead of the original's NameError on an unbound result variable.
        pass
    return proxies
def getPage(CheckPage, proxyDict, prxList):
    """Fetch CheckPage through a proxy, rotating proxies until a 200 response.

    Args:
        CheckPage: URL to fetch.
        proxyDict: requests-style proxy mapping, e.g. {'http': p, 'https': p}.
        prxList: pool of 'ip:port' strings to rotate through; refilled from
            proxy_list() when empty.

    Returns:
        tuple: (soup, r, proxyDict, prxList) — parsed BeautifulSoup document,
        the successful requests.Response, and the (possibly updated) proxy
        state so the caller can reuse it.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Iterate instead of recursing: the original recursed once per failed
    # proxy and could blow the recursion limit on a long bad streak.
    while True:
        if not prxList:
            # Replenish the pool (newest proxies last so pop() takes them first),
            # capped at 100 as in the original.
            prxList = list(reversed(proxy_list()))[:100]
        r = None
        try:
            # timeout keeps a dead proxy from hanging the crawl indefinitely
            r = requests.get(CheckPage, proxies=proxyDict, headers=header, timeout=10)
        except requests.RequestException:
            pass  # treat any transport error as "this proxy failed"
        if r is not None and r.status_code == 200:
            break
        # Rotate to the next proxy; status checked via status_code rather
        # than the original's fragile str(r) == '<Response [200]>' compare.
        prx = prxList.pop()
        print('Changing proxy. New one:', prx, 'Proxies left:', len(prxList))
        proxyDict = {'http': prx, 'https': prx}
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup, r, proxyDict, prxList
# Crawl every page by ID, rotating proxies whenever a fetch or parse fails.
# NOTE(review): IDsAll is assumed to be an iterable of string IDs defined
# elsewhere (not visible in this gist) — confirm before running.
proxyDict = {'http': None, 'https': None}  # start with a direct connection
prxList = []  # proxy pool; getPage() fills it on first failure
for ID in tqdm(IDsAll):
    # Original had a malformed scheme ('http:/'); fixed to 'http://'.
    Page = 'http://blabla.com/page?id=' + ID
    MainTable = ''
    while MainTable == '':
        try:
            soup, r, proxyDict, prxList = getPage(Page, proxyDict, prxList)
            MainTable = soup.body.findAll('table', {'class': 'hdr14'})[0].tr.find('table', {'cellspacing': '0'})
        except (AttributeError, IndexError):
            # Expected structure missing — presumably the proxy was blocked
            # or served a junk page; drop the proxy and retry the same ID.
            proxyDict = {'http': None, 'https': None}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment