Skip to content

Instantly share code, notes, and snippets.

@Cguilliman
Created August 30, 2019 07:41
Show Gist options
  • Save Cguilliman/bdace43a2dee1d01cb8615394a3039ca to your computer and use it in GitHub Desktop.
# PROXY MODEL
class Proxy(models.Model):
    """A proxy server (``ip:port``) used for outbound scraping requests."""

    # "ip:port" string, e.g. "127.0.0.1:8080"
    host = models.CharField(
        verbose_name='Host',
        max_length=255
    )
    # Cleared when a request through this proxy fails, so the proxy
    # is skipped on subsequent lookups.
    is_valid = models.BooleanField(
        verbose_name='Is valid',
        default=True
    )
# PARSER
import requests
from bs4 import BeautifulSoup as bs
CAR_PARSING_URL = 'http://link.com'
def get_proxies():
    """Yield usable proxies: first the known-valid ones from the DB,
    then a freshly scraped batch from sslproxies.org.

    Newly scraped hosts are persisted via ``get_or_create``; only hosts
    that were just created are yielded, because pre-existing ones were
    already yielded from the DB query above.

    Yields:
        Proxy: proxy model instances.
    """
    # Known-good proxies first: cheapest, already vetted.
    for proxy in Proxy.objects.filter(is_valid=True):
        yield proxy

    # Fall back to scraping a fresh batch from the public list.
    listing = requests.get('https://www.sslproxies.org/')
    soup = bs(listing.content, "lxml")
    # NOTE: the original `soup.select('tr', {'role': 'row'})` passed the
    # attrs dict into select()'s `namespaces` parameter, where it was
    # silently ignored -- every <tr> matched. Keep that effective
    # behavior with a plain, correct selector.
    rows = soup.select('tr')
    # Skip the header row, take the next 20 entries (ip / port cells).
    for row in rows[1:21]:
        cells = row.select('td')
        proxy, created = Proxy.objects.get_or_create(
            host=cells[0].text + ':' + cells[1].text
        )
        # Bug fix: was `if not created`, which re-yielded proxies that
        # already exist in the DB (duplicates of the loop above) and
        # dropped the freshly created ones.
        if created:
            yield proxy
def get_car(registration):  # return: Dict
    """Get car filtering kwargs from the remote service (.go file).

    Tries each proxy from get_proxies() in turn; a proxy that fails
    (network error, blocked, unexpected page layout) is marked invalid
    and skipped.

    Args:
        registration: vehicle registration mark (VRM) string.

    Returns:
        tuple (dict, bool): scraped car attributes ('manufacturer',
        'car_model', 'manufactured') and a success flag; on total
        failure, an empty dict and False.
    """
    for proxy in get_proxies():
        try:
            page_content = requests.post(
                CAR_PARSING_URL,
                data={'vrm': registration, 'submit': 'Lookup'},
                proxies={'http': proxy.host, 'https': proxy.host},
                timeout=1,  # fail fast: public proxies are often dead
            ).content
            soup = bs(page_content, 'html.parser')

            def get_element(path):
                # First match for the CSS selector; raises IndexError
                # when absent (caught below -> proxy treated as bad).
                return soup.select(path)[0]

            # Bug fix: build a fresh dict per attempt so partial data
            # from a failed proxy never leaks into a later attempt or
            # into the failure return value.
            response = {
                'manufacturer': get_element(
                    '#cc_box1 > div:nth-child(4) > div.col-xs-7.col-sm-7.col-md-7 > img',
                ).attrs.get('alt'),
                'car_model': get_element(
                    '#cc_box1 > div:nth-child(5) > div.col-xs-7.col-sm-7.col-md-7'
                ).contents[0].split(' ')[0],
                'manufactured': get_element(
                    '#cc_box1 > div:nth-child(14) > div.col-xs-7.col-sm-7.col-md-7'
                ).contents[0],
            }
        except Exception as e:
            # Any failure invalidates this proxy and moves on to the next.
            print('---', e)
            proxy.is_valid = False
            proxy.save()
            continue
        else:
            return response, True
    # Every proxy failed.
    return {}, False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment