@thekindlyone
Created May 13, 2016 10:13
# coding: utf-8
# utils.py -- shared scraping helpers (the scraper below imports them via "from utils import *")
from __future__ import unicode_literals
from unidecode import unidecode
import requests
from bs4 import BeautifulSoup as bs
from time import sleep
import re
from kitchen.text.converters import to_bytes
import itertools
import csv
from multiprocessing import Process, Queue
def cleanse(data, transliteration=True):
    # Sanitize every value in a row dict; transliterate to ASCII with
    # unidecode, or fall back to raw bytes via kitchen's to_bytes.
    try:
        if transliteration:
            return {key: unidecode(sanitize(value)) for key, value in data.iteritems()}
        else:
            return {key: to_bytes(sanitize(value)) for key, value in data.iteritems()}
    except Exception as e:
        # On failure, log the offending row and the error; note this path
        # returns None, which the CSV writer downstream will reject.
        print data
        print str(e)
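
# Example (added for illustration; not part of the original gist): cleanse()
# sanitizes each value and transliterates it to plain ASCII, so a row dict
# becomes safe to write with csv.DictWriter:
#
#   cleanse({'Vendor Name': u'Caf\xe9 "Zomato"; Rio'})
#   # -> {'Vendor Name': 'Cafe Zomato, Rio'}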
class Browser(object):
    # Thin wrapper around a requests.Session that retries failed GETs.
    def __init__(self, url):
        self.s = requests.Session()
        self.s.head(url)  # warm up the session (cookies, keep-alive)

    def soup(self, url):
        r = self.s.get(url)
        return bs(r.content)

    def makeRequest(self, url, headers=None, maxattempts=15):
        attempts = 0
        while attempts < maxattempts:
            attempts += 1
            try:
                return self.s.get(url, headers=headers)
            except Exception:
                sleep(10)
        return False
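
# Usage sketch (illustrative only; the URL is a placeholder):
#
#   b = Browser('https://www.zomato.com')   # HEAD request primes the session
#   page = b.soup('https://www.zomato.com/rio/restaurants')
#   r = b.makeRequest('https://www.zomato.com/rio/restaurants')
#   if r:                                    # False after 15 failed attempts
#       print r.status_code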
def grouper(iterable, n, fillvalue=None):
    # Batch an iterable into n-sized tuples, padding the last group.
    args = [iter(iterable)] * n
    return itertools.izip_longest(*args, fillvalue=fillvalue)
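
# Example (illustrative; not part of the original gist):
#
#   list(grouper('ABCDE', 2, fillvalue='-'))
#   # -> [('A', 'B'), ('C', 'D'), ('E', '-')]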
def get_soup(url, max_attempts=5, agent={'User-agent': 'Mozilla/5.0'}, num=None):
    # Fetch a URL and return its BeautifulSoup, retrying on errors and
    # non-200 responses; returns False once max_attempts is exhausted.
    # num is an optional shared counter incremented once per attempt.
    for i in xrange(max_attempts):
        try:
            if num:
                with num.get_lock():
                    num.value += 1
            r = requests.get(url, headers=agent, timeout=10)
            if r.status_code == 200:
                return bs(r.content)
            else:
                print 'status at request', url, r.status_code
                sleep(3)
        except Exception as e:
            print '{} at {}'.format(str(e), url)
            sleep(3)
    return False
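
# The num argument is assumed to be a shared counter such as a
# multiprocessing.Value, since it is used via get_lock() and .value.
# A sketch (illustrative; not part of the original gist):
#
#   from multiprocessing import Value
#   attempts = Value('i', 0)
#   soup = get_soup('https://www.zomato.com/rio/restaurants', num=attempts)
#   if soup:
#       print attempts.value, 'HTTP attempts so far'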
def sanitize(text):
    # Normalize whitespace and strip the characters that would break the
    # CSV output; empty or missing values become 'N/A'.
    if text:
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n+', '\n', text)
        text = text.replace('"', '')
        text = text.replace(';', ',')
        if not text:
            text = 'N/A'
        return '\n'.join([line.strip() for line in text.strip().split('\n')])
    else:
        return 'N/A'
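
# Examples (illustrative; not part of the original gist):
#
#   sanitize('"Av. Atlantica;  1702"')   # -> 'Av. Atlantica, 1702'
#   sanitize(None)                       # -> 'N/A'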
def handle(func, default='N/A'):
    # Run a zero-argument callable and swallow any exception, returning
    # default instead; makes optional page fields safe to extract.
    try:
        return func()
    except Exception:
        return default
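
# Example: this is the null-safety idiom used throughout process() below.
# Wrapping the lookup in a lambda defers evaluation, so the AttributeError
# raised by a missing tag is caught here rather than crashing the caller:
#
#   name = handle(lambda: soup.find('span', itemprop='name').text)
#   # -> the tag's text if present, 'N/A' otherwise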
def extract_number(text):
    # First run of digits in text, returned as a string.
    return re.search(r'\d+', text).group()

def flatten(l):
    # Flatten one level of nesting.
    return [item for sublist in l for item in sublist]

def extract_float(text):
    # First run of digits and dots in text, returned as a string.
    return re.search(r'[0-9.]+', text).group()
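
# Examples (illustrative; not part of the original gist):
#
#   extract_number('152 reviews')        # -> '152'
#   extract_float('Rated 4.5 out of 5')  # -> '4.5'
#   flatten([[1, 2], [3], []])           # -> [1, 2, 3]
#
# Note both extractors return strings and raise AttributeError when nothing
# matches, so a caller may want to wrap them in handle().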
def scribe(q, headers, filename, mode='w', transliteration=True):
    # Consumer: pull row dicts off the queue and append them to a CSV
    # until the 'STOP' sentinel arrives.
    with open(filename, mode) as csvfile:
        writer = csv.DictWriter(csvfile, dialect='excel', fieldnames=headers, restval='N/A')
        if mode == 'w':
            writer.writeheader()
        count = 0
        while True:
            row = q.get()
            if row == 'STOP':
                break
            writer.writerow(cleanse(row, transliteration=transliteration))
            count += 1
            print count, 'rows written'
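
# Usage sketch for this writer (filename and row are placeholders): a single
# writer process drains the queue while producers put row dicts on it, and
# the 'STOP' sentinel shuts it down cleanly.
#
#   q = Queue()
#   writer = Process(target=scribe, args=(q, ['Vendor Name'], 'out.csv'))
#   writer.start()
#   q.put({'Vendor Name': 'Cafe Zomato'})
#   q.put('STOP')
#   writer.join()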
# coding: utf-8
# Scraper script: crawls the Zomato listings for five Brazilian cities and
# writes one CSV row per restaurant.
from utils import *
from urlparse import urljoin
import re
from threading import Thread
import Queue  # the stdlib module; overrides the multiprocessing Queue star-imported from utils
from time import sleep
import csv
def process(url, city, q, other=False):
    # Scrape a single restaurant page and push its row onto the queue.
    # Chain landing pages carry an "All outlets" link; those are expanded
    # by recursing into each outlet with other=True.
    en = '?lang=en'
    soup = get_soup(url + en)
    all_outlets_link = handle(lambda: soup.find('a', title=re.compile('All outlets')).get('href', False), False)
    if not all_outlets_link or other:
        # Unguarded on purpose: a page without a name is not worth keeping.
        name = soup.find('span', itemprop='name').text
        contact = handle(lambda: soup.find('span', class_='tel-icon').text)
        address = handle(lambda: soup.find('div', class_='res-main-address-text').text)
        area = handle(lambda: soup.find('span', itemprop='addressLocality').text)
        pricerange = handle(lambda: soup.find('span', {'itemprop': 'priceRange'}).text.strip())
        payment = handle(lambda: ','.join([item.text.strip() for item in soup.findAll('span', {'itemprop': 'paymentAccepted'})]))
        delivery = 'No' if 'No Home Delivery' in soup.text else 'Yes'
        cuisine = handle(lambda: soup.find('a', itemprop='servesCuisine').text)
        ophours = handle(lambda: '\n'.join([div.text for div in soup.select('div.res-week-timetable > div')]))
        description = handle(lambda: soup.find('a', itemprop='typeEstablishment').text)
        coords = handle(lambda: re.search('center=(.+?)&', str(soup)).group(1))
        reviews = handle(lambda: soup.select('#selectors > li > a > span')[0].text)
        ratings = handle(lambda: soup.find('div', {'itemprop': 'ratingValue'}).text.strip())
        q.put({'Vendor Name': name,
               'Country': 'Brazil',
               'City': city,
               'Address': address,
               'Area': area,
               'Coordinates': coords,
               'URL': url,
               'Delivery': delivery,
               'Contact no.': contact,
               'Operating Hours': ophours,
               'Description': description,
               'Cuisines': cuisine,
               'Price Range': pricerange,
               'Payment Options': payment,
               'Ratings': ratings,
               'Reviews': reviews})
    else:
        for vlink in fetch_vendorlinks(all_outlets_link):
            process(vlink, city, q, other=True)
def fetch_vendorlinks(url):
    # Collect the restaurant links from a search-results page.
    soup = get_soup(url)
    return [a.get('href') for a in soup.select('a.result-title')]
def paginate(firstpage):
    # Read the page count from the listing's pagination widget and yield
    # the URL of every results page; defaults to a single page.
    soup = get_soup(firstpage)
    pages = soup.find('div', class_='pagination-number')
    if not pages:
        pages = 1
    else:
        match = re.search(r'\d+$', pages.text.strip())
        pages = int(match.group()) if match else 1
    return ('{}?page={}'.format(firstpage, page) for page in range(1, pages + 1))
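
# For a listing whose pagination widget ends in "of 42", paginate() yields
# (illustrative values, not from the original gist):
#
#   https://www.zomato.com/rio/restaurants?page=1
#   ...
#   https://www.zomato.com/rio/restaurants?page=42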
def scribe(q):
    # CSV-writer thread; this local version shadows the scribe imported
    # from utils and writes to a fixed output file.
    with open('zomato_brazil.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, dialect='excel', fieldnames=headers)
        writer.writeheader()
        count = 0
        while True:
            row = q.get()
            if row == 'STOP':
                break
            writer.writerow(cleanse(row))
            count += 1
            print count, 'rows written to sheet'
cities = [('Rio', 'https://www.zomato.com/rio/restaurants'),
          ('Sao Paulo', 'https://www.zomato.com/sao-paulo-sp/restaurants'),
          ('Brasilia', 'https://www.zomato.com/brasilia/restaurants'),
          ('Porto Alegre', 'https://www.zomato.com/portoalegre/restaurants'),
          ('Salvador', 'https://www.zomato.com/salvador/restaurants')]
headers = ['Vendor Name', 'Country', 'City', 'Address', 'Area', 'Coordinates', 'URL',
           'Delivery', 'Contact no.', 'Operating Hours', 'Description', 'Cuisines',
           'Price Range', 'Payment Options', 'Ratings', 'Reviews']
q = Queue.Queue()
threads = []
scribethread = Thread(target=scribe, args=(q,))
scribethread.daemon = True
scribethread.start()
for cityname, citylink in cities:
    for pno, pagelink in enumerate(paginate(citylink), start=1):
        print 'processing page', pno, 'of', cityname
        for vlink in fetch_vendorlinks(pagelink):
            # One thread per restaurant page; the scribe thread is the
            # only CSV writer, so no locking is needed around the file.
            t = Thread(target=process, args=(vlink, cityname, q))
            t.daemon = True
            t.start()
            threads.append(t)
# Wait for every scraper thread to finish before stopping the writer.
while [thread for thread in threads if thread.isAlive()]:
    sleep(1)
for thread in threads:
    thread.join()
q.put('STOP')
scribethread.join()  # let the writer drain the queue before the process exits
print 'All Done'