@thekindlyone
Created May 13, 2016 10:13
# coding: utf-8
# utils.py -- shared scraping helpers (the scraper below imports them via "from utils import *")
from __future__ import unicode_literals
from unidecode import unidecode
import requests
from bs4 import BeautifulSoup as bs
from time import sleep
import re
from kitchen.text.converters import to_bytes
import itertools
import csv
from multiprocessing import Process, Queue
def cleanse(data, transliteration=True):
    # Sanitize every value in a row dict; transliterate to ASCII with
    # unidecode, or fall back to raw bytes via kitchen's to_bytes.
    try:
        if transliteration:
            return {key: unidecode(sanitize(value)) for key, value in data.iteritems()}
        else:
            return {key: to_bytes(sanitize(value)) for key, value in data.iteritems()}
    except Exception as e:
        # On failure, log the offending row and the error; note this path
        # returns None, which the CSV writer downstream will reject.
        print data
        print str(e)
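
# Example (added for illustration; not part of the original gist): cleanse()
# sanitizes each value and transliterates it to plain ASCII, so a row dict
# becomes safe to write with csv.DictWriter:
#
#   cleanse({'Vendor Name': u'Caf\xe9 "Zomato"; Rio'})
#   # -> {'Vendor Name': 'Cafe Zomato, Rio'}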
class Browser(object):
    # Thin wrapper around a requests.Session that retries failed GETs.
    def __init__(self, url):
        self.s = requests.Session()
        self.s.head(url)  # warm up the session (cookies, keep-alive)

    def soup(self, url):
        r = self.s.get(url)
        return bs(r.content)

    def makeRequest(self, url, headers=None, maxattempts=15):
        attempts = 0
        while attempts < maxattempts:
            attempts += 1
            try:
                return self.s.get(url, headers=headers)
            except Exception:
                sleep(10)
        return False
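
# Usage sketch (illustrative only; the URL is a placeholder):
#
#   b = Browser('https://www.zomato.com')   # HEAD request primes the session
#   page = b.soup('https://www.zomato.com/rio/restaurants')
#   r = b.makeRequest('https://www.zomato.com/rio/restaurants')
#   if r:                                    # False after 15 failed attempts
#       print r.status_code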
def grouper(iterable, n, fillvalue=None):
    # Batch an iterable into n-sized tuples, padding the last group.
    args = [iter(iterable)] * n
    return itertools.izip_longest(*args, fillvalue=fillvalue)
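
# Example (illustrative; not part of the original gist):
#
#   list(grouper('ABCDE', 2, fillvalue='-'))
#   # -> [('A', 'B'), ('C', 'D'), ('E', '-')]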
def get_soup(url, max_attempts=5, agent={'User-agent': 'Mozilla/5.0'}, num=None):
    # Fetch a URL and return its BeautifulSoup, retrying on errors and
    # non-200 responses; returns False once max_attempts is exhausted.
    # num is an optional shared counter incremented once per attempt.
    for i in xrange(max_attempts):
        try:
            if num:
                with num.get_lock():
                    num.value += 1
            r = requests.get(url, headers=agent, timeout=10)
            if r.status_code == 200:
                return bs(r.content)
            else:
                print 'status at request', url, r.status_code
                sleep(3)
        except Exception as e:
            print '{} at {}'.format(str(e), url)
            sleep(3)
    return False
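
# The num argument is assumed to be a shared counter such as a
# multiprocessing.Value, since it is used via get_lock() and .value.
# A sketch (illustrative; not part of the original gist):
#
#   from multiprocessing import Value
#   attempts = Value('i', 0)
#   soup = get_soup('https://www.zomato.com/rio/restaurants', num=attempts)
#   if soup:
#       print attempts.value, 'HTTP attempts so far'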
def sanitize(text):
    # Normalize whitespace and strip the characters that would break the
    # CSV output; empty or missing values become 'N/A'.
    if text:
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n+', '\n', text)
        text = text.replace('"', '')
        text = text.replace(';', ',')
        if not text:
            text = 'N/A'
        return '\n'.join([line.strip() for line in text.strip().split('\n')])
    else:
        return 'N/A'
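
# Examples (illustrative; not part of the original gist):
#
#   sanitize('"Av. Atlantica;  1702"')   # -> 'Av. Atlantica, 1702'
#   sanitize(None)                       # -> 'N/A'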
def handle(func, default='N/A'):
    # Run a zero-argument callable and swallow any exception, returning
    # default instead; makes optional page fields safe to extract.
    try:
        return func()
    except Exception:
        return default
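
# Example: this is the null-safety idiom used throughout process() below.
# Wrapping the lookup in a lambda defers evaluation, so the AttributeError
# raised by a missing tag is caught here rather than crashing the caller:
#
#   name = handle(lambda: soup.find('span', itemprop='name').text)
#   # -> the tag's text if present, 'N/A' otherwise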
def extract_number(text):
    # First run of digits in text, returned as a string.
    return re.search(r'\d+', text).group()

def flatten(l):
    # Flatten one level of nesting.
    return [item for sublist in l for item in sublist]

def extract_float(text):
    # First run of digits and dots in text, returned as a string.
    return re.search(r'[0-9.]+', text).group()
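
# Examples (illustrative; not part of the original gist):
#
#   extract_number('152 reviews')        # -> '152'
#   extract_float('Rated 4.5 out of 5')  # -> '4.5'
#   flatten([[1, 2], [3], []])           # -> [1, 2, 3]
#
# Note both extractors return strings and raise AttributeError when nothing
# matches, so a caller may want to wrap them in handle().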
def scribe(q, headers, filename, mode='w', transliteration=True):
    # Consumer: pull row dicts off the queue and append them to a CSV
    # until the 'STOP' sentinel arrives.
    with open(filename, mode) as csvfile:
        writer = csv.DictWriter(csvfile, dialect='excel', fieldnames=headers, restval='N/A')
        if mode == 'w':
            writer.writeheader()
        count = 0
        while True:
            row = q.get()
            if row == 'STOP':
                break
            writer.writerow(cleanse(row, transliteration=transliteration))
            count += 1
            print count, 'rows written'
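
# Usage sketch for this writer (filename and row are placeholders): a single
# writer process drains the queue while producers put row dicts on it, and
# the 'STOP' sentinel shuts it down cleanly.
#
#   q = Queue()
#   writer = Process(target=scribe, args=(q, ['Vendor Name'], 'out.csv'))
#   writer.start()
#   q.put({'Vendor Name': 'Cafe Zomato'})
#   q.put('STOP')
#   writer.join()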
# coding: utf-8
# Scraper script: crawls the Zomato listings for five Brazilian cities and
# writes one CSV row per restaurant.
from utils import *
from urlparse import urljoin
import re
from threading import Thread
import Queue  # the stdlib module; overrides the multiprocessing Queue star-imported from utils
from time import sleep
import csv
def process(url, city, q, other=False):
    # Scrape a single restaurant page and push its row onto the queue.
    # Chain landing pages carry an "All outlets" link; those are expanded
    # by recursing into each outlet with other=True.
    en = '?lang=en'
    soup = get_soup(url + en)
    all_outlets_link = handle(lambda: soup.find('a', title=re.compile('All outlets')).get('href', False), False)
    if not all_outlets_link or other:
        # Unguarded on purpose: a page without a name is not worth keeping.
        name = soup.find('span', itemprop='name').text
        contact = handle(lambda: soup.find('span', class_='tel-icon').text)
        address = handle(lambda: soup.find('div', class_='res-main-address-text').text)
        area = handle(lambda: soup.find('span', itemprop='addressLocality').text)
        pricerange = handle(lambda: soup.find('span', {'itemprop': 'priceRange'}).text.strip())
        payment = handle(lambda: ','.join([item.text.strip() for item in soup.findAll('span', {'itemprop': 'paymentAccepted'})]))
        delivery = 'No' if 'No Home Delivery' in soup.text else 'Yes'
        cuisine = handle(lambda: soup.find('a', itemprop='servesCuisine').text)
        ophours = handle(lambda: '\n'.join([div.text for div in soup.select('div.res-week-timetable > div')]))
        description = handle(lambda: soup.find('a', itemprop='typeEstablishment').text)
        coords = handle(lambda: re.search('center=(.+?)&', str(soup)).group(1))
        reviews = handle(lambda: soup.select('#selectors > li > a > span')[0].text)
        ratings = handle(lambda: soup.find('div', {'itemprop': 'ratingValue'}).text.strip())
        q.put({'Vendor Name': name,
               'Country': 'Brazil',
               'City': city,
               'Address': address,
               'Area': area,
               'Coordinates': coords,
               'URL': url,
               'Delivery': delivery,
               'Contact no.': contact,
               'Operating Hours': ophours,
               'Description': description,
               'Cuisines': cuisine,
               'Price Range': pricerange,
               'Payment Options': payment,
               'Ratings': ratings,
               'Reviews': reviews})
    else:
        for vlink in fetch_vendorlinks(all_outlets_link):
            process(vlink, city, q, other=True)
def fetch_vendorlinks(url):
    # Collect the restaurant links from a search-results page.
    soup = get_soup(url)
    return [a.get('href') for a in soup.select('a.result-title')]
def paginate(firstpage):
    # Read the page count from the listing's pagination widget and yield
    # the URL of every results page; defaults to a single page.
    soup = get_soup(firstpage)
    pages = soup.find('div', class_='pagination-number')
    if not pages:
        pages = 1
    else:
        match = re.search(r'\d+$', pages.text.strip())
        pages = int(match.group()) if match else 1
    return ('{}?page={}'.format(firstpage, page) for page in range(1, pages + 1))
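
# For a listing whose pagination widget ends in "of 42", paginate() yields
# (illustrative values, not from the original gist):
#
#   https://www.zomato.com/rio/restaurants?page=1
#   ...
#   https://www.zomato.com/rio/restaurants?page=42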
def scribe(q):
    # CSV-writer thread; this local version shadows the scribe imported
    # from utils and writes to a fixed output file.
    with open('zomato_brazil.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, dialect='excel', fieldnames=headers)
        writer.writeheader()
        count = 0
        while True:
            row = q.get()
            if row == 'STOP':
                break
            writer.writerow(cleanse(row))
            count += 1
            print count, 'rows written to sheet'
cities = [('Rio', 'https://www.zomato.com/rio/restaurants'),
          ('Sao Paulo', 'https://www.zomato.com/sao-paulo-sp/restaurants'),
          ('Brasilia', 'https://www.zomato.com/brasilia/restaurants'),
          ('Porto Alegre', 'https://www.zomato.com/portoalegre/restaurants'),
          ('Salvador', 'https://www.zomato.com/salvador/restaurants')]
headers = ['Vendor Name', 'Country', 'City', 'Address', 'Area', 'Coordinates', 'URL',
           'Delivery', 'Contact no.', 'Operating Hours', 'Description', 'Cuisines',
           'Price Range', 'Payment Options', 'Ratings', 'Reviews']
q = Queue.Queue()
threads = []
scribethread = Thread(target=scribe, args=(q,))
scribethread.daemon = True
scribethread.start()
for cityname, citylink in cities:
    for pno, pagelink in enumerate(paginate(citylink), start=1):
        print 'processing page', pno, 'of', cityname
        for vlink in fetch_vendorlinks(pagelink):
            # One thread per restaurant page; the scribe thread is the
            # only CSV writer, so no locking is needed around the file.
            t = Thread(target=process, args=(vlink, cityname, q))
            t.daemon = True
            t.start()
            threads.append(t)
# Wait for every scraper thread to finish before stopping the writer.
while [thread for thread in threads if thread.isAlive()]:
    sleep(1)
for thread in threads:
    thread.join()
q.put('STOP')
scribethread.join()  # let the writer drain the queue before the process exits
print 'All Done'