Created
February 8, 2016 23:25
-
-
Save VB16/0827905b1e8f4a0a76ff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# zomato final" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"from bs4 import BeautifulSoup\n", | |
"import urllib\n", | |
"import pandas\n", | |
"from pandas import DataFrame\n", | |
"import re\n", | |
"\n", | |
"# Getting html elements from zomato website using BeautifulSoup\n", | |
"url = 'https://www.zomato.com/new-york-city/top-restaurants'\n", | |
"htmlt=urllib.urlopen(url).read()\n", | |
"s=BeautifulSoup(htmlt)\n", | |
"\n", | |
"# Scraping Restaurant Names using Regex\n", | |
"reg = '<span>(.+?)</span>'\n", | |
"tags=s('span')\n", | |
"q=re.findall(reg,htmlt)\n", | |
"reg2 = '<span>(.+?)</span>'\n", | |
"q=re.findall(reg2,str(tags))\n", | |
"name=q[4:] # removing 1st 4 lines\n", | |
"\n", | |
"# Scraping ratings: \n", | |
"rat=s.findAll(\"div\",attrs = {\"class\" : \"collection_res_details_wrap\"})\n", | |
"regr='\\d+\\.\\d' # lotta time in figutinr out this\n", | |
"ratt=re.findall(regr,str(rat))\n", | |
"cui=s.findAll(\"div\", attrs= {\"class\" : \"top-res-box-cuisine2\"})\n", | |
"\n", | |
"# Scraping Cuisines\n", | |
"regcui='</span>(.+?)</div>'\n", | |
"cuis=re.findall(regcui,str(cui))\n", | |
"\n", | |
"# Scraping address \n", | |
"ad=s.findAll(\"div\", attrs= {\"class\" : \"top-res-box-zone\"})\n", | |
"addr=[]\n", | |
"for i in ad:\n", | |
" addr.append(i.text.strip())\n", | |
"\n", | |
"# Scraping count of Reviews\n", | |
"rev=s.findAll(\"div\", attrs= {\"class\" : \"clearfix res-box-menu\"})\n", | |
"\n", | |
"# Some restaurants have menus - below is a code to get rid of it!! Could have used string functions, but i wanted to waste time playing with strings:/\n", | |
"rew=[]\n", | |
"for o in rev:\n", | |
" stp=o.text.strip()\n", | |
" if len(stp)>11:\n", | |
" li=[]\n", | |
" word=stp\n", | |
" i=len(word)-1\n", | |
" for p in range(10):\n", | |
" li.append(word[i])\n", | |
" count = 0\n", | |
" i=i-1\n", | |
" s = ''.join(li)\n", | |
" w=str(s)\n", | |
" revstg=w[::-1]\n", | |
" rew.append(revstg)\n", | |
" else : rew.append(o.text.strip())\n", | |
" \n", | |
" #rew.append(o.text.strip())\n", | |
"\n", | |
"\n", | |
"# Creating a DataFrame for restaurants\n", | |
"d1={\"names\": name , \"ratings\":ratt, \"cuisine\":cuis, \"address\" : addr, \"crev\" : rew}\n", | |
"zr=DataFrame(d1)\n", | |
"\n", | |
"# Creating a csv file which will be imported in tableau for viualisation\n", | |
"zr.to_csv('C:/vr/tp/py/Projects/new-york-city.csv', sep=',')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment