Skip to content

Instantly share code, notes, and snippets.

@VB16
Created February 8, 2016 23:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save VB16/0827905b1e8f4a0a76ff to your computer and use it in GitHub Desktop.
Save VB16/0827905b1e8f4a0a76ff to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# zomato final"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\n",
"from bs4 import BeautifulSoup\n",
"import urllib\n",
"import pandas\n",
"from pandas import DataFrame\n",
"import re\n",
"\n",
"# Scrape the Zomato top-restaurants page for New York City and export one row\n",
"# per restaurant (name, rating, cuisine, address, review count) to a CSV file.\n",
"# NOTE: this cell targets Python 2 (urllib.urlopen). The tag/class selectors\n",
"# and regexes below depend on the page markup and must be re-checked if the\n",
"# site layout changes.\n",
"\n",
"# Download the page; close the connection even if the read fails.\n",
"url = 'https://www.zomato.com/new-york-city/top-restaurants'\n",
"response = urllib.urlopen(url)\n",
"try:\n",
"    htmlt = response.read()\n",
"finally:\n",
"    response.close()\n",
"\n",
"# Name the parser explicitly so the parse tree does not depend on which\n",
"# optional parsers happen to be installed on the machine.\n",
"s = BeautifulSoup(htmlt, 'html.parser')\n",
"\n",
"# Restaurant names: the text of every attribute-less <span>. The first four\n",
"# matches are dropped (presumably page chrome rather than restaurants --\n",
"# verify against the live page).\n",
"tags = s('span')\n",
"q = re.findall(r'<span>(.+?)</span>', str(tags))\n",
"name = q[4:]\n",
"\n",
"# Ratings: every \"digits.digit\" number found inside the details wrappers.\n",
"rat = s.findAll(\"div\", attrs={\"class\": \"collection_res_details_wrap\"})\n",
"ratt = re.findall(r'\\d+\\.\\d', str(rat))\n",
"\n",
"# Cuisines: the text between the closing </span> and the closing </div>.\n",
"cui = s.findAll(\"div\", attrs={\"class\": \"top-res-box-cuisine2\"})\n",
"cuis = re.findall(r'</span>(.+?)</div>', str(cui))\n",
"\n",
"# Addresses: stripped text of each zone box.\n",
"ad = s.findAll(\"div\", attrs={\"class\": \"top-res-box-zone\"})\n",
"addr = [zone.text.strip() for zone in ad]\n",
"\n",
"# Review counts. Some restaurants also list a menu in the same box; when the\n",
"# text is longer than 11 characters only its last 10 characters are kept,\n",
"# which drops that leading menu text. The text stays unicode (no str() call)\n",
"# so non-ASCII characters cannot raise UnicodeEncodeError, and the soup\n",
"# object `s` is no longer overwritten by a temporary string.\n",
"rev = s.findAll(\"div\", attrs={\"class\": \"clearfix res-box-menu\"})\n",
"rew = []\n",
"for box in rev:\n",
"    text = box.text.strip()\n",
"    rew.append(text[-10:] if len(text) > 11 else text)\n",
"\n",
"# Creating a DataFrame for restaurants. DataFrame raises ValueError when the\n",
"# column lists differ in length, i.e. when one of the scrapes above missed\n",
"# or over-matched entries.\n",
"d1 = {\"names\": name, \"ratings\": ratt, \"cuisine\": cuis, \"address\": addr, \"crev\": rew}\n",
"zr = DataFrame(d1)\n",
"\n",
"# Creating a csv file which will be imported in tableau for visualisation\n",
"zr.to_csv('C:/vr/tp/py/Projects/new-york-city.csv', sep=',')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment