VB16/zomato

## zomato
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Zomato final: New York City top restaurants\n",
    "\n",
    "Scrapes https://www.zomato.com/new-york-city/top-restaurants and collects each restaurant's name, rating, cuisine, address and review count into a CSV file that is then imported into Tableau for visualisation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "\n",
    "from bs4 import BeautifulSoup\n",
    "import urllib\n",
    "import pandas\n",
    "from pandas import DataFrame\n",
    "import re\n",
    "\n",
    "# Page to scrape, and the CSV file that is later imported into Tableau.\n",
    "# NOTE: csv_path is an absolute, machine-specific location; adjust it when running elsewhere.\n",
    "url = 'https://www.zomato.com/new-york-city/top-restaurants'\n",
    "csv_path = 'C:/vr/tp/py/Projects/new-york-city.csv'\n",
    "\n",
    "# Download the page, closing the connection even if the read fails.\n",
    "response = urllib.urlopen(url)\n",
    "try:\n",
    "    htmlt = response.read()\n",
    "finally:\n",
    "    response.close()\n",
    "\n",
    "# Parse with an explicitly named parser so the result does not depend on\n",
    "# which optional parsers happen to be installed.\n",
    "s = BeautifulSoup(htmlt, 'html.parser')\n",
    "\n",
    "# Restaurant names: the text of every attribute-less <span>...</span> on the page.\n",
    "reg = '<span>(.+?)</span>'\n",
    "tags = s('span')\n",
    "q = re.findall(reg, str(tags))\n",
    "name = q[4:]  # the first 4 matches are dropped (presumably not restaurants -- verify against the page)\n",
    "\n",
    "# Ratings: every number with one decimal digit (e.g. 4.5) inside the details wrappers.\n",
    "rat = s.findAll(\"div\", attrs={\"class\": \"collection_res_details_wrap\"})\n",
    "regr = r'\\d+\\.\\d'\n",
    "ratt = re.findall(regr, str(rat))\n",
    "\n",
    "# Cuisines: whatever sits between a closing </span> and the closing </div>.\n",
    "cui = s.findAll(\"div\", attrs={\"class\": \"top-res-box-cuisine2\"})\n",
    "regcui = '</span>(.+?)</div>'\n",
    "cuis = re.findall(regcui, str(cui))\n",
    "\n",
    "# Addresses: stripped text of each zone box.\n",
    "ad = s.findAll(\"div\", attrs={\"class\": \"top-res-box-zone\"})\n",
    "addr = [box.text.strip() for box in ad]\n",
    "\n",
    "# Review counts. Some boxes also carry menu text in front of the count, so any\n",
    "# text longer than 11 characters is cut down to its last 10 characters.\n",
    "rev = s.findAll(\"div\", attrs={\"class\": \"clearfix res-box-menu\"})\n",
    "rew = []\n",
    "for o in rev:\n",
    "    stp = o.text.strip()\n",
    "    rew.append(stp[-10:] if len(stp) > 11 else stp)\n",
    "\n",
    "# Build the restaurant table. Every column must have the same number of entries;\n",
    "# report the per-column counts up front so a scraping mismatch is easy to locate.\n",
    "d1 = {\"names\": name, \"ratings\": ratt, \"cuisine\": cuis, \"address\": addr, \"crev\": rew}\n",
    "lengths = dict((key, len(values)) for key, values in d1.items())\n",
    "if len(set(lengths.values())) > 1:\n",
    "    raise ValueError('scraped columns differ in length: %r' % lengths)\n",
    "zr = DataFrame(d1)\n",
    "\n",
    "# Write the CSV file that will be imported into Tableau for visualisation.\n",
    "zr.to_csv(csv_path, sep=',')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Zomato final: New York City top restaurants\n",
	"\n",
	"Scrapes https://www.zomato.com/new-york-city/top-restaurants and collects each restaurant's name, rating, cuisine, address and review count into a CSV file that is then imported into Tableau for visualisation."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"\n",
	"from bs4 import BeautifulSoup\n",
	"import urllib\n",
	"import pandas\n",
	"from pandas import DataFrame\n",
	"import re\n",
	"\n",
	"# Page to scrape, and the CSV file that is later imported into Tableau.\n",
	"# NOTE: csv_path is an absolute, machine-specific location; adjust it when running elsewhere.\n",
	"url = 'https://www.zomato.com/new-york-city/top-restaurants'\n",
	"csv_path = 'C:/vr/tp/py/Projects/new-york-city.csv'\n",
	"\n",
	"# Download the page, closing the connection even if the read fails.\n",
	"response = urllib.urlopen(url)\n",
	"try:\n",
	"    htmlt = response.read()\n",
	"finally:\n",
	"    response.close()\n",
	"\n",
	"# Parse with an explicitly named parser so the result does not depend on\n",
	"# which optional parsers happen to be installed.\n",
	"s = BeautifulSoup(htmlt, 'html.parser')\n",
	"\n",
	"# Restaurant names: the text of every attribute-less <span>...</span> on the page.\n",
	"reg = '<span>(.+?)</span>'\n",
	"tags = s('span')\n",
	"q = re.findall(reg, str(tags))\n",
	"name = q[4:]  # the first 4 matches are dropped (presumably not restaurants -- verify against the page)\n",
	"\n",
	"# Ratings: every number with one decimal digit (e.g. 4.5) inside the details wrappers.\n",
	"rat = s.findAll(\"div\", attrs={\"class\": \"collection_res_details_wrap\"})\n",
	"regr = r'\\d+\\.\\d'\n",
	"ratt = re.findall(regr, str(rat))\n",
	"\n",
	"# Cuisines: whatever sits between a closing </span> and the closing </div>.\n",
	"cui = s.findAll(\"div\", attrs={\"class\": \"top-res-box-cuisine2\"})\n",
	"regcui = '</span>(.+?)</div>'\n",
	"cuis = re.findall(regcui, str(cui))\n",
	"\n",
	"# Addresses: stripped text of each zone box.\n",
	"ad = s.findAll(\"div\", attrs={\"class\": \"top-res-box-zone\"})\n",
	"addr = [box.text.strip() for box in ad]\n",
	"\n",
	"# Review counts. Some boxes also carry menu text in front of the count, so any\n",
	"# text longer than 11 characters is cut down to its last 10 characters.\n",
	"rev = s.findAll(\"div\", attrs={\"class\": \"clearfix res-box-menu\"})\n",
	"rew = []\n",
	"for o in rev:\n",
	"    stp = o.text.strip()\n",
	"    rew.append(stp[-10:] if len(stp) > 11 else stp)\n",
	"\n",
	"# Build the restaurant table. Every column must have the same number of entries;\n",
	"# report the per-column counts up front so a scraping mismatch is easy to locate.\n",
	"d1 = {\"names\": name, \"ratings\": ratt, \"cuisine\": cuis, \"address\": addr, \"crev\": rew}\n",
	"lengths = dict((key, len(values)) for key, values in d1.items())\n",
	"if len(set(lengths.values())) > 1:\n",
	"    raise ValueError('scraped columns differ in length: %r' % lengths)\n",
	"zr = DataFrame(d1)\n",
	"\n",
	"# Write the CSV file that will be imported into Tableau for visualisation.\n",
	"zr.to_csv(csv_path, sep=',')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.11"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}