jp-um/gist:39c915e34692c96496780f5ab414276b

## gistfile1.txt
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#extract data from tripadvisor website and use beautiful soup\n",
    "#extract textual data\n",
    "#save into mongodb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from collections import Counter\n",
    "from string import punctuation\n",
    "import json\n",
    "import pymongo # this gives a security warning in some cases\n",
    "import glob\n",
    "import re\n",
    "from nltk.corpus import stopwords\n",
    "from nltk import bigrams\n",
    "from nltk import trigrams\n",
    "from nltk import ngrams\n",
    "import string \n",
    "\n",
    "# NOTE: this cell calls cleanhtml() and getStatusFromReview(), which are defined in\n",
    "# later cells of this notebook; on a fresh kernel run those cells before this one.\n",
    "\n",
    "paths = glob.glob(\"TripAdvisorJson/json/json/*.json\")\n",
    "\n",
    "client = pymongo.MongoClient('mongodb://localhost:27017/') # connect to DB\n",
    "client.database_names() # show names\n",
    "#if 'big_data_processing' in client.database_names(): # if data_science already exists, drop\n",
    "#    client.drop_database('big_data_processing')\n",
    "# db has to be bound here: the authorsDB collection below is looked up on it.\n",
    "db = client.big_data_processing\n",
    "\n",
    "#data will contain user profiles to be written to JSON file\n",
    "data = {}\n",
    "data['author'] = []\n",
    "\n",
    "authorsDB = db.authorsDB\n",
    "\n",
    "with open('authors1.json') as data_file:\n",
    "    authors = json.load(data_file)\n",
    "\n",
    "for author in authors:\n",
    "    link = \"https://www.tripadvisor.com/members/\" + author\n",
    "    r = requests.get(link)\n",
    "    # Name the parser explicitly so the result does not depend on which parsers are installed.\n",
    "    PageContent = BeautifulSoup(r.content, 'html.parser')\n",
    "    AgeSince = PageContent.find('div', {\"class\": \"ageSince\"})\n",
    "\n",
    "    # Profiles without an ageSince block are skipped entirely.\n",
    "    if AgeSince is None:\n",
    "        continue\n",
    "\n",
    "    AgeElems = AgeSince.find_all('p')\n",
    "\n",
    "    gender = ''\n",
    "    age = ''\n",
    "    nationality = ''\n",
    "\n",
    "    if len(AgeElems) > 1:\n",
    "        profileText = str(AgeElems[1])\n",
    "        if 'year old' in profileText:\n",
    "            # Stays a list of digit strings (e.g. ['65']); the map-reduce query in a later cell matches that shape.\n",
    "            age = re.findall(r'\\d+', profileText)\n",
    "        # 'male' is a substring of 'female', so test the more specific labels first.\n",
    "        if 'Another gender identity' in profileText:\n",
    "            gender = 'Another gender identity'\n",
    "        elif 'female' in profileText:\n",
    "            gender = 'female'\n",
    "        elif 'male' in profileText:\n",
    "            gender = 'male'\n",
    "\n",
    "    homeTown = PageContent.find('div', {\"class\": \"hometown\"})\n",
    "    # Read the hometown only when the block exists (the test used to be inverted and\n",
    "    # dereferenced None).\n",
    "    if homeTown is not None:\n",
    "        Countries = homeTown.find_all('p')\n",
    "        if Countries:\n",
    "            nationality = Countries[0].get_text()\n",
    "\n",
    "    reviews = ''\n",
    "    visitedLocations = []\n",
    "    hotelsVisited = []\n",
    "    #--------------------------------------------\n",
    "    for path in paths:\n",
    "        with open(path) as data_file:\n",
    "            jsondata = json.load(data_file)\n",
    "\n",
    "        hotelAddress = ''\n",
    "        for review in jsondata[\"Reviews\"]:\n",
    "            if review[\"Author\"] != author:\n",
    "                continue\n",
    "\n",
    "            if 'Address' in jsondata[\"HotelInfo\"]:\n",
    "                hotelAddress = cleanhtml(jsondata[\"HotelInfo\"][\"Address\"])\n",
    "\n",
    "            hotelsVisited.append(jsondata[\"HotelInfo\"][\"HotelID\"])\n",
    "            visitedLocations.append(hotelAddress)\n",
    "            # Accumulate every review by this author; resetting here would keep only the last one.\n",
    "            reviews = reviews + ' ' + review[\"Content\"]\n",
    "    #--------------------------------------------\n",
    "\n",
    "    status = getStatusFromReview(reviews)\n",
    "\n",
    "    authorReview = {\n",
    "        'name': author,\n",
    "        'age': age,\n",
    "        'gender': gender,\n",
    "        'status' : status,\n",
    "        'visitedLocations' : visitedLocations,\n",
    "        'hotelsVisited' : hotelsVisited,\n",
    "        'Nationality': nationality,\n",
    "        'reviews' : reviews\n",
    "    }\n",
    "\n",
    "    data['author'].append(authorReview)\n",
    "\n",
    "    # insert_one() adds an '_id' key to the dict it receives; insert a shallow copy so the\n",
    "    # entry kept in data['author'] is left without it.\n",
    "    authorsDB.insert_one(dict(authorReview))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#extract textual data using nlp techniques\n",
    "reviews_extraction = {}\n",
    "def getStatusFromReview(review):\n",
    "    \"\"\"Guess an author's household status from the text of their reviews.\n",
    "\n",
    "    The lower-cased, punctuation-free text is scanned for keywords and short\n",
    "    phrases; every collected word is then counted towards each category whose\n",
    "    word list contains it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    review : str or None\n",
    "        Review text (several reviews may be concatenated).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        'single', 'married' or 'married_children' when that category has strictly\n",
    "        more hits than both others; '' when review is None or there is no clear winner.\n",
    "    \"\"\"\n",
    "    if review is None:\n",
    "        return ''\n",
    "\n",
    "    # strip_punctuation() returns a new string, so its result has to be kept.\n",
    "    review = strip_punctuation(review).lower()\n",
    "\n",
    "    review_split = review.split()\n",
    "    review_bigrams = list(bigrams(review_split))\n",
    "    review_trigrams = list(trigrams(review_split))\n",
    "    review_fivegrams = ngrams(review_split, 5)\n",
    "\n",
    "    # Every entry is followed by a comma: adjacent string literals would silently be\n",
    "    # concatenated into a single word such as 'fiancegirlfriend'.\n",
    "    possible_words_single = [\n",
    "        'single', 'club', 'clubs', 'friends', 'friend', 'peers', 'peer',\n",
    "        'mates', 'mate', 'buddies', 'buddy',\n",
    "    ]\n",
    "    possible_words_married = [\n",
    "        'husband', 'newlyweds', 'hubbie', 'hubby', 'wife', 'lady', 'man', 'spouse',\n",
    "        'family', 'sons', 'honeymoon', 'grandmother', 'grandfather', 'inlaws', 'inlaw',\n",
    "        'father', 'fathers', 'mother', 'mothers', 'wedding', 'significant other',\n",
    "        'fiance', 'girlfriend', 'boyfriend', 'woman', 'engaged', 'married', 'romantic',\n",
    "    ]\n",
    "    possible_words_married_children = [\n",
    "        'husband', 'hubbie', 'hubby', 'wife', 'lady', 'man', 'spouse', 'family', 'sons',\n",
    "        'son', 'daughter', 'daughters', 'children', 'child', 'kids', 'kid',\n",
    "        'household', 'baby', 'babies', 'infant', 'infants', 'teenager', 'teenagers',\n",
    "        'honeymoon', 'grandmother', 'grandfather', 'inlaws', 'inlaw',\n",
    "        'father', 'fathers', 'mother', 'mothers', 'wedding', 'significant other',\n",
    "        'fiance', 'girlfriend', 'boyfriend', 'woman', 'engaged', 'married', 'romantic',\n",
    "    ]\n",
    "    possible_words = [\n",
    "        'husband', 'hubbie', 'hubby', 'wife', 'lady', 'man', 'spouse', 'family', 'sons',\n",
    "        'son', 'daughter', 'daughters', 'children', 'child', 'kids', 'kid',\n",
    "        'household', 'friends', 'friend', 'peers', 'peer', 'mates', 'mate',\n",
    "        'buddies', 'buddy', 'baby', 'babies', 'infant', 'infants', 'teenager',\n",
    "        'teenagers', 'honeymoon', 'grandmother', 'grandfather', 'inlaws', 'inlaw',\n",
    "        'father', 'fathers', 'mother', 'mothers', 'wedding', 'significant other',\n",
    "        'fiance', 'girlfriend', 'boyfriend', 'woman',\n",
    "    ]\n",
    "\n",
    "    keywords = [\n",
    "        'married', 'newlyweds', 'single', 'engaged', 'bars', 'clubs', 'romantic',\n",
    "        'sightseeing', 'shopping', 'honeymoon', 'leisure', 'pleasure', 'kingbed',\n",
    "    ]\n",
    "    types_rooms = ['single', 'double', 'twin', 'king', 'queen', 'deluxe']\n",
    "    rooms_beds = ['rooms', 'room', 'beds', 'bed']\n",
    "    determiners = ['my', 'our', 'the']\n",
    "    numbers = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',\n",
    "               'ten', 'eleven', 'twelve', 'thirteen']\n",
    "    group_nouns = ['adult', 'children', 'people', 'adults', 'child', 'person', 'persons',\n",
    "                   'individuals']\n",
    "    age_markers = ['yo', 'yr', 'year', 'old']\n",
    "\n",
    "    # Stand-alone keywords.\n",
    "    reviews_extraction = [word for word in review_split if word in keywords]\n",
    "\n",
    "    for first, second in review_bigrams:\n",
    "        # '<room type> <room/bed>'\n",
    "        if first in types_rooms and second in rooms_beds:\n",
    "            reviews_extraction.append(second)\n",
    "        # 'my/our/the <companion>'\n",
    "        if first in determiners and second in possible_words:\n",
    "            reviews_extraction.append(second)\n",
    "        # '<number> <people>'\n",
    "        if (first in numbers or first.isdigit()) and second in group_nouns:\n",
    "            reviews_extraction.append(second)\n",
    "        # NOTE(review): phrases are stored as lists, so they never equal a single word in\n",
    "        # the category lists and do not affect the counts below - confirm the intent.\n",
    "        if second == 'trip':\n",
    "            reviews_extraction.append([first, second])\n",
    "\n",
    "    for first, second, third in review_trigrams:\n",
    "        if first in determiners or first == 'with':\n",
    "            if third in age_markers or third in possible_words:\n",
    "                reviews_extraction.append([first, second, third])\n",
    "\n",
    "    for gram in review_fivegrams:\n",
    "        if gram[0] in determiners and gram[4] in possible_words:\n",
    "            reviews_extraction.append(gram[4])\n",
    "\n",
    "    single = sum(1 for r in reviews_extraction if r in possible_words_single)\n",
    "    married = sum(1 for r in reviews_extraction if r in possible_words_married)\n",
    "    marriedChildren = sum(1 for r in reviews_extraction if r in possible_words_married_children)\n",
    "\n",
    "    if single > married and single > marriedChildren:\n",
    "        return 'single'\n",
    "    if married > single and married > marriedChildren:\n",
    "        return 'married'\n",
    "    if marriedChildren > single and marriedChildren > married:\n",
    "        return 'married_children'\n",
    "    # Tie or no evidence: same 'unknown' value as the None case instead of an implicit None.\n",
    "    return ''\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#clean hotel address\n",
    "def cleanhtml(raw_html):\n",
    "    \"\"\"Return raw_html with every '<...>' span removed (shortest match per span).\"\"\"\n",
    "    tag_pattern = '<.*?>'\n",
    "    return re.sub(tag_pattern, '', raw_html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#strip punctuation from review\n",
    "from string import punctuation\n",
    "def strip_punctuation(s):\n",
    "    \"\"\"Return s without the characters listed in string.punctuation.\"\"\"\n",
    "    kept = [ch for ch in s if ch not in punctuation]\n",
    "    return ''.join(kept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#mapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from bson.code import Code\n",
    "\n",
    "# Map step (JavaScript): emit (hotel id, 1) for every entry of a document's\n",
    "# hotelsVisited array.\n",
    "mapper = Code(\"\"\"\n",
    "    function () {\n",
    "      this.hotelsVisited.forEach(function(z) {\n",
    "        emit(z, 1);\n",
    "      });\n",
    "    }\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#reducer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Reduce step (JavaScript): add up all values emitted for one key.\n",
    "reducer = Code(\"\"\"\n",
    "    function (key, values) {\n",
    "      var total = 0;\n",
    "      for (var i = 0; i < values.length; i++) {\n",
    "        total += values[i];\n",
    "      }\n",
    "      return total;\n",
    "    }\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#run query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Run the map-reduce on the authorsDB collection (the name bound in the scraping cell),\n",
    "# restricted to documents whose age is ['65'] and whose gender is 'female'.\n",
    "# The third argument, \"13\", is passed as the output collection name.\n",
    "result = authorsDB.map_reduce(\n",
    "    mapper, reducer, \"13\", query={\"age\": [\"65\"], \"gender\": \"female\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#sort values descending order "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Largest 'value' first.\n",
    "results_sorted = result.find().sort('value', pymongo.DESCENDING)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Turn the cursor into (_id, value) pairs, keeping the cursor's order.\n",
    "sorted_list_results = [(doc['_id'], doc['value']) for doc in results_sorted]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#save to json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "# Write the (_id, value) pairs as JSON; each tuple is serialised as a two-element array.\n",
    "with open('data_65FemaleFinal.json', 'w') as outfile:\n",
    "    outfile.write(json.dumps(sorted_list_results))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#extract data from tripadvisor website and use beautiful soup\n",
	"#extract textual data\n",
	"#save into mongodb"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import requests\n",
	"from bs4 import BeautifulSoup\n",
	"from collections import Counter\n",
	"from string import punctuation\n",
	"import json\n",
	"import pymongo # this gives a security warning in some cases\n",
	"import glob\n",
	"import re\n",
	"from nltk.corpus import stopwords\n",
	"from nltk import bigrams\n",
	"from nltk import trigrams\n",
	"from nltk import ngrams\n",
	"import string \n",
	"\n",
	"# NOTE: this cell calls cleanhtml() and getStatusFromReview(), which are defined in\n",
	"# later cells of this notebook; on a fresh kernel run those cells before this one.\n",
	"\n",
	"paths = glob.glob(\"TripAdvisorJson/json/json/*.json\")\n",
	"\n",
	"client = pymongo.MongoClient('mongodb://localhost:27017/') # connect to DB\n",
	"client.database_names() # show names\n",
	"#if 'big_data_processing' in client.database_names(): # if data_science already exists, drop\n",
	"#    client.drop_database('big_data_processing')\n",
	"# db has to be bound here: the authorsDB collection below is looked up on it.\n",
	"db = client.big_data_processing\n",
	"\n",
	"#data will contain user profiles to be written to JSON file\n",
	"data = {}\n",
	"data['author'] = []\n",
	"\n",
	"authorsDB = db.authorsDB\n",
	"\n",
	"with open('authors1.json') as data_file:\n",
	"    authors = json.load(data_file)\n",
	"\n",
	"for author in authors:\n",
	"    link = \"https://www.tripadvisor.com/members/\" + author\n",
	"    r = requests.get(link)\n",
	"    # Name the parser explicitly so the result does not depend on which parsers are installed.\n",
	"    PageContent = BeautifulSoup(r.content, 'html.parser')\n",
	"    AgeSince = PageContent.find('div', {\"class\": \"ageSince\"})\n",
	"\n",
	"    # Profiles without an ageSince block are skipped entirely.\n",
	"    if AgeSince is None:\n",
	"        continue\n",
	"\n",
	"    AgeElems = AgeSince.find_all('p')\n",
	"\n",
	"    gender = ''\n",
	"    age = ''\n",
	"    nationality = ''\n",
	"\n",
	"    if len(AgeElems) > 1:\n",
	"        profileText = str(AgeElems[1])\n",
	"        if 'year old' in profileText:\n",
	"            # Stays a list of digit strings (e.g. ['65']); the map-reduce query in a later cell matches that shape.\n",
	"            age = re.findall(r'\\d+', profileText)\n",
	"        # 'male' is a substring of 'female', so test the more specific labels first.\n",
	"        if 'Another gender identity' in profileText:\n",
	"            gender = 'Another gender identity'\n",
	"        elif 'female' in profileText:\n",
	"            gender = 'female'\n",
	"        elif 'male' in profileText:\n",
	"            gender = 'male'\n",
	"\n",
	"    homeTown = PageContent.find('div', {\"class\": \"hometown\"})\n",
	"    # Read the hometown only when the block exists (the test used to be inverted and\n",
	"    # dereferenced None).\n",
	"    if homeTown is not None:\n",
	"        Countries = homeTown.find_all('p')\n",
	"        if Countries:\n",
	"            nationality = Countries[0].get_text()\n",
	"\n",
	"    reviews = ''\n",
	"    visitedLocations = []\n",
	"    hotelsVisited = []\n",
	"    #--------------------------------------------\n",
	"    for path in paths:\n",
	"        with open(path) as data_file:\n",
	"            jsondata = json.load(data_file)\n",
	"\n",
	"        hotelAddress = ''\n",
	"        for review in jsondata[\"Reviews\"]:\n",
	"            if review[\"Author\"] != author:\n",
	"                continue\n",
	"\n",
	"            if 'Address' in jsondata[\"HotelInfo\"]:\n",
	"                hotelAddress = cleanhtml(jsondata[\"HotelInfo\"][\"Address\"])\n",
	"\n",
	"            hotelsVisited.append(jsondata[\"HotelInfo\"][\"HotelID\"])\n",
	"            visitedLocations.append(hotelAddress)\n",
	"            # Accumulate every review by this author; resetting here would keep only the last one.\n",
	"            reviews = reviews + ' ' + review[\"Content\"]\n",
	"    #--------------------------------------------\n",
	"\n",
	"    status = getStatusFromReview(reviews)\n",
	"\n",
	"    authorReview = {\n",
	"        'name': author,\n",
	"        'age': age,\n",
	"        'gender': gender,\n",
	"        'status' : status,\n",
	"        'visitedLocations' : visitedLocations,\n",
	"        'hotelsVisited' : hotelsVisited,\n",
	"        'Nationality': nationality,\n",
	"        'reviews' : reviews\n",
	"    }\n",
	"\n",
	"    data['author'].append(authorReview)\n",
	"\n",
	"    # insert_one() adds an '_id' key to the dict it receives; insert a shallow copy so the\n",
	"    # entry kept in data['author'] is left without it.\n",
	"    authorsDB.insert_one(dict(authorReview))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#extract textual data using nlp techniques\n",
	"reviews_extraction = {}\n",
	"def getStatusFromReview(review):\n",
	"    \"\"\"Guess an author's household status from the text of their reviews.\n",
	"\n",
	"    The lower-cased, punctuation-free text is scanned for keywords and short\n",
	"    phrases; every collected word is then counted towards each category whose\n",
	"    word list contains it.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    review : str or None\n",
	"        Review text (several reviews may be concatenated).\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    str\n",
	"        'single', 'married' or 'married_children' when that category has strictly\n",
	"        more hits than both others; '' when review is None or there is no clear winner.\n",
	"    \"\"\"\n",
	"    if review is None:\n",
	"        return ''\n",
	"\n",
	"    # strip_punctuation() returns a new string, so its result has to be kept.\n",
	"    review = strip_punctuation(review).lower()\n",
	"\n",
	"    review_split = review.split()\n",
	"    review_bigrams = list(bigrams(review_split))\n",
	"    review_trigrams = list(trigrams(review_split))\n",
	"    review_fivegrams = ngrams(review_split, 5)\n",
	"\n",
	"    # Every entry is followed by a comma: adjacent string literals would silently be\n",
	"    # concatenated into a single word such as 'fiancegirlfriend'.\n",
	"    possible_words_single = [\n",
	"        'single', 'club', 'clubs', 'friends', 'friend', 'peers', 'peer',\n",
	"        'mates', 'mate', 'buddies', 'buddy',\n",
	"    ]\n",
	"    possible_words_married = [\n",
	"        'husband', 'newlyweds', 'hubbie', 'hubby', 'wife', 'lady', 'man', 'spouse',\n",
	"        'family', 'sons', 'honeymoon', 'grandmother', 'grandfather', 'inlaws', 'inlaw',\n",
	"        'father', 'fathers', 'mother', 'mothers', 'wedding', 'significant other',\n",
	"        'fiance', 'girlfriend', 'boyfriend', 'woman', 'engaged', 'married', 'romantic',\n",
	"    ]\n",
	"    possible_words_married_children = [\n",
	"        'husband', 'hubbie', 'hubby', 'wife', 'lady', 'man', 'spouse', 'family', 'sons',\n",
	"        'son', 'daughter', 'daughters', 'children', 'child', 'kids', 'kid',\n",
	"        'household', 'baby', 'babies', 'infant', 'infants', 'teenager', 'teenagers',\n",
	"        'honeymoon', 'grandmother', 'grandfather', 'inlaws', 'inlaw',\n",
	"        'father', 'fathers', 'mother', 'mothers', 'wedding', 'significant other',\n",
	"        'fiance', 'girlfriend', 'boyfriend', 'woman', 'engaged', 'married', 'romantic',\n",
	"    ]\n",
	"    possible_words = [\n",
	"        'husband', 'hubbie', 'hubby', 'wife', 'lady', 'man', 'spouse', 'family', 'sons',\n",
	"        'son', 'daughter', 'daughters', 'children', 'child', 'kids', 'kid',\n",
	"        'household', 'friends', 'friend', 'peers', 'peer', 'mates', 'mate',\n",
	"        'buddies', 'buddy', 'baby', 'babies', 'infant', 'infants', 'teenager',\n",
	"        'teenagers', 'honeymoon', 'grandmother', 'grandfather', 'inlaws', 'inlaw',\n",
	"        'father', 'fathers', 'mother', 'mothers', 'wedding', 'significant other',\n",
	"        'fiance', 'girlfriend', 'boyfriend', 'woman',\n",
	"    ]\n",
	"\n",
	"    keywords = [\n",
	"        'married', 'newlyweds', 'single', 'engaged', 'bars', 'clubs', 'romantic',\n",
	"        'sightseeing', 'shopping', 'honeymoon', 'leisure', 'pleasure', 'kingbed',\n",
	"    ]\n",
	"    types_rooms = ['single', 'double', 'twin', 'king', 'queen', 'deluxe']\n",
	"    rooms_beds = ['rooms', 'room', 'beds', 'bed']\n",
	"    determiners = ['my', 'our', 'the']\n",
	"    numbers = ['one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine',\n",
	"               'ten', 'eleven', 'twelve', 'thirteen']\n",
	"    group_nouns = ['adult', 'children', 'people', 'adults', 'child', 'person', 'persons',\n",
	"                   'individuals']\n",
	"    age_markers = ['yo', 'yr', 'year', 'old']\n",
	"\n",
	"    # Stand-alone keywords.\n",
	"    reviews_extraction = [word for word in review_split if word in keywords]\n",
	"\n",
	"    for first, second in review_bigrams:\n",
	"        # '<room type> <room/bed>'\n",
	"        if first in types_rooms and second in rooms_beds:\n",
	"            reviews_extraction.append(second)\n",
	"        # 'my/our/the <companion>'\n",
	"        if first in determiners and second in possible_words:\n",
	"            reviews_extraction.append(second)\n",
	"        # '<number> <people>'\n",
	"        if (first in numbers or first.isdigit()) and second in group_nouns:\n",
	"            reviews_extraction.append(second)\n",
	"        # NOTE(review): phrases are stored as lists, so they never equal a single word in\n",
	"        # the category lists and do not affect the counts below - confirm the intent.\n",
	"        if second == 'trip':\n",
	"            reviews_extraction.append([first, second])\n",
	"\n",
	"    for first, second, third in review_trigrams:\n",
	"        if first in determiners or first == 'with':\n",
	"            if third in age_markers or third in possible_words:\n",
	"                reviews_extraction.append([first, second, third])\n",
	"\n",
	"    for gram in review_fivegrams:\n",
	"        if gram[0] in determiners and gram[4] in possible_words:\n",
	"            reviews_extraction.append(gram[4])\n",
	"\n",
	"    single = sum(1 for r in reviews_extraction if r in possible_words_single)\n",
	"    married = sum(1 for r in reviews_extraction if r in possible_words_married)\n",
	"    marriedChildren = sum(1 for r in reviews_extraction if r in possible_words_married_children)\n",
	"\n",
	"    if single > married and single > marriedChildren:\n",
	"        return 'single'\n",
	"    if married > single and married > marriedChildren:\n",
	"        return 'married'\n",
	"    if marriedChildren > single and marriedChildren > married:\n",
	"        return 'married_children'\n",
	"    # Tie or no evidence: same 'unknown' value as the None case instead of an implicit None.\n",
	"    return ''\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#clean hotel address\n",
	"def cleanhtml(raw_html):\n",
	"    \"\"\"Return raw_html with every '<...>' span removed (shortest match per span).\"\"\"\n",
	"    tag_pattern = '<.*?>'\n",
	"    return re.sub(tag_pattern, '', raw_html)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#strip punctuation from review\n",
	"from string import punctuation\n",
	"def strip_punctuation(s):\n",
	"    \"\"\"Return s without the characters listed in string.punctuation.\"\"\"\n",
	"    kept = [ch for ch in s if ch not in punctuation]\n",
	"    return ''.join(kept)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#mapper"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from bson.code import Code\n",
	"\n",
	"# Map step (JavaScript): emit (hotel id, 1) for every entry of a document's\n",
	"# hotelsVisited array.\n",
	"mapper = Code(\"\"\"\n",
	"    function () {\n",
	"      this.hotelsVisited.forEach(function(z) {\n",
	"        emit(z, 1);\n",
	"      });\n",
	"    }\n",
	"\"\"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#reducer"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Reduce step (JavaScript): add up all values emitted for one key.\n",
	"reducer = Code(\"\"\"\n",
	"    function (key, values) {\n",
	"      var total = 0;\n",
	"      for (var i = 0; i < values.length; i++) {\n",
	"        total += values[i];\n",
	"      }\n",
	"      return total;\n",
	"    }\n",
	"\"\"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#run query"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Run the map-reduce on the authorsDB collection (the name bound in the scraping cell),\n",
	"# restricted to documents whose age is ['65'] and whose gender is 'female'.\n",
	"# The third argument, \"13\", is passed as the output collection name.\n",
	"result = authorsDB.map_reduce(\n",
	"    mapper, reducer, \"13\", query={\"age\": [\"65\"], \"gender\": \"female\"})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#sort values descending order "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Largest 'value' first.\n",
	"results_sorted = result.find().sort('value', pymongo.DESCENDING)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Turn the cursor into (_id, value) pairs, keeping the cursor's order.\n",
	"sorted_list_results = [(doc['_id'], doc['value']) for doc in results_sorted]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#save to json"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import json\n",
	"# Write the (_id, value) pairs as JSON; each tuple is serialised as a two-element array.\n",
	"with open('data_65FemaleFinal.json', 'w') as outfile:\n",
	"    outfile.write(json.dumps(sorted_list_results))"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [conda root]",
	"language": "python",
	"name": "conda-root-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}