Created
July 7, 2017 13:21
-
-
Save jp-um/39c915e34692c96496780f5ab414276b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#extract data from tripadvisor website and use beautiful soup\n", | |
"#extract textual data\n", | |
"#save into mongodb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"from collections import Counter\n", | |
"from string import punctuation\n", | |
"import json\n", | |
"import pymongo # this gives a security warning in some cases\n", | |
"import glob\n", | |
"import re\n", | |
"from nltk.corpus import stopwords\n", | |
"from nltk import bigrams\n", | |
"from nltk import trigrams\n", | |
"from nltk import ngrams\n", | |
"import string \n", | |
"\n", | |
"paths = (glob.glob(\"TripAdvisorJson/json/json/*.json\"))\n", | |
"\n", | |
"client = pymongo.MongoClient('mongodb://localhost:27017/') # connect to DB\n", | |
"client.database_names() # show names\n", | |
"#if 'big_data_processing' in client.database_names(): # if data_science already exists, drop\n", | |
"# client.drop_database('big_data_processing')\n", | |
"#db = client.big_data_processing\n", | |
"\n", | |
"#data will contain user profiles to be written to JSON file\n", | |
"data = {} \n", | |
"data['author'] = [] \n", | |
"\n", | |
"authorsDB = db.authorsDB\n", | |
"\n", | |
"with open('authors1.json') as data_file: \n", | |
" authors = json.load(data_file)\n", | |
" \n", | |
"for author in authors:\n", | |
"\t\n", | |
"\treviews = ''\n", | |
"\n", | |
"\tlink = \"https://www.tripadvisor.com/members/\" + author\n", | |
"\tr = requests.get(link)\n", | |
"\tPageContent = BeautifulSoup(r.content)\n", | |
"\tAgeSince = PageContent.find('div',{ \"class\" : \"ageSince\" })\n", | |
"\n", | |
"\tif(AgeSince is None):\n", | |
"\t\tcontinue\n", | |
" #print ()\n", | |
"\n", | |
"\tAgeElems = AgeSince.find_all('p')\n", | |
"\n", | |
"\tgender = ''\n", | |
"\tstatus = ''\n", | |
"\tage = ''\n", | |
"\tnationality = ''\n", | |
"\thotelId = ''\n", | |
"\n", | |
"\n", | |
"\tif(len(AgeElems)>1):\n", | |
"\t\tif 'year old' in str(AgeElems[1]):\n", | |
"\t\t\tage = re.findall(r'\\d+', str(AgeElems[1]))\n", | |
"\t\tif 'male' in str(AgeElems[1]):\n", | |
"\t\t\tgender = 'male'\n", | |
"\t\tif 'female' in str(AgeElems[1]):\n", | |
"\t\t\tgender = 'female'\n", | |
"\t\tif 'Another gender identity' in str(AgeElems[1]):\n", | |
"\t\t\tgender = 'Another gender identity'\n", | |
"\n", | |
"\thomeTown = PageContent.find('div',{ \"class\" : \"hometown\" })\n", | |
"\tif(homeTown is None):\n", | |
"\t\tCountries = homeTown.find_all('p')\n", | |
"\t\tnationality = Countries[0].get_text()\n", | |
"\n", | |
"\tvisitedLocations = [];\n", | |
"\thotelsVisited = [];\n", | |
"\ti=0\n", | |
"\t#--------------------------------------------\n", | |
"\tfor path in paths:\n", | |
"\t\tjsonData = {};\n", | |
"\t\twith open(path) as data_file: \n", | |
"\t\t\tjsondata = json.load(data_file) \n", | |
"\t\t\thotelName = ''\n", | |
"\t\t\thotelRating = 0\n", | |
"\t\t\thotelAddress = ''\n", | |
"\n", | |
"\t\t\tfor review in jsondata[\"Reviews\"]:\n", | |
"\t\t\t\tif(review[\"Author\"] == author):\n", | |
" \n", | |
"\t\t\t\t\t#nationality = review[\"AuthorLocation\"]\n", | |
"\t\t\t\t\t#if 'Name' in jsondata[\"HotelInfo\"]:\n", | |
"\t\t\t\t\t#\thotel = jsondata[\"HotelInfo\"][\"Name\"]\n", | |
"\t\t\t\t\t#else:\n", | |
"\t\t\t\t\t#\thotel = jsondata[\"HotelInfo\"][\"HotelURL\"]\n", | |
" \n", | |
"\t\t\t\t\tif 'Address' in jsondata[\"HotelInfo\"]:\n", | |
"\t\t\t\t\t\ttmphotelAddress = jsondata[\"HotelInfo\"][\"Address\"]\n", | |
"\t\t\t\t\t\thotelAddress = cleanhtml(tmphotelAddress)\n", | |
"\t\t\t\t\t\n", | |
"\t\t\t\t\thotelId = jsondata[\"HotelInfo\"][\"HotelID\"]\n", | |
"\t\t\t\t\t\n", | |
"\t\t\t\t\thotelsVisited.append(hotelId);\n", | |
"\t\t\t\t\tvisitedLocations.append(hotelAddress)\n", | |
"\t\t\t\t\treviews = ''\n", | |
"\t\t\t\t\t\n", | |
"\t\t\t\t\treviews = reviews + ' ' + review[\"Content\"]\n", | |
"\t#--------------------------------------------\n", | |
"\n", | |
"\tstatus = getStatusFromReview(reviews)\n", | |
"\t#rint(tempstatus)\n", | |
"\t#f(tempstatus is not None):\n", | |
"\t#status = tempstatus\n", | |
"\t\t\n", | |
"\tauthorReview = { \n", | |
"\t\t'name': author,\n", | |
"\t\t'age': age,\n", | |
"\t\t'gender': gender,\n", | |
"\t\t'status' : status,\n", | |
"\t\t'visitedLocations' : visitedLocations,\n", | |
"\t\t'hotelsVisited' : hotelsVisited,\n", | |
"\t\t'Nationality': nationality,\n", | |
"\t\t'reviews' : reviews\n", | |
"\t}\n", | |
"\n", | |
"\tdata['author'].append(authorReview)\n", | |
"\n", | |
"\n", | |
"\t# insert one and get ID\n", | |
"\tauthorsDB.insert_one(authorReview).inserted_id" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#extract textual data using nlp techniques\n", | |
"reviews_extraction = {}\n", | |
"def getStatusFromReview(review):\n", | |
" if(review is None):\n", | |
" return ('')\n", | |
" else:\n", | |
" #print(review)\n", | |
" strip_punctuation(review)\n", | |
" review = review.lower()\n", | |
" \n", | |
" \n", | |
" review_split = review.split()\n", | |
" review_bigrams = list(bigrams(review_split))\n", | |
" review_trigrams = list(trigrams(review_split))\n", | |
" review_fivegrams = ngrams(review.split(), 5)\n", | |
" \n", | |
" possible_words_single = ['single','club','clubs',\"friends\",\"friend\",\"peers\",\"peer\", \"mates\",\"mate\",\\\n", | |
" \"buddies\",\"buddy\"]\n", | |
" possible_words_married = [\"husband\",\"newlyweds\",\"hubbie\",\"hubby\",\"wife\",\"lady\",\"man\",\"spouse\",\"family\", \"sons\",\\\n", | |
" \"honeymoon\",\"grandmother\",\"grandfather\",\"inlaws\",\"inlaw\",\\\n", | |
" \"father\",\"fathers\",\"mother\",\"mothers\",\"wedding\", \"significant other\", \"fiance\"\\\n", | |
" \"girlfriend\",\"boyfriend\",\"woman\",\"engaged\",\"married\",\"romantic\",\"honeymoon\"]\n", | |
" possible_words_married_children = [\"husband\",\"hubbie\",\"hubby\",\"wife\",\"lady\",\"man\",\"spouse\",\"family\", \"sons\",\\\n", | |
" \"son\", \"daughter\", \"daughters\", \"children\", \"child\",\"kids\",\"kid\" ,\\\n", | |
" \"household\", \"baby\",\"babies\",\"infant\",\"infants\",\"teenager\",\\\n", | |
" \"teenagers\",\"honeymoon\",\"grandmother\",\"grandfather\",\"inlaws\",\"inlaw\",\\\n", | |
" \"father\",\"fathers\",\"mother\",\"mothers\",\"wedding\", \"significant other\", \"fiance\"\\\n", | |
" \"girlfriend\",\"boyfriend\",\"woman\",\"engaged\",\"married\",\"romantic\",\"honeymoon\"]\n", | |
"\n", | |
" reviews_extraction = []\n", | |
" for i in review_split:\n", | |
" if(i == \"married\" or i == \"newlyweds\" or i == \"newlyweds\" or i == \"single\" or i == \"engaged\" or i == \"bars\" or i == \"clubs\" or i == \"romantic\" or i == \"sightseeing\" or i == \"shopping\" or i == \"honeymoon\" or i ==\"leisure\" or i ==\"pleasure\" or i ==\"kingbed\"):\n", | |
" reviews_extraction.append(i)\n", | |
"\n", | |
" types_rooms = [\"single\", \"double\",\"twin\",\"king\",\"queen\",\"deluxe\"]\n", | |
" rooms_beds = [\"rooms\",\"room\",\"beds\",\"bed\"]\n", | |
"\n", | |
" for i,j in review_bigrams:\n", | |
" if(i in types_rooms and j in rooms_beds):#if business trip\n", | |
" reviews_extraction.append(j)\n", | |
"\n", | |
" possible_words = [\"husband\",\"hubbie\",\"hubby\",\"wife\",\"lady\",\"man\",\"spouse\",\"family\", \"sons\",\\\n", | |
" \"son\", \"daughter\", \"daughters\", \"children\", \"child\",\"kids\",\"kid\" ,\\\n", | |
" \"household\",\"friends\",\"friend\",\"peers\",\"peer\", \"mates\",\"mate\",\\\n", | |
" \"buddies\",\"buddy\", \"baby\",\"babies\",\"infant\",\"infants\",\"teenager\",\\\n", | |
" \"teenagers\",\"honeymoon\",\"grandmother\",\"grandfather\",\"inlaws\",\"inlaw\",\\\n", | |
" \"father\",\"fathers\",\"mother\",\"mothers\",\"wedding\", \"significant other\", \"fiance\"\\\n", | |
" \"girlfriend\",\"boyfriend\",\"woman\"]\n", | |
" for i,j in review_bigrams:\n", | |
" if(i == \"my\" or i == \"our\" or i == \"the\" ):\n", | |
" if(j in possible_words):\n", | |
" reviews_extraction.append(j)\n", | |
" #possibly married with kids\n", | |
"\n", | |
" numbers = [\"one\",\"two\",\"three\",\"four\",\"five\",\"six\",\"seven\",\"eight\",\"nine\",\"ten\",\"eleven\",\"twelve\",\"thirteen\"]\n", | |
" for i,j in review_bigrams:\n", | |
" if(i in numbers or i.isdigit()):\n", | |
" if(j == \"adult\" or j == \"children\" or j ==\"people\" or j == \"adults\" or j == \"child\" or j == \"person\" or j ==\"persons\" or j ==\"individuals\"):\n", | |
" reviews_extraction.append(j)\n", | |
"\n", | |
" for i,j in review_bigrams:\n", | |
" if(j == \"trip\"):\n", | |
" reviews_extraction.append([i,j])\n", | |
"\n", | |
" for i,j,k in review_trigrams:\n", | |
" if(i == \"my\" or i == \"our\" or i == \"the\" or i == \"with\" ):\n", | |
" if(k == \"yo\" or k == \"yr\" or k ==\"year\" or k ==\"old\" or k in possible_words):\n", | |
" reviews_extraction.append([i,j,k])\n", | |
"\n", | |
" for i,j,k,l,m in review_fivegrams:\n", | |
" if(i == \"my\" or i == \"our\" or i == \"the\"):\n", | |
" if(m in possible_words):\n", | |
" reviews_extraction.append(m)\n", | |
"\n", | |
" single = 0\n", | |
" married = 0\n", | |
" marriedChildren = 0\n", | |
"\n", | |
" for r in list(reviews_extraction):\n", | |
" #print(r)\n", | |
" if(r in possible_words_single):\n", | |
" single += 1\n", | |
" if(r in possible_words_married):\n", | |
" married += 1\n", | |
" if(r in possible_words_married_children):\n", | |
" marriedChildren += 1\n", | |
"\n", | |
" if((single > married)&(single>marriedChildren)):\n", | |
" #print(\"single\")\n", | |
" return \"single\"\n", | |
" if((married > single)&(married>marriedChildren)):\n", | |
" #print(\"married\")\n", | |
" return \"married\"\n", | |
" if((marriedChildren > single)&(marriedChildren>married)):\n", | |
" #print(\"married_children\")\n", | |
" return \"married_children\" \n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#clean hotel address\n", | |
"def cleanhtml(raw_html):\n", | |
" cleanr = re.compile('<.*?>')\n", | |
" cleantext = re.sub(cleanr, '', raw_html)\n", | |
" return cleantext" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#strip punctuation from review\n", | |
"from string import punctuation\n", | |
"def strip_punctuation(s):\n", | |
" return ''.join(c for c in s if c not in punctuation)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#mapper" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
">>> from bson.code import Code\n", | |
">>> mapper = Code(\"\"\"\n", | |
"... function () {\n", | |
"... this.hotelsVisited.forEach(function(z) {\n", | |
"... emit(z, 1);\n", | |
"... });\n", | |
"... }\n", | |
"... \"\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#reducer" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
">>> reducer = Code(\"\"\"\n", | |
"... function (key, values) {\n", | |
"... var total = 0;\n", | |
"... for (var i = 0; i < values.length; i++) {\n", | |
"... total += values[i];\n", | |
"... }\n", | |
"... return total;\n", | |
"... }\n", | |
"... \"\"\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#run query" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"result = authorsDBase.map_reduce(\n", | |
"... mapper, reducer, \"13\", query={\"age\":[\"65\"],\"gender\" : \"female\"})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#sort values descending order " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"results_sorted = result.find().sort('value' , -1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"sorted_list_results = []\n", | |
"\n", | |
"for r in results_sorted: \n", | |
" sorted_list_results.append((r['_id'],r['value']))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#save to json" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"with open('data_65FemaleFinal.json', 'w') as outfile:\n", | |
" json.dump(sorted_list_results, outfile)" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda root]", | |
"language": "python", | |
"name": "conda-root-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment