Skip to content

Instantly share code, notes, and snippets.

@jp-um
Created July 7, 2017 13:21
Show Gist options
  • Save jp-um/39c915e34692c96496780f5ab414276b to your computer and use it in GitHub Desktop.
Save jp-um/39c915e34692c96496780f5ab414276b to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#extract data from tripadvisor website and use beautiful soup\n",
"#extract textual data\n",
"#save into mongodb"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"from collections import Counter\n",
"from string import punctuation\n",
"import json\n",
"import pymongo # this gives a security warning in some cases\n",
"import glob\n",
"import re\n",
"from nltk.corpus import stopwords\n",
"from nltk import bigrams\n",
"from nltk import trigrams\n",
"from nltk import ngrams\n",
"import string \n",
"\n",
"paths = (glob.glob(\"TripAdvisorJson/json/json/*.json\"))\n",
"\n",
"client = pymongo.MongoClient('mongodb://localhost:27017/') # connect to DB\n",
"client.database_names() # show names\n",
"#if 'big_data_processing' in client.database_names(): # if data_science already exists, drop\n",
"# client.drop_database('big_data_processing')\n",
"db = client.big_data_processing\n",
"\n",
"#data will contain user profiles to be written to JSON file\n",
"data = {} \n",
"data['author'] = [] \n",
"\n",
"authorsDB = db.authorsDB\n",
"\n",
"with open('authors1.json') as data_file: \n",
" authors = json.load(data_file)\n",
" \n",
"for author in authors:\n",
"\t\n",
"\treviews = ''\n",
"\n",
"\tlink = \"https://www.tripadvisor.com/members/\" + author\n",
"\tr = requests.get(link)\n",
"\tPageContent = BeautifulSoup(r.content)\n",
"\tAgeSince = PageContent.find('div',{ \"class\" : \"ageSince\" })\n",
"\n",
"\tif(AgeSince is None):\n",
"\t\tcontinue\n",
" #print ()\n",
"\n",
"\tAgeElems = AgeSince.find_all('p')\n",
"\n",
"\tgender = ''\n",
"\tstatus = ''\n",
"\tage = ''\n",
"\tnationality = ''\n",
"\thotelId = ''\n",
"\n",
"\n",
"\tif(len(AgeElems)>1):\n",
"\t\tif 'year old' in str(AgeElems[1]):\n",
"\t\t\tage = re.findall(r'\\d+', str(AgeElems[1]))\n",
"\t\tif 'male' in str(AgeElems[1]):\n",
"\t\t\tgender = 'male'\n",
"\t\tif 'female' in str(AgeElems[1]):\n",
"\t\t\tgender = 'female'\n",
"\t\tif 'Another gender identity' in str(AgeElems[1]):\n",
"\t\t\tgender = 'Another gender identity'\n",
"\n",
"\thomeTown = PageContent.find('div',{ \"class\" : \"hometown\" })\n",
"\tif(homeTown is not None):\n",
"\t\tCountries = homeTown.find_all('p')\n",
"\t\tnationality = Countries[0].get_text()\n",
"\n",
"\tvisitedLocations = [];\n",
"\thotelsVisited = [];\n",
"\ti=0\n",
"\t#--------------------------------------------\n",
"\tfor path in paths:\n",
"\t\tjsonData = {};\n",
"\t\twith open(path) as data_file: \n",
"\t\t\tjsondata = json.load(data_file) \n",
"\t\t\thotelName = ''\n",
"\t\t\thotelRating = 0\n",
"\t\t\thotelAddress = ''\n",
"\n",
"\t\t\tfor review in jsondata[\"Reviews\"]:\n",
"\t\t\t\tif(review[\"Author\"] == author):\n",
" \n",
"\t\t\t\t\t#nationality = review[\"AuthorLocation\"]\n",
"\t\t\t\t\t#if 'Name' in jsondata[\"HotelInfo\"]:\n",
"\t\t\t\t\t#\thotel = jsondata[\"HotelInfo\"][\"Name\"]\n",
"\t\t\t\t\t#else:\n",
"\t\t\t\t\t#\thotel = jsondata[\"HotelInfo\"][\"HotelURL\"]\n",
" \n",
"\t\t\t\t\tif 'Address' in jsondata[\"HotelInfo\"]:\n",
"\t\t\t\t\t\ttmphotelAddress = jsondata[\"HotelInfo\"][\"Address\"]\n",
"\t\t\t\t\t\thotelAddress = cleanhtml(tmphotelAddress)\n",
"\t\t\t\t\t\n",
"\t\t\t\t\thotelId = jsondata[\"HotelInfo\"][\"HotelID\"]\n",
"\t\t\t\t\t\n",
"\t\t\t\t\thotelsVisited.append(hotelId);\n",
"\t\t\t\t\tvisitedLocations.append(hotelAddress)\n",
"\t\t\t\t\t# keep accumulating: resetting reviews here would discard the author's earlier reviews\n",
"\t\t\t\t\t\n",
"\t\t\t\t\treviews = reviews + ' ' + review[\"Content\"]\n",
"\t#--------------------------------------------\n",
"\n",
"\tstatus = getStatusFromReview(reviews)\n",
"\t#print(tempstatus)\n",
"\t#if(tempstatus is not None):\n",
"\t#status = tempstatus\n",
"\t\t\n",
"\tauthorReview = { \n",
"\t\t'name': author,\n",
"\t\t'age': age,\n",
"\t\t'gender': gender,\n",
"\t\t'status' : status,\n",
"\t\t'visitedLocations' : visitedLocations,\n",
"\t\t'hotelsVisited' : hotelsVisited,\n",
"\t\t'Nationality': nationality,\n",
"\t\t'reviews' : reviews\n",
"\t}\n",
"\n",
"\tdata['author'].append(authorReview)\n",
"\n",
"\n",
"\t# insert one and get ID\n",
"\tauthorsDB.insert_one(authorReview).inserted_id"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#extract textual data using nlp techniques\n",
"reviews_extraction = {}\n",
"def getStatusFromReview(review):\n",
"    # Classify review text as 'single', 'married' or 'married_children' by counting\n",
"    # keyword matches. Returns '' when review is None, and None when no category\n",
"    # has a strictly higher count than the other two.\n",
"    if(review is None):\n",
"        return ('')\n",
"    else:\n",
"        #print(review)\n",
"        # strip_punctuation returns a new string, so its result has to be assigned\n",
"        review = strip_punctuation(review)\n",
"        review = review.lower()\n",
"\n",
"\n",
"        review_split = review.split()\n",
"        review_bigrams = list(bigrams(review_split))\n",
"        review_trigrams = list(trigrams(review_split))\n",
"        review_fivegrams = ngrams(review.split(), 5)\n",
"\n",
"        possible_words_single = ['single','club','clubs',\"friends\",\"friend\",\"peers\",\"peer\", \"mates\",\"mate\",\\\n",
"            \"buddies\",\"buddy\"]\n",
"        possible_words_married = [\"husband\",\"newlyweds\",\"hubbie\",\"hubby\",\"wife\",\"lady\",\"man\",\"spouse\",\"family\", \"sons\",\\\n",
"            \"honeymoon\",\"grandmother\",\"grandfather\",\"inlaws\",\"inlaw\",\\\n",
"            \"father\",\"fathers\",\"mother\",\"mothers\",\"wedding\", \"significant other\", \"fiance\",\\\n",
"            \"girlfriend\",\"boyfriend\",\"woman\",\"engaged\",\"married\",\"romantic\",\"honeymoon\"]\n",
"        possible_words_married_children = [\"husband\",\"hubbie\",\"hubby\",\"wife\",\"lady\",\"man\",\"spouse\",\"family\", \"sons\",\\\n",
"            \"son\", \"daughter\", \"daughters\", \"children\", \"child\",\"kids\",\"kid\" ,\\\n",
"            \"household\", \"baby\",\"babies\",\"infant\",\"infants\",\"teenager\",\\\n",
"            \"teenagers\",\"honeymoon\",\"grandmother\",\"grandfather\",\"inlaws\",\"inlaw\",\\\n",
"            \"father\",\"fathers\",\"mother\",\"mothers\",\"wedding\", \"significant other\", \"fiance\",\\\n",
"            \"girlfriend\",\"boyfriend\",\"woman\",\"engaged\",\"married\",\"romantic\",\"honeymoon\"]\n",
"\n",
"        reviews_extraction = []\n",
"        for i in review_split:\n",
"            if(i == \"married\" or i == \"newlyweds\" or i == \"newlyweds\" or i == \"single\" or i == \"engaged\" or i == \"bars\" or i == \"clubs\" or i == \"romantic\" or i == \"sightseeing\" or i == \"shopping\" or i == \"honeymoon\" or i ==\"leisure\" or i ==\"pleasure\" or i ==\"kingbed\"):\n",
"                reviews_extraction.append(i)\n",
"\n",
"        types_rooms = [\"single\", \"double\",\"twin\",\"king\",\"queen\",\"deluxe\"]\n",
"        rooms_beds = [\"rooms\",\"room\",\"beds\",\"bed\"]\n",
"\n",
"        for i,j in review_bigrams:\n",
"            if(i in types_rooms and j in rooms_beds):#if business trip\n",
"                reviews_extraction.append(j)\n",
"\n",
"        possible_words = [\"husband\",\"hubbie\",\"hubby\",\"wife\",\"lady\",\"man\",\"spouse\",\"family\", \"sons\",\\\n",
"            \"son\", \"daughter\", \"daughters\", \"children\", \"child\",\"kids\",\"kid\" ,\\\n",
"            \"household\",\"friends\",\"friend\",\"peers\",\"peer\", \"mates\",\"mate\",\\\n",
"            \"buddies\",\"buddy\", \"baby\",\"babies\",\"infant\",\"infants\",\"teenager\",\\\n",
"            \"teenagers\",\"honeymoon\",\"grandmother\",\"grandfather\",\"inlaws\",\"inlaw\",\\\n",
"            \"father\",\"fathers\",\"mother\",\"mothers\",\"wedding\", \"significant other\", \"fiance\",\\\n",
"            \"girlfriend\",\"boyfriend\",\"woman\"]\n",
"        for i,j in review_bigrams:\n",
"            if(i == \"my\" or i == \"our\" or i == \"the\" ):\n",
"                if(j in possible_words):\n",
"                    reviews_extraction.append(j)\n",
"                    #possibly married with kids\n",
"\n",
"        numbers = [\"one\",\"two\",\"three\",\"four\",\"five\",\"six\",\"seven\",\"eight\",\"nine\",\"ten\",\"eleven\",\"twelve\",\"thirteen\"]\n",
"        for i,j in review_bigrams:\n",
"            if(i in numbers or i.isdigit()):\n",
"                if(j == \"adult\" or j == \"children\" or j ==\"people\" or j == \"adults\" or j == \"child\" or j == \"person\" or j ==\"persons\" or j ==\"individuals\"):\n",
"                    reviews_extraction.append(j)\n",
"\n",
"        for i,j in review_bigrams:\n",
"            if(j == \"trip\"):\n",
"                reviews_extraction.append([i,j])\n",
"\n",
"        for i,j,k in review_trigrams:\n",
"            if(i == \"my\" or i == \"our\" or i == \"the\" or i == \"with\" ):\n",
"                if(k == \"yo\" or k == \"yr\" or k ==\"year\" or k ==\"old\" or k in possible_words):\n",
"                    reviews_extraction.append([i,j,k])\n",
"\n",
"        for i,j,k,l,m in review_fivegrams:\n",
"            if(i == \"my\" or i == \"our\" or i == \"the\"):\n",
"                if(m in possible_words):\n",
"                    reviews_extraction.append(m)\n",
"\n",
"        single = 0\n",
"        married = 0\n",
"        marriedChildren = 0\n",
"\n",
"        for r in list(reviews_extraction):\n",
"            #print(r)\n",
"            if(r in possible_words_single):\n",
"                single += 1\n",
"            if(r in possible_words_married):\n",
"                married += 1\n",
"            if(r in possible_words_married_children):\n",
"                marriedChildren += 1\n",
"\n",
"        if((single > married)&(single>marriedChildren)):\n",
"            #print(\"single\")\n",
"            return \"single\"\n",
"        if((married > single)&(married>marriedChildren)):\n",
"            #print(\"married\")\n",
"            return \"married\"\n",
"        if((marriedChildren > single)&(marriedChildren>married)):\n",
"            #print(\"married_children\")\n",
"            return \"married_children\"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#clean hotel address\n",
"def cleanhtml(raw_html):\n",
" cleanr = re.compile('<.*?>')\n",
" cleantext = re.sub(cleanr, '', raw_html)\n",
" return cleantext"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#strip punctuation from review\n",
"from string import punctuation\n",
"def strip_punctuation(s):\n",
" return ''.join(c for c in s if c not in punctuation)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#mapper"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from bson.code import Code\n",
"mapper = Code(\"\"\"\n",
"function () {\n",
"  this.hotelsVisited.forEach(function(z) {\n",
"    emit(z, 1);\n",
"  });\n",
"}\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#reducer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"reducer = Code(\"\"\"\n",
"function (key, values) {\n",
"  var total = 0;\n",
"  for (var i = 0; i < values.length; i++) {\n",
"    total += values[i];\n",
"  }\n",
"  return total;\n",
"}\n",
"\"\"\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#run query"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"result = authorsDB.map_reduce(\n",
"    mapper, reducer, \"13\", query={\"age\":[\"65\"],\"gender\" : \"female\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#sort values descending order "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"results_sorted = result.find().sort('value' , -1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sorted_list_results = []\n",
"\n",
"for r in results_sorted: \n",
" sorted_list_results.append((r['_id'],r['value']))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#save to json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import json\n",
"with open('data_65FemaleFinal.json', 'w') as outfile:\n",
" json.dump(sorted_list_results, outfile)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment