Skip to content

Instantly share code, notes, and snippets.

@sonalisharma
Created December 12, 2013 05:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sonalisharma/7923753 to your computer and use it in GitHub Desktop.
Save sonalisharma/7923753 to your computer and use it in GitHub Desktop.
Final project deliverable for AY250 - Python Computing for Data Science, UC Berkeley
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Importing libraries"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import re, os, sys\n",
"import urllib\n",
"import BeautifulSoup as bs\n",
"from bs4 import BeautifulSoup\n",
"import pandas as pd\n",
"from dateutil import parser\n",
"from geopy import geocoders\n",
"from itertools import izip\n",
"import numpy as np\n",
"import requests\n",
"from datetime import datetime\n",
"import twitter\n",
"from twitter import OAuth\n",
"from twitter import Twitter\n",
"\n",
"import gdata.youtube\n",
"import gdata.youtube.service\n",
"yt_service = gdata.youtube.service.YouTubeService()\n",
"yt_service.ssl = True\n",
"yt_service = gdata.youtube.service.YouTubeService()\n",
"\n",
"g = geocoders.GoogleV3()\n",
"failures = []\n",
"perks = {}\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 42
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Reading project names from the igg_link.txt file and building Indiegogo project URLs"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"f=open(\"igg_link.txt\")\n",
"f=f.read()\n",
"f=f.split(\"\\n\")\n",
"len(f)\n",
"comment_link=[]\n",
"for i in f:\n",
" comment_link.append(\"http://www.indiegogo.com/projects/{}\".format(i))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"len(comment_link)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": [
"1585"
]
}
],
"prompt_number": 11
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Distance from Washington, D.C. (in miles)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def dc_dist(location):\n",
" from geopy import distance\n",
" _, cl = g.geocode('Washington, D.C.')\n",
" try:\n",
" _, ne = g.geocode(location)\n",
" dist=distance.distance(ne, cl).miles\n",
" return dist\n",
" except:\n",
" return \"NA\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from HTMLParser import HTMLParser\n",
"\n",
"class MLStripper(HTMLParser):\n",
" def __init__(self):\n",
" self.reset()\n",
" self.fed = []\n",
" def handle_data(self, d):\n",
" self.fed.append(d)\n",
" def get_data(self):\n",
" return ' '.join(self.fed)\n",
"\n",
"def strip_tags(html):\n",
" s = MLStripper()\n",
" s.feed(html)\n",
" return s.get_data()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Getting perks data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def perksdata(link):\n",
" data=(urllib.urlopen(link))\n",
" data=str(data.read())\n",
" p1 = re.compile('<section class=\\\"perks\\\">(.*?)\\</section>', re.IGNORECASE|re.DOTALL)\n",
" datapart2 = str(re.findall(p1, str(data)))\n",
" p12 = re.compile('<p class=\\\"fl claimed big-perk-button\\\">(.*?)\\</p>', re.IGNORECASE|re.DOTALL)\n",
" claims = str(re.findall(p12, str(datapart2)))\n",
" #print claims\n",
" claims=claims.split(\"\\\\n\")\n",
" #claims=claims.split(\",\")\n",
" #print claims\n",
" #print len(claims)\n",
" claimed=[]\n",
" for i in claims:\n",
" temp_perk = re.findall('\\d+',i)\n",
" if len(temp_perk)>0:\n",
" claimed.append(float(temp_perk[0]))\n",
" #print claimed\n",
" p13 = re.compile('<span class=\\\"currency currency-large\\\">(.*?)\\</span>', re.IGNORECASE|re.DOTALL)\n",
" tmpamount = str(re.findall(p13, str(datapart2)))[1:-1]\n",
" tmpamount=tmpamount.split(\" \")\n",
" perk_amount=[]\n",
" #print tmpamount\n",
" try:\n",
" for i in tmpamount:\n",
" i=i.replace(\",\", \"\")\n",
" amt = re.findall('\\d+',i)\n",
" perk_amount.append(int(amt[0]))\n",
" no_perks = len(perk_amount)\n",
" #print perk_amount\n",
" median = np.median(np.array(perk_amount))\n",
" try:\n",
" no_median_perk = claimed[perk_amount.index(median)]\n",
" except:\n",
" no_median_perk = 0 \n",
" perk_amount = np.array(perk_amount)\n",
" #perk_amount_min = np.array(perk_amount).argmin()\n",
" maxperk = perk_amount.max()\n",
" minperk = perk_amount.min()\n",
" no_min_perk = claimed[perk_amount.argmin()]\n",
" no_max_perk = claimed[perk_amount.argmax()]\n",
" res = (minperk,no_min_perk,maxperk,no_max_perk,median,no_median_perk,no_perks)\n",
" return res\n",
" except:\n",
" #failures.append((link,\"perks data\"))\n",
" res = ()\n",
" return res\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Social media data"
]
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Initializing keys"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"twitterapi = Twitter(auth=OAuth(\n",
" '99045947-JUBOFYWz3G7vprh81BBLTONVbZUsRxQtwSsqE3v0Q', 'rx2rNaaEpTvDgLUI9vxms8r4sGrUHyQcYwcBm5wc', \n",
" 'XGg0jqHyJQXuuauFVQywSQ', 'VP5zLNdesKjN0eelyU9vemBn2dXQcOaBZMn7xGvIG8'))\n",
"\n",
"fb_base_url = 'https://graph.facebook.com/'"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 15
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Facebook"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Facebook data: look up a page on the Graph API and return its likes and talking_about_count\n",
"def getfbdata(user):\n",
" url = fb_base_url + user \n",
" response = requests.get(url)\n",
" profile = response.json()\n",
" likes = profile['likes']\n",
" talking = profile['talking_about_count']\n",
" return (likes, talking)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 85
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Twitter"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Twitter data: return the follower count and status count for a screen name\n",
"def gettwitterdata(handle):\n",
" #print handle\n",
" user = twitterapi.GetUser(screen_name=handle)\n",
" return (user.GetFollowersCount(), user.GetStatusesCount())"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 86
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Youtube"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def yt(entr):\n",
" try:\n",
" entry = yt_service.GetYouTubeVideoEntry(video_id=entr)\n",
" try:\n",
" dur = entry.media.duration.seconds\n",
" except:\n",
" dur = 0\n",
" try:\n",
" cnt = entry.statistics.view_count\n",
" except:\n",
" cnt= 0\n",
" try:\n",
" rating = entry.rating.average\n",
" except:\n",
" rating = 0 \n",
" #print dur,cnt,rating\n",
" return (dur,cnt,rating)\n",
" except:\n",
" return (0,0,0)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 87
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def you(link):\n",
" dur = 0.0\n",
" vcount = 0.0\n",
" avgrating = 0.0\n",
" counter = 0 \n",
" try:\n",
" if link.startswith(\"http://youtu.be\"):\n",
" dur,vcount,avgrating = yt(link[15:])\n",
" return (dur,vcount,avgrating)\n",
" #you_final.append((link, link[15:]))\n",
" else:\n",
" test= urllib.urlopen(link).read()\n",
" test=BeautifulSoup(test)\n",
" temp = set()\n",
" \n",
" for anchor in test.findAll('a', href=True):\n",
" x= (anchor['href'])\n",
" if (x.startswith(\"/watch\")):\n",
" if (len(x[9:])==11):\n",
" temp.add(x[9:])\n",
" temp =list(temp)\n",
" for video in temp:\n",
" counter +=1\n",
" data = yt(video)\n",
" dur = dur + float(data[0])\n",
" vcount = vcount + float(data[1])\n",
" avgrating = avgrating + float(data[2])\n",
" return (float(dur)/counter,float(vcount)/counter,float(avgrating)/counter)\n",
" except:\n",
" return ('NA','NA','NA')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 88
},
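{
"cell_type": "code",
"collapsed": false,
"input": [
"#Getting social media data: follow the Facebook, Twitter and YouTube links on a project page and collect their statistics ('NA' where unavailable)\n",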
"def soc(link):\n",
" test=urllib.urlopen(link).read()\n",
" test=BeautifulSoup(test)\n",
" test=test.findAll(\"ul\", class_=\"fr links\")\n",
" test=BeautifulSoup(str(test))\n",
" test=test.findAll(\"li\")\n",
" test=BeautifulSoup(str(test))\n",
" fblikes = 'NA'\n",
" fbtalking ='NA'\n",
" tfollowercnt = 'NA'\n",
" tstatuscnt ='NA'\n",
" youdur = 'NA'\n",
" youviewcnt = 'NA'\n",
" youavgratig = 'NA'\n",
" for anchor in test.findAll('a', href=True):\n",
" soclink = anchor['href']\n",
" #print soclink\n",
" try:\n",
" if ('facebook' in soclink):\n",
" #print \"iside fb\"\n",
" fbdata = getfbdata(soclink[25:])\n",
" #print fbdata\n",
" fblikes = fbdata[0]\n",
" fbtalking = fbdata[1]\n",
" except:\n",
" pass\n",
" try:\n",
" if ('twitter' in soclink):\n",
" twitterdata = gettwitterdata(soclink[20:])\n",
" tfollowercnt = twitterdata[0]\n",
" tstatuscnt = twitterdata[1]\n",
" except:\n",
" pass\n",
" try:\n",
" if ('youtube' in soclink):\n",
" #print \"this is inside youtube\",soclink\n",
" youtubedata = you(soclink)\n",
" youdur = youtubedata[0]\n",
" youviewcnt = youtubedata[1]\n",
" youavgratig = youtubedata[2]\n",
" except:\n",
" pass\n",
" return (fblikes,fbtalking,tfollowercnt,tstatuscnt,youdur,youviewcnt,youavgratig)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 89
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print soc(\"http://www.indiegogo.com/projects/beddit-automatic-sleep-and-wellness-tracker-turn-your-bed-into-a-smart-bed\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(817, 36, 'NA', 'NA', 736.8888888888889, 1417.5555555555557, 3.285712416666667)\n"
]
}
],
"prompt_number": 90
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Getting project details"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def proj_head(link):\n",
" minperk = \"NA\"\n",
" no_min_perk = \"NA\"\n",
" maxperk = \"NA\"\n",
" no_max_perk = \"NA\"\n",
" median_perk = \"NA\"\n",
" no_med_perk = \"NA\"\n",
" total_perks =\"NA\"\n",
" fblikes = 'NA'\n",
" fbtalking ='NA'\n",
" tfollowercnt = 'NA'\n",
" tstatuscnt ='NA'\n",
" youdur = 'NA'\n",
" youviewcnt = 'NA'\n",
" youavgratig = 'NA'\n",
" US=0\n",
" current=0\n",
" proj_page=urllib.urlopen(link).read()\n",
" soup_page=BeautifulSoup(proj_page)\n",
" head=soup_page.find_all(\"div\", class_=\"sub-header\")\n",
" head=str(head)\n",
" head=BeautifulSoup(str(head))\n",
" head_line=head.find_all(\"h1\", class_=\"bold x-large notranslate\")\n",
" head_line=strip_tags(str(head_line))[1:-1]\n",
" head_line = re.sub(r\"[\\n\\t\\r]\", \"\",head_line)\n",
" desc_short=head.find_all(\"p\", class_=\"notranslate\")\n",
" desc_short=strip_tags(str(desc_short))[1:-1]\n",
" desc_short = re.sub(r\"[\\n\\t\\r]\", \"\",desc_short)\n",
" category=head.find_all(\"span\", class_=\"category\")\n",
" category=strip_tags(str(category))[2:-2]\n",
" location=head.find_all(\"span\", class_=\"location\")\n",
" location=strip_tags(str(location))[2:-2]\n",
" country=location.split(\",\")[-1][1:]\n",
" if country == \"United States\":\n",
" US=1\n",
" distance = \"NA\"\n",
" proj=str(head.find_all(\"div\", class_=\"project\"))\n",
" proj=proj.split(\"</li>\")\n",
" try:\n",
" for i in range(1, 5):\n",
" count_update= strip_tags(str(proj[1]))[10:]\n",
" count_comment= strip_tags(str(proj[2]))[11:]\n",
" count_funder= strip_tags(str(proj[3]))[10:]\n",
" count_photo= strip_tags(str(proj[4]))[10:]\n",
" amount_det=soup_page.find_all(\"div\", class_=\"targets clearfix\")\n",
" amount_det=BeautifulSoup(str(amount_det))\n",
" amount_raise=strip_tags(str(amount_det.find_all(\"span\", class_=\"amount medium clearfix\")))\n",
" amount_raise=amount_raise.replace(\",\",\"\")\n",
" amount_raised= \"\".join(re.findall('\\d+',amount_raise))\n",
" amount_cur=amount_raise[-4:-1]\n",
" amount_goal=strip_tags(str(amount_det.find_all(\"p\", class_=\"money-raised goal\")))[19:-10]\n",
" amount_goal=amount_goal.replace(\",\",\"\")\n",
" amount_goal= \"\".join(re.findall('\\d+',amount_goal))\n",
" ratio= float(amount_raised)/float(amount_goal)\n",
" time_left= strip_tags(str(amount_det.find_all(\"p\", class_=\"days-left\")))[2:-22]\n",
" time_left = time_left.strip()\n",
" if time_left != 0:\n",
" current=1\n",
" amount_tpe=soup_page.find_all(\"div\", class_=\"contribution-info\")\n",
" amount_tpe=BeautifulSoup(str(amount_tpe))\n",
" amount_type=strip_tags(str(amount_tpe.find_all(\"p\", class_=\"amount bold fl title\")))[1:-1]\n",
" amount_time=strip_tags(str(amount_tpe.find_all(\"p\", class_=\"funding-info\")))[1:-1].split(\".\")[1][18:-13]\n",
" amount_start_time=amount_time.split(\"-\")[0].replace(\",\",\"\")\n",
" amount_end_time=amount_time.split(\"-\")[1].replace(\",\",\"\")\n",
" delta=(parser.parse(amount_end_time)-parser.parse(amount_start_time)).total_seconds()\n",
" \n",
" amount_start_time = parser.parse(amount_start_time)\n",
" start_year = amount_start_time.year\n",
" start_month = amount_start_time.month\n",
" start_day = amount_start_time.day\n",
" amount_end_time = parser.parse(amount_end_time)\n",
" end_year = amount_end_time.year\n",
" end_month = amount_end_time.month\n",
" end_day = amount_end_time.day\n",
" perks_data = perksdata(link)\n",
" if len(perks_data)>0:\n",
" minperk = perks_data[0]\n",
" no_min_perk = perks_data[1]\n",
" maxperk = perks_data[2]\n",
" no_max_perk = perks_data[3]\n",
" median_perk = perks_data[4]\n",
" no_med_perk = perks_data[5]\n",
" total_perks = perks_data[6]\n",
" print \"Success\",link\n",
" if (link):\n",
" title = link[34:]\n",
" else:\n",
" title=\"\"\n",
" \n",
" #Getting social media statistics\n",
" try:\n",
" socdata = soc(link)\n",
" fblikes = socdata[0]\n",
" fbtalking = socdata[1]\n",
" tfollowercnt = socdata[2]\n",
" tstatuscnt = socdata[3]\n",
" youdur = socdata[4]\n",
" youviewcnt = socdata[5]\n",
" youavgratig = socdata[6]\n",
" except:\n",
" pass \n",
" return (title,head_line.strip(), desc_short.strip(), category.strip(), location.strip(), country.strip(), US, \n",
" distance, count_update, count_comment, count_funder, count_photo, amount_goal, \n",
" amount_raised, amount_cur, ratio, time_left, current, amount_type, start_year,start_month,start_day, end_year,end_month,end_day,\n",
" delta,minperk,no_min_perk,maxperk,no_max_perk,median_perk ,no_med_perk,total_perks,\n",
" fblikes,fbtalking,tfollowercnt,tstatuscnt,youdur,youviewcnt,youavgratig)\n",
" except:\n",
" failures.append((link,\"proj_headmethod\"))\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 91
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Testing for one project"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print proj_head(\"http://www.indiegogo.com/projects/beddit-automatic-sleep-and-wellness-tracker-turn-your-bed-into-a-smart-bed\")"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Success http://www.indiegogo.com/projects/beddit-automatic-sleep-and-wellness-tracker-turn-your-bed-into-a-smart-bed\n",
"('beddit-automatic-sleep-and-wellness-tracker-turn-your-bed-into-a-smart-bed', '', 'The Beddit sensor tracks your sleep quality, heart rate, and breathing under the sheet while you sleep. The app coaches you to improve sleep and performance.', 'Technology', 'Saratoga, California, United States', 'United States', 0, 'NA', '15', '501', '3981', '', '80000', '503571', 'SD ', 6.2946375, '0', 1, ' Flexible Funding ', 2013, 8, 2, 2013, 10, 15, 6393600.0, 10, 30.0, 7900, 3.0, 449.0, 26.0, 7, 817, 36, 'NA', 'NA', 736.8888888888889, 1417.5555555555557, 3.285712416666667)"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 92
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"Getting features for all projects and storing them in a single file"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"projdata=[]\n",
"#The scrape was run in batches, so comment_link is sliced to select one batch per run; each batch is appended to projdata.csv\n",
"for proj in comment_link[1000:1500]:\n",
" projdata.append(proj_head(proj))\n",
"df = pd.DataFrame(projdata)\n",
"#df.to_csv(\"projdata.csv\",sep='\\t',header=False)\n",
"with open('projdata.csv', 'a') as f:\n",
" df.to_csv(f,sep='\\t',header=False)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment