Skip to content

Instantly share code, notes, and snippets.

@96chan
Created May 2, 2013 09:02
Show Gist options
  • Save 96chan/5501063 to your computer and use it in GitHub Desktop.
Save 96chan/5501063 to your computer and use it in GitHub Desktop.
Preparation for visualizing about TEDx 1) Trends of TEDx over past 5 years 2) Comparison between TEDx and TED
{
"metadata": {
"name": "Visualization"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"outline\"></span>"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Outline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"#1\">1. Trends of TEDx</a><br>\n",
" <ul>\n",
" <a href=\"#1-1\"><li>Data Preparation</li></a>\n",
" <a href=\"#1-2\"><li>Visualization</li></a>\n",
" </ul>\n",
"<a href=\"#2\">2. Comparison between TED and TEDx</a>\n",
" <ul>\n",
" <a href=\"#2-1\"><li>Data Preparation</li></a>\n",
" <a href=\"#2-2\"><li>Visualization</li></a>\n",
" </ul>\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"1\"></span>"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Trends of TEDx over 5 years (by language)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"1-1\"></span>"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"1. Data Preparation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# TEDx data by language"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 58
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"\n",
"TEDX_JSON_FILE = \"final_tedx_v1.json\"\n",
"tedx_lang = {}\n",
"event_array = []\n",
"\n",
"with open(TEDX_JSON_FILE, \"r\") as tedx_json_file:\n",
" for i, line in enumerate(tedx_json_file):\n",
" u = json.loads(line) \n",
" year = u['data']['uploaded'][:4]\n",
" # extract language attribute from corpus\n",
" if u['data'].has_key('lang') and u['data']['viewCount'] != '-':\n",
" # language name\n",
" lang = u['data']['lang']\n",
" # event name\n",
" if u['data'].has_key('event'):\n",
" event = u['data']['event']\n",
" elif u['data']['title'].find('at TEDx'):\n",
" event = u['data']['title'][u['data']['title'].find('at TEDx')+3:]\n",
" # view_cnt\n",
" view_cnt = u['data']['viewCount']\n",
" \n",
" # check language in the dictionary \n",
" if lang in tedx_lang.keys(): # existed language\n",
" # view count\n",
" tedx_lang[lang][0] += int(view_cnt)\n",
" # event append\n",
" if event not in event_array:\n",
" event_array.append(event)\n",
" # event count\n",
" tedx_lang[lang][1] += 1\n",
" # video count\n",
" tedx_lang[lang][2] += 1\n",
" else: # new langauge\n",
" # event append\n",
" if event not in event_array:\n",
" event_array.append(event)\n",
" tedx_lang[lang] = [int(view_cnt),1,1] # view/event/video\n",
" \n",
"print tedx_lang"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{u'Swedish': [44697, 7, 39], u'Icelandic': [24456, 1, 28], u'Estonian': [60411, 3, 50], u'Turkish': [199883, 3, 75], u'Romanian': [191920, 8, 71], u'Azerbaijani': [9190, 2, 17], u'Hindi': [9156, 2, 10], u'Dutch': [20211, 4, 25], u'Korean': [886851, 74, 573], u'Indonesian': [471437, 9, 58], u'Hungarian': [91745, 8, 54], u'Ukrainian': [62395, 9, 40], u'Lithuanian': [24422, 1, 8], u'Malay': [670, 1, 1], u'French': [3531082, 66, 550], u'Catalan': [12520, 4, 57], u'Russian': [382446, 37, 347], u'Thai': [255, 1, 1], u'Italian': [340887, 22, 237], u'Tamil': [44816, 1, 7], u'Slovene': [131883, 5, 73], u'Finnish': [84745, 2, 28], u'Hebrew': [24835, 12, 13], u'Bulgarian': [178818, 4, 65], u'Greek': [487140, 2, 104], u'English': [57297339, 1583, 15684], u'Rajasthani': [93767, 2, 17], u'Croatian': [23147, 11, 44], u'Portuguese': [1486959, 104, 913], u'Chinese': [106780, 14, 63], u'Czech': [207996, 5, 64], u'Japanese': [898128, 13, 258], u'Galician': [13146, 1, 7], u'German': [160169, 18, 118], u'Slovak': [128627, 5, 59], u'Spanish': [4111914, 127, 1225], u'Urdu': [171188, 3, 26], u'Polish': [798470, 9, 134], u'Arabic': [4127472, 40, 396]}\n"
]
}
],
"prompt_number": 59
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# TEDx language data by year "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 60
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import json\n",
"\n",
"TEDX_JSON_FILE = \"final_tedx_v1.json\"\n",
"tedx_year = {'2009':{}, '2010':{}, '2011':{}, '2012':{}, '2013':{}}\n",
"event_year = {'2009':[], '2010':[], '2011':[], '2012':[], '2013':[]}\n",
"year_array =['2009','2010','2011','2012','2013']\n",
"\n",
"with open(TEDX_JSON_FILE, \"r\") as tedx_json_file:\n",
" for i, line in enumerate(tedx_json_file):\n",
" u = json.loads(line) \n",
" year = u['data']['uploaded'][:4]\n",
" # extract language attribute from corpus\n",
" if u['data'].has_key('lang') and u['data']['viewCount'] != '-':\n",
" # language name\n",
" lang = u['data']['lang']\n",
" # event name\n",
" if u['data'].has_key('event'):\n",
" event = u['data']['event']\n",
" elif u['data']['title'].find('at TEDx'):\n",
" event = u['data']['title'][u['data']['title'].find('at TEDx')+3:]\n",
" # view_cnt\n",
" view_cnt = u['data']['viewCount']\n",
" \n",
" for i in year_array:\n",
" if year == i:\n",
" # check language in the dictionary \n",
" if lang in tedx_year[year].keys(): # existed language\n",
" # view count\n",
" tedx_year[year][lang][0] += int(view_cnt)\n",
" # event append\n",
" if event not in event_year[year]:\n",
" event_year[year].append(event)\n",
" # event count\n",
" tedx_year[year][lang][1] += 1\n",
" # video count\n",
" tedx_year[year][lang][2] += 1\n",
" else: # new langauge\n",
" # event append\n",
" if event not in event_year[year]:\n",
" event_year[year].append(event)\n",
" tedx_year[year][lang] = [int(view_cnt),1,1] # view/event/video\n",
" \n",
"print tedx_year"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"{'2009': {u'Portuguese': [27104, 3, 9], u'Slovene': [10844, 1, 4], u'Ukrainian': [8584, 1, 3], u'German': [8106, 1, 1], u'Romanian': [23750, 1, 2], u'Spanish': [19157, 1, 3], u'Malay': [670, 1, 1], u'Greek': [1584, 1, 1], u'Icelandic': [16926, 1, 14], u'English': [1682910, 45, 290], u'Russian': [28304, 2, 7], u'Korean': [17723, 1, 11], u'Italian': [11077, 1, 7]}, '2011': {u'Swedish': [3186, 3, 5], u'Icelandic': [6498, 1, 10], u'Estonian': [18989, 1, 16], u'Turkish': [120479, 2, 17], u'Romanian': [115947, 2, 28], u'Azerbaijani': [5720, 1, 4], u'Hindi': [1924, 1, 2], u'Dutch': [5894, 2, 4], u'Korean': [203726, 40, 219], u'Indonesian': [193243, 5, 21], u'Hungarian': [75993, 1, 25], u'Ukrainian': [16817, 1, 9], u'French': [636424, 13, 115], u'Catalan': [944, 1, 4], u'Russian': [108993, 23, 115], u'Croatian': [1910, 8, 8], u'Tamil': [44816, 1, 7], u'Slovene': [11416, 3, 10], u'Finnish': [32053, 1, 17], u'Hebrew': [21367, 2, 3], u'Bulgarian': [89274, 2, 20], u'Greek': [114832, 1, 22], u'English': [23659350, 632, 4401], u'Rajasthani': [47946, 2, 10], u'Italian': [155295, 11, 68], u'Portuguese': [565467, 49, 231], u'Chinese': [24944, 2, 9], u'Czech': [158376, 1, 28], u'Japanese': [243656, 7, 77], u'German': [39830, 8, 36], u'Slovak': [54768, 3, 19], u'Spanish': [1561392, 41, 264], u'Urdu': [83483, 1, 6], u'Polish': [38791, 2, 27], u'Arabic': [1330391, 13, 139]}, '2010': {u'Swedish': [28931, 4, 23], u'Estonian': [25709, 1, 5], u'Romanian': [30335, 1, 9], u'Hindi': [2087, 1, 2], u'Korean': [553794, 25, 118], u'Indonesian': [187375, 2, 10], u'Hungarian': [1934, 2, 2], u'Ukrainian': [25743, 1, 9], u'Lithuanian': [24422, 1, 8], u'French': [1521047, 6, 56], u'Russian': [150776, 3, 25], u'Slovene': [68588, 1, 30], u'Finnish': [52692, 2, 11], u'Spanish': [1620656, 33, 210], u'Greek': [287836, 3, 43], u'English': [9969544, 442, 2572], u'Rajasthani': [45821, 1, 7], u'Italian': [61155, 2, 35], u'Portuguese': [401315, 8, 142], u'Czech': [30468, 1, 11], u'Japanese': [363966, 1, 47], u'Galician': [2070, 1, 2], u'German': [75350, 2, 11], u'Slovak': [44058, 1, 11], u'Hebrew': [3468, 10, 10], u'Urdu': [53691, 1, 3], u'Polish': [38571, 2, 25], u'Arabic': [572364, 3, 38]}, '2013': {u'Turkish': [12282, 1, 25], u'Romanian': [1376, 3, 4], u'Azerbaijani': [875, 1, 7], u'Hindi': [133, 1, 1], u'Dutch': [195, 1, 1], u'Korean': [11334, 7, 37], u'Indonesian': [1947, 2, 12], u'Hungarian': [10049, 3, 22], u'French': [268420, 19, 116], u'Catalan': [3040, 2, 18], u'Russian': [26514, 7, 55], u'Italian': [8456, 3, 32], u'Slovene': [19577, 1, 12], u'Bulgarian': [22224, 1, 11], u'Greek': [23884, 2, 13], u'English': [4097079, 304, 2118], u'Croatian': [2238, 2, 17], u'Portuguese': [134268, 20, 141], u'Chinese': [2731, 1, 9], u'Czech': [5873, 1, 9], u'Japanese': [21305, 7, 56], u'Galician': [1633, 1, 1], u'German': [9375, 4, 18], u'Spanish': [345293, 37, 316], u'Polish': [665644, 7, 49], u'Arabic': [1291220, 16, 77]}, '2012': {u'Swedish': [12580, 2, 11], u'Icelandic': [1032, 1, 4], u'Estonian': [15713, 3, 29], u'Turkish': [67122, 2, 33], u'Romanian': [20512, 5, 28], u'Azerbaijani': [2595, 1, 6], u'Hindi': [5012, 2, 5], u'Dutch': [14122, 4, 20], u'Korean': [100274, 28, 188], u'Bulgarian': [67320, 4, 34], u'Hungarian': [3769, 5, 5], u'Ukrainian': [11251, 8, 19], u'French': [1105191, 47, 263], u'Catalan': [8536, 4, 35], u'Russian': [67859, 17, 145], u'Thai': [255, 1, 1], u'Croatian': [18999, 1, 19], u'Slovene': [21458, 3, 17], u'Indonesian': [88872, 5, 15], u'Greek': [59004, 1, 25], u'English': [17888456, 711, 6303], u'Italian': [104904, 12, 95], u'Portuguese': [358805, 52, 390], u'Chinese': [79105, 11, 45], u'Czech': [13279, 3, 16], u'Japanese': [269201, 6, 78], u'Galician': [9443, 1, 4], u'German': [27508, 6, 52], u'Slovak': [29801, 3, 29], u'Spanish': [565416, 59, 432], u'Urdu': [34014, 2, 17], u'Polish': [55464, 2, 33], u'Arabic': [933497, 24, 142]}}\n"
]
}
],
"prompt_number": 61
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# confirmation\n",
"sum = 0\n",
"for i in year_array:\n",
" if tedx_year[i].has_key('Arabic'):\n",
" sum += tedx_year[i]['Arabic'][0]\n",
"sum == tedx_lang['Arabic'][0]\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 62,
"text": [
"True"
]
}
],
"prompt_number": 62
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Filling up missing values for keeping the same size of data each year\n",
"keys = []\n",
"tedx_test = tedx_year\n",
"tedx_test['2011']['Galician'] = [0,0,-1]\n",
"tedx_test['2011']['Malay'] = [0,0,-1]\n",
"tedx_test['2011']['Thai'] = [0,0,-1]\n",
"tedx_test['2011']['Lithuanian'] = [0,0,-1]\n",
"\n",
"keys.append(tedx_test['2011'].keys())\n",
"\n",
"for i in year_array:\n",
" sub_keys = []\n",
" # calculate missing language\n",
" sub_keys.append((list(set(tedx_test['2011'].keys()) - set(tedx_test[i].keys()))))\n",
" for j in sub_keys[0]:\n",
" # filling up dummy variable (0,0,0)\n",
" tedx_test[i][j] = [0,0,-1]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Data Formating \n",
"\n",
"tedx= {'2009':[],'2010':[],'2011':[],'2012':[],'2013':[]}\n",
"\n",
"for i in year_array:\n",
" for j in tedx_test[i].keys():\n",
" for lang in keys[0]:\n",
" if j == lang:\n",
"\n",
" tedx[i].append({'language':str(j),'viewcount':tedx_test[i][j][0], 'eventcount':tedx_test[i][j][1], 'videocount':tedx_test[i][j][2]})\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 64
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"1-2\"></span>"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"2. Visualization"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.display import HTML\n",
"HTML('<iframe src=\"http://96chany.com/projects/tedx_popularity\" width=\"1000\" height=\"800\"></iframe>')\n",
"\n",
"# you can navigate years by sliding 'year' digits"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<iframe src=\"http://96chany.com/projects/tedx_popularity\" width=\"1000\" height=\"800\"></iframe>"
],
"output_type": "pyout",
"prompt_number": 65,
"text": [
"<IPython.core.display.HTML at 0x63d9f90>"
]
}
],
"prompt_number": 65
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"2\"></span>\n",
"<a href=\"#outline\">Go to Top </a>"
]
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"Comparing TEDx with TED"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"2-1\"></span>"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"1. Data Preparation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import csv\n",
"o = open('ted.csv','rU')\n",
"o.seek(0)\n",
"data = csv.reader(o)\n",
"\n",
"ted = {}\n",
"for row in data:\n",
" ted[str(row[0][:row[0].find('\\xe6')])] =int(row[0][row[0].find('\\xe6')+2:-1])\n",
" "
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 66
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"tedx_total = tedx_lang\n",
"\n",
"for i in tedx_total.keys():\n",
" tedx_total[str(i)] = tedx_total[i][2]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 67
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# figure out languages not in tedx, but in ted\n",
"print set(ted.keys()) - set(tedx_total.keys())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"set(['Hupa', 'Swahili', 'Latvian', 'Telugu', 'Vietnamese', 'Marathi', 'Slovenian', 'Gujarati', 'Lao', 'Swedish Chef', 'Kyrgyz', 'Luxembourgish', 'Malagasy', 'Danish', 'Khmer', 'Klingon', 'Norwegian Bokmal', 'Norwegian Nynorsk', 'Bosnian', 'Georgian', 'Chinese, Traditional', 'French, Canadian', 'Armenian', 'Serbo-Croatian', 'Maltese', 'Portuguese, Brazilian', 'Afrikaans', 'Tibetan', 'Cebuano', 'Bengali', 'Kurdish', 'Ingush', 'Uyghur', 'Nepali', 'Filipino', 'Uzbek', 'Algerian Arabic', 'Albanian', 'Irish', 'Burmese', 'Asturian', 'Tagalog', 'Serbian', 'Malayalam', 'Chinese, Yue', 'Assamese', 'Hausa', 'Kazakh', 'Macedo', 'Latin', 'Creole, Haitian', 'Chinese, Simplified', 'Bislama', 'Belarusian', 'Kannada', 'Amharic', 'Macedonian', 'Persian', 'Tajik', 'Mongolian', 'Basque', 'Esperanto', 'Occitan', 'Sinhala'])\n"
]
}
],
"prompt_number": 68
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# combining one variable with similar langauge \n",
"# e.g. Chinese, Chinese, Traditional, Chinese, Yue, Chinese, Simplified\n",
"ted_total = ted\n",
"ted_total['Chinese'] = 0\n",
"ted_total['Creole'] = 0\n",
"\n",
"for key in ted_total.keys():\n",
" if key.find(',') > 0:\n",
" ted_total[key[:key.find(',')]] += ted_total[key]\n",
"for key in ted_total.keys():\n",
" if key.find(',') > 0:\n",
" print key\n",
" ted_total.pop(key,None)\n",
"ted_total['Norwegian'] = ted_total['Norwegian Bokmal'] + ted_total['Norwegian Nynorsk'] \n",
"ted_total['Arabic'] += ted_total['Algerian Arabic']\n",
"ted_total['Swedish'] += ted_total['Swedish Chef']\n",
"ted_total.pop('Norwegian Bokmal',None)\n",
"ted_total.pop('Norwegian Nynorsk',None)\n",
"ted_total.pop('Algerian Arabic',None)\n",
"ted_total.pop('Swedish Chef',None)\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 70,
"text": [
"1"
]
}
],
"prompt_number": 70
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print set(ted_total.keys()) - set(tedx_total.keys())\n",
"print set(tedx_total.keys()) - set(ted_total.keys())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"set(['Hupa', 'Swahili', 'Latvian', 'Telugu', 'Vietnamese', 'Marathi', 'Slovenian', 'Gujarati', 'Kyrgyz', 'Creole', 'Lao', 'Malagasy', 'Danish', 'Khmer', 'Klingon', 'Latin', 'Luxembourgish', 'Bosnian', 'Georgian', 'Norwegian', 'Armenian', 'Maltese', 'Assamese', 'Afrikaans', 'Tibetan', 'Cebuano', 'Bengali', 'Kurdish', 'Ingush', 'Uyghur', 'Nepali', 'Filipino', 'Uzbek', 'Serbo-Croatian', 'Albanian', 'Burmese', 'Asturian', 'Tagalog', 'Serbian', 'Malayalam', 'Hausa', 'Irish', 'Macedo', 'Kazakh', 'Bislama', 'Belarusian', 'Kannada', 'Amharic', 'Macedonian', 'Persian', 'Tajik', 'Mongolian', 'Basque', 'Esperanto', 'Occitan', 'Sinhala'])\n",
"set([u'Rajasthani', u'Slovene'])\n"
]
}
],
"prompt_number": 71
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# filling the missing values (TED) <-- (TEDx)\n",
"ted_total['Rajasthani']=0\n",
"ted_total['Slovene']=0"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 72
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# filling the missing values (TEDx) <-- (TED)\n",
"\n",
"keys = []\n",
"keys.append(ted_total.keys())\n",
"\n",
"sub_keys = []\n",
"# calculate missing language\n",
"sub_keys.append((list(set(ted_total.keys()) - set(tedx_total.keys()))))\n",
"for j in sub_keys[0]:\n",
" # filling up dummy variable (0)\n",
" tedx_total[j] = 0"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 73
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# formatting\n",
"f = open(\"ted_tedx.csv\",\"w\")\n",
"print >>f, \"Language, TED, TEDx\"\n",
"for i in keys[0]:\n",
" print >>f, i,\",\",ted_total[i],\",\",tedx_total[i]\n",
"\n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 74
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span id=\"2-2\"></span>"
]
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"2. Visualization"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from IPython.display import HTML\n",
"HTML('<iframe src=\"http://96chany.com/projects/tedx_comparison\" width=\"1200\" height=\"1000\"></iframe>')\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<iframe src=\"http://96chany.com/projects/tedx_comparison\" width=\"1200\" height=\"1000\"></iframe>"
],
"output_type": "pyout",
"prompt_number": 77,
"text": [
"<IPython.core.display.HTML at 0x63f3490>"
]
}
],
"prompt_number": 77
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment