Created
January 2, 2015 22:03
-
-
Save decarboxy/a03994320d3ded48c401 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:6335ee7f60fa3d7e18bcc12078dae74109eed1eefd9f8db3173673911bd82efc" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from time import gmtime, strftime\n", | |
"from sys import argv\n", | |
"import re\n", | |
"from collections import defaultdict" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 48 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Lower-cased word -> number of occurrences across every message.\n", | |
"overall_count = defaultdict(int)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 49 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# One (time, month_year, words_stripped) tuple per message line.\n", | |
"all_statuses = []" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 50 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Parse <epoch seconds>,<message> lines into (time, month_year, words) tuples.\n", | |
"with open(\"messages_trimmed.txt\") as infile:\n", | |
"    for line in infile:\n", | |
"        if not line.strip():\n", | |
"            continue  # tolerate blank lines instead of crashing in int()\n", | |
"        # Only the first comma separates the timestamp from the message;\n", | |
"        # mid-sentence commas belong to the message text.\n", | |
"        time, _sep, message = line.partition(\",\")\n", | |
"        month_year = strftime(\"%B_%Y\", gmtime(int(time)))\n", | |
"        # Strip non-word characters; pure-punctuation tokens become ''.\n", | |
"        words_stripped = [re.sub(r'\\W+', '', x) for x in message.split()]\n", | |
"        all_statuses.append((time, month_year, words_stripped))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 51 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Tally case-insensitive word frequencies over every parsed message.\n", | |
"for time, month_year, words_stripped in all_statuses:\n", | |
"    for word in words_stripped:\n", | |
"        overall_count[word.lower()] += 1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 52 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"sorted(overall_count.items(), key=lambda pair: pair[1], reverse=True)[:40]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 53, | |
"text": [ | |
"[('to', 2921),\n", | |
" ('the', 2652),\n", | |
" ('fix', 1979),\n", | |
" ('for', 1600),\n", | |
" ('add', 1234),\n", | |
" ('in', 1155),\n", | |
" ('and', 1104),\n", | |
" ('a', 998),\n", | |
" ('unit', 949),\n", | |
" ('of', 928),\n", | |
" ('from', 791),\n", | |
" ('test', 757),\n", | |
" ('tests', 736),\n", | |
" ('', 711),\n", | |
" ('added', 524),\n", | |
" ('remove', 512),\n", | |
" ('with', 456),\n", | |
" ('merge', 398),\n", | |
" ('policy', 385),\n", | |
" ('update', 373),\n", | |
" ('on', 357),\n", | |
" ('is', 348),\n", | |
" ('use', 344),\n", | |
" ('flake8', 331),\n", | |
" ('that', 328),\n", | |
" ('fixed', 325),\n", | |
" ('pep8', 320),\n", | |
" ('not', 313),\n", | |
" ('new', 313),\n", | |
" ('change', 301),\n", | |
" ('bug', 291),\n", | |
" ('job', 291),\n", | |
" ('when', 290),\n", | |
" ('fixes', 278),\n", | |
" ('scan', 276),\n", | |
" ('more', 270),\n", | |
" ('an', 268),\n", | |
" ('api', 257),\n", | |
" ('vendor', 255),\n", | |
" ('it', 248)]" | |
] | |
} | |
], | |
"prompt_number": 53 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Manually remove the blank string and filler words like \"in\" and \"for\" to get the top 10 words." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Hand-picked from the frequency table above.\n", | |
"top_10_words = [\n", | |
"    'fix', 'add', 'unit', 'test', 'added',\n", | |
"    'remove', 'merge', 'policy', 'update', 'flake8',\n", | |
"]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 54 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def top10_dict():\n", | |
"    \"\"\"Return a new dict mapping each word in top_10_words to 0.\"\"\"\n", | |
"    # The dict used to be bound to a local and dropped, so callers got None.\n", | |
"    return {k: 0 for k in top_10_words}" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 55 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# month_year -> {top-10 word -> occurrences in that month's messages}.\n", | |
"monthly_map = defaultdict(dict)\n", | |
"for _timestamp, month_year, words_stripped in all_statuses:\n", | |
"    # Every month reports all top-10 words, so seed missing ones with 0\n", | |
"    # (seeding with 1 would credit words the first message never used).\n", | |
"    month_counts = monthly_map[month_year]\n", | |
"    for key in top_10_words:\n", | |
"        month_counts.setdefault(key, 0)\n", | |
"    for word in words_stripped:\n", | |
"        word = word.lower()\n", | |
"        # Fold the plural into the singular bucket.\n", | |
"        if word == \"tests\":\n", | |
"            word = \"test\"\n", | |
"        if word in month_counts:\n", | |
"            month_counts[word] += 1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 56 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Header row, then one tab-separated row per (month, word) pair.\n", | |
"with open(\"data_summary.txt\", \"w\") as outfile:\n", | |
"    outfile.write(\"month\\tword\\tcount\\n\")\n", | |
"    for label, counts in monthly_map.items():\n", | |
"        for word, count in counts.items():\n", | |
"            outfile.write(\"{}\\t{}\\t{}\\n\".format(label, word, count))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 57 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment