Skip to content

Instantly share code, notes, and snippets.

@decarboxy
Created January 2, 2015 22:03
Show Gist options
  • Save decarboxy/a03994320d3ded48c401 to your computer and use it in GitHub Desktop.
Save decarboxy/a03994320d3ded48c401 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "",
"signature": "sha256:6335ee7f60fa3d7e18bcc12078dae74109eed1eefd9f8db3173673911bd82efc"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from time import gmtime, strftime\n",
"from sys import argv\n",
"import re\n",
"from collections import defaultdict"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 48
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Word -> number of occurrences over all messages; unseen words start at 0\n",
"overall_count = defaultdict(int)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Holds one (timestamp, month_year, words) tuple per input line once the file is read\n",
"all_statuses = list()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 50
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Rebuild the list here so re-running this cell does not append duplicates.\n",
"all_statuses = []\n",
"with open(\"messages_trimmed.txt\") as infile:\n",
"    for raw_line in infile:\n",
"        # Skip blank lines: int() on an empty field would raise ValueError.\n",
"        if not raw_line.strip():\n",
"            continue\n",
"        # The first comma-separated field is an epoch timestamp and the rest is\n",
"        # the message. Split on the first comma only, since mid-sentence commas\n",
"        # inside the message must stay part of it.\n",
"        timestamp, _sep, message = raw_line.partition(\",\")\n",
"        month_year = strftime(\"%B_%Y\", gmtime(int(timestamp)))\n",
"        # Strip every non-word character from each whitespace-separated token.\n",
"        # Punctuation-only tokens collapse to an empty string and are kept.\n",
"        words_stripped = [re.sub(r'\\W+', '', token) for token in message.split()]\n",
"        all_statuses.append((timestamp, month_year, words_stripped))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 51
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Start from an empty tally so re-running this cell does not double-count.\n",
"overall_count = defaultdict(int)\n",
"for _timestamp, _month_year, tokens in all_statuses:\n",
"    for token in tokens:\n",
"        # Case-insensitive tally: differently-cased spellings share one entry.\n",
"        overall_count[token.lower()] += 1"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 52
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Forty most frequent tokens, highest count first\n",
"sorted(overall_count.items(), key=lambda pair: pair[1], reverse=True)[:40]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 53,
"text": [
"[('to', 2921),\n",
" ('the', 2652),\n",
" ('fix', 1979),\n",
" ('for', 1600),\n",
" ('add', 1234),\n",
" ('in', 1155),\n",
" ('and', 1104),\n",
" ('a', 998),\n",
" ('unit', 949),\n",
" ('of', 928),\n",
" ('from', 791),\n",
" ('test', 757),\n",
" ('tests', 736),\n",
" ('', 711),\n",
" ('added', 524),\n",
" ('remove', 512),\n",
" ('with', 456),\n",
" ('merge', 398),\n",
" ('policy', 385),\n",
" ('update', 373),\n",
" ('on', 357),\n",
" ('is', 348),\n",
" ('use', 344),\n",
" ('flake8', 331),\n",
" ('that', 328),\n",
" ('fixed', 325),\n",
" ('pep8', 320),\n",
" ('not', 313),\n",
" ('new', 313),\n",
" ('change', 301),\n",
" ('bug', 291),\n",
" ('job', 291),\n",
" ('when', 290),\n",
" ('fixes', 278),\n",
" ('scan', 276),\n",
" ('more', 270),\n",
" ('an', 268),\n",
" ('api', 257),\n",
" ('vendor', 255),\n",
" ('it', 248)]"
]
}
],
"prompt_number": 53
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Manually remove the blank string and filler words like \"in\" and \"for\" to get the top 10; \"tests\" is counted together with \"test\" in the monthly tally below."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Hand-picked from the frequency table above (blank token and filler words skipped)\n",
"top_10_words = [\n",
"    'fix', 'add', 'unit', 'test', 'added',\n",
"    'remove', 'merge', 'policy', 'update', 'flake8',\n",
"]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 54
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def top10_dict():\n",
"    \"\"\"Return a new dict mapping every word in top_10_words to 0.\"\"\"\n",
"    # The dict must be returned; building it without a return yields None.\n",
"    return {k: 0 for k in top_10_words}"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 55
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# month label -> {tracked word -> occurrences in that month}\n",
"monthly_map = defaultdict(dict)\n",
"for _timestamp, month_year, words_stripped in all_statuses:\n",
"    # Tally this status on its own first, starting every tracked word at 0.\n",
"    word_count = {k: 0 for k in top_10_words}\n",
"    for word in words_stripped:\n",
"        word = word.lower()\n",
"        # Count the plural together with the singular.\n",
"        if word == \"tests\":\n",
"            word = \"test\"\n",
"        if word in word_count:\n",
"            word_count[word] += 1\n",
"    # Fold into the month's totals. The first status seen for a month must\n",
"    # contribute its real counts (possibly 0), not a hard-coded 1.\n",
"    month_totals = monthly_map[month_year]\n",
"    for key, count in word_count.items():\n",
"        month_totals[key] = month_totals.get(key, 0) + count"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 56
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Dump the per-month totals as tab-separated text, one (month, word) pair per row\n",
"with open(\"data_summary.txt\", \"w\") as summary_file:\n",
"    summary_file.write(\"month\\tword\\tcount\\n\")\n",
"    for month_label, month_totals in monthly_map.items():\n",
"        for tracked_word in month_totals:\n",
"            row = \"{}\\t{}\\t{}\\n\".format(month_label, tracked_word, month_totals[tracked_word])\n",
"            summary_file.write(row)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 57
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 57
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment