Skip to content

Instantly share code, notes, and snippets.

@cds-amal
Created March 24, 2015 03:49
Show Gist options
  • Save cds-amal/25d4ac69c0f309455f02 to your computer and use it in GitHub Desktop.
Save cds-amal/25d4ac69c0f309455f02 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'\n",
"t1 = pd.read_csv(fn, header=0)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from tidylib import tidy_document\n",
"import html2text"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def cleanup(txt):\n",
" doc, errors = tidy_document(str(txt))\n",
" doc = html2text.html2text(doc)\n",
" return doc.replace('*','').replace('_','').lower().strip()\n",
" \n",
"def add_clean_text(row):\n",
" row['scrape'] = cleanup(row.AdditionalDescription)\n",
" return row"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"t1_fix = t1.apply(add_clean_text,1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Mayor's Office of Contract Services 67\n",
"Community Boards 36\n",
"Citywide Administrative Services 35\n",
"Landmarks Preservation Commission 18\n",
"Housing Preservation and Development 18\n",
"Human Resources Administration 17\n",
"Transportation 16\n",
"Health and Mental Hygiene 15\n",
"Administration for Children's Services 14\n",
"City Planning 13\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# I only care about these columns for now\n",
"cols = ['RequestID', 'ConfirmationNumber', 'AgencyCode', 'AgencyName',\n",
" 'AgencyDivision', 'SectionID', 'SectionName', 'scrape']\n",
"\n",
"fixed = t1_fix[cols]\n",
"\n",
"# top 10: breakdown of ads by agencies\n",
"fixed['AgencyName'].value_counts()[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Focus on parsing the Mayor's Office of Contract of Services ads.**"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# select Office of Contract Services Ads\n",
"mocs = fixed['AgencyName'] == \"Mayor's Office of Contract Services\"\n",
"mocs_ads = fixed[mocs][['RequestID', 'scrape']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Example entries**\n",
"\n",
" __public meeting notice__\n",
" \n",
" public notice is hereby given that the \n",
" franchise and concession review committee will hold a \n",
" public meeting on wednesday, october 8, 2014 at 2:30 p.m., \n",
" at 22 reade street, spector hall, borough of manhattan. \n",
" \n",
"\n",
"Fields to extract | description | Parsing Status\n",
"------------------| -----------------| --------------\n",
"datetime | meeting datetime | working\n",
"\n",
"__Notice of intent to extend contract__\n",
"\n",
" vendor: accenture llp\n",
" description of services: design, development and deployment of application\n",
" enhancements and extensions to the existing apt system along with the\n",
" appropriate documentation required.\n",
"\n",
" award method of original contract: intergovernmental\n",
" fms contract type: consultant\n",
" end date of original contract: 1/31/2015 \n",
" method of renewal/extenction the agency intends to utilize: extension\n",
" new start date of the proposed renewed/extended contract: 2/1/15\n",
" new end date of the proposed renewed/extended contract: 7/31/15\n",
" modifications sought to the nature of services performed under the contract: none\n",
" reason(s) the agency intends to renew/extend the contract: continuation of services\n",
" personnel in substantially similar titles within agency: apt project manager –\n",
" 1; apt technical lead – 1; apt developer - 2\n",
" headcount of personnel in substantially similar titles within agency: 4\n",
"\n",
"\n",
"Fields to extract |desc | Parsing Status\n",
"----------------------------------------------------------------------------|-----|---------------\n",
"vendor | - | needs test \n",
"description of services | - | needs test \n",
"award method of origian contract | - | needs test \n",
"fms contract type | - | needs test \n",
"end date of original contract | - | needs test \n",
"method of renewalextension | - | needs test \n",
"new start date of proposed renewed/extended contract | - | needs test \n",
"new end date of proposed renewed/extended contract | - | needs test \n",
"modifications sought to the nature of services performed under the contract | - | needs test \n",
"reason(s) the agency intends to renew/extend the contract | - | needs test \n",
"personnel in substantially similar titles within agency | - | debugging & needs test \n",
"headcount of personnel in substantially similar titles within agency | - | needs test \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# build a regex for public meeting notice\n",
"rex_time = 'on\\s+(?P<day>[^\\s,]+)[\\s,]\\s+(?P<month>[^\\s,]+)\\s+(?P<date>\\d+)[,\\s]+(?P<year>\\d+)\\s+(at)?\\s*(?P<hour>\\d+):(?P<minute>\\d+)\\s+(?P<tod>\\w+\\.?\\w+\\.?)'\n",
"rex_time = re.compile(rex_time, re.IGNORECASE|re.DOTALL|re.MULTILINE)\n",
"\n",
"# build regexes to parse Notice of intent to extend contract ads\n",
"extend_contract_keys = [\n",
" 'vendor',\n",
" 'description of services', \n",
" 'award method of origian contract',\n",
" 'fms contract type',\n",
" 'end date of original contract',\n",
" 'method of renewalextension',\n",
" 'new start date of proposed renewed/extended contract',\n",
" 'new end date of proposed renewed/extended contract',\n",
" 'modifications sought to the nature of services performed under the contract',\n",
" 'reason(s) the agency intends to renew/extend the contract',\n",
" 'personnel in substantially similar titles within agency',\n",
" 'headcount of personnel in substantially similar titles within agency'\n",
"]\n",
"\n",
"rex_mm_dd_year = '(?P<month>\\d{1,2})/(?P<day>\\d{1,2})/(?P<year>\\d{2,4})'\n",
"rex_extend_contract = {\n",
" 'vendor' : 'vendor:\\s+(?P<vendor>[^\\n]+$)\\n',\n",
" 'description of services' : 'description\\sof\\sservices:\\s+(?P<text>.+)\\n+award',\n",
" 'award method of origian contract': 'award\\smethod\\sof\\soriginal\\scontract:\\s+(?P<text>[^\\n]+)',\n",
" 'fms contract type' : 'fms\\scontract\\stype:\\s+(?P<text>[^\\n]+$)\\n',\n",
" 'end date of original contract': 'end\\sdate\\sof\\soriginal\\scontract:\\s' + rex_mm_dd_year,\n",
" 'method of renewalextension': 'method\\sof\\srenewal/extension[^:]+:\\s(?P<text>[^\\n]+$)\\n',\n",
" 'new start date of proposed renewed/extended contract': 'new\\sstart\\sdate\\sof\\sthe\\sproposed\\srenewed.extended\\scontract:\\s' + rex_mm_dd_year,\n",
" 'new end date of proposed renewed/extended contract': 'new\\send\\sdate\\sof\\sthe\\sproposed\\srenewed.extended\\scontract:\\s' + rex_mm_dd_year,\n",
" 'modifications sought to the nature of services performed under the contract': 'modifications\\ssought\\sto\\sthe\\snature\\sof\\sservices\\sperformed\\sunder\\sthe\\scontract:\\s+(?P<text>.+)\\n+reason',\n",
" 'reason(s) the agency intends to renew/extend the contract': 'reason\\(s\\)\\sthe\\sagency\\sintends\\sto\\srenew/extend\\sthe\\scontract:\\s(?P<text>.+)\\n+personnel',\n",
" \n",
" # todo - debug this regex. not working on all the dataset\n",
" # \n",
" 'personnel in substantially similar titles within agency': '^personnel\\sin\\ssubstantially\\ssimilar\\stitles\\swithin\\sagency:\\s+(?P<text>(none\\s?|.+))\\n(headcount)?',\n",
" 'headcount of personnel in substantially similar titles within agency': 'headcount\\sof\\spersonnel\\sin\\ssubstantially\\ssimilar\\stitles\\swithin\\sagency:\\s+(?P<text>\\d+)\\n'\n",
"}\n",
"for k,v in rex_extend_contract.items():\n",
" rex_extend_contract[k] = re.compile(v, re.IGNORECASE|re.MULTILINE|re.DOTALL)\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"mocs = fixed['AgencyName'] == \"Mayor's Office of Contract Services\"\n",
"mocs_ads = fixed[mocs][['RequestID', 'scrape']]\n"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['apt project manager –\\n1; apt technical lead – 1; apt developer - 2\\n', 'apt project manager –\\n1; apt technical lead – 1; apt developer - 2\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['architect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of brooklyn and\\nstaten island\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of the bronx and\\nmanhattan\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof corlears hook comfort station located in the borough of manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof nine comfort stations located in the boroughs of the bronx and manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof eight comfort stations located in the boroughs of brooklyn and staten\\nisland\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors', 'architect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of brooklyn and\\nstaten island\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of the bronx and\\nmanhattan\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof corlears hook comfort station located in the borough of manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof nine comfort stations located in the boroughs of the bronx and manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof eight comfort stations located in the boroughs of brooklyn and staten\\nisland\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none', 'none', 'headcount']\n",
"['none ', 'none ', '']\n",
"['none ', 'none ', '']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"['none\\n', 'none\\n', 'headcount']\n",
"\n"
]
},
{
"ename": "IndexError",
"evalue": "('string index out of range', 'occurred at index 121')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-102-8007c9279f54>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mt1_parsed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmocs_ads\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Volumes/Sofai/python-projects/nltk3/lib/python3.4/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 3687\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3688\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3689\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3690\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3691\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Volumes/Sofai/python-projects/nltk3/lib/python3.4/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 3777\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3778\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3779\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3780\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3781\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-102-8007c9279f54>\u001b[0m in \u001b[0;36mparse\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mtmp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'none'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 39\u001b[0;31m \u001b[0mtmp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtmp\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 40\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIndexError\u001b[0m: ('string index out of range', 'occurred at index 121')"
]
}
],
"source": [
"def parse_regs(txt):\n",
" txt = re.sub(' +', ' ', txt)\n",
" txt = re.sub('\\n{3,}', '\\n', txt)\n",
" \n",
" if 'public notice is hereby given that' in txt:\n",
" return 100, json.dumps(rex_time.findall(txt))\n",
" elif 'notice of intent to extend contract(s)' in txt or \\\n",
" 'notice of intent to issue new solicitation' in txt:\n",
" out = {}\n",
" for k in extend_contract_keys:\n",
" match = rex_extend_contract[k].findall(txt)\n",
" if len(match) and isinstance(match[0], str):\n",
" match = [match[0].strip()]\n",
" if len(match):\n",
" out[k] = match[0]\n",
" return 101, json.dumps(out)\n",
" else:\n",
" return 999,json.dumps(None)\n",
" \n",
"def parse(row):\n",
" id, row['result'] = parse_regs(row.scrape)\n",
" data = json.loads(row['result'])\n",
" if id == 100:\n",
" # not interested in parsing this for now\n",
" row['extracted_date'] = ''\n",
" for k in extend_contract_keys:\n",
" row[k] = ''\n",
"\n",
" elif id == 101:\n",
" row['extracted_date'] = ''\n",
" if isinstance(data, dict):\n",
" for k in extend_contract_keys:\n",
" row[k] = json.dumps(data.get(k,''))\n",
" if k == 'personnel in substantially similar titles within agency':\n",
" tmp = json.loads(row[k])\n",
" print (tmp)\n",
" if 'none' in tmp:\n",
" tmp = 'none'\n",
" else:\n",
" tmp = tmp[0]\n",
" row[k] = json.dumps(tmp)\n",
" else:\n",
" for k in extend_contract_keys:\n",
" row[k] = ''\n",
" else:\n",
" row['extracted_date'] = ''\n",
" for k in extend_contract_keys:\n",
" row[k] = ''\n",
" \n",
" return row\n",
"\n",
"t1_parsed = mocs_ads.apply(parse,1)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>RequestID</th>\n",
" <th>personnel in substantially similar titles within agency</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20131104111</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20131104112</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20131104113</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>20140930106</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>20141001102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>60</th>\n",
" <td>20141001124</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>20141001125</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>20141002106</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>20141002107</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69</th>\n",
" <td>20141002108</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>20141003102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75</th>\n",
" <td>20141003105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>20141003107</td>\n",
" <td>\"none\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>20141006106</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>20141007107</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>20141008104</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>20141008110</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>20141008112</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>20141009107</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>121</th>\n",
" <td>20141014101</td>\n",
" <td>\"\\\"\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>133</th>\n",
" <td>20141016105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>20141021105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>157</th>\n",
" <td>20141021107</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>182</th>\n",
" <td>20141023111</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>194</th>\n",
" <td>20141027107</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>205</th>\n",
" <td>20141029102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224</th>\n",
" <td>20141103103</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>225</th>\n",
" <td>20141103104</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>20141103109</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>20141114110</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>331</th>\n",
" <td>20141125102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>334</th>\n",
" <td>20141125105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>339</th>\n",
" <td>20141125111</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>355</th>\n",
" <td>20141201101</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>357</th>\n",
" <td>20141201103</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>368</th>\n",
" <td>20141202105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>371</th>\n",
" <td>20141202108</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>373</th>\n",
" <td>20141202110</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>401</th>\n",
" <td>20141204117</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>405</th>\n",
" <td>20141208101</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406</th>\n",
" <td>20141208102</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>411</th>\n",
" <td>20141209115</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>418</th>\n",
" <td>20141210103</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419</th>\n",
" <td>20141210104</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>420</th>\n",
" <td>20141210105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>421</th>\n",
" <td>20141210106</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>439</th>\n",
" <td>20141215102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>444</th>\n",
" <td>20141215113</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>446</th>\n",
" <td>20141216119</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>449</th>\n",
" <td>20141216123</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>452</th>\n",
" <td>20141217106</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>454</th>\n",
" <td>20141218102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>459</th>\n",
" <td>20141219107</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>460</th>\n",
" <td>20141219108</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>466</th>\n",
" <td>20141222103</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>467</th>\n",
" <td>20141223102</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>469</th>\n",
" <td>20141223108</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471</th>\n",
" <td>20141224104</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>472</th>\n",
" <td>20141224105</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" <tr>\n",
" <th>473</th>\n",
" <td>20141224106</td>\n",
" <td>\"[\"</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>67 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" RequestID personnel in substantially similar titles within agency\n",
"1 20131104111 \n",
"2 20131104112 \n",
"3 20131104113 \n",
"47 20140930106 \"[\" \n",
"55 20141001102 \"[\" \n",
"60 20141001124 \"[\" \n",
"61 20141001125 \"[\" \n",
"67 20141002106 \"[\" \n",
"68 20141002107 \"[\" \n",
"69 20141002108 \"[\" \n",
"73 20141003102 \"[\" \n",
"75 20141003105 \"[\" \n",
"77 20141003107 \"none\" \n",
"84 20141006106 \"[\" \n",
"91 20141007107 \"[\" \n",
"97 20141008104 \"[\" \n",
"103 20141008110 \"[\" \n",
"104 20141008112 \"[\" \n",
"107 20141009107 \"[\" \n",
"121 20141014101 \"\\\"\" \n",
"133 20141016105 \"[\" \n",
"155 20141021105 \"[\" \n",
"157 20141021107 \n",
"182 20141023111 \"[\" \n",
"194 20141027107 \"[\" \n",
"205 20141029102 \"[\" \n",
"224 20141103103 \"[\" \n",
"225 20141103104 \"[\" \n",
"229 20141103109 \"[\" \n",
"270 20141114110 \"[\" \n",
".. ... ... \n",
"331 20141125102 \"[\" \n",
"334 20141125105 \"[\" \n",
"339 20141125111 \"[\" \n",
"355 20141201101 \"[\" \n",
"357 20141201103 \"[\" \n",
"368 20141202105 \"[\" \n",
"371 20141202108 \"[\" \n",
"373 20141202110 \"[\" \n",
"401 20141204117 \"[\" \n",
"405 20141208101 \"[\" \n",
"406 20141208102 \n",
"411 20141209115 \"[\" \n",
"418 20141210103 \"[\" \n",
"419 20141210104 \"[\" \n",
"420 20141210105 \"[\" \n",
"421 20141210106 \"[\" \n",
"439 20141215102 \"[\" \n",
"444 20141215113 \"[\" \n",
"446 20141216119 \"[\" \n",
"449 20141216123 \n",
"452 20141217106 \"[\" \n",
"454 20141218102 \"[\" \n",
"459 20141219107 \"[\" \n",
"460 20141219108 \"[\" \n",
"466 20141222103 \"[\" \n",
"467 20141223102 \"[\" \n",
"469 20141223108 \"[\" \n",
"471 20141224104 \"[\" \n",
"472 20141224105 \"[\" \n",
"473 20141224106 \"[\" \n",
"\n",
"[67 rows x 2 columns]"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import csv\n",
"headers = [ 'RequestID',\n",
"# 'vendor',\n",
"# 'description of services', \n",
"# 'award method of origian contract',\n",
"# 'fms contract type',\n",
"# 'end date of original contract',\n",
"# 'method of renewalextension',\n",
"# 'new start date of proposed renewed/extended contract',\n",
"# 'new end date of proposed renewed/extended contract',\n",
"# 'modifications sought to the nature of services performed under the contract',\n",
"# 'reason(s) the agency intends to renew/extend the contract',\n",
" 'personnel in substantially similar titles within agency'\n",
"# 'headcount of personnel in substantially similar titles within agency'\n",
" ]\n",
"\n",
"output = t1_parsed[headers]\n",
"output.to_csv(open('foobar.csv','w', encoding='utf-8'), quoting=csv.QUOTE_ALL, sep=',',encoding='utf-8',escapechar='\\\\')\n",
"output"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment