Skip to content

Instantly share code, notes, and snippets.

@Green-Guo
Created May 9, 2018 21:58
Show Gist options
  • Save Green-Guo/020fc4a4a7873054c075f47331a4c86a to your computer and use it in GitHub Desktop.
Save Green-Guo/020fc4a4a7873054c075f47331a4c86a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat_minor": 2,
"cells": [
{
"execution_count": 1,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "import jaydebeapi, jpype\nfrom ibmdbpy import IdaDataBase\nfrom ibmdbpy import IdaDataFrame\nimport pandas as pd # this package is for data processing\nimport numpy as np # this package is for numerical operations\nfrom numpy import eye, asarray, dot, sum, diag\nfrom numpy.linalg import svd\nfrom sklearn.decomposition import IncrementalPCA # matrix dimension reduction\nimport matplotlib.pyplot as plt; plt.rcdefaults() # data visualzation\nfrom collections import OrderedDict # this package is for array sequencing\nfrom warnings import filterwarnings as fw\nfrom sklearn.model_selection import train_test_split # machine learning\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import confusion_matrix\nfrom datetime import datetime\nfrom sklearn import metrics\nimport os"
},
{
"execution_count": 2,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "os.environ['CLASSPATH'] = \"/opt/ibm/jdbc/db2jcc4.jar\"\nargs='-Djava.class.path=%s' % os.environ['CLASSPATH']\njvm = jpype.getDefaultJVMPath()\njpype.startJVM(jvm, args)\nmyDB = 'jdbc:db2://dashdb-txn-flex-yp-dal10-100.services.dal.bluemix.net:50000/BLUDB'\nmyServerID = 'bluadmin'\nmyServerPWD = 'ZjQ1NTNlMjg5MmFk'\nconnection_string=myDB+':user='+myServerID+';password='+myServerPWD+\";\"\nidadb=IdaDataBase(dsn=connection_string)"
},
{
"execution_count": 3,
"cell_type": "code",
"metadata": {
"collapsed": true
},
"outputs": [
{
"execution_count": 3,
"metadata": {},
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>TABSCHEMA</th>\n <th>TABNAME</th>\n <th>OWNER</th>\n <th>TYPE</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>AMGDPR</td>\n <td>ACTIVITY_LOG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>1</th>\n <td>AMGDPR</td>\n <td>COLUMN_ASSET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>2</th>\n <td>AMGDPR</td>\n <td>TABLE_CONTEXT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>3</th>\n <td>COL_52056</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>4</th>\n <td>COL_52056</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>5</th>\n <td>COL_52056</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>6</th>\n <td>COL_52056</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>7</th>\n <td>COL_88072</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>8</th>\n <td>COL_88072</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>9</th>\n <td>COL_88072</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>10</th>\n <td>COL_88072</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>11</th>\n <td>COL_88072</td>\n <td>NE</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>12</th>\n <td>COL_88072</td>\n <td>NE_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>13</th>\n <td>DB2GSE</td>\n <td>GSE_AUTHORIZATION</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>14</th>\n <td>DB2GSE</td>\n <td>GSE_COORDINATE_SYSTEMS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>15</th>\n <td>DB2GSE</td>\n <td>GSE_DEFAULTS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>16</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODERS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>17</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODER_PARAMETERS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>18</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODING</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>19</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODING_PARAMETERS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>20</th>\n <td>DB2GSE</td>\n <td>GSE_GEOMETRY_COLUMNS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>21</th>\n <td>DB2GSE</td>\n <td>GSE_INDEX_SAVE</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>22</th>\n <td>DB2GSE</td>\n <td>GSE_SIZINGS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>23</th>\n <td>DB2GSE</td>\n <td>GSE_SPATIAL_REFERENCE_SYSTEMS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>24</th>\n <td>DB2GSE</td>\n <td>GSE_SRS_REPLICATED_AST</td>\n <td>DB2INST1</td>\n <td>S</td>\n </tr>\n <tr>\n <th>25</th>\n <td>DB2GSE</td>\n <td>GSE_UNITS_OF_MEASURE</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>26</th>\n <td>DB2GSE</td>\n <td>ST_COORDINATE_SYSTEMS</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>27</th>\n <td>DB2GSE</td>\n <td>ST_GEOCODERS</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>28</th>\n <td>DB2GSE</td>\n <td>ST_GEOCODER_PARAMETERS</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>29</th>\n <td>DB2GSE</td>\n <td>ST_GEOCODING</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>159</th>\n <td>wexNonPI</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>160</th>\n <td>wexNonPI</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>161</th>\n <td>wexNonPI</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>162</th>\n <td>wexNonPI</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>163</th>\n <td>wexPI</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>164</th>\n <td>wexPI</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>165</th>\n <td>wexPI</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>166</th>\n <td>wexPI</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>167</th>\n <td>wexPI</td>\n <td>NE</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>168</th>\n <td>wexPI</td>\n <td>NE_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>169</th>\n <td>wexTest</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>170</th>\n <td>wexTest</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>171</th>\n <td>wexTest</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>172</th>\n <td>wexTest</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>173</th>\n <td>wexTest1</td>\n <td>CATEGORY</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>174</th>\n <td>wexTest1</td>\n <td>CATEGORY_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>175</th>\n <td>wexTest1</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>176</th>\n <td>wexTest1</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>177</th>\n <td>wexTest1</td>\n <td>SUBCATEGORY</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>178</th>\n <td>wexTest1</td>\n <td>SUBCATEGORY_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>179</th>\n <td>wexTest1</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>180</th>\n <td>wexTest1</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>181</th>\n <td>wexTest2</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>182</th>\n <td>wexTest2</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>183</th>\n <td>wexTest2</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>184</th>\n <td>wexTest2</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>185</th>\n <td>wexTest5</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>186</th>\n <td>wexTest5</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>187</th>\n <td>wexTest5</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>188</th>\n <td>wexTest5</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n </tbody>\n</table>\n<p>189 rows \u00d7 4 columns</p>\n</div>",
"text/plain": " TABSCHEMA TABNAME OWNER TYPE\n0 AMGDPR ACTIVITY_LOG BLUADMIN T\n1 AMGDPR COLUMN_ASSET BLUADMIN T\n2 AMGDPR TABLE_CONTEXT BLUADMIN T\n3 COL_52056 DATE_FACET BLUADMIN T\n4 COL_52056 DOC_FACT BLUADMIN T\n5 COL_52056 MENTION BLUADMIN T\n6 COL_52056 MENTION_BRG BLUADMIN T\n7 COL_88072 DATE_FACET BLUADMIN T\n8 COL_88072 DOC_FACT BLUADMIN T\n9 COL_88072 MENTION BLUADMIN T\n10 COL_88072 MENTION_BRG BLUADMIN T\n11 COL_88072 NE BLUADMIN T\n12 COL_88072 NE_BRG BLUADMIN T\n13 DB2GSE GSE_AUTHORIZATION DB2INST1 T\n14 DB2GSE GSE_COORDINATE_SYSTEMS DB2INST1 T\n15 DB2GSE GSE_DEFAULTS DB2INST1 T\n16 DB2GSE GSE_GEOCODERS DB2INST1 T\n17 DB2GSE GSE_GEOCODER_PARAMETERS DB2INST1 T\n18 DB2GSE GSE_GEOCODING DB2INST1 T\n19 DB2GSE GSE_GEOCODING_PARAMETERS DB2INST1 T\n20 DB2GSE GSE_GEOMETRY_COLUMNS DB2INST1 T\n21 DB2GSE GSE_INDEX_SAVE DB2INST1 T\n22 DB2GSE GSE_SIZINGS DB2INST1 T\n23 DB2GSE GSE_SPATIAL_REFERENCE_SYSTEMS DB2INST1 T\n24 DB2GSE GSE_SRS_REPLICATED_AST DB2INST1 S\n25 DB2GSE GSE_UNITS_OF_MEASURE DB2INST1 T\n26 DB2GSE ST_COORDINATE_SYSTEMS DB2INST1 V\n27 DB2GSE ST_GEOCODERS DB2INST1 V\n28 DB2GSE ST_GEOCODER_PARAMETERS DB2INST1 V\n29 DB2GSE ST_GEOCODING DB2INST1 V\n.. ... ... ... ...\n159 wexNonPI DATE_FACET BLUADMIN T\n160 wexNonPI DOC_FACT BLUADMIN T\n161 wexNonPI MENTION BLUADMIN T\n162 wexNonPI MENTION_BRG BLUADMIN T\n163 wexPI DATE_FACET BLUADMIN T\n164 wexPI DOC_FACT BLUADMIN T\n165 wexPI MENTION BLUADMIN T\n166 wexPI MENTION_BRG BLUADMIN T\n167 wexPI NE BLUADMIN T\n168 wexPI NE_BRG BLUADMIN T\n169 wexTest DATE_FACET BLUADMIN T\n170 wexTest DOC_FACT BLUADMIN T\n171 wexTest WORD BLUADMIN T\n172 wexTest WORD_BRG BLUADMIN T\n173 wexTest1 CATEGORY BLUADMIN T\n174 wexTest1 CATEGORY_BRG BLUADMIN T\n175 wexTest1 DATE_FACET BLUADMIN T\n176 wexTest1 DOC_FACT BLUADMIN T\n177 wexTest1 SUBCATEGORY BLUADMIN T\n178 wexTest1 SUBCATEGORY_BRG BLUADMIN T\n179 wexTest1 WORD BLUADMIN T\n180 wexTest1 WORD_BRG BLUADMIN T\n181 wexTest2 DATE_FACET BLUADMIN T\n182 wexTest2 DOC_FACT BLUADMIN T\n183 wexTest2 WORD BLUADMIN T\n184 wexTest2 WORD_BRG BLUADMIN T\n185 wexTest5 DATE_FACET BLUADMIN T\n186 wexTest5 DOC_FACT BLUADMIN T\n187 wexTest5 WORD BLUADMIN T\n188 wexTest5 WORD_BRG BLUADMIN T\n\n[189 rows x 4 columns]"
},
"output_type": "execute_result"
}
],
"source": "df=idadb.show_tables(show_all=True)\ndf"
},
{
"execution_count": 11,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"execution_count": 11,
"metadata": {},
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>ID</th>\n <th>LEVEL1</th>\n <th>KEYWORD</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1548</td>\n <td>PERSON ...</td>\n <td>I ...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1549</td>\n <td>PERSON ...</td>\n <td>Pizza Hut ...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1550</td>\n <td>PERSON ...</td>\n <td>Chris Voss ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>1551</td>\n <td>PERSON ...</td>\n <td>Jeff Weiner ...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>1552</td>\n <td>PERSON ...</td>\n <td>Deepak Goel ...</td>\n </tr>\n <tr>\n <th>5</th>\n <td>1553</td>\n <td>PERSON ...</td>\n <td>Arianna Huffington ...</td>\n </tr>\n <tr>\n <th>6</th>\n <td>1554</td>\n <td>PERSON ...</td>\n <td>Geoffrey Colon ...</td>\n </tr>\n <tr>\n <th>7</th>\n <td>1555</td>\n <td>PERSON ...</td>\n <td>Vincent Washington ...</td>\n </tr>\n <tr>\n <th>8</th>\n <td>1556</td>\n <td>PERSON ...</td>\n <td>Keith Ferrazzi ...</td>\n </tr>\n <tr>\n <th>9</th>\n <td>1557</td>\n <td>PERSON ...</td>\n <td>Gary Vaynerchuk ...</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " ID LEVEL1 \\\n0 1548 PERSON ... \n1 1549 PERSON ... \n2 1550 PERSON ... \n3 1551 PERSON ... \n4 1552 PERSON ... \n5 1553 PERSON ... \n6 1554 PERSON ... \n7 1555 PERSON ... \n8 1556 PERSON ... \n9 1557 PERSON ... \n\n KEYWORD \n0 I ... \n1 Pizza Hut ... \n2 Chris Voss ... \n3 Jeff Weiner ... \n4 Deepak Goel ... \n5 Arianna Huffington ... \n6 Geoffrey Colon ... \n7 Vincent Washington ... \n8 Keith Ferrazzi ... \n9 Gary Vaynerchuk ... "
},
"output_type": "execute_result"
}
],
"source": "idadf = IdaDataFrame(idadb, 'COL_52056.MENTION')\nidadf.head(10)"
},
{
"execution_count": 12,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "df_sample = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))"
},
{
"execution_count": 17,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Uploading 100 rows (maxnrow was set to 2000)\n"
},
{
"execution_count": 17,
"metadata": {},
"data": {
"text/plain": "<ibmdbpy.frame.IdaDataFrame at 0x2ad69355aa20>"
},
"output_type": "execute_result"
}
],
"source": "idadb.as_idadataframe(df_sample)"
},
{
"execution_count": 19,
"cell_type": "code",
"metadata": {
"scrolled": true
},
"outputs": [
{
"execution_count": 19,
"metadata": {},
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>TABSCHEMA</th>\n <th>TABNAME</th>\n <th>OWNER</th>\n <th>TYPE</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>AMGDPR</td>\n <td>ACTIVITY_LOG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>1</th>\n <td>AMGDPR</td>\n <td>COLUMN_ASSET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>2</th>\n <td>AMGDPR</td>\n <td>TABLE_CONTEXT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>3</th>\n <td>COL_52056</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>4</th>\n <td>COL_52056</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>5</th>\n <td>COL_52056</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>6</th>\n <td>COL_52056</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>7</th>\n <td>COL_88072</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>8</th>\n <td>COL_88072</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>9</th>\n <td>COL_88072</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>10</th>\n <td>COL_88072</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>11</th>\n <td>COL_88072</td>\n <td>NE</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>12</th>\n <td>COL_88072</td>\n <td>NE_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>13</th>\n <td>DB2GSE</td>\n <td>GSE_AUTHORIZATION</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>14</th>\n <td>DB2GSE</td>\n <td>GSE_COORDINATE_SYSTEMS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>15</th>\n <td>DB2GSE</td>\n <td>GSE_DEFAULTS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>16</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODERS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>17</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODER_PARAMETERS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>18</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODING</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>19</th>\n <td>DB2GSE</td>\n <td>GSE_GEOCODING_PARAMETERS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>20</th>\n <td>DB2GSE</td>\n <td>GSE_GEOMETRY_COLUMNS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>21</th>\n <td>DB2GSE</td>\n <td>GSE_INDEX_SAVE</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>22</th>\n <td>DB2GSE</td>\n <td>GSE_SIZINGS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>23</th>\n <td>DB2GSE</td>\n <td>GSE_SPATIAL_REFERENCE_SYSTEMS</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>24</th>\n <td>DB2GSE</td>\n <td>GSE_SRS_REPLICATED_AST</td>\n <td>DB2INST1</td>\n <td>S</td>\n </tr>\n <tr>\n <th>25</th>\n <td>DB2GSE</td>\n <td>GSE_UNITS_OF_MEASURE</td>\n <td>DB2INST1</td>\n <td>T</td>\n </tr>\n <tr>\n <th>26</th>\n <td>DB2GSE</td>\n <td>ST_COORDINATE_SYSTEMS</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>27</th>\n <td>DB2GSE</td>\n <td>ST_GEOCODERS</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>28</th>\n <td>DB2GSE</td>\n <td>ST_GEOCODER_PARAMETERS</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>29</th>\n <td>DB2GSE</td>\n <td>ST_GEOCODING</td>\n <td>DB2INST1</td>\n <td>V</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>160</th>\n <td>wexNonPI</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>161</th>\n <td>wexNonPI</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>162</th>\n <td>wexNonPI</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>163</th>\n <td>wexPI</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>164</th>\n <td>wexPI</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>165</th>\n <td>wexPI</td>\n <td>MENTION</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>166</th>\n <td>wexPI</td>\n <td>MENTION_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>167</th>\n <td>wexPI</td>\n <td>NE</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>168</th>\n <td>wexPI</td>\n <td>NE_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>169</th>\n <td>wexTest</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>170</th>\n <td>wexTest</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>171</th>\n <td>wexTest</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>172</th>\n <td>wexTest</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>173</th>\n <td>wexTest1</td>\n <td>CATEGORY</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>174</th>\n <td>wexTest1</td>\n <td>CATEGORY_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>175</th>\n <td>wexTest1</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>176</th>\n <td>wexTest1</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>177</th>\n <td>wexTest1</td>\n <td>SUBCATEGORY</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>178</th>\n <td>wexTest1</td>\n <td>SUBCATEGORY_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>179</th>\n <td>wexTest1</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>180</th>\n <td>wexTest1</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>181</th>\n <td>wexTest2</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>182</th>\n <td>wexTest2</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>183</th>\n <td>wexTest2</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>184</th>\n <td>wexTest2</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>185</th>\n <td>wexTest5</td>\n <td>DATE_FACET</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>186</th>\n <td>wexTest5</td>\n <td>DOC_FACT</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>187</th>\n <td>wexTest5</td>\n <td>WORD</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>188</th>\n <td>wexTest5</td>\n <td>WORD_BRG</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n <tr>\n <th>189</th>\n <td>BLUADMIN</td>\n <td>DATA_FRAME_54907_1525901763</td>\n <td>BLUADMIN</td>\n <td>T</td>\n </tr>\n </tbody>\n</table>\n<p>190 rows \u00d7 4 columns</p>\n</div>",
"text/plain": " TABSCHEMA TABNAME OWNER TYPE\n0 AMGDPR ACTIVITY_LOG BLUADMIN T\n1 AMGDPR COLUMN_ASSET BLUADMIN T\n2 AMGDPR TABLE_CONTEXT BLUADMIN T\n3 COL_52056 DATE_FACET BLUADMIN T\n4 COL_52056 DOC_FACT BLUADMIN T\n5 COL_52056 MENTION BLUADMIN T\n6 COL_52056 MENTION_BRG BLUADMIN T\n7 COL_88072 DATE_FACET BLUADMIN T\n8 COL_88072 DOC_FACT BLUADMIN T\n9 COL_88072 MENTION BLUADMIN T\n10 COL_88072 MENTION_BRG BLUADMIN T\n11 COL_88072 NE BLUADMIN T\n12 COL_88072 NE_BRG BLUADMIN T\n13 DB2GSE GSE_AUTHORIZATION DB2INST1 T\n14 DB2GSE GSE_COORDINATE_SYSTEMS DB2INST1 T\n15 DB2GSE GSE_DEFAULTS DB2INST1 T\n16 DB2GSE GSE_GEOCODERS DB2INST1 T\n17 DB2GSE GSE_GEOCODER_PARAMETERS DB2INST1 T\n18 DB2GSE GSE_GEOCODING DB2INST1 T\n19 DB2GSE GSE_GEOCODING_PARAMETERS DB2INST1 T\n20 DB2GSE GSE_GEOMETRY_COLUMNS DB2INST1 T\n21 DB2GSE GSE_INDEX_SAVE DB2INST1 T\n22 DB2GSE GSE_SIZINGS DB2INST1 T\n23 DB2GSE GSE_SPATIAL_REFERENCE_SYSTEMS DB2INST1 T\n24 DB2GSE GSE_SRS_REPLICATED_AST DB2INST1 S\n25 DB2GSE GSE_UNITS_OF_MEASURE DB2INST1 T\n26 DB2GSE ST_COORDINATE_SYSTEMS DB2INST1 V\n27 DB2GSE ST_GEOCODERS DB2INST1 V\n28 DB2GSE ST_GEOCODER_PARAMETERS DB2INST1 V\n29 DB2GSE ST_GEOCODING DB2INST1 V\n.. ... ... ... ...\n160 wexNonPI DOC_FACT BLUADMIN T\n161 wexNonPI MENTION BLUADMIN T\n162 wexNonPI MENTION_BRG BLUADMIN T\n163 wexPI DATE_FACET BLUADMIN T\n164 wexPI DOC_FACT BLUADMIN T\n165 wexPI MENTION BLUADMIN T\n166 wexPI MENTION_BRG BLUADMIN T\n167 wexPI NE BLUADMIN T\n168 wexPI NE_BRG BLUADMIN T\n169 wexTest DATE_FACET BLUADMIN T\n170 wexTest DOC_FACT BLUADMIN T\n171 wexTest WORD BLUADMIN T\n172 wexTest WORD_BRG BLUADMIN T\n173 wexTest1 CATEGORY BLUADMIN T\n174 wexTest1 CATEGORY_BRG BLUADMIN T\n175 wexTest1 DATE_FACET BLUADMIN T\n176 wexTest1 DOC_FACT BLUADMIN T\n177 wexTest1 SUBCATEGORY BLUADMIN T\n178 wexTest1 SUBCATEGORY_BRG BLUADMIN T\n179 wexTest1 WORD BLUADMIN T\n180 wexTest1 WORD_BRG BLUADMIN T\n181 wexTest2 DATE_FACET BLUADMIN T\n182 wexTest2 DOC_FACT BLUADMIN T\n183 wexTest2 WORD BLUADMIN T\n184 wexTest2 WORD_BRG BLUADMIN T\n185 wexTest5 DATE_FACET BLUADMIN T\n186 wexTest5 DOC_FACT BLUADMIN T\n187 wexTest5 WORD BLUADMIN T\n188 wexTest5 WORD_BRG BLUADMIN T\n189 BLUADMIN DATA_FRAME_54907_1525901763 BLUADMIN T\n\n[190 rows x 4 columns]"
},
"output_type": "execute_result"
}
],
"source": "df=idadb.show_tables(show_all=True)\ndf"
},
{
"execution_count": 20,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"execution_count": 20,
"metadata": {},
"data": {
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>A</th>\n <th>B</th>\n <th>C</th>\n <th>D</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0</td>\n <td>27</td>\n <td>36</td>\n <td>99</td>\n </tr>\n <tr>\n <th>1</th>\n <td>1</td>\n <td>55</td>\n <td>37</td>\n <td>49</td>\n </tr>\n <tr>\n <th>2</th>\n <td>1</td>\n <td>74</td>\n <td>21</td>\n <td>49</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>87</td>\n <td>15</td>\n <td>87</td>\n </tr>\n <tr>\n <th>4</th>\n <td>4</td>\n <td>38</td>\n <td>60</td>\n <td>31</td>\n </tr>\n <tr>\n <th>5</th>\n <td>4</td>\n <td>70</td>\n <td>30</td>\n <td>10</td>\n </tr>\n <tr>\n <th>6</th>\n <td>6</td>\n <td>57</td>\n <td>59</td>\n <td>36</td>\n </tr>\n <tr>\n <th>7</th>\n <td>8</td>\n <td>20</td>\n <td>30</td>\n <td>92</td>\n </tr>\n <tr>\n <th>8</th>\n <td>8</td>\n <td>47</td>\n <td>68</td>\n <td>92</td>\n </tr>\n <tr>\n <th>9</th>\n <td>9</td>\n <td>98</td>\n <td>16</td>\n <td>53</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " A B C D\n0 0 27 36 99\n1 1 55 37 49\n2 1 74 21 49\n3 4 87 15 87\n4 4 38 60 31\n5 4 70 30 10\n6 6 57 59 36\n7 8 20 30 92\n8 8 47 68 92\n9 9 98 16 53"
},
"output_type": "execute_result"
}
],
"source": "idadf = IdaDataFrame(idadb, 'BLUADMIN.DATA_FRAME_54907_1525901763')\nidadf.head(10)"
},
{
"execution_count": 6,
"cell_type": "code",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Connection closed.\n"
}
],
"source": "idadb.close()"
},
{
"source": "***End of DB2 connection demo***",
"cell_type": "markdown",
"metadata": {}
},
{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "'''Under development'''\n\n# Import Document-Term Matrices - in Phase 2, import from DB2\ndtm = \\'DTM_run1_v1.0.xlsx'\nX = np.array(dtm)\nfw('ignore')\n\n# Perform Incremental PCA (memory-efficient)\n# This step is to reduce document term matrix to derive a fewer column matrix\n# where the column names comprise a list of features that are privacy related\n# Here column names are set to be no more than 10.\nipca = IncrementalPCA(n_components=10, batch_size=10)\nipca.fit(X)\nIncrementalPCA(batch_size=10, copy=True, n_components=10, whiten=False)\n\n# Varimax rotation on factor pattern - get most important features for PI\n# This is a standard procedure to ensure we extract the most relevant columns\n# Refer to more details: https://en.wikipedia.org/wiki/Varimax_rotation\ndef varimax(Phi, gamma = 1, q = 20, tol = 1e-6):\n p,k = Phi.shape\n R = eye(k)\n d=0\n for i in range(q):\n d_old = d\n Lambda = dot(Phi, R)\n u,s,vh = svd(dot(Phi.T,asarray(Lambda)**3 - (gamma/p) * dot(Lambda, diag(diag(dot(Lambda.T,Lambda))))))\n R = dot(u,vh)\n d = sum(s)\n if d/d_old < tol: break\n return dot(Phi, R)\n\n# Get top principal components whose cumulative variance explained > 95%\n# Normally if the top N components retains >=95% of the variance within\n# the original matrix, we say it retains all the information\nipca.transform(X)\nobjects = np.arange(1,11,1)\ny_pos = np.arange(len(objects))\nperformance = np.cumsum(np.round(ipca.explained_variance_ratio_, decimals=4)*100)\n\n# A scree plot which shows how many components to select\nfig = plt.figure()\nax = fig.add_subplot(111)\nplt.bar(y_pos, performance, align='center', alpha=0.5)\nplt.xticks(y_pos, objects)\nplt.ylabel('Variance')\nplt.title('Scree Plot of iPCA')\nax.axhline(95, color=\"gray\", ls='dashed')\n# A 95% dashed line is marked; so the # of components that first crossed over\n# this threshold is the correct # of features to extract\nplt.show()\n\n# Output a list of features for logit model use\nlogit_vars = [dtm.columns[i] for i in list(OrderedDict\\\n .fromkeys([list(i).index(max(i)) for i in varimax(ipca.components_)]))][:list(performance>=95).index(True)+1]\nprint(list(performance>=95).index(True)+1, \"components were selected. They are \", logit_vars)\n\nlogit_features = [dtm.iloc[:,i] for i in list(OrderedDict\\\n .fromkeys([list(i).index(max(i)) for i in varimax(ipca.components_)]))][:list(performance>=95).index(True)+1]\n\n# Reads training, testing data for logit modulation. The data we are pulling in here are the direct output of WEX CA\n# annotations (word mentions). Below, we are combining the annotations for both PD and non-PD documents to derive\n# a single DTM (Document Term Matrix) for logit model \nmention = pd.read_csv(r'mention.csv', header=None, encoding='ISO-8859-1').rename(columns={0:'##',1:'Category',2:'Mention'})\nmention_brg = pd.read_csv(r'mention_brg.csv', header=None, encoding='ISO-8859-1').rename(columns={0:'#',1:'##'})\nmention = pd.concat((mention_brg,mention), axis=1).drop(axis=1,labels='##').drop_duplicates()\ntrain = pd.pivot_table(mention, values='Mention', index='#', columns='Category', aggfunc=np.count_nonzero).fillna(0)\ntrain['y']=1\n\nmention_ = pd.read_csv(r'mention_.csv', header=None, encoding='ISO-8859-1').rename(columns={0:'##',1:'Category',2:'Mention'})\nmention_brg_ = pd.read_csv(r'mention_brg_.csv', header=None, encoding='ISO-8859-1').rename(columns={0:'#',1:'##'})\nmention_ = pd.concat((mention_brg_,mention_), axis=1).drop(axis=1,labels='##').drop_duplicates()\ntrain2 = pd.pivot_table(mention_, values='Mention', index='#', columns='Category', aggfunc=np.count_nonzero).fillna(0)\ntrain2['y']=0\n\ntrain = pd.concat((train, train2)).fillna(0) # form a single DTM (Document Term Matrix) for logit model \n\nX = train.iloc[:,:-1] # subset all 21 features for independent variables (X's)\ny = train.iloc[:, -1] # subset label (1 for PD, 0 otherwise) for dependent variable (y)\n\n# For machine learning, split DTM 70/30 for training, testing set respectively\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\nlogit = LogisticRegression() # instantiates a logistic function\nlogit.fit(X_train, y_train) # train logit on the 70% training set\ny_pred = logit.predict(X_test) # predict on testing set X's\naccuracy1 = logit.score(X_test, y_test) # accuracy scoring and print out\nprecision = metrics.precision_score(y_pred=logit.fit(X_train, y_train).predict_proba(X_test).round()[:,1],y_true=y_test)\nrecall = metrics.recall_score(y_pred=logit.fit(X_train, y_train).predict_proba(X_test).round()[:,1],y_true=y_test)\nf1 = metrics.f1_score(y_pred=logit.fit(X_train, y_train).predict_proba(X_test).round()[:,1],y_true=y_test)\nmodel = np.round([accuracy1,precision,recall,f1],decimals=2)\n\n# Same model building as above, but subsetting only 3 most important factors based on PCA\ntrain_features = X_train[np.intersect1d(X_train.columns, logit_vars)]\ntest_features = X_test[np.intersect1d(X_test.columns, logit_vars)]\nlogit1 = LogisticRegression(C=0.5, max_iter=100000, tol=1e-5, solver='lbfgs')\nlogit1.fit(train_features, y_train)\ny_pred2 = logit1.predict(test_features)\naccuracy2 = logit1.score(test_features, y_test)\nprecision1 = metrics.precision_score(logit.fit(train_features, y_train).predict_proba(test_features).round()[:,1],y_test)\nrecall1 = metrics.recall_score(logit.fit(train_features, y_train).predict_proba(test_features).round()[:,1],y_test)\nf11 = metrics.f1_score(logit.fit(train_features, y_train).predict_proba(test_features).round()[:,1],y_test)\nmodel1 = np.round([accuracy2,precision1,recall1,f11],decimals=2)\n\n# Same model building, but subsetting all features except 3 most important factors\ntrain_features1 = X_train[np.setdiff1d(X_train.columns, logit_vars)]\ntest_features1 = X_test[np.setdiff1d(X_test.columns, logit_vars)]\nlogit2 = LogisticRegression(C=0.5, max_iter=100000, tol=1e-5, solver='lbfgs')\nlogit2.fit(train_features1, y_train)\ny_pred3 = logit2.predict(test_features1)\naccuracy3 = logit2.score(test_features1, y_test)\nprecision2 = metrics.precision_score(logit.fit(train_features1, y_train).predict_proba(test_features1).round()[:,1],y_test)\nrecall2 = metrics.recall_score(logit.fit(train_features1, y_train).predict_proba(test_features1).round()[:,1],y_test)\nf12 = metrics.f1_score(logit.fit(train_features1, y_train).predict_proba(test_features1).round()[:,1],y_test)\nmodel2 = np.round([accuracy3,precision2,recall2,f12],decimals=2)\n\n# This step creates a new folder in current directory named \"output\", which will contain:\n# 1. a probability sheet (probability.csv) of predicted likelihood of containing PD by document;\n# 2. a diagnostic table (diagnostics.csv) of model performance and technical specifications\nos.makedirs(os.path.dirname('./output/'), exist_ok=True)\npd.DataFrame([proba[1] for proba in logit1.fit(train_features, y_train).predict_proba(test_features)],\\\n [proba[1]*0 for proba in logit1.fit(train_features, y_train).predict_proba(test_features)]).\\\nreset_index().rename(columns={'index':'Human_Decision', 0:'Predicted_Probability'}).to_csv('./output/probability.csv')\n\nacc1='Accuracy of document classifier using all 21 features: {:.2f}'.format(accuracy1)\nacc2='Accuracy when only using PCA-selected features: {:.2f}'.format(accuracy2)\nacc3='Accuracy when using unselected features: {:.2f}'.format(accuracy3)\nvars='{:.2f} components were selected. They are '.format(list(performance>=95).index(True)+1)\npd.DataFrame({'Model Diagnostic': list(('\\n','Report generated '+datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'\\n',\\\n 'Logistic Regression for Document Privacy Prediction','\\n',vars,logit_vars,'\\n',\\\n acc1,'\\n',acc2,'\\n',acc3))}).to_csv('./output/diagnostics.csv')\n\n# Draws confusion matrix\ncm = pd.DataFrame(confusion_matrix(y_test, y_pred2))\ncm.columns=['Predicted Non-PI','Predicted PI']\ncm.index=['Actual Non-PI','Actual PI']\n\ndef _color_red_or_green(val): # Configures confusion matrix\n color = 'white' if val <= 5 else 'green'\n return 'color: %s' % color\ncm.style.background_gradient(cmap='plasma', low=.5, high=0).highlight_null('red').applymap(_color_red_or_green)\\\n .set_properties(**{'font-size':'20pt'})"
},
{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "pd.DataFrame([model,model1,model2]).rename(index={0: 'All 21 features',1: 'PCA 3 features',2: 'Remaining features'},columns={0: 'Accuracy',1: 'Precision',2: 'Recall', 3: 'F1'}).style.apply(lambda x: ['background: lightgreen' if x.name == 'PCA 3 features' else '' for i in x], axis=1)"
},
{
"execution_count": null,
"cell_type": "code",
"metadata": {},
"outputs": [],
"source": "import scikitplot as skplt # use \"pip install scikitplot\" to install this specialty plotting package\n\n# Compute Brier score, a proper score function that measures the accuracy of probabilistic predictions\n# Go to http://www.matterofstats.com/mafl-stats-journal/2013/12/22/is-that-a-good-probability-score-the-brier-score-edition.html\n# for more information, which gives a desired Brier score range (lower than 0.175)\nprint(' is', metrics.brier_score_loss(y_test.values, logit.fit(train_features, y_train).predict_proba(test_features)[:,1]))\n\n# Plot receiver operating characteristic (ROC curve), which illustrates the diagnostic ability of the binary classifier\n# as its discrimination threshold is varied. https://en.wikipedia.org/wiki/Receiver_operating_characteristic\nskplt.metrics.plot_roc_curve(y_test.values, logit.fit(train_features1, y_train).predict_proba(test_features1),'Using all 21 features')\nskplt.metrics.plot_roc_curve(y_test.values, logit.fit(train_features, y_train).predict_proba(test_features),title='PCA')"
},
{
"source": "***This section contains additional, non-essential visualization***\n\nPlease refer to the previous section (first code chunk) for essential model components. This second code chunk is for demonstration purpose only, and mainly contains two receiver operating characteristic (ROC) curves, which illustrate the diagnostic ability of the binary classifier as its discrimination threshold is varied. The direct intrepretation is, in the first plot above, when all 21 features (assignment, date, location...) were using in the logit model, the classifier performs close to the baseline (dotted line) of ~50% accuracy, making it no better than random guessing. However, in the second plot above, when the 3 most important variables, name, email, organization, were surgically used, the true positive rate emerged much higher, making the model much more robust than when all features were used indiscriminately.",
"cell_type": "markdown",
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.5",
"name": "python3",
"language": "python"
},
"language_info": {
"mimetype": "text/x-python",
"nbconvert_exporter": "python",
"version": "3.5.4",
"name": "python",
"file_extension": ".py",
"pygments_lexer": "ipython3",
"codemirror_mode": {
"version": 3,
"name": "ipython"
}
}
},
"nbformat": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment