Skip to content

Instantly share code, notes, and snippets.

@aaronaddleman
Created October 31, 2018 16:42
Show Gist options
  • Save aaronaddleman/5ef9277582657295a469f20536614286 to your computer and use it in GitHub Desktop.
Save aaronaddleman/5ef9277582657295a469f20536614286 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"metadata":{"language_info":{"name":"python","version":"3.6.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"}},"nbformat_minor":2,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Read in Data Files\n\n* How to read in csv files\n* Files with no headers\n* Files with delimiters other than commas","metadata":{}},{"cell_type":"code","source":"import pandas as pd","metadata":{"trusted":true},"execution_count":1,"outputs":[]},{"cell_type":"code","source":"pd.read_csv('small_data.csv')","metadata":{"trusted":true},"execution_count":2,"outputs":[{"execution_count":2,"output_type":"execute_result","data":{"text/plain":" name gender country age\n0 Sophia F Bulgaria 23\n1 John M USA 24\n2 Jennifer F USA 46\n3 Edgar M England 13\n4 Patrick M Netherlands 72","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>name</th>\n <th>gender</th>\n <th>country</th>\n <th>age</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Sophia</td>\n <td>F</td>\n <td>Bulgaria</td>\n <td>23</td>\n </tr>\n <tr>\n <th>1</th>\n <td>John</td>\n <td>M</td>\n <td>USA</td>\n <td>24</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Jennifer</td>\n <td>F</td>\n <td>USA</td>\n <td>46</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Edgar</td>\n <td>M</td>\n <td>England</td>\n <td>13</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Patrick</td>\n <td>M</td>\n <td>Netherlands</td>\n <td>72</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# reading in data that has no headers... look... the headers are incorrect\npd.read_csv('adult.data', nrows=5)","metadata":{"trusted":true},"execution_count":3,"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":" 39 State-gov 77516 Bachelors 13 Never-married \\\n0 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n1 38 Private 215646 HS-grad 9 Divorced \n2 53 Private 234721 11th 7 Married-civ-spouse \n3 28 Private 338409 Bachelors 13 Married-civ-spouse \n4 37 Private 284582 Masters 14 Married-civ-spouse \n\n Adm-clerical Not-in-family White Male 2174 0 40 \\\n0 Exec-managerial Husband White Male 0 0 13 \n1 Handlers-cleaners Not-in-family White Male 0 0 40 \n2 Handlers-cleaners Husband Black Male 0 0 40 \n3 Prof-specialty Wife Black Female 0 0 40 \n4 Exec-managerial Wife White Female 0 0 40 \n\n United-States <=50K \n0 United-States <=50K \n1 United-States <=50K \n2 United-States <=50K \n3 Cuba <=50K \n4 United-States <=50K ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>39</th>\n <th>State-gov</th>\n <th>77516</th>\n <th>Bachelors</th>\n <th>13</th>\n <th>Never-married</th>\n <th>Adm-clerical</th>\n <th>Not-in-family</th>\n <th>White</th>\n <th>Male</th>\n <th>2174</th>\n <th>0</th>\n <th>40</th>\n <th>United-States</th>\n <th>&lt;=50K</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>50</td>\n <td>Self-emp-not-inc</td>\n <td>83311</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Husband</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>13</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>1</th>\n <td>38</td>\n <td>Private</td>\n <td>215646</td>\n <td>HS-grad</td>\n <td>9</td>\n <td>Divorced</td>\n <td>Handlers-cleaners</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>2</th>\n <td>53</td>\n <td>Private</td>\n <td>234721</td>\n <td>11th</td>\n <td>7</td>\n <td>Married-civ-spouse</td>\n <td>Handlers-cleaners</td>\n <td>Husband</td>\n <td>Black</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>3</th>\n <td>28</td>\n <td>Private</td>\n <td>338409</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>Cuba</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>4</th>\n <td>37</td>\n <td>Private</td>\n <td>284582</td>\n <td>Masters</td>\n <td>14</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Wife</td>\n <td>White</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# specify no headers exist\npd.read_csv('adult.data', nrows=5, header=None)","metadata":{"trusted":true},"execution_count":4,"outputs":[{"execution_count":4,"output_type":"execute_result","data":{"text/plain":" 0 1 2 3 4 5 \\\n0 39 State-gov 77516 Bachelors 13 Never-married \n1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse \n2 38 Private 215646 HS-grad 9 Divorced \n3 53 Private 234721 11th 7 Married-civ-spouse \n4 28 Private 338409 Bachelors 13 Married-civ-spouse \n\n 6 7 8 9 10 11 12 \\\n0 Adm-clerical Not-in-family White Male 2174 0 40 \n1 Exec-managerial Husband White Male 0 0 13 \n2 Handlers-cleaners Not-in-family White Male 0 0 40 \n3 Handlers-cleaners Husband Black Male 0 0 40 \n4 Prof-specialty Wife Black Female 0 0 40 \n\n 13 14 \n0 United-States <=50K \n1 United-States <=50K \n2 United-States <=50K \n3 United-States <=50K \n4 Cuba <=50K ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n <th>9</th>\n <th>10</th>\n <th>11</th>\n <th>12</th>\n <th>13</th>\n <th>14</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>39</td>\n <td>State-gov</td>\n <td>77516</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Never-married</td>\n <td>Adm-clerical</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>2174</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>1</th>\n <td>50</td>\n <td>Self-emp-not-inc</td>\n <td>83311</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Husband</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>13</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>2</th>\n <td>38</td>\n <td>Private</td>\n <td>215646</td>\n <td>HS-grad</td>\n <td>9</td>\n <td>Divorced</td>\n <td>Handlers-cleaners</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>3</th>\n <td>53</td>\n <td>Private</td>\n <td>234721</td>\n <td>11th</td>\n <td>7</td>\n <td>Married-civ-spouse</td>\n <td>Handlers-cleaners</td>\n <td>Husband</td>\n <td>Black</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>4</th>\n <td>28</td>\n <td>Private</td>\n <td>338409</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>Cuba</td>\n <td>&lt;=50K</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"col_names = [\n \"age\",\n \"type_employer\",\n \"fnlwgt\",\n \"education\",\n \"education_num\",\n \"marital\",\n \"occupation\",\n \"relationship\",\n \"race\",\n \"sex\",\n \"capital_gain\",\n \"capital_loss\",\n \"hr_per_week\",\n \"country\",\n \"income\"\n]","metadata":{"trusted":true},"execution_count":26,"outputs":[]},{"cell_type":"code","source":"pd.read_csv('adult.data', nrows=5, header=None, names=col_names)","metadata":{"trusted":true},"execution_count":27,"outputs":[{"execution_count":27,"output_type":"execute_result","data":{"text/plain":" age type_employer fnlwgt education education_num \\\n0 39 State-gov 77516 Bachelors 13 \n1 50 Self-emp-not-inc 83311 Bachelors 13 \n2 38 Private 215646 HS-grad 9 \n3 53 Private 234721 11th 7 \n4 28 Private 338409 Bachelors 13 \n\n marital occupation relationship race sex \\\n0 Never-married Adm-clerical Not-in-family White Male \n1 Married-civ-spouse Exec-managerial Husband White Male \n2 Divorced Handlers-cleaners Not-in-family White Male \n3 Married-civ-spouse Handlers-cleaners Husband Black Male \n4 Married-civ-spouse Prof-specialty Wife Black Female \n\n capital_gain capital_loss hr_per_week country income \n0 2174 0 40 United-States <=50K \n1 0 0 13 United-States <=50K \n2 0 0 40 United-States <=50K \n3 0 0 40 United-States <=50K \n4 0 0 40 Cuba <=50K ","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>age</th>\n <th>type_employer</th>\n <th>fnlwgt</th>\n <th>education</th>\n <th>education_num</th>\n <th>marital</th>\n <th>occupation</th>\n <th>relationship</th>\n <th>race</th>\n <th>sex</th>\n <th>capital_gain</th>\n <th>capital_loss</th>\n <th>hr_per_week</th>\n <th>country</th>\n <th>income</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>39</td>\n <td>State-gov</td>\n <td>77516</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Never-married</td>\n <td>Adm-clerical</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>2174</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>1</th>\n <td>50</td>\n <td>Self-emp-not-inc</td>\n <td>83311</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Exec-managerial</td>\n <td>Husband</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>13</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>2</th>\n <td>38</td>\n <td>Private</td>\n <td>215646</td>\n <td>HS-grad</td>\n <td>9</td>\n <td>Divorced</td>\n <td>Handlers-cleaners</td>\n <td>Not-in-family</td>\n <td>White</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>3</th>\n <td>53</td>\n <td>Private</td>\n <td>234721</td>\n <td>11th</td>\n <td>7</td>\n <td>Married-civ-spouse</td>\n <td>Handlers-cleaners</td>\n <td>Husband</td>\n <td>Black</td>\n <td>Male</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>United-States</td>\n <td>&lt;=50K</td>\n </tr>\n <tr>\n <th>4</th>\n <td>28</td>\n <td>Private</td>\n <td>338409</td>\n <td>Bachelors</td>\n <td>13</td>\n <td>Married-civ-spouse</td>\n <td>Prof-specialty</td>\n <td>Wife</td>\n <td>Black</td>\n <td>Female</td>\n <td>0</td>\n <td>0</td>\n <td>40</td>\n <td>Cuba</td>\n <td>&lt;=50K</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# issues: no header, delimeter is not a comma\npd.read_csv('auto-mpg.data', nrows=5, header=None)","metadata":{"trusted":true},"execution_count":29,"outputs":[{"execution_count":29,"output_type":"execute_result","data":{"text/plain":" 0\n0 18.0 8 307.0 130.0 3504. 12...\n1 15.0 8 350.0 165.0 3693. 11...\n2 18.0 8 318.0 150.0 3436. 11...\n3 16.0 8 304.0 150.0 3433. 12...\n4 17.0 8 302.0 140.0 3449. 10...","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>18.0 8 307.0 130.0 3504. 12...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>15.0 8 350.0 165.0 3693. 11...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>18.0 8 318.0 150.0 3436. 11...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>16.0 8 304.0 150.0 3433. 12...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>17.0 8 302.0 140.0 3449. 10...</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]},{"cell_type":"code","source":"# read the file, line by line\nwith open('auto-mpg.data') as f:\n for line_index, line in enumerate(f):\n print(line)\n if line_index == 5:\n break","metadata":{"trusted":true},"execution_count":30,"outputs":[{"name":"stdout","text":"18.0 8 307.0 130.0 3504. 12.0 70 1\t\"chevrolet chevelle malibu\"\n\n15.0 8 350.0 165.0 3693. 11.5 70 1\t\"buick skylark 320\"\n\n18.0 8 318.0 150.0 3436. 11.0 70 1\t\"plymouth satellite\"\n\n16.0 8 304.0 150.0 3433. 12.0 70 1\t\"amc rebel sst\"\n\n17.0 8 302.0 140.0 3449. 10.5 70 1\t\"ford torino\"\n\n15.0 8 429.0 198.0 4341. 10.0 70 1\t\"ford galaxie 500\"\n\n","output_type":"stream"}]},{"cell_type":"code","source":"pd.read_csv('auto-mpg.data', nrows=5, header=None, sep='\\s+')","metadata":{"trusted":true},"execution_count":31,"outputs":[{"execution_count":31,"output_type":"execute_result","data":{"text/plain":" 0 1 2 3 4 5 6 7 8\n0 18.0 8 307.0 130.0 3504.0 12.0 70 1 chevrolet chevelle malibu\n1 15.0 8 350.0 165.0 3693.0 11.5 70 1 buick skylark 320\n2 18.0 8 318.0 150.0 3436.0 11.0 70 1 plymouth satellite\n3 16.0 8 304.0 150.0 3433.0 12.0 70 1 amc rebel sst\n4 17.0 8 302.0 140.0 3449.0 10.5 70 1 ford torino","text/html":"<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>0</th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>18.0</td>\n <td>8</td>\n <td>307.0</td>\n <td>130.0</td>\n <td>3504.0</td>\n <td>12.0</td>\n <td>70</td>\n <td>1</td>\n <td>chevrolet chevelle malibu</td>\n </tr>\n <tr>\n <th>1</th>\n <td>15.0</td>\n <td>8</td>\n <td>350.0</td>\n <td>165.0</td>\n <td>3693.0</td>\n <td>11.5</td>\n <td>70</td>\n <td>1</td>\n <td>buick skylark 320</td>\n </tr>\n <tr>\n <th>2</th>\n <td>18.0</td>\n <td>8</td>\n <td>318.0</td>\n <td>150.0</td>\n <td>3436.0</td>\n <td>11.0</td>\n <td>70</td>\n <td>1</td>\n <td>plymouth satellite</td>\n </tr>\n <tr>\n <th>3</th>\n <td>16.0</td>\n <td>8</td>\n <td>304.0</td>\n <td>150.0</td>\n <td>3433.0</td>\n <td>12.0</td>\n <td>70</td>\n <td>1</td>\n <td>amc rebel sst</td>\n </tr>\n <tr>\n <th>4</th>\n <td>17.0</td>\n <td>8</td>\n <td>302.0</td>\n <td>140.0</td>\n <td>3449.0</td>\n <td>10.5</td>\n <td>70</td>\n <td>1</td>\n <td>ford torino</td>\n </tr>\n </tbody>\n</table>\n</div>"},"metadata":{}}]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment