Created
January 7, 2017 03:57
-
-
Save datarocks/218bc0dc86e6733627a84fc1a8ab970f to your computer and use it in GitHub Desktop.
a data munging script using tablib
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tablib | |
from copy import deepcopy | |
# Reshape an Instagram export from "one row per post" into "one row per
# (post, hashtag)" and save the result as an Excel workbook.
#
# tablib (http://docs.python-tablib.org/en/latest/) is a handy library for
# small tabular jobs like this one. For serious data analysis, reach for
# pandas instead: http://pandas.pydata.org/

# Read the CSV text inside a context manager so the file handle is closed
# deterministically instead of being left open for the garbage collector.
with open('picodash_instagram_nra_blog_2016-12-14.csv') as source_file:
    data = tablib.Dataset().load(source_file.read())

# Snapshot the source column names into a new list. `data.headers` may be the
# dataset's own list object, so appending to it directly could leave `data`
# with more headers than it has columns.
source_headers = list(data.headers)

# The output keeps every source column and adds one trailing "hashtag" column.
stacked_data = tablib.Dataset(headers=source_headers + [u'hashtag'])

# Emit one output row per comma-separated entry in the "Hashtags" column.
for row in data.dict:
    # Build the shared values explicitly in header order so the output column
    # order never depends on the iteration order of the row mapping.
    base_values = tuple(row[name] for name in source_headers)
    # NOTE(review): each hashtag is stored exactly as split, including any
    # surrounding whitespace; a post with an empty "Hashtags" cell still
    # yields one row with an empty hashtag. Confirm that is the intent.
    for hashtag in row['Hashtags'].split(','):
        stacked_data.append(base_values + (hashtag,))

# Export the stacked dataset as a new Excel document (binary, hence 'wb').
with open('hashtag_data.xlsx', 'wb') as f:
    f.write(stacked_data.xlsx)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment