Created
July 18, 2017 07:06
-
-
Save TomHortons/d766738d4ce4bd564a96bbdd5529bfaa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import os | |
import gc | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
%matplotlib inline | |
pal = sns.color_palette() | |
import plotly.offline as py | |
py.init_notebook_mode(connected=True) | |
import plotly.graph_objs as go | |
import plotly.tools as tls | |
# Print the size, in MB (bytes / 1,000,000), of each entry directly under
# ../input. A plain file is reported on its own; a directory is reported as
# the sum of its immediate children together with the number of children.
print('# File sizes')
for f in os.listdir('../input'):
    entry_path = os.path.join('../input', f)
    if not os.path.isdir(entry_path):
        print(f.ljust(30) + str(round(os.path.getsize(entry_path) / 1000000, 2)) + 'MB')
    else:
        # Only the first level is summed; nested subdirectories are not walked.
        sizes = [os.path.getsize(os.path.join(entry_path, x)) / 1000000
                 for x in os.listdir(entry_path)]
        print(f.ljust(30) + str(round(sum(sizes), 2)) + 'MB' + ' ({} files)'.format(len(sizes)))
# Load the training table and split each row's space-separated 'tags'
# string into a list of individual labels.
df_train = pd.read_csv('../input/train_v2.csv')
labels = df_train['tags'].apply(lambda x: x.split(' '))

from collections import Counter, defaultdict

# Tally how many rows carry each label. Counter is a dict subclass, so
# counts.keys() / counts.values() work as before and keys stay in
# first-seen order.
counts = Counter(tag for row_tags in labels for tag in row_tags)
# Bar chart of how often each label occurs in the training set.
data = [go.Bar(x=list(counts.keys()), y=list(counts.values()))]
layout = dict(height=800, width=800, title='Distribution of training labels')
fig = dict(data=data, layout=layout)
# Plot the whole figure rather than the bare traces; passing only `data`
# would discard the size and title configured in `layout`.
py.iplot(fig, filename='train-label-dist')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment