Skip to content

Instantly share code, notes, and snippets.

@TomHortons
Created July 18, 2017 07:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TomHortons/d766738d4ce4bd564a96bbdd5529bfaa to your computer and use it in GitHub Desktop.
Save TomHortons/d766738d4ce4bd564a96bbdd5529bfaa to your computer and use it in GitHub Desktop.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
# This line is not plain-Python syntax; the script is meant to run as notebook cells.
%matplotlib inline
# Seaborn's current color palette; not referenced in the visible part of this script.
pal = sns.color_palette()
import plotly.offline as py
# Set up plotly's offline mode so figures can be displayed in the notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook -- confirm against the plotly documentation.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
# Report the size of every entry directly under ../input.
# Plain files are listed individually; a directory is listed as the sum of the
# sizes of its direct children (not recursive) together with the child count.
# Sizes are in decimal megabytes (bytes / 1,000,000), rounded to 2 places.
print('# File sizes')
for f in os.listdir('../input'):
    path = os.path.join('../input', f)
    if not os.path.isdir(path):
        print(f.ljust(30) + str(round(os.path.getsize(path) / 1000000, 2)) + 'MB')
    else:
        sizes = [os.path.getsize(os.path.join(path, x)) / 1000000 for x in os.listdir(path)]
        print(f.ljust(30) + str(round(sum(sizes), 2)) + 'MB' + ' ({} files)'.format(len(sizes)))
from collections import Counter, defaultdict

# Load the training table and split each row's space-separated 'tags' string
# into a list of individual labels.
df_train = pd.read_csv('../input/train_v2.csv')
labels = df_train['tags'].apply(lambda x: x.split(' '))

# Tally how many rows carry each label. Counter is a dict subclass, keeps
# first-seen key order, and returns 0 for labels that never occur.
counts = Counter(tag for tags in labels for tag in tags)

# One bar per label, bar height = number of occurrences.
data = [go.Bar(x=list(counts.keys()), y=list(counts.values()))]
layout = dict(height=800, width=800, title='Distribution of training labels')
fig = dict(data=data, layout=layout)
# Plot the full figure rather than the bare trace list; passing only `data`
# would drop the layout, so the title and the 800x800 size would be ignored.
py.iplot(fig, filename='train-label-dist')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment