# Source: GitHub gists by Aaron Lee (sciencelee).
# The gist page chrome captured with the code ("Skip to content", avatar and
# profile links, loading/error banners) is not part of the script and is
# summarised here as a comment so the file stays valid Python.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read vgsales.csv and, in the same expression, keep only the rows whose
# Year value is below 2008.
df = pd.read_csv("vgsales.csv").loc[lambda frame: frame['Year'] < 2008]
# Peek at the first three remaining rows.
df.head(3)
# Lift the cap on how many columns pandas prints when displaying a DataFrame
# (None means no limit).
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the full sales table from vgsales.csv.
df = pd.read_csv(filepath_or_buffer="vgsales.csv")
# The goal, sketched as SQL-like pseudo-code:
'''
SELECT Year, SUM Global_Sales
FROM df
GROUP BY Year
'''
# The pandas counterpart of the GROUP BY step. On its own, groupby() hands
# back a DataFrameGroupBy object rather than a table of results.
df.groupby(by=['Year'])
'''
SELECT SUM Global_Sales
FROM df
GROUP BY Publisher
'''
# Group the rows on Publisher, pick out the Global_Sales column, and total it
# within each group. The result is a Series.
(
    df
    .groupby(by=['Publisher'])['Global_Sales']
    .sum()
)
# The per-publisher totals can be plotted straight from the Series.
# Horizontal bars are used so the chart is a little easier to read.
df.groupby(['Publisher'])['Global_Sales'].sum().plot.barh()
# Plotting every publisher gives a messy chart with hundreds of bars.
# Either iloc or head() can cap how many entries are drawn; here only the
# first ten entries of the grouped result are kept before plotting.
df.groupby(['Publisher'])['Global_Sales'].sum().iloc[:10].plot(kind='barh')
# Flatten the grouped totals back into an ordinary DataFrame with Publisher
# and Global_Sales columns.
df_flat = (
    df.groupby(['Publisher'])['Global_Sales']
    .sum()
    .reset_index()
)
# Order by Global_Sales (largest first), keep the first ten rows, and draw
# them as horizontal bars labelled by Publisher.
(
    df_flat
    .sort_values(by=['Global_Sales'], ascending=False)
    .head(10)
    .plot.barh(x='Publisher', y='Global_Sales')
)
# SQL Query might look like...
'''
SELECT Year, SUM Global_Sales
FROM df_me
GROUP BY Year, Platform
'''
# Plot global sales by platform by year. Sounds tricky, but we can handle it.
platforms = ['NES', 'PS', '2600', 'PS2', 'GBA']
# Restrict the data to the platforms listed above. df_me was used below
# without ever being assigned in this snippet (a NameError when run), and
# `platforms` was otherwise unused, so this filter is presumably what was
# intended -- NOTE(review): confirm against the original notebook.
df_me = df.loc[df['Platform'].isin(platforms)]
# Total Global_Sales for each (Year, Platform) pair, move Platform from the
# index into columns with unstack(), and plot the result: one line per
# platform, with Year along the x-axis.
df_me.groupby(['Year', 'Platform'])['Global_Sales'].sum().unstack().plot()
import pandas as pd
# Load kc_house_data.csv into df.
df = pd.read_csv(filepath_or_buffer='kc_house_data.csv')