Skip to content

Instantly share code, notes, and snippets.

@linwoodc3
Created April 18, 2017 03:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save linwoodc3/f407377dc0b1ed8f2db163d132c2b386 to your computer and use it in GitHub Desktop.
Save linwoodc3/f407377dc0b1ed8f2db163d132c2b386 to your computer and use it in GitHub Desktop.
matplotlib plotting functions for my District Data Labs Twitter post.
# Author: Linwood Creekmore
# email: valinvescap@gmail.com
# date: 17 April 2017
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.patches as patches
import datetime
import pandas as pd
import numpy as np
def countplot(geodataframe, data, colorlist):
    '''Plot the most-tweeted languages as a bar chart beside a count table.

    The left panel is a bar chart of every entry in `data` that accounts
    for more than 2% of the total, sorted from largest to smallest, with
    the tallest bar outlined in red. The right panel is a table pairing
    the values of `data`, in the order given, with a fixed list of 15
    language names.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        geopandas dataframe with original data. It is not read by this
        function; the parameter is kept so existing callers keep working.
    data : pandas Series
        pandas Series with counts of languages.
        NOTE(review): the x tick labels (7 names) and the table row
        labels (15 names) are hard-coded, so this presumably expects the
        15-entry Series from the original analysis, ordered by
        descending count -- confirm with the caller.
    colorlist : list of str
        Hex colour codes without the leading '#' (e.g. '014d64'); they
        are passed to the bar plot as its colours.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    '''
    # Create a bare figure and let the GridSpec supply the axes.  Calling
    # plt.subplots() first would leave two empty axes underneath on
    # matplotlib versions that no longer delete overlapping axes.
    plt.figure(figsize=(22, 15))
    gs = gridspec.GridSpec(2, 2, height_ratios=[3, 1], width_ratios=[3, 1])

    # ---- left panel: bar chart ----
    ax1 = plt.subplot(gs[0])
    colors = ["#{0}".format(code) for code in colorlist]

    # keep only languages above 2% of all tweets, largest first
    bars = data[data > (data.sum() * .02)].sort_values(ascending=False)
    n_rest = max(len(bars) - 1, 0)
    # Outline only the first (tallest) bar.  The lists are sized to the
    # number of bars because matplotlib cycles shorter lists, which would
    # highlight a second bar.
    bars.plot(kind='bar',
              ax=ax1,
              linewidth=[2.5] + [0] * n_rest,
              edgecolor=['red'] + ['#EDEDED'] * n_rest,
              color=colors,
              stacked=True, figsize=(20, 17))

    # horizontal grid lines only
    ax1.grid(False)
    ax1.yaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)
    ax1.set_xticklabels(['English', "Indonesian", 'Japanese', "Spanish",
                         'Turkish', 'Portuguese', 'Tagalog (Filipino)'],
                        rotation=45)
    for tick_label in (list(ax1.yaxis.get_ticklabels())
                       + list(ax1.xaxis.get_ticklabels())):
        tick_label.set_fontsize(20)
    ax1.set_xlabel('Language of Tweet', fontsize=24)
    ax1.set_ylabel('Count', fontsize=24)
    ax1.set_axisbelow(True)

    # The annotation text is passed positionally: the keyword was renamed
    # from ``s`` to ``text`` in matplotlib, so only the positional form
    # works on both old and new releases.
    ax1.annotate(
        ('Twitter is a US-based company so it\ncomes as no surprise that'
         ' the English\nlanguage dominates our sample of\n~600,000 tweets.'
         ),
        xy=(0.3, 185000), xytext=(2.3, 119000), fontsize=22, style='italic',
        bbox={'facecolor': '#6794a7', 'alpha': 0.5, 'pad': 10},
        arrowprops=dict(facecolor='black', shrink=0.05),
        multialignment='left')
    plt.suptitle('Top Tweeted Languages',
                 fontsize=47, fontweight='bold')

    # ---- right panel: table of counts ----
    ax2 = plt.subplot(gs[1])
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)
    cell_values = data.values[:, np.newaxis]  # one column, one row per count
    row_labels = ["English", 'Indonesian', 'Japanese', 'Spanish', 'Turkish',
                  'Portuguese', 'Tagalog', 'Thai', 'Russian', 'French',
                  'Italian', 'German', 'Estonian', 'Arabic', 'Dutch']
    the_table = ax2.table(cellText=cell_values,
                          rowLabels=row_labels,
                          colWidths=[0.25, 0.25],
                          rowColours=["#d0daec"] * len(row_labels),
                          colColours=['#eceff6'],
                          cellColours=[['#eceff6'] for _ in row_labels],
                          loc='center')
    ax2.axis('tight')
    the_table.set_fontsize(20)
    the_table.scale(2.5, 3.9)
    ax2.set_axis_off()
    for cell in the_table.get_celld().values():
        cell.set_linewidth(0.2)
    plt.show()
def countryplot(geodataframe, data, colorlist):
    '''Plot tweet counts per country as horizontal bars beside a table.

    The left panel is a horizontal bar chart of every entry in `data`,
    smallest at the bottom, with each bar labelled with its value. The
    right panel is a table of the 14 largest values of `data`.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        geopandas dataframe with original data. Rows are counted per
        value of its ``NAME`` column (or ``name`` if ``NAME`` is absent)
        and the most frequent names become the table's row labels.
    data : pandas Series
        pandas Series with the counts to plot.
        NOTE(review): the table takes its values from `data` but its
        labels from `geodataframe`, so the two line up only if both rank
        the countries identically -- confirm with the caller.
    colorlist : list
        Not read by this function (the bars are coloured with the
        'RdBu_r' colormap); kept so existing callers keep working.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    '''
    # Create a bare figure and let the GridSpec supply the axes.  Calling
    # plt.subplots() first would leave two empty axes underneath on
    # matplotlib versions that no longer delete overlapping axes.
    plt.figure(figsize=(22, 17), frameon=True)
    gs = gridspec.GridSpec(2, 2, height_ratios=[3, 1], width_ratios=[3, 1])

    # ---- left panel: horizontal bar chart ----
    ax1 = plt.subplot(gs[0])

    # count rows per country; the column may be upper- or lower-case
    try:
        countrycount = geodataframe.groupby(['NAME'])['NAME'].count()
    except KeyError:
        countrycount = geodataframe.groupby(['name'])['name'].count()

    plotted = data.sort_values(ascending=True)
    plotted.plot(
        kind='barh',
        ax=ax1,
        colormap='RdBu_r', figsize=(20, 17))

    # vertical grid lines only
    ax1.yaxis.label.set_visible(False)
    ax1.grid(False)
    ax1.xaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)

    # changing y and x tick label size
    for tick_label in (list(ax1.yaxis.get_ticklabels())
                       + list(ax1.xaxis.get_ticklabels())):
        tick_label.set_fontsize(24)

    # overarching title
    plt.suptitle('Top Tweeting Countries',
                 fontsize=38, fontweight='bold')

    # adding text annotation
    ax1.text(x=38000, y=2.6,
             fontsize=22,
             s=('Surprisingly, the United States is not the\ngreatest user'
                ' of Twitter in our dataset. This\ncould be for a number of reasons,\n'
                'especially given my unscientific retreival\nof the data.'
                ' We do however, see some\nconsistency in the data. Seven of the top ten\n'
                'Twitter-using countries are in the '
                'top 10\nof my unscientifically collected dataset.'),
             bbox={'facecolor': '#6794a7', 'alpha': 0.5, 'pad': 18},
             multialignment='left')
    ax1.text(37000, 0.98,
             'Comparison Source:\nNumber of active Twitter users in leading markets as of May 2016 \nhttps://www.statista.com/',
             style='italic',
             bbox={'facecolor': 'whitesmoke', 'alpha': 0.5, 'pad': 10},
             fontsize=14)

    # Write each bar's value inside its right-hand end.  The values are
    # read from the same sorted Series that was plotted so that label i
    # always belongs to bar i, however many bars there are.
    for i, v in enumerate(plotted.values):
        inset = 8200 if v > 10000 else 5700  # longer numbers need more room
        ax1.text(v - inset, i - .13, str(v), color='white',
                 fontweight='bold', fontsize=18)

    # ---- right panel: table of the largest counts ----
    ax2 = plt.subplot(gs[1])
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)
    datain = data.sort_values(ascending=False)[:14].values[:, np.newaxis]
    n_rows = len(datain)  # fewer than 14 when `data` is short
    rowLabels = countrycount.sort_values(ascending=False)[:n_rows].index.values
    the_table = ax2.table(cellText=datain,
                          rowLabels=rowLabels,
                          colWidths=[0.10, 0.10],
                          rowColours=["#d0daec"] * n_rows,
                          colColours=['#eceff6'],
                          cellColours=[['#eceff6'] for _ in range(n_rows)],
                          loc='center')
    ax2.axis('tight')
    the_table.set_fontsize(23)
    the_table.scale(3, 3.2)
    ax2.set_axis_off()
    for cell in the_table.get_celld().values():
        cell.set_linewidth(0.2)
    plt.show()
def hourplot(geodataframe, country1='United States', country2='Indonesia'):
    '''Compare two countries' tweet counts by local hour of the day.

    Tweets whose ``NAME`` equals `country1` or `country2` are counted
    per hour of their ``normtime`` value, and the two sets of counts are
    drawn as overlaid bars, one bar per hour from 0 to 23. Hours in
    which a country has no tweets are drawn with a count of zero.

    The title, highlight rectangle and annotations are hard-coded for
    the default United States / Indonesia comparison.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        geopandas dataframe with original data. It must have a ``NAME``
        column and a ``normtime`` column whose values expose an ``hour``
        attribute.
    country1 : str, optional
        ``NAME`` value for the first series, drawn in front.
    country2 : str, optional
        ``NAME`` value for the second series, drawn behind.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    '''
    # hour of every tweet, computed once and used to group both countries
    hour_of_day = geodataframe.normtime.apply(lambda stamp: stamp.hour)
    all_hours = np.arange(24)

    # Reindex to the full 0-23 range: bars are positioned by rank, so a
    # missing hour would otherwise shift every later bar and misalign the
    # two countries.
    us_count = (geodataframe.normtime[geodataframe.NAME == country1]
                .groupby(hour_of_day).size()
                .reindex(all_hours, fill_value=0))
    indo_count = (geodataframe.normtime[geodataframe.NAME == country2]
                  .groupby(hour_of_day).size()
                  .reindex(all_hours, fill_value=0))

    f, ax = plt.subplots(figsize=(20, 12))
    ax.set_xticks(np.arange(24), minor=True)

    # country1 gets the higher zorder so it is drawn in front of country2
    us_count.plot(kind='bar', width=0.8, ax=ax, color='#01a2d9',
                  label=country1, alpha=1, zorder=10)
    ax.bar(np.arange(indo_count.index.values.shape[0]),
           indo_count.values, color='#014d64',
           width=0.8, label=country2)
    ax.set_xlabel('Hour of the Day (Country Local Time)', fontsize=22)

    # Size the tick labels after plotting so the ticks created by the bar
    # plot are included.
    for tick_label in (list(ax.yaxis.get_ticklabels())
                       + list(ax.xaxis.get_ticklabels())):
        tick_label.set_fontsize(20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)

    # horizontal grid lines only
    ax.grid(False)
    ax.yaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)

    # highlight rectangle
    ax.add_patch(
        patches.Rectangle(
            (9.8, 0),   # (x,y)
            9,          # width
            11300,      # height
            fill=True, color="#ff6d6d",
            alpha=0.3, zorder=-1))

    # The annotation text is passed positionally: the keyword was renamed
    # from ``s`` to ``text`` in matplotlib, so only the positional form
    # works on both old and new releases.
    ax.annotate(("Huge gap in U.S. data at local peak\ntime"
                 " for Twitter usage (1100-1300 Local)\n"
                 "just when Indonesia Twitter usage peaks. \n"
                 "This explains our anomaly."),
                xy=(11.5, 10000), xytext=(1, 9500),
                arrowprops=dict(facecolor='black', shrink=0.05),
                bbox={'facecolor': '#6794a7', 'alpha': .8, 'pad': 10},
                fontsize=20, style='italic')
    ax.text(1, 8380, ('Comparison Source:\nThe Biggest Social Media Science Study: What'
                      ' 4.8 Million Tweets \nSay About the Best Time to Tweet\nhttps://blog'
                      '.bufferapp.com/best-time-to-tweet-research'), style='italic',
            bbox={'facecolor': 'whitesmoke', 'alpha': 0.5, 'pad': 10}, fontsize=10)

    plt.suptitle('United States Data has Gap During Peak Usage Time (in Local Time Hours)',
                 fontsize=25, fontweight='bold')
    # One legend call carrying both settings; a second call would replace
    # the legend and drop the font size.
    ax.legend(loc='best', fontsize=22)
    plt.show()
def waterplot(geodataframe, data, colorlist):
    '''Plot tweet counts per body of water as horizontal bars beside a table.

    The left panel is a horizontal bar chart of every entry in `data`,
    smallest at the bottom, with each bar labelled with its value. The
    right panel is a table of the 10 largest values of `data`.

    Parameters
    ----------
    geodataframe : geopandas GeoDataFrame
        geopandas dataframe with original data. Rows are counted per
        value of its ``NAME`` column (or ``name`` if ``NAME`` is absent)
        and the most frequent names become the table's row labels.
    data : pandas Series
        pandas Series with the counts to plot.
        NOTE(review): the table takes its values from `data` but its
        labels from `geodataframe`, so the two line up only if both rank
        the names identically -- confirm with the caller.
    colorlist : list
        Not read by this function (the bars are coloured with the
        'RdBu_r' colormap); kept so existing callers keep working.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    '''
    # Create a bare figure and let the GridSpec supply the axes.  Calling
    # plt.subplots() first would leave two empty axes underneath on
    # matplotlib versions that no longer delete overlapping axes.
    plt.figure(figsize=(22, 17), frameon=True)
    gs = gridspec.GridSpec(2, 2, height_ratios=[6, 1], width_ratios=[5, 1])

    # ---- left panel: horizontal bar chart ----
    ax1 = plt.subplot(gs[0])

    # count rows per name; the column may be upper- or lower-case
    try:
        namecount = geodataframe.groupby(['NAME'])['NAME'].count()
    except KeyError:
        namecount = geodataframe.groupby(['name'])['name'].count()

    plotted = data.sort_values(ascending=True)
    plotted.plot(
        kind='barh',
        ax=ax1,
        colormap='RdBu_r', figsize=(20, 15))

    # vertical grid lines only
    ax1.yaxis.label.set_visible(False)
    ax1.grid(False)
    ax1.xaxis.grid(True, color='w', ls='-', lw=1.5, zorder=0)

    # changing y and x tick label size
    for tick_label in ax1.yaxis.get_ticklabels():
        tick_label.set_fontsize(18)
    for tick_label in ax1.xaxis.get_ticklabels():
        tick_label.set_fontsize(22)

    # overarching title
    plt.suptitle('Top 10 Bodies of Water by Count of Tweets',
                 fontsize=38, fontweight='bold')

    # adding text annotation
    ax1.text(x=440, y=1.6,
             fontsize=22,
             s=('A good number of tweets occur from bodies\n'
                'of water. The North Atlantic and Pacific\n'
                'Oceans likley dominate because they hold\n'
                'major trade/travel routes.'),
             bbox={'facecolor': '#6794a7', 'alpha': 0.5, 'pad': 18},
             multialignment='left')

    # Write each bar's value inside its right-hand end.  The values are
    # read from the same sorted Series that was plotted so that label i
    # always belongs to bar i, however many bars there are.
    for i, v in enumerate(plotted.values):
        label_size = 18 if v > 10000 else 24  # shrink very long numbers
        ax1.text(v - 70, i - .13, str(v), color='white',
                 fontweight='bold', fontsize=label_size)

    # ---- right panel: table of the largest counts ----
    ax2 = plt.subplot(gs[1])
    ax2.xaxis.set_visible(False)
    ax2.yaxis.set_visible(False)
    datain = data.sort_values(ascending=False)[:10].values[:, np.newaxis]
    n_rows = len(datain)  # fewer than 10 when `data` is short
    rowLabels = namecount.sort_values(ascending=False)[:n_rows].index.values
    the_table = ax2.table(cellText=datain,
                          rowLabels=rowLabels,
                          colWidths=[0.10, 0.20],
                          rowColours=["#d0daec"] * n_rows,
                          colColours=['#eceff6'],
                          cellColours=[['#eceff6'] for _ in range(n_rows)],
                          loc='center')
    the_table.set_fontsize(20)
    the_table.scale(1.8, 4.3)
    ax2.set_axis_off()
    for cell in the_table.get_celld().values():
        cell.set_linewidth(0.2)
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment