Skip to content

Instantly share code, notes, and snippets.

@rejsmont
Last active February 12, 2021 19:12
Show Gist options
  • Save rejsmont/ae0269b541ca768c9b869e7fda325d39 to your computer and use it in GitHub Desktop.
Save rejsmont/ae0269b541ca768c9b869e7fda325d39 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DC Comics superheroines: 166\n",
"Marvel Comics superheroines: 170\n"
]
}
],
"source": [
"import requests\n",
"from bs4 import BeautifulSoup\n",
"\n",
"# Count the entries on Wikipedia's list of superheroines that mention each publisher.\n",
"url = 'https://en.wikipedia.org/wiki/List_of_superheroines'\n",
"with requests.get(url, timeout=30) as r:\n",
"    # Fail loudly on an HTTP error instead of parsing an error page into zero counts\n",
"    r.raise_for_status()\n",
"    soup = BeautifulSoup(r.content, 'lxml')\n",
"\n",
"# <li> children of every <ul> that immediately follows an <h2> heading\n",
"ulist = soup.select('h2 + ul > li')\n",
"\n",
"# The substring test runs on str(x), i.e. the whole item, not only a publisher field\n",
"print('DC Comics superheroines:',\n",
"      sum('DC' in str(x) for x in ulist))\n",
"print('Marvel Comics superheroines:',\n",
"      sum('Marvel' in str(x) for x in ulist))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Marvel superheroes: 684 including 180 female (26.32%)\n",
"DC superheroes: 622 including 144 female (23.15%)\n"
]
}
],
"source": [
"import requests\n",
"\n",
"API_URL = 'https://en.wikipedia.org/w/api.php'\n",
"template = {'action': \"query\", 'list': 'categorymembers', 'cmlimit': '200', 'format': \"json\"}\n",
"categories = ['Marvel Comics superheroes', 'DC Comics superheroes',\n",
"              'Marvel Comics female superheroes', 'DC Comics female superheroes']\n",
"tags = ['Marvel', 'DC', 'Marvel female', 'DC female']\n",
"\n",
"params = [dict({'cmtitle': 'Category: ' + c}, **template) for c in categories]\n",
"\n",
"\n",
"def fetch_category_titles(query):\n",
"    \"\"\"Return the sorted, de-duplicated titles listed by a categorymembers query.\n",
"\n",
"    Follows the API's `cmcontinue` tokens until the listing is exhausted,\n",
"    cuts each title at the first ' (' and drops entries that start with\n",
"    'Category:'.\n",
"\n",
"    query: dict of request parameters; it is copied, not modified.\n",
"    Raises requests.HTTPError when a request fails.\n",
"    \"\"\"\n",
"    query = dict(query)  # keep the continuation token out of the caller's dict\n",
"    titles = set()\n",
"    while True:\n",
"        r = requests.get(API_URL, params=query, timeout=30)\n",
"        # Abort on an HTTP error: silently skipping a failed response would\n",
"        # repeat the same request forever.\n",
"        r.raise_for_status()\n",
"        data = r.json()\n",
"        for member in data['query']['categorymembers']:\n",
"            name = member['title'].split(' (')[0]\n",
"            if not name.startswith('Category:'):\n",
"                titles.add(name)\n",
"        if 'continue' not in data:\n",
"            return sorted(titles)\n",
"        query['cmcontinue'] = data['continue']['cmcontinue']\n",
"\n",
"\n",
"results = {tag: fetch_category_titles(p) for tag, p in zip(tags, params)}\n",
"\n",
"# Keep only the female characters that are also present in the universe-wide list\n",
"for universe in ('Marvel', 'DC'):\n",
"    known = set(results[universe])\n",
"    results[universe + ' female'] = [superheroin for superheroin in results[universe + ' female']\n",
"                                     if superheroin in known]\n",
"\n",
"ml_total = len(results['Marvel'])\n",
"ml_female = len(results['Marvel female'])\n",
"ml_percent = 100 * ml_female / ml_total\n",
"dc_total = len(results['DC'])\n",
"dc_female = len(results['DC female'])\n",
"dc_percent = 100 * dc_female / dc_total\n",
"\n",
"print(f'Marvel superheroes: {ml_total} including {ml_female} female ({ml_percent:.2f}%)')\n",
"print(f'DC superheroes: {dc_total} including {dc_female} female ({dc_percent:.2f}%)')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Significant: False, p-value:0.09457\n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"import matplotlib.ticker as mticker\n",
"import pandas as pd\n",
"from scipy.stats import mannwhitneyu\n",
"\n",
"# Decide gender per universe: checking a merged Marvel+DC name list would mark a\n",
"# character as female whenever a same-named female exists in the other universe.\n",
"universes = ['Marvel', 'DC']\n",
"female_by_universe = {u: set(results[u + ' female']) for u in universes}\n",
"\n",
"superheroes = pd.DataFrame(\n",
"    [{'Name': name,\n",
"      'Universe': u,\n",
"      'Gender': 'Female' if name in female_by_universe[u] else 'Male'}\n",
"     for u in universes for name in results[u]],\n",
"    columns=['Name', 'Universe', 'Gender']).sort_values('Name')\n",
"\n",
"# One row per (Universe, Gender) with the head count in a 'size' column\n",
"gender_counts = superheroes.groupby(['Universe', 'Gender']).size().reset_index(name='size')\n",
"# Negate the female counts so their bars extend to the left of zero\n",
"gender_counts.loc[gender_counts['Gender'] == 'Female', 'size'] *= -1\n",
"\n",
"ax = sns.barplot(x='size', y='Universe', hue='Gender', data=gender_counts, dodge=False, orient='h')\n",
"limit = gender_counts['size'].abs().max() * 1.1\n",
"ax.set_xlim(-limit, limit)\n",
"# Pin the tick positions first, then relabel them with absolute values so the\n",
"# mirrored (negative) side still reads as a head count\n",
"ticks_loc = ax.get_xticks().tolist()\n",
"ax.xaxis.set_major_locator(mticker.FixedLocator(ticks_loc))\n",
"ax.set_xticklabels([str(abs(int(t))) for t in ticks_loc])\n",
"ax.set_ylabel('Universe')\n",
"ax.set_xlabel('Number of superheroes by gender')\n",
"\n",
"# Feed the rank test a 0/1 encoding rather than string labels, and state the\n",
"# alternative explicitly because its default differs between SciPy releases.\n",
"is_female = (superheroes['Gender'] == 'Female').astype(int)\n",
"stats = mannwhitneyu(is_female[superheroes['Universe'] == 'Marvel'],\n",
"                     is_female[superheroes['Universe'] == 'DC'],\n",
"                     alternative='two-sided')\n",
"print(f'Significant: {stats.pvalue < 0.05}, p-value:{stats.pvalue:0.5f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment