Skip to content

Instantly share code, notes, and snippets.

@psychemedia
Last active October 29, 2018 18:38
Show Gist options
  • Save psychemedia/0165a0af93c1e5e2ee02533df6cd3206 to your computer and use it in GitHub Desktop.
Save psychemedia/0165a0af93c1e5e2ee02533df6cd3206 to your computer and use it in GitHub Desktop.
Simple wordcloud notebook for @cogdog
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['th@t',\n",
" 'this',\n",
" 'goodbye',\n",
" 'cogdog',\n",
" 'th@t',\n",
" 'whenever',\n",
" 'goodbye',\n",
" 'whenever',\n",
" 'th@t',\n",
" 'wherever']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Random word list generator\n",
"\n",
"from random import choices, seed\n",
"\n",
"#Fix the seed so that Restart & Run All reproduces the same word lists\n",
"seed(42)\n",
"\n",
"#Vocabulary that the random lists below are sampled from\n",
"_words = [\"hello\", \"goodbye\", \"this\", \"th@t\", 'whenever', 'wherever', 'cogdog']\n",
"\n",
"#Quick demo: sample ten words, with replacement\n",
"choices(_words, k=10)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"#Install pandas only if it cannot be imported\n",
"try:\n",
"    import pandas as pd\n",
"except ImportError:\n",
"    #Only a missing package should trigger an install; the pip magic installs into the running kernel's environment\n",
"    %pip install pandas"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>col1</th>\n",
" <th>col2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>th@t</td>\n",
" <td>hello</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>whenever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>th@t</td>\n",
" <td>goodbye</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cogdog</td>\n",
" <td>whenever</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>wherever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" col1 col2\n",
"0 th@t hello\n",
"1 whenever th@t\n",
"2 th@t goodbye\n",
"3 cogdog whenever\n",
"4 wherever th@t"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Build a small demo dataframe from two randomly sampled word lists\n",
"\n",
"import pandas as pd\n",
"\n",
"#col2 draws from a slightly larger vocabulary (adds 'gotcha')\n",
"df = pd.DataFrame(dict(\n",
"    col1=choices(_words, k=100),\n",
"    col2=choices(_words + ['gotcha'], k=100),\n",
"))\n",
"\n",
"#Persist the data as csv, leaving out the index column...\n",
"df.to_csv('mywords.csv', index=False)\n",
"\n",
"#...and preview the first few rows\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"col1,col2\r\n",
"th@t,hello\r\n",
"whenever,th@t\r\n"
]
}
],
"source": [
"#Here's what it looks like as csv (first three lines)\n",
"#Read with plain Python rather than the shell's head command, which is not available on every platform\n",
"with open('mywords.csv') as csv_file:\n",
"    for line_no in range(3):\n",
"        print(csv_file.readline(), end='')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>col1</th>\n",
" <th>col2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>th@t</td>\n",
" <td>hello</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>whenever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>th@t</td>\n",
" <td>goodbye</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cogdog</td>\n",
" <td>whenever</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>wherever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" col1 col2\n",
"0 th@t hello\n",
"1 whenever th@t\n",
"2 th@t goodbye\n",
"3 cogdog whenever\n",
"4 wherever th@t"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Round trip: read the saved csv back into a second dataframe - to show we can\n",
"df2 = pd.read_csv('mywords.csv')\n",
"\n",
"#Preview the first five rows\n",
"df2.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"#Install the wordcloud package only if it cannot be imported\n",
"try:\n",
"    import wordcloud\n",
"except ImportError:\n",
"    #Only a missing package should trigger an install; the pip magic installs into the running kernel's environment\n",
"    %pip install wordcloud"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"#Install matplotlib only if it cannot be imported\n",
"try:\n",
"    import matplotlib\n",
"except ImportError:\n",
"    #Only a missing package should trigger an install; the pip magic installs into the running kernel's environment\n",
"    %pip install matplotlib"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"#Plotting interface used to render the word cloud images\n",
"import matplotlib.pyplot as plt\n",
"#...and the magic to display figures inline in the notebook...\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"from wordcloud import WordCloud\n",
"\n",
"#Join every word in col1 into one space separated string and build the cloud from it\n",
"wordcloud = WordCloud().generate(\n",
"    ' '.join(df2['col1'].tolist())\n",
")\n",
"\n",
"#Render the cloud as an image; the trailing semicolon hides the repr of the returned object\n",
"plt.imshow(wordcloud, interpolation='bilinear');"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#Same again, this time for the words in col2\n",
"wordcloud = WordCloud().generate(\n",
"    ' '.join(df2['col2'].tolist())\n",
")\n",
"\n",
"#Render the cloud as an image; the trailing semicolon hides the repr of the returned object\n",
"plt.imshow(wordcloud, interpolation='bilinear');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have sentences, they can be split...\n",
"\n",
"It's particularly easy if the split is regular. For example:"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>col1</th>\n",
" <th>col2</th>\n",
" <th>col3a</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>th@t</td>\n",
" <td>hello</td>\n",
" <td>@th@t/hello</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>whenever</td>\n",
" <td>th@t</td>\n",
" <td>@whenever/th@t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>th@t</td>\n",
" <td>goodbye</td>\n",
" <td>@th@t/goodbye</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cogdog</td>\n",
" <td>whenever</td>\n",
" <td>@cogdog/whenever</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>wherever</td>\n",
" <td>th@t</td>\n",
" <td>@wherever/th@t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" col1 col2 col3a\n",
"0 th@t hello @th@t/hello\n",
"1 whenever th@t @whenever/th@t\n",
"2 th@t goodbye @th@t/goodbye\n",
"3 cogdog whenever @cogdog/whenever\n",
"4 wherever th@t @wherever/th@t"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Make a column with a regular structure to split: '@' + col1 + '/' + col2\n",
"df['col3a'] = '@' + df['col1'] + '/' + df['col2']\n",
"\n",
"#Preview the result\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>col1</th>\n",
" <th>col2</th>\n",
" <th>col3a</th>\n",
" <th>col3b</th>\n",
" <th>col3c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>th@t</td>\n",
" <td>hello</td>\n",
" <td>@th@t/hello</td>\n",
" <td>@th@t</td>\n",
" <td>hello</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>whenever</td>\n",
" <td>th@t</td>\n",
" <td>@whenever/th@t</td>\n",
" <td>@whenever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>th@t</td>\n",
" <td>goodbye</td>\n",
" <td>@th@t/goodbye</td>\n",
" <td>@th@t</td>\n",
" <td>goodbye</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cogdog</td>\n",
" <td>whenever</td>\n",
" <td>@cogdog/whenever</td>\n",
" <td>@cogdog</td>\n",
" <td>whenever</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>wherever</td>\n",
" <td>th@t</td>\n",
" <td>@wherever/th@t</td>\n",
" <td>@wherever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" col1 col2 col3a col3b col3c\n",
"0 th@t hello @th@t/hello @th@t hello\n",
"1 whenever th@t @whenever/th@t @whenever th@t\n",
"2 th@t goodbye @th@t/goodbye @th@t goodbye\n",
"3 cogdog whenever @cogdog/whenever @cogdog whenever\n",
"4 wherever th@t @wherever/th@t @wherever th@t"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#We can split a string in a column and then expand it over a couple of columns\n",
"#n=1 splits on the first '/' only, so exactly two columns come back\n",
"#n is passed by keyword because pandas 2.x no longer accepts it positionally\n",
"df[['col3b','col3c']] = df['col3a'].str.split('/', n=1, expand=True)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Want username without the `@`?"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>col1</th>\n",
" <th>col2</th>\n",
" <th>col3a</th>\n",
" <th>col3b</th>\n",
" <th>col3c</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>th@t</td>\n",
" <td>hello</td>\n",
" <td>@th@t/hello</td>\n",
" <td>th@t</td>\n",
" <td>hello</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>whenever</td>\n",
" <td>th@t</td>\n",
" <td>@whenever/th@t</td>\n",
" <td>whenever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>th@t</td>\n",
" <td>goodbye</td>\n",
" <td>@th@t/goodbye</td>\n",
" <td>th@t</td>\n",
" <td>goodbye</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cogdog</td>\n",
" <td>whenever</td>\n",
" <td>@cogdog/whenever</td>\n",
" <td>cogdog</td>\n",
" <td>whenever</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>wherever</td>\n",
" <td>th@t</td>\n",
" <td>@wherever/th@t</td>\n",
" <td>wherever</td>\n",
" <td>th@t</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" col1 col2 col3a col3b col3c\n",
"0 th@t hello @th@t/hello th@t hello\n",
"1 whenever th@t @whenever/th@t whenever th@t\n",
"2 th@t goodbye @th@t/goodbye th@t goodbye\n",
"3 cogdog whenever @cogdog/whenever cogdog whenever\n",
"4 wherever th@t @wherever/th@t wherever th@t"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#We could do a trivial replace, but we can also use a regex to be more precise\n",
"#https://pandas.pydata.org/docs/reference/api/pandas.Series.str.replace.html\n",
"#'^@' only matches an @ at the very start, so the @ inside 'th@t' is kept\n",
"#regex=True is required: pandas 2.x treats the pattern as a literal string by default\n",
"df['col3b'] = df['col3b'].str.replace(r'^@', '', regex=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment