Skip to content

Instantly share code, notes, and snippets.

@PiotrZakrzewski
Created November 21, 2020 14:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PiotrZakrzewski/39b563a6e2878d85d2bcf1f148849679 to your computer and use it in GitHub Desktop.
Save PiotrZakrzewski/39b563a6e2878d85d2bcf1f148849679 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LICENSE \u001b[1m\u001b[36mmacrobase.egg-info\u001b[m\u001b[m/ setup.py\r\n",
"README.md \u001b[1m\u001b[36mmb_lite\u001b[m\u001b[m/ test.csv\r\n",
"Showcase.ipynb movies.csv tmdb_5000_credits.csv\r\n",
"\u001b[1m\u001b[36mbuild\u001b[m\u001b[m/ movies_etl.ipynb tmdb_5000_movies.csv\r\n",
"\u001b[1m\u001b[36mdist\u001b[m\u001b[m/ requirements-dev.txt\r\n",
"\u001b[1m\u001b[36menv\u001b[m\u001b[m/ requirements.txt\r\n"
]
}
],
"source": [
"ls"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"tmdb_5000_movies.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',\n",
" 'original_title', 'overview', 'popularity', 'production_companies',\n",
" 'production_countries', 'release_date', 'revenue', 'runtime',\n",
" 'spoken_languages', 'status', 'tagline', 'title', 'vote_average',\n",
" 'vote_count'],\n",
" dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"875.581305"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max(df[\"popularity\"]) - min(df[\"popularity\"])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 150.437577\n",
"1 139.082615\n",
"2 107.376788\n",
"3 112.312950\n",
"4 43.926995\n",
" ... \n",
"4798 14.269792\n",
"4799 0.642552\n",
"4800 1.444476\n",
"4801 0.857008\n",
"4802 1.929883\n",
"Name: popularity, Length: 4803, dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"popularity\"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"df_reduced = df[[\"title\" ,\"spoken_languages\" ,\"production_companies\", \"production_countries\", \"genres\", \"keywords\", \"original_language\", \"popularity\"]]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 United States of America\n",
"1 United States of America\n",
"2 United States of America\n",
"3 United States of America\n",
"4 United States of America\n",
" ... \n",
"4798 United States of America\n",
"4799 N/A\n",
"4800 United States of America\n",
"4801 United States of America\n",
"4802 United States of America\n",
"Name: production_countries, Length: 4803, dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"from collections import Counter\n",
"def de_json_column(dataframe, column_name):\n",
" all_jsons = dataframe[column_name]\n",
" all_names = []\n",
" for jsn in all_jsons:\n",
" jsn = json.loads(jsn)\n",
" for el in jsn:\n",
" all_names.append(el[\"name\"])\n",
" names_count = Counter(all_names)\n",
" def dejson(x):\n",
" jsn = json.loads(x)\n",
" highest = 'N/A'\n",
" c = 0 \n",
" for el in jsn:\n",
" name = el[\"name\"]\n",
" current_c = names_count[name]\n",
" if current_c > c:\n",
" highest = name\n",
" c = current_c\n",
" return highest\n",
" return dataframe[column_name].map(dejson)\n",
"de_json_column(df, 'production_countries')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>spoken_languages</th>\n",
" <th>production_companies</th>\n",
" <th>production_countries</th>\n",
" <th>genres</th>\n",
" <th>keywords</th>\n",
" <th>original_language</th>\n",
" <th>popularity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Avatar</td>\n",
" <td>English</td>\n",
" <td>Twentieth Century Fox Film Corporation</td>\n",
" <td>United States of America</td>\n",
" <td>Action</td>\n",
" <td>3d</td>\n",
" <td>en</td>\n",
" <td>150.437577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Pirates of the Caribbean: At World's End</td>\n",
" <td>English</td>\n",
" <td>Walt Disney Pictures</td>\n",
" <td>United States of America</td>\n",
" <td>Action</td>\n",
" <td>aftercreditsstinger</td>\n",
" <td>en</td>\n",
" <td>139.082615</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Spectre</td>\n",
" <td>English</td>\n",
" <td>Columbia Pictures</td>\n",
" <td>United States of America</td>\n",
" <td>Action</td>\n",
" <td>based on novel</td>\n",
" <td>en</td>\n",
" <td>107.376788</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>The Dark Knight Rises</td>\n",
" <td>English</td>\n",
" <td>Warner Bros.</td>\n",
" <td>United States of America</td>\n",
" <td>Drama</td>\n",
" <td>superhero</td>\n",
" <td>en</td>\n",
" <td>112.312950</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>John Carter</td>\n",
" <td>English</td>\n",
" <td>Walt Disney Pictures</td>\n",
" <td>United States of America</td>\n",
" <td>Action</td>\n",
" <td>based on novel</td>\n",
" <td>en</td>\n",
" <td>43.926995</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4798</th>\n",
" <td>El Mariachi</td>\n",
" <td>Español</td>\n",
" <td>Columbia Pictures</td>\n",
" <td>United States of America</td>\n",
" <td>Thriller</td>\n",
" <td>united states–mexico barrier</td>\n",
" <td>es</td>\n",
" <td>14.269792</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4799</th>\n",
" <td>Newlyweds</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>Comedy</td>\n",
" <td>N/A</td>\n",
" <td>en</td>\n",
" <td>0.642552</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4800</th>\n",
" <td>Signed, Sealed, Delivered</td>\n",
" <td>English</td>\n",
" <td>Muse Entertainment Enterprises</td>\n",
" <td>United States of America</td>\n",
" <td>Drama</td>\n",
" <td>investigation</td>\n",
" <td>en</td>\n",
" <td>1.444476</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4801</th>\n",
" <td>Shanghai Calling</td>\n",
" <td>English</td>\n",
" <td>N/A</td>\n",
" <td>United States of America</td>\n",
" <td>N/A</td>\n",
" <td>N/A</td>\n",
" <td>en</td>\n",
" <td>0.857008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4802</th>\n",
" <td>My Date with Drew</td>\n",
" <td>English</td>\n",
" <td>rusty bear entertainment</td>\n",
" <td>United States of America</td>\n",
" <td>Documentary</td>\n",
" <td>obsession</td>\n",
" <td>en</td>\n",
" <td>1.929883</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>4803 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" title spoken_languages \\\n",
"0 Avatar English \n",
"1 Pirates of the Caribbean: At World's End English \n",
"2 Spectre English \n",
"3 The Dark Knight Rises English \n",
"4 John Carter English \n",
"... ... ... \n",
"4798 El Mariachi Español \n",
"4799 Newlyweds N/A \n",
"4800 Signed, Sealed, Delivered English \n",
"4801 Shanghai Calling English \n",
"4802 My Date with Drew English \n",
"\n",
" production_companies production_countries \\\n",
"0 Twentieth Century Fox Film Corporation United States of America \n",
"1 Walt Disney Pictures United States of America \n",
"2 Columbia Pictures United States of America \n",
"3 Warner Bros. United States of America \n",
"4 Walt Disney Pictures United States of America \n",
"... ... ... \n",
"4798 Columbia Pictures United States of America \n",
"4799 N/A N/A \n",
"4800 Muse Entertainment Enterprises United States of America \n",
"4801 N/A United States of America \n",
"4802 rusty bear entertainment United States of America \n",
"\n",
" genres keywords original_language popularity \n",
"0 Action 3d en 150.437577 \n",
"1 Action aftercreditsstinger en 139.082615 \n",
"2 Action based on novel en 107.376788 \n",
"3 Drama superhero en 112.312950 \n",
"4 Action based on novel en 43.926995 \n",
"... ... ... ... ... \n",
"4798 Thriller united states–mexico barrier es 14.269792 \n",
"4799 Comedy N/A en 0.642552 \n",
"4800 Drama investigation en 1.444476 \n",
"4801 N/A N/A en 0.857008 \n",
"4802 Documentary obsession en 1.929883 \n",
"\n",
"[4803 rows x 8 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.options.mode.chained_assignment = None\n",
"df_reduced[\"spoken_languages\"] = de_json_column(df_reduced, \"spoken_languages\")\n",
"df_reduced[\"production_countries\"] = de_json_column(df_reduced, \"production_countries\")\n",
"df_reduced[\"production_companies\"] = de_json_column(df_reduced, \"production_companies\")\n",
"df_reduced[\"keywords\"] = de_json_column(df_reduced, \"keywords\" )\n",
"df_reduced[\"genres\"] = de_json_column(df_reduced, \"genres\" )\n",
"df_reduced"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df_reduced.to_csv(\"movies.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment