Skip to content

Instantly share code, notes, and snippets.

@kzzzr
Last active August 4, 2021 13:53
Show Gist options
  • Save kzzzr/88757434a13bb7e650735ff0d86360f9 to your computer and use it in GitHub Desktop.
Save kzzzr/88757434a13bb7e650735ff0d86360f9 to your computer and use it in GitHub Desktop.
Export and analyse MongoDB schema to relational view
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 166,
"source": [
"import yaml\n",
"import pandas as pd\n",
"import glob"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"pd.show_versions()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"yaml.load?"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"!ls | grep yml"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 171,
"source": [
"yml = glob.glob('*.yml')"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 172,
"source": [
"s = []\n",
"\n",
"for y in yml:\n",
" with open(y, 'r') as f:\n",
" s += yaml.load(f, Loader=yaml.FullLoader)['schema']"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"s?"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 174,
"source": [
"df = pd.io.json.json_normalize(s,\n",
" record_path=['tables', 'columns'],\n",
" meta=['db', ['tables', 'table'], ['tables', 'collection']],\n",
" record_prefix='column.',\n",
" meta_prefix='database.')"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 175,
"source": [
"df.columns = ['column.Name', 'column.MongoType', 'column.SqlName', 'column.SqlType',\n",
" 'database', 'database.relational_table', 'database.collection']\n",
"\n",
"column_order = ['database', 'database.relational_table', 'database.collection',\n",
" 'column.Name', 'column.SqlName', 'column.MongoType', 'column.SqlType']\n",
"\n",
"df = df[column_order]"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"df"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"df.describe()"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 178,
"source": [
"df.to_excel(\"output.xlsx\")"
],
"outputs": [],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"df[[\"database\", \"database.collection\"]].drop_duplicates()"
],
"outputs": [],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
# prepare list of databases
declare -a DATABASES=(
"documents"
"communication"
"flights"
)
# loop through databases and generate schemas to yaml files
for db in "${DATABASES[@]}"
do
mongodrdl \
--host=<host_address> \
--port=<port> \
--db=$db \
--username=<username> \
--password=<password> \
--out=./$db.yml \
--sampleSize=10000 # choose number of documents to be sampled
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment