kzzzr/parse_mongodb_schema.ipynb

## parse_mongodb_schema.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 166,
   "source": [
    "import yaml\n",
    "import pandas as pd\n",
    "import glob"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "pd.show_versions()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "yaml.load?"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "!ls | grep yml"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "source": [
    "yml = glob.glob('*.yml')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "source": [
    "s = []\n",
    "\n",
    "for y in yml:\n",
    "    with open(y, 'r') as f:\n",
    "        s += yaml.load(f, Loader=yaml.FullLoader)['schema']"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "s?"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "source": [
    "df = pd.io.json.json_normalize(s,\n",
    "                               record_path=['tables', 'columns'],\n",
    "                               meta=['db', ['tables', 'table'], ['tables', 'collection']],\n",
    "                               record_prefix='column.',\n",
    "                               meta_prefix='database.')"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "source": [
    "df.columns = ['column.Name', 'column.MongoType', 'column.SqlName', 'column.SqlType',\n",
    "       'database', 'database.relational_table', 'database.collection']\n",
    "\n",
    "column_order = ['database', 'database.relational_table', 'database.collection',\n",
    "        'column.Name', 'column.SqlName', 'column.MongoType', 'column.SqlType']\n",
    "\n",
    "df = df[column_order]"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "df"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "df.describe()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "source": [
    "df.to_excel(\"output.xlsx\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "df[[\"database\", \"database.collection\"]].drop_duplicates()"
   ],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

## schema.sh
# prepare list of databases
  declare -a DATABASES=(
    "documents"
    "communication"
    "flights"
    )

# loop through databases and generate schemas to yaml files
    for db in "${DATABASES[@]}"
    do
    mongodrdl \
        --host=<host_address> \
        --port=<port> \
        --db=$db \
        --username=<username> \
        --password=<password> \
        --out=./$db.yml \
        --sampleSize=10000 # choose number of documents to be sampled
    done
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 166,
	"source": [
	"import yaml\n",
	"import pandas as pd\n",
	"import glob"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"pd.show_versions()"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"yaml.load?"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"!ls \| grep yml"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": 171,
	"source": [
	"yml = glob.glob('*.yml')"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": 172,
	"source": [
	"s = []\n",
	"\n",
	"for y in yml:\n",
	" with open(y, 'r') as f:\n",
	" s += yaml.load(f, Loader=yaml.FullLoader)['schema']"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"s?"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": 174,
	"source": [
	"df = pd.io.json.json_normalize(s,\n",
	" record_path=['tables', 'columns'],\n",
	" meta=['db', ['tables', 'table'], ['tables', 'collection']],\n",
	" record_prefix='column.',\n",
	" meta_prefix='database.')"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": 175,
	"source": [
	"df.columns = ['column.Name', 'column.MongoType', 'column.SqlName', 'column.SqlType',\n",
	" 'database', 'database.relational_table', 'database.collection']\n",
	"\n",
	"column_order = ['database', 'database.relational_table', 'database.collection',\n",
	" 'column.Name', 'column.SqlName', 'column.MongoType', 'column.SqlType']\n",
	"\n",
	"df = df[column_order]"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"df"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"df.describe()"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": 178,
	"source": [
	"df.to_excel(\"output.xlsx\")"
	],
	"outputs": [],
	"metadata": {}
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"source": [
	"df[[\"database\", \"database.collection\"]].drop_duplicates()"
	],
	"outputs": [],
	"metadata": {}
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}
	# prepare list of databases
	declare -a DATABASES=(
	"documents"
	"communication"
	"flights"
	)

	# loop through databases and generate schemas to yaml files
	for db in "${DATABASES[@]}"
	do
	mongodrdl \
	--host=<host_address> \
	--port=<port> \
	--db=$db \
	--username=<username> \
	--password=<password> \
	--out=./$db.yml \
	--sampleSize=10000 # choose number of documents to be sampled
	done