"<img style='float:left' src=''>\n",
"**Work with data & machine learning models**\n",
"* easily store data in a high-performance data cluster (MongoDB)\n",
"* store your fitted or unfitted scikit-learn models\n",
"* run predictions on the compute cluster directly from stored data\n",
"* store & use remote data (ftp, http, s3)\n",
"**Easily use compute resources in the cluster**\n",
"* fit models in the compute cluster, in parallel\n",
"* perform grid search\n",
"* all asynchronously\n",
"**Share data, notebooks**\n",
"* write, store & share notebooks directly online, no setup required\n",
"* run jobs on a regular schedule\n",
"* share notebooks and data across users\n",
"**Automatic REST API for any client**\n",
"* datasets\n",
"* models\n",
"* jobs (reports)\n",
"* arbitrary custom scripts (python)\n",
"**On-Premise or On-Cloud Custom Installation**\n",
"* customizable backends (e.g. Spark, R, SAS)\n",
"* custom runtimes (e.g. dask, Spark)\n",
"* arbitrary data storage extensions API\n",
"* custom data types extensions API\n",
"* native-Python data streaming API (like Spark Streaming, but much simpler)"
"!pip install pandas_datareader tqdm"
"import omegaml as om \n",
"# list datasets stored in cluster\n",
"# list models stored in cluster\n",
"# list jobs & results stored in cluster\n",
"# Enterprise Edition\n",
"# list custom scripts stored in cluster\n",
"# om.scripts.list()"
"# store any python data\n",
"om.datasets.put(['any data'], 'mydata')\n",
"# store numpy arrays and pandas dataframes\n",
"import pandas as pd\n",
"from sklearn.datasets import load_iris\n",
"X, y = load_iris(True)\n",
"data = pd.DataFrame(X)\n",
"data['y'] = y\n",
"om.datasets.put(data, 'iris')\n",
"# Enterprise Edition\n",
"# store remote datasets as a reference (no copy)\n",
"# om.datasets.put('', 'demographics')\n",
"# om.datasets.get('demographics')"
"# store financial time series including indices\n",
"%matplotlib inline\n",
"import pandas as pd\n",
"import pandas_datareader.data as web\n",
"import datetime\n",
"start = datetime.datetime(2017, 1, 1)\n",
"end = datetime.datetime(2018, 1, 31)\n",
"prices = web.DataReader(\"GOOGL\", 'yahoo', start, end)\n",
"# get data back in their original format\n",
"om.datasets.put(prices, 'google', append=False)\n",
"prices = om.datasets.get('google')\n",
"# filter data in the database -- notice the nice syntax\n",
"%time om.datasets.get('google', Close__gte=900, Close__lte=920)"
"# build a large dataset to filter & aggregate below (let's make it large)\n",
"from tqdm import tqdm\n",
"N = 1e6\n",
"ldf_google_large = om.datasets.getl('google-large')\n",
"dupl = int((N - len(ldf_google_large or [])) / len(prices) + 1)\n",
"for i in tqdm(range(dupl)):\n",
" om.datasets.put(prices, 'google-large')\n",
"print(\"google-large has {} records\".format(len(om.datasets.getl('google-large'))))"
"# filter & aggregate data locally (let's make it large)\n",
"def getdata():\n",
" data = om.datasets.get('google-large')\n",
" return data[(data.Close >= 900) & (data.Close <= 920)].mean() \n",
"%time getdata()"
"# filter and aggregate by database - 2-3x faster\n",
"%time om.datasets.getl('google-large', Close__gte=900, Close__lte=920).mean().iloc[0]"
"# index based access by loading data first\n",
"def getdata():\n",
" dfx = om.datasets.get('google-large')\n",
" return dfx.loc[pd.to_datetime('2017-01-03')]\n",
"%time getdata()"
"# index-based access directly in database\n",
"dfx = om.datasets.getl('google-large')\n",
"%time dfx.loc[pd.to_datetime('2017-01-03')].value"
"# train models locally\n",
"%matplotlib inline\n",
"import pandas as pd \n",
"from sklearn.svm import SVR\n",
"prices = om.datasets.get('google')\n",
"X = prices[['High', 'Low']].rolling(5).mean().dropna()\n",
"y = prices.iloc[4:]['Close']\n",
"print(X.shape, y.shape)\n",
"train_loc = X.shape[0] // 2\n",
"model = SVR(kernel='linear', tol=0.1)\n",
"model.fit(X.iloc[0:train_loc], y.iloc[0:train_loc])\n",
"r2 = model.score(X, y)\n",
"yhat = pd.DataFrame({'yhat': model.predict(X[train_loc:])})\n",
"yhat.index = X.index[train_loc:]\n",
"ax = prices.iloc[train_loc:]['Close'].plot()\n",
"yhat.plot(color='r', ax=ax)"
"# predict remotely\n",
"# store models and new data\n",
"om.models.put(model, 'google-predict')\n",
"om.datasets.put(X[train_loc:], 'google-rolling', append=False)\n",
"# then predict remotely\n",
"pred = om.runtime.model('google-predict').predict('google-rolling[High,Low]').get()\n",
"# show results\n",
"pred = pd.DataFrame({'yhat': pred}, index=range(len(pred)))\n",
"actual = om.datasets.get('google[Close]').iloc[train_loc:]\n",
"pred.index = actual.index[:len(pred)]\n",
"ax = actual.plot()\n",
"pred.plot(color='r', ax=ax)"
"# we can also train remotely\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import numpy as np\n",
"iris = load_iris()\n",
"X = iris.data\n",
"y = iris.target\n",
"df = pd.DataFrame(X)\n",
"df['y'] = y\n",
"from sklearn.cluster import KMeans\n",
"model = KMeans(n_clusters=8)\n",
"# fit & predict remotely\n",
"om.models.drop('iris-model', True)\n",
"om.models.put(model, 'iris-model')\n",
"om.runtime.model('iris-model').fit(X, y).get()\n",
"# get back remote fitted model and show results\n",
"model = om.models.get('iris-model')\n",
"labels = model.labels_\n",
"fig = plt.figure(figsize=(4, 3))\n",
"ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)\n",
"ax.scatter(X[:, 3], X[:, 0], X[:, 2],\n",
" c=labels.astype(np.float), edgecolor='k')\n",
"# we store lots of information on models\n",
"# perform gridsearch on cluster\n",
"om.datasets.put(df, 'iris', append=False)\n",
"params = {\n",
" 'n_clusters': range(1,8),\n",
" }\n",
"om.runtime.model('iris-model').gridsearch('iris[^y]', 'iris[y]', parameters=params).get()"
"# see what gridsearch results we have\n",
"gsresult = om.models.metadata('iris-model')['attributes']['gridsearch']"
"# look at gridsearch results\n",
"gsModel = gsresult[0]['gsModel']\n",
"gs = om.models.get(gsModel)\n",
"# use the model REST API \n",
"import requests\n",
"from omegaml.client.auth import OmegaRestApiAuth\n",
"import omegaml as om \n",
"# -- setup authentication and API URL\n",
"auth = OmegaRestApiAuth.make_from(om)\n",
"url = getattr(om.defaults, 'OMEGA_RESTAPI_URL', 'http://localhost:5000')\n",
"modelname = 'iris-model'\n",
"dataset = 'iris'\n",
"# -- prepare dataset\n",
"om.datasets.put(pd.DataFrame(X), 'iris', append=False)\n",
"# -- call REST API\n",
"print('Requesting from', url)\n",
"resp = requests.put('{url}/api/v1/model/{modelname}/predict?datax={dataset}'.format(**locals()), auth=auth)\n",
"# use the datasets REST API \n",
"import requests\n",
"print('Requesting from', url)\n",
"resp = requests.get('{url}/api/v1/dataset/{dataset}'.format(**locals()), auth=auth)\n",
"# Enterprise Edition\n",
"# deploy lambda-style arbitrary algorithms\n",
"# om.scripts.put('pkg:///app/omegapkg/demo/helloworld/', 'helloworld')"
"# Enterprise Edition\n",
"# run lambdas\n",
"# from datetime import datetime\n",
"# dtnow = datetime.now()\n",
"# om.runtime.script('helloworld').run(foo=dtnow).get()"
"# Enterprise Edition\n",
"# use REST API to run lambdas\n",
"# import requests\n",
"# from omegacommon.auth import OmegaRestApiAuth\n",
"# auth = OmegaRestApiAuth(**auth_config)\n",
"# resp ='', \n",
"# params=dict(foo=dtnow), auth=auth)\n",
"# resp.json()"
"# run jobs (python notebooks) online\n",
"if 'scheduled-report.ipynb' in om.jobs.list():\n",
" om.runtime.job('scheduled-report').run()\n",
"## Enterprise Edition\n",
"### per-user online dashboard \n",
" \n",
"### per-user online notebook automated setup\n",
