Use the following command to detect a file's character encoding:
file -i somefile.csv
In the output, look for:
charset=<some charset>
# Page header, rendered by Streamlit as Markdown.
# The dataset link target must be a plain URL: a leading "#" makes Markdown
# treat it as an in-page anchor, so the link never leaves the app.
st.write(
    """
# Databricks Streamlit Demo :fire:
This Streamlit application connects to Databricks SQL Endpoint and creates some visualizations based on the [NYC Taxi Dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page).
"""
)
class DataProvider: | |
# above goes some low-level code | |
def _get_data(self, query: str) -> pd.DataFrame: | |
self.logger.debug(f"Running SQL query: {query}") | |
start_time = dt.datetime.now() | |
data = pd.read_sql(query, self.connection) | |
end_time = dt.datetime.now() | |
time_delta = end_time - start_time | |
self.logger.debug( | |
f"Query executed, returning the result. Total query time: {time_delta}" |
This method was tested on a Linux distribution and is based on these sources:
Steps to set up:
pip.conf
. This file may also exist in your $HOME/pip.[global]
import logging | |
def create_logger(name, log_file, level=logging.INFO,filemode='w'): | |
""" | |
Creates logger | |
:param name: logger name | |
:param log_file: logger file | |
:param level: logging level | |
:param filemode: log file mode ('w' - overwrite, 'a' - append) |
# PSI (Population Stability Index) is useful when you want to compare two distributions more gently than 2-KSA, etc. | |
# More docs at http://ucanalytics.com/blogs/population-stability-index-psi-banking-case-study/ | |
import numpy as np | |
import pandas as pd | |
def psi(v1,v2,groups=10): | |
""" | |
v1 - first distribution (1-D array) | |
v2 - second distribution (1-D array) |
import pandas as pd | |
import sklearn.ensemble as ske | |
from sklearn.preprocessing import Imputer | |
from sklearn.feature_selection import SelectFromModel | |
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier | |
from sklearn.base import TransformerMixin | |
from sklearn.cross_validation import train_test_split | |
import numpy as np | |
from tqdm import tqdm | |
import xgboost as xgb |
import numpy as np | |
import pandas as pd | |
# Number of rows in the synthetic demo dataset.
ds_size = 10000 | |
# Random binary labels, each drawn from {0, 1}.
targets = np.random.choice([0,1],size=ds_size) | |
# Random scores drawn uniformly between 0 and 1.
probs = np.random.uniform(0,1,size=ds_size) | |
# The two arrays are passed as rows, then transposed so each becomes a column.
df = pd.DataFrame([targets,probs]).T | |
df.columns=['target','proba'] | |
def getLiftTable(df,targetColumn,probaColumn,quantilesSize=10): |