Allie .S Ubisse (AllieUbisse): GitHub gists
shanealynn / python batch geocoding.py
Last active January 6, 2024 13:48
Geocode as many addresses as you'd like with a powerful Python and Google Geocoding API combination
"""
Python script for batch geocoding of addresses using the Google Geocoding API.
This script allows for massive lists of addresses to be geocoded for free by pausing when the
geocoder hits the free rate limit set by Google (2500 per day). If you have an API key for paid
geocoding from Google, set it in the API key section.
Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses
come from a csv file with a column "Address". Adjust the code to your own requirements as needed.
After every 500 successul geocode operations, a temporary file with results is recorded in case of
script failure / loss of connection later.
Addresses and data are held in memory, so this script may need to be adjusted to process files line
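The core loop the description refers to can be sketched as follows. This is a minimal illustration, not shanealynn's exact code; the input filename, CSV layout, and checkpoint format are assumptions:

import csv
import time
import requests

def geocode_address(address, api_key=None):
    # Query the Google Geocoding API for one address and return the parsed JSON.
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {"address": address}
    if api_key is not None:
        params["key"] = api_key
    return requests.get(url, params=params).json()

with open("input.csv") as f:
    addresses = [row["Address"] for row in csv.DictReader(f)]

results = []
for address in addresses:
    while True:
        response = geocode_address(address)
        if response["status"] != "OVER_QUERY_LIMIT":
            break
        time.sleep(30 * 60)  # daily quota hit: wait, then retry the same address
    results.append({"address": address, "status": response["status"]})
    if len(results) % 500 == 0:
        # Checkpoint partial results so a crash or dropped connection
        # does not lose everything geocoded so far.
        with open("temp_results.csv", "w", newline="") as out:
            writer = csv.DictWriter(out, fieldnames=["address", "status"])
            writer.writeheader()
            writer.writerows(results)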
hammadzz / pyspark_help.md
Last active August 23, 2020 12:13
PySpark HelpSheet
bshishov / forecasting_metrics.py
Last active April 20, 2024 04:29
Python Numpy functions for most common forecasting metrics
import numpy as np
EPSILON = 1e-10
def _error(actual: np.ndarray, predicted: np.ndarray):
""" Simple error """
return actual - predicted
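The gist builds its metric functions on this helper; representative examples in the same style (sketches consistent with the helper above, not necessarily the gist's exact definitions) would be:

def mse(actual: np.ndarray, predicted: np.ndarray):
    """ Mean Squared Error """
    return np.mean(np.square(_error(actual, predicted)))

def rmse(actual: np.ndarray, predicted: np.ndarray):
    """ Root Mean Squared Error """
    return np.sqrt(mse(actual, predicted))

def mape(actual: np.ndarray, predicted: np.ndarray):
    """ Mean Absolute Percentage Error; EPSILON guards the division against zeros """
    return np.mean(np.abs(_error(actual, predicted) / (actual + EPSILON)))

The next snippet comes from a separate gist, a Spark Structured Streaming script (sstreaming-spark-final.py, per its spark-submit line):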
'''
spark/bin/spark-submit \
--master local --driver-memory 4g \
--num-executors 2 --executor-memory 4g \
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.0 \
sstreaming-spark-final.py
'''
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import expr
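Given these imports and the spark-sql-kafka package on the spark-submit line, the job evidently reads a stream from Kafka. A minimal sketch of such a job follows; the broker address, topic name, and console sink are placeholder assumptions:

spark = SparkSession.builder.appName("sstreaming-spark-final").getOrCreate()

# Read a stream from Kafka; server and topic are placeholders.
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "events")
      .load())

# Kafka delivers key/value as binary, so cast the payload to a string.
values = df.select(expr("CAST(value AS STRING) AS raw"))

# Write to the console sink for inspection and block until terminated.
query = (values.writeStream
         .format("console")
         .outputMode("append")
         .start())
query.awaitTermination()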
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
import mlflow
import mlflow.sklearn
import numpy as np
# Launch the experiment on mlflow
experiment_name = "electricityconsumption-forecast"
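The snippet stops after naming the experiment. With the imports above, registering the experiment and logging a fitted model typically looks like the following sketch; the training variables and the hyperparameter value are placeholders:

mlflow.set_experiment(experiment_name)

with mlflow.start_run():
    # X_train, y_train, X_test, y_test are assumed to be defined elsewhere.
    model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
    preds = model.predict(X_test)

    mlflow.log_param("n_neighbors", 5)
    mlflow.log_metric("mse", mean_squared_error(y_test, preds))
    mlflow.log_metric("mae", mean_absolute_error(y_test, preds))
    mlflow.log_metric("r2", r2_score(y_test, preds))
    mlflow.log_metric("explained_variance", explained_variance_score(y_test, preds))
    mlflow.sklearn.log_model(model, "model")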
liorshk / mlflow_gridsearch.py
Created April 22, 2020 15:24
Create MLFlow runs with Sklearn Gridsearch object
from sklearn.model_selection import GridSearchCV

def log_run(gridsearch: GridSearchCV, experiment_name: str, model_name: str, run_index: int, conda_env, tags=None):
    """Log cross-validation results to an MLflow tracking server.
    Args:
        gridsearch (GridSearchCV): Fitted sklearn grid search object
        experiment_name (str): Experiment name
        model_name (str): Name of the model
        run_index (int): Index of the run (in the grid search)
        conda_env (dict): A dictionary that describes the conda environment (MLflow format)
        tags (dict): Dictionary of extra data and tags (usually features)
    """
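    # The gist preview is truncated here. The body below is a sketch of what
    # such a logger typically does, not liorshk's exact code; the cv_results_
    # fields are standard sklearn, the logging choices are assumptions.
    cv_results = gridsearch.cv_results_
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run(run_name="%s-run-%d" % (model_name, run_index)):
        # Hyperparameters and scores for this grid-search candidate.
        mlflow.log_params(cv_results["params"][run_index])
        mlflow.log_metric("mean_test_score", cv_results["mean_test_score"][run_index])
        mlflow.log_metric("std_test_score", cv_results["std_test_score"][run_index])
        mlflow.set_tags(tags or {})
        # Log the refitted best estimator together with its conda environment.
        mlflow.sklearn.log_model(gridsearch.best_estimator_, model_name, conda_env=conda_env)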
carlleston / 3-ln_model.ipynb
Last active August 23, 2020 12:09
Pre-processing and linear model in PySpark
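The notebook itself does not render here. As a stand-in, a minimal PySpark pre-processing plus linear regression pipeline of the kind the title describes might look like this; the file name and column names are placeholders:

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.appName("ln_model").getOrCreate()
df = spark.read.csv("data.csv", header=True, inferSchema=True)

# Pre-processing: assemble raw columns into a feature vector, then scale it.
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="raw_features")
scaler = StandardScaler(inputCol="raw_features", outputCol="features")
lr = LinearRegression(featuresCol="features", labelCol="label")

pipeline = Pipeline(stages=[assembler, scaler, lr])
model = pipeline.fit(df)
predictions = model.transform(df)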