zeryx zeryx

## algorithm.py
from Algorithmia import ADK
import joblib


## This function uses the model manifest `state` or `modelData` class to get model files defined in the model manifest automatically.
## No client work required, just make sure the name in `get_model` matches the name in your model manifest.
def load(state):
    """Populate `state` with the deserialized model artifacts and return it.

    For each of the names "model" and "vectorizer", the file handle/path
    returned by `state.get_model(name)` is passed to `joblib.load`, and the
    result is stored back on `state` under the same key.
    """
    for artifact_name in ("model", "vectorizer"):
        artifact_source = state.get_model(artifact_name)
        state[artifact_name] = joblib.load(artifact_source)
    return state

## model_reloading.py
import Algorithmia
from time import time
import pickle
from src.data import data

## Algorithmia client constructed with no arguments.
client = Algorithmia.client()
## Data-URI directory and file name; presumably where the pickled model lives (usage not shown here -- confirm).
DATA_MODEL_DIR = "data://.my/example"
MODEL_NAME = "example.pkl"
## NOTE(review): these look like bookkeeping for deciding when to reload the model
## (a timestamp and a last-modified marker); the code that reads them is not visible -- confirm.
TIME_0 = 0
LAST_MODIFIED = ""

## algorithm_with_lock.py
from Algorithmia import ADK
import Algorithmia
from time import sleep, time

## Data URIs under the same `locking` directory; presumably the shared resource file
## and the lock marker file (the code that uses them is not visible here -- confirm).
state_file_path = "data://.my/locking/resource.json"
lock_file_path = "data://.my/locking/lock"
## Algorithmia client constructed with no arguments.
client = Algorithmia.client()


class AlgorithmiaLock(object):

## algorithm_process_pandas_df.py
import Algorithmia
import pandas as pd
## Module-level Algorithmia client (constructed with no arguments); `apply` uses it to fetch the input file.
client = Algorithmia.client()


def apply(input):
  """Build a pandas DataFrame from the JSON file referenced by `input`.

  `input` is handed to `client.file(...)`; the JSON it returns is converted
  with `pd.DataFrame.from_dict`. The rest of the algorithm is elided (`...`).
  """
  json_payload = client.file(input).getJson()
  input_dataframe = pd.DataFrame.from_dict(json_payload)
  ...
  ...

## generative_model_finetuning.py

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AdamW
from random import choice
from torch.nn import functional as F
import torch

## Load the pretrained "gpt2" tokenizer and causal language model; the model is moved to the 'cuda' device.
## NOTE(review): the hard-coded 'cuda' target presumably requires a CUDA-capable GPU at import time -- confirm before running on CPU-only hosts.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to('cuda')

## subseq.py
import sys
from time import perf_counter

class SequenceDiscoveryNode:
    def __init__(self, parent, value):
        self.parent = parent
        self.children = []
        self.value = value

    def construct_tree(self, remaining_sequence: list):

## mleap_spark.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                zeryx
                / mleap_spark.md
            
            
              Last active
              September 28, 2020 21:17
            
          
    MLeap + Algorithmia: When to leave your Spark pipeline behind for scalable deployment


Intro

Spark is a very powerful big data processing system that's capable of insane workloads. Sometimes, though, there are critical paths that don't scale as effectively as you might want. In this blog post, we'll be discussing Spark and Spark Pipelines, and how you might be able to export a critical component from your Spark project to Algorithmia by using the MLeap model interchange format and runtime.
What makes Spark great?

Apache Spark is, at its core, a distributed data transformation engine for very large datasets and workloads. It links directly with very powerful and battle-tested distributed data systems like Hadoop and Cassandra, which are industry standard for working in spaces such as the financial industry.

  
## Algorithm.scala
package com.algorithmia

import com.algorithmia.handler.AbstractAlgorithm
import ml.combust.bundle.BundleFile
import ml.combust.bundle.dsl.Bundle
import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.MleapSupport._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row, Transformer}

import scala.collection.mutable

## procedure_and_agenda.md

      
              1 file
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                zeryx
                / procedure_and_agenda.md
            
            
              Created
              August 26, 2020 20:37
            
          
    What are we going to do today?

We'll understand what this GitLab -> Algorithmia integration does
GitLab -> Algorithmia Procedure:

Create a new Algorithm on Algorithmia
Create a new project in GitLab
Add our secret variables to the GitLab project from Algorithmia
Clone both Git repositories to our local system


Copy over template code from the Algorithmia repo to the GitLab repo


## databricks_mleap_example.scala
import ml.combust.bundle.BundleFile
import ml.combust.mleap.spark.SparkSupport._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.ml.feature.{Binarizer, StringIndexer}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import resource._
  // Name of the CSV file to load.
  val datasetName = "example-data.csv"
  // Read the CSV with the "header" option enabled, then replace the "test_double" column with itself cast to double.
  // NOTE(review): `spark` is not defined in this snippet; presumably the session provided by the notebook environment -- confirm.
  val dataframe: DataFrame = spark.sqlContext.read.format("csv").option("header", true).load(datasetName).withColumn("test_double", col("test_double").cast("double"))
	from Algorithmia import ADK
	import joblib


	## This function uses the model manifest `state` or `modelData` class to get model files defined in the model manifest automatically.
	## No client work required, just make sure the name in `get_model` matches the name in your model manifest.
	def load(state):
		"""Populate `state` with the deserialized model artifacts and return it.

		`state.get_model(name)` supplies the source for each artifact, which is
		deserialized with `joblib.load` and stored on `state` under "model" and
		"vectorizer" respectively.
		"""
		# Body indentation restored: the flattened copy had these statements at
		# the same level as the `def`, which is not valid Python.
		state['model'] = joblib.load(state.get_model("model"))
		state['vectorizer'] = joblib.load(state.get_model("vectorizer"))
		return state
	import Algorithmia
	from time import time
	import pickle
	from src.data import data

	client = Algorithmia.client()
	DATA_MODEL_DIR = "data://.my/example"
	MODEL_NAME = "example.pkl"
	TIME_0 = 0
	LAST_MODIFIED = ""
	from Algorithmia import ADK
	import Algorithmia
	from time import sleep, time

	state_file_path = "data://.my/locking/resource.json"
	lock_file_path = "data://.my/locking/lock"
	client = Algorithmia.client()


	class AlgorithmiaLock(object):
	import Algorithmia
	import pandas as pd
	client = Algorithmia.client()



	def apply(input):
		"""Build a pandas DataFrame from the JSON file referenced by `input`.

		`input` is passed to `client.file(...)`; the JSON it returns is converted
		with `pd.DataFrame.from_dict`. The rest of the algorithm is elided (`...`).
		"""
		# Body indentation restored: the flattened copy had these statements at
		# the same level as the `def`, which is not valid Python.
		input_dataframe = pd.DataFrame.from_dict(client.file(input).getJson())
		...
		...

	from transformers import AutoTokenizer, AutoModelForCausalLM
	from transformers import AdamW
	from random import choice
	from torch.nn import functional as F
	import torch

	tokenizer = AutoTokenizer.from_pretrained("gpt2")
	model = AutoModelForCausalLM.from_pretrained("gpt2").to('cuda')
	import sys
	from time import perf_counter

	class SequenceDiscoveryNode:
	def __init__(self, parent, value):
	self.parent = parent
	self.children = []
	self.value = value

	def construct_tree(self, remaining_sequence: list):
	package com.algorithmia

	import com.algorithmia.handler.AbstractAlgorithm
	import ml.combust.bundle.BundleFile
	import ml.combust.bundle.dsl.Bundle
	import ml.combust.mleap.core.types._
	import ml.combust.mleap.runtime.MleapSupport._
	import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row, Transformer}

	import scala.collection.mutable
	import ml.combust.bundle.BundleFile
	import ml.combust.mleap.spark.SparkSupport._
	import org.apache.spark.ml.Pipeline
	import org.apache.spark.ml.bundle.SparkBundleContext
	import org.apache.spark.ml.feature.{Binarizer, StringIndexer}
	import org.apache.spark.sql._
	import org.apache.spark.sql.functions._
	import resource._
	val datasetName = "example-data.csv"
	val dataframe: DataFrame = spark.sqlContext.read.format("csv").option("header", true).load(datasetName).withColumn("test_double", col("test_double").cast("double"))