Timothy Renner (timothyrenner): GitHub Gists
timothyrenner / TableJoinKafkaStream.java
Created June 16, 2016 23:05
Example of KTable-KTable join in Kafka Streams
package io.github.timothyrenner.kstreamex.tablejoin;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KTable; // implied by the KTable-KTable join below
timothyrenner / tweet_utils.py
Last active July 29, 2021 22:32
Python Utilities for Tweets
from datetime import datetime
import string

from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords

# Gets the tweet time.
def get_time(tweet):
    return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")
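
The preview cuts off before the utilities that use the nltk imports. A minimal sketch of what a tokenizer built on those imports could look like; the helper name and the exact cleaning steps are assumptions for illustration, not the gist's actual code:

stemmer = LancasterStemmer()
stop_words = set(stopwords.words("english"))

# Hypothetical helper: lowercase, strip punctuation, drop stop words,
# and stem whatever remains.
def get_tokens(tweet):
    text = tweet['text'].lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    return [stemmer.stem(w) for w in text.split() if w not in stop_words]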
timothyrenner / bigfoot_streamlit_app.py
Last active June 23, 2020 13:16
Bigfoot Sightings Streamlit App
import streamlit as st
import pandas as pd
import altair as alt
import pydeck as pdk
import os
from dateutil.parser import parse

try:
    from dotenv import load_dotenv, find_dotenv
    # Assumed completion of the truncated preview: load .env if
    # python-dotenv is installed.
    load_dotenv(find_dotenv())
except ImportError:
    pass
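
A minimal sketch of the app's core pattern, assuming a local CSV of sightings with latitude and longitude columns (the file name and column names are hypothetical, not necessarily what the gist loads):

# Hypothetical data file and columns.
bigfoot = pd.read_csv("bigfoot_sightings.csv")

st.title("Bigfoot Sightings")

# One point per sighting on a deck.gl scatterplot layer.
layer = pdk.Layer(
    "ScatterplotLayer",
    data=bigfoot[["longitude", "latitude"]].dropna(),
    get_position=["longitude", "latitude"],
    get_radius=5000,
)
st.pydeck_chart(pdk.Deck(layers=[layer]))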
timothyrenner / gfl_with_h3.py
Created December 11, 2019 13:26
Graph Fused Lasso + H3
from h3 import h3
from pygfl.easy import solve_gfl

def build_neighbor_edges(hexids):
    # Hash the hexid to the position so we can easily look
    # up where the original hexid position is in the array.
    hexid_to_position = {h: ii for ii, h in enumerate(hexids)}
    edges = []
    for h in hexids:
        for n in h3.k_ring(h, 1):
            # Assumed completion of the truncated preview: keep only
            # edges to neighbors that are actually in the data.
            if n != h and n in hexid_to_position:
                edges.append((hexid_to_position[h], hexid_to_position[n]))
    return edges
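
A sketch of how the edges could feed the solver; the toy inputs are hypothetical, and the solve_gfl(data, edges) signature is my reading of pygfl's easy API, so treat it as an assumption:

import numpy as np

# Hypothetical toy input: hexes around a point, with a noisy count per hex.
hexids = list(h3.k_ring(h3.geo_to_h3(37.77, -122.42, 7), 2))
counts = np.random.poisson(5, size=len(hexids)).astype(float)

edges = build_neighbor_edges(hexids)
beta = solve_gfl(counts, edges)  # smoothed values, one per hex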
timothyrenner / pyspark_pandas_udf_call.py
Created January 30, 2019 22:26
Pyspark Pandas UDF Call
from pyspark.sql.functions import col

data_frame.withColumn(
    "prediction",
    predict_pandas_udf(col("feature1"), col("feature2"), ...)
)
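
Each argument arrives inside the UDF as a pandas Series holding a whole batch of rows (shipped through Apache Arrow), so the model scores batches rather than one row per call; that is the speedup over the plain row-at-a-time UDF further down.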
timothyrenner / pyspark_pandas_udf_creation.py
Created January 30, 2019 22:19
Pyspark Pandas UDF Creation
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType

@pandas_udf(returnType=DoubleType())
def predict_pandas_udf(*features):
    """ Executes the prediction using numpy arrays. """
    # Assumed completion of the truncated preview: stack the feature
    # Series into a 2-d array and score it in one vectorized call.
    # `model` stands in for a preloaded scikit-learn-style estimator.
    X = pd.concat(features, axis=1).values
    return pd.Series(model.predict(X))
timothyrenner / pyspark_partition_call.py
Created January 30, 2019 22:13
Pyspark Partition Call
my_data.rdd.mapPartitions(predict_partition).toDF()
timothyrenner / pyspark_partition_definition.py
Created January 30, 2019 16:06
Pyspark Partition Definition
import pandas as pd

# We'll need this handy list more than once. It enforces the
# column order required by the model.
FEATURES = ["feature1", "feature2", "feature3", ...]

def predict_partition(rows):
    """ Calls a vectorized prediction by loading the partition into memory. """
    # Assumed completion: score the whole partition in one call;
    # `model` stands in for a preloaded estimator.
    rows_df = pd.DataFrame([row.asDict() for row in rows])
    rows_df["prediction"] = model.predict(rows_df[FEATURES].values)
    return iter(rows_df.itertuples(index=False))
timothyrenner / pyspark_udf_call.py
Created January 30, 2019 16:02
Pyspark UDF Call
from pyspark.sql.functions import col

my_df.withColumn(
    "predicted_score",
    predict_udf(col("feature1"), col("feature2"), ...)
)
timothyrenner / pyspark_udf_creation.py
Created January 30, 2019 15:59
Pyspark UDF Creation
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Wrap the (separately defined) scalar predict function as a Spark UDF
# returning a double, called once per row.
predict_udf = udf(predict, DoubleType())
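
The predict function itself isn't shown in the preview. A minimal sketch of what a row-at-a-time predictor could look like, where the body and the preloaded model are assumptions for illustration:

def predict(feature1, feature2):
    # Hypothetical: score one row with a scikit-learn-style model.
    return float(model.predict([[feature1, feature2]])[0])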