# Spark Streaming SQL demo with netflow
# From stat1004:
# pyspark2 --jars ~otto/spark-sql-kafka-0-10_2.11-2.3.1.jar,~otto/kafka-clients-1.1.0.jar
# spark-sql-kafka provides the Kafka structured streaming source; kafka-clients provides the Kafka serdes.
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Declare a Spark schema that matches the JSON data.
# In a future MEP world, this would be loaded automatically
# from a JSON Schema.
netflow_schema = StructType() \
    .add("event_type", "string") \
    .add("tag2", "integer") \
    .add("as_src", "integer") \
    .add("as_dst", "integer") \
    .add("as_path", "string") \
    .add("peer_as_src", "integer") \
    .add("peer_as_dst", "string") \
    .add("ip_src", "string") \
    .add("ip_dst", "string") \
    .add("port_src", "integer") \
    .add("port_dst", "integer") \
    .add("tcp_flags", "string") \
    .add("ip_proto", "string") \
    .add("stamp_inserted", "timestamp") \
    .add("stamp_updated", "timestamp") \
    .add("packets", "integer") \
    .add("bytes", "integer") \
    .add("writer_id", "string")
# Create a streaming DataFrame from the netflow topic in Kafka.
netflow_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-jumbo1001.eqiad.wmnet:9092") \
    .option("subscribe", "netflow") \
    .load()
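# By default a streaming Kafka source starts at the latest offsets; adding
# .option("startingOffsets", "earliest") above would replay whatever data
# the topic still retains.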
# The JSON data is in the Kafka message value.
# We need to parse it with our schema (using the from_json function),
# and then select the subfields into a top-level DataFrame.
netflow = netflow_stream \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json("value", netflow_schema).alias("data")) \
    .select("data.*")
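# Optional sanity check: the Kafka source also supports bounded batch reads,
# so the same parsing can be tried with spark.read before starting a stream.
# A minimal sketch, assuming the retained topic data is small enough to scan:
netflow_sample = spark.read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-jumbo1001.eqiad.wmnet:9092") \
    .option("subscribe", "netflow") \
    .option("startingOffsets", "earliest") \
    .load() \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json("value", netflow_schema).alias("data")) \
    .select("data.*")
netflow_sample.printSchema()
netflow_sample.show(5, truncate=False)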
# Query the DataFrame using the Spark SQL API.
# This outputs the top 20 counts seen per ip_src (or whatever field you group by),
# updated every 30 seconds.
# NOTE: in "complete" output mode the watermark does not drop any aggregate state;
# it mainly matters for windowed aggregations in append/update mode.
# NOTE: .format("console") writes to the console; the query could instead
# write back to another Kafka topic, a file, or elsewhere (see the Kafka
# sink sketch below).
ip_src_counts = netflow \
    .withWatermark("stamp_updated", "30 seconds") \
    .groupBy("ip_src").count() \
    .orderBy("count", ascending=False) \
    .writeStream \
    .trigger(processingTime="30 seconds") \
    .outputMode("complete") \
    .option("truncate", False) \
    .format("console") \
    .start()
# -------------------------------------------
# Batch: 2
# -------------------------------------------
# +--------------+-----+
# |ip_src |count|
# +--------------+-----+
# |X.X.X.X |8809 |
# |X.X.X.X |8295 |
# |X.X.X.X |743 |
# ...
# +--------------+-----+
# only showing top 20 rows
# To stop the streaming query:
ip_src_counts.stop()
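# Instead of the console sink, the aggregation could be written back to Kafka.
# A minimal sketch: the output topic name and checkpoint path below are
# made-up placeholders, and the Kafka sink requires a string (or binary)
# "value" column, hence the to_json(struct(...)) projection.
ip_src_counts_kafka = netflow \
    .withWatermark("stamp_updated", "30 seconds") \
    .groupBy("ip_src").count() \
    .select(to_json(struct("ip_src", "count")).alias("value")) \
    .writeStream \
    .trigger(processingTime="30 seconds") \
    .outputMode("complete") \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka-jumbo1001.eqiad.wmnet:9092") \
    .option("topic", "netflow_ip_src_counts") \
    .option("checkpointLocation", "/tmp/netflow_ip_src_counts_checkpoint") \
    .start()
# ip_src_counts_kafka.stop()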
# See also:
# https://databricks.com/blog/2016/07/28/structured-streaming-in-apache-spark.html
# https://spark.apache.org/docs/2.3.1/structured-streaming-kafka-integration.html
# https://spark.apache.org/docs/2.3.1/structured-streaming-programming-guide.html
# https://databricks.com/blog/2017/05/08/event-time-aggregation-watermarking-apache-sparks-structured-streaming.html
#