Skip to content

Instantly share code, notes, and snippets.

@hrchu
Created January 17, 2018 07:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hrchu/2b2590f2f737ef430ac32b7f8edc15c0 to your computer and use it in GitHub Desktop.
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Consumes messages from one or more topics in Kafka and does wordcount.
Usage: structured_kafka_wordcount.py <bootstrap-servers> <subscribe-type> <topics>
<bootstrap-servers> The Kafka "bootstrap.servers" configuration. A
comma-separated list of host:port.
<subscribe-type> There are three kinds of type, i.e. 'assign', 'subscribe',
'subscribePattern'.
|- <assign> Specific TopicPartitions to consume. Json string
| {"topicA":[0,1],"topicB":[2,4]}.
|- <subscribe> The topic list to subscribe. A comma-separated list of
| topics.
|- <subscribePattern> The pattern used to subscribe to topic(s).
| Java regex string.
|- Only one of "assign", "subscribe" or "subscribePattern" options can be
| specified for Kafka source.
<topics> Different value format depends on the value of 'subscribe-type'.
Run the example
`$ bin/spark-submit examples/src/main/python/sql/streaming/structured_kafka_wordcount.py \
host1:port1,host2:port2 subscribe topic1,topic2`
"""
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, udf
from pyspark.sql.functions import split
from pyspark.sql.types import StringType, ArrayType
@udf(returnType=ArrayType(StringType()))
def my_split2(x):
    """Spark SQL UDF: tokenize *x* on single spaces, returning an array of strings."""
    tokens = x.split(' ')
    return tokens
def main(bootstrapServers, subscribeType, topics):
    """Run a streaming word count over messages consumed from Kafka.

    bootstrapServers -- Kafka "bootstrap.servers" value, a comma-separated
                        list of host:port.
    subscribeType    -- 'assign', 'subscribe', or 'subscribePattern'.
    topics           -- topic specification whose format depends on
                        subscribeType.
    Blocks until the streaming query terminates.
    """
    session = (
        SparkSession
        .builder
        .appName("StructuredKafkaWordCount")
        .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0')
        .getOrCreate()
    )

    # Streaming DataFrame of Kafka records, keeping only the value as text.
    incoming = (
        session
        .readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrapServers)
        .option(subscribeType, topics)
        .load()
        .selectExpr("CAST(value AS STRING)")
    )

    # Inline-defined UDF kept from the original experiment; not used below.
    inline_split = udf(lambda x: x.split(' '), ArrayType(StringType()))

    # One row per word: explode the token array produced by the UDF.
    # split(incoming.value, ' ')      # no warning
    # inline_split(incoming.value)    # the warning occurred
    exploded = incoming.select(
        explode(
            my_split2(incoming.value)  # the warning occurred
        ).alias('word')
    )

    # Running count per distinct word.
    counts = exploded.groupBy('word').count()

    # Print the complete counts table to the console on every trigger,
    # then wait for the query to finish.
    running_query = (
        counts
        .writeStream
        .outputMode('complete')
        .format('console')
        .start()
    )
    running_query.awaitTermination()
if __name__ == "__main__":
    # Guarding the entry point prevents the stream from starting on import.
    # Accept the original CLI form when three arguments are supplied;
    # otherwise fall back to the hard-coded local-test values so running
    # the script with no arguments behaves exactly as before.
    if len(sys.argv) == 4:
        bootstrapServers = sys.argv[1]
        subscribeType = sys.argv[2]
        topics = sys.argv[3]
    elif len(sys.argv) == 1:
        bootstrapServers = "127.0.0.1:9092"
        subscribeType = "subscribe"
        topics = "test"
    else:
        print("""
        Usage: structured_kafka_wordcount.py <bootstrap-servers> <subscribe-type> <topics>
        """, file=sys.stderr)
        sys.exit(1)
    main(bootstrapServers, subscribeType, topics)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment