@sanp
Last active March 13, 2017 15:59
Lightning Talk for Centro Tech Team on 3/10/17
# Parse JSON data with this one weird trick!
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext

# Set up a basic Spark session
conf = (SparkConf()
        .setAppName('My App')
        .set('spark.executor.memory', '10g'))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
spark_session = sql_context.sparkSession
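# Note: on Spark 2.x you can also build the session directly and skip the
# SQLContext detour -- a minimal sketch, assuming Spark >= 2.0:
#
#   from pyspark.sql import SparkSession
#   spark_session = (SparkSession.builder
#                    .appName('My App')
#                    .config('spark.executor.memory', '10g')
#                    .getOrCreate())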
# Sample Data
jstr1 = u'{"header":{"id":12345,"foo":"bar"},"body":{"id":111000,"name":"foobar","sub_json":{"id":54321,"sub_sub_json":{"col1":20,"col2":"somethong"}}}}'
jstr2 = u'{"header":{"id":12346,"foo":"baz"},"body":{"id":111002,"name":"barfoo","sub_json":{"id":23456,"sub_sub_json":{"col1":30,"col2":"something else"}}}}'
jstr3 = u'{"header":{"id":43256,"foo":"foobaz"},"body":{"id":20192,"name":"bazbar","sub_json":{"id":39283,"sub_sub_json":{"col1":50,"col2":"another thing"}}}}'
df = sql_context.createDataFrame([Row(json=jstr1), Row(json=jstr2), Row(json=jstr3)])
df.show()
# >>> df.show()
# +--------------------+
# | json|
# +--------------------+
# |{"header":{"id":1...|
# |{"header":{"id":1...|
# |{"header":{"id":4...|
# +--------------------+
# Create a SQL temp view of the data
df.createOrReplaceTempView('df')
# Parse it - step by step:
# 1. Just the top level JSON
q = """
select
json
from df
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +--------------------+
# | json|
# +--------------------+
# |{"header":{"id":1...|
# |{"header":{"id":1...|
# |{"header":{"id":4...|
# +--------------------+
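# Aside: to pull a single key, get_json_object with a JSONPath expression
# works too; json_tuple is handier when extracting several keys in one pass.
# A sketch against the same df view:
#
#   spark_session.sql(
#       "select get_json_object(json, '$.header.id') as header_id from df"
#   ).show()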
# 2. Add the next level of nested data
q = """
select
a.json
, b.header
, b.body
from df a
lateral view json_tuple(a.json, 'header', 'body') b
as header, body
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +--------------------+--------------------+--------------------+
# | json| header| body|
# +--------------------+--------------------+--------------------+
# |{"header":{"id":1...|{"id":12345,"foo"...|{"id":111000,"nam...|
# |{"header":{"id":1...|{"id":12346,"foo"...|{"id":111002,"nam...|
# |{"header":{"id":4...|{"id":43256,"foo"...|{"id":20192,"name...|
# +--------------------+--------------------+--------------------+
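# Note that json_tuple always returns string columns, so header and body
# above are still plain JSON strings -- which is exactly why they can be fed
# into further json_tuple calls below. Confirm with result.printSchema().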
# 3. Go deeper into the nesting
q = """
select
a.json
, b.header
, c.id
, c.foo
from df a
lateral view json_tuple(a.json, 'header', 'body') b
as header, body
lateral view json_tuple(b.header, 'id', 'foo') c
as id, foo
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +--------------------+--------------------+-----+------+
# | json| header| id| foo|
# +--------------------+--------------------+-----+------+
# |{"header":{"id":1...|{"id":12345,"foo"...|12345| bar|
# |{"header":{"id":1...|{"id":12346,"foo"...|12346| baz|
# |{"header":{"id":4...|{"id":43256,"foo"...|43256|foobaz|
# +--------------------+--------------------+-----+------+
# ...
# 4. Fully parse out everything
q = """
select
c.header_id
, c.foo
, d.body_id
, d.name
, e.id as sub_json_id
, f.col1
, f.col2
from df a
lateral view json_tuple(a.json, 'header', 'body') b
as header, body
lateral view json_tuple(b.header, 'id', 'foo') c
as header_id, foo
lateral view json_tuple(b.body, 'id', 'name', 'sub_json') d
as body_id, name, sub_json
lateral view json_tuple(d.sub_json, 'id', 'sub_sub_json') e
as id, sub_sub_json
lateral view json_tuple(e.sub_sub_json, 'col1', 'col2') f
as col1, col2
"""
result = spark_session.sql(q)
result.show()
# >>> result.show()
# +---------+------+-------+------+-----------+----+--------------+
# |header_id| foo|body_id| name|sub_json_id|col1| col2|
# +---------+------+-------+------+-----------+----+--------------+
# | 12345| bar| 111000|foobar| 54321| 20| somethong|
# | 12346| baz| 111002|barfoo| 23456| 30|something else|
# | 43256|foobaz| 20192|bazbar| 39283| 50| another thing|
# +---------+------+-------+------+-----------+----+--------------+
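# One caveat: every column json_tuple extracts is a string, so cast numeric
# fields before doing arithmetic on them. A minimal sketch:
#
#   result = result.withColumn('col1', result['col1'].cast('int'))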
# Now you can parse anything!
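# A schema-first alternative, assuming Spark >= 2.1: parse the whole string
# with from_json and an explicit schema to get typed columns up front.
# A sketch covering just the header:
#
#   from pyspark.sql.functions import from_json
#   from pyspark.sql.types import LongType, StringType, StructField, StructType
#
#   header_schema = StructType([
#       StructField('id', LongType()),
#       StructField('foo', StringType()),
#   ])
#   schema = StructType([StructField('header', header_schema)])
#   typed = df.select(from_json(df['json'], schema).alias('parsed'))
#   typed.select('parsed.header.id', 'parsed.header.foo').show()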
@ddrscott

I couldn't stop myself:

SELECT
  header_id,
  foo,
  body_id,
  name,
  sub_json_id,
  col1,
  col2
FROM (
  VALUES
    ($${"header":{"id":12345,"foo":"bar"},"body":{"id":111000,"name":"foobar","sub_json":{"id":54321,"sub_sub_json":{"col1":20,"col2":"somethong"}}}}$$),
    ($${"header":{"id":12346,"foo":"baz"},"body":{"id":111002,"name":"barfoo","sub_json":{"id":23456,"sub_sub_json":{"col1":30,"col2":"something else"}}}}$$),
    ($${"header":{"id":43256,"foo":"foobaz"},"body":{"id":20192,"name":"bazbar","sub_json":{"id":39283,"sub_sub_json":{"col1":50,"col2":"another thing"}}}}$$)
) as strings(raw)
JOIN LATERAL (
  -- Convert string to JSON
  SELECT strings.raw::json AS json
) jsons ON true
JOIN LATERAL (
  -- Extract first level
  SELECT
    jsons.json->'body' AS body,
    jsons.json->'header' AS header
) t1 ON true
JOIN LATERAL (
  -- Extract next levels
  SELECT
    header->>'id' AS header_id,
    header->>'foo' AS foo,
    body->>'id' AS body_id,
    body->>'name' AS name,
    body->'sub_json' AS sub_json
) t2 ON true
JOIN LATERAL (
  -- Extract next levels
  SELECT
    sub_json->>'id' AS sub_json_id,
    sub_json#>>'{sub_sub_json,col1}' AS col1,
    sub_json#>>'{sub_sub_json,col2}' AS col2
) t3 ON true
 header_id |  foo   | body_id |  name  | sub_json_id | col1 |      col2
-----------+--------+---------+--------+-------------+------+----------------
 12345     | bar    | 111000  | foobar | 54321       | 20   | somethong
 12346     | baz    | 111002  | barfoo | 23456       | 30   | something else
 43256     | foobaz | 20192   | bazbar | 39283       | 50   | another thing
(3 rows)

@sanp (Author) commented Mar 13, 2017

awesome!
