@justinTM
Created March 2, 2022 03:57
Apache Spark RDD parallelize pipe JSON file through jq multi-core
import os
from pyspark.sql import SparkSession
# directory containing the JSON files to compact
DIR_JSONS = '/tmp/in/jsons'

# create the Spark session on a cluster with multiple cores
SPARK = SparkSession.builder.appName('APP_NAME').getOrCreate()
sc = SPARK.sparkContext
# compact a JSON file in place by piping it through jq.
# write to a temp file first: redirecting straight back to the
# input file would truncate it before jq can read it
def os_shell_jq(filepath):
    os.system(f"jq -c '.' '{filepath}' > '{filepath}.tmp' "
              f"&& mv '{filepath}.tmp' '{filepath}'")
# get all JSON filepaths in a directory
jsons = [os.path.join(DIR_JSONS, f) for f in os.listdir(DIR_JSONS)]
# run the shell command for each JSON filepath, distributed across
# the cluster's cores; foreach() is an action and returns None
sc.parallelize(jsons).foreach(os_shell_jq)
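
The same compaction can be tested locally without Spark or jq installed. Below is a minimal sketch of the idea using only the Python standard library: `json.dump` with tight separators stands in for `jq -c '.'`, and `multiprocessing.Pool` stands in for the RDD's parallelism. The function and directory names here are illustrative, not part of the gist above.

```python
import json
import os
import tempfile
from multiprocessing import Pool


def compact_json(filepath):
    """Rewrite a JSON file with no extra whitespace (like `jq -c '.'`)."""
    with open(filepath) as f:
        data = json.load(f)
    # read fully before reopening for write, so the file is not truncated early
    with open(filepath, 'w') as f:
        json.dump(data, f, separators=(',', ':'))


if __name__ == '__main__':
    # build a throwaway directory with one pretty-printed JSON file
    dir_jsons = tempfile.mkdtemp()
    path = os.path.join(dir_jsons, 'a.json')
    with open(path, 'w') as f:
        f.write('{\n  "k": [1, 2]\n}\n')

    files = [os.path.join(dir_jsons, f) for f in os.listdir(dir_jsons)]
    # one worker process per core, analogous to the RDD partitions above
    with Pool() as pool:
        pool.map(compact_json, files)

    print(open(path).read())  # prints {"k":[1,2]}
```

This keeps the per-file work in pure Python, which is handy for unit tests; the Spark version above is preferable when the files live on a cluster and jq's streaming speed matters.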