Simple PySpark CSV query example
myfile.csv:

quad,val
nw,0
ne,1
se,2
sw,3

query_csv.py:

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("csv-reader").getOrCreate()
# quiet down logging
spark.sparkContext.setLogLevel("WARN")
# read csv file with the DataFrame column names coming from the first row in the CSV
csv_df = spark.read.csv("/tmp/myfile.csv", header=True)
# you can filter your dataframe with a "where" SQL clause or use
# the select() function to pick out only certain columns
filtered = csv_df.where("quad == 'ne'")
# print the results as a table
filtered.show(truncate=False)
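
As the comments note, you can also pick out specific columns with select(), or run the same filter through Spark SQL. Here is a minimal sketch of both variations, assuming the csv_df DataFrame from query_csv.py above (the view name "quads" is just an illustrative choice):

# keep only the quad column, with the same filter applied
csv_df.select("quad").where("quad == 'ne'").show(truncate=False)

# or register the DataFrame as a temporary view and query it with SQL
csv_df.createOrReplaceTempView("quads")
spark.sql("SELECT quad, val FROM quads WHERE quad = 'ne'").show(truncate=False)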

To run this, install pyspark via:

pip install pyspark

(either globally or in a virtualenv)

You'll also need Java 8 installed and may need to set the JAVA_HOME environment variable.

Once that's done, save myfile.csv (above) to your /tmp directory and the Python script as query_csv.py. Then you can run

spark-submit query_csv.py

to run your Spark application and print the results.
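
By default spark.read.csv reads every column as a string. If you want typed columns (for example, to filter on val numerically), you can ask Spark to infer the schema when reading. A small sketch, again assuming the same /tmp/myfile.csv:

# re-read the file, letting Spark infer column types from the data
typed_df = spark.read.csv("/tmp/myfile.csv", header=True, inferSchema=True)
typed_df.printSchema()  # val should now show up as an integer column
typed_df.where("val >= 2").show(truncate=False)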

