Skip to content

Instantly share code, notes, and snippets.

@theho
Created June 12, 2016 00:08
Show Gist options
  • Save theho/ee20727c9e4d086ac0053cd8821d83e9 to your computer and use it in GitHub Desktop.
Save theho/ee20727c9e4d086ac0053cd8821d83e9 to your computer and use it in GitHub Desktop.
"""
Single Node Cassandra/Spark using pySpark
"""
import os
import sys

# Local Spark installation. Exported through the environment as well as used
# below to locate the PySpark sources.
SPARK_HOME = "/Users/jimmy/dev/spark"
os.environ['SPARK_HOME'] = SPARK_HOME
os.environ["PYSPARK_PYTHON"] = "python3"

MASTER = 'spark://172.16.41.129:7077'
CASSANDRA = 'spark'  # Cassandra installed onto the same node as spark master/slave

# Make the PySpark package and its bundled py4j importable without installing
# them into the current interpreter's site-packages.
sys.path.append(os.path.join(SPARK_HOME, 'python'))
sys.path.append(os.path.join(SPARK_HOME, 'python/lib/py4j-0.9-src.zip'))

# All Spark-related imports are guarded together so that a missing or broken
# installation produces one readable message and a non-zero exit status.
try:
    import pyspark_cassandra
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

conf = (
    SparkConf()
    .setAppName("PySpark Cassandra Test")
    .set('spark.master', MASTER)
    .set("spark.jars.packages", 'TargetHolding/pyspark-cassandra:0.3.5')
    .set("spark.cassandra.connection.host", CASSANDRA)
)
# Optional authentication settings, if the Cassandra cluster requires them:
# spark.cassandra.username
# spark.cassandra.password

# A test table was created as test.kv, with two columns, k and v.
sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
try:
    # RDD API: count the rows of test.kv.
    print(sc.cassandraTable("test", "kv").count())

    # DataFrame API: load the same table and show the rows whose key is 'hello'.
    sqlContext = SQLContext(sc)
    kv = (
        sqlContext.read.format("org.apache.spark.sql.cassandra")
        .load(keyspace="test", table="kv")
    )
    kv.where(kv.k == 'hello').show()
finally:
    # Always stop the context so the application is released on the master
    # even when one of the queries above raises.
    sc.stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment