Skip to content

Instantly share code, notes, and snippets.

@tomz
Created November 13, 2015 18:42
Show Gist options
  • Save tomz/0c813ee0324b7167fceb to your computer and use it in GitHub Desktop.
# Load every Spark runtime jar onto the JRuby classpath.
Dir["./spark/lib/*.jar"].each { |jar| require jar }
# Kernel#require does not expand "~" — resolve it to the user's home
# directory first, otherwise the require raises LoadError.
require File.expand_path("~/sqljdbc/sqljdbc42.jar")
import java.util.HashMap
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.SQLContext
# --- Job configuration -------------------------------------------------
# Credentials are read from the environment so they never live in source.
AWS_KEY = ENV["AWS_KEY"]
AWS_SECRET = ENV["AWS_SECRET"]
# Destination bucket plus key prefix for the CSV output.
S3_BUCKET = "tomzeng/rdd_output"
# Microsoft JDBC driver class (provided by sqljdbc42.jar required above).
SQL_DRIVER = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
SQL_USERNAME = ENV["SQL_USERNAME"]
SQL_PWD = ENV["SQL_PWD"]
# Azure SQL Database endpoint (host:port) and database name.
SQL_HOST_PORT = "ngc1379w4v.database.windows.net:1433"
SQL_DB = "sample"
# Full JDBC URL; encrypt=true + hostNameInCertificate are the settings
# Azure SQL requires for a TLS connection.
# NOTE(review): embedding user/password in the URL exposes them in logs
# that print the connection string — consider passing them via the
# JDBC "user"/"password" options instead; confirm before changing.
SQL_CONNECTION_URL = "jdbc:sqlserver://#{SQL_HOST_PORT};database=#{SQL_DB};user=#{SQL_USERNAME};password=#{SQL_PWD};encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30;"
# Run the export job on a local Spark context ("local[*]" = all cores),
# ensuring the context is stopped even if any step raises.
sc = JavaSparkContext.new(SparkConf.new.set_app_name("JRuby Spark JDBC DF S3").set_master("local[*]"))
begin
  # Supply the S3 credentials through the Hadoop configuration instead of
  # embedding them in the s3n:// URL: secrets containing "/" or "+" break
  # URL-style credentials, and URLs tend to end up in logs.
  hadoop_conf = sc.hadoop_configuration
  hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_KEY)
  hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

  sql_context = SQLContext.new(sc)

  # JDBC source options: driver class, connection URL, and the table to read.
  options = {"driver" => SQL_DRIVER, "url" => SQL_CONNECTION_URL, "dbtable" => "SalesLT.Customer"}

  # Load the table into a DataFrame via the DataFrameReader API
  # (SQLContext#load(source, options) is deprecated since Spark 1.4; the
  # write side below already uses the matching DataFrameWriter style).
  jdbc_df = sql_context.read.format("jdbc").options(options).load

  # Display the table rows. collect_as_list pulls everything into driver
  # memory — fine for this small sample table.
  customer_rows = jdbc_df.collect_as_list
  customer_rows.each { |row| puts row }

  # Save the SQL Server table to S3 as CSV (spark-csv package); the
  # credentials now come from the Hadoop configuration set above.
  jdbc_df.select("*").write.format("com.databricks.spark.csv").save("s3n://#{S3_BUCKET}")
ensure
  # Always release the Spark context and its resources.
  sc.stop
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment