Skip to content

Instantly share code, notes, and snippets.

@xuechendi
Created April 24, 2019 01:40
Show Gist options
  • Save xuechendi/e426065d8c31f514c869c12772f4d67a to your computer and use it in GitHub Desktop.
Save xuechendi/e426065d8c31f514c869c12772f4d67a to your computer and use it in GitHub Desktop.
Apache_Arrow_PySpark_Verification
from pyspark.sql import SparkSession
def plus_one_func(v):
    """Return ``v + 1``.

    Works for any operand that supports addition with an integer.
    """
    return v + 1
# Build (or reuse) the Spark session for the plain column-expression variant.
spark = SparkSession.builder.appName("pyspark_expression").getOrCreate()
df = spark.read.load("/HiBench/DataFrame/Input")
# This script defines no `plus_one` wrapper, only `plus_one_func`; calling the
# undefined name raised NameError. Apply the function directly to the Column so
# that `df["count"] + 1` is built as a Spark column expression.
df = df.withColumn('count', plus_one_func(df["count"]))
df.write.format("parquet").save("/HiBench/DataFrame/Output")
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import IntegerType
def plus_one_func(v):
    """Return ``v + 1``.

    This function is wrapped with ``pandas_udf`` right after its definition,
    so ``v`` is presumably a pandas Series batch at call time (TODO confirm
    against the Spark version in use); the addition itself is type-agnostic.
    """
    return v + 1
# Wrap plus_one_func as a pandas UDF whose declared return type is IntegerType.
plus_one = pandas_udf(plus_one_func, returnType=IntegerType())

# Build (or reuse) the session for this benchmark variant.
spark = (
    SparkSession.builder
    .appName("pyspark_pandas_udf")
    .getOrCreate()
)

# Load the input, replace "count" with the UDF result, and save as parquet.
df = spark.read.load("/HiBench/DataFrame/Input")
incremented = plus_one(df["count"])
df = df.withColumn("count", incremented)
df.write.save("/HiBench/DataFrame/Output", format="parquet")
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def plus_one_func(v):
    """Return ``v + 1``.

    This function is wrapped with ``udf`` right after its definition, so
    ``v`` is presumably a single column value per call (TODO confirm); the
    addition itself is type-agnostic.
    """
    return v + 1
# Wrap plus_one_func as a regular Python UDF whose declared return type is
# IntegerType.
plus_one = udf(plus_one_func, IntegerType())

# Build (or reuse) the session for this benchmark variant.
spark = (
    SparkSession.builder
    .appName("pyspark_udf")
    .getOrCreate()
)

# Load the input, replace "count" with the UDF result, and save as parquet.
df = spark.read.load("/HiBench/DataFrame/Input")
incremented = plus_one(df["count"])
df = df.withColumn("count", incremented)
df.write.save("/HiBench/DataFrame/Output", format="parquet")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment