Skip to content

Instantly share code, notes, and snippets.

@pedrovasconcellos
Last active April 15, 2023 05:15
Show Gist options
  • Save pedrovasconcellos/0eaddb1120fd0753e47522d0ec064cdb to your computer and use it in GitHub Desktop.
Save pedrovasconcellos/0eaddb1120fd0753e47522d0ec064cdb to your computer and use it in GitHub Desktop.
Spark Example
from pyspark import SparkContext
import os
# Look up the IPv4 address of interface en0 (the interface name used in the
# original shell pipeline) and export it as SPARK_LOCAL_IP before the
# SparkContext is created.
# Shell equivalent:
#   export SPARK_LOCAL_IP=$(ifconfig en0 | grep inet | grep -v inet6 | awk '{print $2}')
with os.popen("ifconfig en0 | grep inet | grep -v inet6 | awk '{print $2}'") as pipe:
    addresses = pipe.read().split()

# The pipeline prints nothing when en0 is missing or has no IPv4 address, and
# prints several lines when the interface carries more than one address.
# Keep only the first address so the value never contains embedded newlines.
ip = addresses[0] if addresses else ""

# Only override SPARK_LOCAL_IP when an address was actually found; exporting
# an empty string would replace any value already set in the environment.
if ip:
    os.environ['SPARK_LOCAL_IP'] = ip

sc = SparkContext()
try:
    sc.setLogLevel("OFF")

    data = [1, 2, 3, 4, 5]
    rdd = sc.parallelize(data)

    print("\nProcessing started.")
    print(f"SPARK_LOCAL_IP={os.environ.get('SPARK_LOCAL_IP')}\n")
    print(f"Original Collection: {rdd.collect()}")
    print(f"Count of elements: {rdd.count()}")
    # reduce() folds the elements pairwise; here it sums them.
    print(f"Result: {rdd.reduce(lambda x, y: x + y)}")
    print(f"Map func(X + 10): {rdd.map(lambda x: x + 10).collect()}")
    # Keeps only the even numbers.
    print(f"Filter(isPair): {rdd.filter(lambda x: (x % 2) == 0).collect()}")
    print(" \nFinished processing.\n")
finally:
    # Always release the context, even if one of the jobs above raised.
    sc.stop()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Get (or create) a Spark session named "Exemplo" and set its log level to OFF.
spark = (
    SparkSession.builder
    .appName("Exemplo")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("OFF")

# Sample rows, one tuple per person: (nome, idade, cidade).
data = [
    ("Pedro", 31, "São Paulo"),
    ("Vinicius", 27, "Rio de Janeiro"),
    ("Kim", 29, "Seul"),
]

# Schema of the DataFrame: three nullable columns matching the tuples above.
schema = StructType(
    [
        StructField("nome", StringType(), nullable=True),
        StructField("idade", IntegerType(), nullable=True),
        StructField("cidade", StringType(), nullable=True),
    ]
)

# Distribute the rows as an RDD, then build a DataFrame from the RDD and the
# schema defined above.
rdd = spark.sparkContext.parallelize(data)
df = spark.createDataFrame(rdd, schema)

# Keep only the rows whose "idade" is greater than 28 and print them.
df_filtrado = df.filter(df["idade"] > 28)
df_filtrado.show()

spark.stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment