Skip to content

Instantly share code, notes, and snippets.

@takemikami
Last active April 11, 2019 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save takemikami/740dd33682d66ee0666963cd832e048c to your computer and use it in GitHub Desktop.
pyspark.sql code samples
from pyspark.sql import functions as F

# Sample data: two rows with an integer id and a float value.
rows = [(1, 1.2345), (2, 9.8765)]
df = spark.createDataFrame(rows, ["col1", "col2"])

# Cast a column to another type (truncates the fractional part here).
df.select(df.col2.cast("int")).show()
# ↓
# +----+
# |col2|
# +----+
# |   1|
# |   9|
# +----+
# Round half up to one decimal place.
rounded = F.round(F.col("col2"), 1)
df.select(rounded).show()
# ↓
# +--------------+
# |round(col2, 1)|
# +--------------+
# |           1.2|
# |           9.9|
# +--------------+
# Compute the Pearson correlation matrix between two columns.
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# Fixes vs. the original snippet:
#  - columns "c1"/"c2" do not exist in `df`; it has "col1"/"col2"
#  - `f.col` referenced an undefined name; the module is imported as `F`
assembler = VectorAssembler(inputCols=["col1", "col2"], outputCol="features")
df_vector = assembler.transform(
    df.select(F.col("col1").cast("double"), F.col("col2").cast("double"))
)
# collect()[0][0] extracts the DenseMatrix from the single-row result.
pearsonCorr = Correlation.corr(df_vector, 'features', 'pearson').collect()[0][0]
print(str(pearsonCorr).replace('nan', 'NaN'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment