Skip to content

Instantly share code, notes, and snippets.

@takemikami
Last active April 11, 2019 07:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save takemikami/740dd33682d66ee0666963cd832e048c to your computer and use it in GitHub Desktop.
pyspark.sql code samples
from pyspark.sql import functions as F

# Sample data: two rows with an integer id and a float value.
rows = [(1, 1.2345), (2, 9.8765)]
df = spark.createDataFrame(rows, ["col1", "col2"])

# Cast a column to another type (truncates the fractional part here).
df.select(df.col2.cast("int")).show()
# ↓
# +----+
# |col2|
# +----+
# |   1|
# |   9|
# +----+
# Round half up to one decimal place.
rounded = F.round(F.col("col2"), 1)
df.select(rounded).show()
# ↓
# +--------------+
# |round(col2, 1)|
# +--------------+
# |           1.2|
# |           9.9|
# +--------------+
# Compute the Pearson correlation matrix between two columns.
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

# Fixes vs. the original snippet:
#  - columns "c1"/"c2" do not exist in `df`; it has "col1"/"col2"
#  - `f.col` referenced an undefined name; the module is imported as `F`
assembler = VectorAssembler(inputCols=["col1", "col2"], outputCol="features")
df_vector = assembler.transform(
    df.select(F.col("col1").cast("double"), F.col("col2").cast("double"))
)
# collect()[0][0] extracts the DenseMatrix from the single-row result.
pearsonCorr = Correlation.corr(df_vector, 'features', 'pearson').collect()[0][0]
print(str(pearsonCorr).replace('nan', 'NaN'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment