- Linux環境でMambaforgeを使って環境を構築する
- クラスター管理などはおいておいて、学習用の最低限の環境を手軽に作る
- python3.11
- pyspark3.4.0
- openjdk17
- pandas2.0.1
- pandasとの連携を確認するため
- その他、jupyterなど
# use mambaforge
mamba env create -f pyspark_env.yaml
testdata | |
# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all | |
# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,visualstudiocode,python,pycharm+all | |
### JupyterNotebooks ### | |
# gitignore template for Jupyter Notebooks | |
# website: http://jupyter.org/ | |
.ipynb_checkpoints | |
*/.ipynb_checkpoints/* | |
# IPython | |
profile_default/ | |
ipython_config.py | |
# Remove previous ipynb_checkpoints | |
# git rm -r .ipynb_checkpoints/ | |
### PyCharm+all ### | |
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider | |
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 | |
# User-specific stuff | |
.idea/**/workspace.xml | |
.idea/**/tasks.xml | |
.idea/**/usage.statistics.xml | |
.idea/**/dictionaries | |
.idea/**/shelf | |
# AWS User-specific | |
.idea/**/aws.xml | |
# Generated files | |
.idea/**/contentModel.xml | |
# Sensitive or high-churn files | |
.idea/**/dataSources/ | |
.idea/**/dataSources.ids | |
.idea/**/dataSources.local.xml | |
.idea/**/sqlDataSources.xml | |
.idea/**/dynamic.xml | |
.idea/**/uiDesigner.xml | |
.idea/**/dbnavigator.xml | |
# Gradle | |
.idea/**/gradle.xml | |
.idea/**/libraries | |
# Gradle and Maven with auto-import | |
# When using Gradle or Maven with auto-import, you should exclude module files, | |
# since they will be recreated, and may cause churn. Uncomment if using | |
# auto-import. | |
# .idea/artifacts | |
# .idea/compiler.xml | |
# .idea/jarRepositories.xml | |
# .idea/modules.xml | |
# .idea/*.iml | |
# .idea/modules | |
# *.iml | |
# *.ipr | |
# CMake | |
cmake-build-*/ | |
# Mongo Explorer plugin | |
.idea/**/mongoSettings.xml | |
# File-based project format | |
*.iws | |
# IntelliJ | |
out/ | |
# mpeltonen/sbt-idea plugin | |
.idea_modules/ | |
# JIRA plugin | |
atlassian-ide-plugin.xml | |
# Cursive Clojure plugin | |
.idea/replstate.xml | |
# SonarLint plugin | |
.idea/sonarlint/ | |
# Crashlytics plugin (for Android Studio and IntelliJ) | |
com_crashlytics_export_strings.xml | |
crashlytics.properties | |
crashlytics-build.properties | |
fabric.properties | |
# Editor-based Rest Client | |
.idea/httpRequests | |
# Android studio 3.1+ serialized cache file | |
.idea/caches/build_file_checksums.ser | |
### PyCharm+all Patch ### | |
# Ignore everything but code style settings and run configurations | |
# that are supposed to be shared within teams. | |
.idea/* | |
!.idea/codeStyles | |
!.idea/runConfigurations | |
### Python ### | |
# Byte-compiled / optimized / DLL files | |
__pycache__/ | |
*.py[cod] | |
*$py.class | |
# C extensions | |
*.so | |
# Distribution / packaging | |
.Python | |
build/ | |
develop-eggs/ | |
dist/ | |
downloads/ | |
eggs/ | |
.eggs/ | |
lib/ | |
lib64/ | |
parts/ | |
sdist/ | |
var/ | |
wheels/ | |
share/python-wheels/ | |
*.egg-info/ | |
.installed.cfg | |
*.egg | |
MANIFEST | |
# PyInstaller | |
# Usually these files are written by a python script from a template | |
# before PyInstaller builds the exe, so as to inject date/other infos into it. | |
*.manifest | |
*.spec | |
# Installer logs | |
pip-log.txt | |
pip-delete-this-directory.txt | |
# Unit test / coverage reports | |
htmlcov/ | |
.tox/ | |
.nox/ | |
.coverage | |
.coverage.* | |
.cache | |
nosetests.xml | |
coverage.xml | |
*.cover | |
*.py,cover | |
.hypothesis/ | |
.pytest_cache/ | |
cover/ | |
# Translations | |
*.mo | |
*.pot | |
# Django stuff: | |
*.log | |
local_settings.py | |
db.sqlite3 | |
db.sqlite3-journal | |
# Flask stuff: | |
instance/ | |
.webassets-cache | |
# Scrapy stuff: | |
.scrapy | |
# Sphinx documentation | |
docs/_build/ | |
# PyBuilder | |
.pybuilder/ | |
target/ | |
# Jupyter Notebook | |
# IPython | |
# pyenv | |
# For a library or package, you might want to ignore these files since the code is | |
# intended to run in multiple environments; otherwise, check them in: | |
# .python-version | |
# pipenv | |
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. | |
# However, in case of collaboration, if having platform-specific dependencies or dependencies | |
# having no cross-platform support, pipenv may install dependencies that don't work, or not | |
# install all needed dependencies. | |
#Pipfile.lock | |
# poetry | |
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. | |
# This is especially recommended for binary packages to ensure reproducibility, and is more | |
# commonly ignored for libraries. | |
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control | |
#poetry.lock | |
# pdm | |
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. | |
#pdm.lock | |
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it | |
# in version control. | |
# https://pdm.fming.dev/#use-with-ide | |
.pdm.toml | |
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm | |
__pypackages__/ | |
# Celery stuff | |
celerybeat-schedule | |
celerybeat.pid | |
# SageMath parsed files | |
*.sage.py | |
# Environments | |
.env | |
.venv | |
env/ | |
venv/ | |
ENV/ | |
env.bak/ | |
venv.bak/ | |
# Spyder project settings | |
.spyderproject | |
.spyproject | |
# Rope project settings | |
.ropeproject | |
# mkdocs documentation | |
/site | |
# mypy | |
.mypy_cache/ | |
.dmypy.json | |
dmypy.json | |
# Pyre type checker | |
.pyre/ | |
# pytype static type analyzer | |
.pytype/ | |
# Cython debug symbols | |
cython_debug/ | |
# PyCharm | |
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can | |
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore | |
# and can be added to the global gitignore or merged into this file. For a more nuclear | |
# option (not recommended) you can uncomment the following to ignore the entire idea folder. | |
#.idea/ | |
### Python Patch ### | |
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration | |
poetry.toml | |
# ruff | |
.ruff_cache/ | |
# LSP config files | |
pyrightconfig.json | |
### VisualStudioCode ### | |
.vscode/* | |
!.vscode/settings.json | |
!.vscode/tasks.json | |
!.vscode/launch.json | |
!.vscode/extensions.json | |
!.vscode/*.code-snippets | |
# Local History for Visual Studio Code | |
.history/ | |
# Built Visual Studio Code Extensions | |
*.vsix | |
### VisualStudioCode Patch ### | |
# Ignore all local history of files | |
.history | |
.ionide | |
# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all |
pandasなどは小規模~中規模くらいのデータセットの処理や分析には有効だが、 メモリに乗り切らないレベルのデータ量は処理できない。 また、GILによって基本的に1スレッドでの動作になるため、マシンスペックを最大限活かし切るのは難しい。
遅延評価、並列分散処理によって所謂ビッグデータといわれるレベルのテーブルデータの処理・分析に使うことができ、 更にpandasとの連携・併用ができるツールとしてpysparkが存在するため紹介する。
なお、詳細はPySpark Documentationなども参照のこと
Mambaforgeをインストール済みのLinux環境を前提とする。
以下のようなconda仮想環境作成用のyamlを用意する:
# pyspark_env.yaml
name: pyspark-intro
channels:
- conda-forge
dependencies:
- python=3.11
- pyspark>=3.4
- openjdk=17
- pandas>=2.0.1
- pyarrow
# for testdata
- numpy
- seaborn
# jupyter
- jupyterlab
※先日リリースされたpandas 2.0では、deprecatedになっていた
iteritems
が削除された影響で、
古いバージョンのpysparkではpandasからpysparkのDataFrameに変換する部分が一部動作しなくなる現象が起こっていた。pyspark>=3.4.0
で修正済みなので、ここではpandasも2系を使う
mambaコマンドで仮想環境を作成(condaだと依存関係の解決が遅いのでmambaの使用を推奨)
mamba env create -f pyspark_env.yaml
# 出来上がった環境(pyspark-intro)をactivate
conda activate pyspark-intro
# or `mamba activate pyspark-intro`
まずは基本的なデータ処理のあたりを簡単に確認する。
データ処理周りの詳細は省略するが、Sparkのデータフレーム機能がSpark SQLと対応していることから、 概ねSQLで出来るようなことを実行出来ると考えればOK。(というかSQLでも書ける)
# テストデータ格納ディレクトリのセットアップ(Linuxコマンドを呼び出して実行している)
!rm -rf testdata && mkdir testdata
from pyspark.sql.session import SparkSession
# spark sessionの立ち上げ
spark = (SparkSession.builder
.master("local[*]")
.appName("LocalTest")
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.executor.memory", "4g") # TMP (実際はハードコードするのはあまり良くない)
.getOrCreate()
)
spark
your 131072x1 screen size is bogus. expect trouble
23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)
23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
<div>
<p><b>SparkSession - in-memory</b></p>
SparkContext
<p><a href="http://172.24.210.1:4040">Spark UI</a></p>
<dl>
<dt>Version</dt>
<dd><code>v3.4.0</code></dd>
<dt>Master</dt>
<dd><code>local[*]</code></dd>
<dt>AppName</dt>
<dd><code>LocalTest</code></dd>
</dl>
</div>
↑config
部分について、pandasとの連携のため、configを調整しておく(pyarrowをインストールしておく必要あり)
参考: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion
なお、詳細かつ環境依存性のあるconfigは別途configファイル($SPARK_HOME/conf/spark-defaults.conf
)を作成するなどして設定すると良い
コンフィグ設定の詳細: https://spark.apache.org/docs/latest/configuration.html
また、コンフィグファイルの置き場所の設定などに必要な環境変数SPARK_HOME
について、condaでインストールした場合の設定方法などはここなどを参照
Spark Sessionを起動すると、デフォルトではhttp://localhost:4040にSpark UIが立ち上がり、 実行状態などをモニタリングすることが出来る(デバッグなどにも便利)
# テストデータの作成
import seaborn as sns
iris = sns.load_dataset("iris")
display(iris)
iris.to_csv("testdata/iris.csv", index=False)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
# pysparkでcsvを読み込み
df_iris = spark.read.csv(
"testdata/iris.csv",
header=True,
inferSchema=True,
)
# 遅延評価するので、カラムのスキーマのみ表示される
display(df_iris)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
(↑csvから読み込んでいるため、ヘッダーの有無の指定やスキーマの自動推定をオプションで与えている)
※(補足):
# showなどによって初めて評価が行われる
df_iris.show(5) # 5件を表示
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# selectによるカラムの選択
(df_iris
.select(["sepal_length", "sepal_width"])
.show(3)
)
+------------+-----------+
|sepal_length|sepal_width|
+------------+-----------+
| 5.1| 3.5|
| 4.9| 3.0|
| 4.7| 3.2|
+------------+-----------+
only showing top 3 rows
# filterによる簡単なクエリ
(df_iris
.filter(df_iris.species == "virginica")
.filter(df_iris.sepal_length > 5.0)
.show(3)
)
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 6.3| 3.3| 6.0| 2.5|virginica|
| 5.8| 2.7| 5.1| 1.9|virginica|
| 7.1| 3.0| 5.9| 2.1|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
# groupByによる集約(1)
(df_iris
.groupBy('species')
.count()
.show()
)
+----------+-----+
| species|count|
+----------+-----+
| virginica| 50|
|versicolor| 50|
| setosa| 50|
+----------+-----+
# groupByによる集約(2)
(df_iris
.groupBy('species')
.agg({
'sepal_length': 'mean',
'sepal_width': 'mean',
'petal_length': 'mean',
'petal_width': 'mean',
})
.show()
)
+----------+------------------+------------------+-----------------+------------------+
| species| avg(sepal_width)| avg(petal_width)|avg(sepal_length)| avg(petal_length)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|2.9739999999999998| 2.026|6.587999999999998| 5.552|
|versicolor|2.7700000000000005|1.3259999999999998| 5.936| 4.26|
| setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|
+----------+------------------+------------------+-----------------+------------------+
# DataFrameのサマリー
df_iris.describe().show()
23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+-------+------------------+-------------------+------------------+------------------+---------+
|summary| sepal_length| sepal_width| petal_length| petal_width| species|
+-------+------------------+-------------------+------------------+------------------+---------+
| count| 150| 150| 150| 150| 150|
| mean| 5.843333333333335| 3.057333333333334|3.7580000000000027| 1.199333333333334| null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467| null|
| min| 4.3| 2.0| 1.0| 0.1| setosa|
| max| 7.9| 4.4| 6.9| 2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+
# SQLでクエリを書くことも可能
df_iris.createOrReplaceTempView('iris')
spark.sql("show tables").show()
tmp = spark.sql("SELECT * FROM iris WHERE species = 'setosa'")
tmp.show()
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
| | iris| true|
+---------+---------+-----------+
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
詳細はUser Guide, API Reference, およびSpark SQLのリファレンスを参照
次に、データのRead/Writeを確認する
# さっきは便宜上csvから読み込んだが、型情報が失われる + pandasとの連携を見るため、pandasのデータフレームから直接pysparkに変換する
display(iris.__class__) # irisはpandas dataframe
df = spark.createDataFrame(iris)
display(df)
df.show()
pandas.core.frame.DataFrame
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
| 5.1| 3.5| 1.4| 0.2| setosa|
| 4.9| 3.0| 1.4| 0.2| setosa|
| 4.7| 3.2| 1.3| 0.2| setosa|
| 4.6| 3.1| 1.5| 0.2| setosa|
| 5.0| 3.6| 1.4| 0.2| setosa|
| 5.4| 3.9| 1.7| 0.4| setosa|
| 4.6| 3.4| 1.4| 0.3| setosa|
| 5.0| 3.4| 1.5| 0.2| setosa|
| 4.4| 2.9| 1.4| 0.2| setosa|
| 4.9| 3.1| 1.5| 0.1| setosa|
| 5.4| 3.7| 1.5| 0.2| setosa|
| 4.8| 3.4| 1.6| 0.2| setosa|
| 4.8| 3.0| 1.4| 0.1| setosa|
| 4.3| 3.0| 1.1| 0.1| setosa|
| 5.8| 4.0| 1.2| 0.2| setosa|
| 5.7| 4.4| 1.5| 0.4| setosa|
| 5.4| 3.9| 1.3| 0.4| setosa|
| 5.1| 3.5| 1.4| 0.3| setosa|
| 5.7| 3.8| 1.7| 0.3| setosa|
| 5.1| 3.8| 1.5| 0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows
# spark dataframeからpandas dataframeへの変換
display(df.__class__) # pyspark.sql.dataframe.DataFrame
pdf = df.toPandas()
display(pdf)
pyspark.sql.dataframe.DataFrame
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
↑pysparkで大規模~中規模くらいのデータを処理してサイズダウンし、オンメモリに乗るくらいになったらtoPandas
をして扱いやすいpandasに変換する、という使い方が出来る。
最後に色々な形式でデータの書き込みと読み込みを行う
先述の通り、sparkはデータ保存形式が処理パフォーマンスにダイレクトに効いてくるため、この辺りの取り扱いは肝になってくる。
# dfはspark dataframe
# csv(無圧縮)
df.write.save("testdata/iris_csv", format="csv")
# formatオプションを使う代わりに、↓のように書いてもOK(compressionオプションで圧縮形式も指定出来る)
# df.write.csv("testdata/iris_csv", compression="gzip")
!ls testdata/iris_csv
_SUCCESS
part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
!head testdata/iris_csv/part-00000-*.csv
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
↑pandasのように単一のファイルを指定する感じではなく、保存するディレクトリを指定する。 (ファイルが分散して生成される)
読み込む際は個々のファイルを指定することもできる(←Spark以外で作ったファイルの読み込みなど)が、 基本は保存したディレクトリを指定して読み込む
df_csv = spark.read.load("testdata/iris_csv", format="csv")
# formatを指定せず、↓のようにすることも可能
# df_csv = spark.read.csv("testdata/iris_csv")
df_csv.show(3)
+---+---+---+---+----------+
|_c0|_c1|_c2|_c3| _c4|
+---+---+---+---+----------+
|6.3|3.3|4.7|1.6|versicolor|
|4.9|2.4|3.3|1.0|versicolor|
|6.6|2.9|4.6|1.3|versicolor|
+---+---+---+---+----------+
only showing top 3 rows
(↑csvだとカラム名などの情報が欠落しがちで使いづらい)
# テキスト形式だとcsvの他にjsonなども指定可能
# compressionで圧縮形式を指定することも出来る
df.write.json("testdata/iris_json", compression='gzip')
!ls testdata/iris_json
_SUCCESS
part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
# 読み込みも同様
df_json = spark.read.json("testdata/iris_json")
# or df_json = spark.read.load("testdata/iris_json", format="json")
display(df_json)
df_json.show(3)
DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|petal_length|petal_width|sepal_length|sepal_width| species|
+------------+-----------+------------+-----------+----------+
| 3.3| 1.0| 5.0| 2.3|versicolor|
| 4.2| 1.3| 5.6| 2.7|versicolor|
| 4.2| 1.2| 5.7| 3.0|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
(↑jsonだとcsvよりは型情報が保持される)
# データ分析・処理用途として適しているsnappy.parquetやzlib.orc形式など
df.write.parquet("testdata/iris_parquet", compression="snappy")
!ls testdata/iris_parquet
df.write.orc("testdata/iris_orc", compression="zlib")
!ls testdata/iris_orc
23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
_SUCCESS
part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
_SUCCESS
part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
# 読み込み
# parquet
print("parquet:")
df_parquet = spark.read.parquet("testdata/iris_parquet")
display(df_parquet)
df_parquet.show(3)
print("\n------------------\n")
# orc
print("orc:")
df_orc = spark.read.orc("testdata/iris_orc")
display(df_orc)
df_orc.show(3)
parquet:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
------------------
orc:
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+----------+
| 5.0| 2.3| 3.3| 1.0|versicolor|
| 5.6| 2.7| 4.2| 1.3|versicolor|
| 5.7| 3.0| 4.2| 1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows
# partitionを切って保存することも可能
df.write.save(
"testdata/iris_with_partition",
format="parquet",
compression="snappy",
partitionBy="species"
)
!cd testdata/iris_with_partition && tree
.
├── _SUCCESS
├── species=setosa
│   ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── species=versicolor
│   ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── species=virginica
    ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
4 directories, 11 files
カラム"species"の値ごとに別ディレクトリが切られてデータが保存される
RDBのインデックスと同様、データアクセスの仕方に応じて適切に設定することでパフォーマンスを良くすることが出来る(し、不適切だと悪化する)
↑の例だと、speciesを指定して分析を行うような場合、興味のあるspeciesしか見ないためアクセスするデータ量を限定することができる。 一方、かけるクエリの多くが複数のspeciesにまたがるような場合はパフォーマンスの向上は見込めず、悪化する可能性もある。
# パーティションがある場合でも読み込み方法は同様
df_partition = spark.read.load("testdata/iris_with_partition")
display(df_partition)
df_partition.show(3)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width| species|
+------------+-----------+------------+-----------+---------+
| 7.9| 3.8| 6.4| 2.0|virginica|
| 6.4| 2.8| 5.6| 2.2|virginica|
| 6.3| 2.8| 5.1| 1.5|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows
他にもsaveAsTableで永続化したテーブルとして書き込みを行ったりも出来る。
扱えるデータ保存形式もData Sourcesのように、 他にもRDB(JDBCを使う)や、Hive Tableなど色々なものに対応している。
import pyspark.pandas as ps
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
warnings.warn(
参考: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html
最近はかなりpandasに近いAPIを使ってsparkを扱う方法が整備されつつある様子
なお、Koalasという、sparkをpandasっぽく使えるツールが存在するが、ここなどを見ると、これが正式にpysparkに取り込まれたものが pyspark.pandas の模様。
import pyspark.pandas as ps
# pandas on spark DataFrameを作成する
psdf = ps.read_csv("testdata/iris.csv")
display(psdf.__class__) # pyspark.pandas.frame.DataFrame
psdf.head(5) # pandas DataFrameのようにheadで先頭を表示できる
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pyspark.pandas.frame.DataFrame
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# filterや表示も通常のpandasのようにできている
psdf[psdf['sepal_length'] > 7.5]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
105 | 7.6 | 3.0 | 6.6 | 2.1 | virginica |
117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
118 | 7.7 | 2.6 | 6.9 | 2.3 | virginica |
122 | 7.7 | 2.8 | 6.7 | 2.0 | virginica |
131 | 7.9 | 3.8 | 6.4 | 2.0 | virginica |
135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
# pysparkのDataFrameに変換
sdf = psdf.to_spark(index_col=["index"])
sdf.show(5)
+-----+------------+-----------+------------+-----------+-------+
|index|sepal_length|sepal_width|petal_length|petal_width|species|
+-----+------------+-----------+------------+-----------+-------+
| 0| 5.1| 3.5| 1.4| 0.2| setosa|
| 1| 4.9| 3.0| 1.4| 0.2| setosa|
| 2| 4.7| 3.2| 1.3| 0.2| setosa|
| 3| 4.6| 3.1| 1.5| 0.2| setosa|
| 4| 5.0| 3.6| 1.4| 0.2| setosa|
+-----+------------+-----------+------------+-----------+-------+
only showing top 5 rows
# 普通のpandasに変換
pdf = psdf.to_pandas()
display(pdf.__class__) # pandas.core.frame.DataFrame
pdf.head()
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.
warnings.warn(message, PandasAPIOnSparkAdviceWarning)
pandas.core.frame.DataFrame
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
pandasおよびnumpyと互換性および連携性の高いAPIを持ち、遅延評価・並列分散処理を行うことが出来るライブラリ。
例えばデータサイズが大きいテーブルデータに対してはdaskのデータフレームで遅延評価・並列分散処理を行っておき、 データサイズが小さくなるタイミングでpandasのデータフレームに変換する、といった今回pysparkでやろうとしていることとほぼ同じことが出来る。
pysparkはdaskに比べると、
name: pyspark-intro | |
channels: | |
- conda-forge | |
dependencies: | |
- python=3.11 | |
- pyspark>=3.4 | |
- openjdk=17 | |
- pandas>=2.0.1 | |
- pyarrow | |
# for testdata | |
- numpy | |
- seaborn | |
# jupyter | |
- jupyterlab |
name: pyspark-intro | |
channels: | |
- conda-forge | |
dependencies: | |
- _libgcc_mutex=0.1=conda_forge | |
- _openmp_mutex=4.5=2_gnu | |
- aiofiles=22.1.0=pyhd8ed1ab_0 | |
- aiosqlite=0.18.0=pyhd8ed1ab_0 | |
- alsa-lib=1.2.8=h166bdaf_0 | |
- anyio=3.6.2=pyhd8ed1ab_0 | |
- argon2-cffi=21.3.0=pyhd8ed1ab_0 | |
- argon2-cffi-bindings=21.2.0=py311hd4cff14_3 | |
- arrow-cpp=11.0.0=ha770c72_14_cpu | |
- asttokens=2.2.1=pyhd8ed1ab_0 | |
- attrs=22.2.0=pyh71513ae_0 | |
- aws-c-auth=0.6.26=hf365957_1 | |
- aws-c-cal=0.5.21=h48707d8_2 | |
- aws-c-common=0.8.14=h0b41bf4_0 | |
- aws-c-compression=0.2.16=h03acc5a_5 | |
- aws-c-event-stream=0.2.20=h00877a2_4 | |
- aws-c-http=0.7.6=hf342b9f_0 | |
- aws-c-io=0.13.19=h5b20300_3 | |
- aws-c-mqtt=0.8.6=hc4349f7_12 | |
- aws-c-s3=0.2.7=h909e904_1 | |
- aws-c-sdkutils=0.1.8=h03acc5a_0 | |
- aws-checksums=0.1.14=h03acc5a_5 | |
- aws-crt-cpp=0.19.8=hf7fbfca_12 | |
- aws-sdk-cpp=1.10.57=h17c43bd_8 | |
- babel=2.12.1=pyhd8ed1ab_1 | |
- backcall=0.2.0=pyh9f0ad1d_0 | |
- backports=1.0=pyhd8ed1ab_3 | |
- backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 | |
- beautifulsoup4=4.12.2=pyha770c72_0 | |
- bleach=6.0.0=pyhd8ed1ab_0 | |
- brotli=1.0.9=h166bdaf_8 | |
- brotli-bin=1.0.9=h166bdaf_8 | |
- brotlipy=0.7.0=py311hd4cff14_1005 | |
- bzip2=1.0.8=h7f98852_4 | |
- c-ares=1.18.1=h7f98852_0 | |
- ca-certificates=2022.12.7=ha878542_0 | |
- cairo=1.16.0=h35add3b_1015 | |
- certifi=2022.12.7=pyhd8ed1ab_0 | |
- cffi=1.15.1=py311h409f033_3 | |
- charset-normalizer=3.1.0=pyhd8ed1ab_0 | |
- comm=0.1.3=pyhd8ed1ab_0 | |
- contourpy=1.0.7=py311ha3edf6b_0 | |
- cryptography=40.0.2=py311h9b4c7bb_0 | |
- cycler=0.11.0=pyhd8ed1ab_0 | |
- debugpy=1.6.7=py311hcafe171_0 | |
- decorator=5.1.1=pyhd8ed1ab_0 | |
- defusedxml=0.7.1=pyhd8ed1ab_0 | |
- entrypoints=0.4=pyhd8ed1ab_0 | |
- executing=1.2.0=pyhd8ed1ab_0 | |
- expat=2.5.0=hcb278e6_1 | |
- flit-core=3.8.0=pyhd8ed1ab_0 | |
- font-ttf-dejavu-sans-mono=2.37=hab24e00_0 | |
- font-ttf-inconsolata=3.000=h77eed37_0 | |
- font-ttf-source-code-pro=2.038=h77eed37_0 | |
- font-ttf-ubuntu=0.83=hab24e00_0 | |
- fontconfig=2.14.2=h14ed4e7_0 | |
- fonts-conda-ecosystem=1=0 | |
- fonts-conda-forge=1=0 | |
- fonttools=4.39.3=py311h2582759_0 | |
- freetype=2.12.1=hca18f0e_1 | |
- gettext=0.21.1=h27087fc_0 | |
- gflags=2.2.2=he1b5a44_1004 | |
- giflib=5.2.1=h0b41bf4_3 | |
- glog=0.6.0=h6f12383_0 | |
- graphite2=1.3.13=h58526e2_1001 | |
- harfbuzz=6.0.0=h3ff4399_1 | |
- icu=72.1=hcb278e6_0 | |
- idna=3.4=pyhd8ed1ab_0 | |
- importlib-metadata=6.6.0=pyha770c72_0 | |
- importlib_metadata=6.6.0=hd8ed1ab_0 | |
- importlib_resources=5.12.0=pyhd8ed1ab_0 | |
- ipykernel=6.22.0=pyh210e3f2_0 | |
- ipython=8.12.0=pyh41d4057_0 | |
- ipython_genutils=0.2.0=py_1 | |
- jedi=0.18.2=pyhd8ed1ab_0 | |
- jinja2=3.1.2=pyhd8ed1ab_1 | |
- json5=0.9.5=pyh9f0ad1d_0 | |
- jsonschema=4.17.3=pyhd8ed1ab_0 | |
- jupyter_client=8.2.0=pyhd8ed1ab_0 | |
- jupyter_core=5.3.0=py311h38be061_0 | |
- jupyter_events=0.6.3=pyhd8ed1ab_0 | |
- jupyter_server=2.5.0=pyhd8ed1ab_0 | |
- jupyter_server_fileid=0.9.0=pyhd8ed1ab_0 | |
- jupyter_server_terminals=0.4.4=pyhd8ed1ab_1 | |
- jupyter_server_ydoc=0.8.0=pyhd8ed1ab_0 | |
- jupyter_ydoc=0.2.3=pyhd8ed1ab_0 | |
- jupyterlab=3.6.3=pyhd8ed1ab_0 | |
- jupyterlab_pygments=0.2.2=pyhd8ed1ab_0 | |
- jupyterlab_server=2.22.1=pyhd8ed1ab_0 | |
- keyutils=1.6.1=h166bdaf_0 | |
- kiwisolver=1.4.4=py311h4dd048b_1 | |
- krb5=1.20.1=h81ceb04_0 | |
- lcms2=2.15=haa2dc70_1 | |
- ld_impl_linux-64=2.40=h41732ed_0 | |
- lerc=4.0.0=h27087fc_0 | |
- libabseil=20230125.0=cxx17_hcb278e6_1 | |
- libarrow=11.0.0=h93537a5_14_cpu | |
- libblas=3.9.0=16_linux64_openblas | |
- libbrotlicommon=1.0.9=h166bdaf_8 | |
- libbrotlidec=1.0.9=h166bdaf_8 | |
- libbrotlienc=1.0.9=h166bdaf_8 | |
- libcblas=3.9.0=16_linux64_openblas | |
- libcrc32c=1.1.2=h9c3ff4c_0 | |
- libcups=2.3.3=h36d4200_3 | |
- libcurl=8.0.1=h588be90_0 | |
- libdeflate=1.18=h0b41bf4_0 | |
- libedit=3.1.20191231=he28a2e2_2 | |
- libev=4.33=h516909a_1 | |
- libevent=2.1.10=h28343ad_4 | |
- libexpat=2.5.0=hcb278e6_1 | |
- libffi=3.4.2=h7f98852_5 | |
- libgcc-ng=12.2.0=h65d4601_19 | |
- libgfortran-ng=12.2.0=h69a702a_19 | |
- libgfortran5=12.2.0=h337968e_19 | |
- libglib=2.76.2=hebfc3b9_0 | |
- libgomp=12.2.0=h65d4601_19 | |
- libgoogle-cloud=2.8.0=h0bc5f78_1 | |
- libgrpc=1.52.1=hcf146ea_1 | |
- libiconv=1.17=h166bdaf_0 | |
- libjpeg-turbo=2.1.5.1=h0b41bf4_0 | |
- liblapack=3.9.0=16_linux64_openblas | |
- libnghttp2=1.52.0=h61bc06f_0 | |
- libnsl=2.0.0=h7f98852_0 | |
- libnuma=2.0.16=h0b41bf4_1 | |
- libopenblas=0.3.21=pthreads_h78a6416_3 | |
- libpng=1.6.39=h753d276_0 | |
- libprotobuf=3.21.12=h3eb15da_0 | |
- libsodium=1.0.18=h36c2ea0_1 | |
- libsqlite=3.40.0=h753d276_1 | |
- libssh2=1.10.0=hf14f497_3 | |
- libstdcxx-ng=12.2.0=h46fd767_19 | |
- libthrift=0.18.1=h5e4af38_0 | |
- libtiff=4.5.0=ha587672_6 | |
- libutf8proc=2.8.0=h166bdaf_0 | |
- libuuid=2.38.1=h0b41bf4_0 | |
- libwebp-base=1.3.0=h0b41bf4_0 | |
- libxcb=1.13=h7f98852_1004 | |
- libzlib=1.2.13=h166bdaf_4 | |
- lz4-c=1.9.4=hcb278e6_0 | |
- markupsafe=2.1.2=py311h2582759_0 | |
- matplotlib-base=3.7.1=py311h8597a09_0 | |
- matplotlib-inline=0.1.6=pyhd8ed1ab_0 | |
- mistune=2.0.5=pyhd8ed1ab_0 | |
- munkres=1.1.4=pyh9f0ad1d_0 | |
- nbclassic=0.5.5=pyhb4ecaf3_1 | |
- nbclient=0.7.3=pyhd8ed1ab_0 | |
- nbconvert=7.3.1=pyhd8ed1ab_0 | |
- nbconvert-core=7.3.1=pyhd8ed1ab_0 | |
- nbconvert-pandoc=7.3.1=pyhd8ed1ab_0 | |
- nbformat=5.8.0=pyhd8ed1ab_0 | |
- ncurses=6.3=h27087fc_1 | |
- nest-asyncio=1.5.6=pyhd8ed1ab_0 | |
- notebook=6.5.4=pyha770c72_0 | |
- notebook-shim=0.2.2=pyhd8ed1ab_0 | |
- numpy=1.24.3=py311h64a7726_0 | |
- openjdk=17.0.3=h4335b31_6 | |
- openjpeg=2.5.0=hfec8fc6_2 | |
- openssl=3.1.0=hd590300_1 | |
- orc=1.8.3=hfdbbad2_0 | |
- packaging=23.1=pyhd8ed1ab_0 | |
- pandas=2.0.1=py311h320fe9a_0 | |
- pandoc=2.19.2=h32600fe_2 | |
- pandocfilters=1.5.0=pyhd8ed1ab_0 | |
- parquet-cpp=1.5.1=2 | |
- parso=0.8.3=pyhd8ed1ab_0 | |
- patsy=0.5.3=pyhd8ed1ab_0 | |
- pcre2=10.40=hc3806b6_0 | |
- pexpect=4.8.0=pyh1a96a4e_2 | |
- pickleshare=0.7.5=py_1003 | |
- pillow=9.5.0=py311h573f0d3_0 | |
- pip=23.1.1=pyhd8ed1ab_0 | |
- pixman=0.40.0=h36c2ea0_0 | |
- pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0 | |
- platformdirs=3.2.0=pyhd8ed1ab_0 | |
- pooch=1.7.0=pyha770c72_3 | |
- prometheus_client=0.16.0=pyhd8ed1ab_0 | |
- prompt-toolkit=3.0.38=pyha770c72_0 | |
- prompt_toolkit=3.0.38=hd8ed1ab_0 | |
- psutil=5.9.5=py311h2582759_0 | |
- pthread-stubs=0.4=h36c2ea0_1001 | |
- ptyprocess=0.7.0=pyhd3deb0d_0 | |
- pure_eval=0.2.2=pyhd8ed1ab_0 | |
- py4j=0.10.9.7=pyhd8ed1ab_0 | |
- pyarrow=11.0.0=py311hbdf6286_14_cpu | |
- pycparser=2.21=pyhd8ed1ab_0 | |
- pygments=2.15.1=pyhd8ed1ab_0 | |
- pyopenssl=23.1.1=pyhd8ed1ab_0 | |
- pyparsing=3.0.9=pyhd8ed1ab_0 | |
- pyrsistent=0.19.3=py311h2582759_0 | |
- pysocks=1.7.1=pyha2e5f31_6 | |
- pyspark=3.4.0=pyhd8ed1ab_0 | |
- python=3.11.3=h2755cc3_0_cpython | |
- python-dateutil=2.8.2=pyhd8ed1ab_0 | |
- python-fastjsonschema=2.16.3=pyhd8ed1ab_0 | |
- python-json-logger=2.0.7=pyhd8ed1ab_0 | |
- python-tzdata=2023.3=pyhd8ed1ab_0 | |
- python_abi=3.11=3_cp311 | |
- pytz=2023.3=pyhd8ed1ab_0 | |
- pyyaml=6.0=py311hd4cff14_5 | |
- pyzmq=25.0.2=py311hd6ccaeb_0 | |
- re2=2023.02.02=hcb278e6_0 | |
- readline=8.2=h8228510_1 | |
- requests=2.28.2=pyhd8ed1ab_1 | |
- rfc3339-validator=0.1.4=pyhd8ed1ab_0 | |
- rfc3986-validator=0.1.1=pyh9f0ad1d_0 | |
- s2n=1.3.41=h3358134_0 | |
- scipy=1.10.1=py311h8e6699e_0 | |
- seaborn=0.12.2=hd8ed1ab_0 | |
- seaborn-base=0.12.2=pyhd8ed1ab_0 | |
- send2trash=1.8.0=pyhd8ed1ab_0 | |
- setuptools=67.7.1=pyhd8ed1ab_0 | |
- six=1.16.0=pyh6c4a22f_0 | |
- snappy=1.1.10=h9fff704_0 | |
- sniffio=1.3.0=pyhd8ed1ab_0 | |
- soupsieve=2.3.2.post1=pyhd8ed1ab_0 | |
- stack_data=0.6.2=pyhd8ed1ab_0 | |
- statsmodels=0.13.5=py311h4c7f6c3_2 | |
- terminado=0.17.1=pyh41d4057_0 | |
- tinycss2=1.2.1=pyhd8ed1ab_0 | |
- tk=8.6.12=h27826a3_0 | |
- tomli=2.0.1=pyhd8ed1ab_0 | |
- tornado=6.3=py311h2582759_0 | |
- traitlets=5.9.0=pyhd8ed1ab_0 | |
- typing-extensions=4.5.0=hd8ed1ab_0 | |
- typing_extensions=4.5.0=pyha770c72_0 | |
- tzdata=2023c=h71feb2d_0 | |
- ucx=1.14.0=h8c404fb_1 | |
- urllib3=1.26.15=pyhd8ed1ab_0 | |
- wcwidth=0.2.6=pyhd8ed1ab_0 | |
- webencodings=0.5.1=py_1 | |
- websocket-client=1.5.1=pyhd8ed1ab_0 | |
- wheel=0.40.0=pyhd8ed1ab_0 | |
- xorg-fixesproto=5.0=h7f98852_1002 | |
- xorg-inputproto=2.3.2=h7f98852_1002 | |
- xorg-kbproto=1.0.7=h7f98852_1002 | |
- xorg-libice=1.0.10=h7f98852_0 | |
- xorg-libsm=1.2.3=hd9c2040_1000 | |
- xorg-libx11=1.8.4=h0b41bf4_0 | |
- xorg-libxau=1.0.9=h7f98852_0 | |
- xorg-libxdmcp=1.1.3=h7f98852_0 | |
- xorg-libxext=1.3.4=h0b41bf4_2 | |
- xorg-libxfixes=5.0.3=h7f98852_1004 | |
- xorg-libxi=1.7.10=h7f98852_0 | |
- xorg-libxrender=0.9.10=h7f98852_1003 | |
- xorg-libxtst=1.2.3=h7f98852_1002 | |
- xorg-recordproto=1.14.2=h7f98852_1002 | |
- xorg-renderproto=0.11.1=h7f98852_1002 | |
- xorg-xextproto=7.3.0=h0b41bf4_1003 | |
- xorg-xproto=7.0.31=h7f98852_1007 | |
- xz=5.2.6=h166bdaf_0 | |
- y-py=0.5.9=py311hfe55011_0 | |
- yaml=0.2.5=h7f98852_2 | |
- ypy-websocket=0.8.2=pyhd8ed1ab_0 | |
- zeromq=4.3.4=h9c3ff4c_1 | |
- zipp=3.15.0=pyhd8ed1ab_0 | |
- zlib=1.2.13=h166bdaf_4 | |
- zstd=1.5.2=h3eb15da_6 | |
# prefix: /path/to/pyspark-intro |