junkor-1011/.gitignore

## .gitignore
testdata

# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all
# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,visualstudiocode,python,pycharm+all

### JupyterNotebooks ###
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/

.ipynb_checkpoints
*/.ipynb_checkpoints/*

# IPython
profile_default/
ipython_config.py

# Remove previous ipynb_checkpoints
#   git rm -r .ipynb_checkpoints/

### PyCharm+all ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn.  Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### PyCharm+all Patch ###
# Ignore everything but code style settings and run configurations
# that are supposed to be shared within teams.

.idea/*

!.idea/codeStyles
!.idea/runConfigurations

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook

# IPython

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml

# ruff
.ruff_cache/

# LSP config files
pyrightconfig.json

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets

# Local History for Visual Studio Code
.history/

# Built Visual Studio Code Extensions
*.vsix

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all

## README.md

      
    Raw
  

              README.md
            
          
    pyspark入門

概要


Linux環境でMambaforgeを使って環境を構築する

クラスター管理などはおいておいて、学習用の最低限の環境を手軽に作る


環境作成


python3.11
pyspark3.4.0

openjdk17


pandas2.0.1

pandasとの連携を確認するため


その他、jupyterなど

# use mambaforge
mamba env create -f pyspark-env.yaml

  
## pyspark-test.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              pyspark-test.ipynb
            
          
      Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## pyspark-test.md

      
    Raw
  

              pyspark-test.md
            
          
    Pyspark入門

pandasなどは小規模~中規模くらいのデータセットの処理や分析には有効だが、
メモリに乗り切らないレベルのデータ量は処理できない。
また、GILによって基本的に1スレッドでの動作になるため、マシンスペックを最大限活かし切るのは難しい。
遅延評価、並列分散処理によって所謂ビッグデータといわれるレベルのテーブルデータの処理・分析に使うことができ、
更にpandasとの連携・併用ができるツールとしてpysparkが存在するため紹介する。
なお、詳細はPySpark Documentationなども参照のこと
環境の作成

Mambaforgeをインストール済みのLinux環境を前提とする。

conda 23.1.0
mamba 1.4.1

以下のようなconda仮想環境作成用のyamlを用意する:
# pyspark_env.yaml
name: pyspark-intro
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pyspark>=3.4
  - openjdk=17
  - pandas>=2.0.1
  - pyarrow
  # for testdata
  - numpy
  - seaborn
  # jupyter
  - jupyterlab
※先日pandasの2.0がリリースされたが、
deprecatedになっていたiteritemsが削除された影響で、
pandasからpysparkのDataFrameに変換する部分が一部動作しなくなる現象が起こる(pyspark 3.3系以前)。そのため、当初は「当面はpandas 1.5系を使う」としていた。
→ 現在はpyspark>=3.4.0で修正済みなので、pandasも2系を使う
mambaコマンドで仮想環境を作成(condaだと依存関係の解決が遅いのでmambaの使用を推奨)
mamba env create -f pyspark_env.yaml

# 出来上がった環境(pyspark-intro)をactivate
conda activate pyspark-intro
# or `mamba activate pyspark-intro`
実行


spark sessionの起動
データの読み込みと保存
pandasとの連携

あたりを簡単に確認する。
データ処理周りの詳細は省略するが、Sparkのデータフレーム機能がSpark SQLと対応していることから、
概ねSQLで出来るようなことを実行出来ると考えればOK。（というかSQLでも書ける）
# テストデータ格納ディレクトリのセットアップ(Linuxコマンドを呼び出して実行している)

!rm -rf testdata && mkdir testdata
from pyspark.sql.session import SparkSession

# Start a local Spark session that uses every available core.
#
# NOTE: with master("local[*]") the executors run inside the driver JVM, so
# "spark.executor.memory" has no effect on the available heap; the heap is
# sized by "spark.driver.memory" instead. The driver memory only takes effect
# if it is set before the JVM is launched, i.e. on the first getOrCreate()
# in a fresh Python process / notebook kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("LocalTest")
    # Use Arrow for pandas <-> Spark DataFrame conversion (requires pyarrow).
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # TMP: hard-coding the memory size is not ideal; prefer a config file
    # such as $SPARK_HOME/conf/spark-defaults.conf for real use.
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)
spark
your 131072x1 screen size is bogus. expect trouble
23/04/25 00:12:59 WARN Utils: Your hostname, XXXXXXX-XXXXXXX resolves to a loopback address: 127.0.1.1; using 172.24.210.1 instead (on interface eth0)
23/04/25 00:12:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/25 00:12:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<div>
    <p><b>SparkSession - in-memory</b></p>


    SparkContext
<p><a href="http://172.24.210.1:4040">Spark UI</a></p>

<dl>
  <dt>Version</dt>
    <dd><code>v3.4.0</code></dd>
  <dt>Master</dt>
    <dd><code>local[*]</code></dd>
  <dt>AppName</dt>
    <dd><code>LocalTest</code></dd>
</dl>


</div>

↑config部分について、pandasとの連携のため、configを調整しておく(pyarrowをインストールしておく必要あり)
参考: https://learn.microsoft.com/ja-jp/azure/databricks/pandas/pyspark-pandas-conversion
なお、詳細かつ環境依存性のあるconfigは別途configファイル($SPARK_HOME/conf/spark-defaults.conf)を作成するなどして設定すると良い
コンフィグ設定の詳細: https://spark.apache.org/docs/latest/configuration.html
また、コンフィグファイルの置き場所の設定などに必要な環境変数SPARK_HOMEについて、condaでインストールした場合の設定方法などはここなどを参照
Spark Sessionを起動すると、デフォルトではhttp://localhost:4040にSpark UIが立ち上がり、
実行状態などをモニタリングすることが出来る（デバッグなどにも便利）
# テストデータの作成
import seaborn as sns

iris = sns.load_dataset("iris")
display(iris)

iris.to_csv("testdata/iris.csv", index=False)

<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

  
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      145
      6.7
      3.0
      5.2
      2.3
      virginica
    
    
      146
      6.3
      2.5
      5.0
      1.9
      virginica
    
    
      147
      6.5
      3.0
      5.2
      2.0
      virginica
    
    
      148
      6.2
      3.4
      5.4
      2.3
      virginica
    
    
      149
      5.9
      3.0
      5.1
      1.8
      virginica
    
  
150 rows × 5 columns

# pysparkでcsvを読み込み

df_iris = spark.read.csv(
    "testdata/iris.csv",
    header=True,
    inferSchema=True,
)

# 遅延評価するので、カラムのスキーマのみ表示される
display(df_iris)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]

(↑csvから読み込んでいるため、ヘッダーの有無の指定やスキーマの自動推定をオプションで与えている)
※(補足):

遅延評価を行う関係で、オンメモリに展開するpandasなどと異なりSparkの処理のパフォーマンスはデータソースの形式の影響をもろに受ける

データ形式(csv/json/parquet/orc/RDBテーブル/...etc)や、圧縮形式(gzip/xz/snappy/zlib/...etc)、カラムに対するパーティションなど


データ分析・データ処理用途であれば行指向形式であるcsvを読み込んで処理を実行するのは一般に遅く、列指向形式のparquetやorcなどの方が有利

# showなどによって初めて評価が行われる
df_iris.show(5) # 5件を表示
+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

# selectによるカラムの選択
(df_iris
    .select(["sepal_length", "sepal_width"])
    .show(3)
)
+------------+-----------+
|sepal_length|sepal_width|
+------------+-----------+
|         5.1|        3.5|
|         4.9|        3.0|
|         4.7|        3.2|
+------------+-----------+
only showing top 3 rows

# filterによる簡単なクエリ
(df_iris
    .filter(df_iris.species == "virginica")
    .filter(df_iris.sepal_length > 5.0)
    .show(3)
)
+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width|  species|
+------------+-----------+------------+-----------+---------+
|         6.3|        3.3|         6.0|        2.5|virginica|
|         5.8|        2.7|         5.1|        1.9|virginica|
|         7.1|        3.0|         5.9|        2.1|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows

# groupByによる集約(1)
(df_iris
    .groupBy('species')
    .count()
    .show()    
)
+----------+-----+
|   species|count|
+----------+-----+
| virginica|   50|
|versicolor|   50|
|    setosa|   50|
+----------+-----+

# groupByによる集約(2)

(df_iris
    .groupBy('species')
    .agg({
        'sepal_length': 'mean',
        'sepal_width': 'mean',
        'petal_length': 'mean',
        'petal_width': 'mean',
    })
    .show()
)
+----------+------------------+------------------+-----------------+------------------+
|   species|  avg(sepal_width)|  avg(petal_width)|avg(sepal_length)| avg(petal_length)|
+----------+------------------+------------------+-----------------+------------------+
| virginica|2.9739999999999998|             2.026|6.587999999999998|             5.552|
|versicolor|2.7700000000000005|1.3259999999999998|            5.936|              4.26|
|    setosa| 3.428000000000001|0.2459999999999999|5.005999999999999|1.4620000000000002|
+----------+------------------+------------------+-----------------+------------------+

# DataFrameのサマリー
df_iris.describe().show()
23/04/25 00:13:09 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+------------------+-------------------+------------------+------------------+---------+
|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|
+-------+------------------+-------------------+------------------+------------------+---------+
|  count|               150|                150|               150|               150|      150|
|   mean| 5.843333333333335|  3.057333333333334|3.7580000000000027| 1.199333333333334|     null|
| stddev|0.8280661279778637|0.43586628493669793|1.7652982332594662|0.7622376689603467|     null|
|    min|               4.3|                2.0|               1.0|               0.1|   setosa|
|    max|               7.9|                4.4|               6.9|               2.5|virginica|
+-------+------------------+-------------------+------------------+------------------+---------+

# SQLでクエリを書くことも可能
df_iris.createOrReplaceTempView('iris')

spark.sql("show tables").show()

tmp = spark.sql("SELECT * FROM iris WHERE species = 'setosa'")
tmp.show()
+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |     iris|       true|
+---------+---------+-----------+

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1|        0.1| setosa|
|         5.8|        4.0|         1.2|        0.2| setosa|
|         5.7|        4.4|         1.5|        0.4| setosa|
|         5.4|        3.9|         1.3|        0.4| setosa|
|         5.1|        3.5|         1.4|        0.3| setosa|
|         5.7|        3.8|         1.7|        0.3| setosa|
|         5.1|        3.8|         1.5|        0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows

詳細はUser Guide, API Reference,
およびSpark SQLのリファレンスを参照
次に、データのRead/Writeを確認する
# さっきは便宜上csvから読み込んだが、型情報が失われる + pandasとの連携を見るため、pandasのデータフレームから直接pysparkに変換する

display(iris.__class__) # irisはpandas dataframe
df = spark.createDataFrame(iris)

display(df)

df.show()
pandas.core.frame.DataFrame


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]


+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1|        0.1| setosa|
|         5.8|        4.0|         1.2|        0.2| setosa|
|         5.7|        4.4|         1.5|        0.4| setosa|
|         5.4|        3.9|         1.3|        0.4| setosa|
|         5.1|        3.5|         1.4|        0.3| setosa|
|         5.7|        3.8|         1.7|        0.3| setosa|
|         5.1|        3.8|         1.5|        0.3| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 20 rows

# spark dataframeからpandas dataframeへの変換

display(df.__class__) # pyspark.sql.dataframe.DataFrame

pdf = df.toPandas()
display(pdf)
pyspark.sql.dataframe.DataFrame


<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

  
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      145
      6.7
      3.0
      5.2
      2.3
      virginica
    
    
      146
      6.3
      2.5
      5.0
      1.9
      virginica
    
    
      147
      6.5
      3.0
      5.2
      2.0
      virginica
    
    
      148
      6.2
      3.4
      5.4
      2.3
      virginica
    
    
      149
      5.9
      3.0
      5.1
      1.8
      virginica
    
  
150 rows × 5 columns

↑pysparkで大規模~中規模くらいのデータを処理してサイズダウンし、オンメモリに乗るくらいになったらtoPandasをして扱いやすいpandasに変換する、という使い方が出来る。
最後に色々な形式でデータの書き込みと読み込みを行う
先述の通り、sparkはデータ保存形式が処理パフォーマンスにダイレクトに効いてくるため、この辺りの取り扱いは肝になってくる。
# dfはspark dataframe

# csv (compressionを指定していないので無圧縮の .csv が出力される)
df.write.save("testdata/iris_csv", format="csv")

# formatオプションを使う代わりに、↓のように書いてもOK
# (compression="gzip" を指定した場合は無圧縮ではなく .csv.gz で出力される)
# df.write.csv("testdata/iris_csv", compression="gzip")

!ls testdata/iris_csv
_SUCCESS
part-00000-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00001-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00002-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00003-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00004-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00005-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00006-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv
part-00007-15ac8e26-2201-46b1-9e77-23eca0cb0744-c000.csv

!head testdata/iris_csv/part-00000-*.csv
5.1,3.5,1.4,0.2,setosa
4.9,3.0,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5.0,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5.0,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa

↑pandasのように単一のファイルを指定する感じではなく、保存するディレクトリを指定する。
(ファイルが分散して生成される)
読み込む際は個々のファイルを指定することもできる(←Spark以外で作ったファイルの読み込みなど)が、
基本は保存したディレクトリを指定して読み込む
df_csv = spark.read.load("testdata/iris_csv", format="csv")

# formatを指定せず、↓のようにすることも可能
# df_csv = spark.read.csv("testdata/iris_csv")

df_csv.show(3)
+---+---+---+---+----------+
|_c0|_c1|_c2|_c3|       _c4|
+---+---+---+---+----------+
|6.3|3.3|4.7|1.6|versicolor|
|4.9|2.4|3.3|1.0|versicolor|
|6.6|2.9|4.6|1.3|versicolor|
+---+---+---+---+----------+
only showing top 3 rows

(↑csvだとカラム名などの情報が欠落しがちで使いづらい。書き込み・読み込みの両方でheader=Trueを指定すればカラム名は保持できるが、型情報はcsv自体には残らない)
# テキスト形式だとcsvの他にjsonなども指定可能
# compressionで圧縮形式を指定することも出来る

df.write.json("testdata/iris_json", compression='gzip')

!ls testdata/iris_json
_SUCCESS
part-00000-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00001-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00002-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00003-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00004-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00005-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00006-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz
part-00007-8f677572-eb1f-4e24-9861-d72e16706c35-c000.json.gz

# 読み込みも同様

df_json = spark.read.json("testdata/iris_json")
# or df_json = spark.read.load("testdata/iris_json", format="json")

display(df_json)
df_json.show(3)
DataFrame[petal_length: double, petal_width: double, sepal_length: double, sepal_width: double, species: string]


+------------+-----------+------------+-----------+----------+
|petal_length|petal_width|sepal_length|sepal_width|   species|
+------------+-----------+------------+-----------+----------+
|         3.3|        1.0|         5.0|        2.3|versicolor|
|         4.2|        1.3|         5.6|        2.7|versicolor|
|         4.2|        1.2|         5.7|        3.0|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows

(↑jsonだとcsvよりは型情報が保持される)
# データ分析・処理用途として適しているsnappy.parquetやzlib.orc形式など

df.write.parquet("testdata/iris_parquet", compression="snappy")
!ls testdata/iris_parquet

df.write.orc("testdata/iris_orc", compression="zlib")
!ls testdata/iris_orc
23/04/25 00:13:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

_SUCCESS
part-00000-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00001-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00002-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00003-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00004-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00005-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00006-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
part-00007-8d6652c2-fdb4-460e-a4a0-193c3e5c62cd-c000.snappy.parquet
_SUCCESS
part-00000-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00001-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00002-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00003-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00004-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00005-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00006-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc
part-00007-7f758941-8b8d-498b-86bf-d55b85dbf8d8-c000.zlib.orc

# 読み込み

# parquet
print("parquet:")
df_parquet = spark.read.parquet("testdata/iris_parquet")
display(df_parquet)
df_parquet.show(3)

print("\n------------------\n")

# orc
print("orc:")
df_orc = spark.read.orc("testdata/iris_orc")
display(df_orc)
df_orc.show(3)
parquet:


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]


+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   species|
+------------+-----------+------------+-----------+----------+
|         5.0|        2.3|         3.3|        1.0|versicolor|
|         5.6|        2.7|         4.2|        1.3|versicolor|
|         5.7|        3.0|         4.2|        1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows


------------------

orc:


DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]


+------------+-----------+------------+-----------+----------+
|sepal_length|sepal_width|petal_length|petal_width|   species|
+------------+-----------+------------+-----------+----------+
|         5.0|        2.3|         3.3|        1.0|versicolor|
|         5.6|        2.7|         4.2|        1.3|versicolor|
|         5.7|        3.0|         4.2|        1.2|versicolor|
+------------+-----------+------------+-----------+----------+
only showing top 3 rows

# partitionを切って保存することも可能

df.write.save(
    "testdata/iris_with_partition",
    format="parquet",
    compression="snappy",
    partitionBy="species"
    )

!cd testdata/iris_with_partition && tree
.
├── _SUCCESS
├── species=setosa
│   ├── part-00000-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00001-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
├── species=versicolor
│   ├── part-00002-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00003-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   ├── part-00004-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
│   └── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
└── species=virginica
    ├── part-00005-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    ├── part-00006-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet
    └── part-00007-2ec171c3-f1ae-485e-b57c-194f20ea425a.c000.snappy.parquet

4 directories, 11 files

カラム"species"の値ごとに別ディレクトリが切られてデータが保存される
RDBのインデックスと同様、データアクセスの仕方に応じて適切に設定することでパフォーマンスを良くすることが出来る（し、不適切だと悪化する）
↑の例だと、speciesを指定して分析を行うような場合、興味のあるspeciesしか見ないためアクセスするデータ量を限定することができる。
一方、かけるクエリの多くが複数のspeciesにまたがるような場合はパフォーマンスの向上は見込めず、悪化する可能性もある。
# パーティションがある場合でも読み込み方法は同様

df_partition = spark.read.load("testdata/iris_with_partition")

display(df_partition)
df_partition.show(3)
DataFrame[sepal_length: double, sepal_width: double, petal_length: double, petal_width: double, species: string]


+------------+-----------+------------+-----------+---------+
|sepal_length|sepal_width|petal_length|petal_width|  species|
+------------+-----------+------------+-----------+---------+
|         7.9|        3.8|         6.4|        2.0|virginica|
|         6.4|        2.8|         5.6|        2.2|virginica|
|         6.3|        2.8|         5.1|        1.5|virginica|
+------------+-----------+------------+-----------+---------+
only showing top 3 rows

他にもsaveAsTableで永続化したテーブルとして書き込みを行ったりも出来る。
扱えるデータ保存形式もData Sourcesのように、
他にもRDB(JDBCを使う)や、Hive Tableなど色々なものに対応している。
import pyspark.pandas as ps
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/__init__.py:50: UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.
  warnings.warn(

(補足)pysparkにおけるpandas API

参考: https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html
最近はかなりpandasに近いAPIを使ってsparkを扱う方法が整備されつつある様子
なお、Koalasという、sparkをpandasっぽく使えるツールが存在するが、
ここなどを見ると、これがSpark 3.2以降 pyspark.pandas (pandas API on Spark) として正式にpysparkに取り込まれた様子。
import pyspark.pandas as ps

# pandas on spark DataFrameを作成する
psdf = ps.read_csv("testdata/iris.csv")

display(psdf.__class__) # pyspark.pandas.frame.DataFrame

psdf.head(5) # pandas DataFrameのようにheadで先頭を表示できる
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: If `index_col` is not specified for `read_csv`, the default index is attached which can cause additional overhead.
  warnings.warn(message, PandasAPIOnSparkAdviceWarning)


pyspark.pandas.frame.DataFrame


<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

  
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa
    
  
# filterや表示も通常のpandasのようにできている
psdf[psdf['sepal_length'] > 7.5]

<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

  
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
      105
      7.6
      3.0
      6.6
      2.1
      virginica
    
    
      117
      7.7
      3.8
      6.7
      2.2
      virginica
    
    
      118
      7.7
      2.6
      6.9
      2.3
      virginica
    
    
      122
      7.7
      2.8
      6.7
      2.0
      virginica
    
    
      131
      7.9
      3.8
      6.4
      2.0
      virginica
    
    
      135
      7.7
      3.0
      6.1
      2.3
      virginica
    
  
# pysparkのDataFrameに変換

sdf = psdf.to_spark(index_col=["index"])

sdf.show(5)
+-----+------------+-----------+------------+-----------+-------+
|index|sepal_length|sepal_width|petal_length|petal_width|species|
+-----+------------+-----------+------------+-----------+-------+
|    0|         5.1|        3.5|         1.4|        0.2| setosa|
|    1|         4.9|        3.0|         1.4|        0.2| setosa|
|    2|         4.7|        3.2|         1.3|        0.2| setosa|
|    3|         4.6|        3.1|         1.5|        0.2| setosa|
|    4|         5.0|        3.6|         1.4|        0.2| setosa|
+-----+------------+-----------+------------+-----------+-------+
only showing top 5 rows

# 普通のpandasに変換
pdf = psdf.to_pandas()

display(pdf.__class__) # pandas.core.frame.DataFrame

pdf.head()
/home/wsl-user/LocalApps/Mambaforge/envs/pyspark-intro/lib/python3.11/site-packages/pyspark/pandas/utils.py:975: PandasAPIOnSparkAdviceWarning: `to_pandas` loads all data into the driver's memory. It should only be used if the resulting pandas DataFrame is expected to be small.
  warnings.warn(message, PandasAPIOnSparkAdviceWarning)


pandas.core.frame.DataFrame


<style scoped>
    .dataframe tbody tr th:only-of-type {
        vertical-align: middle;
    }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

  
      sepal_length
      sepal_width
      petal_length
      petal_width
      species
    
  
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa
    
    
      3
      4.6
      3.1
      1.5
      0.2
      setosa
    
    
      4
      5.0
      3.6
      1.4
      0.2
      setosa
    
  
(参考) dask

pandasおよびnumpyと互換性および連携性の高いAPIを持ち、遅延評価・並列分散処理を行うことが出来るライブラリ。
例えばデータサイズが大きいテーブルデータに対してはdaskのデータフレームで遅延評価・並列分散処理を行っておき、
データサイズが小さくなるタイミングでpandasのデータフレームに変換する、といった今回pysparkでやろうとしていることとほぼ同じことが出来る。
pysparkはdaskに比べると、

クラスターなどを組める分、より本格的なビッグデータにも対応出来る

(今回紹介するようなやり方で、ライトに使うことも可能)


Javaをバックエンドに動くことに加え、本格的に動かす場合はクラスター構築などが必要になるため、そのレベルでやると動作環境の構築のハードルは高い
pandas、numpyとの親和性はdaskの方が高かった

daskは遅延評価される以外は同じメソッドをそのまま使えるケースが多い
sparkはむしろSQLと互換性がある
ただ、先述の通りpandas APIが整備されつつあるので、pysparkもある程度pandasの使い方そのままで使えるようになっている模様


## pyspark_env.yaml
name: pyspark-intro
channels:
  - conda-forge
dependencies:
  - python=3.11
  - pyspark>=3.4
  - openjdk=17
  - pandas>=2.0.1
  - pyarrow
  # for testdata
  - numpy
  - seaborn
  # jupyter
  - jupyterlab

## pyspark_env_freeze.yaml
name: pyspark-intro
channels:
  - conda-forge
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - aiofiles=22.1.0=pyhd8ed1ab_0
  - aiosqlite=0.18.0=pyhd8ed1ab_0
  - alsa-lib=1.2.8=h166bdaf_0
  - anyio=3.6.2=pyhd8ed1ab_0
  - argon2-cffi=21.3.0=pyhd8ed1ab_0
  - argon2-cffi-bindings=21.2.0=py311hd4cff14_3
  - arrow-cpp=11.0.0=ha770c72_14_cpu
  - asttokens=2.2.1=pyhd8ed1ab_0
  - attrs=22.2.0=pyh71513ae_0
  - aws-c-auth=0.6.26=hf365957_1
  - aws-c-cal=0.5.21=h48707d8_2
  - aws-c-common=0.8.14=h0b41bf4_0
  - aws-c-compression=0.2.16=h03acc5a_5
  - aws-c-event-stream=0.2.20=h00877a2_4
  - aws-c-http=0.7.6=hf342b9f_0
  - aws-c-io=0.13.19=h5b20300_3
  - aws-c-mqtt=0.8.6=hc4349f7_12
  - aws-c-s3=0.2.7=h909e904_1
  - aws-c-sdkutils=0.1.8=h03acc5a_0
  - aws-checksums=0.1.14=h03acc5a_5
  - aws-crt-cpp=0.19.8=hf7fbfca_12
  - aws-sdk-cpp=1.10.57=h17c43bd_8
  - babel=2.12.1=pyhd8ed1ab_1
  - backcall=0.2.0=pyh9f0ad1d_0
  - backports=1.0=pyhd8ed1ab_3
  - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
  - beautifulsoup4=4.12.2=pyha770c72_0
  - bleach=6.0.0=pyhd8ed1ab_0
  - brotli=1.0.9=h166bdaf_8
  - brotli-bin=1.0.9=h166bdaf_8
  - brotlipy=0.7.0=py311hd4cff14_1005
  - bzip2=1.0.8=h7f98852_4
  - c-ares=1.18.1=h7f98852_0
  - ca-certificates=2022.12.7=ha878542_0
  - cairo=1.16.0=h35add3b_1015
  - certifi=2022.12.7=pyhd8ed1ab_0
  - cffi=1.15.1=py311h409f033_3
  - charset-normalizer=3.1.0=pyhd8ed1ab_0
  - comm=0.1.3=pyhd8ed1ab_0
  - contourpy=1.0.7=py311ha3edf6b_0
  - cryptography=40.0.2=py311h9b4c7bb_0
  - cycler=0.11.0=pyhd8ed1ab_0
  - debugpy=1.6.7=py311hcafe171_0
  - decorator=5.1.1=pyhd8ed1ab_0
  - defusedxml=0.7.1=pyhd8ed1ab_0
  - entrypoints=0.4=pyhd8ed1ab_0
  - executing=1.2.0=pyhd8ed1ab_0
  - expat=2.5.0=hcb278e6_1
  - flit-core=3.8.0=pyhd8ed1ab_0
  - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
  - font-ttf-inconsolata=3.000=h77eed37_0
  - font-ttf-source-code-pro=2.038=h77eed37_0
  - font-ttf-ubuntu=0.83=hab24e00_0
  - fontconfig=2.14.2=h14ed4e7_0
  - fonts-conda-ecosystem=1=0
  - fonts-conda-forge=1=0
  - fonttools=4.39.3=py311h2582759_0
  - freetype=2.12.1=hca18f0e_1
  - gettext=0.21.1=h27087fc_0
  - gflags=2.2.2=he1b5a44_1004
  - giflib=5.2.1=h0b41bf4_3
  - glog=0.6.0=h6f12383_0
  - graphite2=1.3.13=h58526e2_1001
  - harfbuzz=6.0.0=h3ff4399_1
  - icu=72.1=hcb278e6_0
  - idna=3.4=pyhd8ed1ab_0
  - importlib-metadata=6.6.0=pyha770c72_0
  - importlib_metadata=6.6.0=hd8ed1ab_0
  - importlib_resources=5.12.0=pyhd8ed1ab_0
  - ipykernel=6.22.0=pyh210e3f2_0
  - ipython=8.12.0=pyh41d4057_0
  - ipython_genutils=0.2.0=py_1
  - jedi=0.18.2=pyhd8ed1ab_0
  - jinja2=3.1.2=pyhd8ed1ab_1
  - json5=0.9.5=pyh9f0ad1d_0
  - jsonschema=4.17.3=pyhd8ed1ab_0
  - jupyter_client=8.2.0=pyhd8ed1ab_0
  - jupyter_core=5.3.0=py311h38be061_0
  - jupyter_events=0.6.3=pyhd8ed1ab_0
  - jupyter_server=2.5.0=pyhd8ed1ab_0
  - jupyter_server_fileid=0.9.0=pyhd8ed1ab_0
  - jupyter_server_terminals=0.4.4=pyhd8ed1ab_1
  - jupyter_server_ydoc=0.8.0=pyhd8ed1ab_0
  - jupyter_ydoc=0.2.3=pyhd8ed1ab_0
  - jupyterlab=3.6.3=pyhd8ed1ab_0
  - jupyterlab_pygments=0.2.2=pyhd8ed1ab_0
  - jupyterlab_server=2.22.1=pyhd8ed1ab_0
  - keyutils=1.6.1=h166bdaf_0
  - kiwisolver=1.4.4=py311h4dd048b_1
  - krb5=1.20.1=h81ceb04_0
  - lcms2=2.15=haa2dc70_1
  - ld_impl_linux-64=2.40=h41732ed_0
  - lerc=4.0.0=h27087fc_0
  - libabseil=20230125.0=cxx17_hcb278e6_1
  - libarrow=11.0.0=h93537a5_14_cpu
  - libblas=3.9.0=16_linux64_openblas
  - libbrotlicommon=1.0.9=h166bdaf_8
  - libbrotlidec=1.0.9=h166bdaf_8
  - libbrotlienc=1.0.9=h166bdaf_8
  - libcblas=3.9.0=16_linux64_openblas
  - libcrc32c=1.1.2=h9c3ff4c_0
  - libcups=2.3.3=h36d4200_3
  - libcurl=8.0.1=h588be90_0
  - libdeflate=1.18=h0b41bf4_0
  - libedit=3.1.20191231=he28a2e2_2
  - libev=4.33=h516909a_1
  - libevent=2.1.10=h28343ad_4
  - libexpat=2.5.0=hcb278e6_1
  - libffi=3.4.2=h7f98852_5
  - libgcc-ng=12.2.0=h65d4601_19
  - libgfortran-ng=12.2.0=h69a702a_19
  - libgfortran5=12.2.0=h337968e_19
  - libglib=2.76.2=hebfc3b9_0
  - libgomp=12.2.0=h65d4601_19
  - libgoogle-cloud=2.8.0=h0bc5f78_1
  - libgrpc=1.52.1=hcf146ea_1
  - libiconv=1.17=h166bdaf_0
  - libjpeg-turbo=2.1.5.1=h0b41bf4_0
  - liblapack=3.9.0=16_linux64_openblas
  - libnghttp2=1.52.0=h61bc06f_0
  - libnsl=2.0.0=h7f98852_0
  - libnuma=2.0.16=h0b41bf4_1
  - libopenblas=0.3.21=pthreads_h78a6416_3
  - libpng=1.6.39=h753d276_0
  - libprotobuf=3.21.12=h3eb15da_0
  - libsodium=1.0.18=h36c2ea0_1
  - libsqlite=3.40.0=h753d276_1
  - libssh2=1.10.0=hf14f497_3
  - libstdcxx-ng=12.2.0=h46fd767_19
  - libthrift=0.18.1=h5e4af38_0
  - libtiff=4.5.0=ha587672_6
  - libutf8proc=2.8.0=h166bdaf_0
  - libuuid=2.38.1=h0b41bf4_0
  - libwebp-base=1.3.0=h0b41bf4_0
  - libxcb=1.13=h7f98852_1004
  - libzlib=1.2.13=h166bdaf_4
  - lz4-c=1.9.4=hcb278e6_0
  - markupsafe=2.1.2=py311h2582759_0
  - matplotlib-base=3.7.1=py311h8597a09_0
  - matplotlib-inline=0.1.6=pyhd8ed1ab_0
  - mistune=2.0.5=pyhd8ed1ab_0
  - munkres=1.1.4=pyh9f0ad1d_0
  - nbclassic=0.5.5=pyhb4ecaf3_1
  - nbclient=0.7.3=pyhd8ed1ab_0
  - nbconvert=7.3.1=pyhd8ed1ab_0
  - nbconvert-core=7.3.1=pyhd8ed1ab_0
  - nbconvert-pandoc=7.3.1=pyhd8ed1ab_0
  - nbformat=5.8.0=pyhd8ed1ab_0
  - ncurses=6.3=h27087fc_1
  - nest-asyncio=1.5.6=pyhd8ed1ab_0
  - notebook=6.5.4=pyha770c72_0
  - notebook-shim=0.2.2=pyhd8ed1ab_0
  - numpy=1.24.3=py311h64a7726_0
  - openjdk=17.0.3=h4335b31_6
  - openjpeg=2.5.0=hfec8fc6_2
  - openssl=3.1.0=hd590300_1
  - orc=1.8.3=hfdbbad2_0
  - packaging=23.1=pyhd8ed1ab_0
  - pandas=2.0.1=py311h320fe9a_0
  - pandoc=2.19.2=h32600fe_2
  - pandocfilters=1.5.0=pyhd8ed1ab_0
  - parquet-cpp=1.5.1=2
  - parso=0.8.3=pyhd8ed1ab_0
  - patsy=0.5.3=pyhd8ed1ab_0
  - pcre2=10.40=hc3806b6_0
  - pexpect=4.8.0=pyh1a96a4e_2
  - pickleshare=0.7.5=py_1003
  - pillow=9.5.0=py311h573f0d3_0
  - pip=23.1.1=pyhd8ed1ab_0
  - pixman=0.40.0=h36c2ea0_0
  - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_0
  - platformdirs=3.2.0=pyhd8ed1ab_0
  - pooch=1.7.0=pyha770c72_3
  - prometheus_client=0.16.0=pyhd8ed1ab_0
  - prompt-toolkit=3.0.38=pyha770c72_0
  - prompt_toolkit=3.0.38=hd8ed1ab_0
  - psutil=5.9.5=py311h2582759_0
  - pthread-stubs=0.4=h36c2ea0_1001
  - ptyprocess=0.7.0=pyhd3deb0d_0
  - pure_eval=0.2.2=pyhd8ed1ab_0
  - py4j=0.10.9.7=pyhd8ed1ab_0
  - pyarrow=11.0.0=py311hbdf6286_14_cpu
  - pycparser=2.21=pyhd8ed1ab_0
  - pygments=2.15.1=pyhd8ed1ab_0
  - pyopenssl=23.1.1=pyhd8ed1ab_0
  - pyparsing=3.0.9=pyhd8ed1ab_0
  - pyrsistent=0.19.3=py311h2582759_0
  - pysocks=1.7.1=pyha2e5f31_6
  - pyspark=3.4.0=pyhd8ed1ab_0
  - python=3.11.3=h2755cc3_0_cpython
  - python-dateutil=2.8.2=pyhd8ed1ab_0
  - python-fastjsonschema=2.16.3=pyhd8ed1ab_0
  - python-json-logger=2.0.7=pyhd8ed1ab_0
  - python-tzdata=2023.3=pyhd8ed1ab_0
  - python_abi=3.11=3_cp311
  - pytz=2023.3=pyhd8ed1ab_0
  - pyyaml=6.0=py311hd4cff14_5
  - pyzmq=25.0.2=py311hd6ccaeb_0
  - re2=2023.02.02=hcb278e6_0
  - readline=8.2=h8228510_1
  - requests=2.28.2=pyhd8ed1ab_1
  - rfc3339-validator=0.1.4=pyhd8ed1ab_0
  - rfc3986-validator=0.1.1=pyh9f0ad1d_0
  - s2n=1.3.41=h3358134_0
  - scipy=1.10.1=py311h8e6699e_0
  - seaborn=0.12.2=hd8ed1ab_0
  - seaborn-base=0.12.2=pyhd8ed1ab_0
  - send2trash=1.8.0=pyhd8ed1ab_0
  - setuptools=67.7.1=pyhd8ed1ab_0
  - six=1.16.0=pyh6c4a22f_0
  - snappy=1.1.10=h9fff704_0
  - sniffio=1.3.0=pyhd8ed1ab_0
  - soupsieve=2.3.2.post1=pyhd8ed1ab_0
  - stack_data=0.6.2=pyhd8ed1ab_0
  - statsmodels=0.13.5=py311h4c7f6c3_2
  - terminado=0.17.1=pyh41d4057_0
  - tinycss2=1.2.1=pyhd8ed1ab_0
  - tk=8.6.12=h27826a3_0
  - tomli=2.0.1=pyhd8ed1ab_0
  - tornado=6.3=py311h2582759_0
  - traitlets=5.9.0=pyhd8ed1ab_0
  - typing-extensions=4.5.0=hd8ed1ab_0
  - typing_extensions=4.5.0=pyha770c72_0
  - tzdata=2023c=h71feb2d_0
  - ucx=1.14.0=h8c404fb_1
  - urllib3=1.26.15=pyhd8ed1ab_0
  - wcwidth=0.2.6=pyhd8ed1ab_0
  - webencodings=0.5.1=py_1
  - websocket-client=1.5.1=pyhd8ed1ab_0
  - wheel=0.40.0=pyhd8ed1ab_0
  - xorg-fixesproto=5.0=h7f98852_1002
  - xorg-inputproto=2.3.2=h7f98852_1002
  - xorg-kbproto=1.0.7=h7f98852_1002
  - xorg-libice=1.0.10=h7f98852_0
  - xorg-libsm=1.2.3=hd9c2040_1000
  - xorg-libx11=1.8.4=h0b41bf4_0
  - xorg-libxau=1.0.9=h7f98852_0
  - xorg-libxdmcp=1.1.3=h7f98852_0
  - xorg-libxext=1.3.4=h0b41bf4_2
  - xorg-libxfixes=5.0.3=h7f98852_1004
  - xorg-libxi=1.7.10=h7f98852_0
  - xorg-libxrender=0.9.10=h7f98852_1003
  - xorg-libxtst=1.2.3=h7f98852_1002
  - xorg-recordproto=1.14.2=h7f98852_1002
  - xorg-renderproto=0.11.1=h7f98852_1002
  - xorg-xextproto=7.3.0=h0b41bf4_1003
  - xorg-xproto=7.0.31=h7f98852_1007
  - xz=5.2.6=h166bdaf_0
  - y-py=0.5.9=py311hfe55011_0
  - yaml=0.2.5=h7f98852_2
  - ypy-websocket=0.8.2=pyhd8ed1ab_0
  - zeromq=4.3.4=h9c3ff4c_1
  - zipp=3.15.0=pyhd8ed1ab_0
  - zlib=1.2.13=h166bdaf_4
  - zstd=1.5.2=h3eb15da_6
# prefix: /path/to/pyspark-intro
	testdata

	# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all
	# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,visualstudiocode,python,pycharm+all

	### JupyterNotebooks ###
	# gitignore template for Jupyter Notebooks
	# website: http://jupyter.org/

	.ipynb_checkpoints
	*/.ipynb_checkpoints/*

	# IPython
	profile_default/
	ipython_config.py

	# Remove previous ipynb_checkpoints
	# git rm -r .ipynb_checkpoints/

	### PyCharm+all ###
	# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
	# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

	# User-specific stuff
	.idea/**/workspace.xml
	.idea/**/tasks.xml
	.idea/**/usage.statistics.xml
	.idea/**/dictionaries
	.idea/**/shelf

	# AWS User-specific
	.idea/**/aws.xml

	# Generated files
	.idea/**/contentModel.xml

	# Sensitive or high-churn files
	.idea/**/dataSources/
	.idea/**/dataSources.ids
	.idea/**/dataSources.local.xml
	.idea/**/sqlDataSources.xml
	.idea/**/dynamic.xml
	.idea/**/uiDesigner.xml
	.idea/**/dbnavigator.xml

	# Gradle
	.idea/**/gradle.xml
	.idea/**/libraries

	# Gradle and Maven with auto-import
	# When using Gradle or Maven with auto-import, you should exclude module files,
	# since they will be recreated, and may cause churn. Uncomment if using
	# auto-import.
	# .idea/artifacts
	# .idea/compiler.xml
	# .idea/jarRepositories.xml
	# .idea/modules.xml
	# .idea/*.iml
	# .idea/modules
	# *.iml
	# *.ipr

	# CMake
	cmake-build-*/

	# Mongo Explorer plugin
	.idea/**/mongoSettings.xml

	# File-based project format
	*.iws

	# IntelliJ
	out/

	# mpeltonen/sbt-idea plugin
	.idea_modules/

	# JIRA plugin
	atlassian-ide-plugin.xml

	# Cursive Clojure plugin
	.idea/replstate.xml

	# SonarLint plugin
	.idea/sonarlint/

	# Crashlytics plugin (for Android Studio and IntelliJ)
	com_crashlytics_export_strings.xml
	crashlytics.properties
	crashlytics-build.properties
	fabric.properties

	# Editor-based Rest Client
	.idea/httpRequests

	# Android studio 3.1+ serialized cache file
	.idea/caches/build_file_checksums.ser

	### PyCharm+all Patch ###
	# Ignore everything but code style settings and run configurations
	# that are supposed to be shared within teams.

	.idea/*

	!.idea/codeStyles
	!.idea/runConfigurations

	### Python ###
	# Byte-compiled / optimized / DLL files
	__pycache__/
	*.py[cod]
	*$py.class

	# C extensions
	*.so

	# Distribution / packaging
	.Python
	build/
	develop-eggs/
	dist/
	downloads/
	eggs/
	.eggs/
	lib/
	lib64/
	parts/
	sdist/
	var/
	wheels/
	share/python-wheels/
	*.egg-info/
	.installed.cfg
	*.egg
	MANIFEST

	# PyInstaller
	# Usually these files are written by a python script from a template
	# before PyInstaller builds the exe, so as to inject date/other infos into it.
	*.manifest
	*.spec

	# Installer logs
	pip-log.txt
	pip-delete-this-directory.txt

	# Unit test / coverage reports
	htmlcov/
	.tox/
	.nox/
	.coverage
	.coverage.*
	.cache
	nosetests.xml
	coverage.xml
	*.cover
	*.py,cover
	.hypothesis/
	.pytest_cache/
	cover/

	# Translations
	*.mo
	*.pot

	# Django stuff:
	*.log
	local_settings.py
	db.sqlite3
	db.sqlite3-journal

	# Flask stuff:
	instance/
	.webassets-cache

	# Scrapy stuff:
	.scrapy

	# Sphinx documentation
	docs/_build/

	# PyBuilder
	.pybuilder/
	target/

	# Jupyter Notebook

	# IPython

	# pyenv
	# For a library or package, you might want to ignore these files since the code is
	# intended to run in multiple environments; otherwise, check them in:
	# .python-version

	# pipenv
	# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
	# However, in case of collaboration, if having platform-specific dependencies or dependencies
	# having no cross-platform support, pipenv may install dependencies that don't work, or not
	# install all needed dependencies.
	#Pipfile.lock

	# poetry
	# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
	# This is especially recommended for binary packages to ensure reproducibility, and is more
	# commonly ignored for libraries.
	# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
	#poetry.lock

	# pdm
	# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
	#pdm.lock
	# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
	# in version control.
	# https://pdm.fming.dev/#use-with-ide
	.pdm.toml

	# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
	__pypackages__/

	# Celery stuff
	celerybeat-schedule
	celerybeat.pid

	# SageMath parsed files
	*.sage.py

	# Environments
	.env
	.venv
	env/
	venv/
	ENV/
	env.bak/
	venv.bak/

	# Spyder project settings
	.spyderproject
	.spyproject

	# Rope project settings
	.ropeproject

	# mkdocs documentation
	/site

	# mypy
	.mypy_cache/
	.dmypy.json
	dmypy.json

	# Pyre type checker
	.pyre/

	# pytype static type analyzer
	.pytype/

	# Cython debug symbols
	cython_debug/

	# PyCharm
	# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
	# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
	# and can be added to the global gitignore or merged into this file. For a more nuclear
	# option (not recommended) you can uncomment the following to ignore the entire idea folder.
	#.idea/

	### Python Patch ###
	# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
	poetry.toml

	# ruff
	.ruff_cache/

	# LSP config files
	pyrightconfig.json

	### VisualStudioCode ###
	.vscode/*
	!.vscode/settings.json
	!.vscode/tasks.json
	!.vscode/launch.json
	!.vscode/extensions.json
	!.vscode/*.code-snippets

	# Local History for Visual Studio Code
	.history/

	# Built Visual Studio Code Extensions
	*.vsix

	### VisualStudioCode Patch ###
	# Ignore all local history of files
	.history
	.ionide

	# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,visualstudiocode,python,pycharm+all
	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica
	name: pyspark-intro
	channels:
	- conda-forge
	dependencies:
	- python=3.11
	- pyspark>=3.4
	- openjdk=17
	- pandas>=2.0.1
	- pyarrow
	# for testdata
	- numpy
	- seaborn
	# jupyter
	- jupyterlab