Piotr Mlocek pimlock

## s3-read-compressed.py
# The ".gz" suffix already lets pandas infer the compression;
# compression="gzip" simply states it explicitly.
df = pd.read_csv(
    "s3://my-public-bucket/data.csv.gz",
    compression="gzip",
    storage_options={"anon": True},
)

## s3-caching.py
# "filecache::" is chained in front of the S3 URL, so storage_options is
# keyed by protocol. cache_dir is not defined in this snippet and must be
# provided by the surrounding code.
df = pd.read_csv(
    "filecache::s3://my-public-bucket/data.csv",
    storage_options={
        "s3": {
            "anon": True,
        },
        "filecache": {
            "cache_storage": cache_dir,
        },
    },
)

## s3-private-bucket.py
# Preferred: pass no credentials and let them be picked up automatically
# from one of the common locations.
df = pd.read_csv("s3://my-private-bucket/data.csv")

# Last resort only: hand the key and secret over explicitly as arguments.
df = pd.read_csv(
    "s3://my-private-bucket/data.csv",
    storage_options={
        "key": "AKIAIOSFODNN7EXAMPLE",
        "secret": "SECRET",
    },
)

## s3-anonymous-read.py
# "anon": True requests an unauthenticated read of the public bucket.
df = pd.read_csv("s3://my-public-bucket/data.csv", storage_options={"anon": True})

## reading-sql.py
import sqlite3
con = sqlite3.connect("2016-olympics-medals.db")

try:
    # Pull the whole "medals" table into a DataFrame and summarize it.
    df_sql = pd.read_sql_query(sql="SELECT * FROM medals", con=con)
    df_sql.info()
finally:
    # Runs whether or not the query succeeded, so the connection is
    # always closed.
    con.close()

## reading-parquet.py
# Load the medals data from a Parquet file (the ".snappy" in the file name
# suggests Snappy compression) and print a summary of the resulting frame.
df_parquet = pd.read_parquet("2016-olympics-medals.snappy.parquet")
df_parquet.info()

## loading-excel.py
# Load only the "Medals" sheet of the workbook and print a summary of it.
df_excel = pd.read_excel("2016-olympics-medals.xls", sheet_name="Medals")
df_excel.info()

## loading-json.py
import json
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's default text encoding.
with open("2016-olympics-medals.json", encoding="utf-8") as f:
    data = json.load(f)

# Build one row per entry of the "Countries" list.
df = pd.json_normalize(data, record_path="Countries")

## sample-data.json
{
  "Timestamp": "2021-05-11T14:38:10",
  "Countries": [
    {"Rank":1,"NOC":"United States (USA)","Gold":46},
    {"Rank":2,"NOC":"Great Britain (GBR)","Gold":27},
    {"Rank":3,"NOC":"China (CHN)","Gold":26},
    {"...more data"}
  ]
}

## reading-jsonl.py
# lines=True: treat the file as JSON Lines, i.e. one JSON record per line.
df = pd.read_json("2016-olympics-medals.jsonl", lines=True)
df.info()
	# The ".gz" suffix already lets pandas infer the compression;
	# compression="gzip" simply states it explicitly.
	df = pd.read_csv(
		"s3://my-public-bucket/data.csv.gz",
		compression="gzip",
		storage_options={"anon": True},
	)
	# "filecache::" is chained in front of the S3 URL, so storage_options is
	# keyed by protocol. cache_dir is not defined in this snippet and must be
	# provided by the surrounding code.
	df = pd.read_csv(
		"filecache::s3://my-public-bucket/data.csv",
		storage_options={
			"s3": {
				"anon": True,
			},
			"filecache": {
				"cache_storage": cache_dir,
			},
		},
	)
	# Preferred: pass no credentials and let them be picked up automatically
	# from one of the common locations.
	df = pd.read_csv("s3://my-private-bucket/data.csv")

	# Last resort only: hand the key and secret over explicitly as arguments.
	df = pd.read_csv(
		"s3://my-private-bucket/data.csv",
		storage_options={
			"key": "AKIAIOSFODNN7EXAMPLE",
			"secret": "SECRET",
		},
	)
	# "anon": True requests an unauthenticated read of the public bucket.
	df = pd.read_csv("s3://my-public-bucket/data.csv", storage_options={"anon": True})
	import sqlite3
	con = sqlite3.connect("2016-olympics-medals.db")

	try:
		# Pull the whole "medals" table into a DataFrame and summarize it.
		df_sql = pd.read_sql_query("SELECT * FROM medals", con)
		df_sql.info()
	finally:
		# Runs whether or not the query succeeded, so the connection is
		# always closed.
		con.close()
	# Load the medals data from a Parquet file (the ".snappy" in the file name
	# suggests Snappy compression) and print a summary of the resulting frame.
	df_parquet = pd.read_parquet("2016-olympics-medals.snappy.parquet")
	df_parquet.info()
	# Load only the "Medals" sheet of the workbook and print a summary of it.
	df_excel = pd.read_excel("2016-olympics-medals.xls", sheet_name="Medals")
	df_excel.info()
	import json
	# Decode explicitly as UTF-8 (the standard JSON encoding) instead of
	# relying on the platform's default text encoding.
	with open("2016-olympics-medals.json", encoding="utf-8") as f:
		data = json.load(f)

	# Build one row per entry of the "Countries" list.
	df = pd.json_normalize(data, record_path="Countries")
	{
	"Timestamp": "2021-05-11T14:38:10",
	"Countries": [
	{"Rank":1,"NOC":"United States (USA)","Gold":46},
	{"Rank":2,"NOC":"Great Britain (GBR)","Gold":27},
	{"Rank":3,"NOC":"China (CHN)","Gold":26},
	{"...more data"}
	]
	}
	# lines=True: treat the file as JSON Lines, i.e. one JSON record per line.
	df = pd.read_json("2016-olympics-medals.jsonl", lines=True)
	df.info()