Skip to content

Instantly share code, notes, and snippets.

@FabienArcellier
Last active June 28, 2024 07:10
Show Gist options
  • Save FabienArcellier/cb7fb74fd89981696b5c3d1d8423a8ff to your computer and use it in GitHub Desktop.
Save FabienArcellier/cb7fb74fd89981696b5c3d1d8423a8ff to your computer and use it in GitHub Desktop.
pyarrow export for list of records and array
# pyarrow.Table
# 0: string
# 1: int64
# 2: string
# __index_level_0__: int64
# ----
# 0: [["Alice","Bob","Charlie","Charlie"]]
# 1: [[30,25,35,28]]
# 2: [["New York","San Francisco","Chicago","Chicago"]]
# __index_level_0__: [[0,1,2,3]]
import pyarrow as pa
# Step 1: input rows (each inner list is one record).
data = [
    ["Alice", 30, "New York"],
    ["Bob", 25, "San Francisco"],
    ["Charlie", 35, "Chicago"],
    ["Charlie", 28, "Chicago"],
]

# Step 2: name each column after its position. The first row sets the width;
# every row is assumed to have the same length.
num_columns = len(data[0])
column_names = [str(position) for position in range(num_columns)]
column_names.append('__index_level_0__')

# Step 3: transpose rows into columns and add the row numbers as a trailing
# column, matching the extra name appended above.
transposed_data = [*zip(*data), list(range(len(data)))]

# Step 4: one PyArrow array per column; types are inferred from the values.
pyarrow_columns = list(map(pa.array, transposed_data))

# Step 5: pair every column name with the inferred type of its array
# (an explicit schema is optional here).
schema = pa.schema(
    [(name, array.type) for name, array in zip(column_names, pyarrow_columns)]
)

# Step 6: assemble the table; the generated index strings are the column names.
table = pa.Table.from_arrays(pyarrow_columns, schema=schema)

# Print the table to check the result.
print(table)
# Result
# pyarrow.Table
# name: string
# age: int64
# city: string
# __index_level_0__: int64
# ----
# name: [["Alice","Bob","Charlie","Charlie"]]
# age: [[30,25,35,28]]
# city: [["New York","San Francisco","Chicago","Chicago"]]
# __index_level_0__: [[0,1,2,3]]
import pyarrow as pa
# Step 1: input data as a list of records (one dict per row).
data = [
    {"name": "Alice", "age": 30, "city": "New York"},
    {"name": "Bob", "age": 25, "city": "San Francisco"},
    {"name": "Charlie", "age": 35, "city": "Chicago"},
    {"name": "Charlie", "age": 28, "city": "Chicago"},
]

# Step 2: pivot the records into columns.
# The keys of the first record define the column names and their order;
# every record is expected to provide each of those keys.
column_names = [*data[0]]
columns = {name: [row[name] for row in data] for name in column_names}

# Add the row numbers as a trailing '__index_level_0__' column.
columns['__index_level_0__'] = list(range(len(data)))
column_names.append('__index_level_0__')

# Step 3: one PyArrow array per column; types are inferred from the values.
pyarrow_columns = {name: pa.array(columns[name]) for name in columns}

# Step 4: build the schema from the same names and the inferred types
# (optional, but gives explicit control over the table layout).
schema = pa.schema([(name, array.type) for name, array in pyarrow_columns.items()])

# Step 5: assemble the table, listing the arrays in column_names order.
ordered_arrays = [pyarrow_columns[name] for name in column_names]
table = pa.Table.from_arrays(ordered_arrays, schema=schema)

# Print the table to check the result.
print(table)
# Result
# pyarrow.Table
# name: string
# age: int64
# city: string
# __index_level_0__: int64
# ----
# name: [["Alice","Bob","Charlie"]]
# age: [[30,25,35]]
# city: [["New York","San Francisco","Chicago"]]
# __index_level_0__: [[0,1,2]]
import pandas as pd
import pyarrow as pa
# Sample pandas DataFrame with an explicit integer index.
people = {
    "name": ["Alice", "Bob", "Charlie"],
    "age": [30, 25, 35],
    "city": ["New York", "San Francisco", "Chicago"],
}
df = pd.DataFrame(data=people, index=[0, 1, 2])

# Convert the DataFrame to a PyArrow table; preserve_index=True keeps the
# pandas index as a column of the resulting table.
table = pa.Table.from_pandas(df=df, preserve_index=True)
print(table)
# Result
# pyarrow.Table
# __index_level_0__: uint32
# name: large_string
# age: int64
# city: large_string
# ----
# __index_level_0__: [[0,1,2]]
# name: [["Alice","Bob","Charlie"]]
# age: [[30,25,35]]
# city: [["New York","San Francisco","Chicago"]]
import polars as po
from pyarrow.interchange import from_dataframe
# Sample polars DataFrame (polars frames carry no index of their own).
df = po.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [30, 25, 35],
    "city": ["New York", "San Francisco", "Chicago"],
})

# Add an explicit row-number column named like the pandas index column.
# with_row_index replaces with_row_count, which polars deprecated in 0.20.4;
# like its predecessor, it inserts the new column first.
df = df.with_row_index("__index_level_0__")

# Convert the polars DataFrame to a PyArrow table through the DataFrame
# interchange protocol; the row-number column travels as a regular column.
table = from_dataframe(df)
print(table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment