Skip to content

Instantly share code, notes, and snippets.

@AlenkaF
Last active March 22, 2023 13:10
Show Gist options
  • Save AlenkaF/f0f1ea95255555024fc7b4c488135e67 to your computer and use it in GitHub Desktop.
Save AlenkaF/f0f1ea95255555024fc7b4c488135e67 to your computer and use it in GitHub Desktop.
Line # Mem usage Increment Occurrences Line Contents
=============================================================
7 147.8 MiB 147.8 MiB 1 @profile
8 def my_func():
9 # Load Vaex example
10 173.0 MiB 25.3 MiB 1 df = vaex.example()
11 # Create a virtual column
12 173.0 MiB 0.0 MiB 1 df.add_virtual_column("r", "sqrt(x**2 + y**2 + z**2)")
13
14 # Create a __dataframe__ instance
15 173.2 MiB 0.1 MiB 1 df_protocol = df.__dataframe__()
16
17 # Inspecting the metadata and selecting columns does not yet
18 # materialize all the buffers
19 173.2 MiB 0.0 MiB 1 df_protocol.num_columns()
20 173.2 MiB 0.0 MiB 1 df_protocol.column_names()
21
22 # Chunk the data
23 173.2 MiB 0.0 MiB 1 df_protocol.num_chunks()
24 173.2 MiB 0.0 MiB 1 df_protocol.get_chunks(33)
25 175.8 MiB 2.6 MiB 1 next(df_protocol.get_chunks(33)).num_rows()
26
27 # Select a subset of columns
28 175.8 MiB 0.0 MiB 1 df_protocol.select_columns_by_name(['x', 'y']).num_rows()
29
30 # Read in the virtual column
31 175.8 MiB 0.0 MiB 1 column = df_protocol.__dataframe__().get_column_by_name("r")
32 175.8 MiB 0.0 MiB 1 column.size()
33
34 # Only when actually asking for the buffers of one chunk of a column,
35 # the data needs to be in memory (to pass a pointer to the buffers)
36 187.8 MiB 12.1 MiB 1 column.get_buffers()
>>> import numpy as np
>>> import vaex

>>> # Load Vaex example
>>> df = vaex.example()
>>> # Create a virtual column
>>> df.add_virtual_column("r", "sqrt(x**2 + y**2 + z**2)")

>>> # Create a __dataframe__ instance
>>> df_protocol = df.__dataframe__()

>>> # Inspecting the metadata and selecting columns does not yet
>>> # materialize all the buffers
>>> df_protocol.num_columns()
11
>>> df_protocol.column_names()
['id', 'x', 'y', 'z', 'vx', 'vy', 'vz', 'E', 'L', 'Lz', 'FeH', 'r']

>>> # Chunk the data
>>> df_protocol.num_chunks()
1
>>> df_protocol.get_chunks(33)
<generator object _VaexDataFrame.get_chunks at 0x16c1703c0>
>>> next(df_protocol.get_chunks(33)).num_rows()
10000
>>> # Select a subset of columns
>>> df_protocol.select_columns_by_name(['x', 'y']).num_rows()
330000

>>> # Read in the virtual column
>>> column = df_protocol.__dataframe__().get_column_by_name("r")
>>> column.size()
330000

>>> # Only when actually asking for the buffers of one chunk of a column,
>>> # the data needs to be in memory (to pass a pointer to the buffers)
>>> column.get_buffers()
{'data': (VaexBuffer({'bufsize': 1320000, 'ptr': 5236260864, 'device': 'CPU'}), (<_DtypeKind.FLOAT: 2>, 32, '<f4', '=')), 'validity': None, 'offsets': None}
from memory_profiler import profile
import numpy as np
import vaex
@profile
def my_func():
    """Step through the Vaex dataframe interchange protocol, one call per line.

    Every call is kept on its own line, in a fixed order, so that a
    line-by-line memory report can attribute any growth to a single
    protocol call. None of the return values are used.
    """
    # Load the Vaex example dataset.
    frame = vaex.example()
    # Add "r" as a virtual column defined by an expression.
    frame.add_virtual_column("r", "sqrt(x**2 + y**2 + z**2)")

    # Build the __dataframe__ (interchange protocol) instance.
    interchange = frame.__dataframe__()

    # Metadata inspection and column selection are not expected to
    # materialize all the buffers yet.
    interchange.num_columns()
    interchange.column_names()

    # Chunking: count the chunks, create a chunk generator, then pull
    # the first chunk out of a fresh generator and ask for its row count.
    interchange.num_chunks()
    interchange.get_chunks(33)
    next(interchange.get_chunks(33)).num_rows()

    # Restrict to a subset of the columns.
    interchange.select_columns_by_name(['x', 'y']).num_rows()

    # Fetch the virtual column through the protocol.
    r_column = interchange.__dataframe__().get_column_by_name("r")
    r_column.size()

    # The data only has to be in memory once the buffers of a column
    # chunk are requested (a pointer to the buffers must be handed out).
    r_column.get_buffers()
# Run the profiled walkthrough only when executed as a script, not on import.
if __name__ == "__main__":
    my_func()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment