Skip to content

Instantly share code, notes, and snippets.

@spitz-dan-l
Created July 19, 2016 14:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save spitz-dan-l/ceb765c7e466781efce5bdbd41e9efa4 to your computer and use it in GitHub Desktop.
Save spitz-dan-l/ceb765c7e466781efce5bdbd41e9efa4 to your computer and use it in GitHub Desktop.
from chatto_transform.schema.schema_base import cat
from libc.string cimport memcpy
from cpython cimport array
import array
import numpy as np
cimport numpy as np
import pandas as pd
from libc.stdlib cimport malloc, free
cdef inline void read_reversed2(char *target, char *src, int *pos):
    """Copy 2 bytes from ``src`` into ``target`` in reversed byte order.

    The bytes are taken from ``src[pos[0]]`` and ``src[pos[0] + 1]``; on
    return ``pos[0]`` has been advanced by 2.  No bounds checking is done
    here, so the caller must guarantee both source bytes exist.

    NOTE(review): the swap is unconditional, so decoding big-endian input
    this way presumably assumes a little-endian host -- confirm.
    """
    cdef int start = pos[0]
    # Last source byte goes first, first source byte goes last.
    target[0] = src[start + 1]
    target[1] = src[start]
    pos[0] = start + 2
cdef inline void read_reversed4(char *target, char *src, int *pos):
    """Copy 4 bytes from ``src`` into ``target`` in reversed byte order.

    The bytes are taken from ``src[pos[0]] .. src[pos[0] + 3]``; on return
    ``pos[0]`` has been advanced by 4.  No bounds checking is done here, so
    the caller must guarantee all four source bytes exist.

    NOTE(review): the swap is unconditional, so decoding big-endian input
    this way presumably assumes a little-endian host -- confirm.
    """
    cdef int start = pos[0]
    cdef int k
    # target[0] <- src[start+3], target[1] <- src[start+2], ...
    for k in range(4):
        target[k] = src[start + 3 - k]
    pos[0] = start + 4
def read_binary_df(char[:] f, object schema):
    """Split a PGCOPY binary dump into per-column byte and null buffers.

    Parameters
    ----------
    f : char[:]
        The complete dump.  It is accessed through a raw ``char *`` taken
        from ``&f[0]``, so the memory must be contiguous.
    schema : object
        Must expose ``cols``, a sequence with one entry per column.  A
        column whose entry is an instance of ``cat`` is treated as a
        categorical column.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(schema.cols), 3)``.  For column ``i``:

        * ``[i, 0]`` -- ``dtype='c'`` array holding the concatenated bytes
          of every non-null field.  For ``cat`` columns each field is
          preceded by its 4-byte length in native byte order; for other
          columns the field bytes are copied exactly as they appear in the
          input.
        * ``[i, 1]`` -- ``dtype='bool'`` array with one entry per row,
          true where the field was null.
        * ``[i, 2]`` -- not filled in by this function.

    Raises
    ------
    ValueError
        If the input ends before a complete tuple header, field length or
        field body could be read, or a field length is negative (other
        than the -1 null marker).
    IndexError
        If ``f`` is empty, or a tuple declares more fields than the schema
        has columns (assuming Cython's default bounds checking).

    Notes
    -----
    The header is skipped as a fixed 19 bytes (signature, then 8 more
    bytes); the signature is not verified.
    NOTE(review): the 8 skipped bytes are presumably the flags field and
    the header-extension length, with the extension area assumed to be
    empty -- confirm against the PostgreSQL COPY BINARY documentation.
    """
    head = b'PGCOPY\n\377\r\n\0'  # only its length is used
    cdef int pos = len(head) + 8
    cdef int i
    cdef np.ndarray[dtype=object, ndim=2] col_buffers
    cdef np.ndarray[dtype=int, ndim=1] cat_cols
    cdef int schema_cols = len(schema.cols)
    cdef Py_ssize_t n_bytes = f.shape[0]
    cdef char *src
    cdef short column_count
    cdef int field_size
    cdef array.array col_buf, null_buf
    cdef char is_field_null

    col_buffers = np.empty((schema_cols, 3), dtype=np.object_)
    cat_cols = np.zeros(schema_cols, dtype='i')
    for i in range(schema_cols):
        col_buffers[i, 0] = array.array('b') #this is the raw bytes buffer
        col_buffers[i, 1] = array.array('b') #this is the null indicator
        cat_cols[i] = isinstance(schema.cols[i], cat)

    # Taken once instead of on every read; still raises IndexError for an
    # empty input exactly as the per-read ``&f[0]`` did.
    src = &f[0]

    # The read_reversed* helpers do no bounds checking, so every read is
    # preceded by an explicit check against the input length.
    if n_bytes - pos < 2:
        raise ValueError(
            'truncated PGCOPY data: no tuple header at offset %d' % pos)
    read_reversed2(<char *> &column_count, src, &pos)
    while column_count != -1:  # a field count of -1 is the file trailer
        for i in range(column_count):
            if n_bytes - pos < 4:
                raise ValueError(
                    'truncated PGCOPY data: no field length at offset %d'
                    % pos)
            read_reversed4(<char *> &field_size, src, &pos)
            null_buf = <array.array>col_buffers[i, 1]
            col_buf = <array.array>col_buffers[i, 0]
            if field_size == -1: #it's null
                is_field_null = 1
            else:
                # Reject lengths that would make extend_buffer shrink the
                # array or copy from beyond the end of the input.
                if field_size < 0 or field_size > n_bytes - pos:
                    raise ValueError(
                        'invalid PGCOPY field length %d at offset %d'
                        % (field_size, pos))
                is_field_null = 0
                if cat_cols[i]:
                    # Length prefix (native byte order) lets parse_cat_col
                    # find the boundaries of each variable-length value.
                    array.extend_buffer(col_buf, <char *>&field_size, 4)
                array.extend_buffer(col_buf, &f[pos], field_size)
                pos += field_size
            array.extend_buffer(null_buf, &is_field_null, 1)
        if n_bytes - pos < 2:
            raise ValueError(
                'truncated PGCOPY data: no tuple header at offset %d' % pos)
        read_reversed2(<char *> &column_count, src, &pos)

    for i in range(schema_cols):
        col_buffers[i, 0] = np.frombuffer(col_buffers[i, 0], dtype='c') #this is the raw bytes buffer
        col_buffers[i, 1] = np.frombuffer(col_buffers[i, 1], dtype='bool') #this is the null indicator
    return col_buffers
def parse_cat_col(np.ndarray[dtype=char, ndim=1] values, int arr_size):
    """Decode a buffer of length-prefixed UTF-8 values into an object array.

    Parameters
    ----------
    values : numpy.ndarray
        1-D byte buffer holding ``arr_size`` records back to back.  Each
        record is a 4-byte length in native byte order followed by that
        many bytes of UTF-8 text.  The bytes of a record are read through
        a raw pointer, so the array must be contiguous.
    arr_size : int
        Number of records to decode.

    Returns
    -------
    numpy.ndarray
        1-D object array of length ``arr_size`` containing the decoded
        strings.

    Raises
    ------
    ValueError
        If the buffer ends before a complete record could be read or a
        length prefix is negative.
    UnicodeDecodeError
        If a record is not valid UTF-8.
    """
    cdef int i, field_size, pos = 0
    cdef Py_ssize_t n_bytes = values.shape[0]
    cdef np.ndarray[dtype=object, ndim=1] fields
    cdef char *field

    fields = np.empty(arr_size, dtype=np.object_)
    for i in range(arr_size):
        # memcpy reads 4 bytes but indexing only validates the first one,
        # so check the whole prefix up front.
        if n_bytes - pos < 4:
            raise ValueError(
                'truncated buffer: no length prefix for value %d at offset %d'
                % (i, pos))
        memcpy(<char *>&field_size, &values[pos], 4); pos += 4
        if field_size < 0 or field_size > n_bytes - pos:
            raise ValueError(
                'invalid length %d for value %d at offset %d'
                % (field_size, i, pos))
        if field_size == 0:
            # An empty value occupies no bytes.  Taking &values[pos] here
            # would index one past the end when this is the last record
            # and fail the bounds check (on by default in Cython).
            fields[i] = u''
        else:
            field = &values[pos]
            fields[i] = field[:field_size].decode('utf-8')
            pos += field_size
    return fields
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment