Created
July 19, 2016 14:06
-
-
Save spitz-dan-l/ceb765c7e466781efce5bdbd41e9efa4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from chatto_transform.schema.schema_base import cat | |
from libc.string cimport memcpy | |
from cpython cimport array | |
import array | |
import numpy as np | |
cimport numpy as np | |
import pandas as pd | |
from libc.stdlib cimport malloc, free | |
cdef inline void read_reversed2(char *target, char *src, int *pos):
    """Copy two bytes from ``src`` into ``target`` in reversed order.

    The bytes are taken from ``src`` starting at offset ``pos[0]``; the
    offset stored in ``pos`` is then advanced by two. Reversing the byte
    order turns a big-endian 16-bit value into its native representation
    on a little-endian host.
    """
    cdef int start = pos[0]
    cdef char *window = src + start
    target[0] = window[1]
    target[1] = window[0]
    pos[0] = start + 2
cdef inline void read_reversed4(char *target, char *src, int *pos):
    """Copy four bytes from ``src`` into ``target`` in reversed order.

    The bytes are taken from ``src`` starting at offset ``pos[0]``; the
    offset stored in ``pos`` is then advanced by four. Reversing the byte
    order turns a big-endian 32-bit value into its native representation
    on a little-endian host.
    """
    cdef int start = pos[0]
    cdef char *window = src + start
    target[0] = window[3]
    target[1] = window[2]
    target[2] = window[1]
    target[3] = window[0]
    pos[0] = start + 4
def read_binary_df(char[:] f, object schema):
    """Split a PostgreSQL binary ``COPY`` stream into per-column raw buffers.

    The stream is walked tuple by tuple: each tuple starts with a 16-bit
    field count, each field with a 32-bit byte length (``-1`` marks NULL),
    and a field count of ``-1`` terminates the stream. Field payloads are
    not interpreted here; they are only appended to the buffer of the
    column they belong to.

    Parameters
    ----------
    f : char[:]
        The complete dump. The first ``len(signature) + 8`` bytes (19 in
        total) are skipped without being inspected, so a non-empty header
        extension area is not supported.
    schema : object
        Must expose ``cols``, a sequence with one entry per column. Entries
        that are instances of ``cat`` get a length prefix written in front
        of every non-NULL field so the fields can be separated again later.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(schema.cols), 3)``. ``[i, 0]`` is a
        ``dtype='c'`` array with the concatenated bytes of column ``i``'s
        non-NULL fields (for ``cat`` columns each field is preceded by its
        4-byte length in native byte order). ``[i, 1]`` is a ``bool`` array
        with one entry per field read for column ``i``, true where the
        field was NULL. ``[i, 2]`` is not assigned by this function.

    Raises
    ------
    ValueError
        If the input is larger than a C ``int`` can index, ends before the
        ``-1`` trailer, or declares a field length that is negative (other
        than ``-1``) or runs past the end of the input.
    """
    cdef int pos, i, schema_cols
    cdef Py_ssize_t total
    cdef short column_count
    cdef int field_size
    cdef char is_field_null
    cdef array.array col_buf, null_buf
    cdef np.ndarray[dtype=object, ndim=2] col_buffers
    cdef np.ndarray[dtype=int, ndim=1] cat_cols

    head = b'PGCOPY\n\377\r\n\0'
    # Skip the signature plus the 8 header bytes that follow it.
    pos = len(head) + 8
    total = f.shape[0]
    schema_cols = len(schema.cols)

    # ``pos`` is a C int; refuse inputs it cannot address instead of
    # silently wrapping around.
    if total > 2147483647:
        raise ValueError("binary COPY data is too large (2 GiB limit)")

    col_buffers = np.empty((schema_cols, 3), dtype=np.object_)
    cat_cols = np.zeros(schema_cols, dtype='i')
    for i in range(schema_cols):
        col_buffers[i, 0] = array.array('b') #this is the raw bytes buffer
        col_buffers[i, 1] = array.array('b') #this is the null indicator
        cat_cols[i] = isinstance(schema.cols[i], cat)

    # The read_reversed* helpers work on a raw pointer and perform no
    # bounds checks of their own, so every read is validated here first.
    if total - pos < 2:
        raise ValueError(
            "truncated binary COPY data: no tuple header at offset %d" % pos)
    read_reversed2(<char *> &column_count, &f[0], &pos)
    while column_count != -1:
        for i in range(column_count):
            if total - pos < 4:
                raise ValueError(
                    "truncated binary COPY data: no field length at "
                    "offset %d" % pos)
            read_reversed4(<char *> &field_size, &f[0], &pos)
            null_buf = <array.array>col_buffers[i, 1]
            col_buf = <array.array>col_buffers[i, 0]
            if field_size == -1: #it's null
                # NULL fields carry no payload; only the indicator is kept.
                is_field_null = 1
            else:
                if field_size < 0 or field_size > total - pos:
                    raise ValueError(
                        "invalid field length %d at offset %d"
                        % (field_size, pos))
                is_field_null = 0
                if cat_cols[i]:
                    # Length prefix (native byte order) for later splitting.
                    array.extend_buffer(col_buf, <char *>&field_size, 4)
                if field_size > 0:
                    # Guarded so that an empty field sitting at the very
                    # end of the input never indexes f[total].
                    array.extend_buffer(col_buf, &f[pos], field_size)
                    pos += field_size
            array.extend_buffer(null_buf, &is_field_null, 1)
        if total - pos < 2:
            raise ValueError(
                "truncated binary COPY data: no tuple header or trailer at "
                "offset %d" % pos)
        read_reversed2(<char *> &column_count, &f[0], &pos)

    for i in range(schema_cols):
        col_buffers[i, 0] = np.frombuffer(col_buffers[i, 0], dtype='c') #this is the raw bytes buffer
        col_buffers[i, 1] = np.frombuffer(col_buffers[i, 1], dtype='bool') #this is the null indicator
    return col_buffers
def parse_cat_col(np.ndarray[dtype=char, ndim=1] values, int arr_size):
    """Decode a buffer of length-prefixed UTF-8 fields into an object array.

    ``values`` is read as ``arr_size`` consecutive records, each made of a
    4-byte length in native byte order followed by that many bytes of
    UTF-8 text.

    Parameters
    ----------
    values : numpy.ndarray
        One-dimensional byte buffer holding the records back to back.
        Payloads are read through a raw pointer, so the array is assumed
        to be contiguous.
    arr_size : int
        Number of records to decode.

    Returns
    -------
    numpy.ndarray
        Object array of length ``arr_size`` containing the decoded strings.

    Raises
    ------
    ValueError
        If a length prefix or payload would extend past the end of
        ``values``, or a length prefix is negative.
    UnicodeDecodeError
        If a payload is not valid UTF-8.
    """
    cdef int i, field_size, pos = 0
    cdef Py_ssize_t total = values.shape[0]
    cdef np.ndarray[dtype=object, ndim=1] fields
    cdef char *field
    fields = np.empty(arr_size, dtype=np.object_)
    for i in range(arr_size):
        if total - pos < 4:
            raise ValueError(
                "record %d: no length prefix at offset %d" % (i, pos))
        memcpy(<char *>&field_size, &values[pos], 4)
        pos += 4
        if field_size < 0 or field_size > total - pos:
            raise ValueError(
                "record %d: invalid length %d at offset %d"
                % (i, field_size, pos))
        if field_size == 0:
            # An empty field may be the last thing in the buffer, in which
            # case values[pos] would be one past the end; skip the lookup.
            fields[i] = u''
            continue
        field = &values[pos]
        fields[i] = field[:field_size].decode('utf-8')
        pos += field_size
    return fields
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment