# spitz-dan-l/psql_binary_parser.pyx

## psql_binary_parser.pyx
from chatto_transform.schema.schema_base import cat

from libc.string cimport memcpy
from cpython cimport array
import array

import numpy as np
cimport numpy as np
import pandas as pd
from libc.stdlib cimport malloc, free

cdef inline void read_reversed2(char *target, char *src, int *pos):
    # Copy the two bytes found at offset pos[0] of src into target with their
    # order swapped, then move the caller's offset forward by two.
    cdef char *cursor = src + pos[0]
    target[0] = cursor[1]
    target[1] = cursor[0]
    pos[0] += 2

cdef inline void read_reversed4(char *target, char *src, int *pos):
    # Copy the four bytes found at offset pos[0] of src into target in
    # reverse order, then move the caller's offset forward by four.
    cdef char *cursor = src + pos[0]
    cdef int k
    for k in range(4):
        target[k] = cursor[3 - k]
    pos[0] += 4

def read_binary_df(char[:] f, object schema):
    """Split a PostgreSQL binary COPY payload into per-column byte buffers.

    Parameters
    ----------
    f : writable 1-D char buffer
        The whole payload, starting at the ``PGCOPY`` signature. The
        signature bytes themselves are skipped, not validated.
    schema : object
        Must expose ``cols``, a sequence with one entry per column. For
        columns whose entry is an instance of ``cat``, every non-null value
        is stored with a 4-byte length prefix (host byte order) in front of
        its bytes, which is the layout ``parse_cat_col`` reads back.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(schema.cols), 3)``. ``[i, 0]`` is the
        concatenated raw bytes of the non-null values of column ``i``
        (dtype ``'c'``), ``[i, 1]`` holds one bool per row that is true
        where the field was null. ``[i, 2]`` is left unassigned here.

    Raises
    ------
    ValueError
        If the payload ends before the header, a field or the end-of-data
        marker is complete, or if a length field is negative (other than
        the -1 that marks a null).

    Notes
    -----
    The integers in the payload are byte-swapped while being read, so the
    decoded values are only meaningful on a little-endian host. Offsets are
    tracked in a C ``int``, matching the helper signatures.
    """
    head = b'PGCOPY\n\377\r\n\0'
    # Start just past the signature and the 32-bit flags field.
    cdef int pos = len(head) + 4
    cdef Py_ssize_t total = f.shape[0]

    cdef int i
    cdef int ext_len
    cdef short column_count
    cdef int field_size
    cdef char *src

    cdef np.ndarray[dtype=object, ndim=2] col_buffers
    cdef np.ndarray[dtype=int, ndim=1] cat_cols
    cdef int schema_cols = len(schema.cols)

    cdef array.array col_buf, null_buf
    cdef char is_field_null

    col_buffers = np.empty((schema_cols, 3), dtype=np.object_)
    cat_cols = np.zeros(schema_cols, dtype='i')
    for i in range(schema_cols):
        col_buffers[i, 0] = array.array('b') #this is the raw bytes buffer
        col_buffers[i, 1] = array.array('b') #this is the null indicator

        cat_cols[i] = isinstance(schema.cols[i], cat)

    # The helpers below read through a raw pointer without any bounds
    # checking, so every read is validated against `total` first.
    if total - pos < 4:
        raise ValueError("truncated PGCOPY data: incomplete header")
    src = &f[0]

    # The header ends with a 32-bit "extension area length" followed by that
    # many bytes; skip the extension instead of assuming it is empty.
    read_reversed4(<char *> &ext_len, src, &pos)
    if ext_len < 0 or ext_len > total - pos:
        raise ValueError("invalid PGCOPY header extension length")
    pos += ext_len

    if total - pos < 2:
        raise ValueError("truncated PGCOPY data: missing tuple header")
    read_reversed2(<char *> &column_count, src, &pos)

    # A field count of -1 is the end-of-data marker.
    while column_count != -1:
        for i in range(column_count):
            if total - pos < 4:
                raise ValueError("truncated PGCOPY data: missing field length")
            read_reversed4(<char *> &field_size, src, &pos)

            null_buf = <array.array>col_buffers[i, 1]
            col_buf = <array.array>col_buffers[i, 0]

            if field_size == -1: #it's null
                is_field_null = 1
            else:
                if field_size < 0 or field_size > total - pos:
                    raise ValueError("invalid PGCOPY field length")
                is_field_null = 0
                if cat_cols[i]:
                    # Variable-length values carry their own length prefix.
                    array.extend_buffer(col_buf, <char *>&field_size, 4)
                if field_size > 0:
                    # Guarded so an empty field never takes &f[pos].
                    array.extend_buffer(col_buf, &f[pos], field_size)
                    pos += field_size

            array.extend_buffer(null_buf, &is_field_null, 1)

        if total - pos < 2:
            raise ValueError("truncated PGCOPY data: missing tuple header")
        read_reversed2(<char *> &column_count, src, &pos)

    for i in range(schema_cols):
        col_buffers[i, 0] = np.frombuffer(col_buffers[i, 0], dtype='c') #this is the raw bytes buffer
        col_buffers[i, 1] = np.frombuffer(col_buffers[i, 1], dtype='bool') #this is the null indicator

    return col_buffers

def parse_cat_col(np.ndarray[dtype=char, ndim=1] values, int arr_size):
    """Decode a buffer of length-prefixed UTF-8 values into Python strings.

    Parameters
    ----------
    values : 1-D char array
        Back-to-back records, each a 4-byte length (copied verbatim into a
        C ``int``, i.e. host byte order) followed by that many bytes.
    arr_size : int
        Number of records to decode.

    Returns
    -------
    numpy.ndarray
        Object array of length ``arr_size`` holding the decoded strings.

    Raises
    ------
    ValueError
        If ``values`` ends before ``arr_size`` records have been read, or a
        record declares a negative length.
    UnicodeDecodeError
        If a record's bytes are not valid UTF-8.
    """
    cdef int i, field_size, pos = 0
    cdef Py_ssize_t total = values.shape[0]
    cdef np.ndarray[dtype=object, ndim=1] fields
    cdef char *field

    fields = np.empty(arr_size, dtype=np.object_)

    for i in range(arr_size):
        # memcpy reads four bytes, so make sure all four are really there.
        if total - pos < 4:
            raise ValueError("truncated categorical buffer: missing length prefix")
        memcpy(<char *>&field_size, &values[pos], 4)
        pos += 4

        if field_size < 0 or field_size > total - pos:
            raise ValueError("invalid length prefix in categorical buffer")

        if field_size == 0:
            # An empty value may sit at the very end of the buffer, where
            # taking &values[pos] would index one past the last element.
            fields[i] = u''
            continue

        field = &values[pos]
        fields[i] = field[:field_size].decode('utf-8')
        pos += field_size

    return fields
	from chatto_transform.schema.schema_base import cat

	from libc.string cimport memcpy
	from cpython cimport array
	import array

	import numpy as np
	cimport numpy as np
	import pandas as pd
	from libc.stdlib cimport malloc, free

	# NOTE(review): this tab-indented span repeats read_reversed2 from earlier in
	# this file with the body indentation flattened, and `target`/`src` are
	# declared as plain `char` here although the body indexes them -- presumably
	# an extraction artifact; confirm before relying on this copy.
	cdef inline void read_reversed2(char target, char src, int *pos):
	cdef int pos_val = pos[0]
	target[0] = src[1+pos_val]
	target[1] = src[pos_val]
	pos[0] = pos_val+2

	# NOTE(review): this tab-indented span repeats read_reversed4 from earlier in
	# this file with the body indentation flattened, and `target`/`src` are
	# declared as plain `char` here although the body indexes them -- presumably
	# an extraction artifact; confirm before relying on this copy.
	cdef inline void read_reversed4(char target, char src, int *pos):
	cdef int pos_val = pos[0]
	target[0] = src[3+pos_val]
	target[1] = src[2+pos_val]
	target[2] = src[1+pos_val]
	target[3] = src[pos_val]
	pos[0] = pos_val+4

	# NOTE(review): this tab-indented span repeats read_binary_df from earlier in
	# this file with every line at the same indent, so the nesting of the
	# for/while/if bodies can no longer be read from the text -- presumably an
	# extraction artifact; confirm before relying on this copy.
	def read_binary_df(char[:] f, object schema):
	head = b'PGCOPY\n\377\r\n\0'
	cdef int pos = len(head) + 8

	cdef int i

	cdef np.ndarray[dtype=object, ndim=2] col_buffers
	cdef np.ndarray[dtype=int, ndim=1] cat_cols
	cdef int schema_cols = len(schema.cols)

	col_buffers = np.empty((schema_cols, 3), dtype=np.object_)
	cat_cols = np.zeros(schema_cols, dtype='i')
	for i in range(schema_cols):
	col_buffers[i, 0] = array.array('b') #this is the raw bytes buffer
	col_buffers[i, 1] = array.array('b') #this is the null indicator

	cat_cols[i] = isinstance(schema.cols[i], cat)

	cdef short column_count
	cdef int field_size

	cdef array.array col_buf, null_buf
	cdef char is_field_null
	cdef int is_field_cat

	read_reversed2(<char *> &column_count, &f[0], &pos)

	while column_count != -1:
	for i in range(column_count):
	read_reversed4(<char *> &field_size, &f[0], &pos)

	null_buf = <array.array>col_buffers[i, 1]
	col_buf = <array.array>col_buffers[i, 0]


	if field_size == -1: #it's null
	is_field_null = 1
	else:
	is_field_null = 0
	if cat_cols[i]:
	array.extend_buffer(col_buf, <char *>&field_size, 4)
	array.extend_buffer(col_buf, &f[pos], field_size); pos += field_size

	array.extend_buffer(null_buf, &is_field_null, 1)

	read_reversed2(<char *> &column_count, &f[0], &pos)

	for i in range(schema_cols):
	col_buffers[i, 0] = np.frombuffer(col_buffers[i, 0], dtype='c') #this is the raw bytes buffer
	col_buffers[i, 1] = np.frombuffer(col_buffers[i, 1], dtype='bool') #this is the null indicator

	return col_buffers

	# NOTE(review): this tab-indented span repeats parse_cat_col from earlier in
	# this file with the function and loop bodies flattened to the same indent
	# as the `def` line -- presumably an extraction artifact; confirm before
	# relying on this copy.
	def parse_cat_col(np.ndarray[dtype=char, ndim=1] values, int arr_size):
	cdef int i, field_size, pos = 0
	cdef np.ndarray[dtype=object, ndim=1] fields
	cdef char *field

	fields = np.empty(arr_size, dtype=np.object_)

	for i in range(arr_size):
	memcpy(<char *>&field_size, &values[pos], 4); pos += 4
	field = &values[pos]
	fields[i] = field[:field_size].decode('utf-8')
	pos += field_size

	return fields