Skip to content

Instantly share code, notes, and snippets.

@bowin
Last active April 16, 2021 06:07
Show Gist options
  • Save bowin/7426980833dd8e98bdcee51c9b89e57b to your computer and use it in GitHub Desktop.
Save bowin/7426980833dd8e98bdcee51c9b89e57b to your computer and use it in GitHub Desktop.
bitmap 索引 protobuf 数据结构

先用bitmap来标记哪些field存在,每个field只占1bit,按1000个field来算只需要125个字节的开销,然后存在的field记录偏移,每个偏移4个字节。还可以采用inline value的方式进一步优化空间,低于4字节的field可以将value直接记录在偏移字段,不需要额外的偏移存储。

按照现在的物料数据进行测试:===>新数据格式空间大小=>inline优化后的空间大小(inline优化比率) 从数据可以看出,能够将单条物料控制在5K大小,inline的能够平均优化5%空间,1000万物料大概50G内存。

package com.weibo.hotwb.inspect;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static java.nio.charset.StandardCharsets.UTF_8;

public class DataStruct {
    // 内存结构 start
    private final int maxBitMapSize;
    private final long[] bitmaps;
    private final int[] bitmapCounts;
    private final List<byte[]> o_array; // 4byte data
    private final List<Byte> v_array;   // 1Byte flag
    private final List<byte[]> largerBinary;
    // 内存结构 end


    private final transient ByteArrayOutputStream stream;  // binary
    private transient byte[] finalBinary;

    public DataStruct(final int cap) {
        maxBitMapSize = cap >> 6;
        bitmaps = new long[maxBitMapSize];
        bitmapCounts = new int[maxBitMapSize - 1];
        o_array = new ArrayList<>();
        v_array = new ArrayList<>();
        largerBinary = new ArrayList<>();

        stream = new ByteArrayOutputStream(1024 * 8);

    }

    public void appendWith4Bytes(final int f, final int d) {
        int nthBitmap = f >> 6;
        bitmaps[nthBitmap] |= 1L << (f & 0x3f);
        v_array.add((byte) 0x0);
        byte[] a = int2Bytes(d);
        o_array.add(a);
        largerBinary.add(new byte[]{0x0});
    }

    public void appendExcess4Bytes(final int f, final String d) {
        int nthBitmap = f >> 6;
        bitmaps[nthBitmap] |= 1L << (f & 0x3f);
        v_array.add((byte) 0x1);
        byte[] b = d.getBytes(UTF_8);
        byte[] ll = int2Bytes(stream.size());
        o_array.add(ll);
        largerBinary.add(b);
        stream.write(b, 0, b.length);
    }

    public byte[] get(final int f) {
        int nthBitmap = f >> 6;
        if ((bitmaps[nthBitmap] & (1L << f & 0x3f)) == 0L) {
            return null;
        }
        int nthItem = Long.bitCount(bitmaps[nthBitmap] & (1L << (f & 0x3f) - 1));
        if (nthBitmap > 0) {
            nthItem += bitmapCounts[nthBitmap - 1];
        }
        byte[] bytes = o_array.get(nthItem);
        int x = bytes2Int(bytes);
        byte b = v_array.get(nthItem);
        int i = 0;
        return bytes;

    }

    public byte[] buildBinary() throws IOException {
        // aflter flush
        stream.flush();
        int totalStreamSize = stream.size();
        stream.close();
        // fill bitmapCounts
        int bitmaps_xp = 0;
        for (int i = 0; i < maxBitMapSize; ++i) {
            int c = Long.bitCount(bitmaps[i]);
            if (i > 0) {
                bitmapCounts[i - 1] = bitmaps_xp;
            }
            bitmaps_xp += c;
        }
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream(1024 * 8);
        outputStream.write(int2Bytes(bitmaps.length * 8 + bitmapCounts.length * 4 + v_array.size() + o_array.size() * 4 + totalStreamSize));
        outputStream.write(int2Bytes(bitmaps_xp));
        for (int i = 0; i < bitmaps.length; ++i) {
            outputStream.write(long2Bytes(bitmaps[i]));
        }
        for (int i = 0; i < bitmapCounts.length; ++i) {
            outputStream.write(int2Bytes(bitmapCounts[i]));
        }
        outputStream.flush();

        final int offset = outputStream.size();
        final int oArrayBytes = o_array.size() * 4;
        for (int i = 0; i < o_array.size(); i++) {
            if (v_array.get(i) == 0x1) {
                int off = bytes2Int(o_array.get(i));
                o_array.set(i, int2Bytes(offset + off + oArrayBytes));
            }
        }
        for (int i = 0; i < o_array.size(); i++) {
            outputStream.write(o_array.get(i), 0, 4);
        }
        if (totalStreamSize > 0) {
            byte[] extra = stream.toByteArray();
            outputStream.write(extra, 0, extra.length);
        }
        outputStream.flush();
        finalBinary = outputStream.toByteArray();
        return finalBinary;

    }

    // singed
    static byte[] int2Bytes(int d) {
        byte[] a = new byte[4];
        a[0] = (byte) d;
        a[1] = (byte) (d >> 8);
        a[2] = (byte) (d >> 16);
        a[3] = (byte) (d >> 24);
        return a;
    }

    static int bytes2Int(byte[] d) {
        int a = 0;
        for (int i = 0; i < 4; i++) {
            // signed byte to unsigned
            int v = (d[i] & 0xff);
            a |= v << (i * 8);
        }
        return a;
    }

    static byte[] long2Bytes(long d) {
        byte[] a = new byte[8];
        a[0] = (byte) d;
        a[1] = (byte) (d >> 8);
        a[2] = (byte) (d >> 16);
        a[3] = (byte) (d >> 24);
        a[4] = (byte) (d >> 32);
        a[5] = (byte) (d >> 40);
        a[6] = (byte) (d >> 48);
        a[7] = (byte) (d >> 56);
        return a;
    }


    public static void main(String[] args) throws IOException {
        DataStruct dataStruct = new DataStruct(1024);
        dataStruct.appendWith4Bytes(1, 4);
        dataStruct.appendExcess4Bytes(2, "sssss");
        byte[] out = dataStruct.buildBinary();
        dataStruct.get(2);
        int j = 0;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment