adworacz/lut2.diff

## lut2.diff
diff --git doc/functions/lut2.rst doc/functions/lut2.rst
index 4aa2933..6ee4560 100644
--- doc/functions/lut2.rst
+++ doc/functions/lut2.rst
@@ -1,15 +1,25 @@
 Lut2
 =======

-.. function:: Lut2(clip[] clips, int[] lut, int[] planes)
+.. function:: Lut2(clip[] clips, int[] lut, int[] planes[, int bits])
    :module: std

    Applies a lut that takes the pixel values of two clips into account. The lut needs to contain 2^(clip1.bits_per_sample + clip2.bits_per_sample) entries and will be applied to the planes listed in *planes*. The other planes will simply be passed through unchanged.

+   Lut2 also takes an optional bit depth parameter, *bits*. *bits* defaults to the bit depth of the first input clip, and specifies the bit depth of the output clip. The user is responsible for understanding the effects of bit depth conversion, specifically from higher bit depths to lower bit depths, as no scaling or clamping is applied.
+
    How to average 2 clips::

       lut = []
-      for y in range(2**clipx.format.bits_per_sample):
-         for x in range(2**clipy.format.bits_per_sample):
+      for y in range(2 ** clipy.format.bits_per_sample):
+         for x in range(2 ** clipx.format.bits_per_sample):
             lut.append((x + y)//2)
       Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2])
+
+   How to average 2 clips with a 10-bit output::
+
+      lut = []
+      for y in range(2 ** clipy.format.bits_per_sample):
+         for x in range(2 ** clipx.format.bits_per_sample):
+            lut.append((x + y)//2)
+      Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
diff --git src/core/simplefilters.c src/core/simplefilters.c
index 3d5d494..4832f22 100644
--- src/core/simplefilters.c
+++ src/core/simplefilters.c
@@ -1997,13 +1997,26 @@ static void VS_CC lutCreate(const VSMap *in, VSMap *out, void *userData, VSCore
 typedef struct {
     VSNodeRef *node[2];
     const VSVideoInfo *vi[2];
+    VSVideoInfo *vi_out;
     void *lut;
     int process[3];
 } Lut2Data;

+#define LUT2_PROCESS(X_CAST, Y_CAST, DST_CAST) \
+    do { \
+        for (hl = 0; hl < h; hl++) { \
+            for (x = 0; x < w; x++) { \
+                ((DST_CAST *)dstp)[x] =  lut[(((Y_CAST *)srcpy)[x] << shift) + ((X_CAST *)srcpx)[x]]; \
+            } \
+            dstp += dst_stride; \
+            srcpx += srcx_stride; \
+            srcpy += srcy_stride; \
+        } \
+    } while(0)
+
 static void VS_CC lut2Init(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
     Lut2Data *d = (Lut2Data *) * instanceData;
-    vsapi->setVideoInfo(d->vi[0], 1, node);
+    vsapi->setVideoInfo(d->vi_out, 1, node);
     vsapi->clearMap(in);
 }

@@ -2017,7 +2030,7 @@ static const VSFrameRef *VS_CC lut2Getframe(int n, int activationReason, void **
         int plane;
         const VSFrameRef *srcx = vsapi->getFrameFilter(n, d->node[0], frameCtx);
         const VSFrameRef *srcy = vsapi->getFrameFilter(n, d->node[1], frameCtx);
-        const VSFormat *fi = vsapi->getFrameFormat(srcx);
+        const VSFormat *fi = d->vi_out->format;
         const int pl[] = {0, 1, 2};
         const VSFrameRef *fr[] = {d->process[0] ? 0 : srcx, d->process[1] ? 0 : srcx, d->process[2] ? 0 : srcx};
         VSFrameRef *dst = vsapi->newVideoFrame2(fi, vsapi->getFrameWidth(srcx, 0), vsapi->getFrameHeight(srcx, 0), fr, pl, srcx, core);
@@ -2025,13 +2038,14 @@ static const VSFrameRef *VS_CC lut2Getframe(int n, int activationReason, void **
         for (plane = 0; plane < fi->numPlanes; plane++) {
             const uint8_t *srcpx = vsapi->getReadPtr(srcx, plane);
             const uint8_t *srcpy = vsapi->getReadPtr(srcy, plane);
-            int src_stride = vsapi->getStride(srcx, plane);
+            int srcx_stride = vsapi->getStride(srcx, plane);
+            int srcy_stride = vsapi->getStride(srcy, plane);
             uint8_t *dstp = vsapi->getWritePtr(dst, plane);
             int dst_stride = vsapi->getStride(dst, plane);
             int h = vsapi->getFrameHeight(srcx, plane);

             if (d->process[plane]) {
-                int shift = fi->bitsPerSample;
+                int shift = d->vi[0]->format->bitsPerSample;
                 int hl;
                 int w = vsapi->getFrameWidth(srcx, plane);
                 int x;
@@ -2039,24 +2053,26 @@ static const VSFrameRef *VS_CC lut2Getframe(int n, int activationReason, void **
                 if (fi->bytesPerSample == 1) {
                     const uint8_t *lut = (uint8_t *)d->lut;

-                    for (hl = 0; hl < h; hl++) {
-                        for (x = 0; x < w; x++)
-                            dstp[x] =  lut[(srcpy[x] << shift) + srcpx[x]];
-
-                        dstp += dst_stride;
-                        srcpx += src_stride;
-                        srcpy += src_stride;
+                    if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample == 8) {
+                        LUT2_PROCESS(uint8_t, uint8_t, uint8_t);
+                    } else if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample > 8) {
+                        LUT2_PROCESS(uint8_t, uint16_t, uint8_t);
+                    } else if (d->vi[0]->format->bitsPerSample > 8 && d->vi[1]->format->bitsPerSample == 8) {
+                        LUT2_PROCESS(uint16_t, uint8_t, uint8_t);
+                    } else {
+                        LUT2_PROCESS(uint16_t, uint16_t, uint8_t);
                     }
                 } else {
                     const uint16_t *lut = (uint16_t *)d->lut;

-                    for (hl = 0; hl < h; hl++) {
-                        for (x = 0; x < w; x++)
-                            ((uint16_t *)dstp)[x] =  lut[(srcpy[x] << shift) + srcpx[x]];
-
-                        dstp += dst_stride;
-                        srcpx += src_stride;
-                        srcpy += src_stride;
+                    if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample == 8) {
+                        LUT2_PROCESS(uint8_t, uint8_t, uint16_t);
+                    } else if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample > 8) {
+                        LUT2_PROCESS(uint8_t, uint16_t, uint16_t);
+                    } else if (d->vi[0]->format->bitsPerSample > 8 && d->vi[1]->format->bitsPerSample == 8) {
+                        LUT2_PROCESS(uint16_t, uint8_t, uint16_t);
+                    } else {
+                        LUT2_PROCESS(uint16_t, uint16_t, uint16_t);
                     }
                 }
             }
@@ -2135,9 +2151,26 @@ static void VS_CC lut2Create(const VSMap *in, VSMap *out, void *userData, VSCore
         RETERROR("Lut2: bad lut length");
     }

-    d.lut = malloc(d.vi[0]->format->bytesPerSample * n);
+    int err;
+    int bits = int64ToIntS(vsapi->propGetInt(in, "bits", 0, &err));
+    if (bits == 0) {
+        bits = d.vi[0]->format->bitsPerSample;
+    } else if (bits < 8 || bits > 16) {
+        vsapi->freeNode(d.node[0]);
+        vsapi->freeNode(d.node[1]);
+        RETERROR("Lut2: Output format must be between 8 and 16 bits.");
+    }
+
+    d.vi_out = (VSVideoInfo *)malloc(sizeof(VSVideoInfo));
+    *d.vi_out = *d.vi[0];
+    d.vi_out->format = vsapi->registerFormat(d.vi[0]->format->colorFamily, d.vi[0]->format->sampleType, bits, d.vi[0]->format->subSamplingW, d.vi[0]->format->subSamplingH, core);
+
+    if (bits == 8)
+        d.lut = malloc(sizeof(uint8_t) * n);
+    else
+        d.lut = malloc(sizeof(uint16_t) * n);

-    if (d.vi[0]->format->bytesPerSample == 1) {
+    if (bits == 8) {
         uint8_t *lut = d.lut;

         for (i = 0; i < n; i++) {
@@ -3400,7 +3433,7 @@ void VS_CC stdlibInitialize(VSConfigPlugin configFunc, VSRegisterFunction regist
     registerFunc("BlankClip", "clip:clip:opt;width:int:opt;height:int:opt;format:int:opt;length:int:opt;fpsnum:int:opt;fpsden:int:opt;color:float[]:opt;", blankClipCreate, 0, plugin);
     registerFunc("AssumeFPS", "clip:clip;src:clip:opt;fpsnum:int:opt;fpsden:int:opt;", assumeFPSCreate, 0, plugin);
     registerFunc("Lut", "clip:clip;lut:int[];planes:int[];", lutCreate, 0, plugin);
-    registerFunc("Lut2", "clips:clip[];lut:int[];planes:int[];", lut2Create, 0, plugin);
+    registerFunc("Lut2", "clips:clip[];lut:int[];planes:int[];bits:int:opt;", lut2Create, 0, plugin);
     registerFunc("SelectClip", "clips:clip[];src:clip[];selector:func;", selectClipCreate, 0, plugin);
     registerFunc("ModifyFrame", "clips:clip[];selector:func;", modifyFrameCreate, 0, plugin);
     registerFunc("Transpose", "clip:clip;", transposeCreate, 0, plugin);
diff --git src/core/vsapi.cpp src/core/vsapi.cpp
index de3a5c7..e649f9c 100644
--- src/core/vsapi.cpp
+++ src/core/vsapi.cpp
@@ -33,8 +33,8 @@ static const VSFormat *VS_CC getFormatPreset(int id, VSCore *core) {
     return core->getFormatPreset((VSPresetFormat)id);
 }

-static const VSFormat *VS_CC registerFormat(int colorFamily, int sampleType, int bytesPerSample, int subSamplingW, int subSamplingH, VSCore *core) {
-    return core->registerFormat((VSColorFamily)colorFamily, (VSSampleType)sampleType, bytesPerSample, subSamplingW, subSamplingH);
+static const VSFormat *VS_CC registerFormat(int colorFamily, int sampleType, int bitsPerSample, int subSamplingW, int subSamplingH, VSCore *core) {
+    return core->registerFormat((VSColorFamily)colorFamily, (VSSampleType)sampleType, bitsPerSample, subSamplingW, subSamplingH);
 }

 static const VSFrameRef *VS_CC cloneFrameRef(const VSFrameRef *frame) {
diff --git test/filter_test.py test/filter_test.py
index 58cc9f4..7256d5d 100644
--- test/filter_test.py
+++ test/filter_test.py
@@ -1,15 +1,16 @@
 import unittest
 import vapoursynth as vs

+
 class FilterTestSequence(unittest.TestCase):

     def setUp(self):
         self.core = vs.Core()

-    def checkDifference(self, cpu, gpu):
-        diff = self.core.std.PlaneDifference([cpu, gpu], 0, prop="PlaneDifference0")
-        diff = self.core.std.PlaneDifference([diff, gpu], 1, prop="PlaneDifference1")
-        diff = self.core.std.PlaneDifference([diff, gpu], 2, prop="PlaneDifference2")
+    def checkDifference(self, original, processed):
+        diff = self.core.std.PlaneDifference([original, processed], 0, prop="PlaneDifference0")
+        diff = self.core.std.PlaneDifference([diff, processed], 1, prop="PlaneDifference1")
+        diff = self.core.std.PlaneDifference([diff, processed], 2, prop="PlaneDifference2")

         for i in range(diff.num_frames):
             frame = diff.get_frame(i)
@@ -28,5 +29,89 @@ class FilterTestSequence(unittest.TestCase):

         self.checkDifference(clip, ret)

+    def testLUT2_8Bit(self):
+        clipx = self.core.std.BlankClip(format=vs.YUV420P8, color=[69, 242, 115])
+        clipy = self.core.std.BlankClip(format=vs.YUV420P8, color=[115, 103, 205])
+
+        lut = []
+        for y in range(2 ** clipy.format.bits_per_sample):
+            for x in range(2 ** clipx.format.bits_per_sample):
+                lut.append(x)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
+        self.checkDifference(clipx, ret)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
+        comp = self.core.std.BlankClip(format=vs.YUV420P10, color=[69, 242, 115])
+        self.checkDifference(comp, ret)
+
+    def testLUT2_8Bit_10Bit(self):
+        # Check 8-bit, 10-bit source.
+        clipx = self.core.std.BlankClip(format=vs.YUV420P8, color=[69, 242, 115])
+        clipy = self.core.std.BlankClip(format=vs.YUV420P10, color=[15, 900, 442])
+
+        lut = []
+        for y in range(2 ** clipy.format.bits_per_sample):
+            for x in range(2 ** clipx.format.bits_per_sample):
+                lut.append(x)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
+        self.checkDifference(clipx, ret)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
+        comp = self.core.std.BlankClip(format=vs.YUV420P10, color=[69, 242, 115])
+        self.checkDifference(comp, ret)
+
+        # Check 10-bit, 8-bit source.
+        # Colors are 8-bit levels for 10-bit clip so that we can verify output.
+        clipx = self.core.std.BlankClip(format=vs.YUV420P10, color=[15, 235, 115])
+        clipy = self.core.std.BlankClip(format=vs.YUV420P8, color=[69, 242, 115])
+
+        lut = []
+        for y in range(2 ** clipy.format.bits_per_sample):
+            for x in range(2 ** clipx.format.bits_per_sample):
+                lut.append(x)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
+        comp = self.core.std.BlankClip(format=vs.YUV420P8, color=[15, 235, 115])
+        self.checkDifference(comp, ret)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
+        self.checkDifference(clipx, ret)
+
+    def testLUT2_9Bit_10Bit(self):
+        # Check 9-bit, 10-bit source.
+        clipx = self.core.std.BlankClip(format=vs.YUV420P9, color=[384, 10, 500])
+        clipy = self.core.std.BlankClip(format=vs.YUV420P10, color=[15, 600, 900])
+
+        lut = []
+        for y in range(2 ** clipy.format.bits_per_sample):
+            for x in range(2 ** clipx.format.bits_per_sample):
+                lut.append(x)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=9)
+        self.checkDifference(clipx, ret)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
+        comp = self.core.std.BlankClip(format=vs.YUV420P8, color=[128, 10, 244])
+        self.checkDifference(comp, ret)
+
+        # Check 10-bit, 9-bit source.
+        clipx = self.core.std.BlankClip(format=vs.YUV420P10, color=[384, 10, 500])
+        clipy = self.core.std.BlankClip(format=vs.YUV420P9, color=[15, 384, 511])
+
+        lut = []
+        for y in range(2 ** clipy.format.bits_per_sample):
+            for x in range(2 ** clipx.format.bits_per_sample):
+                lut.append(x)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=9)
+        comp = self.core.std.BlankClip(format=vs.YUV420P9, color=[384, 10, 500])
+        self.checkDifference(comp, ret)
+
+        ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
+        comp = self.core.std.BlankClip(format=vs.YUV420P8, color=[128, 10, 244])
+        self.checkDifference(comp, ret)
+
 if __name__ == '__main__':
     unittest.main()
	diff --git doc/functions/lut2.rst doc/functions/lut2.rst
	index 4aa2933..6ee4560 100644
	--- doc/functions/lut2.rst
	+++ doc/functions/lut2.rst
	@@ -1,15 +1,24 @@
	Lut2
	=======

	-.. function:: Lut2(clip[] clips, int[] lut, int[] planes)
	+.. function:: Lut2(clip[] clips, int[] lut, int[] planes[, int bits])
	:module: std

	Applies a lut that takes the pixel values of two clips into account. The lut needs to contain 2^(clip1.bits_per_sample + clip2.bits_per_sample) entries and will be applied to the planes listed in *planes*. The other planes will simply be passed through unchanged.

	+ Lut2 also takes an optional bit depth parameter, *bits*. *bits* defaults to the bit depth of the first input clip, and specifies the bit depth of the output clip. The user is responsible for understanding the effects of bit depth conversion, specifically from higher bit depths to lower bit depths, as no scaling or clamping is applied.
	+
	How to average 2 clips::

	lut = []
	- for y in range(2**clipx.format.bits_per_sample):
	- for x in range(2**clipy.format.bits_per_sample):
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	lut.append((x + y)//2)
	Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2])
	+
	+ How to average 2 clips with a 10-bit output::
	+ lut = []
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	+ lut.append((x + y)//2)
	+ Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
	diff --git src/core/simplefilters.c src/core/simplefilters.c
	index 3d5d494..4832f22 100644
	--- src/core/simplefilters.c
	+++ src/core/simplefilters.c
	@@ -1997,13 +1997,26 @@ static void VS_CC lutCreate(const VSMap *in, VSMap *out, void *userData, VSCore
	typedef struct {
	VSNodeRef *node[2];
	const VSVideoInfo *vi[2];
	+ VSVideoInfo *vi_out;
	void *lut;
	int process[3];
	} Lut2Data;

	+#define LUT2_PROCESS(X_CAST, Y_CAST, DST_CAST) \
	+ do { \
	+ for (hl = 0; hl < h; hl++) { \
	+ for (x = 0; x < w; x++) { \
	+ ((DST_CAST *)dstp)[x] = lut[(((Y_CAST *)srcpy)[x] << shift) + ((X_CAST *)srcpx)[x]]; \
	+ } \
	+ dstp += dst_stride; \
	+ srcpx += srcx_stride; \
	+ srcpy += srcy_stride; \
	+ } \
	+ } while(0)
	+
	static void VS_CC lut2Init(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
	Lut2Data *d = (Lut2Data *) * instanceData;
	- vsapi->setVideoInfo(d->vi[0], 1, node);
	+ vsapi->setVideoInfo(d->vi_out, 1, node);
	vsapi->clearMap(in);
	}

	@@ -2017,7 +2030,7 @@ static const VSFrameRef *VS_CC lut2Getframe(int n, int activationReason, void **
	int plane;
	const VSFrameRef *srcx = vsapi->getFrameFilter(n, d->node[0], frameCtx);
	const VSFrameRef *srcy = vsapi->getFrameFilter(n, d->node[1], frameCtx);
	- const VSFormat *fi = vsapi->getFrameFormat(srcx);
	+ const VSFormat *fi = d->vi_out->format;
	const int pl[] = {0, 1, 2};
	const VSFrameRef *fr[] = {d->process[0] ? 0 : srcx, d->process[1] ? 0 : srcx, d->process[2] ? 0 : srcx};
	VSFrameRef *dst = vsapi->newVideoFrame2(fi, vsapi->getFrameWidth(srcx, 0), vsapi->getFrameHeight(srcx, 0), fr, pl, srcx, core);
	@@ -2025,13 +2038,14 @@ static const VSFrameRef *VS_CC lut2Getframe(int n, int activationReason, void **
	for (plane = 0; plane < fi->numPlanes; plane++) {
	const uint8_t *srcpx = vsapi->getReadPtr(srcx, plane);
	const uint8_t *srcpy = vsapi->getReadPtr(srcy, plane);
	- int src_stride = vsapi->getStride(srcx, plane);
	+ int srcx_stride = vsapi->getStride(srcx, plane);
	+ int srcy_stride = vsapi->getStride(srcy, plane);
	uint8_t *dstp = vsapi->getWritePtr(dst, plane);
	int dst_stride = vsapi->getStride(dst, plane);
	int h = vsapi->getFrameHeight(srcx, plane);

	if (d->process[plane]) {
	- int shift = fi->bitsPerSample;
	+ int shift = d->vi[0]->format->bitsPerSample;
	int hl;
	int w = vsapi->getFrameWidth(srcx, plane);
	int x;
	@@ -2039,24 +2053,26 @@ static const VSFrameRef *VS_CC lut2Getframe(int n, int activationReason, void **
	if (fi->bytesPerSample == 1) {
	const uint8_t *lut = (uint8_t *)d->lut;

	- for (hl = 0; hl < h; hl++) {
	- for (x = 0; x < w; x++)
	- dstp[x] = lut[(srcpy[x] << shift) + srcpx[x]];
	-
	- dstp += dst_stride;
	- srcpx += src_stride;
	- srcpy += src_stride;
	+ if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample == 8) {
	+ LUT2_PROCESS(uint8_t, uint8_t, uint8_t);
	+ } else if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample > 8) {
	+ LUT2_PROCESS(uint8_t, uint16_t, uint8_t);
	+ } else if (d->vi[0]->format->bitsPerSample > 8 && d->vi[1]->format->bitsPerSample == 8) {
	+ LUT2_PROCESS(uint16_t, uint8_t, uint8_t);
	+ } else {
	+ LUT2_PROCESS(uint16_t, uint16_t, uint8_t);
	}
	} else {
	const uint16_t *lut = (uint16_t *)d->lut;

	- for (hl = 0; hl < h; hl++) {
	- for (x = 0; x < w; x++)
	- ((uint16_t *)dstp)[x] = lut[(srcpy[x] << shift) + srcpx[x]];
	-
	- dstp += dst_stride;
	- srcpx += src_stride;
	- srcpy += src_stride;
	+ if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample == 8) {
	+ LUT2_PROCESS(uint8_t, uint8_t, uint16_t);
	+ } else if (d->vi[0]->format->bitsPerSample == 8 && d->vi[1]->format->bitsPerSample > 8) {
	+ LUT2_PROCESS(uint8_t, uint16_t, uint16_t);
	+ } else if (d->vi[0]->format->bitsPerSample > 8 && d->vi[1]->format->bitsPerSample == 8) {
	+ LUT2_PROCESS(uint16_t, uint8_t, uint16_t);
	+ } else {
	+ LUT2_PROCESS(uint16_t, uint16_t, uint16_t);
	}
	}
	}
	@@ -2135,9 +2151,26 @@ static void VS_CC lut2Create(const VSMap *in, VSMap *out, void *userData, VSCore
	RETERROR("Lut2: bad lut length");
	}

	- d.lut = malloc(d.vi[0]->format->bytesPerSample * n);
	+ int err;
	+ int bits = int64ToIntS(vsapi->propGetInt(in, "bits", 0, &err));
	+ if (bits == 0) {
	+ bits = d.vi[0]->format->bitsPerSample;
	+ } else if (bits < 8 || bits > 16) {
	+ vsapi->freeNode(d.node[0]);
	+ vsapi->freeNode(d.node[1]);
	+ RETERROR("Lut2: Output format must be between 8 and 16 bits.");
	+ }
	+
	+ d.vi_out = (VSVideoInfo *)malloc(sizeof(VSVideoInfo));
	+ *d.vi_out = *d.vi[0];
	+ d.vi_out->format = vsapi->registerFormat(d.vi[0]->format->colorFamily, d.vi[0]->format->sampleType, bits, d.vi[0]->format->subSamplingW, d.vi[0]->format->subSamplingH, core);
	+
	+ if (bits == 8)
	+ d.lut = malloc(sizeof(uint8_t) * n);
	+ else
	+ d.lut = malloc(sizeof(uint16_t) * n);

	- if (d.vi[0]->format->bytesPerSample == 1) {
	+ if (bits == 8) {
	uint8_t *lut = d.lut;

	for (i = 0; i < n; i++) {
	@@ -3400,7 +3433,7 @@ void VS_CC stdlibInitialize(VSConfigPlugin configFunc, VSRegisterFunction regist
	registerFunc("BlankClip", "clip:clip:opt;width:int:opt;height:int:opt;format:int:opt;length:int:opt;fpsnum:int:opt;fpsden:int:opt;color:float[]:opt;", blankClipCreate, 0, plugin);
	registerFunc("AssumeFPS", "clip:clip;src:clip:opt;fpsnum:int:opt;fpsden:int:opt;", assumeFPSCreate, 0, plugin);
	registerFunc("Lut", "clip:clip;lut:int[];planes:int[];", lutCreate, 0, plugin);
	- registerFunc("Lut2", "clips:clip[];lut:int[];planes:int[];", lut2Create, 0, plugin);
	+ registerFunc("Lut2", "clips:clip[];lut:int[];planes:int[];bits:int:opt;", lut2Create, 0, plugin);
	registerFunc("SelectClip", "clips:clip[];src:clip[];selector:func;", selectClipCreate, 0, plugin);
	registerFunc("ModifyFrame", "clips:clip[];selector:func;", modifyFrameCreate, 0, plugin);
	registerFunc("Transpose", "clip:clip;", transposeCreate, 0, plugin);
	diff --git src/core/vsapi.cpp src/core/vsapi.cpp
	index de3a5c7..e649f9c 100644
	--- src/core/vsapi.cpp
	+++ src/core/vsapi.cpp
	@@ -33,8 +33,8 @@ static const VSFormat *VS_CC getFormatPreset(int id, VSCore *core) {
	return core->getFormatPreset((VSPresetFormat)id);
	}

	-static const VSFormat *VS_CC registerFormat(int colorFamily, int sampleType, int bytesPerSample, int subSamplingW, int subSamplingH, VSCore *core) {
	- return core->registerFormat((VSColorFamily)colorFamily, (VSSampleType)sampleType, bytesPerSample, subSamplingW, subSamplingH);
	+static const VSFormat *VS_CC registerFormat(int colorFamily, int sampleType, int bitsPerSample, int subSamplingW, int subSamplingH, VSCore *core) {
	+ return core->registerFormat((VSColorFamily)colorFamily, (VSSampleType)sampleType, bitsPerSample, subSamplingW, subSamplingH);
	}

	static const VSFrameRef *VS_CC cloneFrameRef(const VSFrameRef *frame) {
	diff --git test/filter_test.py test/filter_test.py
	index 58cc9f4..7256d5d 100644
	--- test/filter_test.py
	+++ test/filter_test.py
	@@ -1,15 +1,16 @@
	import unittest
	import vapoursynth as vs

	+
	class FilterTestSequence(unittest.TestCase):

	def setUp(self):
	self.core = vs.Core()

	- def checkDifference(self, cpu, gpu):
	- diff = self.core.std.PlaneDifference([cpu, gpu], 0, prop="PlaneDifference0")
	- diff = self.core.std.PlaneDifference([diff, gpu], 1, prop="PlaneDifference1")
	- diff = self.core.std.PlaneDifference([diff, gpu], 2, prop="PlaneDifference2")
	+ def checkDifference(self, original, processed):
	+ diff = self.core.std.PlaneDifference([original, processed], 0, prop="PlaneDifference0")
	+ diff = self.core.std.PlaneDifference([diff, processed], 1, prop="PlaneDifference1")
	+ diff = self.core.std.PlaneDifference([diff, processed], 2, prop="PlaneDifference2")

	for i in range(diff.num_frames):
	frame = diff.get_frame(i)
	@@ -28,5 +29,89 @@ class FilterTestSequence(unittest.TestCase):

	self.checkDifference(clip, ret)

	+ def testLUT2_8Bit(self):
	+ clipx = self.core.std.BlankClip(format=vs.YUV420P8, color=[69, 242, 115])
	+ clipy = self.core.std.BlankClip(format=vs.YUV420P8, color=[115, 103, 205])
	+
	+ lut = []
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	+ lut.append(x)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
	+ self.checkDifference(clipx, ret)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
	+ comp = self.core.std.BlankClip(format=vs.YUV420P10, color=[69, 242, 115])
	+ self.checkDifference(comp, ret)
	+
	+ def testLUT2_8Bit_10Bit(self):
	+ # Check 8-bit, 10-bit source.
	+ clipx = self.core.std.BlankClip(format=vs.YUV420P8, color=[69, 242, 115])
	+ clipy = self.core.std.BlankClip(format=vs.YUV420P10, color=[15, 900, 442])
	+
	+ lut = []
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	+ lut.append(x)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
	+ self.checkDifference(clipx, ret)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
	+ comp = self.core.std.BlankClip(format=vs.YUV420P10, color=[69, 242, 115])
	+ self.checkDifference(comp, ret)
	+
	+ # Check 10-bit, 8-bit source.
	+ # Colors are 8-bit levels for 10-bit clip so that we can verify output.
	+ clipx = self.core.std.BlankClip(format=vs.YUV420P10, color=[15, 235, 115])
	+ clipy = self.core.std.BlankClip(format=vs.YUV420P8, color=[69, 242, 115])
	+
	+ lut = []
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	+ lut.append(x)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
	+ comp = self.core.std.BlankClip(format=vs.YUV420P8, color=[15, 235, 115])
	+ self.checkDifference(comp, ret)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=10)
	+ self.checkDifference(clipx, ret)
	+
	+ def testLUT2_9Bit_10Bit(self):
	+ # Check 9-bit, 10-bit source.
	+ clipx = self.core.std.BlankClip(format=vs.YUV420P9, color=[384, 10, 500])
	+ clipy = self.core.std.BlankClip(format=vs.YUV420P10, color=[15, 600, 900])
	+
	+ lut = []
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	+ lut.append(x)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=9)
	+ self.checkDifference(clipx, ret)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
	+ comp = self.core.std.BlankClip(format=vs.YUV420P8, color=[128, 10, 244])
	+ self.checkDifference(comp, ret)
	+
	+ # Check 10-bit, 9-bit source.
	+ clipx = self.core.std.BlankClip(format=vs.YUV420P10, color=[384, 10, 500])
	+ clipy = self.core.std.BlankClip(format=vs.YUV420P9, color=[15, 384, 511])
	+
	+ lut = []
	+ for y in range(2 ** clipy.format.bits_per_sample):
	+ for x in range(2 ** clipx.format.bits_per_sample):
	+ lut.append(x)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=9)
	+ comp = self.core.std.BlankClip(format=vs.YUV420P9, color=[384, 10, 500])
	+ self.checkDifference(comp, ret)
	+
	+ ret = self.core.std.Lut2(clips=[clipx, clipy], lut=lut, planes=[0, 1, 2], bits=8)
	+ comp = self.core.std.BlankClip(format=vs.YUV420P8, color=[128, 10, 244])
	+ self.checkDifference(comp, ret)
	+
	if __name__ == '__main__':
	unittest.main()