advancedxy/TestDistribution.java

## system.err
-- one run result is, which might be slightly different for each run.
bucket per column is: 416361, 1250677, 833758, 2501233, 1250901, 3747070
bucket multiple column is: 1665022, 1667690, 1665528, 1667951, 1667929, 1665880

## TestDistribution.java
/*
 *
 *  * Licensed to the Apache Software Foundation (ASF) under one
 *  * or more contributor license agreements.  See the NOTICE file
 *  * distributed with this work for additional information
 *  * regarding copyright ownership.  The ASF licenses this file
 *  * to you under the Apache License, Version 2.0 (the
 *  * "License"); you may not use this file except in compliance
 *  * with the License.  You may obtain a copy of the License at
 *  *
 *  *   http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing,
 *  * software distributed under the License is distributed on an
 *  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *  * KIND, either express or implied.  See the License for the
 *  * specific language governing permissions and limitations
 *  * under the License.
 *
 */

package org.apache.iceberg.transforms;

import java.util.Random;
import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
import org.apache.iceberg.relocated.com.google.common.primitives.Ints;
import org.apache.iceberg.util.BucketUtil;

public class TestDistribution {
    private static final HashFunction MURMUR3 = Hashing.murmur3_32_fixed();
    private static final Random random = new Random();

    private static int[] generateRandomIntegersWithBucket(
            int size, int bucketIndex, int bucketNum) {
        int[] result = new int[size];
        for (int i = 0; i < size;) {
            int candidate = random.nextInt();
            if ((BucketUtil.hash(candidate) & Integer.MAX_VALUE) % bucketNum == bucketIndex) {
                result[i] = candidate;
                i += 1;
            }
        }
        return result;
    }

    private static int hash(int a, int b) {
        return MURMUR3.newHasher(8 + 8).putLong(a).putLong(b).hash().asInt();
    }

    public static void main(String[] args) {
        // col1 has distribution of:
        // - hash(col_1, 3) == 0 -> 1/6
        // - hash(col_1, 3) == 1 -> 1/3
        // - hash(col_1, 3) == 2 -> 1/2
        int[] firstPart = generateRandomIntegersWithBucket(100_000, 0, 3);
        int[] secondPart = generateRandomIntegersWithBucket(200_000, 1, 3);
        int[] thirdPart = generateRandomIntegersWithBucket(300_000, 2, 3);
        int[] col1 = Ints.concat(firstPart, secondPart, thirdPart);

        // col2 has distribution of:
        // - hash(col_2, 2) == 0 -> 1/4
        // - hash(col_1, 2) == 1 -> 3/4
        firstPart = generateRandomIntegersWithBucket(100_000, 0, 2);
        secondPart = generateRandomIntegersWithBucket(300_000, 1, 2);
        int[] col2 = Ints.concat(firstPart, secondPart);

        int[] countForBucketPerColumn = new int[6];
        int[] countForBucketMultipleColumns = new int[6];
        // let's generate 10_000_000 pairs out of this two cols, and check the distribution.
        for (int i = 0; i < 10_000_000; i++) {
            int idx = random.nextInt(col1.length);
            int first = col1[idx];
            idx = random.nextInt(col2.length);
            int second = col2[idx];
            int firstBucket = (BucketUtil.hash(first) & Integer.MAX_VALUE) % 3;
            int secondBucket = (BucketUtil.hash(second) & Integer.MAX_VALUE) % 2;
            countForBucketPerColumn[firstBucket * 2 + secondBucket] += 1;
            int bucketIndex = (hash(first, second) & Integer.MAX_VALUE) % 6;
            countForBucketMultipleColumns[bucketIndex] += 1;
        }

        System.err.println("bucket per column is: " + Ints.join(", ", countForBucketPerColumn));
        System.err.println("bucket multiple column is: " + Ints.join(", ", countForBucketMultipleColumns));
    }
}
	-- one run result is, which might be slightly different for each run.
	bucket per column is: 416361, 1250677, 833758, 2501233, 1250901, 3747070
	bucket multiple column is: 1665022, 1667690, 1665528, 1667951, 1667929, 1665880
	/*
	*
	* * Licensed to the Apache Software Foundation (ASF) under one
	* * or more contributor license agreements. See the NOTICE file
	* * distributed with this work for additional information
	* * regarding copyright ownership. The ASF licenses this file
	* * to you under the Apache License, Version 2.0 (the
	* * "License"); you may not use this file except in compliance
	* * with the License. You may obtain a copy of the License at
	* *
	* * http://www.apache.org/licenses/LICENSE-2.0
	* *
	* * Unless required by applicable law or agreed to in writing,
	* * software distributed under the License is distributed on an
	* * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* * KIND, either express or implied. See the License for the
	* * specific language governing permissions and limitations
	* * under the License.
	*
	*/

	package org.apache.iceberg.transforms;

	import java.util.Random;
	import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
	import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
	import org.apache.iceberg.relocated.com.google.common.primitives.Ints;
	import org.apache.iceberg.util.BucketUtil;

	public class TestDistribution {
	private static final HashFunction MURMUR3 = Hashing.murmur3_32_fixed();
	private static final Random random = new Random();

	private static int[] generateRandomIntegersWithBucket(
	int size, int bucketIndex, int bucketNum) {
	int[] result = new int[size];
	for (int i = 0; i < size;) {
	int candidate = random.nextInt();
	if ((BucketUtil.hash(candidate) & Integer.MAX_VALUE) % bucketNum == bucketIndex) {
	result[i] = candidate;
	i += 1;
	}
	}
	return result;
	}

	private static int hash(int a, int b) {
	return MURMUR3.newHasher(8 + 8).putLong(a).putLong(b).hash().asInt();
	}

	public static void main(String[] args) {
	// col1 has distribution of:
	// - hash(col_1, 3) == 0 -> 1/6
	// - hash(col_1, 3) == 1 -> 1/3
	// - hash(col_1, 3) == 2 -> 1/2
	int[] firstPart = generateRandomIntegersWithBucket(100_000, 0, 3);
	int[] secondPart = generateRandomIntegersWithBucket(200_000, 1, 3);
	int[] thirdPart = generateRandomIntegersWithBucket(300_000, 2, 3);
	int[] col1 = Ints.concat(firstPart, secondPart, thirdPart);

	// col2 has distribution of:
	// - hash(col_2, 2) == 0 -> 1/4
	// - hash(col_1, 2) == 1 -> 3/4
	firstPart = generateRandomIntegersWithBucket(100_000, 0, 2);
	secondPart = generateRandomIntegersWithBucket(300_000, 1, 2);
	int[] col2 = Ints.concat(firstPart, secondPart);

	int[] countForBucketPerColumn = new int[6];
	int[] countForBucketMultipleColumns = new int[6];
	// let's generate 10_000_000 pairs out of this two cols, and check the distribution.
	for (int i = 0; i < 10_000_000; i++) {
	int idx = random.nextInt(col1.length);
	int first = col1[idx];
	idx = random.nextInt(col2.length);
	int second = col2[idx];
	int firstBucket = (BucketUtil.hash(first) & Integer.MAX_VALUE) % 3;
	int secondBucket = (BucketUtil.hash(second) & Integer.MAX_VALUE) % 2;
	countForBucketPerColumn[firstBucket * 2 + secondBucket] += 1;
	int bucketIndex = (hash(first, second) & Integer.MAX_VALUE) % 6;
	countForBucketMultipleColumns[bucketIndex] += 1;
	}

	System.err.println("bucket per column is: " + Ints.join(", ", countForBucketPerColumn));
	System.err.println("bucket multiple column is: " + Ints.join(", ", countForBucketMultipleColumns));
	}
	}