Skip to content

Instantly share code, notes, and snippets.

@advancedxy
Created July 19, 2023 03:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save advancedxy/236a8db8de03cf40c2ecbfebd4bf07ef to your computer and use it in GitHub Desktop.
Save advancedxy/236a8db8de03cf40c2ecbfebd4bf07ef to your computer and use it in GitHub Desktop.
Bucket distribution
-- one run result is, which might be slightly different for each run.
bucket per column is: 416361, 1250677, 833758, 2501233, 1250901, 3747070
bucket multiple column is: 1665022, 1667690, 1665528, 1667951, 1667929, 1665880
/*
*
* * Licensed to the Apache Software Foundation (ASF) under one
* * or more contributor license agreements. See the NOTICE file
* * distributed with this work for additional information
* * regarding copyright ownership. The ASF licenses this file
* * to you under the Apache License, Version 2.0 (the
* * "License"); you may not use this file except in compliance
* * with the License. You may obtain a copy of the License at
* *
* * http://www.apache.org/licenses/LICENSE-2.0
* *
* * Unless required by applicable law or agreed to in writing,
* * software distributed under the License is distributed on an
* * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* * KIND, either express or implied. See the License for the
* * specific language governing permissions and limitations
* * under the License.
*
*/
package org.apache.iceberg.transforms;
import java.util.Random;
import org.apache.iceberg.relocated.com.google.common.hash.HashFunction;
import org.apache.iceberg.relocated.com.google.common.hash.Hashing;
import org.apache.iceberg.relocated.com.google.common.primitives.Ints;
import org.apache.iceberg.util.BucketUtil;
public class TestDistribution {
private static final HashFunction MURMUR3 = Hashing.murmur3_32_fixed();
private static final Random random = new Random();
private static int[] generateRandomIntegersWithBucket(
int size, int bucketIndex, int bucketNum) {
int[] result = new int[size];
for (int i = 0; i < size;) {
int candidate = random.nextInt();
if ((BucketUtil.hash(candidate) & Integer.MAX_VALUE) % bucketNum == bucketIndex) {
result[i] = candidate;
i += 1;
}
}
return result;
}
private static int hash(int a, int b) {
return MURMUR3.newHasher(8 + 8).putLong(a).putLong(b).hash().asInt();
}
public static void main(String[] args) {
// col1 has distribution of:
// - hash(col_1, 3) == 0 -> 1/6
// - hash(col_1, 3) == 1 -> 1/3
// - hash(col_1, 3) == 2 -> 1/2
int[] firstPart = generateRandomIntegersWithBucket(100_000, 0, 3);
int[] secondPart = generateRandomIntegersWithBucket(200_000, 1, 3);
int[] thirdPart = generateRandomIntegersWithBucket(300_000, 2, 3);
int[] col1 = Ints.concat(firstPart, secondPart, thirdPart);
// col2 has distribution of:
// - hash(col_2, 2) == 0 -> 1/4
// - hash(col_1, 2) == 1 -> 3/4
firstPart = generateRandomIntegersWithBucket(100_000, 0, 2);
secondPart = generateRandomIntegersWithBucket(300_000, 1, 2);
int[] col2 = Ints.concat(firstPart, secondPart);
int[] countForBucketPerColumn = new int[6];
int[] countForBucketMultipleColumns = new int[6];
// let's generate 10_000_000 pairs out of this two cols, and check the distribution.
for (int i = 0; i < 10_000_000; i++) {
int idx = random.nextInt(col1.length);
int first = col1[idx];
idx = random.nextInt(col2.length);
int second = col2[idx];
int firstBucket = (BucketUtil.hash(first) & Integer.MAX_VALUE) % 3;
int secondBucket = (BucketUtil.hash(second) & Integer.MAX_VALUE) % 2;
countForBucketPerColumn[firstBucket * 2 + secondBucket] += 1;
int bucketIndex = (hash(first, second) & Integer.MAX_VALUE) % 6;
countForBucketMultipleColumns[bucketIndex] += 1;
}
System.err.println("bucket per column is: " + Ints.join(", ", countForBucketPerColumn));
System.err.println("bucket multiple column is: " + Ints.join(", ", countForBucketMultipleColumns));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment