Skip to content

Instantly share code, notes, and snippets.

@fb64
Last active February 2, 2024 18:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fb64/71880cde297bc5234b02b68b785670fd to your computer and use it in GitHub Desktop.
Save fb64/71880cde297bc5234b02b68b785670fd to your computer and use it in GitHub Desktop.
arrow-dataset parquet JNI error

Arrow Java Dataset JNI Error

This Java program reproduces the following "JNIEnv was not attached to current thread" error, which happens when reading a Parquet file with the arrow-dataset library. The Parquet file must be downloaded before running the program: wget -O /tmp/taxi.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-07.parquet

Move Main.java into the src/main/java folder, then run the program with Maven: mvn clean compile exec:exec

Full stack trace :

/Users/runner/work/crossbow/crossbow/arrow/java/dataset/src/main/cpp/jni_util.cc:79: Failed to update reservation while freeing bytes: JNIEnv was not attached to current thread
0   jnilib-10391875251891153172.tmp     0x00000001424a99ac _ZN5arrow4util7CerrLogD2Ev + 204
1   jnilib-10391875251891153172.tmp     0x00000001424a98ce _ZN5arrow4util7CerrLogD0Ev + 14
2   jnilib-10391875251891153172.tmp     0x00000001424a1112 _ZN5arrow4util8ArrowLogD1Ev + 34
3   jnilib-10391875251891153172.tmp     0x00000001412608ed _ZN5arrow7dataset3jni31ReservationListenableMemoryPool4Impl4FreeEPhxx + 237
4   jnilib-10391875251891153172.tmp     0x00000001427b2acb _ZN5arrow10PoolBufferD2Ev + 59
5   jnilib-10391875251891153172.tmp     0x00000001427b268e _ZN5arrow10PoolBufferD0Ev + 14
6   jnilib-10391875251891153172.tmp     0x000000014140f508 _ZN7parquet14SerializedFileD2Ev + 168
7   jnilib-10391875251891153172.tmp     0x000000014140ebfe _ZN7parquet14SerializedFileD0Ev + 14
8   jnilib-10391875251891153172.tmp     0x000000014153a8ef _ZN7parquet5arrow12_GLOBAL__N_114FileReaderImplD0Ev + 159
9   jnilib-10391875251891153172.tmp     0x000000014155048a _ZN5arrow8internal6FnOnceIFvRKNS_10FutureImplEEE6FnImplINS_6FutureINS0_5EmptyEE20WrapResultOnComplete8CallbackINSA_14ThenOnCompleteIZN7parquet5arrow17RowGroupGenerator9FetchNextEvEUlvE_NSA_17PassthruOnFailureISH_EEEEEEED0Ev + 122
10  jnilib-10391875251891153172.tmp     0x00000001424a503d _ZN5arrow18ConcreteFutureImpl22DoMarkFinishedOrFailedENS_11FutureStateE + 189
11  jnilib-10391875251891153172.tmp     0x00000001428471d1 _ZN5arrow6FutureINS_8internal5EmptyEE14DoMarkFinishedENS_6ResultIS2_EE + 129
12  jnilib-10391875251891153172.tmp     0x0000000142847055 _ZN5arrow6FutureINS_8internal5EmptyEE12MarkFinishedIS2_vEEvNS_6StatusE + 53
13  jnilib-10391875251891153172.tmp     0x000000014154f4a4 _ZZN5arrow8internal8Executor10DoTransferINS0_5EmptyENS_6FutureIS3_EENS_6StatusEEENS4_IT_EES8_bENUlRKS6_E_clESA_ + 100
14  jnilib-10391875251891153172.tmp     0x00000001424a5650 _ZN5arrow8internal6FnOnceIFvvEE6FnImplIZNS_18ConcreteFutureImpl21RunOrScheduleCallbackERKNSt3__110shared_ptrINS_10FutureImplEEEONS8_14CallbackRecordEbEUlvE_E6invokeEv + 32
15  jnilib-10391875251891153172.tmp     0x000000014248e055 _ZNSt3__1L14__thread_proxyINS_5tupleIJNS_10unique_ptrINS_15__thread_structENS_14default_deleteIS3_EEEEZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE4$_10EEEEEPvSC_ + 709
16  libsystem_pthread.dylib             0x00007ff810c77202 _pthread_start + 99
17  libsystem_pthread.dylib             0x00007ff810c72bab thread_start + 15

import org.apache.arrow.dataset.file.FileFormat;
import org.apache.arrow.dataset.file.FileSystemDatasetFactory;
import org.apache.arrow.dataset.jni.DirectReservationListener;
import org.apache.arrow.dataset.jni.NativeMemoryPool;
import org.apache.arrow.dataset.scanner.ScanOptions;
import org.apache.arrow.dataset.scanner.Scanner;
import org.apache.arrow.dataset.source.Dataset;
import org.apache.arrow.dataset.source.DatasetFactory;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.util.AutoCloseables;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.VectorUnloader;
import org.apache.arrow.vector.ipc.ArrowReader;
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch;
import java.util.ArrayList;
import java.util.List;
/**
 * Minimal reproducer for the Arrow Java Dataset error
 * "JNIEnv was not attached to current thread".
 *
 * <p>Scans a local Parquet file through the arrow-dataset bindings using a listenable
 * {@link NativeMemoryPool} and collects every record batch that is read.
 */
public class Main {

  /**
   * Scans {@code /tmp/taxi.parquet}, unloads each loaded batch into an
   * {@link ArrowRecordBatch}, and prints how many batches were collected.
   *
   * <p>Any exception raised while scanning or while releasing resources is printed to
   * standard error; it is not rethrown.
   *
   * @param args command-line arguments (unused)
   */
  public static void main(String[] args) {
    //Download parquet file with: wget -O /tmp/taxi.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-07.parquet
    var fileUri = "file:/tmp/taxi.parquet";
    ScanOptions options = new ScanOptions(/*batchSize*/ 32768);
    //works with: NativeMemoryPool nativeMemoryPool = NativeMemoryPool.getDefault();
    NativeMemoryPool nativeMemoryPool =
        NativeMemoryPool.createListenable(DirectReservationListener.instance());

    // Declared outside the try header so the cleanup resource below can capture it.
    List<ArrowRecordBatch> batches = new ArrayList<>();
    try (
        BufferAllocator allocator = new RootAllocator();
        DatasetFactory datasetFactory = new FileSystemDatasetFactory(
            allocator, nativeMemoryPool,
            FileFormat.PARQUET, fileUri);
        Dataset dataset = datasetFactory.finish();
        Scanner scanner = dataset.newScan(options);
        ArrowReader reader = scanner.scanBatches();
        // Registered last so it is closed first: the collected batches are released
        // before the reader and allocator, and also when the loop below throws.
        AutoCloseable batchCloser = () -> AutoCloseables.close(batches)
    ) {
      while (reader.loadNextBatch()) {
        try (VectorSchemaRoot root = reader.getVectorSchemaRoot()) {
          final VectorUnloader unloader = new VectorUnloader(root);
          batches.add(unloader.getRecordBatch());
        }
      }
      System.out.println("Batch size: " + batches.size());
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the arrow-dataset JNI error reproducer (run with: mvn clean compile exec:exec). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>arrow-dataset-error</artifactId>
  <version>1.0-SNAPSHOT</version>
  <properties>
    <maven.compiler.source>17</maven.compiler.source>
    <maven.compiler.target>17</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Single source of truth so every Arrow artifact stays on the same release. -->
    <arrow.version>15.0.0</arrow.version>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-dataset</artifactId>
      <version>${arrow.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.arrow</groupId>
      <artifactId>arrow-memory-netty</artifactId>
      <version>${arrow.version}</version>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>exec-maven-plugin</artifactId>
        <version>3.1.0</version>
        <configuration>
          <!-- Launches a separate java process running the Main class. -->
          <executable>java</executable>
          <arguments>
            <!-- NOTE(review): presumably needed so Arrow's memory module can reach java.nio internals on recent JDKs; confirm against the Arrow Java installation docs. -->
            <argument>--add-opens=java.base/java.nio=ALL-UNNAMED</argument>
            <argument>-classpath</argument>
            <!-- Placeholder that exec-maven-plugin replaces with the project's classpath. -->
            <classpath />
            <argument>Main</argument>
          </arguments>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment