Created
November 22, 2010 05:17
-
-
Save tsuna/709554 to your computer and use it in GitHub Desktop.
Netty getting stuck in an infinite loop during shutdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 07d1425781f08799ff9ef59f9cd578fa63a2e83f Mon Sep 17 00:00:00 2001 | |
From: Benoit Sigoure <tsuna@stumbleupon.com> | |
Date: Sun, 21 Nov 2010 20:42:20 -0800 | |
Subject: [PATCH] Don't shutdown while RPCs are waiting for a -ROOT- lookup. | |
The following scenario led to data loss: | |
1. Application starts. | |
2. A PutRequest is generated, triggers a -ROOT- lookup. | |
3. Application calls shutdown() on the HBaseClient, the client | |
terminates and the PutRequest is lost. | |
Now shutdown() will wait if there's an ongoing -ROOT- lookup | |
to allow the PutRequest to complete. This bug was only likely | |
to affect very short lived programs, not long lived servers. | |
Change-Id: I7da4d5e81d59e75ae5acbdeca0cc72d8078b3ce1 | |
--- | |
src/HBaseClient.java | 39 +++++++++++++++++++++++++++++++++++++-- | |
1 files changed, 37 insertions(+), 2 deletions(-) | |
diff --git a/src/HBaseClient.java b/src/HBaseClient.java | |
index 8855055..f7e3e83 100644 | |
--- a/src/HBaseClient.java | |
+++ b/src/HBaseClient.java | |
@@ -526,6 +526,22 @@ public final class HBaseClient { | |
} | |
} | |
+ // If some RPCs are waiting for -ROOT- to be discovered, we too must wait | |
+ // because some of those RPCs could be edits that we must not lose. | |
+ final Deferred<Object> d = zkclient.getDeferredRootIfBeingLookedUp(); | |
+ if (d != null) { | |
+ final class RetryShutdown implements Callback<Object, Object> { | |
+ public Object call(final Object arg) { | |
+ shutdown(); | |
+ return arg; | |
+ } | |
+ public String toString() { | |
+ return "retry shutdown"; | |
+ } | |
+ } | |
+ return d.addBoth(new RetryShutdown()); | |
+ } | |
+ | |
// 1. Flush everything. | |
return flush().addCallback(new DisconnectCB()); | |
} | |
@@ -2090,8 +2106,6 @@ public final class HBaseClient { | |
/** | |
* Returns a deferred that will be called back once we found -ROOT-. | |
- * @param cb The callback you want to be called once the -ROOT- region is | |
- * discovered. | |
* @return A deferred which will be invoked with an unspecified argument | |
* once we know where -ROOT- is. Note that by the time you get called | |
* back, we may have lost the connection to the -ROOT- region again. | |
@@ -2110,6 +2124,27 @@ public final class HBaseClient { | |
} | |
/** | |
+ * Like {@link #getDeferredRoot} but returns null if we're not already | |
+ * trying to find -ROOT-. | |
+ * In other words, calling this method never triggers a -ROOT- lookup; | |
+ * it only waits on a lookup that is already in flight. | |
+ * @return {@code null} if -ROOT- isn't being looked up right now, | |
+ * otherwise a deferred which will be invoked with an unspecified argument | |
+ * once we know where -ROOT- is. Note that by the time you get called | |
+ * back, we may have lost the connection to the -ROOT- region again. | |
+ */ | |
+ Deferred<Object> getDeferredRootIfBeingLookedUp() { | |
+ synchronized (this) { | |
+ if (deferred_rootregion == null) { | |
+ return null; | |
+ } | |
+ final Deferred<Object> d = new Deferred<Object>(); | |
+ deferred_rootregion.add(d); | |
+ return d; | |
+ } | |
+ } | |
+ | |
+ /** | |
* Atomically returns and {@code null}s out the current list of | |
* Deferreds waiting for the -ROOT- region. | |
*/ | |
-- | |
1.7.3.1.121.g64a71 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Copyright 2010 Benoit Sigoure | |
* | |
* This program is free software: you can redistribute it and/or modify it | |
* under the terms of the GNU Lesser General Public License as published | |
* by the Free Software Foundation, either version 3 of the License, or | |
* (at your option) any later version. | |
* | |
* This library is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU Lesser General Public License for more details. | |
* | |
* You should have received a copy of the GNU Lesser General Public License | |
* along with this library. If not, see <http://www.gnu.org/licenses/>. */ | |
import org.hbase.async.HBaseClient; | |
import org.hbase.async.PutRequest; | |
final class AsyncHBasePut { | |
public static void main(String[] a) throws Exception { | |
final HBaseClient client = new HBaseClient("localhost"); | |
final PutRequest put = new PutRequest("hbench", "bar", "t", | |
"qualifier", "value"); | |
put.setBufferable(false); | |
client.put(put); | |
client.shutdown().join(); | |
} | |
} | |
/* | |
# Apply patch below to asynchbase | |
#> git am 0001-Don-t-shutdown-while-RPCs-are-waiting-for-a-ROOT-loo.patch | |
# Compile asynchbase | |
#> make | |
# In another directory, copy all jars from asynchbase: | |
#> mkdir third_party/common | |
#> cp path/to/asynchbase/third_party/zookeeper-3.3.1.jar third_party/common | |
#> mkdir third_party/asynchbase | |
#> cp path/to/asynchbase/third_party/*.jar path/to/asynchbase/build/*.jar third_party/asynchbase | |
# Compile: | |
#> javac -cp third_party/common/zookeeper-3.3.1.jar:third_party/asynchbase/hbaseasync-1.0.jar:third_party/asynchbase/log4j-over-slf4j-1.6.1.jar:third_party/asynchbase/logback-classic-0.9.24.jar:third_party/asynchbase/logback-core-0.9.24.jar:third_party/asynchbase/netty-3.2.2.Final.jar:third_party/asynchbase/slf4j-api-1.6.1.jar:third_party/asynchbase/suasync-1.0.jar AsyncHBasePut.java | |
# Run: | |
#> java -cp third_party/common/zookeeper-3.3.1.jar:third_party/asynchbase/hbaseasync-1.0.jar:third_party/asynchbase/log4j-over-slf4j-1.6.1.jar:third_party/asynchbase/logback-classic-0.9.24.jar:third_party/asynchbase/logback-core-0.9.24.jar:third_party/asynchbase/netty-3.2.2.Final.jar:third_party/asynchbase/slf4j-api-1.6.1.jar:third_party/asynchbase/suasync-1.0.jar AsyncHBasePut | |
$ jstack -l 24712 | |
2010-11-21 21:06:13 | |
Full thread dump Java HotSpot(TM) 64-Bit Server VM (17.1-b03 mixed mode): | |
"Attach Listener" daemon prio=10 tid=0x00000000402b7000 nid=0x60c1 waiting on condition [0x0000000000000000] | |
java.lang.Thread.State: RUNNABLE | |
Locked ownable synchronizers: | |
- None | |
"HBaseClient@647057258 shutdown" prio=10 tid=0x00000000402ea000 nid=0x60a1 waiting on condition [0x0000000040927000] | |
java.lang.Thread.State: TIMED_WAITING (parking) | |
at sun.misc.Unsafe.park(Native Method) | |
- parking to wait for <0x00002aab5fea21a8> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) | |
at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:198) | |
at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2025) | |
at java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1253) | |
at org.jboss.netty.util.internal.ExecutorUtil.terminate(ExecutorUtil.java:87) | |
at org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory.releaseExternalResources(NioClientSocketChannelFactory.java:143) | |
at org.hbase.async.HBaseClient$1ShutdownThread.run(HBaseClient.java:502) | |
Locked ownable synchronizers: | |
- None | |
"New I/O client worker #1-3" prio=10 tid=0x00000000402e7000 nid=0x60a0 runnable [0x00000000405ab000] | |
java.lang.Thread.State: RUNNABLE | |
at sun.nio.ch.IOUtil.drain(Native Method) | |
at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:76) | |
- locked <0x00002aab5feaa3f8> (a java.lang.Object) | |
at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69) | |
- locked <0x00002aab5feaa130> (a sun.nio.ch.Util$1) | |
- locked <0x00002aab5feaa118> (a java.util.Collections$UnmodifiableSet) | |
- locked <0x00002aab5fea3190> (a sun.nio.ch.EPollSelectorImpl) | |
at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80) | |
at org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:38) | |
at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:164) | |
at org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) | |
at org.jboss.netty.util.internal.IoWorkerRunnable.run(IoWorkerRunnable.java:46) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908) | |
at java.lang.Thread.run(Thread.java:662) | |
Locked ownable synchronizers: | |
- <0x00002aab5fea2c98> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) | |
"New I/O client worker #1-2" prio=10 tid=0x0000000040113000 nid=0x609f runnable [0x0000000040c8b000] | |
java.lang.Thread.State: RUNNABLE | |
at sun.nio.ch.IOUtil.drain(Native Method) | |
at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:76) | |
- locked <0x00002aab5fea85d0> (a java.lang.Object) | |
at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69) | |
- locked <0x00002aab5fea8308> (a sun.nio.ch.Util$1) | |
- locked <0x00002aab5fea82f0> (a java.util.Collections$UnmodifiableSet) | |
- locked <0x00002aab5fea6658> (a sun.nio.ch.EPollSelectorImpl) | |
at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80) | |
at org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:38) | |
at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:164) | |
at org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108) | |
at org.jboss.netty.util.internal.IoWorkerRunnable.run(IoWorkerRunnable.java:46) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886) | |
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908) | |
at java.lang.Thread.run(Thread.java:662) | |
Locked ownable synchronizers: | |
- <0x00002aab5fea2548> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) | |
"DestroyJavaVM" prio=10 tid=0x00002aabb8192000 nid=0x6089 waiting on condition [0x0000000000000000] | |
java.lang.Thread.State: RUNNABLE | |
Locked ownable synchronizers: | |
- None | |
"Low Memory Detector" daemon prio=10 tid=0x00000000401ad800 nid=0x6098 runnable [0x0000000000000000] | |
java.lang.Thread.State: RUNNABLE | |
Locked ownable synchronizers: | |
- None | |
"CompilerThread1" daemon prio=10 tid=0x00000000401a9000 nid=0x6097 waiting on condition [0x0000000000000000] | |
java.lang.Thread.State: RUNNABLE | |
Locked ownable synchronizers: | |
- None | |
"CompilerThread0" daemon prio=10 tid=0x00000000401a6000 nid=0x6096 waiting on condition [0x0000000000000000] | |
java.lang.Thread.State: RUNNABLE | |
Locked ownable synchronizers: | |
- None | |
"Signal Dispatcher" daemon prio=10 tid=0x00000000401a3800 nid=0x6095 runnable [0x0000000000000000] | |
java.lang.Thread.State: RUNNABLE | |
Locked ownable synchronizers: | |
- None | |
"Finalizer" daemon prio=10 tid=0x000000004017e800 nid=0x6094 in Object.wait() [0x000000004159f000] | |
java.lang.Thread.State: WAITING (on object monitor) | |
at java.lang.Object.wait(Native Method) | |
- waiting on <0x00002aab5ff39130> (a java.lang.ref.ReferenceQueue$Lock) | |
at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:118) | |
- locked <0x00002aab5ff39130> (a java.lang.ref.ReferenceQueue$Lock) | |
at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:134) | |
at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159) | |
Locked ownable synchronizers: | |
- None | |
"Reference Handler" daemon prio=10 tid=0x000000004017c800 nid=0x6093 in Object.wait() [0x00000000413b5000] | |
java.lang.Thread.State: WAITING (on object monitor) | |
at java.lang.Object.wait(Native Method) | |
- waiting on <0x00002aab5ff38c30> (a java.lang.ref.Reference$Lock) | |
at java.lang.Object.wait(Object.java:485) | |
at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116) | |
- locked <0x00002aab5ff38c30> (a java.lang.ref.Reference$Lock) | |
Locked ownable synchronizers: | |
- None | |
"VM Thread" prio=10 tid=0x0000000040178000 nid=0x6092 runnable | |
"GC task thread#0 (ParallelGC)" prio=10 tid=0x0000000040126000 nid=0x608a runnable | |
"GC task thread#1 (ParallelGC)" prio=10 tid=0x0000000040128000 nid=0x608b runnable | |
"GC task thread#2 (ParallelGC)" prio=10 tid=0x0000000040129800 nid=0x608c runnable | |
"GC task thread#3 (ParallelGC)" prio=10 tid=0x000000004012b800 nid=0x608d runnable | |
"GC task thread#4 (ParallelGC)" prio=10 tid=0x000000004012d800 nid=0x608e runnable | |
"GC task thread#5 (ParallelGC)" prio=10 tid=0x000000004012f000 nid=0x608f runnable | |
"GC task thread#6 (ParallelGC)" prio=10 tid=0x0000000040131000 nid=0x6090 runnable | |
"GC task thread#7 (ParallelGC)" prio=10 tid=0x0000000040133000 nid=0x6091 runnable | |
"VM Periodic Task Thread" prio=10 tid=0x00000000401b8000 nid=0x6099 waiting on condition | |
JNI global references: 1494 | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
More different kinds of stack traces:
~ $ jstack -l 1435 08:35:27
2010-11-22 08:35:29
Full thread dump Java HotSpot(TM) 64-Bit Server VM (17.1-b03 mixed mode):