-
Notifications
You must be signed in to change notification settings - Fork 24.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Enable acked indexing #17038
Enable acked indexing #17038
Changes from 17 commits
74194b8
57501ce
63ada98
25fae03
55465dd
51f2c3c
746ca07
4e39359
673a73d
285c3bf
7bb85e4
3353790
76465ec
4a7524f
2bd09c4
563304d
bd9e908
4793630
14ba0c3
37d739a
97be383
4e1f62e
2a93889
5576526
4de57fc
85d3d51
0e5b22a
c4324f9
649bcdc
27448dc
1f12bee
8b970d9
c7c8b1d
e201f5c
cffc315
7cdd647
3abf817
c2ed5a1
66cc202
95feb40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,6 +80,7 @@ | |
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.ConcurrentMap; | ||
import java.util.concurrent.atomic.AtomicBoolean; | ||
import java.util.concurrent.atomic.AtomicInteger; | ||
|
@@ -403,7 +404,7 @@ protected void responseWithFailure(Throwable t) { | |
protected void doRun() throws Exception { | ||
setPhase(task, "replica"); | ||
assert request.shardId() != null : "request shardId must be set"; | ||
try (Releasable ignored = getIndexShardReferenceOnReplica(request.shardId())) { | ||
try (Releasable ignored = getIndexShardReferenceOnReplica(request.shardId(), request)) { | ||
shardOperationOnReplica(request); | ||
if (logger.isTraceEnabled()) { | ||
logger.trace("action [{}] completed on shard [{}] for request [{}]", transportReplicaAction, request.shardId(), request); | ||
|
@@ -706,7 +707,7 @@ protected void doRun() throws Exception { | |
return; | ||
} | ||
// closed in finishAsFailed(e) in the case of error | ||
indexShardReference = getIndexShardReferenceOnPrimary(shardId); | ||
indexShardReference = getIndexShardReferenceOnPrimary(shardId, request); | ||
if (indexShardReference.isRelocated() == false) { | ||
executeLocally(); | ||
|
||
|
@@ -820,24 +821,64 @@ void finishBecauseUnavailable(ShardId shardId, String message) { | |
} | ||
} | ||
|
||
|
||
static ConcurrentMap<IndexShardReference, String> openShardReferences; | ||
|
||
static boolean setupShardReferenceAssertions() { | ||
openShardReferences = new ConcurrentHashMap<>(); | ||
return true; | ||
} | ||
|
||
static boolean addShardReference(IndexShardReference ref, String desc) { | ||
String prev = openShardReferences.put(ref, desc); | ||
if (prev != null) { | ||
throw new AssertionError("shard ref " + ref + " is added twice. current [" + desc + "] prev [" + prev + "]"); | ||
} | ||
return true; | ||
} | ||
|
||
static boolean removeShardReference(IndexShardReference ref) { | ||
assert openShardReferences.remove(ref) != null : "failed to find ref [" + ref + "]"; | ||
return true; | ||
} | ||
|
||
static { | ||
assert setupShardReferenceAssertions(); | ||
} | ||
|
||
static public void assertAllShardReferencesAreCleaned() { | ||
if (openShardReferences == null || openShardReferences.isEmpty()) { | ||
return; | ||
} | ||
StringBuilder sb = new StringBuilder(); | ||
for (String desc : openShardReferences.values()) { | ||
sb.append(desc).append("\n"); | ||
} | ||
assert sb.length() == 0 : "Found unclosed shard references:\n" + sb; | ||
} | ||
|
||
/** | ||
* returns a new reference to {@link IndexShard} to perform a primary operation. Released after performing primary operation locally | ||
* and replication of the operation to all replica shards is completed / failed (see {@link ReplicationPhase}). | ||
*/ | ||
protected IndexShardReference getIndexShardReferenceOnPrimary(ShardId shardId) { | ||
protected IndexShardReference getIndexShardReferenceOnPrimary(ShardId shardId, Request request) { | ||
IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex()); | ||
IndexShard indexShard = indexService.getShard(shardId.id()); | ||
return new IndexShardReferenceImpl(indexShard, true); | ||
IndexShardReference ref = new IndexShardReferenceImpl(indexShard, true); | ||
assert addShardReference(ref, "primary: " + request.toString() + " " + ref.routingEntry()); | ||
return ref; | ||
} | ||
|
||
/** | ||
* returns a new reference to {@link IndexShard} on a node that the request is replicated to. The reference is closed as soon as | ||
* replication is completed on the node. | ||
*/ | ||
protected IndexShardReference getIndexShardReferenceOnReplica(ShardId shardId) { | ||
protected IndexShardReference getIndexShardReferenceOnReplica(ShardId shardId, ReplicaRequest request) { | ||
IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex()); | ||
IndexShard indexShard = indexService.getShard(shardId.id()); | ||
return new IndexShardReferenceImpl(indexShard, false); | ||
IndexShardReference ref = new IndexShardReferenceImpl(indexShard, false); | ||
assert addShardReference(ref, "replica: " + request.toString() + " " + ref.routingEntry()); | ||
return ref; | ||
} | ||
|
||
/** | ||
|
@@ -1018,30 +1059,36 @@ public void handleException(TransportException exp) { | |
String message = String.format(Locale.ROOT, "failed to perform %s on replica on node %s", transportReplicaAction, node); | ||
logger.warn("[{}] {}", exp, shardId, message); | ||
shardStateAction.shardFailed( | ||
shard, | ||
indexShardReference.routingEntry(), | ||
message, | ||
exp, | ||
new ShardStateAction.Listener() { | ||
@Override | ||
public void onSuccess() { | ||
onReplicaFailure(nodeId, exp); | ||
} | ||
|
||
@Override | ||
public void onFailure(Throwable shardFailedError) { | ||
if (shardFailedError instanceof ShardStateAction.NoLongerPrimaryShardException) { | ||
ShardRouting primaryShard = indexShardReference.routingEntry(); | ||
String message = String.format(Locale.ROOT, "primary shard [%s] was demoted while failing replica shard [%s] for [%s]", primaryShard, shard, exp); | ||
// we are no longer the primary, fail ourselves and start over | ||
indexShardReference.failShard(message, shardFailedError); | ||
forceFinishAsFailed(new RetryOnPrimaryException(shardId, message, shardFailedError)); | ||
} else { | ||
assert false : shardFailedError; | ||
shard, | ||
indexShardReference.routingEntry(), | ||
message, | ||
exp, | ||
new ShardStateAction.Listener() { | ||
@Override | ||
public void onSuccess() { | ||
onReplicaFailure(nodeId, exp); | ||
} | ||
|
||
@Override | ||
public void onFailure(Throwable shardFailedError) { | ||
if (shardFailedError instanceof ShardStateAction.NoLongerPrimaryShardException) { | ||
String message = "unknown"; | ||
try { | ||
ShardRouting primaryShard = indexShardReference.routingEntry(); | ||
message = String.format(Locale.ROOT, "primary shard [%s] was demoted while failing replica shard [%s] for [%s]", primaryShard, shard, exp); | ||
// we are no longer the primary, fail ourselves and start over | ||
indexShardReference.failShard(message, shardFailedError); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we open a follow-up issue that failShard should throw an already-closed exception? (This is really what the code protects against.) To be clear - I think the code can stay. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I opened #17366. |
||
} catch (Throwable t) { | ||
shardFailedError.addSuppressed(t); | ||
} | ||
forceFinishAsFailed(new RetryOnPrimaryException(shardId, message, shardFailedError)); | ||
} else { | ||
assert shardFailedError instanceof TransportException || | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a comment about where these exceptions can come from? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pushed 4e1f62e. |
||
shardFailedError instanceof NodeClosedException : shardFailedError; | ||
onReplicaFailure(nodeId, exp); | ||
} | ||
} | ||
} | ||
} | ||
); | ||
} | ||
} | ||
|
@@ -1125,7 +1172,9 @@ protected boolean shouldExecuteReplication(Settings settings) { | |
|
||
interface IndexShardReference extends Releasable { | ||
boolean isRelocated(); | ||
|
||
void failShard(String reason, @Nullable Throwable e); | ||
|
||
ShardRouting routingEntry(); | ||
} | ||
|
||
|
@@ -1146,6 +1195,7 @@ static final class IndexShardReferenceImpl implements IndexShardReference { | |
@Override | ||
public void close() { | ||
operationLock.close(); | ||
assert removeShardReference(this); | ||
} | ||
|
||
@Override | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,9 +223,10 @@ protected void doStop() { | |
FutureUtils.cancel(this.reconnectToNodes); | ||
for (NotifyTimeout onGoingTimeout : onGoingTimeouts) { | ||
onGoingTimeout.cancel(); | ||
onGoingTimeout.listener.onClose(); | ||
} | ||
ThreadPool.terminate(updateTasksExecutor, 10, TimeUnit.SECONDS); | ||
postAppliedListeners.stream().filter(listener -> listener instanceof TimeoutClusterStateListener) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a comment on why we do it out of the loop? (i.e. loop again) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: this was lost, but I think it is a good thing to have. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I pushed 8b970d9. |
||
.forEach(listener -> ((TimeoutClusterStateListener) listener).onClose()); | ||
remove(localNodeMasterListeners); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,7 @@ | |
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.concurrent.CountDownLatch; | ||
|
@@ -402,6 +403,18 @@ void validateIncomingState(ClusterState incomingState, ClusterState lastSeenClus | |
} | ||
|
||
ZenDiscovery.validateStateIsFromCurrentMaster(logger, currentNodes, incomingState); | ||
if (lastSeenClusterState != null && lastSeenClusterState.supersedes(incomingState)) { | ||
final String message = String.format( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is very close to the new check. I'm not sure we need it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check rejects incoming states from the same master that are out of order. I do not think that the new check covers this case. I think it's needed? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With the new check (based on the ClusterService.state()), a cluster state with a lower/equal version than the current state will be rejected. If it has a higher version it will get in. If it is lower than the last seen cluster state it means that it will be cleaned when the first of these happen:
Is there a case where the lastSeenClusterState.supersedes based protection helps and isn't covered by the above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Just to be clear, this is only if we are following a master. The line of code in question will reject older states from the same master that are out of order.
There is not. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
Locale.ROOT, | ||
"received older cluster state version [%s] with uuid [%s] than last seen cluster state [%s] with uuid [%s]", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we add that it came from the current master? Also - thinking about this more - this can only happen in our testing, right? In practice we only have 1 channel (if someone doesn't mess with some settings). Maybe add a comment about it if so... There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pushed 85d3d51. |
||
incomingState.version(), | ||
incomingState.stateUUID(), | ||
lastSeenClusterState.version(), | ||
lastSeenClusterState.stateUUID() | ||
); | ||
logger.warn(message); | ||
throw new IllegalStateException(message); | ||
} | ||
} | ||
|
||
protected void handleCommitRequest(CommitClusterStateRequest request, final TransportChannel channel) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -759,7 +759,13 @@ public void releaseSnapshot(IndexCommit snapshot) throws IOException { | |
*/ | ||
public void failShard(String reason, @Nullable Throwable e) { | ||
// fail the engine. This will cause this shard to also be removed from the node's index service. | ||
getEngine().failEngine(reason, e); | ||
final Engine engine = getEngineOrNull(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Haha. I fixed this inline - forgot :) I think this should be a separate PR - it's not really needed for this one (we protect against it). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reverted in 5576526. | ||
||
if (engine == null) { | ||
logger.trace("ignoring request to fail the shard, we're already closed. (reason: [{}])", e, reason); | ||
|
||
} else { | ||
engine.failEngine(reason, e); | ||
} | ||
} | ||
|
||
public Engine.Searcher acquireSearcher(String source) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This feels way more natural on the IndexShard level - I felt that way when I made it but couldn't find a way (at the time) to push it to IndexShard while getting the debugging information to be used as the string value here. We can pass it to IndexShard.acquirePrimaryOperationLock and acquireReplicaOperationLock but that will force a toString on the request object. How about making the reason an Object type and only call toString on it in IndexShard?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
or even better a reason supplier called on demand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree that it's more natural there, but I prefer to keep it where it is until there's a clear need for it at the lower level.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Guys, we can't have static maps on classes like this. If we want some kind of static assertions we should inject some mock services, just as we do for `SearchContext` in `MockSearchService`. I am a bit confused why we have a static factory method on literally the only impl of an interface in `IndexShardReferenceImpl` — this smells like a design flaw to me. I think together with this change we should rather expose a pluggable service, or move the creation into `IndexService` and let folks plug in a factory via `IndexModule` in order to implement this assertion. Static per-JVM concurrent maps seem like playing Russian roulette.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed in 0e5b22a.