[WIP] *: Fully concurrent large reads #10472

Closed
wants to merge 8 commits
9 changes: 6 additions & 3 deletions embed/config.go
@@ -75,6 +75,8 @@ const (
// maxElectionMs specifies the maximum value of election timeout.
// More details are listed in ../Documentation/tuning.md#time-parameters.
maxElectionMs = 50000

DefaultBackendExpensiveReadLimit = 1000
)

var (
@@ -270,9 +272,10 @@ type Config struct {
AuthToken string `json:"auth-token"`
BcryptCost uint `json:"bcrypt-cost"`

ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"`
ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"`
ExperimentalEnableV2V3 string `json:"experimental-enable-v2v3"`
ExperimentalInitialCorruptCheck bool `json:"experimental-initial-corrupt-check"`
ExperimentalCorruptCheckTime time.Duration `json:"experimental-corrupt-check-time"`
ExperimentalEnableV2V3 string `json:"experimental-enable-v2v3"`
ExperimentalBackendExpensiveReadLimit int `json:"experimental-backend-expensive-read-limit"`

// ForceNewCluster starts a new cluster even if previously started; unsafe.
ForceNewCluster bool `json:"force-new-cluster"`
1 change: 1 addition & 0 deletions embed/etcd.go
@@ -201,6 +201,7 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) {
Debug: cfg.Debug,
ForceNewCluster: cfg.ForceNewCluster,
EnableGRPCGateway: cfg.EnableGRPCGateway,
BackendExpensiveReadLimit: cfg.ExperimentalBackendExpensiveReadLimit,
}
print(e.cfg.logger, *cfg, srvcfg, memberInitialized)
if e.Server, err = etcdserver.NewServer(srvcfg); err != nil {
1 change: 1 addition & 0 deletions etcdmain/config.go
@@ -248,6 +248,7 @@ func newConfig() *config {
fs.BoolVar(&cfg.ec.ExperimentalInitialCorruptCheck, "experimental-initial-corrupt-check", cfg.ec.ExperimentalInitialCorruptCheck, "Enable to check data corruption before serving any client/peer traffic.")
fs.DurationVar(&cfg.ec.ExperimentalCorruptCheckTime, "experimental-corrupt-check-time", cfg.ec.ExperimentalCorruptCheckTime, "Duration of time between cluster corruption check passes.")
fs.StringVar(&cfg.ec.ExperimentalEnableV2V3, "experimental-enable-v2v3", cfg.ec.ExperimentalEnableV2V3, "v3 prefix for serving emulated v2 state.")
fs.IntVar(&cfg.ec.ExperimentalBackendExpensiveReadLimit, "experimental-backend-expensive-read-limit", embed.DefaultBackendExpensiveReadLimit, "The number of keys in an expensive read request in the etcd backend.")

// unsafe
fs.BoolVar(&cfg.ec.ForceNewCluster, "force-new-cluster", false, "Force to create a new one member cluster.")
2 changes: 2 additions & 0 deletions etcdmain/help.go
@@ -200,6 +200,8 @@ Experimental feature:
Duration of time between cluster corruption check passes.
--experimental-enable-v2v3 ''
Serve v2 requests through the v3 backend under a given prefix.
--experimental-backend-expensive-read-limit '1000'
The number of keys in an expensive read request in the etcd backend.

Unsafe feature:
--force-new-cluster 'false'
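For illustration, here is a minimal sketch of setting the new knob when running etcd embedded. It assumes only the ExperimentalBackendExpensiveReadLimit field added to embed.Config in this PR; the data directory and the value 5000 are arbitrary, and the same setting is exposed on the command line as --experimental-backend-expensive-read-limit.

```go
package main

import (
	"log"

	"go.etcd.io/etcd/embed"
)

func main() {
	cfg := embed.NewConfig()
	cfg.Dir = "default.etcd"
	// Experimental knob added by this PR; defaults to
	// embed.DefaultBackendExpensiveReadLimit (1000).
	cfg.ExperimentalBackendExpensiveReadLimit = 5000

	e, err := embed.StartEtcd(cfg)
	if err != nil {
		log.Fatal(err)
	}
	defer e.Close()
	<-e.Server.ReadyNotify()
	log.Println("etcd server is ready")
}
```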
1 change: 1 addition & 0 deletions etcdserver/backend.go
@@ -48,6 +48,7 @@ func newBackend(cfg ServerConfig) backend.Backend {
// permit 10% excess over quota for disarm
bcfg.MmapSize = uint64(cfg.QuotaBackendBytes + cfg.QuotaBackendBytes/10)
}
bcfg.ExpensiveReadLimit = cfg.BackendExpensiveReadLimit
return backend.New(bcfg)
}

2 changes: 2 additions & 0 deletions etcdserver/config.go
@@ -150,6 +150,8 @@ type ServerConfig struct {
LeaseCheckpointInterval time.Duration

EnableGRPCGateway bool

BackendExpensiveReadLimit int
}

// VerifyBootstrap sanity-checks the initial config for bootstrap case
2 changes: 1 addition & 1 deletion etcdserver/v3_server.go
@@ -85,12 +85,12 @@ type Authenticator interface {
}

func (s *EtcdServer) Range(ctx context.Context, r *pb.RangeRequest) (*pb.RangeResponse, error) {

var resp *pb.RangeResponse
var err error
defer func(start time.Time) {
warnOfExpensiveReadOnlyRangeRequest(s.getLogger(), start, r, resp, err)
}(time.Now())

if !r.Serializable {
err = s.linearizableReadNotify(ctx)
if err != nil {
31 changes: 31 additions & 0 deletions mvcc/backend/backend.go
@@ -46,9 +46,18 @@ var (

// minSnapshotWarningTimeout is the minimum threshold to trigger a long running snapshot warning.
minSnapshotWarningTimeout = 30 * time.Second

// maxConcurrentCommittedReadTxs is the maximum number of committed-read bbolt transactions
// open at any time. When this limit is hit, committed read transaction requests must wait.
maxConcurrentCommittedReadTxs = uint64(10)
)

type Backend interface {
// CommittedReadTx returns a non-blocking read tx that is suitable for large reads.
// The CommittedReadTx call itself does not return until the current BatchTx has been
// committed, to ensure consistency.
CommittedReadTx() ReadTx

ReadTx() ReadTx
BatchTx() BatchTx

@@ -66,6 +75,7 @@ type Backend interface {
Defrag() error
ForceCommit()
Close() error
ExpensiveReadLimit() int
}

type Snapshot interface {
@@ -97,10 +107,14 @@ type backend struct {

readTx *readTx

committedReadScheduler *concurrentReadScheduler

stopc chan struct{}
donec chan struct{}

lg *zap.Logger

expensiveReadLimit int
}

type BackendConfig struct {
@@ -114,6 +128,8 @@ type BackendConfig struct {
MmapSize uint64
// Logger logs backend-side operations.
Logger *zap.Logger
// ExpensiveReadLimit is the number of keys in an expensive read request.
ExpensiveReadLimit int
}

func DefaultBackendConfig() BackendConfig {
@@ -169,8 +185,11 @@ func newBackend(bcfg BackendConfig) *backend {
donec: make(chan struct{}),

lg: bcfg.Logger,

expensiveReadLimit: bcfg.ExpensiveReadLimit,
}
b.batchTx = newBatchTxBuffered(b)
b.committedReadScheduler = newConcurrentReadScheduler(b, maxConcurrentCommittedReadTxs)
go b.run()
return b
}
@@ -184,6 +203,10 @@ func (b *backend) BatchTx() BatchTx {

func (b *backend) ReadTx() ReadTx { return b.readTx }

func (b *backend) CommittedReadTx() ReadTx {
return b.committedReadScheduler.RequestConcurrentReadTx()
}

// ForceCommit forces the current batching tx to commit.
func (b *backend) ForceCommit() {
b.batchTx.Commit()
@@ -289,6 +312,7 @@ func (b *backend) SizeInUse() int64 {
func (b *backend) run() {
defer close(b.donec)
t := time.NewTimer(b.batchInterval)
start := time.Now()
defer t.Stop()
for {
select {
@@ -300,7 +324,10 @@ if b.batchTx.safePending() != 0 {
if b.batchTx.safePending() != 0 {
b.batchTx.Commit()
}
batchIntervalSec.Observe(time.Since(start).Seconds())
start = time.Now()
t.Reset(b.batchInterval)
b.committedReadScheduler.BeginConcurrentReadTxs()
}
}

@@ -533,3 +560,7 @@ func (s *snapshot) Close() error {
<-s.donec
return s.Tx.Rollback()
}

func (b *backend) ExpensiveReadLimit() int {
return b.expensiveReadLimit
}
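
As a reading aid, here is a minimal sketch of how a caller might use the new CommittedReadTx. It assumes only the ReadTx interface already exposed by mvcc/backend (Lock, Unlock, UnsafeRange); the helper name and bucket are illustrative, not part of this PR.

```go
package example

import "go.etcd.io/etcd/mvcc/backend"

// largeRange serves a big range read from a committed read transaction so it
// neither holds the shared buffered readTx nor blocks the write path.
func largeRange(be backend.Backend, startKey, endKey []byte) (keys, vals [][]byte) {
	rtx := be.CommittedReadTx() // blocks until the current batch tx has been committed
	rtx.Lock()                  // no-op for the committed (concurrent) read tx
	defer rtx.Unlock()          // rolls back the underlying bbolt read transaction
	return rtx.UnsafeRange([]byte("key"), startKey, endKey, 0)
}
```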
75 changes: 75 additions & 0 deletions mvcc/backend/backend_test.go
@@ -300,6 +300,81 @@ func TestBackendWritebackForEach(t *testing.T) {
}
}

// TestBackendConcurrentReadTx checks if the concurrent tx is created correctly.
func TestBackendConcurrentReadTx(t *testing.T) {
b, tmpPath := NewTmpBackend(2*time.Second, 10000)
defer cleanup(b, tmpPath)

var rtx0 ReadTx
done := make(chan struct{})
go func() {
rtx0 = b.CommittedReadTx()
close(done)
}()

tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
for i := 0; i < 5; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()

select {
case <-done:
t.Fatal("concurrent read tx should block on the last batch tx!")
case <-time.After(time.Second):
}

select {
case <-done:
case <-time.After(4 * time.Second):
t.Fatal("commit the last batched tx should unblock concurrent tx!")
}

rtx0.Lock()
defer rtx0.Unlock()
ks, _ := rtx0.UnsafeRange([]byte("key"), []byte(fmt.Sprintf("%04d", 0)), []byte(fmt.Sprintf("%04d", 5)), 0)
if len(ks) != 5 {
t.Errorf("got %d keys, expect %d", len(ks), 5)
}

// test if we can create concurrent read while the previous read tx is still open
var rtx1 ReadTx
done = make(chan struct{})
go func() {
rtx1 = b.CommittedReadTx()
rtx1.Lock()
rtx1.UnsafeForEach([]byte(""), nil)
rtx1.Unlock()
close(done)
}()
select {
case <-done:
case <-time.After(4 * time.Second):
t.Fatal("cannot create concurrent read")
}

done = make(chan struct{})
// test if we can create concurrent write while the previous read tx is still open
go func() {
tx := b.BatchTx()
tx.Lock()
for i := 0; i < 5; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()
close(done)
}()
select {
case <-done:
case <-time.After(4 * time.Second):
t.Fatal("cannot create concurrent write")
}
}

func cleanup(b Backend, path string) {
b.Close()
os.Remove(path)
72 changes: 72 additions & 0 deletions mvcc/backend/concurrent_read_scheduler.go
@@ -0,0 +1,72 @@
// Copyright 2019 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package backend

// ReadTxRequest is a channel to send a requested ReadTx to when it becomes available.
type ReadTxRequest = chan ReadTx

// concurrentReadScheduler accumulates requests to begin concurrent ReadTxs and waits until
// BeginConcurrentReadTxs is called to begin them. It also limits the number of
// concurrent ReadTxs running at any point in time to the provided maxConcurrentReadTxs.
type concurrentReadScheduler struct {
maxConcurrentReadTxs uint64
readTxCh chan ReadTxRequest
b *backend
pendingReadCounter *GaugedCounter
openReadCounter *GaugedCounter
}

func newConcurrentReadScheduler(b *backend, maxConcurrentReadTxs uint64) *concurrentReadScheduler {
return &concurrentReadScheduler{
maxConcurrentReadTxs: maxConcurrentReadTxs,
readTxCh: make(chan ReadTxRequest),
b: b,
pendingReadCounter: &GaugedCounter{0, pendingReadGauge},
openReadCounter: &GaugedCounter{0, openReadGauge},
}
}

// RequestConcurrentReadTx requests a new ReadTx and blocks until it is available.
func (r *concurrentReadScheduler) RequestConcurrentReadTx() ReadTx {
rch := make(chan ReadTx)
r.pendingReadCounter.Inc()
defer r.pendingReadCounter.Dec()
r.readTxCh <- rch
return <-rch
}

// BeginConcurrentReadTxs begins pending read transactions and sends them
// to the channels of all blocked RequestConcurrentReadTx() callers.
// It ensures no more than maxConcurrentReadTxs are running at the same time.
func (r *concurrentReadScheduler) BeginConcurrentReadTxs() {
// TODO(jpbetz): This has the potential to backlog indefinitely under heavy load.
// If we're going to impose a limit here, we might want to do more to ensure we're
// managing context deadlines and cancellations as well.

concurrentReadTxs := r.openReadCounter.Value()
for i := uint64(0); i < (r.maxConcurrentReadTxs - concurrentReadTxs); i++ {
select {
case rch := <-r.readTxCh:
rtx, err := r.b.db.Begin(false)
if err != nil {
plog.Fatalf("cannot begin read tx (%s)", err)
}
Contributor

Do we want to also add zap here? capnslog will be dropped in 3.4.

Contributor Author

Good catch, I'll add it.

rch <- &MonitoredReadTx{r.openReadCounter, &concurrentReadTx{tx: rtx}}
default:
// no more to create.
return
}
}
}
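
pendingReadGauge, openReadGauge, and GaugedCounter are referenced above but are not part of this excerpt (they presumably come with the PR's metrics changes). Purely as an assumption to make the scheduler readable on its own, GaugedCounter can be pictured as an atomic counter that mirrors its value into a Prometheus gauge:

```go
package backend

import (
	"sync/atomic"

	"github.com/prometheus/client_golang/prometheus"
)

// GaugedCounter is a hypothetical sketch, not the PR's actual definition: a
// counter whose value is also reflected in a Prometheus gauge so that pending
// and open committed read transactions can be observed.
type GaugedCounter struct {
	value uint64
	gauge prometheus.Gauge
}

// Inc increments the counter and the backing gauge.
func (c *GaugedCounter) Inc() {
	atomic.AddUint64(&c.value, 1)
	c.gauge.Inc()
}

// Dec decrements the counter and the backing gauge.
func (c *GaugedCounter) Dec() {
	atomic.AddUint64(&c.value, ^uint64(0)) // wraps around: equivalent to subtracting 1
	c.gauge.Dec()
}

// Value returns the current counter value.
func (c *GaugedCounter) Value() uint64 {
	return atomic.LoadUint64(&c.value)
}
```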
45 changes: 45 additions & 0 deletions mvcc/backend/concurrent_read_tx.go
@@ -0,0 +1,45 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package backend

import (
bolt "go.etcd.io/bbolt"
)

type concurrentReadTx struct {
tx *bolt.Tx
}

func (rt *concurrentReadTx) Lock() {} // no-op: the tx already holds its own bbolt read transaction
func (rt *concurrentReadTx) Unlock() { rt.tx.Rollback() } // closes the underlying bbolt read transaction

func (rt *concurrentReadTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
bucket := rt.tx.Bucket(bucketName)
if bucket == nil {
plog.Fatalf("bucket %s does not exist", bucketName)
Contributor

same

}
return unsafeRange(bucket.Cursor(), key, endKey, limit)
}

func (rt *concurrentReadTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
return unsafeForEach(rt.tx, bucketName, visitor)
}

func (m *MonitoredReadTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
return m.Tx.UnsafeRange(bucketName, key, endKey, limit)
}
func (m *MonitoredReadTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
return m.Tx.UnsafeForEach(bucketName, visitor)
}
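
Only the UnsafeRange and UnsafeForEach forwarders of MonitoredReadTx appear in this excerpt; the struct itself and its Lock/Unlock methods do not. A possible completion, stated as an assumption rather than the PR's actual code, keeps the open-read counter in step with the lifetime of the wrapped transaction:

```go
// Hypothetical completion of MonitoredReadTx; the field order is inferred from
// the positional literal in concurrent_read_scheduler.go.
type MonitoredReadTx struct {
	Counter *GaugedCounter
	Tx      ReadTx
}

// Lock records the read as open and locks the wrapped tx (a no-op for concurrentReadTx).
func (m *MonitoredReadTx) Lock() {
	m.Counter.Inc()
	m.Tx.Lock()
}

// Unlock unlocks the wrapped tx, which rolls back its bbolt read transaction,
// and records the read as closed.
func (m *MonitoredReadTx) Unlock() {
	m.Tx.Unlock()
	m.Counter.Dec()
}
```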