-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Query: Add servername to grpc dial options for DNS stores (Fixes #1507) #2407
Changes from all commits
57757c1
e80c342
128242f
c203179
909189c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,9 @@ import ( | |
"github.com/thanos-io/thanos/pkg/runutil" | ||
"github.com/thanos-io/thanos/pkg/store" | ||
"github.com/thanos-io/thanos/pkg/store/storepb" | ||
"github.com/thanos-io/thanos/pkg/tls" | ||
"google.golang.org/grpc" | ||
"google.golang.org/grpc/credentials" | ||
) | ||
|
||
const ( | ||
|
@@ -38,6 +40,8 @@ type StoreSpec interface { | |
Metadata(ctx context.Context, client storepb.StoreClient) (labelSets []storepb.LabelSet, mint int64, maxt int64, storeType component.StoreAPI, err error) | ||
// StrictStatic returns true if the StoreAPI has been statically defined and it is under a strict mode. | ||
StrictStatic() bool | ||
// ServerName returns the StoreAPI's server name for the store spec. | ||
ServerName() string | ||
} | ||
|
||
type StoreStatus struct { | ||
|
@@ -53,12 +57,19 @@ type StoreStatus struct { | |
type grpcStoreSpec struct { | ||
addr string | ||
strictstatic bool | ||
serverName string | ||
} | ||
|
||
// NewGRPCStoreSpecServerName creates store pure gRPC spec with a server name. | ||
// It uses Info gRPC call to get Metadata. | ||
func NewGRPCStoreSpecServerName(addr string, strictstatic bool, serverName string) StoreSpec { | ||
return &grpcStoreSpec{addr: addr, strictstatic: strictstatic, serverName: serverName} | ||
} | ||
|
||
// NewGRPCStoreSpec creates store pure gRPC spec. | ||
// It uses Info gRPC call to get Metadata. | ||
func NewGRPCStoreSpec(addr string, strictstatic bool) StoreSpec { | ||
return &grpcStoreSpec{addr: addr, strictstatic: strictstatic} | ||
return &grpcStoreSpec{addr: addr, strictstatic: strictstatic, serverName: ""} | ||
} | ||
|
||
// StrictStatic returns true if the StoreAPI has been statically defined and it is under a strict mode. | ||
|
@@ -71,6 +82,10 @@ func (s *grpcStoreSpec) Addr() string { | |
return s.addr | ||
} | ||
|
||
func (s *grpcStoreSpec) ServerName() string { | ||
return s.serverName | ||
} | ||
|
||
// Metadata method for gRPC store API tries to reach host Info method until context timeout. If we are unable to get metadata after | ||
// that time, we assume that the host is unhealthy and return error. | ||
func (s *grpcStoreSpec) Metadata(ctx context.Context, client storepb.StoreClient) (labelSets []storepb.LabelSet, mint int64, maxt int64, storeType component.StoreAPI, err error) { | ||
|
@@ -165,6 +180,10 @@ type StoreSet struct { | |
storeSpecs func() []StoreSpec | ||
dialOpts []grpc.DialOption | ||
gRPCInfoCallTimeout time.Duration | ||
dnsServerName bool | ||
cert string | ||
key string | ||
caCert string | ||
|
||
updateMtx sync.Mutex | ||
storesMtx sync.RWMutex | ||
|
@@ -186,6 +205,10 @@ func NewStoreSet( | |
storeSpecs func() []StoreSpec, | ||
dialOpts []grpc.DialOption, | ||
unhealthyStoreTimeout time.Duration, | ||
dnsServerName bool, | ||
cert string, | ||
key string, | ||
caCert string, | ||
) *StoreSet { | ||
storesMetric := newStoreSetNodeCollector() | ||
if reg != nil { | ||
|
@@ -208,6 +231,10 @@ func NewStoreSet( | |
stores: make(map[string]*storeRef), | ||
storeStatuses: make(map[string]*StoreStatus), | ||
unhealthyStoreTimeout: unhealthyStoreTimeout, | ||
dnsServerName: dnsServerName, | ||
cert: cert, | ||
key: key, | ||
caCert: caCert, | ||
} | ||
return ss | ||
} | ||
|
@@ -420,7 +447,17 @@ func (s *StoreSet) getActiveStores(ctx context.Context, stores map[string]*store | |
st, seenAlready := stores[addr] | ||
if !seenAlready { | ||
// New store or was unactive and was removed in the past - create new one. | ||
conn, err := grpc.DialContext(ctx, addr, s.dialOpts...) | ||
dialOpts := s.dialOpts | ||
// Update the dialOpts if we are asked to add dnsServerName. | ||
if s.dnsServerName { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm this seems a bit odd. The NewStoreSet func already accepts a slice of dialopts. I think we should expect that the dialopts has the correct WithTransportCredentials already set, no? ie correctly configure this option in cmd/Thanos/query rather than leaking configuration into this struct. Also, it looks to me like the server name is already set in the dialopts generated with extgrpc https://github.com/j3p0uk/thanos/blob/40526f52f54d4501737e5246c0e71e56dd7e0b2d/pkg/extgrpc/client.go#L56 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, taking out this new dial opt and server name bool from this struct would eliminate the need for extending the StoreSpec interface and adding the extra field to the grpcStoreSpec struct There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The bug, as it stands, is that the arguments provided to the query at start time do not allow for individual server names per configured store. The store set gets populated with the resolved addresses of the stores in the store spec and not the original server name, as far as I could tell. Given this, I attempted to pass minimal data through to generate the TLS portion of the dialOpts on demand at connection time by retrieving the server name from the dns provider. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed this is a bit weird because with this we not only have two places where these options are set but also this newly introduced option overrides the other one. It should probably be refactored so that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, happy to discuss further. My thoughts are that this looks odd because a StoreSet is a set of Stores, and the code assumes that each of the stores shares a particular set of characteristics, which may not be true (and is the basis for this bug fix). The StoreSet currently holds attributes that can differ on a per-store basis. The ServerName is one such attribute. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but I think we need to refactor this because the current approach causes the StoreSet to have duplicate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Chat in slack... Sounds like implementation of this would be something like removing dialOpts from the StoreSet, and adding it to the StoreSpec. For the SRV case, I think the correct behaviour would be to enable lookup of the A record resolved from the SRV lookup. This data is currently lost during resolution, as only the IPs are returned to the provider from resolver, and stored under the key of the original store spec. I can drop the “--grpc-client-dns-server-name” flag and then use “dns” in the “--grpc-client-server-name” to trigger addition of server name from DNS in the dialOpts, as “dns” is likely an illegal server name anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
tlsCfg, err := tls.NewClientConfig(s.logger, s.cert, s.key, s.caCert, spec.ServerName()) | ||
if err != nil { | ||
level.Warn(s.logger).Log("msg", "update of store node failed", "err", errors.Wrap(err, "setting TLS"), "address", addr) | ||
return | ||
} | ||
dialOpts = append(s.dialOpts, grpc.WithTransportCredentials(credentials.NewTLS(tlsCfg))) | ||
} | ||
conn, err := grpc.DialContext(ctx, addr, dialOpts...) | ||
if err != nil { | ||
s.updateStoreStatus(&storeRef{addr: addr}, err) | ||
level.Warn(s.logger).Log("msg", "update of store node failed", "err", errors.Wrap(err, "dialing connection"), "address", addr) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we just use serverName provided from
--grpc-client-server-name
instead of relying on dns?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, the code as-is will use a provided server-name (
--grpc-client-server-name
). This works when using a single store from the querier. But adding multiple store options to collect data from many stores will not be able to use individual server names as the code currently stands.The change intended here is to allow for multiple stores to each use different server names.