Skip to content

Commit

Permalink
Library to validate and normalize cloud specific tags (#819)
Browse files Browse the repository at this point in the history
## Changes

Prompted by the proposed fix for a tagging-related problem in #810, I
investigated how tag validation works. This turned out to be quite a bit
more complex than anticipated. Tags at the job level (or cluster level)
are passed through to the underlying compute infrastructure and as such
are tested against cloud-specific validation rules. GCP appears to be
the most restrictive. It would be disappointing to always restrict to
`\w+`, so this package implements validation and normalization rules for
each cloud. It can pick the right cloud to use based on a Go SDK
configuration.

## Tests

Exhaustive unit tests. The regular expressions were pulled from #814.
  • Loading branch information
pietern authored and hectorcast-db committed Oct 13, 2023
1 parent f7170dd commit 52d3a5e
Show file tree
Hide file tree
Showing 13 changed files with 532 additions and 0 deletions.
36 changes: 36 additions & 0 deletions libs/tags/aws.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package tags

import (
"regexp"
"unicode"

"golang.org/x/text/unicode/rangetable"
)

// The union of all characters allowed in AWS tags.
// This must be used only after filtering out non-Latin1 characters,
// because the [unicode] classes include non-Latin1 characters.
// It is consumed by replaceNotIn in awsTag below: any rune outside
// this table is rewritten to '_'.
var awsChars = rangetable.Merge(
	unicode.Digit,
	unicode.Space,
	unicode.Letter,
	rangetable.New('+', '-', '=', '.', ':', '/', '@'),
)

// awsTag implements the AWS tag validation and normalization rules.
// Keys are limited to 127 characters and values to 255 characters;
// both may contain letters, digits, spaces, and + - = . : / @,
// restricted to the Latin-1 range.
//
// Fix: keyPattern previously escaped the slash (`\/`) while valuePattern
// did not; the two patterns describe the same character class, so they
// are now written identically (the escape was redundant in Go regexp).
var awsTag = &tag{
	keyLength:  127,
	keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
	keyNormalize: chain(
		// Strip combining marks first (e.g. "café" -> "cafe", per the
		// unit tests), force Latin-1, then replace anything outside
		// the allowed AWS set with '_'.
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(awsChars, '_'),
	),

	valueLength:  255,
	valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(awsChars, '_'),
	),
}
49 changes: 49 additions & 0 deletions libs/tags/aws_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package tags

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

func TestAwsNormalizeKey(t *testing.T) {
	// Each case maps a raw key to its expected normalized form.
	for _, tc := range []struct{ in, want string }{
		{"1 a b c", "1 a b c"},
		{"+-=.:/@?)", "+-=.:/@__"},
		{"test", "test"},
		// Marks are stripped; the non-Latin1 rune becomes an underscore.
		{"café 🍎", "cafe _"},
		// Forbidden characters are replaced with underscores.
		{"café 🍎?", "cafe __"},
	} {
		assert.Equal(t, tc.want, awsTag.NormalizeKey(tc.in))
	}
}

func TestAwsNormalizeValue(t *testing.T) {
	// Values follow the same normalization chain as keys.
	for _, tc := range []struct{ in, want string }{
		{"1 a b c", "1 a b c"},
		{"+-=.:/@?)", "+-=.:/@__"},
		{"test", "test"},
		// Marks are stripped; the non-Latin1 rune becomes an underscore.
		{"café 🍎", "cafe _"},
		// Forbidden characters are replaced with underscores.
		{"café 🍎?", "cafe __"},
	} {
		assert.Equal(t, tc.want, awsTag.NormalizeValue(tc.in))
	}
}

func TestAwsValidateKey(t *testing.T) {
	// Invalid keys, each paired with a fragment of the expected error.
	for _, tc := range []struct{ in, errFragment string }{
		{"", "not be empty"},
		{strings.Repeat("a", 512), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, awsTag.ValidateKey(tc.in), tc.errFragment)
	}

	// A maximum-length key and a normalized unicode key are accepted.
	assert.NoError(t, awsTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateKey(awsTag.NormalizeKey("café 🍎")))
}

func TestAwsValidateValue(t *testing.T) {
	// Invalid values, each paired with a fragment of the expected error.
	// Note: unlike keys, an empty value is not rejected.
	for _, tc := range []struct{ in, errFragment string }{
		{strings.Repeat("a", 512), "length"},
		{"café 🍎", "latin1"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, awsTag.ValidateValue(tc.in), tc.errFragment)
	}

	// A long-but-legal value and a normalized unicode value are accepted.
	assert.NoError(t, awsTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateValue(awsTag.NormalizeValue("café 🍎")))
}
25 changes: 25 additions & 0 deletions libs/tags/azure.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package tags

import (
"regexp"

"golang.org/x/text/unicode/rangetable"
)

// All characters that may not be used in Azure tag keys.
// This mirrors the negated character class in azureTag's keyPattern;
// replaceIn uses it to rewrite these characters to '_'.
var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/', '+', '?')

// azureTag implements the Azure tag validation and normalization rules.
// Keys are more restricted than values: keys may not contain any of the
// characters in azureForbiddenChars, while values only need to fit the
// length limit and the Latin-1 range.
var azureTag = &tag{
	keyLength:  512,
	keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`),
	keyNormalize: chain(
		// Map non-Latin1 runes to '_' first, then replace the
		// Azure-forbidden characters with '_' as well.
		replaceNotIn(latin1, '_'),
		replaceIn(azureForbiddenChars, '_'),
	),

	valueLength: 256,
	// NOTE(review): `.` in Go regexp does not match '\n', so a Latin-1
	// value containing a newline passes valueNormalize untouched but
	// fails this pattern — confirm whether that is intended.
	valuePattern: regexp.MustCompile(`^.*$`),
	valueNormalize: chain(
		replaceNotIn(latin1, '_'),
	),
}
34 changes: 34 additions & 0 deletions libs/tags/azure_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package tags

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

func TestAzureNormalizeKey(t *testing.T) {
	// Accents survive (no mark stripping on Azure); '🍎' and '?' become '_'.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "café __"},
	} {
		assert.Equal(t, tc.want, azureTag.NormalizeKey(tc.in))
	}
}

func TestAzureNormalizeValue(t *testing.T) {
	// Values only replace non-Latin1 runes; '?' is legal in a value.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "café _?"},
	} {
		assert.Equal(t, tc.want, azureTag.NormalizeValue(tc.in))
	}
}

func TestAzureValidateKey(t *testing.T) {
	// Invalid keys, each paired with a fragment of the expected error.
	for _, tc := range []struct{ in, errFragment string }{
		{"", "not be empty"},
		{strings.Repeat("a", 513), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, azureTag.ValidateKey(tc.in), tc.errFragment)
	}

	// A legal-length key and a normalized unicode key are accepted.
	assert.NoError(t, azureTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateKey(azureTag.NormalizeKey("café 🍎")))
}

func TestAzureValidateValue(t *testing.T) {
	// Values are only checked for length and the Latin-1 range.
	for _, tc := range []struct{ in, errFragment string }{
		{strings.Repeat("a", 513), "length"},
		{"café 🍎", "latin"},
	} {
		assert.ErrorContains(t, azureTag.ValidateValue(tc.in), tc.errFragment)
	}

	assert.NoError(t, azureTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateValue(azureTag.NormalizeValue("café 🍎")))
}
32 changes: 32 additions & 0 deletions libs/tags/cloud.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package tags

import "github.com/databricks/databricks-sdk-go/config"

// Cloud abstracts the tag key/value rules of a single cloud provider.
// Each provider imposes its own character set and length limits; the
// Validate methods report violations and the Normalize methods rewrite
// input toward a form that passes validation.
type Cloud interface {
	// ValidateKey checks if a tag key can be used with the cloud provider.
	ValidateKey(key string) error

	// ValidateValue checks if a tag value can be used with the cloud provider.
	ValidateValue(value string) error

	// NormalizeKey normalizes a tag key for the cloud provider.
	NormalizeKey(key string) string

	// NormalizeValue normalizes a tag value for the cloud provider.
	NormalizeValue(value string) string
}

// ForCloud returns the tag validation and normalization rules for the
// cloud provider the given SDK configuration points at. It panics if
// the provider cannot be determined from the configuration.
func ForCloud(cfg *config.Config) Cloud {
	switch {
	case cfg.IsAws():
		return awsTag
	case cfg.IsAzure():
		return azureTag
	case cfg.IsGcp():
		return gcpTag
	}
	panic("unknown cloud provider")
}
32 changes: 32 additions & 0 deletions libs/tags/cloud_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package tags

import (
"testing"

"github.com/databricks/databricks-sdk-go/config"
"github.com/stretchr/testify/assert"
)

func TestForCloudAws(t *testing.T) {
	// An AWS workspace host should select the AWS rule set.
	cfg := &config.Config{Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/"}
	assert.Equal(t, awsTag, ForCloud(cfg))
}

func TestForCloudAzure(t *testing.T) {
	// An Azure workspace host should select the Azure rule set.
	cfg := &config.Config{Host: "https://adb-xxx.y.azuredatabricks.net/"}
	assert.Equal(t, azureTag, ForCloud(cfg))
}

func TestForCloudGcp(t *testing.T) {
	// A GCP workspace host should select the GCP rule set.
	cfg := &config.Config{Host: "https://123.4.gcp.databricks.com/"}
	assert.Equal(t, gcpTag, ForCloud(cfg))
}
63 changes: 63 additions & 0 deletions libs/tags/gcp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package tags

import (
"regexp"
"unicode"
)

// Tag keys and values on GCP are limited to 63 characters and must match the
// regular expression `^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`.
// For normalization, we define one table for the outer characters and
// one table for the inner characters. The outer table is used to trim
// leading and trailing characters, and the inner table is used to
// replace invalid characters with underscores.

// gcpOuter matches the characters allowed at the first and last position
// of a GCP label: ASCII digits and letters only.
var gcpOuter = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '0', Hi: '9', Stride: 1},
		{Lo: 'A', Hi: 'Z', Stride: 1},
		{Lo: 'a', Hi: 'z', Stride: 1},
	},
	// All three ranges fall inside Latin-1.
	LatinOffset: 3,
}

// gcpInner matches the characters allowed in the interior of a GCP label:
// ASCII digits and letters plus dash, period, and underscore.
var gcpInner = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '-', Hi: '-', Stride: 1},
		{Lo: '.', Hi: '.', Stride: 1},
		{Lo: '0', Hi: '9', Stride: 1},
		{Lo: 'A', Hi: 'Z', Stride: 1},
		{Lo: '_', Hi: '_', Stride: 1},
		{Lo: 'a', Hi: 'z', Stride: 1},
	},
	// All six ranges fall inside Latin-1.
	LatinOffset: 6,
}

// gcpTag implements the GCP label validation and normalization rules
// described in the comment at the top of this file.
var gcpTag = &tag{
	keyLength:  63,
	keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`),
	keyNormalize: chain(
		// Strip marks, force Latin-1, map anything outside the inner
		// character set to '_', and finally trim leading/trailing
		// characters that may not appear at the edges.
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),

	valueLength: 63,
	// Same shape as keyPattern, but the whole match is optional:
	// an empty value is allowed while an empty key is not.
	valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),
}
65 changes: 65 additions & 0 deletions libs/tags/gcp_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package tags

import (
"strings"
"testing"
"unicode"

"github.com/stretchr/testify/assert"
)

func TestGcpOuter(t *testing.T) {
	// Only ASCII alphanumerics may start or end a GCP label.
	for _, r := range []rune{'A', 'Z', 'a', 'z', '0', '9'} {
		assert.True(t, unicode.In(r, gcpOuter))
	}
	// Punctuation is never allowed at the edges.
	for _, r := range []rune{'-', '.', '_', '!'} {
		assert.False(t, unicode.In(r, gcpOuter))
	}
}

func TestGcpInner(t *testing.T) {
	// Alphanumerics plus dash, period, and underscore are legal inside.
	for _, r := range []rune{'A', 'Z', 'a', 'z', '0', '9', '-', '.', '_'} {
		assert.True(t, unicode.In(r, gcpInner))
	}
	// Other punctuation is not.
	assert.False(t, unicode.In('!', gcpInner))
}

func TestGcpNormalizeKey(t *testing.T) {
	// Marks are stripped, forbidden runes replaced with '_', and the
	// result trimmed so it begins and ends with an alphanumeric rune.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "cafe"},
		{"__café_foo__", "cafe_foo"},
	} {
		assert.Equal(t, tc.want, gcpTag.NormalizeKey(tc.in))
	}
}

func TestGcpNormalizeValue(t *testing.T) {
	// Values are normalized by the same chain as keys.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "cafe"},
		{"__café_foo__", "cafe_foo"},
	} {
		assert.Equal(t, tc.want, gcpTag.NormalizeValue(tc.in))
	}
}

func TestGcpValidateKey(t *testing.T) {
	// Invalid keys, each paired with a fragment of the expected error.
	for _, tc := range []struct{ in, errFragment string }{
		{"", "not be empty"},
		{strings.Repeat("a", 64), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, gcpTag.ValidateKey(tc.in), tc.errFragment)
	}

	// A legal-length key and a normalized unicode key are accepted.
	assert.NoError(t, gcpTag.ValidateKey(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateKey(gcpTag.NormalizeKey("café 🍎")))
}

func TestGcpValidateValue(t *testing.T) {
	// Invalid values; unlike keys, an empty value is not rejected.
	for _, tc := range []struct{ in, errFragment string }{
		{strings.Repeat("a", 64), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, gcpTag.ValidateValue(tc.in), tc.errFragment)
	}

	assert.NoError(t, gcpTag.ValidateValue(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateValue(gcpTag.NormalizeValue("café 🍎")))
}
11 changes: 11 additions & 0 deletions libs/tags/latin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package tags

import "unicode"

// latin1 matches every code point in the Latin-1 range
// (U+0000 through U+00FF), i.e. everything that fits in one byte.
var latin1 = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x0000, Hi: 0x00ff, Stride: 1},
	},
	LatinOffset: 1,
}
16 changes: 16 additions & 0 deletions libs/tags/latin_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package tags

import (
"testing"
"unicode"

"github.com/stretchr/testify/assert"
)

func TestLatinTable(t *testing.T) {
	// Boundary and interior code points of U+0000..U+00FF are members.
	for _, r := range []rune{'\u0000', 'A', 'Z', '\u00ff'} {
		assert.True(t, unicode.In(r, latin1))
	}
	// The first code point past Latin-1 is not.
	assert.False(t, unicode.In('\u0100', latin1))
}
Loading

0 comments on commit 52d3a5e

Please sign in to comment.