diff --git a/libs/tags/aws.go b/libs/tags/aws.go
new file mode 100644
index 0000000000..44d69c683e
--- /dev/null
+++ b/libs/tags/aws.go
@@ -0,0 +1,36 @@
+package tags
+
+import (
+	"regexp"
+	"unicode"
+
+	"golang.org/x/text/unicode/rangetable"
+)
+
+// awsChars is the union of all characters allowed in AWS tag keys and values.
+// It may be applied only after filtering out non-Latin1 characters, because
+// the [unicode] classes used here also include characters beyond Latin1.
+var awsChars = rangetable.Merge(
+	unicode.Digit,
+	unicode.Space,
+	unicode.Letter,
+	rangetable.New('+', '-', '=', '.', ':', '/', '@'),
+)
+
+var awsTag = &tag{
+	keyLength:  127,
+	keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:\/@]*$`),
+	keyNormalize: chain(
+		normalizeMarks(),
+		replaceNotIn(latin1, '_'),
+		replaceNotIn(awsChars, '_'),
+	),
+
+	valueLength:  255,
+	valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
+	valueNormalize: chain(
+		normalizeMarks(),
+		replaceNotIn(latin1, '_'),
+		replaceNotIn(awsChars, '_'),
+	),
+}
diff --git a/libs/tags/aws_test.go b/libs/tags/aws_test.go
new file mode 100644
index 0000000000..2a2bb7e7bd
--- /dev/null
+++ b/libs/tags/aws_test.go
@@ -0,0 +1,49 @@
+package tags
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestAwsNormalizeKey(t *testing.T) {
+	assert.Equal(t, "1 a b c", awsTag.NormalizeKey("1 a b c"))
+	assert.Equal(t, "+-=.:/@__", awsTag.NormalizeKey("+-=.:/@?)"))
+	assert.Equal(t, "test", awsTag.NormalizeKey("test"))
+
+	// Remove marks; unicode becomes underscore.
+	assert.Equal(t, "cafe _", awsTag.NormalizeKey("café 🍎"))
+
+	// Replace forbidden characters with underscore.
+	assert.Equal(t, "cafe __", awsTag.NormalizeKey("café 🍎?"))
+}
+
+func TestAwsNormalizeValue(t *testing.T) {
+	assert.Equal(t, "1 a b c", awsTag.NormalizeValue("1 a b c"))
+	assert.Equal(t, "+-=.:/@__", awsTag.NormalizeValue("+-=.:/@?)"))
+	assert.Equal(t, "test", awsTag.NormalizeValue("test"))
+
+	// Remove marks; unicode becomes underscore.
+	assert.Equal(t, "cafe _", awsTag.NormalizeValue("café 🍎"))
+
+	// Replace forbidden characters with underscore.
+	assert.Equal(t, "cafe __", awsTag.NormalizeValue("café 🍎?"))
+}
+
+func TestAwsValidateKey(t *testing.T) {
+	assert.ErrorContains(t, awsTag.ValidateKey(""), "not be empty")
+	assert.ErrorContains(t, awsTag.ValidateKey(strings.Repeat("a", 512)), "length")
+	assert.ErrorContains(t, awsTag.ValidateKey("café 🍎"), "latin")
+	assert.ErrorContains(t, awsTag.ValidateKey("????"), "pattern")
+	assert.NoError(t, awsTag.ValidateKey(strings.Repeat("a", 127)))
+	assert.NoError(t, awsTag.ValidateKey(awsTag.NormalizeKey("café 🍎")))
+}
+
+func TestAwsValidateValue(t *testing.T) {
+	assert.ErrorContains(t, awsTag.ValidateValue(strings.Repeat("a", 512)), "length")
+	assert.ErrorContains(t, awsTag.ValidateValue("café 🍎"), "latin1")
+	assert.ErrorContains(t, awsTag.ValidateValue("????"), "pattern")
+	assert.NoError(t, awsTag.ValidateValue(strings.Repeat("a", 127)))
+	assert.NoError(t, awsTag.ValidateValue(awsTag.NormalizeValue("café 🍎")))
+}
diff --git a/libs/tags/azure.go b/libs/tags/azure.go
new file mode 100644
index 0000000000..e98a5eb2d4
--- /dev/null
+++ b/libs/tags/azure.go
@@ -0,0 +1,25 @@
+package tags
+
+import (
+	"regexp"
+
+	"golang.org/x/text/unicode/rangetable"
+)
+
+// azureForbiddenChars lists all characters that may not be used in Azure tag keys.
+var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/', '+', '?')
+
+var azureTag = &tag{
+	keyLength:  512,
+	keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`),
+	keyNormalize: chain(
+		replaceNotIn(latin1, '_'),
+		replaceIn(azureForbiddenChars, '_'),
+	),
+
+	valueLength:  256,
+	valuePattern: regexp.MustCompile(`^.*$`),
+	valueNormalize: chain(
+		replaceNotIn(latin1, '_'),
+	),
+}
diff --git a/libs/tags/azure_test.go b/libs/tags/azure_test.go
new file mode 100644
index 0000000000..1deb5d6e6f
--- /dev/null
+++ b/libs/tags/azure_test.go
@@ -0,0 +1,34 @@
+package tags
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestAzureNormalizeKey(t *testing.T) {
+	assert.Equal(t, "test", azureTag.NormalizeKey("test"))
+	assert.Equal(t, "café __", azureTag.NormalizeKey("café 🍎?"))
+}
+
+func TestAzureNormalizeValue(t *testing.T) {
+	assert.Equal(t, "test", azureTag.NormalizeValue("test"))
+	assert.Equal(t, "café _?", azureTag.NormalizeValue("café 🍎?"))
+}
+
+func TestAzureValidateKey(t *testing.T) {
+	assert.ErrorContains(t, azureTag.ValidateKey(""), "not be empty")
+	assert.ErrorContains(t, azureTag.ValidateKey(strings.Repeat("a", 513)), "length")
+	assert.ErrorContains(t, azureTag.ValidateKey("café 🍎"), "latin")
+	assert.ErrorContains(t, azureTag.ValidateKey("????"), "pattern")
+	assert.NoError(t, azureTag.ValidateKey(strings.Repeat("a", 127)))
+	assert.NoError(t, azureTag.ValidateKey(azureTag.NormalizeKey("café 🍎")))
+}
+
+func TestAzureValidateValue(t *testing.T) {
+	assert.ErrorContains(t, azureTag.ValidateValue(strings.Repeat("a", 513)), "length")
+	assert.ErrorContains(t, azureTag.ValidateValue("café 🍎"), "latin")
+	assert.NoError(t, azureTag.ValidateValue(strings.Repeat("a", 127)))
+	assert.NoError(t, azureTag.ValidateValue(azureTag.NormalizeValue("café 🍎")))
+}
diff --git a/libs/tags/cloud.go b/libs/tags/cloud.go
new file mode 100644
index 0000000000..f423efa58b
--- /dev/null
+++ b/libs/tags/cloud.go
@@ -0,0 +1,32 @@
+package tags
+
+import "github.com/databricks/databricks-sdk-go/config"
+
+type Cloud interface {
+	// ValidateKey checks if a tag key can be used with the cloud provider.
+	ValidateKey(key string) error
+
+	// ValidateValue checks if a tag value can be used with the cloud provider.
+	ValidateValue(value string) error
+
+	// NormalizeKey normalizes a tag key for the cloud provider.
+	NormalizeKey(key string) string
+
+	// NormalizeValue normalizes a tag value for the cloud provider.
+	NormalizeValue(value string) string
+}
+
+func ForCloud(cfg *config.Config) Cloud {
+	var t *tag
+	switch {
+	case cfg.IsAws():
+		t = awsTag
+	case cfg.IsAzure():
+		t = azureTag
+	case cfg.IsGcp():
+		t = gcpTag
+	default:
+		panic("unknown cloud provider")
+	}
+	return t
+}
diff --git a/libs/tags/cloud_test.go b/libs/tags/cloud_test.go
new file mode 100644
index 0000000000..a1d04d88fe
--- /dev/null
+++ b/libs/tags/cloud_test.go
@@ -0,0 +1,32 @@
+package tags
+
+import (
+	"testing"
+
+	"github.com/databricks/databricks-sdk-go/config"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestForCloudAws(t *testing.T) {
+	c := &config.Config{
+		Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/",
+	}
+
+	assert.Equal(t, awsTag, ForCloud(c))
+}
+
+func TestForCloudAzure(t *testing.T) {
+	c := &config.Config{
+		Host: "https://adb-xxx.y.azuredatabricks.net/",
+	}
+
+	assert.Equal(t, azureTag, ForCloud(c))
+}
+
+func TestForCloudGcp(t *testing.T) {
+	c := &config.Config{
+		Host: "https://123.4.gcp.databricks.com/",
+	}
+
+	assert.Equal(t, gcpTag, ForCloud(c))
+}
diff --git a/libs/tags/gcp.go b/libs/tags/gcp.go
new file mode 100644
index 0000000000..f30ca4cae0
--- /dev/null
+++ b/libs/tags/gcp.go
@@ -0,0 +1,63 @@
+package tags
+
+import (
+	"regexp"
+	"unicode"
+)
+
+// Tag keys and values on GCP are limited to 63 characters and must match the
+// regular expression `^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`.
+// For normalization, we define one table for the outer characters and
+// one table for the inner characters. The outer table is used to trim
+// leading and trailing characters, and the inner table is used to
+// replace invalid characters with underscores.
+
+var gcpOuter = &unicode.RangeTable{
+	R16: []unicode.Range16{
+		// 0-9
+		{0x0030, 0x0039, 1},
+		// A-Z
+		{0x0041, 0x005A, 1},
+		// a-z
+		{0x0061, 0x007A, 1},
+	},
+	LatinOffset: 3,
+}
+
+var gcpInner = &unicode.RangeTable{
+	R16: []unicode.Range16{
+		// Hyphen-minus (dash)
+		{0x002D, 0x002D, 1},
+		// Full stop (period)
+		{0x002E, 0x002E, 1},
+		// 0-9
+		{0x0030, 0x0039, 1},
+		// A-Z
+		{0x0041, 0x005A, 1},
+		// Low line (underscore)
+		{0x005F, 0x005F, 1},
+		// a-z
+		{0x0061, 0x007A, 1},
+	},
+	LatinOffset: 6,
+}
+
+var gcpTag = &tag{
+	keyLength:  63,
+	keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`),
+	keyNormalize: chain(
+		normalizeMarks(),
+		replaceNotIn(latin1, '_'),
+		replaceNotIn(gcpInner, '_'),
+		trimIfNotIn(gcpOuter),
+	),
+
+	valueLength:  63,
+	valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`),
+	valueNormalize: chain(
+		normalizeMarks(),
+		replaceNotIn(latin1, '_'),
+		replaceNotIn(gcpInner, '_'),
+		trimIfNotIn(gcpOuter),
+	),
+}
diff --git a/libs/tags/gcp_test.go b/libs/tags/gcp_test.go
new file mode 100644
index 0000000000..89f4fd8e6b
--- /dev/null
+++ b/libs/tags/gcp_test.go
@@ -0,0 +1,65 @@
+package tags
+
+import (
+	"strings"
+	"testing"
+	"unicode"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestGcpOuter(t *testing.T) {
+	assert.True(t, unicode.In('A', gcpOuter))
+	assert.True(t, unicode.In('Z', gcpOuter))
+	assert.True(t, unicode.In('a', gcpOuter))
+	assert.True(t, unicode.In('z', gcpOuter))
+	assert.True(t, unicode.In('0', gcpOuter))
+	assert.True(t, unicode.In('9', gcpOuter))
+	assert.False(t, unicode.In('-', gcpOuter))
+	assert.False(t, unicode.In('.', gcpOuter))
+	assert.False(t, unicode.In('_', gcpOuter))
+	assert.False(t, unicode.In('!', gcpOuter))
+}
+
+func TestGcpInner(t *testing.T) {
+	assert.True(t, unicode.In('A', gcpInner))
+	assert.True(t, unicode.In('Z', gcpInner))
+	assert.True(t, unicode.In('a', gcpInner))
+	assert.True(t, unicode.In('z', gcpInner))
+	assert.True(t, unicode.In('0', gcpInner))
+	assert.True(t, unicode.In('9', gcpInner))
+	assert.True(t, unicode.In('-', gcpInner))
+	assert.True(t, unicode.In('.', gcpInner))
+	assert.True(t, unicode.In('_', gcpInner))
+	assert.False(t, unicode.In('!', gcpInner))
+}
+
+func TestGcpNormalizeKey(t *testing.T) {
+	assert.Equal(t, "test", gcpTag.NormalizeKey("test"))
+	assert.Equal(t, "cafe", gcpTag.NormalizeKey("café 🍎?"))
+	assert.Equal(t, "cafe_foo", gcpTag.NormalizeKey("__café_foo__"))
+
+}
+
+func TestGcpNormalizeValue(t *testing.T) {
+	assert.Equal(t, "test", gcpTag.NormalizeValue("test"))
+	assert.Equal(t, "cafe", gcpTag.NormalizeValue("café 🍎?"))
+	assert.Equal(t, "cafe_foo", gcpTag.NormalizeValue("__café_foo__"))
+}
+
+func TestGcpValidateKey(t *testing.T) {
+	assert.ErrorContains(t, gcpTag.ValidateKey(""), "not be empty")
+	assert.ErrorContains(t, gcpTag.ValidateKey(strings.Repeat("a", 64)), "length")
+	assert.ErrorContains(t, gcpTag.ValidateKey("café 🍎"), "latin")
+	assert.ErrorContains(t, gcpTag.ValidateKey("????"), "pattern")
+	assert.NoError(t, gcpTag.ValidateKey(strings.Repeat("a", 32)))
+	assert.NoError(t, gcpTag.ValidateKey(gcpTag.NormalizeKey("café 🍎")))
+}
+
+func TestGcpValidateValue(t *testing.T) {
+	assert.ErrorContains(t, gcpTag.ValidateValue(strings.Repeat("a", 64)), "length")
+	assert.ErrorContains(t, gcpTag.ValidateValue("café 🍎"), "latin")
+	assert.ErrorContains(t, gcpTag.ValidateValue("????"), "pattern")
+	assert.NoError(t, gcpTag.ValidateValue(strings.Repeat("a", 32)))
+	assert.NoError(t, gcpTag.ValidateValue(gcpTag.NormalizeValue("café 🍎")))
+}
diff --git a/libs/tags/latin.go b/libs/tags/latin.go
new file mode 100644
index 0000000000..df9ad403e7
--- /dev/null
+++ b/libs/tags/latin.go
@@ -0,0 +1,11 @@
+package tags
+
+import "unicode"
+
+// latin1 is the range table covering every character in the Latin-1 set (U+0000 to U+00FF).
+var latin1 = &unicode.RangeTable{
+	R16: []unicode.Range16{
+		{0x0000, 0x00ff, 1},
+	},
+	LatinOffset: 1,
+}
diff --git a/libs/tags/latin_test.go b/libs/tags/latin_test.go
new file mode 100644
index 0000000000..c3234a4435
--- /dev/null
+++ b/libs/tags/latin_test.go
@@ -0,0 +1,16 @@
+package tags
+
+import (
+	"testing"
+	"unicode"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestLatinTable(t *testing.T) {
+	assert.True(t, unicode.In('\u0000', latin1))
+	assert.True(t, unicode.In('A', latin1))
+	assert.True(t, unicode.In('Z', latin1))
+	assert.True(t, unicode.In('\u00ff', latin1))
+	assert.False(t, unicode.In('\u0100', latin1))
+}
diff --git a/libs/tags/tag.go b/libs/tags/tag.go
new file mode 100644
index 0000000000..4e9b329ca2
--- /dev/null
+++ b/libs/tags/tag.go
@@ -0,0 +1,57 @@
+package tags
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+	"unicode"
+)
+
+// The tag type holds the validation and normalization rules for
+// a cloud provider's resource tags as applied by Databricks.
+type tag struct {
+	keyLength    int
+	keyPattern   *regexp.Regexp
+	keyNormalize transformer
+
+	valueLength    int
+	valuePattern   *regexp.Regexp
+	valueNormalize transformer
+}
+
+func (t *tag) ValidateKey(s string) error {
+	if len(s) == 0 {
+		return fmt.Errorf("key must not be empty")
+	}
+	if len(s) > t.keyLength {
+		return fmt.Errorf("key length %d exceeds maximum of %d", len(s), t.keyLength)
+	}
+	if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) {
+		return fmt.Errorf("key contains non-latin1 characters")
+	}
+	if !t.keyPattern.MatchString(s) {
+		return fmt.Errorf("key %q does not match pattern %q", s, t.keyPattern)
+	}
+	return nil
+}
+
+func (t *tag) ValidateValue(s string) error {
+	if len(s) > t.valueLength {
+		return fmt.Errorf("value length %d exceeds maximum of %d", len(s), t.valueLength)
+	}
+	if strings.ContainsFunc(s, func(r rune) bool { return !unicode.Is(latin1, r) }) {
+		return fmt.Errorf("value contains non-latin1 characters")
+	}
+	if !t.valuePattern.MatchString(s) {
+		return fmt.Errorf("value %q does not match pattern %q", s, t.valuePattern)
+	}
+	return nil
+}
+
+func (t *tag) NormalizeKey(s string) string {
+	return t.keyNormalize.transform(s)
+}
+
+func (t *tag) NormalizeValue(s string) string {
+	return t.valueNormalize.transform(s)
+}
diff --git a/libs/tags/transform.go b/libs/tags/transform.go
new file mode 100644
index 0000000000..71d01b3563
--- /dev/null
+++ b/libs/tags/transform.go
@@ -0,0 +1,87 @@
+package tags
+
+import (
+	"strings"
+	"unicode"
+
+	"golang.org/x/text/runes"
+	"golang.org/x/text/transform"
+	"golang.org/x/text/unicode/norm"
+)
+
+type transformer interface {
+	transform(string) string
+}
+
+type chainTransformer []transformer
+
+func (c chainTransformer) transform(s string) string {
+	for _, t := range c {
+		s = t.transform(s)
+	}
+	return s
+}
+
+func chain(t ...transformer) transformer {
+	return chainTransformer(t)
+}
+
+// textTransformer adapts a [transform.Transformer] to the [transformer] interface.
+type textTransformer struct {
+	transform.Transformer
+}
+
+func (t textTransformer) transform(s string) string {
+	s, _, _ = transform.String(t, s)
+	return s
+}
+
+func normalizeMarks() transformer {
+	// Decompose unicode characters, then remove all non-spacing marks, then recompose.
+	// This turns 'é' into 'e' and 'ü' into 'u'.
+	return textTransformer{
+		transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC),
+	}
+}
+
+// replaceTransformer replaces every rune contained in set with replacement.
+type replaceTransformer struct {
+	set         runes.Set
+	replacement rune
+}
+
+func (t replaceTransformer) transform(s string) string {
+	return strings.Map(func(r rune) rune {
+		if t.set.Contains(r) {
+			return t.replacement
+		}
+		return r
+	}, s)
+}
+
+func replaceIn(table *unicode.RangeTable, replacement rune) transformer {
+	return replaceTransformer{runes.In(table), replacement}
+}
+
+func replaceNotIn(table *unicode.RangeTable, replacement rune) transformer {
+	return replaceTransformer{runes.NotIn(table), replacement}
+}
+
+// trimTransformer trims all leading and trailing runes contained in set.
+type trimTransformer struct {
+	set runes.Set
+}
+
+func (t trimTransformer) transform(s string) string {
+	return strings.TrimFunc(s, func(r rune) bool {
+		return t.set.Contains(r)
+	})
+}
+
+func trimIfIn(table *unicode.RangeTable) transformer {
+	return trimTransformer{runes.In(table)}
+}
+
+func trimIfNotIn(table *unicode.RangeTable) transformer {
+	return trimTransformer{runes.NotIn(table)}
+}
diff --git a/libs/tags/transform_test.go b/libs/tags/transform_test.go
new file mode 100644
index 0000000000..6481b6d9bc
--- /dev/null
+++ b/libs/tags/transform_test.go
@@ -0,0 +1,25 @@
+package tags
+
+import (
+	"testing"
+	"unicode"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNormalizeMarks(t *testing.T) {
+	x := normalizeMarks()
+	assert.Equal(t, "cafe", x.transform("café"))
+	assert.Equal(t, "cafe 🍎", x.transform("café 🍎"))
+	assert.Equal(t, "Foo Bar", x.transform("Foo Bar"))
+}
+
+func TestReplace(t *testing.T) {
+	assert.Equal(t, "___abc___", replaceIn(unicode.Digit, '_').transform("000abc999"))
+	assert.Equal(t, "___000___", replaceNotIn(unicode.Digit, '_').transform("abc000abc"))
+}
+
+func TestTrim(t *testing.T) {
+	assert.Equal(t, "abc", trimIfIn(unicode.Digit).transform("000abc999"))
+	assert.Equal(t, "000", trimIfNotIn(unicode.Digit).transform("abc000abc"))
+}