Skip to content

Commit

Permalink
Library to validate and normalize cloud specific tags (#819)
Browse files Browse the repository at this point in the history
## Changes

Prompted by the proposed fix for a tagging-related problem in #810, I
investigated how tag validation works. This turned out to be quite a bit
more complex than anticipated. Tags at the job level (or cluster level)
are passed through to the underlying compute infrastructure and as such
are tested against cloud-specific validation rules. GCP appears to be
the most restrictive. It would be disappointing to always restrict to
`\w+`, so this package implements validation and normalization rules for
each cloud. It can pick the right cloud to use based on a Go SDK
configuration.

## Tests

Exhaustive unit tests. The regular expressions were pulled from #814.
  • Loading branch information
pietern authored and hectorcast-db committed Oct 13, 2023
1 parent f7170dd commit 52d3a5e
Show file tree
Hide file tree
Showing 13 changed files with 532 additions and 0 deletions.
36 changes: 36 additions & 0 deletions libs/tags/aws.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package tags

import (
"regexp"
"unicode"

"golang.org/x/text/unicode/rangetable"
)

// The union of all characters allowed in AWS tags.
// This must be used only after filtering out non-Latin1 characters,
// because the [unicode] classes include non-Latin1 characters.
// It is consumed by replaceNotIn in awsTag below: any rune outside
// this table is rewritten to '_'.
var awsChars = rangetable.Merge(
	unicode.Digit,
	unicode.Space,
	unicode.Letter,
	rangetable.New('+', '-', '=', '.', ':', '/', '@'),
)

// awsTag implements the AWS tag validation and normalization rules.
// Keys are limited to 127 characters and values to 255 characters;
// both may contain letters, digits, spaces, and + - = . : / @,
// restricted to the Latin-1 range.
//
// Fix: keyPattern previously escaped the slash (`\/`) while valuePattern
// did not; the two patterns describe the same character class, so they
// are now written identically (the escape was redundant in Go regexp).
var awsTag = &tag{
	keyLength:  127,
	keyPattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
	keyNormalize: chain(
		// Strip combining marks first (e.g. "café" -> "cafe", per the
		// unit tests), force Latin-1, then replace anything outside
		// the allowed AWS set with '_'.
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(awsChars, '_'),
	),

	valueLength:  255,
	valuePattern: regexp.MustCompile(`^[\d \w\+\-=\.:/@]*$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(awsChars, '_'),
	),
}
49 changes: 49 additions & 0 deletions libs/tags/aws_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package tags

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

func TestAwsNormalizeKey(t *testing.T) {
	// Each case maps a raw key to its expected normalized form.
	for _, tc := range []struct{ in, want string }{
		{"1 a b c", "1 a b c"},
		{"+-=.:/@?)", "+-=.:/@__"},
		{"test", "test"},
		// Marks are stripped; the non-Latin1 rune becomes an underscore.
		{"café 🍎", "cafe _"},
		// Forbidden characters are replaced with underscores.
		{"café 🍎?", "cafe __"},
	} {
		assert.Equal(t, tc.want, awsTag.NormalizeKey(tc.in))
	}
}

func TestAwsNormalizeValue(t *testing.T) {
	// Values follow the same normalization chain as keys.
	for _, tc := range []struct{ in, want string }{
		{"1 a b c", "1 a b c"},
		{"+-=.:/@?)", "+-=.:/@__"},
		{"test", "test"},
		// Marks are stripped; the non-Latin1 rune becomes an underscore.
		{"café 🍎", "cafe _"},
		// Forbidden characters are replaced with underscores.
		{"café 🍎?", "cafe __"},
	} {
		assert.Equal(t, tc.want, awsTag.NormalizeValue(tc.in))
	}
}

func TestAwsValidateKey(t *testing.T) {
	// Invalid keys, each paired with a fragment of the expected error.
	for _, tc := range []struct{ in, errFragment string }{
		{"", "not be empty"},
		{strings.Repeat("a", 512), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, awsTag.ValidateKey(tc.in), tc.errFragment)
	}

	// A maximum-length key and a normalized unicode key are accepted.
	assert.NoError(t, awsTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateKey(awsTag.NormalizeKey("café 🍎")))
}

func TestAwsValidateValue(t *testing.T) {
	// Invalid values, each paired with a fragment of the expected error.
	// Note: unlike keys, an empty value is not rejected.
	for _, tc := range []struct{ in, errFragment string }{
		{strings.Repeat("a", 512), "length"},
		{"café 🍎", "latin1"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, awsTag.ValidateValue(tc.in), tc.errFragment)
	}

	// A long-but-legal value and a normalized unicode value are accepted.
	assert.NoError(t, awsTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, awsTag.ValidateValue(awsTag.NormalizeValue("café 🍎")))
}
25 changes: 25 additions & 0 deletions libs/tags/azure.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package tags

import (
"regexp"

"golang.org/x/text/unicode/rangetable"
)

// All characters that may not be used in Azure tag keys.
// This mirrors the negated character class in azureTag's keyPattern;
// replaceIn uses it to rewrite these characters to '_'.
var azureForbiddenChars = rangetable.New('<', '>', '*', '&', '%', ';', '\\', '/', '+', '?')

// azureTag implements the Azure tag validation and normalization rules.
// Keys are more restricted than values: keys may not contain any of the
// characters in azureForbiddenChars, while values only need to fit the
// length limit and the Latin-1 range.
var azureTag = &tag{
	keyLength:  512,
	keyPattern: regexp.MustCompile(`^[^<>\*&%;\\\/\+\?]*$`),
	keyNormalize: chain(
		// Map non-Latin1 runes to '_' first, then replace the
		// Azure-forbidden characters with '_' as well.
		replaceNotIn(latin1, '_'),
		replaceIn(azureForbiddenChars, '_'),
	),

	valueLength: 256,
	// NOTE(review): `.` in Go regexp does not match '\n', so a Latin-1
	// value containing a newline passes valueNormalize untouched but
	// fails this pattern — confirm whether that is intended.
	valuePattern: regexp.MustCompile(`^.*$`),
	valueNormalize: chain(
		replaceNotIn(latin1, '_'),
	),
}
34 changes: 34 additions & 0 deletions libs/tags/azure_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package tags

import (
"strings"
"testing"

"github.com/stretchr/testify/assert"
)

func TestAzureNormalizeKey(t *testing.T) {
	// Accents survive (no mark stripping on Azure); '🍎' and '?' become '_'.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "café __"},
	} {
		assert.Equal(t, tc.want, azureTag.NormalizeKey(tc.in))
	}
}

func TestAzureNormalizeValue(t *testing.T) {
	// Values only replace non-Latin1 runes; '?' is legal in a value.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "café _?"},
	} {
		assert.Equal(t, tc.want, azureTag.NormalizeValue(tc.in))
	}
}

func TestAzureValidateKey(t *testing.T) {
	// Invalid keys, each paired with a fragment of the expected error.
	for _, tc := range []struct{ in, errFragment string }{
		{"", "not be empty"},
		{strings.Repeat("a", 513), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, azureTag.ValidateKey(tc.in), tc.errFragment)
	}

	// A legal-length key and a normalized unicode key are accepted.
	assert.NoError(t, azureTag.ValidateKey(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateKey(azureTag.NormalizeKey("café 🍎")))
}

func TestAzureValidateValue(t *testing.T) {
	// Values are only checked for length and the Latin-1 range.
	for _, tc := range []struct{ in, errFragment string }{
		{strings.Repeat("a", 513), "length"},
		{"café 🍎", "latin"},
	} {
		assert.ErrorContains(t, azureTag.ValidateValue(tc.in), tc.errFragment)
	}

	assert.NoError(t, azureTag.ValidateValue(strings.Repeat("a", 127)))
	assert.NoError(t, azureTag.ValidateValue(azureTag.NormalizeValue("café 🍎")))
}
32 changes: 32 additions & 0 deletions libs/tags/cloud.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package tags

import "github.com/databricks/databricks-sdk-go/config"

// Cloud abstracts the tag key/value rules of a single cloud provider.
// Each provider imposes its own character set and length limits; the
// Validate methods report violations and the Normalize methods rewrite
// input toward a form that passes validation.
type Cloud interface {
	// ValidateKey checks if a tag key can be used with the cloud provider.
	ValidateKey(key string) error

	// ValidateValue checks if a tag value can be used with the cloud provider.
	ValidateValue(value string) error

	// NormalizeKey normalizes a tag key for the cloud provider.
	NormalizeKey(key string) string

	// NormalizeValue normalizes a tag value for the cloud provider.
	NormalizeValue(value string) string
}

// ForCloud returns the tag validation and normalization rules for the
// cloud provider the given SDK configuration points at. It panics if
// the provider cannot be determined from the configuration.
func ForCloud(cfg *config.Config) Cloud {
	switch {
	case cfg.IsAws():
		return awsTag
	case cfg.IsAzure():
		return azureTag
	case cfg.IsGcp():
		return gcpTag
	}
	panic("unknown cloud provider")
}
32 changes: 32 additions & 0 deletions libs/tags/cloud_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package tags

import (
"testing"

"github.com/databricks/databricks-sdk-go/config"
"github.com/stretchr/testify/assert"
)

func TestForCloudAws(t *testing.T) {
	// An AWS workspace host should select the AWS rule set.
	cfg := &config.Config{Host: "https://dbc-XXXXXXXX-YYYY.cloud.databricks.com/"}
	assert.Equal(t, awsTag, ForCloud(cfg))
}

func TestForCloudAzure(t *testing.T) {
	// An Azure workspace host should select the Azure rule set.
	cfg := &config.Config{Host: "https://adb-xxx.y.azuredatabricks.net/"}
	assert.Equal(t, azureTag, ForCloud(cfg))
}

func TestForCloudGcp(t *testing.T) {
	// A GCP workspace host should select the GCP rule set.
	cfg := &config.Config{Host: "https://123.4.gcp.databricks.com/"}
	assert.Equal(t, gcpTag, ForCloud(cfg))
}
63 changes: 63 additions & 0 deletions libs/tags/gcp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package tags

import (
"regexp"
"unicode"
)

// Tag keys and values on GCP are limited to 63 characters and must match the
// regular expression `^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`.
// For normalization, we define one table for the outer characters and
// one table for the inner characters. The outer table is used to trim
// leading and trailing characters, and the inner table is used to
// replace invalid characters with underscores.

// gcpOuter matches the characters allowed at the first and last position
// of a GCP label: ASCII digits and letters only.
var gcpOuter = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '0', Hi: '9', Stride: 1},
		{Lo: 'A', Hi: 'Z', Stride: 1},
		{Lo: 'a', Hi: 'z', Stride: 1},
	},
	// All three ranges fall inside Latin-1.
	LatinOffset: 3,
}

// gcpInner matches the characters allowed in the interior of a GCP label:
// ASCII digits and letters plus dash, period, and underscore.
var gcpInner = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: '-', Hi: '-', Stride: 1},
		{Lo: '.', Hi: '.', Stride: 1},
		{Lo: '0', Hi: '9', Stride: 1},
		{Lo: 'A', Hi: 'Z', Stride: 1},
		{Lo: '_', Hi: '_', Stride: 1},
		{Lo: 'a', Hi: 'z', Stride: 1},
	},
	// All six ranges fall inside Latin-1.
	LatinOffset: 6,
}

// gcpTag implements the GCP label validation and normalization rules
// described in the comment at the top of this file.
var gcpTag = &tag{
	keyLength:  63,
	keyPattern: regexp.MustCompile(`^([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$`),
	keyNormalize: chain(
		// Strip marks, force Latin-1, map anything outside the inner
		// character set to '_', and finally trim leading/trailing
		// characters that may not appear at the edges.
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),

	valueLength: 63,
	// Same shape as keyPattern, but the whole match is optional:
	// an empty value is allowed while an empty key is not.
	valuePattern: regexp.MustCompile(`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`),
	valueNormalize: chain(
		normalizeMarks(),
		replaceNotIn(latin1, '_'),
		replaceNotIn(gcpInner, '_'),
		trimIfNotIn(gcpOuter),
	),
}
65 changes: 65 additions & 0 deletions libs/tags/gcp_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package tags

import (
"strings"
"testing"
"unicode"

"github.com/stretchr/testify/assert"
)

func TestGcpOuter(t *testing.T) {
	// Only ASCII alphanumerics may start or end a GCP label.
	for _, r := range []rune{'A', 'Z', 'a', 'z', '0', '9'} {
		assert.True(t, unicode.In(r, gcpOuter))
	}
	// Punctuation is never allowed at the edges.
	for _, r := range []rune{'-', '.', '_', '!'} {
		assert.False(t, unicode.In(r, gcpOuter))
	}
}

func TestGcpInner(t *testing.T) {
	// Alphanumerics plus dash, period, and underscore are legal inside.
	for _, r := range []rune{'A', 'Z', 'a', 'z', '0', '9', '-', '.', '_'} {
		assert.True(t, unicode.In(r, gcpInner))
	}
	// Other punctuation is not.
	assert.False(t, unicode.In('!', gcpInner))
}

func TestGcpNormalizeKey(t *testing.T) {
	// Marks are stripped, forbidden runes replaced with '_', and the
	// result trimmed so it begins and ends with an alphanumeric rune.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "cafe"},
		{"__café_foo__", "cafe_foo"},
	} {
		assert.Equal(t, tc.want, gcpTag.NormalizeKey(tc.in))
	}
}

func TestGcpNormalizeValue(t *testing.T) {
	// Values are normalized by the same chain as keys.
	for _, tc := range []struct{ in, want string }{
		{"test", "test"},
		{"café 🍎?", "cafe"},
		{"__café_foo__", "cafe_foo"},
	} {
		assert.Equal(t, tc.want, gcpTag.NormalizeValue(tc.in))
	}
}

func TestGcpValidateKey(t *testing.T) {
	// Invalid keys, each paired with a fragment of the expected error.
	for _, tc := range []struct{ in, errFragment string }{
		{"", "not be empty"},
		{strings.Repeat("a", 64), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, gcpTag.ValidateKey(tc.in), tc.errFragment)
	}

	// A legal-length key and a normalized unicode key are accepted.
	assert.NoError(t, gcpTag.ValidateKey(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateKey(gcpTag.NormalizeKey("café 🍎")))
}

func TestGcpValidateValue(t *testing.T) {
	// Invalid values; unlike keys, an empty value is not rejected.
	for _, tc := range []struct{ in, errFragment string }{
		{strings.Repeat("a", 64), "length"},
		{"café 🍎", "latin"},
		{"????", "pattern"},
	} {
		assert.ErrorContains(t, gcpTag.ValidateValue(tc.in), tc.errFragment)
	}

	assert.NoError(t, gcpTag.ValidateValue(strings.Repeat("a", 32)))
	assert.NoError(t, gcpTag.ValidateValue(gcpTag.NormalizeValue("café 🍎")))
}
11 changes: 11 additions & 0 deletions libs/tags/latin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package tags

import "unicode"

// latin1 matches every code point in the Latin-1 range
// (U+0000 through U+00FF), i.e. everything that fits in one byte.
var latin1 = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x0000, Hi: 0x00ff, Stride: 1},
	},
	LatinOffset: 1,
}
16 changes: 16 additions & 0 deletions libs/tags/latin_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package tags

import (
"testing"
"unicode"

"github.com/stretchr/testify/assert"
)

func TestLatinTable(t *testing.T) {
	// Boundary and interior code points of U+0000..U+00FF are members.
	for _, r := range []rune{'\u0000', 'A', 'Z', '\u00ff'} {
		assert.True(t, unicode.In(r, latin1))
	}
	// The first code point past Latin-1 is not.
	assert.False(t, unicode.In('\u0100', latin1))
}
Loading

0 comments on commit 52d3a5e

Please sign in to comment.