Skip to content

Commit

Permalink
Make utf8 validity check function self-contained.
Browse files Browse the repository at this point in the history
  • Loading branch information
kellegous committed Feb 5, 2016
1 parent 1253e60 commit 157c425
Showing 1 changed file with 21 additions and 13 deletions.
34 changes: 21 additions & 13 deletions index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -242,32 +242,40 @@ func isTextFile(filename string) (bool, error) {

buf = buf[:n]

res, validSize := validUTF8(buf)
// if remaining non valid less than utf8.UTFMax probably we are read part of rune just assume it's valid
if n == filePeekSize && n-validSize < utf8.UTFMax {
res = true
if n < filePeekSize {
// read the whole file, must be valid.
return utf8.Valid(buf), nil
}
return res, nil

// read a prefix, allow trailing partial runes.
return validUTF8IgnoringPartialTrailingRune(buf), nil

}

// copy paste from go source https://golang.org/src/unicode/utf8/utf8.go?s=9677:9702#L387, with valid bytes size
func validUTF8(p []byte) (bool, int) {
// Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed
// to be a prefix of a larger buffer so if the buffer ends with the start of a rune, it
// is still considered valid.
//
// Basic logic copied from https://golang.org/pkg/unicode/utf8/#Valid
func validUTF8IgnoringPartialTrailingRune(p []byte) bool {
i := 0
for i < len(p) {
n := len(p)

for i < n {
if p[i] < utf8.RuneSelf {
i++
} else {
_, size := utf8.DecodeRune(p[i:])
if size == 1 {
// All valid runes of size 1 (those
// below RuneSelf) were handled above.
// This must be a RuneError.
return false, i
// All valid runes of size 1 (those below RuneSelf) were handled above. This must be a RuneError.
// If we're encountering this error within UTFMax of the end and the current byte could be a
// valid start, we'll just ignore the assumed partial rune.
return n-i < utf8.UTFMax && utf8.RuneStart(p[i])
}
i += size
}
}
return true, i
return true
}

func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) {
Expand Down

0 comments on commit 157c425

Please sign in to comment.