diff --git a/index/index.go b/index/index.go index abe7ae39..8b773690 100644 --- a/index/index.go +++ b/index/index.go @@ -242,32 +242,40 @@ func isTextFile(filename string) (bool, error) { buf = buf[:n] - res, validSize := validUTF8(buf) - // if remaining non valid less than utf8.UTFMax probably we are read part of rune just assume it's valid - if n == filePeekSize && n-validSize < utf8.UTFMax { - res = true + if n < filePeekSize { + // read the whole file, must be valid. + return utf8.Valid(buf), nil } - return res, nil + + // read a prefix, allow trailing partial runes. + return validUTF8IgnoringPartialTrailingRune(buf), nil + } -// copy paste from go source https://golang.org/src/unicode/utf8/utf8.go?s=9677:9702#L387, with valid bytes size -func validUTF8(p []byte) (bool, int) { +// Determines if the buffer contains valid UTF8 encoded string data. The buffer is assumed +// to be a prefix of a larger buffer so if the buffer ends with the start of a rune, it +// is still considered valid. +// +// Basic logic copied from https://golang.org/pkg/unicode/utf8/#Valid +func validUTF8IgnoringPartialTrailingRune(p []byte) bool { i := 0 - for i < len(p) { + n := len(p) + + for i < n { if p[i] < utf8.RuneSelf { i++ } else { _, size := utf8.DecodeRune(p[i:]) if size == 1 { - // All valid runes of size 1 (those - // below RuneSelf) were handled above. - // This must be a RuneError. - return false, i + // All valid runes of size 1 (those below RuneSelf) were handled above. This must be a RuneError. + // If we're encountering this error within UTFMax of the end and the current byte could be a + // valid start, we'll just ignore the assumed partial rune. + return n-i < utf8.UTFMax && utf8.RuneStart(p[i]) } i += size } } - return true, i + return true } func addFileToIndex(ix *index.IndexWriter, dst, src, path string) (string, error) {