Skip to content

Commit

Permalink
Add Tabular Diff for CSV files (#14661)
Browse files Browse the repository at this point in the history
Implements request #14320. The rendering of CSV files now matches the diff style.

* Moved CSV logic into base package.

* Added method to create a tabular diff.

* Added CSV compare context.

* Added CSV diff template.

* Use new table style in CSV markup.

* Added file size limit for CSV rendering.

* Display CSV parser errors in diff.

* Lazy read single file.

* Lazy read rows for full diff.

* Added unit tests for various CSV changes.
  • Loading branch information
KN4CK3R authored Mar 29, 2021
1 parent d3b8127 commit 0c61376
Show file tree
Hide file tree
Showing 20 changed files with 937 additions and 118 deletions.
4 changes: 4 additions & 0 deletions custom/conf/app.example.ini
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,10 @@ EVENT_SOURCE_UPDATE_TIME = 10s
; Whether to render SVG files as images. If SVG rendering is disabled, SVG files are displayed as text and cannot be embedded in markdown files as images.
ENABLE_RENDER = true

[ui.csv]
; Maximum allowed file size in bytes to render CSV files as a table. (Set to 0 for no limit).
MAX_FILE_SIZE = 524288

[markdown]
; Render soft line breaks as hard line breaks, which means a single newline character between
; paragraphs will cause a line break and adding trailing whitespace to paragraphs is not
Expand Down
4 changes: 4 additions & 0 deletions docs/content/doc/advanced/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,10 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.

- `ENABLE_RENDER`: **true**: Whether to render SVG files as images. If SVG rendering is disabled, SVG files are displayed as text and cannot be embedded in markdown files as images.

### UI - CSV Files (`ui.csv`)

- `MAX_FILE_SIZE`: **524288** (512 KiB): Maximum allowed file size in bytes to render CSV files as a table. (Set to 0 for no limit).

## Markdown (`markdown`)

- `ENABLE_HARD_LINE_BREAK_IN_COMMENTS`: **true**: Render soft line breaks as hard line breaks in comments, which
Expand Down
93 changes: 93 additions & 0 deletions modules/csv/csv.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package csv

import (
"bytes"
"encoding/csv"
"errors"
"regexp"
"strings"

"code.gitea.io/gitea/modules/translation"
"code.gitea.io/gitea/modules/util"
)

var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)

// CreateReader returns a csv.Reader over rawBytes that splits fields on the
// given delimiter and ignores leading white space in each field.
func CreateReader(rawBytes []byte, delimiter rune) *csv.Reader {
	reader := csv.NewReader(bytes.NewReader(rawBytes))
	reader.Comma, reader.TrimLeadingSpace = delimiter, true
	return reader
}

// CreateReaderAndGuessDelimiter tries to guess the field delimiter from the content and creates a csv.Reader.
func CreateReaderAndGuessDelimiter(rawBytes []byte) *csv.Reader {
delimiter := guessDelimiter(rawBytes)
return CreateReader(rawBytes, delimiter)
}

// guessDelimiter returns the candidate delimiter that scores best against a
// sample of the CSV data. The sample is limited to the first 10k bytes and the
// first 10 lines. If no candidate scores above zero, the comma is returned.
func guessDelimiter(data []byte) rune {
	const maxLines = 10

	// Only a bounded prefix of the input is examined.
	sample := string(data[:util.Min(len(data), 1e4)])
	// Remove quoted sections so that delimiters inside them are not counted.
	sample = quoteRegexp.ReplaceAllLiteralString(sample, "")

	lines := strings.SplitN(sample, "\n", maxLines+1)
	if len(lines) > maxLines {
		lines = lines[:maxLines]
	}

	candidates := []rune{',', ';', '\t', '|', '@'}
	winner, winnerScore := candidates[0], 0.0
	for _, candidate := range candidates {
		if score := scoreDelimiter(lines, candidate); score > winnerScore {
			winner, winnerScore = candidate, score
		}
	}

	return winner
}

// scoreDelimiter rates how well delim fits the given lines of CSV text.
//
// The score is the total number of occurrences of delim across all lines,
// scaled down by the fraction of lines whose occurrence count differs from the
// highest count seen so far. Empty lines are neither counted nor compared, but
// they are still part of the line total used for the scaling. A higher score
// means a better match; the result is 0 when lines is empty or delim never
// occurs.
func scoreDelimiter(lines []string, delim rune) float64 {
	// Without this guard the regularity term below would be 0/0 and the
	// function would return NaN instead of a comparable score.
	if len(lines) == 0 {
		return 0
	}

	sep := string(delim)
	countTotal := 0
	countLineMax := 0
	linesNotEqual := 0

	for _, line := range lines {
		if len(line) == 0 {
			continue
		}

		countLine := strings.Count(line, sep)
		countTotal += countLine
		if countLine == countLineMax {
			continue
		}

		// The first line that establishes a non-zero maximum is not an
		// irregularity; every later deviation is.
		if countLineMax != 0 {
			linesNotEqual++
		}
		if countLine > countLineMax {
			countLineMax = countLine
		}
	}

	return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
}

// FormatError turns a CSV parse error into a localized, human readable message.
//
// If err is (or wraps) a *csv.ParseError, the message is looked up through
// locale and a nil error is returned. Any other err is handed back unchanged
// together with an empty message.
func FormatError(err error, locale translation.Locale) (string, error) {
	var perr *csv.ParseError
	if !errors.As(err, &perr) {
		return "", err
	}

	// errors.Is rather than == so that a wrapped ErrFieldCount is still recognized.
	if errors.Is(perr.Err, csv.ErrFieldCount) {
		return locale.Tr("repo.error.csv.invalid_field_count", perr.Line), nil
	}
	return locale.Tr("repo.error.csv.unexpected", perr.Line, perr.Column), nil
}
40 changes: 40 additions & 0 deletions modules/csv/csv_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package csv

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestCreateReader checks that the requested delimiter ends up on the reader.
func TestCreateReader(t *testing.T) {
	reader := CreateReader([]byte{}, ',')

	assert.Equal(t, ',', reader.Comma)
}

// TestCreateReaderAndGuessDelimiter checks that a semicolon separated document
// yields a reader configured with ';' as its delimiter.
func TestCreateReaderAndGuessDelimiter(t *testing.T) {
	const content = "a;b;c\n1;2;3\n4;5;6"

	reader := CreateReaderAndGuessDelimiter([]byte(content))

	assert.Equal(t, ';', reader.Comma)
}

// TestGuessDelimiter feeds small CSV snippets to guessDelimiter and checks the
// delimiter chosen for each of them.
func TestGuessDelimiter(t *testing.T) {
	// Maps the CSV input to the delimiter that is expected to be guessed.
	var kases = map[string]rune{
		"a":                         ',',
		"1,2":                       ',',
		"1;2":                       ';',
		"1\t2":                      '\t',
		"1|2":                       '|',
		"1,2,3;4,5,6;7,8,9\na;b;c":  ';',
		"\"1,2,3,4\";\"a\nb\"\nc;d": ';',
		"<br/>":                     ',',
	}

	for input, expected := range kases {
		// testify expects (expected, actual); the input is added to the
		// failure message so the failing case can be identified.
		assert.EqualValues(t, expected, guessDelimiter([]byte(input)), "input: %q", input)
	}
}
103 changes: 40 additions & 63 deletions modules/markup/csv/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,20 @@ package markup

import (
"bytes"
"encoding/csv"
"html"
"io"
"regexp"
"strings"
"strconv"

"code.gitea.io/gitea/modules/csv"
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/modules/setting"
)

var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)

func init() {
markup.RegisterParser(Parser{})

}

// Parser implements markup.Parser for orgmode
// Parser implements markup.Parser for csv files
type Parser struct {
}

Expand All @@ -38,11 +34,35 @@ func (Parser) Extensions() []string {
}

// Render implements markup.Parser
func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
rd := csv.NewReader(bytes.NewReader(rawBytes))
rd.Comma = p.bestDelimiter(rawBytes)
func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
var tmpBlock bytes.Buffer
tmpBlock.WriteString(`<table class="table">`)

if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) {
tmpBlock.WriteString("<pre>")
tmpBlock.WriteString(html.EscapeString(string(rawBytes)))
tmpBlock.WriteString("</pre>")
return tmpBlock.Bytes()
}

rd := csv.CreateReaderAndGuessDelimiter(rawBytes)

writeField := func(element, class, field string) {
tmpBlock.WriteString("<")
tmpBlock.WriteString(element)
if len(class) > 0 {
tmpBlock.WriteString(" class=\"")
tmpBlock.WriteString(class)
tmpBlock.WriteString("\"")
}
tmpBlock.WriteString(">")
tmpBlock.WriteString(html.EscapeString(field))
tmpBlock.WriteString("</")
tmpBlock.WriteString(element)
tmpBlock.WriteString(">")
}

tmpBlock.WriteString(`<table class="data-table">`)
row := 1
for {
fields, err := rd.Read()
if err == io.EOF {
Expand All @@ -52,62 +72,19 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]strin
continue
}
tmpBlock.WriteString("<tr>")
element := "td"
if row == 1 {
element = "th"
}
writeField(element, "line-num", strconv.Itoa(row))
for _, field := range fields {
tmpBlock.WriteString("<td>")
tmpBlock.WriteString(html.EscapeString(field))
tmpBlock.WriteString("</td>")
writeField(element, "", field)
}
tmpBlock.WriteString("</tr>")

row++
}
tmpBlock.WriteString("</table>")

return tmpBlock.Bytes()
}

// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
// Reads at most 10k bytes & 10 lines.
func (p Parser) bestDelimiter(data []byte) rune {
maxLines := 10
maxBytes := util.Min(len(data), 1e4)
text := string(data[:maxBytes])
text = quoteRegexp.ReplaceAllLiteralString(text, "")
lines := strings.SplitN(text, "\n", maxLines+1)
lines = lines[:util.Min(maxLines, len(lines))]

delimiters := []rune{',', ';', '\t', '|'}
bestDelim := delimiters[0]
bestScore := 0.0
for _, delim := range delimiters {
score := p.scoreDelimiter(lines, delim)
if score > bestScore {
bestScore = score
bestDelim = delim
}
}

return bestDelim
}

// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
countTotal := 0
countLineMax := 0
linesNotEqual := 0

for _, line := range lines {
if len(line) == 0 {
continue
}

countLine := strings.Count(line, string(delim))
countTotal += countLine
if countLine != countLineMax {
if countLineMax != 0 {
linesNotEqual++
}
countLineMax = util.Max(countLine, countLineMax)
}
}

return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
}
12 changes: 4 additions & 8 deletions modules/markup/csv/csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,10 @@ import (
func TestRenderCSV(t *testing.T) {
var parser Parser
var kases = map[string]string{
"a": "<table class=\"table\"><tr><td>a</td></tr></table>",
"1,2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
"1;2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
"1\t2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
"1|2": "<table class=\"table\"><tr><td>1</td><td>2</td></tr></table>",
"1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td></tr><tr><td>a</td><td>b</td><td>c</td></tr></table>",
"\"1,2,3,4\";\"a\nb\"\nc;d": "<table class=\"table\"><tr><td>1,2,3,4</td><td>a\nb</td></tr><tr><td>c</td><td>d</td></tr></table>",
"<br/>": "<table class=\"table\"><tr><td>&lt;br/&gt;</td></tr></table>",
"a": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>a</th></tr></table>",
"1,2": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr></table>",
"1;2\n3;4": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>1</th><th>2</th></tr><tr><td class=\"line-num\">2</td><td>3</td><td>4</td></tr></table>",
"<br/>": "<table class=\"data-table\"><tr><th class=\"line-num\">1</th><th>&lt;br/&gt;</th></tr></table>",
}

for k, v := range kases {
Expand Down
4 changes: 4 additions & 0 deletions modules/markup/sanitizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ func ReplaceSanitizer() {
// Allow icons, emojis, and chroma syntax on span
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`^((icon(\s+[\p{L}\p{N}_-]+)+)|(emoji))$|^([a-z][a-z0-9]{0,2})$`)).OnElements("span")

// Allow data tables
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`data-table`)).OnElements("table")
sanitizer.policy.AllowAttrs("class").Matching(regexp.MustCompile(`line-num`)).OnElements("th", "td")

// Allow generally safe attributes
generalSafeAttrs := []string{"abbr", "accept", "accept-charset",
"accesskey", "action", "align", "alt",
Expand Down
9 changes: 9 additions & 0 deletions modules/setting/setting.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ var (
Enabled bool `ini:"ENABLE_RENDER"`
} `ini:"ui.svg"`

CSV struct {
MaxFileSize int64
} `ini:"ui.csv"`

Admin struct {
UserPagingNum int
RepoPagingNum int
Expand Down Expand Up @@ -258,6 +262,11 @@ var (
}{
Enabled: true,
},
CSV: struct {
MaxFileSize int64
}{
MaxFileSize: 524288,
},
Admin: struct {
UserPagingNum int
RepoPagingNum int
Expand Down
5 changes: 5 additions & 0 deletions options/locale/locale_en-US.ini
Original file line number Diff line number Diff line change
Expand Up @@ -1860,6 +1860,7 @@ diff.whitespace_ignore_at_eol = Ignore changes in whitespace at EOL
diff.stats_desc = <strong> %d changed files</strong> with <strong>%d additions</strong> and <strong>%d deletions</strong>
diff.stats_desc_file = %d changes: %d additions and %d deletions
diff.bin = BIN
diff.bin_not_shown = Binary file not shown.
diff.view_file = View File
diff.file_before = Before
diff.file_after = After
Expand Down Expand Up @@ -1960,6 +1961,10 @@ topic.done = Done
topic.count_prompt = You can not select more than 25 topics
topic.format_prompt = Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

error.csv.too_large = Can't render this file because it is too large.
error.csv.unexpected = Can't render this file because it contains an unexpected character in line %d and column %d.
error.csv.invalid_field_count = Can't render this file because it has a wrong number of fields in line %d.
[org]
org_name_holder = Organization Name
org_full_name_holder = Organization Full Name
Expand Down
3 changes: 1 addition & 2 deletions routers/repo/commit.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,8 @@ func Diff(ctx *context.Context) {
return
}
}
setImageCompareContext(ctx, parentCommit, commit)
headTarget := path.Join(userName, repoName)
setPathsCompareContext(ctx, parentCommit, commit, headTarget)
setCompareContext(ctx, parentCommit, commit, headTarget)
ctx.Data["Title"] = commit.Summary() + " · " + base.ShortSha(commitID)
ctx.Data["Commit"] = commit
verification := models.ParseCommitWithSignature(commit)
Expand Down
Loading

0 comments on commit 0c61376

Please sign in to comment.