diff --git a/modules/charset/charset.go b/modules/charset/charset.go index 597ce5120c..00e0909d2f 100644 --- a/modules/charset/charset.go +++ b/modules/charset/charset.go @@ -26,9 +26,11 @@ type ConvertOpts struct { KeepBOM bool } +var ToUTF8WithFallbackReaderPrefetchSize = 16 * 1024 + // ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { - buf := make([]byte, 2048) + buf := make([]byte, ToUTF8WithFallbackReaderPrefetchSize) n, err := util.ReadAtMost(rd, buf) if err != nil { return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd) @@ -41,6 +43,7 @@ func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { + log.Error("Unknown encoding: %s", charsetLabel) return io.MultiReader(bytes.NewReader(buf[:n]), rd) } @@ -54,17 +57,18 @@ func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader { } // ToUTF8 converts content to UTF8 encoding -func ToUTF8(content []byte, opts ConvertOpts) (string, error) { +func ToUTF8(content []byte, opts ConvertOpts) ([]byte, error) { charsetLabel, err := DetectEncoding(content) if err != nil { - return "", err + return content, err } else if charsetLabel == "UTF-8" { - return string(MaybeRemoveBOM(content, opts)), nil + return MaybeRemoveBOM(content, opts), nil } encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { - return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel) + log.Error("Unknown encoding: %s", charsetLabel) + return content, fmt.Errorf("unknown encoding: %s", charsetLabel) } // If there is an error, we concatenate the nicely decoded part and the @@ -76,7 +80,7 @@ func ToUTF8(content []byte, opts ConvertOpts) (string, error) { result = MaybeRemoveBOM(result, opts) - return string(result), err + return result, err } // ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible @@ -94,6 +98,7 @@ func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte { encoding, _ := charset.Lookup(charsetLabel) if encoding == nil { + log.Error("Unknown encoding: %s", charsetLabel) return content } @@ -130,28 +135,37 @@ func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte { } // DetectEncoding detect the encoding of content -func DetectEncoding(content []byte) (string, error) { +// it always returns a detected or guessed "encoding" string, no matter error happens or not +func DetectEncoding(content []byte) (encoding string, _ error) { // First we check if the content represents valid utf8 content excepting a truncated character at the end. // Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do - // instead we walk backwards from the end to trim off a the incomplete character + // instead we walk backwards from the end to trim off the incomplete character toValidate := content end := len(toValidate) - 1 - if end < 0 { - // no-op - } else if toValidate[end]>>5 == 0b110 { - // Incomplete 1 byte extension e.g. © which has been truncated to - toValidate = toValidate[:end] - } else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 { - // Incomplete 2 byte extension e.g. ⛔ <9b><94> which has been truncated to <9b> - toValidate = toValidate[:end-1] - } else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 { - // Incomplete 3 byte extension e.g. 💩 <9f><92> which has been truncated to <9f><92> - toValidate = toValidate[:end-2] + // U+0000 U+007F 0yyyzzzz + // U+0080 U+07FF 110xxxyy 10yyzzzz + // U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz + // U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz + cnt := 0 + for end >= 0 && cnt < 4 { + c := toValidate[end] + if c>>5 == 0b110 || c>>4 == 0b1110 || c>>3 == 0b11110 { + // a leading byte + toValidate = toValidate[:end] + break + } else if c>>6 == 0b10 { + // a continuation byte + end-- + } else { + // not an utf-8 byte + break + } + cnt++ } + if utf8.Valid(toValidate) { - log.Debug("Detected encoding: utf-8 (fast)") return "UTF-8", nil } @@ -160,7 +174,7 @@ func DetectEncoding(content []byte) (string, error) { if len(content) < 1024 { // Check if original content is valid if _, err := textDetector.DetectBest(content); err != nil { - return "", err + return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err } times := 1024 / len(content) detectContent = make([]byte, 0, times*len(content)) @@ -171,14 +185,10 @@ func DetectEncoding(content []byte) (string, error) { detectContent = content } - // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break + // Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie-break results, err := textDetector.DetectAll(detectContent) if err != nil { - if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 { - log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) - return setting.Repository.AnsiCharset, nil - } - return "", err + return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err } topConfidence := results[0].Confidence @@ -201,11 +211,9 @@ func DetectEncoding(content []byte) (string, error) { } // FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument - if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 { - log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset) + if topResult.Charset != "UTF-8" && setting.Repository.AnsiCharset != "" { return setting.Repository.AnsiCharset, err } - log.Debug("Detected encoding: %s", topResult.Charset) - return topResult.Charset, err + return topResult.Charset, nil } diff --git a/modules/charset/charset_test.go b/modules/charset/charset_test.go index cd2e3b9aaa..2fa820daf2 100644 --- a/modules/charset/charset_test.go +++ b/modules/charset/charset_test.go @@ -4,12 +4,12 @@ package charset import ( - "bytes" "io" "strings" "testing" "code.gitea.io/gitea/modules/setting" + "code.gitea.io/gitea/modules/test" "github.com/stretchr/testify/assert" ) @@ -47,12 +47,12 @@ func TestToUTF8(t *testing.T) { res, err := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{}) assert.NoError(t, err) - assert.Equal(t, "ABC", res) + assert.Equal(t, "ABC", string(res)) // "áéíóú" res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{}) assert.NoError(t, err) - assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) // "áéíóú" res, err = ToUTF8([]byte{ @@ -60,7 +60,7 @@ func TestToUTF8(t *testing.T) { 0xc3, 0xba, }, ConvertOpts{}) assert.NoError(t, err) - assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res)) + assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res) res, err = ToUTF8([]byte{ 0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, @@ -96,12 +96,11 @@ func TestToUTF8(t *testing.T) { assert.Equal(t, []byte{ 0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3, 0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82, - }, - []byte(res)) + }, res) res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{}) assert.NoError(t, err) - assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res)) + assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res) } func TestToUTF8WithFallback(t *testing.T) { @@ -231,152 +230,42 @@ func TestDetectEncoding(t *testing.T) { assert.Error(t, err) } -func stringMustStartWith(t *testing.T, expected, value string) { - assert.Equal(t, expected, value[:len(expected)]) +func stringMustStartWith(t *testing.T, expected string, value []byte) { + assert.Equal(t, expected, string(value[:len(expected)])) } -func stringMustEndWith(t *testing.T, expected, value string) { - assert.Equal(t, expected, value[len(value)-len(expected):]) +func stringMustEndWith(t *testing.T, expected string, value []byte) { + assert.Equal(t, expected, string(value[len(value)-len(expected):])) } func TestToUTF8WithFallbackReader(t *testing.T) { resetDefaultCharsetsOrder() + test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize) - for testLen := range 2048 { - pattern := " test { () }\n" - input := "" - for len(input) < testLen { - input += pattern - } - input = input[:testLen] - input += "// Выключаем" - rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)), ConvertOpts{}) + block := "aá啊🤔" + runes := []rune(block) + assert.Len(t, string(runes[0]), 1) + assert.Len(t, string(runes[1]), 2) + assert.Len(t, string(runes[2]), 3) + assert.Len(t, string(runes[3]), 4) + + content := strings.Repeat(block, 2) + for i := 1; i < len(content); i++ { + encoding, err := DetectEncoding([]byte(content[:i])) + assert.NoError(t, err) + assert.Equal(t, "UTF-8", encoding) + + ToUTF8WithFallbackReaderPrefetchSize = i + rd := ToUTF8WithFallbackReader(strings.NewReader(content), ConvertOpts{}) r, _ := io.ReadAll(rd) - assert.Equalf(t, input, string(r), "testing string len=%d", testLen) + assert.Equal(t, content, string(r)) + } + for _, r := range runes { + content = "abc abc " + string(r) + string(r) + string(r) + for i := 0; i < len(content); i++ { + encoding, err := DetectEncoding([]byte(content[:i])) + assert.NoError(t, err) + assert.Equal(t, "UTF-8", encoding) + } } - - truncatedOneByteExtension := failFastBytes - encoding, _ := DetectEncoding(truncatedOneByteExtension) - assert.Equal(t, "UTF-8", encoding) - - truncatedTwoByteExtension := failFastBytes - truncatedTwoByteExtension[len(failFastBytes)-1] = 0x9b - truncatedTwoByteExtension[len(failFastBytes)-2] = 0xe2 - - encoding, _ = DetectEncoding(truncatedTwoByteExtension) - assert.Equal(t, "UTF-8", encoding) - - truncatedThreeByteExtension := failFastBytes - truncatedThreeByteExtension[len(failFastBytes)-1] = 0x92 - truncatedThreeByteExtension[len(failFastBytes)-2] = 0x9f - truncatedThreeByteExtension[len(failFastBytes)-3] = 0xf0 - - encoding, _ = DetectEncoding(truncatedThreeByteExtension) - assert.Equal(t, "UTF-8", encoding) -} - -var failFastBytes = []byte{ - 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x74, 0x6f, - 0x6f, 0x6c, 0x73, 0x2e, 0x61, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x64, 0x65, 0x66, 0x73, 0x2e, 0x63, 0x6f, 0x6e, - 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x4f, 0x73, 0x0a, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, - 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, - 0x74, 0x2e, 0x67, 0x72, 0x61, 0x64, 0x6c, 0x65, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x75, 0x6e, 0x2e, 0x42, - 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x0a, 0x0a, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x64, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, - 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x64, 0x65, 0x70, 0x65, - 0x6e, 0x64, 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, - 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x2d, 0x64, 0x6f, 0x63, 0x73, 0x22, 0x29, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x64, 0x62, - 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, - 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x66, - 0x73, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, - 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x6d, 0x71, 0x22, 0x29, 0x29, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, - 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, - 0x2d, 0x61, 0x75, 0x74, 0x68, 0x2d, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, - 0x65, 0x72, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, - 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x68, 0x61, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x22, 0x29, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, - 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x77, 0x65, 0x62, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, - 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, - 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x6f, 0x70, - 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, - 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, - 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x63, 0x74, 0x75, 0x61, 0x74, 0x6f, 0x72, 0x22, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, - 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, - 0x6c, 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x73, 0x74, 0x72, 0x61, 0x70, 0x22, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, - 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, - 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x73, 0x75, 0x6c, 0x2d, 0x61, 0x6c, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, - 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c, - 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x65, 0x72, 0x2d, 0x73, 0x6c, 0x65, 0x75, 0x74, 0x68, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, - 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, - 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x72, 0x65, 0x74, 0x72, 0x79, 0x3a, - 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x72, 0x65, 0x74, 0x72, 0x79, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x63, 0x68, 0x2e, 0x71, - 0x6f, 0x73, 0x2e, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x3a, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x2d, 0x63, - 0x6c, 0x61, 0x73, 0x73, 0x69, 0x63, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, - 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x69, 0x6f, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, - 0x74, 0x65, 0x72, 0x3a, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x2d, 0x72, 0x65, 0x67, 0x69, 0x73, - 0x74, 0x72, 0x79, 0x2d, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6b, 0x6f, 0x74, - 0x6c, 0x69, 0x6e, 0x28, 0x22, 0x73, 0x74, 0x64, 0x6c, 0x69, 0x62, 0x22, 0x29, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, - 0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x65, 0x73, 0x74, 0x20, 0x64, 0x65, 0x70, 0x65, 0x6e, 0x64, - 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, - 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x65, 0x73, 0x74, 0x49, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, - 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, - 0x74, 0x65, 0x73, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, - 0x61, 0x72, 0x20, 0x62, 0x79, 0x20, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72, - 0x69, 0x6e, 0x67, 0x28, 0x4a, 0x61, 0x72, 0x3a, 0x3a, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x61, 0x72, 0x63, 0x68, 0x69, 0x76, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x66, 0x69, 0x65, 0x72, 0x2e, - 0x73, 0x65, 0x74, 0x28, 0x22, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x64, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x76, 0x61, 0x6c, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, - 0x20, 0x62, 0x79, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x67, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x6e, 0x69, 0x66, 0x65, 0x73, 0x74, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, - 0x73, 0x28, 0x22, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x2d, 0x50, 0x61, 0x74, 0x68, 0x22, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, - 0x20, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0x2f, 0x2b, 0x22, 0x2e, 0x74, 0x6f, 0x52, 0x65, 0x67, 0x65, 0x78, 0x28, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, - 0x65, 0x20, 0x66, 0x75, 0x6e, 0x20, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x3a, 0x20, 0x53, 0x74, - 0x72, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, - 0x61, 0x74, 0x68, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x54, 0x6f, 0x53, 0x74, 0x72, 0x69, - 0x6e, 0x67, 0x28, 0x22, 0x20, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x74, 0x2e, 0x74, 0x6f, 0x55, 0x52, 0x49, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x55, - 0x52, 0x4c, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x46, 0x69, 0x72, 0x73, 0x74, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x22, 0x2f, - 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x74, 0x61, 0x73, - 0x6b, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3c, 0x42, 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x3e, 0x28, 0x22, 0x62, - 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x4f, - 0x73, 0x2e, 0x69, 0x73, 0x46, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x28, 0x4f, 0x73, 0x2e, 0x46, 0x41, 0x4d, 0x49, 0x4c, 0x59, - 0x5f, 0x57, 0x49, 0x4e, 0x44, 0x4f, 0x57, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x28, 0x73, - 0x6f, 0x75, 0x72, 0x63, 0x65, 0x53, 0x65, 0x74, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x28, 0x22, 0x6d, 0x61, 0x69, - 0x6e, 0x22, 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x20, 0x7b, 0x20, 0x69, 0x74, 0x2e, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20, - 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, 0x61, 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0xd0, } diff --git a/routers/web/repo/editor.go b/routers/web/repo/editor.go index 8c630cb35f..6aefe1a470 100644 --- a/routers/web/repo/editor.go +++ b/routers/web/repo/editor.go @@ -312,11 +312,13 @@ func EditFile(ctx *context.Context) { ctx.ServerError("ReadAll", err) return } + var fileContent string if content, err := charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true}); err != nil { - ctx.Data["FileContent"] = string(buf) + fileContent = string(buf) } else { - ctx.Data["FileContent"] = content + fileContent = string(content) } + ctx.Data["FileContent"] = fileContent } } diff --git a/services/gitdiff/gitdiff.go b/services/gitdiff/gitdiff.go index 4c7063a3ba..a39954a805 100644 --- a/services/gitdiff/gitdiff.go +++ b/services/gitdiff/gitdiff.go @@ -1233,10 +1233,10 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit shouldFullFileHighlight := !setting.Git.DisableDiffHighlight && attrDiff.Value() == "" if shouldFullFileHighlight { if limitedContent.LeftContent != nil && limitedContent.LeftContent.buf.Len() < MaxDiffHighlightEntireFileSize { - diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.String()) + diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.Bytes()) } if limitedContent.RightContent != nil && limitedContent.RightContent.buf.Len() < MaxDiffHighlightEntireFileSize { - diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.String()) + diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.Bytes()) } } } @@ -1244,9 +1244,35 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit return diff, nil } -func highlightCodeLines(diffFile *DiffFile, isLeft bool, content string) map[int]template.HTML { +func splitHighlightLines(buf []byte) (ret [][]byte) { + lineCount := bytes.Count(buf, []byte("\n")) + 1 + ret = make([][]byte, 0, lineCount) + nlTagClose := []byte("\n" right after \n, sometimes before. + // * "text\n" + // * "text\n" + if bytes.HasPrefix(buf[pos:], nlTagClose) { + pos1 := bytes.IndexByte(buf[pos:], '>') + if pos1 != -1 { + pos += pos1 + } + } + ret = append(ret, buf[:pos+1]) + buf = buf[pos+1:] + } +} + +func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML { + contentUTF8, _ := charset.ToUTF8(rawContent, charset.ConvertOpts{}) + content := util.UnsafeBytesToString(contentUTF8) highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content) - splitLines := strings.Split(string(highlightedNewContent), "\n") + splitLines := splitHighlightLines([]byte(highlightedNewContent)) lines := make(map[int]template.HTML, len(splitLines)) // only save the highlighted lines we need, but not the whole file, to save memory for _, sec := range diffFile.Sections { diff --git a/services/gitdiff/gitdiff_test.go b/services/gitdiff/gitdiff_test.go index 7b64b6b5f8..07334e0061 100644 --- a/services/gitdiff/gitdiff_test.go +++ b/services/gitdiff/gitdiff_test.go @@ -5,6 +5,7 @@ package gitdiff import ( + "html/template" "strconv" "strings" "testing" @@ -640,3 +641,41 @@ func TestNoCrashes(t *testing.T) { ParsePatch(t.Context(), setting.Git.MaxGitDiffLines, setting.Git.MaxGitDiffLineCharacters, setting.Git.MaxGitDiffFiles, strings.NewReader(testcase.gitdiff), "") } } + +func TestHighlightCodeLines(t *testing.T) { + t.Run("CharsetDetecting", func(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "c", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{{LeftIdx: 1}}, + }, + }, + } + ret := highlightCodeLines(diffFile, true, []byte("// abc\xcc def\xcd")) // ISO-8859-1 bytes + assert.Equal(t, "// abcÌ defÍ\n", string(ret[0])) + }) + + t.Run("LeftLines", func(t *testing.T) { + diffFile := &DiffFile{ + Name: "a.c", + Language: "c", + Sections: []*DiffSection{ + { + Lines: []*DiffLine{ + {LeftIdx: 1}, + {LeftIdx: 2}, + {LeftIdx: 3}, + }, + }, + }, + } + const nl = "\n" + ret := highlightCodeLines(diffFile, true, []byte("a\nb\n")) + assert.Equal(t, map[int]template.HTML{ + 0: `a` + nl, + 1: `b`, + }, ret) + }) +}