mirror of
https://github.com/go-gitea/gitea.git
synced 2025-12-13 18:32:54 +00:00
Fix bug when viewing the commit diff page with non-ANSI files (#36149)
Fix #35504 --------- Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
This commit is contained in:
@@ -5,12 +5,10 @@ package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
|
||||
@@ -23,60 +21,39 @@ import (
|
||||
var UTF8BOM = []byte{'\xef', '\xbb', '\xbf'}
|
||||
|
||||
type ConvertOpts struct {
|
||||
KeepBOM bool
|
||||
KeepBOM bool
|
||||
ErrorReplacement []byte
|
||||
ErrorReturnOrigin bool
|
||||
}
|
||||
|
||||
var ToUTF8WithFallbackReaderPrefetchSize = 16 * 1024
|
||||
|
||||
// ToUTF8WithFallbackReader detects the encoding of content and converts to UTF-8 reader if possible
|
||||
func ToUTF8WithFallbackReader(rd io.Reader, opts ConvertOpts) io.Reader {
|
||||
buf := make([]byte, 2048)
|
||||
buf := make([]byte, ToUTF8WithFallbackReaderPrefetchSize)
|
||||
n, err := util.ReadAtMost(rd, buf)
|
||||
if err != nil {
|
||||
return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
|
||||
}
|
||||
|
||||
charsetLabel, err := DetectEncoding(buf[:n])
|
||||
if err != nil || charsetLabel == "UTF-8" {
|
||||
return io.MultiReader(bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)), rd)
|
||||
}
|
||||
|
||||
encoding, _ := charset.Lookup(charsetLabel)
|
||||
if encoding == nil {
|
||||
// read error occurs, don't do any processing
|
||||
return io.MultiReader(bytes.NewReader(buf[:n]), rd)
|
||||
}
|
||||
|
||||
return transform.NewReader(
|
||||
io.MultiReader(
|
||||
bytes.NewReader(MaybeRemoveBOM(buf[:n], opts)),
|
||||
rd,
|
||||
),
|
||||
encoding.NewDecoder(),
|
||||
)
|
||||
}
|
||||
|
||||
// ToUTF8 converts content to UTF8 encoding
|
||||
func ToUTF8(content []byte, opts ConvertOpts) (string, error) {
|
||||
charsetLabel, err := DetectEncoding(content)
|
||||
if err != nil {
|
||||
return "", err
|
||||
} else if charsetLabel == "UTF-8" {
|
||||
return string(MaybeRemoveBOM(content, opts)), nil
|
||||
charsetLabel, _ := DetectEncoding(buf[:n])
|
||||
if charsetLabel == "UTF-8" {
|
||||
// is utf-8, try to remove BOM and read it as-is
|
||||
return io.MultiReader(bytes.NewReader(maybeRemoveBOM(buf[:n], opts)), rd)
|
||||
}
|
||||
|
||||
encoding, _ := charset.Lookup(charsetLabel)
|
||||
if encoding == nil {
|
||||
return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
|
||||
// unknown charset, don't do any processing
|
||||
return io.MultiReader(bytes.NewReader(buf[:n]), rd)
|
||||
}
|
||||
|
||||
// If there is an error, we concatenate the nicely decoded part and the
|
||||
// original left over. This way we won't lose much data.
|
||||
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
|
||||
if err != nil {
|
||||
result = append(result, content[n:]...)
|
||||
}
|
||||
|
||||
result = MaybeRemoveBOM(result, opts)
|
||||
|
||||
return string(result), err
|
||||
// convert from charset to utf-8
|
||||
return transform.NewReader(
|
||||
io.MultiReader(bytes.NewReader(buf[:n]), rd),
|
||||
encoding.NewDecoder(),
|
||||
)
|
||||
}
|
||||
|
||||
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
|
||||
@@ -85,73 +62,84 @@ func ToUTF8WithFallback(content []byte, opts ConvertOpts) []byte {
|
||||
return bs
|
||||
}
|
||||
|
||||
// ToUTF8DropErrors makes sure the return string is valid utf-8; attempts conversion if possible
|
||||
func ToUTF8DropErrors(content []byte, opts ConvertOpts) []byte {
|
||||
charsetLabel, err := DetectEncoding(content)
|
||||
if err != nil || charsetLabel == "UTF-8" {
|
||||
return MaybeRemoveBOM(content, opts)
|
||||
func ToUTF8DropErrors(content []byte) []byte {
|
||||
return ToUTF8(content, ConvertOpts{ErrorReplacement: []byte{' '}})
|
||||
}
|
||||
|
||||
func ToUTF8(content []byte, opts ConvertOpts) []byte {
|
||||
charsetLabel, _ := DetectEncoding(content)
|
||||
if charsetLabel == "UTF-8" {
|
||||
return maybeRemoveBOM(content, opts)
|
||||
}
|
||||
|
||||
encoding, _ := charset.Lookup(charsetLabel)
|
||||
if encoding == nil {
|
||||
setting.PanicInDevOrTesting("unsupported detected charset %q, it shouldn't happen", charsetLabel)
|
||||
return content
|
||||
}
|
||||
|
||||
// We ignore any non-decodable parts from the file.
|
||||
// Some parts might be lost
|
||||
var decoded []byte
|
||||
decoder := encoding.NewDecoder()
|
||||
idx := 0
|
||||
for {
|
||||
for idx < len(content) {
|
||||
result, n, err := transform.Bytes(decoder, content[idx:])
|
||||
decoded = append(decoded, result...)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
decoded = append(decoded, ' ')
|
||||
idx = idx + n + 1
|
||||
if idx >= len(content) {
|
||||
break
|
||||
if opts.ErrorReturnOrigin {
|
||||
return content
|
||||
}
|
||||
if opts.ErrorReplacement == nil {
|
||||
decoded = append(decoded, content[idx+n])
|
||||
} else {
|
||||
decoded = append(decoded, opts.ErrorReplacement...)
|
||||
}
|
||||
idx += n + 1
|
||||
}
|
||||
|
||||
return MaybeRemoveBOM(decoded, opts)
|
||||
return maybeRemoveBOM(decoded, opts)
|
||||
}
|
||||
|
||||
// MaybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
|
||||
func MaybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
|
||||
// maybeRemoveBOM removes a UTF-8 BOM from a []byte when opts.KeepBOM is false
|
||||
func maybeRemoveBOM(content []byte, opts ConvertOpts) []byte {
|
||||
if opts.KeepBOM {
|
||||
return content
|
||||
}
|
||||
if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
|
||||
return content[3:]
|
||||
}
|
||||
return content
|
||||
return bytes.TrimPrefix(content, UTF8BOM)
|
||||
}
|
||||
|
||||
// DetectEncoding detect the encoding of content
|
||||
func DetectEncoding(content []byte) (string, error) {
|
||||
// it always returns a detected or guessed "encoding" string, no matter error happens or not
|
||||
func DetectEncoding(content []byte) (encoding string, _ error) {
|
||||
// First we check if the content represents valid utf8 content excepting a truncated character at the end.
|
||||
|
||||
// Now we could decode all the runes in turn but this is not necessarily the cheapest thing to do
|
||||
// instead we walk backwards from the end to trim off a the incomplete character
|
||||
// instead we walk backwards from the end to trim off the incomplete character
|
||||
toValidate := content
|
||||
end := len(toValidate) - 1
|
||||
|
||||
if end < 0 {
|
||||
// no-op
|
||||
} else if toValidate[end]>>5 == 0b110 {
|
||||
// Incomplete 1 byte extension e.g. © <c2><a9> which has been truncated to <c2>
|
||||
toValidate = toValidate[:end]
|
||||
} else if end > 0 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>4 == 0b1110 {
|
||||
// Incomplete 2 byte extension e.g. ⛔ <e2><9b><94> which has been truncated to <e2><9b>
|
||||
toValidate = toValidate[:end-1]
|
||||
} else if end > 1 && toValidate[end]>>6 == 0b10 && toValidate[end-1]>>6 == 0b10 && toValidate[end-2]>>3 == 0b11110 {
|
||||
// Incomplete 3 byte extension e.g. 💩 <f0><9f><92><a9> which has been truncated to <f0><9f><92>
|
||||
toValidate = toValidate[:end-2]
|
||||
// U+0000 U+007F 0yyyzzzz
|
||||
// U+0080 U+07FF 110xxxyy 10yyzzzz
|
||||
// U+0800 U+FFFF 1110wwww 10xxxxyy 10yyzzzz
|
||||
// U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
|
||||
cnt := 0
|
||||
for end >= 0 && cnt < 4 {
|
||||
c := toValidate[end]
|
||||
if c>>5 == 0b110 || c>>4 == 0b1110 || c>>3 == 0b11110 {
|
||||
// a leading byte
|
||||
toValidate = toValidate[:end]
|
||||
break
|
||||
} else if c>>6 == 0b10 {
|
||||
// a continuation byte
|
||||
end--
|
||||
} else {
|
||||
// not an utf-8 byte
|
||||
break
|
||||
}
|
||||
cnt++
|
||||
}
|
||||
|
||||
if utf8.Valid(toValidate) {
|
||||
log.Debug("Detected encoding: utf-8 (fast)")
|
||||
return "UTF-8", nil
|
||||
}
|
||||
|
||||
@@ -160,7 +148,7 @@ func DetectEncoding(content []byte) (string, error) {
|
||||
if len(content) < 1024 {
|
||||
// Check if original content is valid
|
||||
if _, err := textDetector.DetectBest(content); err != nil {
|
||||
return "", err
|
||||
return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
|
||||
}
|
||||
times := 1024 / len(content)
|
||||
detectContent = make([]byte, 0, times*len(content))
|
||||
@@ -171,14 +159,10 @@ func DetectEncoding(content []byte) (string, error) {
|
||||
detectContent = content
|
||||
}
|
||||
|
||||
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
|
||||
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie-break
|
||||
results, err := textDetector.DetectAll(detectContent)
|
||||
if err != nil {
|
||||
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
|
||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
||||
return setting.Repository.AnsiCharset, nil
|
||||
}
|
||||
return "", err
|
||||
return util.IfZero(setting.Repository.AnsiCharset, "UTF-8"), err
|
||||
}
|
||||
|
||||
topConfidence := results[0].Confidence
|
||||
@@ -201,11 +185,9 @@ func DetectEncoding(content []byte) (string, error) {
|
||||
}
|
||||
|
||||
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
|
||||
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
|
||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
||||
if topResult.Charset != "UTF-8" && setting.Repository.AnsiCharset != "" {
|
||||
return setting.Repository.AnsiCharset, err
|
||||
}
|
||||
|
||||
log.Debug("Detected encoding: %s", topResult.Charset)
|
||||
return topResult.Charset, err
|
||||
return topResult.Charset, nil
|
||||
}
|
||||
|
||||
@@ -4,108 +4,89 @@
|
||||
package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/test"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func resetDefaultCharsetsOrder() {
|
||||
defaultDetectedCharsetsOrder := make([]string, 0, len(setting.Repository.DetectedCharsetsOrder))
|
||||
for _, charset := range setting.Repository.DetectedCharsetsOrder {
|
||||
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
|
||||
}
|
||||
func TestMain(m *testing.M) {
|
||||
setting.Repository.DetectedCharsetScore = map[string]int{}
|
||||
i := 0
|
||||
for _, charset := range defaultDetectedCharsetsOrder {
|
||||
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
|
||||
if _, has := setting.Repository.DetectedCharsetScore[canonicalCharset]; !has {
|
||||
setting.Repository.DetectedCharsetScore[canonicalCharset] = i
|
||||
i++
|
||||
}
|
||||
for i, charset := range setting.Repository.DetectedCharsetsOrder {
|
||||
setting.Repository.DetectedCharsetScore[strings.ToLower(charset)] = i
|
||||
}
|
||||
os.Exit(m.Run())
|
||||
}
|
||||
|
||||
func TestMaybeRemoveBOM(t *testing.T) {
|
||||
res := MaybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
res := maybeRemoveBOM([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
res = MaybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
res = maybeRemoveBOM([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
}
|
||||
|
||||
func TestToUTF8(t *testing.T) {
|
||||
resetDefaultCharsetsOrder()
|
||||
|
||||
// Note: golang compiler seems so behave differently depending on the current
|
||||
// locale, so some conversions might behave differently. For that reason, we don't
|
||||
// depend on particular conversions but in expected behaviors.
|
||||
|
||||
res, err := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "ABC", res)
|
||||
res := ToUTF8([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
|
||||
assert.Equal(t, "ABC", string(res))
|
||||
|
||||
// "áéíóú"
|
||||
res, err = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
|
||||
res = ToUTF8([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// "áéíóú"
|
||||
res, err = ToUTF8([]byte{
|
||||
res = ToUTF8([]byte{
|
||||
0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
|
||||
0xc3, 0xba,
|
||||
}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
res, err = ToUTF8([]byte{
|
||||
res = ToUTF8([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
|
||||
}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
stringMustStartWith(t, "Hola,", res)
|
||||
stringMustEndWith(t, "AAA.", res)
|
||||
|
||||
res, err = ToUTF8([]byte{
|
||||
res = ToUTF8([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
|
||||
}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
stringMustStartWith(t, "Hola,", res)
|
||||
stringMustEndWith(t, "AAA.", res)
|
||||
|
||||
res, err = ToUTF8([]byte{
|
||||
res = ToUTF8([]byte{
|
||||
0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
|
||||
0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e,
|
||||
}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
stringMustStartWith(t, "Hola,", res)
|
||||
stringMustEndWith(t, "AAA.", res)
|
||||
|
||||
// Japanese (Shift-JIS)
|
||||
// 日属秘ぞしちゅ。
|
||||
res, err = ToUTF8([]byte{
|
||||
res = ToUTF8([]byte{
|
||||
0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
|
||||
0xBF, 0x82, 0xE3, 0x81, 0x42,
|
||||
}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, []byte{
|
||||
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
|
||||
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
|
||||
},
|
||||
[]byte(res))
|
||||
}, res)
|
||||
|
||||
res, err = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
|
||||
res = ToUTF8([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
|
||||
}
|
||||
|
||||
func TestToUTF8WithFallback(t *testing.T) {
|
||||
resetDefaultCharsetsOrder()
|
||||
// "ABC"
|
||||
res := ToUTF8WithFallback([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
|
||||
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
|
||||
@@ -152,54 +133,58 @@ func TestToUTF8WithFallback(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestToUTF8DropErrors(t *testing.T) {
|
||||
resetDefaultCharsetsOrder()
|
||||
// "ABC"
|
||||
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43}, ConvertOpts{})
|
||||
res := ToUTF8DropErrors([]byte{0x41, 0x42, 0x43})
|
||||
assert.Equal(t, []byte{0x41, 0x42, 0x43}, res)
|
||||
|
||||
// "áéíóú"
|
||||
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// UTF8 BOM + "áéíóú"
|
||||
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
|
||||
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
|
||||
|
||||
// "Hola, así cómo ños"
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
|
||||
assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73}, res[:8])
|
||||
assert.Equal(t, []byte{0x73}, res[len(res)-1:])
|
||||
|
||||
// "Hola, así cómo "
|
||||
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
|
||||
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
|
||||
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
|
||||
assert.Equal(t, minmatch, res[0:len(minmatch)])
|
||||
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
|
||||
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
|
||||
assert.Equal(t, minmatch, res[0:len(minmatch)])
|
||||
|
||||
// Japanese (Shift-JIS)
|
||||
// "日属秘ぞしちゅ。"
|
||||
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
|
||||
assert.Equal(t, []byte{
|
||||
0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
|
||||
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82,
|
||||
}, res)
|
||||
|
||||
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00}, ConvertOpts{})
|
||||
res = ToUTF8DropErrors([]byte{0x00, 0x00, 0x00, 0x00})
|
||||
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, res)
|
||||
}
|
||||
|
||||
func TestDetectEncoding(t *testing.T) {
|
||||
resetDefaultCharsetsOrder()
|
||||
testSuccess := func(b []byte, expected string) {
|
||||
encoding, err := DetectEncoding(b)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, expected, encoding)
|
||||
}
|
||||
|
||||
// invalid bytes
|
||||
encoding, err := DetectEncoding([]byte{0xfa})
|
||||
assert.Error(t, err)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
|
||||
// utf-8
|
||||
b := []byte("just some ascii")
|
||||
testSuccess(b, "UTF-8")
|
||||
@@ -214,169 +199,49 @@ func TestDetectEncoding(t *testing.T) {
|
||||
|
||||
// iso-8859-1: d<accented e>cor<newline>
|
||||
b = []byte{0x44, 0xe9, 0x63, 0x6f, 0x72, 0x0a}
|
||||
encoding, err := DetectEncoding(b)
|
||||
encoding, err = DetectEncoding(b)
|
||||
assert.NoError(t, err)
|
||||
assert.Contains(t, encoding, "ISO-8859-1")
|
||||
|
||||
old := setting.Repository.AnsiCharset
|
||||
setting.Repository.AnsiCharset = "placeholder"
|
||||
defer func() {
|
||||
setting.Repository.AnsiCharset = old
|
||||
}()
|
||||
testSuccess(b, "placeholder")
|
||||
|
||||
// invalid bytes
|
||||
b = []byte{0xfa}
|
||||
_, err = DetectEncoding(b)
|
||||
assert.Error(t, err)
|
||||
defer test.MockVariableValue(&setting.Repository.AnsiCharset, "MyEncoding")()
|
||||
testSuccess(b, "MyEncoding")
|
||||
}
|
||||
|
||||
func stringMustStartWith(t *testing.T, expected, value string) {
|
||||
assert.Equal(t, expected, value[:len(expected)])
|
||||
func stringMustStartWith(t *testing.T, expected string, value []byte) {
|
||||
assert.Equal(t, expected, string(value[:len(expected)]))
|
||||
}
|
||||
|
||||
func stringMustEndWith(t *testing.T, expected, value string) {
|
||||
assert.Equal(t, expected, value[len(value)-len(expected):])
|
||||
func stringMustEndWith(t *testing.T, expected string, value []byte) {
|
||||
assert.Equal(t, expected, string(value[len(value)-len(expected):]))
|
||||
}
|
||||
|
||||
func TestToUTF8WithFallbackReader(t *testing.T) {
|
||||
resetDefaultCharsetsOrder()
|
||||
test.MockVariableValue(&ToUTF8WithFallbackReaderPrefetchSize)
|
||||
|
||||
for testLen := range 2048 {
|
||||
pattern := " test { () }\n"
|
||||
input := ""
|
||||
for len(input) < testLen {
|
||||
input += pattern
|
||||
}
|
||||
input = input[:testLen]
|
||||
input += "// Выключаем"
|
||||
rd := ToUTF8WithFallbackReader(bytes.NewReader([]byte(input)), ConvertOpts{})
|
||||
block := "aá啊🤔"
|
||||
runes := []rune(block)
|
||||
assert.Len(t, string(runes[0]), 1)
|
||||
assert.Len(t, string(runes[1]), 2)
|
||||
assert.Len(t, string(runes[2]), 3)
|
||||
assert.Len(t, string(runes[3]), 4)
|
||||
|
||||
content := strings.Repeat(block, 2)
|
||||
for i := 1; i < len(content); i++ {
|
||||
encoding, err := DetectEncoding([]byte(content[:i]))
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
|
||||
ToUTF8WithFallbackReaderPrefetchSize = i
|
||||
rd := ToUTF8WithFallbackReader(strings.NewReader(content), ConvertOpts{})
|
||||
r, _ := io.ReadAll(rd)
|
||||
assert.Equalf(t, input, string(r), "testing string len=%d", testLen)
|
||||
assert.Equal(t, content, string(r))
|
||||
}
|
||||
for _, r := range runes {
|
||||
content = "abc abc " + string(r) + string(r) + string(r)
|
||||
for i := 0; i < len(content); i++ {
|
||||
encoding, err := DetectEncoding([]byte(content[:i]))
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
}
|
||||
}
|
||||
|
||||
truncatedOneByteExtension := failFastBytes
|
||||
encoding, _ := DetectEncoding(truncatedOneByteExtension)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
|
||||
truncatedTwoByteExtension := failFastBytes
|
||||
truncatedTwoByteExtension[len(failFastBytes)-1] = 0x9b
|
||||
truncatedTwoByteExtension[len(failFastBytes)-2] = 0xe2
|
||||
|
||||
encoding, _ = DetectEncoding(truncatedTwoByteExtension)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
|
||||
truncatedThreeByteExtension := failFastBytes
|
||||
truncatedThreeByteExtension[len(failFastBytes)-1] = 0x92
|
||||
truncatedThreeByteExtension[len(failFastBytes)-2] = 0x9f
|
||||
truncatedThreeByteExtension[len(failFastBytes)-3] = 0xf0
|
||||
|
||||
encoding, _ = DetectEncoding(truncatedThreeByteExtension)
|
||||
assert.Equal(t, "UTF-8", encoding)
|
||||
}
|
||||
|
||||
var failFastBytes = []byte{
|
||||
0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67, 0x2e, 0x61, 0x70, 0x61, 0x63, 0x68, 0x65, 0x2e, 0x74, 0x6f,
|
||||
0x6f, 0x6c, 0x73, 0x2e, 0x61, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x64, 0x65, 0x66, 0x73, 0x2e, 0x63, 0x6f, 0x6e,
|
||||
0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x4f, 0x73, 0x0a, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6f, 0x72, 0x67,
|
||||
0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f,
|
||||
0x74, 0x2e, 0x67, 0x72, 0x61, 0x64, 0x6c, 0x65, 0x2e, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x75, 0x6e, 0x2e, 0x42,
|
||||
0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x0a, 0x0a, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||
0x20, 0x20, 0x69, 0x64, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d,
|
||||
0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x64, 0x65, 0x70, 0x65,
|
||||
0x6e, 0x64, 0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65,
|
||||
0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a,
|
||||
0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d,
|
||||
0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74,
|
||||
0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x61, 0x70, 0x69, 0x2d, 0x64, 0x6f, 0x63, 0x73, 0x22, 0x29,
|
||||
0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e,
|
||||
0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x64, 0x62,
|
||||
0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69,
|
||||
0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a,
|
||||
0x73, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65,
|
||||
0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a,
|
||||
0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x66,
|
||||
0x73, 0x22, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
|
||||
0x69, 0x6f, 0x6e, 0x28, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x22, 0x3a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
|
||||
0x3a, 0x69, 0x6e, 0x74, 0x65, 0x67, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2d, 0x6d, 0x71, 0x22, 0x29, 0x29, 0x0a, 0x0a,
|
||||
0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22,
|
||||
0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e,
|
||||
0x2d, 0x61, 0x75, 0x74, 0x68, 0x2d, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74,
|
||||
0x65, 0x72, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74,
|
||||
0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63,
|
||||
0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x68, 0x61, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c,
|
||||
0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a, 0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e,
|
||||
0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x22, 0x29, 0x0a,
|
||||
0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28,
|
||||
0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b,
|
||||
0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74,
|
||||
0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x77, 0x65, 0x62, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c,
|
||||
0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69,
|
||||
0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72,
|
||||
0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x6f, 0x70,
|
||||
0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f,
|
||||
0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f,
|
||||
0x72, 0x6b, 0x2e, 0x62, 0x6f, 0x6f, 0x74, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x2d,
|
||||
0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x61, 0x63, 0x74, 0x75, 0x61, 0x74, 0x6f, 0x72, 0x22, 0x29, 0x0a, 0x20,
|
||||
0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f,
|
||||
0x72, 0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63,
|
||||
0x6c, 0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74,
|
||||
0x61, 0x72, 0x74, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x6f, 0x74, 0x73, 0x74, 0x72, 0x61, 0x70, 0x22, 0x29, 0x0a, 0x20, 0x20,
|
||||
0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72,
|
||||
0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c,
|
||||
0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61,
|
||||
0x72, 0x74, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x73, 0x75, 0x6c, 0x2d, 0x61, 0x6c, 0x6c, 0x22, 0x29, 0x0a, 0x20, 0x20,
|
||||
0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72,
|
||||
0x67, 0x2e, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x63, 0x6c,
|
||||
0x6f, 0x75, 0x64, 0x3a, 0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x2d, 0x73, 0x74, 0x61,
|
||||
0x72, 0x74, 0x65, 0x72, 0x2d, 0x73, 0x6c, 0x65, 0x75, 0x74, 0x68, 0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d,
|
||||
0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6f, 0x72, 0x67, 0x2e, 0x73, 0x70,
|
||||
0x72, 0x69, 0x6e, 0x67, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x77, 0x6f, 0x72, 0x6b, 0x2e, 0x72, 0x65, 0x74, 0x72, 0x79, 0x3a,
|
||||
0x73, 0x70, 0x72, 0x69, 0x6e, 0x67, 0x2d, 0x72, 0x65, 0x74, 0x72, 0x79, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||
0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x63, 0x68, 0x2e, 0x71,
|
||||
0x6f, 0x73, 0x2e, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x3a, 0x6c, 0x6f, 0x67, 0x62, 0x61, 0x63, 0x6b, 0x2d, 0x63,
|
||||
0x6c, 0x61, 0x73, 0x73, 0x69, 0x63, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d,
|
||||
0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x69, 0x6f, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65,
|
||||
0x74, 0x65, 0x72, 0x3a, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x2d, 0x72, 0x65, 0x67, 0x69, 0x73,
|
||||
0x74, 0x72, 0x79, 0x2d, 0x70, 0x72, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x65, 0x75, 0x73, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20,
|
||||
0x20, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6b, 0x6f, 0x74,
|
||||
0x6c, 0x69, 0x6e, 0x28, 0x22, 0x73, 0x74, 0x64, 0x6c, 0x69, 0x62, 0x22, 0x29, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
|
||||
0x2f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x65, 0x73, 0x74, 0x20, 0x64, 0x65, 0x70, 0x65, 0x6e, 0x64,
|
||||
0x65, 0x6e, 0x63, 0x69, 0x65, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f,
|
||||
0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74,
|
||||
0x65, 0x73, 0x74, 0x49, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x22, 0x6a,
|
||||
0x66, 0x75, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x70, 0x65, 0x3a, 0x70, 0x65, 0x2d, 0x63, 0x6f, 0x6d, 0x6d, 0x6f, 0x6e, 0x2d,
|
||||
0x74, 0x65, 0x73, 0x74, 0x22, 0x29, 0x0a, 0x7d, 0x0a, 0x0a, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a,
|
||||
0x61, 0x72, 0x20, 0x62, 0x79, 0x20, 0x74, 0x61, 0x73, 0x6b, 0x73, 0x2e, 0x72, 0x65, 0x67, 0x69, 0x73, 0x74, 0x65, 0x72,
|
||||
0x69, 0x6e, 0x67, 0x28, 0x4a, 0x61, 0x72, 0x3a, 0x3a, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||
0x20, 0x20, 0x61, 0x72, 0x63, 0x68, 0x69, 0x76, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x69, 0x66, 0x69, 0x65, 0x72, 0x2e,
|
||||
0x73, 0x65, 0x74, 0x28, 0x22, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x64, 0x22, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||
0x76, 0x61, 0x6c, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68,
|
||||
0x20, 0x62, 0x79, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2e, 0x67,
|
||||
0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x6e, 0x69, 0x66, 0x65, 0x73, 0x74,
|
||||
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65,
|
||||
0x73, 0x28, 0x22, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x2d, 0x50, 0x61, 0x74, 0x68, 0x22, 0x20, 0x74, 0x6f, 0x20, 0x6f, 0x62,
|
||||
0x6a, 0x65, 0x63, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70,
|
||||
0x72, 0x69, 0x76, 0x61, 0x74, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d,
|
||||
0x20, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x3a, 0x2f, 0x2b, 0x22, 0x2e, 0x74, 0x6f, 0x52, 0x65, 0x67, 0x65, 0x78, 0x28, 0x29,
|
||||
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64,
|
||||
0x65, 0x20, 0x66, 0x75, 0x6e, 0x20, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x3a, 0x20, 0x53, 0x74,
|
||||
0x72, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x43, 0x6c, 0x61, 0x73, 0x73, 0x70,
|
||||
0x61, 0x74, 0x68, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x54, 0x6f, 0x53, 0x74, 0x72, 0x69,
|
||||
0x6e, 0x67, 0x28, 0x22, 0x20, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x74, 0x2e, 0x74, 0x6f, 0x55, 0x52, 0x49, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x55,
|
||||
0x52, 0x4c, 0x28, 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c,
|
||||
0x61, 0x63, 0x65, 0x46, 0x69, 0x72, 0x73, 0x74, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x22, 0x2f,
|
||||
0x22, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x74, 0x61, 0x73,
|
||||
0x6b, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x3c, 0x42, 0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x3e, 0x28, 0x22, 0x62,
|
||||
0x6f, 0x6f, 0x74, 0x52, 0x75, 0x6e, 0x22, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x4f,
|
||||
0x73, 0x2e, 0x69, 0x73, 0x46, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x28, 0x4f, 0x73, 0x2e, 0x46, 0x41, 0x4d, 0x49, 0x4c, 0x59,
|
||||
0x5f, 0x57, 0x49, 0x4e, 0x44, 0x4f, 0x57, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x70, 0x61, 0x74, 0x68, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x28, 0x73,
|
||||
0x6f, 0x75, 0x72, 0x63, 0x65, 0x53, 0x65, 0x74, 0x73, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x64, 0x28, 0x22, 0x6d, 0x61, 0x69,
|
||||
0x6e, 0x22, 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x20, 0x7b, 0x20, 0x69, 0x74, 0x2e, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x20,
|
||||
0x7d, 0x2c, 0x20, 0x70, 0x61, 0x74, 0x63, 0x68, 0x4a, 0x61, 0x72, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a,
|
||||
0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0xd0,
|
||||
}
|
||||
|
||||
@@ -19,7 +19,6 @@ import (
|
||||
charsetModule "code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/container"
|
||||
"code.gitea.io/gitea/modules/httpcache"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/typesniffer"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
@@ -109,11 +108,7 @@ func setServeHeadersByFile(r *http.Request, w http.ResponseWriter, mineBuf []byt
|
||||
}
|
||||
|
||||
if isPlain {
|
||||
charset, err := charsetModule.DetectEncoding(mineBuf)
|
||||
if err != nil {
|
||||
log.Error("Detect raw file %s charset failed: %v, using by default utf-8", opts.Filename, err)
|
||||
charset = "utf-8"
|
||||
}
|
||||
charset, _ := charsetModule.DetectEncoding(mineBuf)
|
||||
opts.ContentTypeCharset = strings.ToLower(charset)
|
||||
}
|
||||
|
||||
|
||||
@@ -203,7 +203,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
|
||||
RepoID: repo.ID,
|
||||
CommitID: commitSha,
|
||||
Filename: update.Filename,
|
||||
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
|
||||
Content: string(charset.ToUTF8DropErrors(fileContents)),
|
||||
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
})
|
||||
|
||||
@@ -191,7 +191,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
|
||||
Doc(map[string]any{
|
||||
"repo_id": repo.ID,
|
||||
"filename": update.Filename,
|
||||
"content": string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
|
||||
"content": string(charset.ToUTF8DropErrors(fileContents)),
|
||||
"commit_id": sha,
|
||||
"language": analyze.GetCodeLanguage(update.Filename, fileContents),
|
||||
"updated_at": timeutil.TimeStampNow(),
|
||||
|
||||
@@ -240,4 +240,5 @@ func PanicInDevOrTesting(msg string, a ...any) {
|
||||
if !IsProd || IsInTesting {
|
||||
panic(fmt.Sprintf(msg, a...))
|
||||
}
|
||||
log.Error(msg, a...)
|
||||
}
|
||||
|
||||
@@ -317,11 +317,7 @@ func EditFile(ctx *context.Context) {
|
||||
ctx.ServerError("ReadAll", err)
|
||||
return
|
||||
}
|
||||
if content, err := charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true}); err != nil {
|
||||
ctx.Data["FileContent"] = string(buf)
|
||||
} else {
|
||||
ctx.Data["FileContent"] = content
|
||||
}
|
||||
ctx.Data["FileContent"] = string(charset.ToUTF8(buf, charset.ConvertOpts{KeepBOM: true, ErrorReturnOrigin: true}))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -835,11 +835,11 @@ parsingLoop:
|
||||
if buffer.Len() == 0 {
|
||||
continue
|
||||
}
|
||||
charsetLabel, err := charset.DetectEncoding(buffer.Bytes())
|
||||
if charsetLabel != "UTF-8" && err == nil {
|
||||
encoding, _ := stdcharset.Lookup(charsetLabel)
|
||||
if encoding != nil {
|
||||
diffLineTypeDecoders[lineType] = encoding.NewDecoder()
|
||||
charsetLabel, _ := charset.DetectEncoding(buffer.Bytes())
|
||||
if charsetLabel != "UTF-8" {
|
||||
charsetEncoding, _ := stdcharset.Lookup(charsetLabel)
|
||||
if charsetEncoding != nil {
|
||||
diffLineTypeDecoders[lineType] = charsetEncoding.NewDecoder()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1325,10 +1325,10 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit
|
||||
shouldFullFileHighlight := !setting.Git.DisableDiffHighlight && attrDiff.Value() == ""
|
||||
if shouldFullFileHighlight {
|
||||
if limitedContent.LeftContent != nil && limitedContent.LeftContent.buf.Len() < MaxDiffHighlightEntireFileSize {
|
||||
diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.String())
|
||||
diffFile.highlightedLeftLines = highlightCodeLines(diffFile, true /* left */, limitedContent.LeftContent.buf.Bytes())
|
||||
}
|
||||
if limitedContent.RightContent != nil && limitedContent.RightContent.buf.Len() < MaxDiffHighlightEntireFileSize {
|
||||
diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.String())
|
||||
diffFile.highlightedRightLines = highlightCodeLines(diffFile, false /* right */, limitedContent.RightContent.buf.Bytes())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1336,9 +1336,34 @@ func GetDiffForRender(ctx context.Context, repoLink string, gitRepo *git.Reposit
|
||||
return diff, nil
|
||||
}
|
||||
|
||||
func highlightCodeLines(diffFile *DiffFile, isLeft bool, content string) map[int]template.HTML {
|
||||
func splitHighlightLines(buf []byte) (ret [][]byte) {
|
||||
lineCount := bytes.Count(buf, []byte("\n")) + 1
|
||||
ret = make([][]byte, 0, lineCount)
|
||||
nlTagClose := []byte("\n</")
|
||||
for {
|
||||
pos := bytes.IndexByte(buf, '\n')
|
||||
if pos == -1 {
|
||||
ret = append(ret, buf)
|
||||
return ret
|
||||
}
|
||||
// Chroma highlighting output sometimes have "</span>" right after \n, sometimes before.
|
||||
// * "<span>text\n</span>"
|
||||
// * "<span>text</span>\n"
|
||||
if bytes.HasPrefix(buf[pos:], nlTagClose) {
|
||||
pos1 := bytes.IndexByte(buf[pos:], '>')
|
||||
if pos1 != -1 {
|
||||
pos += pos1
|
||||
}
|
||||
}
|
||||
ret = append(ret, buf[:pos+1])
|
||||
buf = buf[pos+1:]
|
||||
}
|
||||
}
|
||||
|
||||
func highlightCodeLines(diffFile *DiffFile, isLeft bool, rawContent []byte) map[int]template.HTML {
|
||||
content := util.UnsafeBytesToString(charset.ToUTF8(rawContent, charset.ConvertOpts{}))
|
||||
highlightedNewContent, _ := highlight.Code(diffFile.Name, diffFile.Language, content)
|
||||
splitLines := strings.Split(string(highlightedNewContent), "\n")
|
||||
splitLines := splitHighlightLines([]byte(highlightedNewContent))
|
||||
lines := make(map[int]template.HTML, len(splitLines))
|
||||
// only save the highlighted lines we need, but not the whole file, to save memory
|
||||
for _, sec := range diffFile.Sections {
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
package gitdiff
|
||||
|
||||
import (
|
||||
"html/template"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -1106,3 +1107,41 @@ func TestDiffLine_GetExpandDirection(t *testing.T) {
|
||||
assert.Equal(t, c.direction, c.diffLine.GetExpandDirection(), "case %s expected direction: %s", c.name, c.direction)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHighlightCodeLines(t *testing.T) {
|
||||
t.Run("CharsetDetecting", func(t *testing.T) {
|
||||
diffFile := &DiffFile{
|
||||
Name: "a.c",
|
||||
Language: "c",
|
||||
Sections: []*DiffSection{
|
||||
{
|
||||
Lines: []*DiffLine{{LeftIdx: 1}},
|
||||
},
|
||||
},
|
||||
}
|
||||
ret := highlightCodeLines(diffFile, true, []byte("// abc\xcc def\xcd")) // ISO-8859-1 bytes
|
||||
assert.Equal(t, "<span class=\"c1\">// abcÌ defÍ\n</span>", string(ret[0]))
|
||||
})
|
||||
|
||||
t.Run("LeftLines", func(t *testing.T) {
|
||||
diffFile := &DiffFile{
|
||||
Name: "a.c",
|
||||
Language: "c",
|
||||
Sections: []*DiffSection{
|
||||
{
|
||||
Lines: []*DiffLine{
|
||||
{LeftIdx: 1},
|
||||
{LeftIdx: 2},
|
||||
{LeftIdx: 3},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
const nl = "\n"
|
||||
ret := highlightCodeLines(diffFile, true, []byte("a\nb\n"))
|
||||
assert.Equal(t, map[int]template.HTML{
|
||||
0: `<span class="n">a</span>` + nl,
|
||||
1: `<span class="n">b</span>`,
|
||||
}, ret)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -25,12 +25,12 @@ func TestDiffWithHighlight(t *testing.T) {
|
||||
|
||||
t.Run("CleanUp", func(t *testing.T) {
|
||||
hcd := newHighlightCodeDiff()
|
||||
codeA := template.HTML(`<span class="cm>this is a comment</span>`)
|
||||
codeB := template.HTML(`<span class="cm>this is updated comment</span>`)
|
||||
codeA := template.HTML(`<span class="cm">this is a comment</span>`)
|
||||
codeB := template.HTML(`<span class="cm">this is updated comment</span>`)
|
||||
outDel := hcd.diffLineWithHighlight(DiffLineDel, codeA, codeB)
|
||||
assert.Equal(t, `<span class="cm>this is <span class="removed-code">a</span> comment</span>`, string(outDel))
|
||||
assert.Equal(t, `<span class="cm">this is <span class="removed-code">a</span> comment</span>`, string(outDel))
|
||||
outAdd := hcd.diffLineWithHighlight(DiffLineAdd, codeA, codeB)
|
||||
assert.Equal(t, `<span class="cm>this is <span class="added-code">updated</span> comment</span>`, string(outAdd))
|
||||
assert.Equal(t, `<span class="cm">this is <span class="added-code">updated</span> comment</span>`, string(outAdd))
|
||||
})
|
||||
|
||||
t.Run("OpenCloseTags", func(t *testing.T) {
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
package migrations
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"database/sql"
|
||||
@@ -21,7 +22,6 @@ import (
|
||||
"code.gitea.io/gitea/models/migrations"
|
||||
migrate_base "code.gitea.io/gitea/models/migrations/base"
|
||||
"code.gitea.io/gitea/models/unittest"
|
||||
"code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
@@ -108,11 +108,11 @@ func readSQLFromFile(version string) (string, error) {
|
||||
}
|
||||
defer gr.Close()
|
||||
|
||||
bytes, err := io.ReadAll(gr)
|
||||
buf, err := io.ReadAll(gr)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(charset.MaybeRemoveBOM(bytes, charset.ConvertOpts{})), nil
|
||||
return string(bytes.TrimPrefix(buf, []byte{'\xef', '\xbb', '\xbf'})), nil
|
||||
}
|
||||
|
||||
func restoreOldDB(t *testing.T, version string) {
|
||||
|
||||
Reference in New Issue
Block a user