package encoder import ( "math/bits" "reflect" "unsafe" ) const ( lsb = 0x0101010101010101 msb = 0x8080808080808080 ) var hex = "0123456789abcdef" //nolint:govet func stringToUint64Slice(s string) []uint64 { return *(*[]uint64)(unsafe.Pointer(&reflect.SliceHeader{ Data: ((*reflect.StringHeader)(unsafe.Pointer(&s))).Data, Len: len(s) / 8, Cap: len(s) / 8, })) } func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte { if ctx.Option.Flag&HTMLEscapeOption != 0 { if ctx.Option.Flag&NormalizeUTF8Option != 0 { return appendNormalizedHTMLString(buf, s) } return appendHTMLString(buf, s) } if ctx.Option.Flag&NormalizeUTF8Option != 0 { return appendNormalizedString(buf, s) } return appendString(buf, s) } func appendNormalizedHTMLString(buf []byte, s string) []byte { valLen := len(s) if valLen == 0 { return append(buf, `""`...) } buf = append(buf, '"') var ( i, j int ) if valLen >= 8 { chunks := stringToUint64Slice(s) for _, n := range chunks { // combine masks before checking for the MSB of each byte. We include // `n` in the mask to check whether any of the *input* byte MSBs were // set (i.e. the byte was outside the ASCII range). mask := n | (n - (lsb * 0x20)) | ((n ^ (lsb * '"')) - lsb) | ((n ^ (lsb * '\\')) - lsb) | ((n ^ (lsb * '<')) - lsb) | ((n ^ (lsb * '>')) - lsb) | ((n ^ (lsb * '&')) - lsb) if (mask & msb) != 0 { j = bits.TrailingZeros64(mask&msb) / 8 goto ESCAPE_END } } for i := len(chunks) * 8; i < valLen; i++ { if needEscapeHTMLNormalizeUTF8[s[i]] { j = i goto ESCAPE_END } } // no found any escape characters. return append(append(buf, s...), '"') } ESCAPE_END: for j < valLen { c := s[j] if !needEscapeHTMLNormalizeUTF8[c] { // fast path: most of the time, printable ascii characters are used j++ continue } switch c { case '\\', '"': buf = append(buf, s[i:j]...) buf = append(buf, '\\', c) i = j + 1 j = j + 1 continue case '\n': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'n') i = j + 1 j = j + 1 continue case '\r': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'r') i = j + 1 j = j + 1 continue case '\t': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 't') i = j + 1 j = j + 1 continue case '<', '>', '&': buf = append(buf, s[i:j]...) buf = append(buf, `\u00`...) buf = append(buf, hex[c>>4], hex[c&0xF]) i = j + 1 j = j + 1 continue case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F buf = append(buf, s[i:j]...) buf = append(buf, `\u00`...) buf = append(buf, hex[c>>4], hex[c&0xF]) i = j + 1 j = j + 1 continue } state, size := decodeRuneInString(s[j:]) switch state { case runeErrorState: buf = append(buf, s[i:j]...) buf = append(buf, `\ufffd`...) i = j + 1 j = j + 1 continue // U+2028 is LINE SEPARATOR. // U+2029 is PARAGRAPH SEPARATOR. // They are both technically valid characters in JSON strings, // but don't work in JSONP, which has to be evaluated as JavaScript, // and can lead to security holes there. It is valid JSON to // escape them, so we do so unconditionally. // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. case lineSepState: buf = append(buf, s[i:j]...) buf = append(buf, `\u2028`...) i = j + 3 j = j + 3 continue case paragraphSepState: buf = append(buf, s[i:j]...) buf = append(buf, `\u2029`...) i = j + 3 j = j + 3 continue } j += size } return append(append(buf, s[i:]...), '"') } func appendHTMLString(buf []byte, s string) []byte { valLen := len(s) if valLen == 0 { return append(buf, `""`...) } buf = append(buf, '"') var ( i, j int ) if valLen >= 8 { chunks := stringToUint64Slice(s) for _, n := range chunks { // combine masks before checking for the MSB of each byte. We include // `n` in the mask to check whether any of the *input* byte MSBs were // set (i.e. the byte was outside the ASCII range). mask := n | (n - (lsb * 0x20)) | ((n ^ (lsb * '"')) - lsb) | ((n ^ (lsb * '\\')) - lsb) | ((n ^ (lsb * '<')) - lsb) | ((n ^ (lsb * '>')) - lsb) | ((n ^ (lsb * '&')) - lsb) if (mask & msb) != 0 { j = bits.TrailingZeros64(mask&msb) / 8 goto ESCAPE_END } } for i := len(chunks) * 8; i < valLen; i++ { if needEscapeHTML[s[i]] { j = i goto ESCAPE_END } } // no found any escape characters. return append(append(buf, s...), '"') } ESCAPE_END: for j < valLen { c := s[j] if !needEscapeHTML[c] { // fast path: most of the time, printable ascii characters are used j++ continue } switch c { case '\\', '"': buf = append(buf, s[i:j]...) buf = append(buf, '\\', c) i = j + 1 j = j + 1 continue case '\n': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'n') i = j + 1 j = j + 1 continue case '\r': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'r') i = j + 1 j = j + 1 continue case '\t': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 't') i = j + 1 j = j + 1 continue case '<', '>', '&': buf = append(buf, s[i:j]...) buf = append(buf, `\u00`...) buf = append(buf, hex[c>>4], hex[c&0xF]) i = j + 1 j = j + 1 continue case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F buf = append(buf, s[i:j]...) buf = append(buf, `\u00`...) buf = append(buf, hex[c>>4], hex[c&0xF]) i = j + 1 j = j + 1 continue } j++ } return append(append(buf, s[i:]...), '"') } func appendNormalizedString(buf []byte, s string) []byte { valLen := len(s) if valLen == 0 { return append(buf, `""`...) } buf = append(buf, '"') var ( i, j int ) if valLen >= 8 { chunks := stringToUint64Slice(s) for _, n := range chunks { // combine masks before checking for the MSB of each byte. We include // `n` in the mask to check whether any of the *input* byte MSBs were // set (i.e. the byte was outside the ASCII range). mask := n | (n - (lsb * 0x20)) | ((n ^ (lsb * '"')) - lsb) | ((n ^ (lsb * '\\')) - lsb) if (mask & msb) != 0 { j = bits.TrailingZeros64(mask&msb) / 8 goto ESCAPE_END } } valLen := len(s) for i := len(chunks) * 8; i < valLen; i++ { if needEscapeNormalizeUTF8[s[i]] { j = i goto ESCAPE_END } } return append(append(buf, s...), '"') } ESCAPE_END: for j < valLen { c := s[j] if !needEscapeNormalizeUTF8[c] { // fast path: most of the time, printable ascii characters are used j++ continue } switch c { case '\\', '"': buf = append(buf, s[i:j]...) buf = append(buf, '\\', c) i = j + 1 j = j + 1 continue case '\n': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'n') i = j + 1 j = j + 1 continue case '\r': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'r') i = j + 1 j = j + 1 continue case '\t': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 't') i = j + 1 j = j + 1 continue case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F buf = append(buf, s[i:j]...) buf = append(buf, `\u00`...) buf = append(buf, hex[c>>4], hex[c&0xF]) i = j + 1 j = j + 1 continue } state, size := decodeRuneInString(s[j:]) switch state { case runeErrorState: buf = append(buf, s[i:j]...) buf = append(buf, `\ufffd`...) i = j + 1 j = j + 1 continue // U+2028 is LINE SEPARATOR. // U+2029 is PARAGRAPH SEPARATOR. // They are both technically valid characters in JSON strings, // but don't work in JSONP, which has to be evaluated as JavaScript, // and can lead to security holes there. It is valid JSON to // escape them, so we do so unconditionally. // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. case lineSepState: buf = append(buf, s[i:j]...) buf = append(buf, `\u2028`...) i = j + 3 j = j + 3 continue case paragraphSepState: buf = append(buf, s[i:j]...) buf = append(buf, `\u2029`...) i = j + 3 j = j + 3 continue } j += size } return append(append(buf, s[i:]...), '"') } func appendString(buf []byte, s string) []byte { valLen := len(s) if valLen == 0 { return append(buf, `""`...) } buf = append(buf, '"') var ( i, j int ) if valLen >= 8 { chunks := stringToUint64Slice(s) for _, n := range chunks { // combine masks before checking for the MSB of each byte. We include // `n` in the mask to check whether any of the *input* byte MSBs were // set (i.e. the byte was outside the ASCII range). mask := n | (n - (lsb * 0x20)) | ((n ^ (lsb * '"')) - lsb) | ((n ^ (lsb * '\\')) - lsb) if (mask & msb) != 0 { j = bits.TrailingZeros64(mask&msb) / 8 goto ESCAPE_END } } valLen := len(s) for i := len(chunks) * 8; i < valLen; i++ { if needEscape[s[i]] { j = i goto ESCAPE_END } } return append(append(buf, s...), '"') } ESCAPE_END: for j < valLen { c := s[j] if !needEscape[c] { // fast path: most of the time, printable ascii characters are used j++ continue } switch c { case '\\', '"': buf = append(buf, s[i:j]...) buf = append(buf, '\\', c) i = j + 1 j = j + 1 continue case '\n': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'n') i = j + 1 j = j + 1 continue case '\r': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 'r') i = j + 1 j = j + 1 continue case '\t': buf = append(buf, s[i:j]...) buf = append(buf, '\\', 't') i = j + 1 j = j + 1 continue case 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0E, 0x0F, // 0x00-0x0F 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F: // 0x10-0x1F buf = append(buf, s[i:j]...) buf = append(buf, `\u00`...) buf = append(buf, hex[c>>4], hex[c&0xF]) i = j + 1 j = j + 1 continue } j++ } return append(append(buf, s[i:]...), '"') }