4
0

decoder.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package xml // import "miniflux.app/v2/internal/reader/xml"
  4. import (
  5. "bytes"
  6. "encoding/xml"
  7. "fmt"
  8. "io"
  9. "unicode/utf8"
  10. "miniflux.app/v2/internal/reader/encoding"
  11. )
  12. // NewXMLDecoder returns a XML decoder that filters illegal characters.
  13. func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
  14. var decoder *xml.Decoder
  15. // This is way fasted than io.ReadAll(data) as the buffer can be allocated in one go instead of dynamically grown.
  16. buffer := &bytes.Buffer{}
  17. io.Copy(buffer, data)
  18. if hasUTF8XMLDeclaration(buffer.Bytes()) {
  19. // TODO: detect actual encoding from bytes if not UTF-8 and convert to UTF-8 if needed.
  20. // For now we just expect the invalid characters to be stripped out.
  21. // Filter invalid chars now, since decoder.CharsetReader isn't called for utf-8 content
  22. filteredBytes := filterValidXMLChars(buffer.Bytes())
  23. decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
  24. } else {
  25. data.Seek(0, io.SeekStart)
  26. decoder = xml.NewDecoder(data)
  27. // The XML document will be converted to UTF-8 by encoding.CharsetReader
  28. // Invalid characters will be filtered later via decoder.CharsetReader
  29. decoder.CharsetReader = charsetReaderFilterInvalidUtf8
  30. }
  31. decoder.Entity = xml.HTMLEntity
  32. decoder.Strict = false
  33. return decoder
  34. }
  35. func charsetReaderFilterInvalidUtf8(charset string, input io.Reader) (io.Reader, error) {
  36. utf8Reader, err := encoding.CharsetReader(charset, input)
  37. if err != nil {
  38. return nil, err
  39. }
  40. rawData, err := io.ReadAll(utf8Reader)
  41. if err != nil {
  42. return nil, fmt.Errorf("xml: unable to read data: %w", err)
  43. }
  44. filteredBytes := filterValidXMLChars(rawData)
  45. return bytes.NewReader(filteredBytes), nil
  46. }
  47. // filterValidXMLChars filters inplace invalid XML characters.
  48. // This function is inspired from bytes.Map
  49. func filterValidXMLChars(s []byte) []byte {
  50. var i uint // declaring it as an uint removes a bound check in the loop.
  51. var j int
  52. for i = 0; i < uint(len(s)); {
  53. wid := 1
  54. r := rune(s[i])
  55. if r >= utf8.RuneSelf {
  56. r, wid = utf8.DecodeRune(s[i:])
  57. }
  58. if r != utf8.RuneError {
  59. if r = filterValidXMLChar(r); r >= 0 {
  60. utf8.EncodeRune(s[j:], r)
  61. j += wid
  62. }
  63. }
  64. i += uint(wid)
  65. }
  66. return s[:j]
  67. }
  68. // This function is copied from encoding/xml package,
  69. // and is used to check if all the characters are legal.
  70. func filterValidXMLChar(r rune) rune {
  71. if r == 0x09 ||
  72. r == 0x0A ||
  73. r == 0x0D ||
  74. r >= 0x20 && r <= 0xD7FF ||
  75. r >= 0xE000 && r <= 0xFFFD ||
  76. r >= 0x10000 && r <= 0x10FFFF {
  77. return r
  78. }
  79. return -1
  80. }
  81. // This function is copied from encoding/xml's procInst and adapted for []bytes instead of string
  82. func getEncoding(b []byte) []byte {
  83. // This parsing is somewhat lame and not exact.
  84. // It works for all actual cases, though.
  85. idx := bytes.Index(b, []byte("encoding="))
  86. if idx == -1 {
  87. return nil
  88. }
  89. v := b[idx+len("encoding="):]
  90. if len(v) == 0 {
  91. return nil
  92. }
  93. if v[0] != '\'' && v[0] != '"' {
  94. return nil
  95. }
  96. idx = bytes.IndexRune(v[1:], rune(v[0]))
  97. if idx == -1 {
  98. return nil
  99. }
  100. return v[1 : idx+1]
  101. }
  102. func hasUTF8XMLDeclaration(data []byte) bool {
  103. enc := getEncoding(data)
  104. return enc == nil || bytes.EqualFold(enc, []byte("utf-8"))
  105. }