decoder.go 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package xml // import "miniflux.app/v2/internal/reader/xml"
  4. import (
  5. "bytes"
  6. "encoding/xml"
  7. "fmt"
  8. "io"
  9. "strings"
  10. "unicode/utf8"
  11. "miniflux.app/v2/internal/reader/encoding"
  12. )
  13. // NewXMLDecoder returns a XML decoder that filters illegal characters.
  14. func NewXMLDecoder(data io.ReadSeeker) *xml.Decoder {
  15. var decoder *xml.Decoder
  16. // This is way fasted than io.ReadAll(data) as the buffer can be allocated in one go instead of dynamically grown.
  17. buffer := &bytes.Buffer{}
  18. io.Copy(buffer, data)
  19. enc := getEncoding(buffer.Bytes())
  20. if enc == "" || strings.EqualFold(enc, "utf-8") {
  21. // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
  22. filteredBytes := filterValidXMLChars(buffer.Bytes())
  23. decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
  24. } else {
  25. // filter invalid chars later within decoder.CharsetReader
  26. data.Seek(0, io.SeekStart)
  27. decoder = xml.NewDecoder(data)
  28. }
  29. decoder.Entity = xml.HTMLEntity
  30. decoder.Strict = false
  31. decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
  32. utf8Reader, err := encoding.CharsetReader(charset, input)
  33. if err != nil {
  34. return nil, err
  35. }
  36. rawData, err := io.ReadAll(utf8Reader)
  37. if err != nil {
  38. return nil, fmt.Errorf("encoding: unable to read data: %w", err)
  39. }
  40. filteredBytes := filterValidXMLChars(rawData)
  41. return bytes.NewReader(filteredBytes), nil
  42. }
  43. return decoder
  44. }
  45. // filterValidXMLChars filters inplace invalid XML characters.
  46. // This function is inspired from bytes.Map
  47. func filterValidXMLChars(s []byte) []byte {
  48. j := 0
  49. for i := 0; i < len(s); {
  50. wid := 1
  51. r := rune(s[i])
  52. if r >= utf8.RuneSelf {
  53. r, wid = utf8.DecodeRune(s[i:])
  54. }
  55. if r != utf8.RuneError {
  56. if r = filterValidXMLChar(r); r >= 0 {
  57. utf8.EncodeRune(s[j:], r)
  58. j += wid
  59. }
  60. }
  61. i += wid
  62. }
  63. return s[:j]
  64. }
  65. // This function is copied from encoding/xml package,
  66. // and is used to check if all the characters are legal.
  67. func filterValidXMLChar(r rune) rune {
  68. if r == 0x09 ||
  69. r == 0x0A ||
  70. r == 0x0D ||
  71. r >= 0x20 && r <= 0xD7FF ||
  72. r >= 0xE000 && r <= 0xFFFD ||
  73. r >= 0x10000 && r <= 0x10FFFF {
  74. return r
  75. }
  76. return -1
  77. }
  78. // This function is copied from encoding/xml's procInst and adapted for []bytes instead of string
  79. func getEncoding(b []byte) string {
  80. // TODO: this parsing is somewhat lame and not exact.
  81. // It works for all actual cases, though.
  82. idx := bytes.Index(b, []byte("encoding="))
  83. if idx == -1 {
  84. return ""
  85. }
  86. v := b[idx+len("encoding="):]
  87. if len(v) == 0 {
  88. return ""
  89. }
  90. if v[0] != '\'' && v[0] != '"' {
  91. return ""
  92. }
  93. idx = bytes.IndexRune(v[1:], rune(v[0]))
  94. if idx == -1 {
  95. return ""
  96. }
  97. return string(v[1 : idx+1])
  98. }