decoder.go 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package xml // import "miniflux.app/v2/internal/reader/xml"
  4. import (
  5. "bytes"
  6. "encoding/xml"
  7. "fmt"
  8. "io"
  9. "strings"
  10. "miniflux.app/v2/internal/reader/encoding"
  11. )
  12. // NewXMLDecoder returns a XML decoder that filters illegal characters.
  13. func NewXMLDecoder(data io.Reader) *xml.Decoder {
  14. var decoder *xml.Decoder
  15. buffer, _ := io.ReadAll(data)
  16. enc := procInst("encoding", string(buffer))
  17. if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
  18. // filter invalid chars later within decoder.CharsetReader
  19. decoder = xml.NewDecoder(bytes.NewReader(buffer))
  20. } else {
  21. // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
  22. filteredBytes := bytes.Map(filterValidXMLChar, buffer)
  23. decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
  24. }
  25. decoder.Entity = xml.HTMLEntity
  26. decoder.Strict = false
  27. decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
  28. utf8Reader, err := encoding.CharsetReader(charset, input)
  29. if err != nil {
  30. return nil, err
  31. }
  32. rawData, err := io.ReadAll(utf8Reader)
  33. if err != nil {
  34. return nil, fmt.Errorf("encoding: unable to read data: %w", err)
  35. }
  36. filteredBytes := bytes.Map(filterValidXMLChar, rawData)
  37. return bytes.NewReader(filteredBytes), nil
  38. }
  39. return decoder
  40. }
  41. // This function is copied from encoding/xml package,
  42. // and is used to check if all the characters are legal.
  43. func filterValidXMLChar(r rune) rune {
  44. if r == 0x09 ||
  45. r == 0x0A ||
  46. r == 0x0D ||
  47. r >= 0x20 && r <= 0xD7FF ||
  48. r >= 0xE000 && r <= 0xFFFD ||
  49. r >= 0x10000 && r <= 0x10FFFF {
  50. return r
  51. }
  52. return -1
  53. }
  54. // This function is copied from encoding/xml package,
  55. // procInst parses the `param="..."` or `param='...'`
  56. // value out of the provided string, returning "" if not found.
  57. func procInst(param, s string) string {
  58. // TODO: this parsing is somewhat lame and not exact.
  59. // It works for all actual cases, though.
  60. param = param + "="
  61. idx := strings.Index(s, param)
  62. if idx == -1 {
  63. return ""
  64. }
  65. v := s[idx+len(param):]
  66. if v == "" {
  67. return ""
  68. }
  69. if v[0] != '\'' && v[0] != '"' {
  70. return ""
  71. }
  72. idx = strings.IndexRune(v[1:], rune(v[0]))
  73. if idx == -1 {
  74. return ""
  75. }
  76. return v[1 : idx+1]
  77. }