decoder.go 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. // Copyright 2019 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package xml // import "miniflux.app/reader/xml"
  5. import (
  6. "bytes"
  7. "encoding/xml"
  8. "fmt"
  9. "io"
  10. "strings"
  11. "miniflux.app/reader/encoding"
  12. )
  13. // NewDecoder returns a XML decoder that filters illegal characters.
  14. func NewDecoder(data io.Reader) *xml.Decoder {
  15. var decoder *xml.Decoder
  16. buffer, _ := io.ReadAll(data)
  17. enc := procInst("encoding", string(buffer))
  18. if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
  19. // filter invalid chars later within decoder.CharsetReader
  20. decoder = xml.NewDecoder(bytes.NewReader(buffer))
  21. } else {
  22. // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
  23. filteredBytes := bytes.Map(filterValidXMLChar, buffer)
  24. decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
  25. }
  26. decoder.Entity = xml.HTMLEntity
  27. decoder.Strict = false
  28. decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
  29. utf8Reader, err := encoding.CharsetReader(charset, input)
  30. if err != nil {
  31. return nil, err
  32. }
  33. rawData, err := io.ReadAll(utf8Reader)
  34. if err != nil {
  35. return nil, fmt.Errorf("Unable to read data: %q", err)
  36. }
  37. filteredBytes := bytes.Map(filterValidXMLChar, rawData)
  38. return bytes.NewReader(filteredBytes), nil
  39. }
  40. return decoder
  41. }
  42. // This function is copied from encoding/xml package,
  43. // and is used to check if all the characters are legal.
  44. func filterValidXMLChar(r rune) rune {
  45. if r == 0x09 ||
  46. r == 0x0A ||
  47. r == 0x0D ||
  48. r >= 0x20 && r <= 0xD7FF ||
  49. r >= 0xE000 && r <= 0xFFFD ||
  50. r >= 0x10000 && r <= 0x10FFFF {
  51. return r
  52. }
  53. return -1
  54. }
  55. // This function is copied from encoding/xml package,
  56. // procInst parses the `param="..."` or `param='...'`
  57. // value out of the provided string, returning "" if not found.
  58. func procInst(param, s string) string {
  59. // TODO: this parsing is somewhat lame and not exact.
  60. // It works for all actual cases, though.
  61. param = param + "="
  62. idx := strings.Index(s, param)
  63. if idx == -1 {
  64. return ""
  65. }
  66. v := s[idx+len(param):]
  67. if v == "" {
  68. return ""
  69. }
  70. if v[0] != '\'' && v[0] != '"' {
  71. return ""
  72. }
  73. idx = strings.IndexRune(v[1:], rune(v[0]))
  74. if idx == -1 {
  75. return ""
  76. }
  77. return v[1 : idx+1]
  78. }