decoder.go 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. // Copyright 2019 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package xml // import "miniflux.app/reader/xml"
  5. import (
  6. "bytes"
  7. "encoding/xml"
  8. "fmt"
  9. "io"
  10. "io/ioutil"
  11. "strings"
  12. "miniflux.app/reader/encoding"
  13. )
  14. // NewDecoder returns a XML decoder that filters illegal characters.
  15. func NewDecoder(data io.Reader) *xml.Decoder {
  16. var decoder *xml.Decoder
  17. buffer, _ := ioutil.ReadAll(data)
  18. enc := procInst("encoding", string(buffer))
  19. if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
  20. // filter invalid chars later within decoder.CharsetReader
  21. decoder = xml.NewDecoder(bytes.NewReader(buffer))
  22. } else {
  23. // filter invalid chars now, since decoder.CharsetReader not called for utf-8 content
  24. filteredBytes := bytes.Map(filterValidXMLChar, buffer)
  25. decoder = xml.NewDecoder(bytes.NewReader(filteredBytes))
  26. }
  27. decoder.Entity = xml.HTMLEntity
  28. decoder.Strict = false
  29. decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
  30. utf8Reader, err := encoding.CharsetReader(charset, input)
  31. if err != nil {
  32. return nil, err
  33. }
  34. rawData, err := ioutil.ReadAll(utf8Reader)
  35. if err != nil {
  36. return nil, fmt.Errorf("Unable to read data: %q", err)
  37. }
  38. filteredBytes := bytes.Map(filterValidXMLChar, rawData)
  39. return bytes.NewReader(filteredBytes), nil
  40. }
  41. return decoder
  42. }
  43. // This function is copied from encoding/xml package,
  44. // and is used to check if all the characters are legal.
  45. func filterValidXMLChar(r rune) rune {
  46. if r == 0x09 ||
  47. r == 0x0A ||
  48. r == 0x0D ||
  49. r >= 0x20 && r <= 0xD7FF ||
  50. r >= 0xE000 && r <= 0xFFFD ||
  51. r >= 0x10000 && r <= 0x10FFFF {
  52. return r
  53. }
  54. return -1
  55. }
  56. // This function is copied from encoding/xml package,
  57. // procInst parses the `param="..."` or `param='...'`
  58. // value out of the provided string, returning "" if not found.
  59. func procInst(param, s string) string {
  60. // TODO: this parsing is somewhat lame and not exact.
  61. // It works for all actual cases, though.
  62. param = param + "="
  63. idx := strings.Index(s, param)
  64. if idx == -1 {
  65. return ""
  66. }
  67. v := s[idx+len(param):]
  68. if v == "" {
  69. return ""
  70. }
  71. if v[0] != '\'' && v[0] != '"' {
  72. return ""
  73. }
  74. idx = strings.IndexRune(v[1:], rune(v[0]))
  75. if idx == -1 {
  76. return ""
  77. }
  78. return v[1 : idx+1]
  79. }