decoder.go 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. // Copyright 2019 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package xml // import "miniflux.app/reader/xml"
  5. import (
  6. "bytes"
  7. "encoding/xml"
  8. "fmt"
  9. "io"
  10. "io/ioutil"
  11. "miniflux.app/reader/encoding"
  12. )
  13. // NewDecoder returns a XML decoder that filters illegal characters.
  14. func NewDecoder(data io.Reader) *xml.Decoder {
  15. decoder := xml.NewDecoder(data)
  16. decoder.Entity = xml.HTMLEntity
  17. decoder.Strict = false
  18. decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
  19. utf8Reader, err := encoding.CharsetReader(charset, input)
  20. if err != nil {
  21. return nil, err
  22. }
  23. rawData, err := ioutil.ReadAll(utf8Reader)
  24. if err != nil {
  25. return nil, fmt.Errorf("Unable to read data: %q", err)
  26. }
  27. filteredBytes := bytes.Map(filterValidXMLChar, rawData)
  28. return bytes.NewReader(filteredBytes), nil
  29. }
  30. return decoder
  31. }
  32. // This function is copied from encoding/xml package,
  33. // and is used to check if all the characters are legal.
  34. func filterValidXMLChar(r rune) rune {
  35. if r == 0x09 ||
  36. r == 0x0A ||
  37. r == 0x0D ||
  38. r >= 0x20 && r <= 0xD7FF ||
  39. r >= 0xE000 && r <= 0xFFFD ||
  40. r >= 0x10000 && r <= 0x10FFFF {
  41. return r
  42. }
  43. return -1
  44. }