parser.go 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package feed
  5. import (
  6. "bytes"
  7. "encoding/xml"
  8. "io"
  9. "strings"
  10. "time"
  11. "github.com/miniflux/miniflux/errors"
  12. "github.com/miniflux/miniflux/logger"
  13. "github.com/miniflux/miniflux/model"
  14. "github.com/miniflux/miniflux/reader/atom"
  15. "github.com/miniflux/miniflux/reader/encoding"
  16. "github.com/miniflux/miniflux/reader/json"
  17. "github.com/miniflux/miniflux/reader/rdf"
  18. "github.com/miniflux/miniflux/reader/rss"
  19. "github.com/miniflux/miniflux/timer"
  20. )
  // Identifiers of the feed formats reported by DetectFeedFormat.
  const (
  	// FormatAtom identifies an Atom feed.
  	FormatAtom = "atom"

  	// FormatJSON identifies a JSON feed.
  	FormatJSON = "json"

  	// FormatRDF identifies an RDF feed.
  	FormatRDF = "rdf"

  	// FormatRSS identifies an RSS feed.
  	FormatRSS = "rss"

  	// FormatUnknown is reported when no other format matched.
  	FormatUnknown = "unknown"
  )
  29. // DetectFeedFormat detect feed format from input data.
  30. func DetectFeedFormat(r io.Reader) string {
  31. defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
  32. var buffer bytes.Buffer
  33. tee := io.TeeReader(r, &buffer)
  34. decoder := xml.NewDecoder(tee)
  35. decoder.CharsetReader = encoding.CharsetReader
  36. for {
  37. token, _ := decoder.Token()
  38. if token == nil {
  39. break
  40. }
  41. if element, ok := token.(xml.StartElement); ok {
  42. switch element.Name.Local {
  43. case "rss":
  44. return FormatRSS
  45. case "feed":
  46. return FormatAtom
  47. case "RDF":
  48. return FormatRDF
  49. }
  50. }
  51. }
  52. if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
  53. return FormatJSON
  54. }
  55. return FormatUnknown
  56. }
  57. func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
  58. defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
  59. var buffer bytes.Buffer
  60. size, _ := io.Copy(&buffer, r)
  61. if size == 0 {
  62. return nil, errors.NewLocalizedError("This feed is empty")
  63. }
  64. str := stripInvalidXMLCharacters(buffer.String())
  65. reader := strings.NewReader(str)
  66. format := DetectFeedFormat(reader)
  67. reader.Seek(0, io.SeekStart)
  68. switch format {
  69. case FormatAtom:
  70. return atom.Parse(reader)
  71. case FormatRSS:
  72. return rss.Parse(reader)
  73. case FormatJSON:
  74. return json.Parse(reader)
  75. case FormatRDF:
  76. return rdf.Parse(reader)
  77. default:
  78. return nil, errors.NewLocalizedError("Unsupported feed format")
  79. }
  80. }
  81. func stripInvalidXMLCharacters(input string) string {
  82. return strings.Map(func(r rune) rune {
  83. if isInCharacterRange(r) {
  84. return r
  85. }
  86. logger.Debug("Strip invalid XML characters: %U", r)
  87. return -1
  88. }, input)
  89. }
  // isInCharacterRange reports whether r is in the XML character range, per
  // the Char production of http://www.xml.com/axml/testaxml.htm,
  // Section 2.2 Characters:
  //
  //	Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
  //
  // The second range ends at U+D7FF, so the surrogate block U+D800..U+DFFF
  // is rejected, as are U+FFFE and U+FFFF.
  func isInCharacterRange(r rune) (inrange bool) {
  	return r == 0x09 ||
  		r == 0x0A ||
  		r == 0x0D ||
  		r >= 0x20 && r <= 0xD7FF ||
  		r >= 0xE000 && r <= 0xFFFD ||
  		r >= 0x10000 && r <= 0x10FFFF
  }