format.go 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package parser // import "miniflux.app/v2/internal/reader/parser"
  4. import (
  5. "encoding/xml"
  6. "errors"
  7. "io"
  8. "unicode"
  9. rxml "miniflux.app/v2/internal/reader/xml"
  10. )
// List of feed formats.
//
// These are the values DetectFeedFormat returns as its first result.
const (
	// FormatRDF is returned when an "RDF" start element is found.
	FormatRDF = "rdf"
	// FormatRSS is returned when an "rss" start element is found.
	FormatRSS = "rss"
	// FormatAtom is returned when a "feed" start element is found.
	FormatAtom = "atom"
	// FormatJSON is returned when the first non-whitespace byte of the input is '{'.
	FormatJSON = "json"
	// FormatUnknown is returned when none of the other formats is recognised.
	FormatUnknown = "unknown"
)
// maxTokensToConsider bounds how many XML tokens DetectFeedFormat examines
// while looking for a known root element before it gives up.
const maxTokensToConsider = uint(50)
  20. // DetectFeedFormat tries to guess the feed format from input data.
  21. func DetectFeedFormat(r io.ReadSeeker) (string, string) {
  22. r.Seek(0, io.SeekStart)
  23. defer r.Seek(0, io.SeekStart)
  24. if isJSON, err := detectJSONFormat(r); err == nil && isJSON {
  25. return FormatJSON, ""
  26. }
  27. r.Seek(0, io.SeekStart)
  28. decoder := rxml.NewXMLDecoder(r)
  29. processedTokens := uint(0)
  30. for {
  31. token, _ := decoder.Token()
  32. if token == nil || processedTokens == maxTokensToConsider {
  33. break
  34. }
  35. processedTokens += 1
  36. if element, ok := token.(xml.StartElement); ok {
  37. switch element.Name.Local {
  38. case "rss":
  39. return FormatRSS, ""
  40. case "feed":
  41. for _, attr := range element.Attr {
  42. if attr.Name.Local == "version" && attr.Value == "0.3" {
  43. return FormatAtom, "0.3"
  44. }
  45. }
  46. return FormatAtom, "1.0"
  47. case "RDF":
  48. return FormatRDF, ""
  49. }
  50. }
  51. }
  52. return FormatUnknown, ""
  53. }
  54. // detectJSONFormat checks if the reader contains JSON by reading until it finds
  55. // the first non-whitespace character or reaches EOF/error.
  56. func detectJSONFormat(r io.ReadSeeker) (bool, error) {
  57. const bufferSize = 32
  58. buffer := make([]byte, bufferSize)
  59. for {
  60. n, err := r.Read(buffer)
  61. if n == 0 {
  62. if errors.Is(err, io.EOF) {
  63. return false, nil // No non-whitespace content found
  64. }
  65. return false, err
  66. }
  67. if len(buffer) < n {
  68. panic("unreachable") // bounds check hint to compiler
  69. }
  70. // Check each byte in the buffer
  71. for i := range n {
  72. ch := buffer[i]
  73. // Skip whitespace characters (space, tab, newline, carriage return, etc.)
  74. if unicode.IsSpace(rune(ch)) {
  75. continue
  76. }
  77. // First non-whitespace character determines if it's JSON
  78. return ch == '{', nil
  79. }
  80. // If we've read less than bufferSize, we've reached EOF
  81. if n < bufferSize {
  82. return false, nil
  83. }
  84. }
  85. }