| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- // Copyright 2017 Frédéric Guillot. All rights reserved.
- // Use of this source code is governed by the Apache 2.0
- // license that can be found in the LICENSE file.
- package feed
- import (
- "bytes"
- "encoding/xml"
- "io"
- "strings"
- "time"
- "github.com/miniflux/miniflux/errors"
- "github.com/miniflux/miniflux/logger"
- "github.com/miniflux/miniflux/model"
- "github.com/miniflux/miniflux/reader/atom"
- "github.com/miniflux/miniflux/reader/encoding"
- "github.com/miniflux/miniflux/reader/json"
- "github.com/miniflux/miniflux/reader/rdf"
- "github.com/miniflux/miniflux/reader/rss"
- "github.com/miniflux/miniflux/timer"
- )
- // List of feed formats.
- const (
- FormatRDF = "rdf"
- FormatRSS = "rss"
- FormatAtom = "atom"
- FormatJSON = "json"
- FormatUnknown = "unknown"
- )
- // DetectFeedFormat detect feed format from input data.
- func DetectFeedFormat(r io.Reader) string {
- defer timer.ExecutionTime(time.Now(), "[Feed:DetectFeedFormat]")
- var buffer bytes.Buffer
- tee := io.TeeReader(r, &buffer)
- decoder := xml.NewDecoder(tee)
- decoder.CharsetReader = encoding.CharsetReader
- for {
- token, _ := decoder.Token()
- if token == nil {
- break
- }
- if element, ok := token.(xml.StartElement); ok {
- switch element.Name.Local {
- case "rss":
- return FormatRSS
- case "feed":
- return FormatAtom
- case "RDF":
- return FormatRDF
- }
- }
- }
- if strings.HasPrefix(strings.TrimSpace(buffer.String()), "{") {
- return FormatJSON
- }
- return FormatUnknown
- }
- func parseFeed(r io.Reader) (*model.Feed, *errors.LocalizedError) {
- defer timer.ExecutionTime(time.Now(), "[Feed:ParseFeed]")
- var buffer bytes.Buffer
- size, _ := io.Copy(&buffer, r)
- if size == 0 {
- return nil, errors.NewLocalizedError("This feed is empty")
- }
- str := stripInvalidXMLCharacters(buffer.String())
- reader := strings.NewReader(str)
- format := DetectFeedFormat(reader)
- reader.Seek(0, io.SeekStart)
- switch format {
- case FormatAtom:
- return atom.Parse(reader)
- case FormatRSS:
- return rss.Parse(reader)
- case FormatJSON:
- return json.Parse(reader)
- case FormatRDF:
- return rdf.Parse(reader)
- default:
- return nil, errors.NewLocalizedError("Unsupported feed format")
- }
- }
- func stripInvalidXMLCharacters(input string) string {
- return strings.Map(func(r rune) rune {
- if isInCharacterRange(r) {
- return r
- }
- logger.Debug("Strip invalid XML characters: %U", r)
- return -1
- }, input)
- }
- // Decide whether the given rune is in the XML Character Range, per
- // the Char production of http://www.xml.com/axml/testaxml.htm,
- // Section 2.2 Characters.
- func isInCharacterRange(r rune) (inrange bool) {
- return r == 0x09 ||
- r == 0x0A ||
- r == 0x0D ||
- r >= 0x20 && r <= 0xDF77 ||
- r >= 0xE000 && r <= 0xFFFD ||
- r >= 0x10000 && r <= 0x10FFFF
- }
|