Browse Source

Improve feed parsers

Frédéric Guillot 8 years ago
parent
commit
2b641cc224
4 changed files with 27 additions and 25 deletions
  1. + 8 - 9
      reader/atom/atom.go
  2. + 8 - 6
      reader/json/json.go
  3. + 3 - 2
      reader/rdf/rdf.go
  4. + 8 - 8
      reader/rss/rss.go

+ 8 - 9
reader/atom/atom.go

@@ -15,7 +15,6 @@ import (
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/date"
 	"github.com/miniflux/miniflux2/reader/processor"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
 )
 
 type atomFeed struct {
@@ -64,7 +63,7 @@ func (a *atomFeed) Transform() *model.Feed {
 	feed := new(model.Feed)
 	feed.FeedURL = getRelationURL(a.Links, "self")
 	feed.SiteURL = getURL(a.Links)
-	feed.Title = sanitizer.StripTags(a.Title)
+	feed.Title = strings.TrimSpace(a.Title)
 
 	if feed.Title == "" {
 		feed.Title = feed.SiteURL
@@ -86,10 +85,10 @@ func (a *atomEntry) Transform() *model.Entry {
 	entry := new(model.Entry)
 	entry.URL = getURL(a.Links)
 	entry.Date = getDate(a)
-	entry.Author = sanitizer.StripTags(getAuthor(a.Author))
+	entry.Author = getAuthor(a.Author)
 	entry.Hash = getHash(a)
 	entry.Content = processor.ItemContentProcessor(entry.URL, getContent(a))
-	entry.Title = sanitizer.StripTags(strings.Trim(a.Title, " \n\t"))
+	entry.Title = strings.TrimSpace(a.Title)
 	entry.Enclosures = getEnclosures(a)
 
 	if entry.Title == "" {
@@ -102,11 +101,11 @@ func (a *atomEntry) Transform() *model.Entry {
 func getURL(links []atomLink) string {
 	for _, link := range links {
 		if strings.ToLower(link.Rel) == "alternate" {
-			return link.URL
+			return strings.TrimSpace(link.URL)
 		}
 
 		if link.Rel == "" && link.Type == "" {
-			return link.URL
+			return strings.TrimSpace(link.URL)
 		}
 	}
 
@@ -116,7 +115,7 @@ func getURL(links []atomLink) string {
 func getRelationURL(links []atomLink, relation string) string {
 	for _, link := range links {
 		if strings.ToLower(link.Rel) == relation {
-			return link.URL
+			return strings.TrimSpace(link.URL)
 		}
 	}
 
@@ -182,11 +181,11 @@ func getEnclosures(a *atomEntry) model.EnclosureList {
 
 func getAuthor(author atomAuthor) string {
 	if author.Name != "" {
-		return author.Name
+		return strings.TrimSpace(author.Name)
 	}
 
 	if author.Email != "" {
-		return author.Email
+		return strings.TrimSpace(author.Email)
 	}
 
 	return ""

+ 8 - 6
reader/json/json.go

@@ -9,11 +9,12 @@ import (
 	"strings"
 	"time"
 
+	"github.com/miniflux/miniflux2/reader/sanitizer"
+
 	"github.com/miniflux/miniflux2/helper"
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/date"
 	"github.com/miniflux/miniflux2/reader/processor"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
 )
 
 type jsonFeed struct {
@@ -59,7 +60,7 @@ func (j *jsonFeed) Transform() *model.Feed {
 	feed := new(model.Feed)
 	feed.FeedURL = j.FeedURL
 	feed.SiteURL = j.SiteURL
-	feed.Title = sanitizer.StripTags(j.Title)
+	feed.Title = strings.TrimSpace(j.Title)
 
 	if feed.Title == "" {
 		feed.Title = feed.SiteURL
@@ -110,7 +111,7 @@ func (j *jsonItem) GetHash() string {
 func (j *jsonItem) GetTitle() string {
 	for _, value := range []string{j.Title, j.Summary, j.Text, j.HTML} {
 		if value != "" {
-			return truncate(value)
+			return truncate(sanitizer.StripTags(value))
 		}
 	}
 
@@ -145,17 +146,17 @@ func (j *jsonItem) Transform() *model.Entry {
 	entry := new(model.Entry)
 	entry.URL = j.URL
 	entry.Date = j.GetDate()
-	entry.Author = sanitizer.StripTags(j.GetAuthor())
+	entry.Author = j.GetAuthor()
 	entry.Hash = j.GetHash()
 	entry.Content = processor.ItemContentProcessor(entry.URL, j.GetContent())
-	entry.Title = sanitizer.StripTags(strings.Trim(j.GetTitle(), " \n\t"))
+	entry.Title = strings.TrimSpace(j.GetTitle())
 	entry.Enclosures = j.GetEnclosures()
 	return entry
 }
 
 func getAuthor(author jsonAuthor) string {
 	if author.Name != "" {
-		return author.Name
+		return strings.TrimSpace(author.Name)
 	}
 
 	return ""
@@ -163,6 +164,7 @@ func getAuthor(author jsonAuthor) string {
 
 func truncate(str string) string {
 	max := 100
+	str = strings.TrimSpace(str)
 	if len(str) > max {
 		return str[:max] + "..."
 	}

+ 3 - 2
reader/rdf/rdf.go

@@ -6,6 +6,7 @@ package rdf
 
 import (
 	"encoding/xml"
+	"strings"
 	"time"
 
 	"github.com/miniflux/miniflux2/helper"
@@ -54,8 +55,8 @@ type rdfItem struct {
 
 func (r *rdfItem) Transform() *model.Entry {
 	entry := new(model.Entry)
-	entry.Title = sanitizer.StripTags(r.Title)
-	entry.Author = sanitizer.StripTags(r.Creator)
+	entry.Title = strings.TrimSpace(r.Title)
+	entry.Author = strings.TrimSpace(r.Creator)
 	entry.URL = r.Link
 	entry.Content = processor.ItemContentProcessor(entry.URL, r.Description)
 	entry.Hash = getHash(r)

+ 8 - 8
reader/rss/rss.go

@@ -16,7 +16,6 @@ import (
 	"github.com/miniflux/miniflux2/model"
 	"github.com/miniflux/miniflux2/reader/date"
 	"github.com/miniflux/miniflux2/reader/processor"
-	"github.com/miniflux/miniflux2/reader/sanitizer"
 )
 
 type rssFeed struct {
@@ -68,7 +67,7 @@ type rssEnclosure struct {
 func (r *rssFeed) GetSiteURL() string {
 	for _, element := range r.Links {
 		if element.XMLName.Space == "" {
-			return element.Data
+			return strings.TrimSpace(element.Data)
 		}
 	}
 
@@ -78,7 +77,7 @@ func (r *rssFeed) GetSiteURL() string {
 func (r *rssFeed) GetFeedURL() string {
 	for _, element := range r.Links {
 		if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
-			return element.Href
+			return strings.TrimSpace(element.Href)
 		}
 	}
 
@@ -89,7 +88,7 @@ func (r *rssFeed) Transform() *model.Feed {
 	feed := new(model.Feed)
 	feed.SiteURL = r.GetSiteURL()
 	feed.FeedURL = r.GetFeedURL()
-	feed.Title = sanitizer.StripTags(r.Title)
+	feed.Title = strings.TrimSpace(r.Title)
 
 	if feed.Title == "" {
 		feed.Title = feed.SiteURL
@@ -101,7 +100,7 @@ func (r *rssFeed) Transform() *model.Feed {
 		if entry.Author == "" && r.ItunesAuthor != "" {
 			entry.Author = r.ItunesAuthor
 		}
-		entry.Author = sanitizer.StripTags(entry.Author)
+		entry.Author = strings.TrimSpace(entry.Author)
 
 		if entry.URL == "" {
 			entry.URL = feed.SiteURL
@@ -112,6 +111,7 @@ func (r *rssFeed) Transform() *model.Feed {
 
 	return feed
 }
+
 func (r *rssItem) GetDate() time.Time {
 	value := r.PubDate
 	if r.Date != "" {
@@ -170,11 +170,11 @@ func (r *rssItem) GetURL() string {
 
 	for _, link := range r.Links {
 		if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
-			return link.Href
+			return strings.TrimSpace(link.Href)
 		}
 
 		if link.Data != "" {
-			return link.Data
+			return strings.TrimSpace(link.Data)
 		}
 	}
 
@@ -212,7 +212,7 @@ func (r *rssItem) Transform() *model.Entry {
 	entry.Author = r.GetAuthor()
 	entry.Hash = r.GetHash()
 	entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent())
-	entry.Title = sanitizer.StripTags(strings.Trim(r.Title, " \n\t"))
+	entry.Title = strings.TrimSpace(r.Title)
 	entry.Enclosures = r.GetEnclosures()
 
 	if entry.Title == "" {