| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408 |
- // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
- // SPDX-License-Identifier: Apache-2.0
- package rss // import "miniflux.app/v2/internal/reader/rss"
- import (
- "html"
- "log/slog"
- "path"
- "slices"
- "strconv"
- "strings"
- "time"
- "miniflux.app/v2/internal/crypto"
- "miniflux.app/v2/internal/model"
- "miniflux.app/v2/internal/reader/date"
- "miniflux.app/v2/internal/reader/sanitizer"
- "miniflux.app/v2/internal/urllib"
- )
- type rssAdapter struct {
- rss *rss
- }
- func (r *rssAdapter) buildFeed(baseURL string) *model.Feed {
- feed := &model.Feed{
- Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
- FeedURL: strings.TrimSpace(baseURL),
- SiteURL: strings.TrimSpace(r.rss.Channel.Link),
- Description: strings.TrimSpace(r.rss.Channel.Description),
- }
- // Ensure the Site URL is absolute.
- if absoluteSiteURL, err := urllib.ResolveToAbsoluteURL(baseURL, feed.SiteURL); err == nil {
- feed.SiteURL = absoluteSiteURL
- }
- // Try to find the feed URL from the Atom links.
- for _, atomLink := range r.rss.Channel.Links {
- atomLinkHref := strings.TrimSpace(atomLink.Href)
- if atomLinkHref != "" && atomLink.Rel == "self" {
- if absoluteFeedURL, err := urllib.ResolveToAbsoluteURL(feed.FeedURL, atomLinkHref); err == nil {
- feed.FeedURL = absoluteFeedURL
- break
- }
- }
- }
- // Fallback to the site URL if the title is empty.
- if feed.Title == "" {
- feed.Title = feed.SiteURL
- }
- // Get TTL if defined.
- if r.rss.Channel.TTL != "" {
- if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
- feed.TTL = time.Duration(ttl) * time.Minute
- }
- }
- // Get the feed icon URL if defined.
- if r.rss.Channel.Image != nil {
- if absoluteIconURL, err := urllib.ResolveToAbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
- feed.IconURL = absoluteIconURL
- }
- }
- for _, item := range r.rss.Channel.Items {
- entry := model.NewEntry()
- entry.Date = findEntryDate(&item)
- entry.Content = findEntryContent(&item)
- entry.Enclosures = findEntryEnclosures(&item, feed.SiteURL)
- // Populate the entry URL.
- entryURL := findEntryURL(&item)
- if entryURL == "" {
- // Fallback to the first enclosure URL if it exists.
- if len(entry.Enclosures) > 0 && entry.Enclosures[0].URL != "" {
- entry.URL = entry.Enclosures[0].URL
- } else {
- // Fallback to the feed URL if no entry URL is found.
- entry.URL = feed.SiteURL
- }
- } else {
- if absoluteEntryURL, err := urllib.ResolveToAbsoluteURL(feed.SiteURL, entryURL); err == nil {
- entry.URL = absoluteEntryURL
- } else {
- entry.URL = entryURL
- }
- }
- // Populate the entry title.
- entry.Title = findEntryTitle(&item)
- if entry.Title == "" {
- entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
- if entry.Title == "" {
- entry.Title = entry.URL
- }
- }
- entry.Author = findEntryAuthor(&item)
- if entry.Author == "" {
- entry.Author = findFeedAuthor(&r.rss.Channel)
- }
- // Generate the entry hash.
- switch {
- case item.GUID.Data != "":
- entry.Hash = crypto.SHA256(item.GUID.Data)
- case entryURL != "":
- entry.Hash = crypto.SHA256(entryURL)
- default:
- entry.Hash = crypto.SHA256(entry.Title + entry.Content)
- }
- // Find CommentsURL if defined.
- if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
- entry.CommentsURL = absoluteCommentsURL
- }
- // Set podcast listening time.
- if item.ItunesDuration != "" {
- if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
- entry.ReadingTime = duration
- }
- }
- // Populate entry categories.
- entry.Tags = findEntryTags(&item)
- if len(entry.Tags) == 0 {
- entry.Tags = findFeedTags(&r.rss.Channel)
- }
- // Sort and deduplicate tags.
- slices.Sort(entry.Tags)
- entry.Tags = slices.Compact(entry.Tags)
- feed.Entries = append(feed.Entries, entry)
- }
- return feed
- }
- func findFeedAuthor(rssChannel *rssChannel) string {
- var author string
- switch {
- case rssChannel.ItunesAuthor != "":
- author = rssChannel.ItunesAuthor
- case rssChannel.GooglePlayAuthor != "":
- author = rssChannel.GooglePlayAuthor
- case rssChannel.ItunesOwner.String() != "":
- author = rssChannel.ItunesOwner.String()
- case rssChannel.ManagingEditor != "":
- author = rssChannel.ManagingEditor
- case rssChannel.Webmaster != "":
- author = rssChannel.Webmaster
- default:
- return ""
- }
- return strings.TrimSpace(sanitizer.StripTags(author))
- }
- func findFeedTags(rssChannel *rssChannel) []string {
- tags := make([]string, 0)
- for _, tag := range rssChannel.Categories {
- tag = strings.TrimSpace(tag)
- if tag != "" {
- tags = append(tags, tag)
- }
- }
- for _, tag := range rssChannel.GetItunesCategories() {
- tag = strings.TrimSpace(tag)
- if tag != "" {
- tags = append(tags, tag)
- }
- }
- if tag := strings.TrimSpace(rssChannel.GooglePlayCategory.Text); tag != "" {
- tags = append(tags, tag)
- }
- return tags
- }
- func findEntryTitle(rssItem *rssItem) string {
- title := rssItem.Title.Content
- if rssItem.DublinCoreTitle != "" {
- title = rssItem.DublinCoreTitle
- }
- return html.UnescapeString(html.UnescapeString(strings.TrimSpace(title)))
- }
- func findEntryURL(rssItem *rssItem) string {
- for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
- if link != "" {
- return strings.TrimSpace(link)
- }
- }
- for _, atomLink := range rssItem.Links {
- if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
- return strings.TrimSpace(atomLink.Href)
- }
- }
- // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
- // isPermaLink is optional, its default value is true.
- // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
- if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
- return strings.TrimSpace(rssItem.GUID.Data)
- }
- return ""
- }
- func findEntryContent(rssItem *rssItem) string {
- for _, value := range []string{
- rssItem.DublinCoreContent,
- rssItem.Description,
- rssItem.GooglePlayDescription,
- rssItem.ItunesSummary,
- rssItem.ItunesSubtitle,
- } {
- if value != "" {
- return value
- }
- }
- return ""
- }
- func findEntryDate(rssItem *rssItem) time.Time {
- value := rssItem.PubDate
- if rssItem.DublinCoreDate != "" {
- value = rssItem.DublinCoreDate
- }
- if value != "" {
- result, err := date.Parse(value)
- if err != nil {
- slog.Debug("Unable to parse date from RSS feed",
- slog.String("date", value),
- slog.String("guid", rssItem.GUID.Data),
- slog.Any("error", err),
- )
- return time.Now()
- }
- return result
- }
- return time.Now()
- }
- func findEntryAuthor(rssItem *rssItem) string {
- var author string
- switch {
- case rssItem.GooglePlayAuthor != "":
- author = rssItem.GooglePlayAuthor
- case rssItem.ItunesAuthor != "":
- author = rssItem.ItunesAuthor
- case rssItem.DublinCoreCreator != "":
- author = rssItem.DublinCoreCreator
- case rssItem.PersonName() != "":
- author = rssItem.PersonName()
- case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
- author = rssItem.Author.Data
- case rssItem.Author.Inner != "":
- author = rssItem.Author.Inner
- default:
- return ""
- }
- return strings.TrimSpace(sanitizer.StripTags(author))
- }
- func findEntryTags(rssItem *rssItem) []string {
- tags := make([]string, 0)
- for _, tag := range rssItem.Categories {
- tag = strings.TrimSpace(tag)
- if tag != "" {
- tags = append(tags, tag)
- }
- }
- for _, tag := range rssItem.MediaCategories.Labels() {
- tag = strings.TrimSpace(tag)
- if tag != "" {
- tags = append(tags, tag)
- }
- }
- return tags
- }
- func findEntryEnclosures(rssItem *rssItem, siteURL string) model.EnclosureList {
- enclosures := make(model.EnclosureList, 0)
- duplicates := make(map[string]bool)
- for _, mediaThumbnail := range rssItem.AllMediaThumbnails() {
- mediaURL := strings.TrimSpace(mediaThumbnail.URL)
- if mediaURL == "" {
- continue
- }
- if _, found := duplicates[mediaURL]; !found {
- if mediaAbsoluteURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL); err != nil {
- slog.Debug("Unable to build absolute URL for media thumbnail",
- slog.String("url", mediaThumbnail.URL),
- slog.String("site_url", siteURL),
- slog.Any("error", err),
- )
- } else {
- duplicates[mediaAbsoluteURL] = true
- enclosures = append(enclosures, &model.Enclosure{
- URL: mediaAbsoluteURL,
- MimeType: mediaThumbnail.MimeType(),
- Size: mediaThumbnail.Size(),
- })
- }
- }
- }
- for _, enclosure := range rssItem.Enclosures {
- enclosureURL := enclosure.URL
- if rssItem.FeedBurnerEnclosureLink != "" {
- filename := path.Base(rssItem.FeedBurnerEnclosureLink)
- if strings.HasSuffix(enclosureURL, filename) {
- enclosureURL = rssItem.FeedBurnerEnclosureLink
- }
- }
- enclosureURL = strings.TrimSpace(enclosureURL)
- if enclosureURL == "" {
- continue
- }
- if absoluteEnclosureURL, err := urllib.ResolveToAbsoluteURL(siteURL, enclosureURL); err == nil {
- enclosureURL = absoluteEnclosureURL
- }
- if _, found := duplicates[enclosureURL]; !found {
- duplicates[enclosureURL] = true
- enclosures = append(enclosures, &model.Enclosure{
- URL: enclosureURL,
- MimeType: enclosure.Type,
- Size: enclosure.Size(),
- })
- }
- }
- for _, mediaContent := range rssItem.AllMediaContents() {
- mediaURL := strings.TrimSpace(mediaContent.URL)
- if mediaURL == "" {
- continue
- }
- if _, found := duplicates[mediaURL]; !found {
- mediaURL := strings.TrimSpace(mediaContent.URL)
- if mediaAbsoluteURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL); err != nil {
- slog.Debug("Unable to build absolute URL for media content",
- slog.String("url", mediaContent.URL),
- slog.String("site_url", siteURL),
- slog.Any("error", err),
- )
- } else {
- duplicates[mediaAbsoluteURL] = true
- enclosures = append(enclosures, &model.Enclosure{
- URL: mediaAbsoluteURL,
- MimeType: mediaContent.MimeType(),
- Size: mediaContent.Size(),
- })
- }
- }
- }
- for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() {
- mediaURL := strings.TrimSpace(mediaPeerLink.URL)
- if mediaURL == "" {
- continue
- }
- if _, found := duplicates[mediaURL]; !found {
- mediaURL := strings.TrimSpace(mediaPeerLink.URL)
- if mediaAbsoluteURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL); err != nil {
- slog.Debug("Unable to build absolute URL for media peer link",
- slog.String("url", mediaPeerLink.URL),
- slog.String("site_url", siteURL),
- slog.Any("error", err),
- )
- } else {
- duplicates[mediaAbsoluteURL] = true
- enclosures = append(enclosures, &model.Enclosure{
- URL: mediaAbsoluteURL,
- MimeType: mediaPeerLink.MimeType(),
- Size: mediaPeerLink.Size(),
- })
- }
- }
- }
- return enclosures
- }
|