finder.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package subscription // import "miniflux.app/reader/subscription"
  4. import (
  5. "fmt"
  6. "io"
  7. "regexp"
  8. "strings"
  9. "miniflux.app/config"
  10. "miniflux.app/errors"
  11. "miniflux.app/http/client"
  12. "miniflux.app/reader/browser"
  13. "miniflux.app/reader/parser"
  14. "miniflux.app/url"
  15. "github.com/PuerkitoBio/goquery"
  16. )
  17. var (
  18. errUnreadableDoc = "Unable to analyze this page: %v"
  19. youtubeChannelRegex = regexp.MustCompile(`youtube\.com/channel/(.*)`)
  20. youtubeVideoRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
  21. )
  22. // FindSubscriptions downloads and try to find one or more subscriptions from an URL.
  23. func FindSubscriptions(websiteURL, userAgent, cookie, username, password string, fetchViaProxy, allowSelfSignedCertificates bool) (Subscriptions, *errors.LocalizedError) {
  24. websiteURL = findYoutubeChannelFeed(websiteURL)
  25. websiteURL = parseYoutubeVideoPage(websiteURL)
  26. clt := client.NewClientWithConfig(websiteURL, config.Opts)
  27. clt.WithCredentials(username, password)
  28. clt.WithUserAgent(userAgent)
  29. clt.WithCookie(cookie)
  30. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  31. if fetchViaProxy {
  32. clt.WithProxy()
  33. }
  34. response, err := browser.Exec(clt)
  35. if err != nil {
  36. return nil, err
  37. }
  38. body := response.BodyAsString()
  39. if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
  40. var subscriptions Subscriptions
  41. subscriptions = append(subscriptions, &Subscription{
  42. Title: response.EffectiveURL,
  43. URL: response.EffectiveURL,
  44. Type: format,
  45. })
  46. return subscriptions, nil
  47. }
  48. subscriptions, err := parseWebPage(response.EffectiveURL, strings.NewReader(body))
  49. if err != nil || subscriptions != nil {
  50. return subscriptions, err
  51. }
  52. return tryWellKnownUrls(websiteURL, userAgent, cookie, username, password)
  53. }
  54. func parseWebPage(websiteURL string, data io.Reader) (Subscriptions, *errors.LocalizedError) {
  55. var subscriptions Subscriptions
  56. queries := map[string]string{
  57. "link[type='application/rss+xml']": "rss",
  58. "link[type='application/atom+xml']": "atom",
  59. "link[type='application/json']": "json",
  60. "link[type='application/feed+json']": "json",
  61. }
  62. doc, err := goquery.NewDocumentFromReader(data)
  63. if err != nil {
  64. return nil, errors.NewLocalizedError(errUnreadableDoc, err)
  65. }
  66. for query, kind := range queries {
  67. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  68. subscription := new(Subscription)
  69. subscription.Type = kind
  70. if title, exists := s.Attr("title"); exists {
  71. subscription.Title = title
  72. }
  73. if feedURL, exists := s.Attr("href"); exists {
  74. if feedURL != "" {
  75. subscription.URL, _ = url.AbsoluteURL(websiteURL, feedURL)
  76. }
  77. }
  78. if subscription.Title == "" {
  79. subscription.Title = subscription.URL
  80. }
  81. if subscription.URL != "" {
  82. subscriptions = append(subscriptions, subscription)
  83. }
  84. })
  85. }
  86. return subscriptions, nil
  87. }
  88. func findYoutubeChannelFeed(websiteURL string) string {
  89. matches := youtubeChannelRegex.FindStringSubmatch(websiteURL)
  90. if len(matches) == 2 {
  91. return fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, matches[1])
  92. }
  93. return websiteURL
  94. }
  95. func parseYoutubeVideoPage(websiteURL string) string {
  96. if !youtubeVideoRegex.MatchString(websiteURL) {
  97. return websiteURL
  98. }
  99. clt := client.NewClientWithConfig(websiteURL, config.Opts)
  100. response, browserErr := browser.Exec(clt)
  101. if browserErr != nil {
  102. return websiteURL
  103. }
  104. doc, docErr := goquery.NewDocumentFromReader(response.Body)
  105. if docErr != nil {
  106. return websiteURL
  107. }
  108. if channelID, exists := doc.Find(`meta[itemprop="channelId"]`).First().Attr("content"); exists {
  109. return fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, channelID)
  110. }
  111. return websiteURL
  112. }
  113. func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string) (Subscriptions, *errors.LocalizedError) {
  114. var subscriptions Subscriptions
  115. knownURLs := map[string]string{
  116. "atom.xml": "atom",
  117. "feed.xml": "atom",
  118. "feed/": "atom",
  119. "rss.xml": "rss",
  120. "rss/": "rss",
  121. }
  122. websiteURLRoot := url.RootURL(websiteURL)
  123. baseURLs := []string{
  124. // Look for knownURLs in the root.
  125. websiteURLRoot,
  126. }
  127. // Look for knownURLs in current subdirectory, such as 'example.com/blog/'.
  128. websiteURL, _ = url.AbsoluteURL(websiteURL, "./")
  129. if websiteURL != websiteURLRoot {
  130. baseURLs = append(baseURLs, websiteURL)
  131. }
  132. for _, baseURL := range baseURLs {
  133. for knownURL, kind := range knownURLs {
  134. fullURL, err := url.AbsoluteURL(baseURL, knownURL)
  135. if err != nil {
  136. continue
  137. }
  138. clt := client.NewClientWithConfig(fullURL, config.Opts)
  139. clt.WithCredentials(username, password)
  140. clt.WithUserAgent(userAgent)
  141. clt.WithCookie(cookie)
  142. // Some websites redirects unknown URLs to the home page.
  143. // As result, the list of known URLs is returned to the subscription list.
  144. // We don't want the user to choose between invalid feed URLs.
  145. clt.WithoutRedirects()
  146. response, err := clt.Get()
  147. if err != nil {
  148. continue
  149. }
  150. if response != nil && response.StatusCode == 200 {
  151. subscription := new(Subscription)
  152. subscription.Type = kind
  153. subscription.Title = fullURL
  154. subscription.URL = fullURL
  155. if subscription.URL != "" {
  156. subscriptions = append(subscriptions, subscription)
  157. }
  158. }
  159. }
  160. }
  161. return subscriptions, nil
  162. }