finder.go 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package subscription // import "miniflux.app/reader/subscription"
  5. import (
  6. "fmt"
  7. "io"
  8. "regexp"
  9. "strings"
  10. "miniflux.app/config"
  11. "miniflux.app/errors"
  12. "miniflux.app/http/client"
  13. "miniflux.app/reader/browser"
  14. "miniflux.app/reader/parser"
  15. "miniflux.app/url"
  16. "github.com/PuerkitoBio/goquery"
  17. )
  18. var (
  19. errUnreadableDoc = "Unable to analyze this page: %v"
  20. youtubeChannelRegex = regexp.MustCompile(`youtube\.com/channel/(.*)`)
  21. youtubeVideoRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
  22. )
  23. // FindSubscriptions downloads and try to find one or more subscriptions from an URL.
  24. func FindSubscriptions(websiteURL, userAgent, cookie, username, password string, fetchViaProxy, allowSelfSignedCertificates bool) (Subscriptions, *errors.LocalizedError) {
  25. websiteURL = findYoutubeChannelFeed(websiteURL)
  26. websiteURL = parseYoutubeVideoPage(websiteURL)
  27. clt := client.NewClientWithConfig(websiteURL, config.Opts)
  28. clt.WithCredentials(username, password)
  29. clt.WithUserAgent(userAgent)
  30. clt.WithCookie(cookie)
  31. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  32. if fetchViaProxy {
  33. clt.WithProxy()
  34. }
  35. response, err := browser.Exec(clt)
  36. if err != nil {
  37. return nil, err
  38. }
  39. body := response.BodyAsString()
  40. if format := parser.DetectFeedFormat(body); format != parser.FormatUnknown {
  41. var subscriptions Subscriptions
  42. subscriptions = append(subscriptions, &Subscription{
  43. Title: response.EffectiveURL,
  44. URL: response.EffectiveURL,
  45. Type: format,
  46. })
  47. return subscriptions, nil
  48. }
  49. subscriptions, err := parseWebPage(response.EffectiveURL, strings.NewReader(body))
  50. if err != nil || subscriptions != nil {
  51. return subscriptions, err
  52. }
  53. return tryWellKnownUrls(websiteURL, userAgent, cookie, username, password)
  54. }
  55. func parseWebPage(websiteURL string, data io.Reader) (Subscriptions, *errors.LocalizedError) {
  56. var subscriptions Subscriptions
  57. queries := map[string]string{
  58. "link[type='application/rss+xml']": "rss",
  59. "link[type='application/atom+xml']": "atom",
  60. "link[type='application/json']": "json",
  61. "link[type='application/feed+json']": "json",
  62. }
  63. doc, err := goquery.NewDocumentFromReader(data)
  64. if err != nil {
  65. return nil, errors.NewLocalizedError(errUnreadableDoc, err)
  66. }
  67. for query, kind := range queries {
  68. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  69. subscription := new(Subscription)
  70. subscription.Type = kind
  71. if title, exists := s.Attr("title"); exists {
  72. subscription.Title = title
  73. } else {
  74. subscription.Title = "Feed"
  75. }
  76. if feedURL, exists := s.Attr("href"); exists {
  77. subscription.URL, _ = url.AbsoluteURL(websiteURL, feedURL)
  78. }
  79. if subscription.Title == "" {
  80. subscription.Title = subscription.URL
  81. }
  82. if subscription.URL != "" {
  83. subscriptions = append(subscriptions, subscription)
  84. }
  85. })
  86. }
  87. return subscriptions, nil
  88. }
  89. func findYoutubeChannelFeed(websiteURL string) string {
  90. matches := youtubeChannelRegex.FindStringSubmatch(websiteURL)
  91. if len(matches) == 2 {
  92. return fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, matches[1])
  93. }
  94. return websiteURL
  95. }
  96. func parseYoutubeVideoPage(websiteURL string) string {
  97. if !youtubeVideoRegex.MatchString(websiteURL) {
  98. return websiteURL
  99. }
  100. clt := client.NewClientWithConfig(websiteURL, config.Opts)
  101. response, browserErr := browser.Exec(clt)
  102. if browserErr != nil {
  103. return websiteURL
  104. }
  105. doc, docErr := goquery.NewDocumentFromReader(response.Body)
  106. if docErr != nil {
  107. return websiteURL
  108. }
  109. if channelID, exists := doc.Find(`meta[itemprop="channelId"]`).First().Attr("content"); exists {
  110. return fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, channelID)
  111. }
  112. return websiteURL
  113. }
  114. func tryWellKnownUrls(websiteURL, userAgent, cookie, username, password string) (Subscriptions, *errors.LocalizedError) {
  115. var subscriptions Subscriptions
  116. knownURLs := map[string]string{
  117. "/atom.xml": "atom",
  118. "/feed.xml": "atom",
  119. "/feed/": "atom",
  120. "/rss.xml": "rss",
  121. "/rss/": "rss",
  122. }
  123. lastCharacter := websiteURL[len(websiteURL)-1:]
  124. if lastCharacter == "/" {
  125. websiteURL = websiteURL[:len(websiteURL)-1]
  126. }
  127. for knownURL, kind := range knownURLs {
  128. fullURL, err := url.AbsoluteURL(websiteURL, knownURL)
  129. if err != nil {
  130. continue
  131. }
  132. clt := client.NewClientWithConfig(fullURL, config.Opts)
  133. clt.WithCredentials(username, password)
  134. clt.WithUserAgent(userAgent)
  135. clt.WithCookie(cookie)
  136. // Some websites redirects unknown URLs to the home page.
  137. // As result, the list of known URLs is returned to the subscription list.
  138. // We don't want the user to choose between invalid feed URLs.
  139. clt.WithoutRedirects()
  140. response, err := clt.Get()
  141. if err != nil {
  142. continue
  143. }
  144. if response != nil && response.StatusCode == 200 {
  145. subscription := new(Subscription)
  146. subscription.Type = kind
  147. subscription.Title = fullURL
  148. subscription.URL = fullURL
  149. if subscription.URL != "" {
  150. subscriptions = append(subscriptions, subscription)
  151. }
  152. }
  153. }
  154. return subscriptions, nil
  155. }