finder.go 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "net/url"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/http/client"
  13. "miniflux.app/v2/internal/logger"
  14. "miniflux.app/v2/internal/model"
  15. "miniflux.app/v2/internal/urllib"
  16. "github.com/PuerkitoBio/goquery"
  17. )
  18. // FindIcon try to find the website's icon.
  19. func FindIcon(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (icon *model.Icon, err error) {
  20. if feedIconURL == "" {
  21. feedIconURL, err = fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
  22. if err != nil {
  23. return nil, err
  24. }
  25. }
  26. if strings.HasPrefix(feedIconURL, "data:") {
  27. return parseImageDataURL(feedIconURL)
  28. }
  29. feedIconURL, err = generateIconURL(websiteURL, feedIconURL)
  30. if err != nil {
  31. return nil, err
  32. }
  33. if icon, err = downloadIcon(feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates); err != nil {
  34. return nil, err
  35. }
  36. return icon, nil
  37. }
  38. func generateIconURL(websiteURL, feedIconURL string) (iconURL string, err error) {
  39. feedIconURL = strings.TrimSpace(feedIconURL)
  40. if feedIconURL == "" {
  41. iconURL, err = urllib.JoinBaseURLAndPath(urllib.RootURL(websiteURL), "favicon.ico")
  42. if err != nil {
  43. return "", fmt.Errorf(`icon: unable to join base URL and path: %w`, err)
  44. }
  45. } else {
  46. iconURL, err = urllib.AbsoluteURL(websiteURL, feedIconURL)
  47. if err != nil {
  48. return "", fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  49. }
  50. }
  51. return iconURL, nil
  52. }
  53. func fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (string, error) {
  54. rootURL := urllib.RootURL(websiteURL)
  55. logger.Debug("[FindIcon] Find icon from HTML webpage: %s", rootURL)
  56. clt := client.NewClientWithConfig(rootURL, config.Opts)
  57. clt.WithUserAgent(userAgent)
  58. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  59. if fetchViaProxy {
  60. clt.WithProxy()
  61. }
  62. response, err := clt.Get()
  63. if err != nil {
  64. return "", fmt.Errorf("icon: unable to download website index page: %v", err)
  65. }
  66. if response.HasServerFailure() {
  67. return "", fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
  68. }
  69. return findIconURLFromHTMLDocument(response.Body)
  70. }
  71. func findIconURLFromHTMLDocument(body io.Reader) (string, error) {
  72. queries := []string{
  73. "link[rel='shortcut icon']",
  74. "link[rel='Shortcut Icon']",
  75. "link[rel='icon shortcut']",
  76. "link[rel='icon']",
  77. }
  78. doc, err := goquery.NewDocumentFromReader(body)
  79. if err != nil {
  80. return "", fmt.Errorf("icon: unable to read document: %v", err)
  81. }
  82. var iconURL string
  83. for _, query := range queries {
  84. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  85. if href, exists := s.Attr("href"); exists {
  86. iconURL = strings.TrimSpace(href)
  87. }
  88. })
  89. if iconURL != "" {
  90. break
  91. }
  92. }
  93. return iconURL, nil
  94. }
  95. func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) {
  96. clt := client.NewClientWithConfig(iconURL, config.Opts)
  97. clt.WithUserAgent(userAgent)
  98. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  99. if fetchViaProxy {
  100. clt.WithProxy()
  101. }
  102. response, err := clt.Get()
  103. if err != nil {
  104. return nil, fmt.Errorf("icon: unable to download iconURL: %v", err)
  105. }
  106. if response.HasServerFailure() {
  107. return nil, fmt.Errorf("icon: unable to download icon: status=%d", response.StatusCode)
  108. }
  109. body, err := io.ReadAll(response.Body)
  110. if err != nil {
  111. return nil, fmt.Errorf("icon: unable to read downloaded icon: %v", err)
  112. }
  113. if len(body) == 0 {
  114. return nil, fmt.Errorf("icon: downloaded icon is empty, iconURL=%s", iconURL)
  115. }
  116. icon := &model.Icon{
  117. Hash: crypto.HashFromBytes(body),
  118. MimeType: response.ContentType,
  119. Content: body,
  120. }
  121. return icon, nil
  122. }
  123. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  124. // data:[<mediatype>][;base64],<data>
  125. func parseImageDataURL(value string) (*model.Icon, error) {
  126. var mediaType string
  127. var encoding string
  128. if !strings.HasPrefix(value, "data:") {
  129. return nil, fmt.Errorf(`icon: invalid data URL (missing data:) %q`, value)
  130. }
  131. value = value[5:]
  132. comma := strings.Index(value, ",")
  133. if comma < 0 {
  134. return nil, fmt.Errorf(`icon: invalid data URL (no comma) %q`, value)
  135. }
  136. data := value[comma+1:]
  137. semicolon := strings.Index(value[0:comma], ";")
  138. if semicolon > 0 {
  139. mediaType = value[0:semicolon]
  140. encoding = value[semicolon+1 : comma]
  141. } else {
  142. mediaType = value[0:comma]
  143. }
  144. if !strings.HasPrefix(mediaType, "image/") {
  145. return nil, fmt.Errorf(`icon: invalid media type %q`, mediaType)
  146. }
  147. var blob []byte
  148. switch encoding {
  149. case "base64":
  150. var err error
  151. blob, err = base64.StdEncoding.DecodeString(data)
  152. if err != nil {
  153. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  154. }
  155. case "":
  156. decodedData, err := url.QueryUnescape(data)
  157. if err != nil {
  158. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  159. }
  160. blob = []byte(decodedData)
  161. default:
  162. return nil, fmt.Errorf(`icon: unsupported data URL encoding %q`, value)
  163. }
  164. if len(blob) == 0 {
  165. return nil, fmt.Errorf(`icon: empty data URL %q`, value)
  166. }
  167. icon := &model.Icon{
  168. Hash: crypto.HashFromBytes(blob),
  169. Content: blob,
  170. MimeType: mediaType,
  171. }
  172. return icon, nil
  173. }