finder.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "net/url"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/http/client"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/urllib"
  15. "github.com/PuerkitoBio/goquery"
  16. )
  17. // FindIcon try to find the website's icon.
  18. func FindIcon(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (icon *model.Icon, err error) {
  19. if feedIconURL == "" {
  20. feedIconURL, err = fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
  21. if err != nil {
  22. return nil, err
  23. }
  24. }
  25. if strings.HasPrefix(feedIconURL, "data:") {
  26. return parseImageDataURL(feedIconURL)
  27. }
  28. feedIconURL, err = generateIconURL(websiteURL, feedIconURL)
  29. if err != nil {
  30. return nil, err
  31. }
  32. if icon, err = downloadIcon(feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates); err != nil {
  33. return nil, err
  34. }
  35. return icon, nil
  36. }
  37. func generateIconURL(websiteURL, feedIconURL string) (iconURL string, err error) {
  38. feedIconURL = strings.TrimSpace(feedIconURL)
  39. if feedIconURL == "" {
  40. iconURL, err = urllib.JoinBaseURLAndPath(urllib.RootURL(websiteURL), "favicon.ico")
  41. if err != nil {
  42. return "", fmt.Errorf(`icon: unable to join base URL and path: %w`, err)
  43. }
  44. } else {
  45. iconURL, err = urllib.AbsoluteURL(websiteURL, feedIconURL)
  46. if err != nil {
  47. return "", fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  48. }
  49. }
  50. return iconURL, nil
  51. }
  52. func fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (string, error) {
  53. rootURL := urllib.RootURL(websiteURL)
  54. clt := client.NewClientWithConfig(rootURL, config.Opts)
  55. clt.WithUserAgent(userAgent)
  56. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  57. if fetchViaProxy {
  58. clt.WithProxy()
  59. }
  60. response, err := clt.Get()
  61. if err != nil {
  62. return "", fmt.Errorf("icon: unable to download website index page: %v", err)
  63. }
  64. if response.HasServerFailure() {
  65. return "", fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
  66. }
  67. return findIconURLFromHTMLDocument(response.Body)
  68. }
  69. func findIconURLFromHTMLDocument(body io.Reader) (string, error) {
  70. queries := []string{
  71. "link[rel='shortcut icon']",
  72. "link[rel='Shortcut Icon']",
  73. "link[rel='icon shortcut']",
  74. "link[rel='icon']",
  75. }
  76. doc, err := goquery.NewDocumentFromReader(body)
  77. if err != nil {
  78. return "", fmt.Errorf("icon: unable to read document: %v", err)
  79. }
  80. var iconURL string
  81. for _, query := range queries {
  82. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  83. if href, exists := s.Attr("href"); exists {
  84. iconURL = strings.TrimSpace(href)
  85. }
  86. })
  87. if iconURL != "" {
  88. break
  89. }
  90. }
  91. return iconURL, nil
  92. }
  93. func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) {
  94. clt := client.NewClientWithConfig(iconURL, config.Opts)
  95. clt.WithUserAgent(userAgent)
  96. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  97. if fetchViaProxy {
  98. clt.WithProxy()
  99. }
  100. response, err := clt.Get()
  101. if err != nil {
  102. return nil, fmt.Errorf("icon: unable to download iconURL: %v", err)
  103. }
  104. if response.HasServerFailure() {
  105. return nil, fmt.Errorf("icon: unable to download icon: status=%d", response.StatusCode)
  106. }
  107. body, err := io.ReadAll(response.Body)
  108. if err != nil {
  109. return nil, fmt.Errorf("icon: unable to read downloaded icon: %v", err)
  110. }
  111. if len(body) == 0 {
  112. return nil, fmt.Errorf("icon: downloaded icon is empty, iconURL=%s", iconURL)
  113. }
  114. icon := &model.Icon{
  115. Hash: crypto.HashFromBytes(body),
  116. MimeType: response.ContentType,
  117. Content: body,
  118. }
  119. return icon, nil
  120. }
  121. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  122. // data:[<mediatype>][;base64],<data>
  123. func parseImageDataURL(value string) (*model.Icon, error) {
  124. var mediaType string
  125. var encoding string
  126. if !strings.HasPrefix(value, "data:") {
  127. return nil, fmt.Errorf(`icon: invalid data URL (missing data:) %q`, value)
  128. }
  129. value = value[5:]
  130. comma := strings.Index(value, ",")
  131. if comma < 0 {
  132. return nil, fmt.Errorf(`icon: invalid data URL (no comma) %q`, value)
  133. }
  134. data := value[comma+1:]
  135. semicolon := strings.Index(value[0:comma], ";")
  136. if semicolon > 0 {
  137. mediaType = value[0:semicolon]
  138. encoding = value[semicolon+1 : comma]
  139. } else {
  140. mediaType = value[0:comma]
  141. }
  142. if !strings.HasPrefix(mediaType, "image/") {
  143. return nil, fmt.Errorf(`icon: invalid media type %q`, mediaType)
  144. }
  145. var blob []byte
  146. switch encoding {
  147. case "base64":
  148. var err error
  149. blob, err = base64.StdEncoding.DecodeString(data)
  150. if err != nil {
  151. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  152. }
  153. case "":
  154. decodedData, err := url.QueryUnescape(data)
  155. if err != nil {
  156. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  157. }
  158. blob = []byte(decodedData)
  159. default:
  160. return nil, fmt.Errorf(`icon: unsupported data URL encoding %q`, value)
  161. }
  162. if len(blob) == 0 {
  163. return nil, fmt.Errorf(`icon: empty data URL %q`, value)
  164. }
  165. icon := &model.Icon{
  166. Hash: crypto.HashFromBytes(blob),
  167. Content: blob,
  168. MimeType: mediaType,
  169. }
  170. return icon, nil
  171. }