finder.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/reader/icon"
  4. import (
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "strings"
  9. stdlib_url "net/url"
  10. "miniflux.app/config"
  11. "miniflux.app/crypto"
  12. "miniflux.app/http/client"
  13. "miniflux.app/logger"
  14. "miniflux.app/model"
  15. "miniflux.app/url"
  16. "github.com/PuerkitoBio/goquery"
  17. )
  18. // FindIcon try to find the website's icon.
  19. func FindIcon(websiteURL, iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) {
  20. if iconURL == "" {
  21. rootURL := url.RootURL(websiteURL)
  22. logger.Debug("[FindIcon] Trying to find an icon: rootURL=%q websiteURL=%q userAgent=%q", rootURL, websiteURL, userAgent)
  23. clt := client.NewClientWithConfig(rootURL, config.Opts)
  24. clt.WithUserAgent(userAgent)
  25. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  26. if fetchViaProxy {
  27. clt.WithProxy()
  28. }
  29. response, err := clt.Get()
  30. if err != nil {
  31. return nil, fmt.Errorf("icon: unable to download website index page: %v", err)
  32. }
  33. if response.HasServerFailure() {
  34. return nil, fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
  35. }
  36. iconURL, err = parseDocument(rootURL, response.Body)
  37. if err != nil {
  38. return nil, err
  39. }
  40. }
  41. if strings.HasPrefix(iconURL, "data:") {
  42. return parseImageDataURL(iconURL)
  43. }
  44. logger.Debug("[FindIcon] Fetching icon => %s", iconURL)
  45. icon, err := downloadIcon(iconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
  46. if err != nil {
  47. return nil, err
  48. }
  49. return icon, nil
  50. }
  51. func parseDocument(websiteURL string, data io.Reader) (string, error) {
  52. queries := []string{
  53. "link[rel='shortcut icon']",
  54. "link[rel='Shortcut Icon']",
  55. "link[rel='icon shortcut']",
  56. "link[rel='icon']",
  57. }
  58. doc, err := goquery.NewDocumentFromReader(data)
  59. if err != nil {
  60. return "", fmt.Errorf("icon: unable to read document: %v", err)
  61. }
  62. var iconURL string
  63. for _, query := range queries {
  64. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  65. if href, exists := s.Attr("href"); exists {
  66. iconURL = strings.TrimSpace(href)
  67. }
  68. })
  69. if iconURL != "" {
  70. break
  71. }
  72. }
  73. if iconURL == "" {
  74. iconURL = url.RootURL(websiteURL) + "favicon.ico"
  75. } else {
  76. iconURL, _ = url.AbsoluteURL(websiteURL, iconURL)
  77. }
  78. return iconURL, nil
  79. }
  80. func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) {
  81. clt := client.NewClientWithConfig(iconURL, config.Opts)
  82. clt.WithUserAgent(userAgent)
  83. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  84. if fetchViaProxy {
  85. clt.WithProxy()
  86. }
  87. response, err := clt.Get()
  88. if err != nil {
  89. return nil, fmt.Errorf("icon: unable to download iconURL: %v", err)
  90. }
  91. if response.HasServerFailure() {
  92. return nil, fmt.Errorf("icon: unable to download icon: status=%d", response.StatusCode)
  93. }
  94. body, err := io.ReadAll(response.Body)
  95. if err != nil {
  96. return nil, fmt.Errorf("icon: unable to read downloaded icon: %v", err)
  97. }
  98. if len(body) == 0 {
  99. return nil, fmt.Errorf("icon: downloaded icon is empty, iconURL=%s", iconURL)
  100. }
  101. icon := &model.Icon{
  102. Hash: crypto.HashFromBytes(body),
  103. MimeType: response.ContentType,
  104. Content: body,
  105. }
  106. return icon, nil
  107. }
  108. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  109. // data:[<mediatype>][;base64],<data>
  110. func parseImageDataURL(value string) (*model.Icon, error) {
  111. var mediaType string
  112. var encoding string
  113. if !strings.HasPrefix(value, "data:") {
  114. return nil, fmt.Errorf(`icon: invalid data URL (missing data:) %q`, value)
  115. }
  116. value = value[5:]
  117. comma := strings.Index(value, ",")
  118. if comma < 0 {
  119. return nil, fmt.Errorf(`icon: invalid data URL (no comma) %q`, value)
  120. }
  121. data := value[comma+1:]
  122. semicolon := strings.Index(value[0:comma], ";")
  123. if semicolon > 0 {
  124. mediaType = value[0:semicolon]
  125. encoding = value[semicolon+1 : comma]
  126. } else {
  127. mediaType = value[0:comma]
  128. }
  129. if !strings.HasPrefix(mediaType, "image/") {
  130. return nil, fmt.Errorf(`icon: invalid media type %q`, mediaType)
  131. }
  132. var blob []byte
  133. switch encoding {
  134. case "base64":
  135. var err error
  136. blob, err = base64.StdEncoding.DecodeString(data)
  137. if err != nil {
  138. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  139. }
  140. case "":
  141. decodedData, err := stdlib_url.QueryUnescape(data)
  142. if err != nil {
  143. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  144. }
  145. blob = []byte(decodedData)
  146. default:
  147. return nil, fmt.Errorf(`icon: unsupported data URL encoding %q`, value)
  148. }
  149. if len(blob) == 0 {
  150. return nil, fmt.Errorf(`icon: empty data URL %q`, value)
  151. }
  152. icon := &model.Icon{
  153. Hash: crypto.HashFromBytes(blob),
  154. Content: blob,
  155. MimeType: mediaType,
  156. }
  157. return icon, nil
  158. }