finder.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "log/slog"
  9. "net/url"
  10. "strings"
  11. "miniflux.app/v2/internal/config"
  12. "miniflux.app/v2/internal/crypto"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/fetcher"
  15. "miniflux.app/v2/internal/urllib"
  16. "github.com/PuerkitoBio/goquery"
  17. )
  18. type IconFinder struct {
  19. requestBuilder *fetcher.RequestBuilder
  20. websiteURL string
  21. feedIconURL string
  22. }
  23. func NewIconFinder(requestBuilder *fetcher.RequestBuilder, websiteURL, feedIconURL string) *IconFinder {
  24. return &IconFinder{
  25. requestBuilder: requestBuilder,
  26. websiteURL: websiteURL,
  27. feedIconURL: feedIconURL,
  28. }
  29. }
  30. func (f *IconFinder) FindIcon() (*model.Icon, error) {
  31. slog.Debug("Begin icon discovery process",
  32. slog.String("website_url", f.websiteURL),
  33. slog.String("feed_icon_url", f.feedIconURL),
  34. )
  35. if f.feedIconURL != "" {
  36. if icon, err := f.FetchFeedIcon(); err != nil {
  37. slog.Debug("Unable to download icon from feed",
  38. slog.String("website_url", f.websiteURL),
  39. slog.String("feed_icon_url", f.feedIconURL),
  40. slog.Any("error", err),
  41. )
  42. } else if icon != nil {
  43. return icon, nil
  44. }
  45. }
  46. if icon, err := f.FetchIconsFromHTMLDocument(); err != nil {
  47. slog.Debug("Unable to fetch icons from HTML document",
  48. slog.String("website_url", f.websiteURL),
  49. slog.Any("error", err),
  50. )
  51. } else if icon != nil {
  52. return icon, nil
  53. }
  54. return f.FetchDefaultIcon()
  55. }
  56. func (f *IconFinder) FetchDefaultIcon() (*model.Icon, error) {
  57. slog.Debug("Fetching default icon",
  58. slog.String("website_url", f.websiteURL),
  59. )
  60. iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
  61. if err != nil {
  62. return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
  63. }
  64. icon, err := f.DownloadIcon(iconURL)
  65. if err != nil {
  66. return nil, err
  67. }
  68. return icon, nil
  69. }
  70. func (f *IconFinder) FetchFeedIcon() (*model.Icon, error) {
  71. slog.Debug("Fetching feed icon",
  72. slog.String("website_url", f.websiteURL),
  73. slog.String("feed_icon_url", f.feedIconURL),
  74. )
  75. iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL)
  76. if err != nil {
  77. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  78. }
  79. return f.DownloadIcon(iconURL)
  80. }
  81. func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
  82. slog.Debug("Searching icons from HTML document",
  83. slog.String("website_url", f.websiteURL),
  84. )
  85. rootURL := urllib.RootURL(f.websiteURL)
  86. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(rootURL))
  87. defer responseHandler.Close()
  88. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  89. return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
  90. }
  91. iconURLs, err := findIconURLsFromHTMLDocument(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
  92. if err != nil {
  93. return nil, err
  94. }
  95. slog.Debug("Searched icon from HTML document",
  96. slog.String("website_url", f.websiteURL),
  97. slog.String("icon_urls", strings.Join(iconURLs, ",")),
  98. )
  99. for _, iconURL := range iconURLs {
  100. if strings.HasPrefix(iconURL, "data:") {
  101. slog.Debug("Found icon with data URL",
  102. slog.String("website_url", f.websiteURL),
  103. )
  104. return parseImageDataURL(iconURL)
  105. }
  106. iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL)
  107. if err != nil {
  108. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  109. }
  110. if icon, err := f.DownloadIcon(iconURL); err != nil {
  111. slog.Debug("Unable to download icon from HTML document",
  112. slog.String("website_url", f.websiteURL),
  113. slog.String("icon_url", iconURL),
  114. slog.Any("error", err),
  115. )
  116. } else if icon != nil {
  117. slog.Debug("Found icon from HTML document",
  118. slog.String("website_url", f.websiteURL),
  119. slog.String("icon_url", iconURL),
  120. )
  121. return icon, nil
  122. }
  123. }
  124. return nil, nil
  125. }
  126. func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
  127. slog.Debug("Downloading icon",
  128. slog.String("website_url", f.websiteURL),
  129. slog.String("icon_url", iconURL),
  130. )
  131. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(iconURL))
  132. defer responseHandler.Close()
  133. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  134. return nil, fmt.Errorf("icon: unable to download website icon: %w", localizedError.Error())
  135. }
  136. responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
  137. if localizedError != nil {
  138. return nil, fmt.Errorf("icon: unable to read response body: %w", localizedError.Error())
  139. }
  140. icon := &model.Icon{
  141. Hash: crypto.HashFromBytes(responseBody),
  142. MimeType: responseHandler.ContentType(),
  143. Content: responseBody,
  144. }
  145. return icon, nil
  146. }
  147. func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) {
  148. queries := []string{
  149. "link[rel='shortcut icon']",
  150. "link[rel='Shortcut Icon']",
  151. "link[rel='icon shortcut']",
  152. "link[rel='icon']",
  153. }
  154. doc, err := goquery.NewDocumentFromReader(body)
  155. if err != nil {
  156. return nil, fmt.Errorf("icon: unable to read document: %v", err)
  157. }
  158. var iconURLs []string
  159. for _, query := range queries {
  160. slog.Debug("Searching icon URL in HTML document", slog.String("query", query))
  161. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  162. var iconURL string
  163. if href, exists := s.Attr("href"); exists {
  164. iconURL = strings.TrimSpace(href)
  165. }
  166. if iconURL != "" {
  167. iconURLs = append(iconURLs, iconURL)
  168. slog.Debug("Found icon URL in HTML document",
  169. slog.String("query", query),
  170. slog.String("icon_url", iconURL))
  171. }
  172. })
  173. }
  174. return iconURLs, nil
  175. }
  176. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  177. // data:[<mediatype>][;base64],<data>
  178. func parseImageDataURL(value string) (*model.Icon, error) {
  179. var mediaType string
  180. var encoding string
  181. if !strings.HasPrefix(value, "data:") {
  182. return nil, fmt.Errorf(`icon: invalid data URL (missing data:) %q`, value)
  183. }
  184. value = value[5:]
  185. comma := strings.Index(value, ",")
  186. if comma < 0 {
  187. return nil, fmt.Errorf(`icon: invalid data URL (no comma) %q`, value)
  188. }
  189. data := value[comma+1:]
  190. semicolon := strings.Index(value[0:comma], ";")
  191. if semicolon > 0 {
  192. mediaType = value[0:semicolon]
  193. encoding = value[semicolon+1 : comma]
  194. } else {
  195. mediaType = value[0:comma]
  196. }
  197. if !strings.HasPrefix(mediaType, "image/") {
  198. return nil, fmt.Errorf(`icon: invalid media type %q`, mediaType)
  199. }
  200. var blob []byte
  201. switch encoding {
  202. case "base64":
  203. var err error
  204. blob, err = base64.StdEncoding.DecodeString(data)
  205. if err != nil {
  206. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  207. }
  208. case "":
  209. decodedData, err := url.QueryUnescape(data)
  210. if err != nil {
  211. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  212. }
  213. blob = []byte(decodedData)
  214. case "utf8":
  215. blob = []byte(data)
  216. default:
  217. return nil, fmt.Errorf(`icon: unsupported data URL encoding %q`, value)
  218. }
  219. if len(blob) == 0 {
  220. return nil, fmt.Errorf(`icon: empty data URL %q`, value)
  221. }
  222. icon := &model.Icon{
  223. Hash: crypto.HashFromBytes(blob),
  224. Content: blob,
  225. MimeType: mediaType,
  226. }
  227. return icon, nil
  228. }