finder.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "bytes"
  6. "encoding/base64"
  7. "fmt"
  8. "image"
  9. "image/gif"
  10. "image/jpeg"
  11. "image/png"
  12. "io"
  13. "log/slog"
  14. "net/url"
  15. "regexp"
  16. "slices"
  17. "strings"
  18. "miniflux.app/v2/internal/config"
  19. "miniflux.app/v2/internal/crypto"
  20. "miniflux.app/v2/internal/model"
  21. "miniflux.app/v2/internal/reader/encoding"
  22. "miniflux.app/v2/internal/reader/fetcher"
  23. "miniflux.app/v2/internal/urllib"
  24. "github.com/PuerkitoBio/goquery"
  25. "golang.org/x/image/draw"
  26. "golang.org/x/image/webp"
  27. )
  28. type iconFinder struct {
  29. requestBuilder *fetcher.RequestBuilder
  30. websiteURL string
  31. feedIconURL string
  32. }
  33. func newIconFinder(requestBuilder *fetcher.RequestBuilder, websiteURL, feedIconURL string) *iconFinder {
  34. return &iconFinder{
  35. requestBuilder: requestBuilder,
  36. websiteURL: websiteURL,
  37. feedIconURL: feedIconURL,
  38. }
  39. }
  40. func (f *iconFinder) findIcon() (*model.Icon, error) {
  41. slog.Debug("Begin icon discovery process",
  42. slog.String("website_url", f.websiteURL),
  43. slog.String("feed_icon_url", f.feedIconURL),
  44. )
  45. if f.feedIconURL != "" {
  46. if icon, err := f.fetchFeedIcon(); err != nil {
  47. slog.Debug("Unable to download icon from feed",
  48. slog.String("website_url", f.websiteURL),
  49. slog.String("feed_icon_url", f.feedIconURL),
  50. slog.Any("error", err),
  51. )
  52. } else if icon != nil {
  53. return icon, nil
  54. }
  55. }
  56. if icon, err := f.fetchIconsFromHTMLDocument(); err != nil {
  57. slog.Debug("Unable to fetch icons from HTML document",
  58. slog.String("website_url", f.websiteURL),
  59. slog.Any("error", err),
  60. )
  61. } else if icon != nil {
  62. return icon, nil
  63. }
  64. return f.fetchDefaultIcon()
  65. }
  66. func (f *iconFinder) fetchDefaultIcon() (*model.Icon, error) {
  67. slog.Debug("Fetching default icon",
  68. slog.String("website_url", f.websiteURL),
  69. )
  70. iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
  71. if err != nil {
  72. return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
  73. }
  74. icon, err := f.downloadIcon(iconURL)
  75. if err != nil {
  76. return nil, err
  77. }
  78. return icon, nil
  79. }
  80. func (f *iconFinder) fetchFeedIcon() (*model.Icon, error) {
  81. slog.Debug("Fetching feed icon",
  82. slog.String("website_url", f.websiteURL),
  83. slog.String("feed_icon_url", f.feedIconURL),
  84. )
  85. iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL)
  86. if err != nil {
  87. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  88. }
  89. return f.downloadIcon(iconURL)
  90. }
  91. func (f *iconFinder) fetchIconsFromHTMLDocument() (*model.Icon, error) {
  92. slog.Debug("Searching icons from HTML document",
  93. slog.String("website_url", f.websiteURL),
  94. )
  95. rootURL := urllib.RootURL(f.websiteURL)
  96. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(rootURL))
  97. defer responseHandler.Close()
  98. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  99. return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
  100. }
  101. iconURLs, err := findIconURLsFromHTMLDocument(
  102. responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
  103. responseHandler.ContentType(),
  104. )
  105. if err != nil {
  106. return nil, err
  107. }
  108. slog.Debug("Searched icon from HTML document",
  109. slog.String("website_url", f.websiteURL),
  110. slog.String("icon_urls", strings.Join(iconURLs, ",")),
  111. )
  112. for _, iconURL := range iconURLs {
  113. if strings.HasPrefix(iconURL, "data:") {
  114. slog.Debug("Found icon with data URL",
  115. slog.String("website_url", f.websiteURL),
  116. )
  117. return parseImageDataURL(iconURL)
  118. }
  119. iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL)
  120. if err != nil {
  121. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  122. }
  123. if icon, err := f.downloadIcon(iconURL); err != nil {
  124. slog.Debug("Unable to download icon from HTML document",
  125. slog.String("website_url", f.websiteURL),
  126. slog.String("icon_url", iconURL),
  127. slog.Any("error", err),
  128. )
  129. } else if icon != nil {
  130. slog.Debug("Downloaded icon from HTML document",
  131. slog.String("website_url", f.websiteURL),
  132. slog.String("icon_url", iconURL),
  133. )
  134. return icon, nil
  135. }
  136. }
  137. return nil, nil
  138. }
  139. func (f *iconFinder) downloadIcon(iconURL string) (*model.Icon, error) {
  140. slog.Debug("Downloading icon",
  141. slog.String("website_url", f.websiteURL),
  142. slog.String("icon_url", iconURL),
  143. )
  144. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(iconURL))
  145. defer responseHandler.Close()
  146. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  147. return nil, fmt.Errorf("icon: unable to download website icon: %w", localizedError.Error())
  148. }
  149. responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
  150. if localizedError != nil {
  151. return nil, fmt.Errorf("icon: unable to read response body: %w", localizedError.Error())
  152. }
  153. icon := &model.Icon{
  154. Hash: crypto.HashFromBytes(responseBody),
  155. MimeType: responseHandler.ContentType(),
  156. Content: responseBody,
  157. }
  158. icon = resizeIcon(icon)
  159. return icon, nil
  160. }
  161. func resizeIcon(icon *model.Icon) *model.Icon {
  162. r := bytes.NewReader(icon.Content)
  163. if !slices.Contains([]string{"image/jpeg", "image/png", "image/gif", "image/webp"}, icon.MimeType) {
  164. slog.Info("icon isn't a png/gif/jpeg/webp, can't resize", slog.String("mimetype", icon.MimeType))
  165. return icon
  166. }
  167. // Don't resize icons that we can't decode, or that already have the right size.
  168. config, _, err := image.DecodeConfig(r)
  169. if err != nil {
  170. slog.Warn("unable to decode the metadata of the icon", slog.Any("error", err))
  171. return icon
  172. }
  173. if config.Height <= 32 && config.Width <= 32 {
  174. slog.Debug("icon don't need to be rescaled", slog.Int("height", config.Height), slog.Int("width", config.Width))
  175. return icon
  176. }
  177. r.Seek(0, io.SeekStart)
  178. var src image.Image
  179. switch icon.MimeType {
  180. case "image/jpeg":
  181. src, err = jpeg.Decode(r)
  182. case "image/png":
  183. src, err = png.Decode(r)
  184. case "image/gif":
  185. src, err = gif.Decode(r)
  186. case "image/webp":
  187. src, err = webp.Decode(r)
  188. }
  189. if err != nil || src == nil {
  190. slog.Warn("unable to decode the icon", slog.Any("error", err))
  191. return icon
  192. }
  193. dst := image.NewRGBA(image.Rect(0, 0, 32, 32))
  194. draw.BiLinear.Scale(dst, dst.Rect, src, src.Bounds(), draw.Over, nil)
  195. var b bytes.Buffer
  196. if err = png.Encode(io.Writer(&b), dst); err != nil {
  197. slog.Warn("unable to encode the new icon", slog.Any("error", err))
  198. }
  199. icon.Content = b.Bytes()
  200. icon.MimeType = "image/png"
  201. return icon
  202. }
  203. func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) {
  204. htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
  205. if err != nil {
  206. return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
  207. }
  208. doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
  209. if err != nil {
  210. return nil, fmt.Errorf("icon: unable to read document: %v", err)
  211. }
  212. queries := [...]string{
  213. "link[rel='icon' i][href]",
  214. "link[rel='shortcut icon' i][href]",
  215. "link[rel='icon shortcut' i][href]",
  216. "link[rel='apple-touch-icon'][href]",
  217. }
  218. var iconURLs []string
  219. for _, query := range queries {
  220. slog.Debug("Searching icon URL in HTML document", slog.String("query", query))
  221. for _, s := range doc.Find(query).EachIter() {
  222. href, _ := s.Attr("href")
  223. if iconURL := strings.TrimSpace(href); iconURL != "" {
  224. iconURLs = append(iconURLs, iconURL)
  225. slog.Debug("Found icon URL in HTML document",
  226. slog.String("query", query),
  227. slog.String("icon_url", iconURL))
  228. }
  229. }
  230. }
  231. return iconURLs, nil
  232. }
  233. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  234. // data:[<mediatype>][;encoding],<data>
  235. // we consider <mediatype> to be mandatory, and it has to start with `image/`.
  236. // we consider `base64`, `utf8` and the empty string to be the only valid encodings
  237. func parseImageDataURL(value string) (*model.Icon, error) {
  238. re := regexp.MustCompile(`^data:` +
  239. `(?P<mediatype>image/[^;,]+)` +
  240. `(?:;(?P<encoding>base64|utf8))?` +
  241. `,(?P<data>.+)$`)
  242. matches := re.FindStringSubmatch(value)
  243. if matches == nil {
  244. return nil, fmt.Errorf(`icon: invalid data URL %q`, value)
  245. }
  246. mediaType := matches[re.SubexpIndex("mediatype")]
  247. encoding := matches[re.SubexpIndex("encoding")]
  248. data := matches[re.SubexpIndex("data")]
  249. var blob []byte
  250. switch encoding {
  251. case "base64":
  252. var err error
  253. blob, err = base64.StdEncoding.DecodeString(data)
  254. if err != nil {
  255. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  256. }
  257. case "":
  258. decodedData, err := url.QueryUnescape(data)
  259. if err != nil {
  260. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  261. }
  262. blob = []byte(decodedData)
  263. case "utf8":
  264. blob = []byte(data)
  265. }
  266. return &model.Icon{
  267. Hash: crypto.HashFromBytes(blob),
  268. Content: blob,
  269. MimeType: mediaType,
  270. }, nil
  271. }