finder.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "bytes"
  6. "encoding/base64"
  7. "fmt"
  8. "image"
  9. "image/gif"
  10. "image/jpeg"
  11. "image/png"
  12. "io"
  13. "log/slog"
  14. "net/url"
  15. "regexp"
  16. "slices"
  17. "strings"
  18. "miniflux.app/v2/internal/config"
  19. "miniflux.app/v2/internal/crypto"
  20. "miniflux.app/v2/internal/model"
  21. "miniflux.app/v2/internal/reader/encoding"
  22. "miniflux.app/v2/internal/reader/fetcher"
  23. "miniflux.app/v2/internal/urllib"
  24. "github.com/PuerkitoBio/goquery"
  25. "github.com/tdewolff/minify/v2"
  26. "github.com/tdewolff/minify/v2/svg"
  27. "golang.org/x/image/draw"
  28. "golang.org/x/image/webp"
  29. )
  30. type iconFinder struct {
  31. requestBuilder *fetcher.RequestBuilder
  32. websiteURL string
  33. feedIconURL string
  34. }
  35. func newIconFinder(requestBuilder *fetcher.RequestBuilder, websiteURL, feedIconURL string) *iconFinder {
  36. return &iconFinder{
  37. requestBuilder: requestBuilder,
  38. websiteURL: websiteURL,
  39. feedIconURL: feedIconURL,
  40. }
  41. }
  42. func (f *iconFinder) findIcon() (*model.Icon, error) {
  43. slog.Debug("Begin icon discovery process",
  44. slog.String("website_url", f.websiteURL),
  45. slog.String("feed_icon_url", f.feedIconURL),
  46. )
  47. if f.feedIconURL != "" {
  48. if icon, err := f.downloadIcon(f.feedIconURL); err != nil {
  49. slog.Debug("Unable to fetch the feed's icon",
  50. slog.String("website_url", f.websiteURL),
  51. slog.String("feed_icon_url", f.feedIconURL),
  52. slog.Any("error", err),
  53. )
  54. } else if icon != nil {
  55. return icon, nil
  56. }
  57. }
  58. // Try the website URL first, then fall back to the root URL if no icon is found.
  59. // The website URL may include a subdirectory (e.g., https://example.org/subfolder/), and icons can be referenced relative to that path.
  60. urls := []string{f.websiteURL}
  61. if rootURL := urllib.RootURL(f.websiteURL); rootURL != urls[0] {
  62. urls = []string{f.websiteURL, rootURL}
  63. }
  64. for _, documentURL := range urls {
  65. if icon, err := f.fetchIconsFromHTMLDocument(documentURL); err != nil {
  66. slog.Debug("Unable to fetch icons from HTML document",
  67. slog.String("document_url", documentURL),
  68. slog.Any("error", err),
  69. )
  70. } else if icon != nil {
  71. return icon, nil
  72. }
  73. }
  74. return f.fetchDefaultIcon()
  75. }
  76. func (f *iconFinder) fetchDefaultIcon() (*model.Icon, error) {
  77. slog.Debug("Fetching default icon",
  78. slog.String("website_url", f.websiteURL),
  79. )
  80. iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
  81. if err != nil {
  82. return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
  83. }
  84. icon, err := f.downloadIcon(iconURL)
  85. if err != nil {
  86. return nil, err
  87. }
  88. return icon, nil
  89. }
  90. func (f *iconFinder) fetchIconsFromHTMLDocument(documentURL string) (*model.Icon, error) {
  91. slog.Debug("Searching icons from HTML document",
  92. slog.String("document_url", documentURL),
  93. )
  94. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(documentURL))
  95. defer responseHandler.Close()
  96. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  97. return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
  98. }
  99. iconURLs, err := findIconURLsFromHTMLDocument(
  100. documentURL,
  101. responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
  102. responseHandler.ContentType(),
  103. )
  104. if err != nil {
  105. return nil, err
  106. }
  107. slog.Debug("Searched icon from HTML document",
  108. slog.String("document_url", documentURL),
  109. slog.String("icon_urls", strings.Join(iconURLs, ",")),
  110. )
  111. for _, iconURL := range iconURLs {
  112. if strings.HasPrefix(iconURL, "data:") {
  113. slog.Debug("Found icon with data URL",
  114. slog.String("document_url", documentURL),
  115. )
  116. return parseImageDataURL(iconURL)
  117. }
  118. if icon, err := f.downloadIcon(iconURL); err != nil {
  119. slog.Debug("Unable to download icon from HTML document",
  120. slog.String("document_url", documentURL),
  121. slog.String("icon_url", iconURL),
  122. slog.Any("error", err),
  123. )
  124. } else if icon != nil {
  125. slog.Debug("Downloaded icon from HTML document",
  126. slog.String("document_url", documentURL),
  127. slog.String("icon_url", iconURL),
  128. )
  129. return icon, nil
  130. }
  131. }
  132. return nil, nil
  133. }
  134. func (f *iconFinder) downloadIcon(iconURL string) (*model.Icon, error) {
  135. slog.Debug("Downloading icon",
  136. slog.String("website_url", f.websiteURL),
  137. slog.String("icon_url", iconURL),
  138. )
  139. if err := ensureRemoteIconURLAllowed(iconURL, config.Opts.IconFetchAllowPrivateNetworks()); err != nil {
  140. return nil, err
  141. }
  142. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(iconURL))
  143. defer responseHandler.Close()
  144. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  145. return nil, fmt.Errorf("icon: unable to download website icon: %w", localizedError.Error())
  146. }
  147. responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
  148. if localizedError != nil {
  149. return nil, fmt.Errorf("icon: unable to read response body: %w", localizedError.Error())
  150. }
  151. icon := &model.Icon{
  152. Hash: crypto.HashFromBytes(responseBody),
  153. MimeType: responseHandler.ContentType(),
  154. Content: responseBody,
  155. }
  156. icon = resizeIcon(icon)
  157. return icon, nil
  158. }
  159. func resizeIcon(icon *model.Icon) *model.Icon {
  160. if icon.MimeType == "image/svg+xml" {
  161. minifier := minify.New()
  162. minifier.AddFunc("image/svg+xml", svg.Minify)
  163. var err error
  164. // minifier.Bytes returns the data unchanged in case of error.
  165. icon.Content, err = minifier.Bytes("image/svg+xml", icon.Content)
  166. if err != nil {
  167. slog.Error("Unable to minify SVG icon", slog.Any("error", err))
  168. }
  169. return icon
  170. }
  171. if !slices.Contains([]string{"image/jpeg", "image/png", "image/gif", "image/webp"}, icon.MimeType) {
  172. slog.Debug("Icon resize skipped: unsupported MIME type", slog.String("mime_type", icon.MimeType))
  173. return icon
  174. }
  175. r := bytes.NewReader(icon.Content)
  176. // Don't resize icons that we can't decode, or that already have the right size.
  177. config, _, err := image.DecodeConfig(r)
  178. if err != nil {
  179. slog.Warn("Unable to decode icon metadata", slog.Any("error", err))
  180. return icon
  181. }
  182. if config.Height <= 32 && config.Width <= 32 {
  183. slog.Debug("Icon doesn't need to be resized", slog.Int("height", config.Height), slog.Int("width", config.Width))
  184. return icon
  185. }
  186. r.Seek(0, io.SeekStart)
  187. var src image.Image
  188. switch icon.MimeType {
  189. case "image/jpeg":
  190. src, err = jpeg.Decode(r)
  191. case "image/png":
  192. src, err = png.Decode(r)
  193. case "image/gif":
  194. src, err = gif.Decode(r)
  195. case "image/webp":
  196. src, err = webp.Decode(r)
  197. }
  198. if err != nil || src == nil {
  199. slog.Warn("Unable to decode icon image", slog.Any("error", err))
  200. return icon
  201. }
  202. dst := image.NewRGBA(image.Rect(0, 0, 32, 32))
  203. draw.BiLinear.Scale(dst, dst.Rect, src, src.Bounds(), draw.Over, nil)
  204. var b bytes.Buffer
  205. if err = png.Encode(io.Writer(&b), dst); err != nil {
  206. slog.Warn("Unable to encode resized icon", slog.Any("error", err))
  207. return icon
  208. }
  209. icon.Content = b.Bytes()
  210. icon.MimeType = "image/png"
  211. return icon
  212. }
  213. func findIconURLsFromHTMLDocument(documentURL string, body io.Reader, contentType string) ([]string, error) {
  214. htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType)
  215. if err != nil {
  216. return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
  217. }
  218. doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
  219. if err != nil {
  220. return nil, fmt.Errorf("icon: unable to read document: %v", err)
  221. }
  222. query := `link[rel='icon' i][href],
  223. link[rel='shortcut icon' i][href],
  224. link[rel='icon shortcut' i][href],
  225. link[rel='apple-touch-icon'][href]`
  226. var iconURLs []string
  227. slog.Debug("Searching icon URL in HTML document", slog.String("query", query))
  228. for _, s := range doc.Find(query).EachIter() {
  229. href, _ := s.Attr("href")
  230. href = strings.TrimSpace(href)
  231. if href == "" {
  232. continue
  233. }
  234. if absoluteIconURL, err := urllib.ResolveToAbsoluteURL(documentURL, href); err != nil {
  235. slog.Warn("Unable to convert icon URL to absolute URL", slog.Any("error", err), slog.String("icon_href", href))
  236. } else {
  237. iconURLs = append(iconURLs, absoluteIconURL)
  238. slog.Debug("Found icon URL in HTML document",
  239. slog.String("query", query),
  240. slog.String("icon_href", href),
  241. slog.String("absolute_icon_url", absoluteIconURL),
  242. )
  243. }
  244. }
  245. return iconURLs, nil
  246. }
  247. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  248. // data:[<mediatype>][;encoding],<data>
  249. // we consider <mediatype> to be mandatory, and it has to start with `image/`.
  250. // we consider `base64`, `utf8` and the empty string to be the only valid encodings
  251. func parseImageDataURL(value string) (*model.Icon, error) {
  252. re := regexp.MustCompile(`^data:` +
  253. `(?P<mediatype>image/[^;,]+)` +
  254. `(?:;(?P<encoding>base64|utf8))?` +
  255. `,(?P<data>.+)$`)
  256. matches := re.FindStringSubmatch(value)
  257. if matches == nil {
  258. return nil, fmt.Errorf(`icon: invalid data URL %q`, value)
  259. }
  260. mediaType := matches[re.SubexpIndex("mediatype")]
  261. encoding := matches[re.SubexpIndex("encoding")]
  262. data := matches[re.SubexpIndex("data")]
  263. var blob []byte
  264. switch encoding {
  265. case "base64":
  266. var err error
  267. blob, err = base64.StdEncoding.DecodeString(data)
  268. if err != nil {
  269. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  270. }
  271. case "":
  272. decodedData, err := url.QueryUnescape(data)
  273. if err != nil {
  274. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  275. }
  276. blob = []byte(decodedData)
  277. case "utf8":
  278. blob = []byte(data)
  279. }
  280. return &model.Icon{
  281. Hash: crypto.HashFromBytes(blob),
  282. Content: blob,
  283. MimeType: mediaType,
  284. }, nil
  285. }
  286. func ensureRemoteIconURLAllowed(iconURL string, allowPrivateNetworks bool) error {
  287. parsedURL, err := url.Parse(iconURL)
  288. if err != nil {
  289. return fmt.Errorf("icon: invalid icon URL %q: %w", iconURL, err)
  290. }
  291. if !parsedURL.IsAbs() {
  292. return fmt.Errorf("icon: icon URL %q must be absolute", iconURL)
  293. }
  294. scheme := strings.ToLower(parsedURL.Scheme)
  295. if scheme != "http" && scheme != "https" {
  296. return fmt.Errorf("icon: unsupported icon URL scheme %q", parsedURL.Scheme)
  297. }
  298. hostname := parsedURL.Hostname()
  299. if hostname == "" {
  300. return fmt.Errorf("icon: icon URL %q has no hostname", iconURL)
  301. }
  302. if allowPrivateNetworks {
  303. return nil
  304. }
  305. isPrivate, err := urllib.ResolvesToPrivateIP(hostname)
  306. if err != nil {
  307. return fmt.Errorf("icon: unable to resolve icon hostname %q: %w", hostname, err)
  308. }
  309. if isPrivate {
  310. return fmt.Errorf("icon: refusing to download icon from private network host %q", hostname)
  311. }
  312. return nil
  313. }