finder.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "log/slog"
  9. "net/url"
  10. "regexp"
  11. "strings"
  12. "miniflux.app/v2/internal/config"
  13. "miniflux.app/v2/internal/crypto"
  14. "miniflux.app/v2/internal/model"
  15. "miniflux.app/v2/internal/reader/fetcher"
  16. "miniflux.app/v2/internal/urllib"
  17. "github.com/PuerkitoBio/goquery"
  18. "golang.org/x/net/html/charset"
  19. )
  20. type IconFinder struct {
  21. requestBuilder *fetcher.RequestBuilder
  22. websiteURL string
  23. feedIconURL string
  24. }
  25. func NewIconFinder(requestBuilder *fetcher.RequestBuilder, websiteURL, feedIconURL string) *IconFinder {
  26. return &IconFinder{
  27. requestBuilder: requestBuilder,
  28. websiteURL: websiteURL,
  29. feedIconURL: feedIconURL,
  30. }
  31. }
  32. func (f *IconFinder) FindIcon() (*model.Icon, error) {
  33. slog.Debug("Begin icon discovery process",
  34. slog.String("website_url", f.websiteURL),
  35. slog.String("feed_icon_url", f.feedIconURL),
  36. )
  37. if f.feedIconURL != "" {
  38. if icon, err := f.FetchFeedIcon(); err != nil {
  39. slog.Debug("Unable to download icon from feed",
  40. slog.String("website_url", f.websiteURL),
  41. slog.String("feed_icon_url", f.feedIconURL),
  42. slog.Any("error", err),
  43. )
  44. } else if icon != nil {
  45. return icon, nil
  46. }
  47. }
  48. if icon, err := f.FetchIconsFromHTMLDocument(); err != nil {
  49. slog.Debug("Unable to fetch icons from HTML document",
  50. slog.String("website_url", f.websiteURL),
  51. slog.Any("error", err),
  52. )
  53. } else if icon != nil {
  54. return icon, nil
  55. }
  56. return f.FetchDefaultIcon()
  57. }
  58. func (f *IconFinder) FetchDefaultIcon() (*model.Icon, error) {
  59. slog.Debug("Fetching default icon",
  60. slog.String("website_url", f.websiteURL),
  61. )
  62. iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
  63. if err != nil {
  64. return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
  65. }
  66. icon, err := f.DownloadIcon(iconURL)
  67. if err != nil {
  68. return nil, err
  69. }
  70. return icon, nil
  71. }
  72. func (f *IconFinder) FetchFeedIcon() (*model.Icon, error) {
  73. slog.Debug("Fetching feed icon",
  74. slog.String("website_url", f.websiteURL),
  75. slog.String("feed_icon_url", f.feedIconURL),
  76. )
  77. iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL)
  78. if err != nil {
  79. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  80. }
  81. return f.DownloadIcon(iconURL)
  82. }
  83. func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
  84. slog.Debug("Searching icons from HTML document",
  85. slog.String("website_url", f.websiteURL),
  86. )
  87. rootURL := urllib.RootURL(f.websiteURL)
  88. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(rootURL))
  89. defer responseHandler.Close()
  90. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  91. return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
  92. }
  93. iconURLs, err := findIconURLsFromHTMLDocument(
  94. responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
  95. responseHandler.ContentType(),
  96. )
  97. if err != nil {
  98. return nil, err
  99. }
  100. slog.Debug("Searched icon from HTML document",
  101. slog.String("website_url", f.websiteURL),
  102. slog.String("icon_urls", strings.Join(iconURLs, ",")),
  103. )
  104. for _, iconURL := range iconURLs {
  105. if strings.HasPrefix(iconURL, "data:") {
  106. slog.Debug("Found icon with data URL",
  107. slog.String("website_url", f.websiteURL),
  108. )
  109. return parseImageDataURL(iconURL)
  110. }
  111. iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL)
  112. if err != nil {
  113. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  114. }
  115. if icon, err := f.DownloadIcon(iconURL); err != nil {
  116. slog.Debug("Unable to download icon from HTML document",
  117. slog.String("website_url", f.websiteURL),
  118. slog.String("icon_url", iconURL),
  119. slog.Any("error", err),
  120. )
  121. } else if icon != nil {
  122. slog.Debug("Found icon from HTML document",
  123. slog.String("website_url", f.websiteURL),
  124. slog.String("icon_url", iconURL),
  125. )
  126. return icon, nil
  127. }
  128. }
  129. return nil, nil
  130. }
  131. func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
  132. slog.Debug("Downloading icon",
  133. slog.String("website_url", f.websiteURL),
  134. slog.String("icon_url", iconURL),
  135. )
  136. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(iconURL))
  137. defer responseHandler.Close()
  138. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  139. return nil, fmt.Errorf("icon: unable to download website icon: %w", localizedError.Error())
  140. }
  141. responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
  142. if localizedError != nil {
  143. return nil, fmt.Errorf("icon: unable to read response body: %w", localizedError.Error())
  144. }
  145. icon := &model.Icon{
  146. Hash: crypto.HashFromBytes(responseBody),
  147. MimeType: responseHandler.ContentType(),
  148. Content: responseBody,
  149. }
  150. return icon, nil
  151. }
  152. func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) {
  153. queries := []string{
  154. "link[rel='icon' i]",
  155. "link[rel='shortcut icon' i]",
  156. "link[rel='icon shortcut' i]",
  157. "link[rel='apple-touch-icon-precomposed.png']",
  158. }
  159. htmlDocumentReader, err := charset.NewReader(body, contentType)
  160. if err != nil {
  161. return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
  162. }
  163. doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
  164. if err != nil {
  165. return nil, fmt.Errorf("icon: unable to read document: %v", err)
  166. }
  167. var iconURLs []string
  168. for _, query := range queries {
  169. slog.Debug("Searching icon URL in HTML document", slog.String("query", query))
  170. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  171. if href, exists := s.Attr("href"); exists {
  172. if iconURL := strings.TrimSpace(href); iconURL != "" {
  173. iconURLs = append(iconURLs, iconURL)
  174. slog.Debug("Found icon URL in HTML document",
  175. slog.String("query", query),
  176. slog.String("icon_url", iconURL))
  177. }
  178. }
  179. })
  180. }
  181. return iconURLs, nil
  182. }
  183. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  184. // data:[<mediatype>][;encoding],<data>
  185. // we consider <mediatype> to be mandatory, and it has to start with `image/`.
  186. // we consider `base64`, `utf8` and the empty string to be the only valid encodings
  187. func parseImageDataURL(value string) (*model.Icon, error) {
  188. re := regexp.MustCompile(`^data:` +
  189. `(?P<mediatype>image/[^;,]+)` +
  190. `(?:;(?P<encoding>base64|utf8))?` +
  191. `,(?P<data>.+)$`)
  192. matches := re.FindStringSubmatch(value)
  193. if matches == nil {
  194. return nil, fmt.Errorf(`icon: invalid data URL %q`, value)
  195. }
  196. mediaType := matches[re.SubexpIndex("mediatype")]
  197. encoding := matches[re.SubexpIndex("encoding")]
  198. data := matches[re.SubexpIndex("data")]
  199. var blob []byte
  200. switch encoding {
  201. case "base64":
  202. var err error
  203. blob, err = base64.StdEncoding.DecodeString(data)
  204. if err != nil {
  205. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  206. }
  207. case "":
  208. decodedData, err := url.QueryUnescape(data)
  209. if err != nil {
  210. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  211. }
  212. blob = []byte(decodedData)
  213. case "utf8":
  214. blob = []byte(data)
  215. }
  216. return &model.Icon{
  217. Hash: crypto.HashFromBytes(blob),
  218. Content: blob,
  219. MimeType: mediaType,
  220. }, nil
  221. }