finder.go 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package icon // import "miniflux.app/v2/internal/reader/icon"
  4. import (
  5. "encoding/base64"
  6. "fmt"
  7. "io"
  8. "log/slog"
  9. "net/url"
  10. "strings"
  11. "miniflux.app/v2/internal/config"
  12. "miniflux.app/v2/internal/crypto"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/encoding"
  15. "miniflux.app/v2/internal/reader/fetcher"
  16. "miniflux.app/v2/internal/urllib"
  17. "github.com/PuerkitoBio/goquery"
  18. )
  19. type IconFinder struct {
  20. requestBuilder *fetcher.RequestBuilder
  21. websiteURL string
  22. feedIconURL string
  23. }
  24. func NewIconFinder(requestBuilder *fetcher.RequestBuilder, websiteURL, feedIconURL string) *IconFinder {
  25. return &IconFinder{
  26. requestBuilder: requestBuilder,
  27. websiteURL: websiteURL,
  28. feedIconURL: feedIconURL,
  29. }
  30. }
  31. func (f *IconFinder) FindIcon() (*model.Icon, error) {
  32. slog.Debug("Begin icon discovery process",
  33. slog.String("website_url", f.websiteURL),
  34. slog.String("feed_icon_url", f.feedIconURL),
  35. )
  36. if f.feedIconURL != "" {
  37. if icon, err := f.FetchFeedIcon(); err != nil {
  38. slog.Debug("Unable to download icon from feed",
  39. slog.String("website_url", f.websiteURL),
  40. slog.String("feed_icon_url", f.feedIconURL),
  41. slog.Any("error", err),
  42. )
  43. } else if icon != nil {
  44. return icon, nil
  45. }
  46. }
  47. if icon, err := f.FetchIconsFromHTMLDocument(); err != nil {
  48. slog.Debug("Unable to fetch icons from HTML document",
  49. slog.String("website_url", f.websiteURL),
  50. slog.Any("error", err),
  51. )
  52. } else if icon != nil {
  53. return icon, nil
  54. }
  55. return f.FetchDefaultIcon()
  56. }
  57. func (f *IconFinder) FetchDefaultIcon() (*model.Icon, error) {
  58. slog.Debug("Fetching default icon",
  59. slog.String("website_url", f.websiteURL),
  60. )
  61. iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
  62. if err != nil {
  63. return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
  64. }
  65. icon, err := f.DownloadIcon(iconURL)
  66. if err != nil {
  67. return nil, err
  68. }
  69. return icon, nil
  70. }
  71. func (f *IconFinder) FetchFeedIcon() (*model.Icon, error) {
  72. slog.Debug("Fetching feed icon",
  73. slog.String("website_url", f.websiteURL),
  74. slog.String("feed_icon_url", f.feedIconURL),
  75. )
  76. iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL)
  77. if err != nil {
  78. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  79. }
  80. return f.DownloadIcon(iconURL)
  81. }
  82. func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
  83. slog.Debug("Searching icons from HTML document",
  84. slog.String("website_url", f.websiteURL),
  85. )
  86. rootURL := urllib.RootURL(f.websiteURL)
  87. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(rootURL))
  88. defer responseHandler.Close()
  89. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  90. return nil, fmt.Errorf("icon: unable to download website index page: %w", localizedError.Error())
  91. }
  92. iconURLs, err := findIconURLsFromHTMLDocument(
  93. responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
  94. responseHandler.ContentType(),
  95. )
  96. if err != nil {
  97. return nil, err
  98. }
  99. slog.Debug("Searched icon from HTML document",
  100. slog.String("website_url", f.websiteURL),
  101. slog.String("icon_urls", strings.Join(iconURLs, ",")),
  102. )
  103. for _, iconURL := range iconURLs {
  104. if strings.HasPrefix(iconURL, "data:") {
  105. slog.Debug("Found icon with data URL",
  106. slog.String("website_url", f.websiteURL),
  107. )
  108. return parseImageDataURL(iconURL)
  109. }
  110. iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL)
  111. if err != nil {
  112. return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
  113. }
  114. if icon, err := f.DownloadIcon(iconURL); err != nil {
  115. slog.Debug("Unable to download icon from HTML document",
  116. slog.String("website_url", f.websiteURL),
  117. slog.String("icon_url", iconURL),
  118. slog.Any("error", err),
  119. )
  120. } else if icon != nil {
  121. slog.Debug("Found icon from HTML document",
  122. slog.String("website_url", f.websiteURL),
  123. slog.String("icon_url", iconURL),
  124. )
  125. return icon, nil
  126. }
  127. }
  128. return nil, nil
  129. }
  130. func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
  131. slog.Debug("Downloading icon",
  132. slog.String("website_url", f.websiteURL),
  133. slog.String("icon_url", iconURL),
  134. )
  135. responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(iconURL))
  136. defer responseHandler.Close()
  137. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  138. return nil, fmt.Errorf("icon: unable to download website icon: %w", localizedError.Error())
  139. }
  140. responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
  141. if localizedError != nil {
  142. return nil, fmt.Errorf("icon: unable to read response body: %w", localizedError.Error())
  143. }
  144. icon := &model.Icon{
  145. Hash: crypto.HashFromBytes(responseBody),
  146. MimeType: responseHandler.ContentType(),
  147. Content: responseBody,
  148. }
  149. return icon, nil
  150. }
  151. func findIconURLsFromHTMLDocument(body io.Reader, contentType string) ([]string, error) {
  152. queries := []string{
  153. "link[rel='shortcut icon']",
  154. "link[rel='Shortcut Icon']",
  155. "link[rel='icon shortcut']",
  156. "link[rel='icon']",
  157. }
  158. htmlDocumentReader, err := encoding.CharsetReaderFromContentType(contentType, body)
  159. if err != nil {
  160. return nil, fmt.Errorf("icon: unable to create charset reader: %w", err)
  161. }
  162. doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
  163. if err != nil {
  164. return nil, fmt.Errorf("icon: unable to read document: %v", err)
  165. }
  166. var iconURLs []string
  167. for _, query := range queries {
  168. slog.Debug("Searching icon URL in HTML document", slog.String("query", query))
  169. doc.Find(query).Each(func(i int, s *goquery.Selection) {
  170. var iconURL string
  171. if href, exists := s.Attr("href"); exists {
  172. iconURL = strings.TrimSpace(href)
  173. }
  174. if iconURL != "" {
  175. iconURLs = append(iconURLs, iconURL)
  176. slog.Debug("Found icon URL in HTML document",
  177. slog.String("query", query),
  178. slog.String("icon_url", iconURL))
  179. }
  180. })
  181. }
  182. return iconURLs, nil
  183. }
  184. // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
  185. // data:[<mediatype>][;base64],<data>
  186. func parseImageDataURL(value string) (*model.Icon, error) {
  187. var mediaType string
  188. var encoding string
  189. if !strings.HasPrefix(value, "data:") {
  190. return nil, fmt.Errorf(`icon: invalid data URL (missing data:) %q`, value)
  191. }
  192. value = value[5:]
  193. comma := strings.Index(value, ",")
  194. if comma < 0 {
  195. return nil, fmt.Errorf(`icon: invalid data URL (no comma) %q`, value)
  196. }
  197. data := value[comma+1:]
  198. semicolon := strings.Index(value[0:comma], ";")
  199. if semicolon > 0 {
  200. mediaType = value[0:semicolon]
  201. encoding = value[semicolon+1 : comma]
  202. } else {
  203. mediaType = value[0:comma]
  204. }
  205. if !strings.HasPrefix(mediaType, "image/") {
  206. return nil, fmt.Errorf(`icon: invalid media type %q`, mediaType)
  207. }
  208. var blob []byte
  209. switch encoding {
  210. case "base64":
  211. var err error
  212. blob, err = base64.StdEncoding.DecodeString(data)
  213. if err != nil {
  214. return nil, fmt.Errorf(`icon: invalid data %q (%v)`, value, err)
  215. }
  216. case "":
  217. decodedData, err := url.QueryUnescape(data)
  218. if err != nil {
  219. return nil, fmt.Errorf(`icon: unable to decode data URL %q`, value)
  220. }
  221. blob = []byte(decodedData)
  222. case "utf8":
  223. blob = []byte(data)
  224. default:
  225. return nil, fmt.Errorf(`icon: unsupported data URL encoding %q`, value)
  226. }
  227. if len(blob) == 0 {
  228. return nil, fmt.Errorf(`icon: empty data URL %q`, value)
  229. }
  230. icon := &model.Icon{
  231. Hash: crypto.HashFromBytes(blob),
  232. Content: blob,
  233. MimeType: mediaType,
  234. }
  235. return icon, nil
  236. }