Browse Source

Refactor icon finder

Changes:

- Continue the discovery process when the feed icon is invalid
- Search all icons from the HTML document and do not stop on the first one
Frédéric Guillot 2 years ago
parent
commit
9fd2dfa680
3 changed files with 170 additions and 111 deletions
  1. 2 2
      internal/reader/handler/handler.go
  2. 163 61
      internal/reader/icon/finder.go
  3. 5 48
      internal/reader/icon/finder_test.go

+ 2 - 2
internal/reader/handler/handler.go

@@ -241,8 +241,8 @@ func RefreshFeed(store *storage.Storage, userID, feedID int64, forceRefresh bool
 
 func checkFeedIcon(store *storage.Storage, feedID int64, websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) {
 	if !store.HasIcon(feedID) {
-		icon, err := icon.FindIcon(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
-		if err != nil {
+		iconFinder := icon.NewIconFinder(websiteURL, feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
+		if icon, err := iconFinder.FindIcon(); err != nil {
 			slog.Warn("Unable to find feed icon",
 				slog.Int64("feed_id", feedID),
 				slog.String("website_url", websiteURL),

+ 163 - 61
internal/reader/icon/finder.go

@@ -7,6 +7,7 @@ import (
 	"encoding/base64"
 	"fmt"
 	"io"
+	"log/slog"
 	"net/url"
 	"strings"
 
@@ -19,106 +20,170 @@ import (
 	"github.com/PuerkitoBio/goquery"
 )
 
-// FindIcon try to find the website's icon.
-func FindIcon(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (icon *model.Icon, err error) {
-	if feedIconURL == "" {
-		feedIconURL, err = fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent, fetchViaProxy, allowSelfSignedCertificates)
-		if err != nil {
-			return nil, err
+type IconFinder struct {
+	websiteURL                  string
+	feedIconURL                 string
+	userAgent                   string
+	fetchViaProxy               bool
+	allowSelfSignedCertificates bool
+}
+
+func NewIconFinder(websiteURL, feedIconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) *IconFinder {
+	return &IconFinder{
+		websiteURL:                  websiteURL,
+		feedIconURL:                 feedIconURL,
+		userAgent:                   userAgent,
+		fetchViaProxy:               fetchViaProxy,
+		allowSelfSignedCertificates: allowSelfSignedCertificates,
+	}
+}
+
+func (f *IconFinder) FindIcon() (*model.Icon, error) {
+	slog.Debug("Begin icon discovery process",
+		slog.String("website_url", f.websiteURL),
+		slog.String("feed_icon_url", f.feedIconURL),
+	)
+
+	if f.feedIconURL != "" {
+		if icon, err := f.FetchFeedIcon(); err != nil {
+			slog.Debug("Unable to download icon from feed",
+				slog.String("website_url", f.websiteURL),
+				slog.String("feed_icon_url", f.feedIconURL),
+				slog.Any("error", err),
+			)
+		} else if icon != nil {
+			return icon, nil
 		}
 	}
 
-	if strings.HasPrefix(feedIconURL, "data:") {
-		return parseImageDataURL(feedIconURL)
+	if icon, err := f.FetchIconsFromHTMLDocument(); err != nil {
+		slog.Debug("Unable to fetch icons from HTML document",
+			slog.String("website_url", f.websiteURL),
+			slog.Any("error", err),
+		)
+	} else if icon != nil {
+		return icon, nil
 	}
 
-	feedIconURL, err = generateIconURL(websiteURL, feedIconURL)
+	return f.FetchDefaultIcon()
+}
+
+func (f *IconFinder) FetchDefaultIcon() (*model.Icon, error) {
+	slog.Debug("Fetching default icon",
+		slog.String("website_url", f.websiteURL),
+	)
+
+	iconURL, err := urllib.JoinBaseURLAndPath(urllib.RootURL(f.websiteURL), "favicon.ico")
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf(`icon: unable to join root URL and path: %w`, err)
 	}
 
-	if icon, err = downloadIcon(feedIconURL, userAgent, fetchViaProxy, allowSelfSignedCertificates); err != nil {
+	icon, err := f.DownloadIcon(iconURL)
+	if err != nil {
 		return nil, err
 	}
 
 	return icon, nil
 }
 
-func generateIconURL(websiteURL, feedIconURL string) (iconURL string, err error) {
-	feedIconURL = strings.TrimSpace(feedIconURL)
+func (f *IconFinder) FetchFeedIcon() (*model.Icon, error) {
+	slog.Debug("Fetching feed icon",
+		slog.String("website_url", f.websiteURL),
+		slog.String("feed_icon_url", f.feedIconURL),
+	)
 
-	if feedIconURL == "" {
-		iconURL, err = urllib.JoinBaseURLAndPath(urllib.RootURL(websiteURL), "favicon.ico")
-		if err != nil {
-			return "", fmt.Errorf(`icon: unable to join base URL and path: %w`, err)
-		}
-	} else {
-		iconURL, err = urllib.AbsoluteURL(websiteURL, feedIconURL)
-		if err != nil {
-			return "", fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
-		}
+	iconURL, err := urllib.AbsoluteURL(f.websiteURL, f.feedIconURL)
+	if err != nil {
+		return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
 	}
 
-	return iconURL, nil
+	return f.DownloadIcon(iconURL)
 }
 
-func fetchHTMLDocumentAndFindIconURL(websiteURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (string, error) {
-	rootURL := urllib.RootURL(websiteURL)
+func (f *IconFinder) FetchIconsFromHTMLDocument() (*model.Icon, error) {
+	slog.Debug("Searching icons from HTML document",
+		slog.String("website_url", f.websiteURL),
+	)
 
-	clt := client.NewClientWithConfig(rootURL, config.Opts)
-	clt.WithUserAgent(userAgent)
-	clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
-
-	if fetchViaProxy {
-		clt.WithProxy()
+	documentBody, err := f.FetchRootDocument()
+	if err != nil {
+		return nil, err
 	}
 
-	response, err := clt.Get()
+	iconURLs, err := findIconURLsFromHTMLDocument(documentBody)
 	if err != nil {
-		return "", fmt.Errorf("icon: unable to download website index page: %v", err)
+		return nil, err
 	}
 
-	if response.HasServerFailure() {
-		return "", fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
+	slog.Debug("Searched icon from HTML document",
+		slog.String("website_url", f.websiteURL),
+		slog.String("icon_urls", strings.Join(iconURLs, ",")),
+	)
+
+	for _, iconURL := range iconURLs {
+		if strings.HasPrefix(iconURL, "data:") {
+			slog.Debug("Found icon with data URL",
+				slog.String("website_url", f.websiteURL),
+			)
+			return parseImageDataURL(iconURL)
+		}
+
+		iconURL, err = urllib.AbsoluteURL(f.websiteURL, iconURL)
+		if err != nil {
+			return nil, fmt.Errorf(`icon: unable to convert icon URL to absolute URL: %w`, err)
+		}
+
+		if icon, err := f.DownloadIcon(iconURL); err != nil {
+			slog.Debug("Unable to download icon from HTML document",
+				slog.String("website_url", f.websiteURL),
+				slog.String("icon_url", iconURL),
+				slog.Any("error", err),
+			)
+		} else if icon != nil {
+			slog.Debug("Found icon from HTML document",
+				slog.String("website_url", f.websiteURL),
+				slog.String("icon_url", iconURL),
+			)
+			return icon, nil
+		}
 	}
 
-	return findIconURLFromHTMLDocument(response.Body)
+	return nil, nil
 }
 
-func findIconURLFromHTMLDocument(body io.Reader) (string, error) {
-	queries := []string{
-		"link[rel='shortcut icon']",
-		"link[rel='Shortcut Icon']",
-		"link[rel='icon shortcut']",
-		"link[rel='icon']",
+func (f *IconFinder) FetchRootDocument() (io.Reader, error) {
+	rootURL := urllib.RootURL(f.websiteURL)
+
+	clt := client.NewClientWithConfig(rootURL, config.Opts)
+	clt.WithUserAgent(f.userAgent)
+	clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates
+
+	if f.fetchViaProxy {
+		clt.WithProxy()
 	}
 
-	doc, err := goquery.NewDocumentFromReader(body)
+	response, err := clt.Get()
 	if err != nil {
-		return "", fmt.Errorf("icon: unable to read document: %v", err)
+		return nil, fmt.Errorf("icon: unable to download website index page: %v", err)
 	}
 
-	var iconURL string
-	for _, query := range queries {
-		doc.Find(query).Each(func(i int, s *goquery.Selection) {
-			if href, exists := s.Attr("href"); exists {
-				iconURL = strings.TrimSpace(href)
-			}
-		})
-
-		if iconURL != "" {
-			break
-		}
+	if response.HasServerFailure() {
+		return nil, fmt.Errorf("icon: unable to download website index page: status=%d", response.StatusCode)
 	}
 
-	return iconURL, nil
+	return response.Body, nil
 }
 
-func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCertificates bool) (*model.Icon, error) {
+func (f *IconFinder) DownloadIcon(iconURL string) (*model.Icon, error) {
+	slog.Debug("Downloading icon",
+		slog.String("website_url", f.websiteURL),
+		slog.String("icon_url", iconURL),
+	)
+
 	clt := client.NewClientWithConfig(iconURL, config.Opts)
-	clt.WithUserAgent(userAgent)
-	clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
-	if fetchViaProxy {
+	clt.WithUserAgent(f.userAgent)
+	clt.AllowSelfSignedCertificates = f.allowSelfSignedCertificates
+	if f.fetchViaProxy {
 		clt.WithProxy()
 	}
 
@@ -149,6 +214,43 @@ func downloadIcon(iconURL, userAgent string, fetchViaProxy, allowSelfSignedCerti
 	return icon, nil
 }
 
+func findIconURLsFromHTMLDocument(body io.Reader) ([]string, error) {
+	queries := []string{
+		"link[rel='shortcut icon']",
+		"link[rel='Shortcut Icon']",
+		"link[rel='icon shortcut']",
+		"link[rel='icon']",
+	}
+
+	doc, err := goquery.NewDocumentFromReader(body)
+	if err != nil {
+		return nil, fmt.Errorf("icon: unable to read document: %v", err)
+	}
+
+	var iconURLs []string
+	for _, query := range queries {
+		slog.Debug("Searching icon URL in HTML document", slog.String("query", query))
+
+		doc.Find(query).Each(func(i int, s *goquery.Selection) {
+			var iconURL string
+
+			if href, exists := s.Attr("href"); exists {
+				iconURL = strings.TrimSpace(href)
+			}
+
+			if iconURL != "" {
+				iconURLs = append(iconURLs, iconURL)
+
+				slog.Debug("Found icon URL in HTML document",
+					slog.String("query", query),
+					slog.String("icon_url", iconURL))
+			}
+		})
+	}
+
+	return iconURLs, nil
+}
+
 // https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs#syntax
 // data:[<mediatype>][;base64],<data>
 func parseImageDataURL(value string) (*model.Icon, error) {

+ 5 - 48
internal/reader/icon/finder_test.go

@@ -112,59 +112,16 @@ func TestParseDocumentWithWhitespaceIconURL(t *testing.T) {
 		/static/img/favicon.ico
 	">`
 
-	iconURL, err := findIconURLFromHTMLDocument(strings.NewReader(html))
+	iconURLs, err := findIconURLsFromHTMLDocument(strings.NewReader(html))
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	if iconURL != "/static/img/favicon.ico" {
-		t.Errorf(`Invalid icon URL, got %q`, iconURL)
-	}
-}
-
-func TestGenerateIconURL(t *testing.T) {
-	iconURL, err := generateIconURL("https://example.org/", "/favicon.png")
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if iconURL != "https://example.org/favicon.png" {
-		t.Errorf(`Invalid icon URL, got %q`, iconURL)
-	}
-
-	iconURL, err = generateIconURL("https://example.org/", "img/favicon.png")
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if iconURL != "https://example.org/img/favicon.png" {
-		t.Errorf(`Invalid icon URL, got %q`, iconURL)
-	}
-
-	iconURL, err = generateIconURL("https://example.org/", "https://example.org/img/favicon.png")
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if iconURL != "https://example.org/img/favicon.png" {
-		t.Errorf(`Invalid icon URL, got %q`, iconURL)
-	}
-
-	iconURL, err = generateIconURL("https://example.org/", "//example.org/img/favicon.png")
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if iconURL != "https://example.org/img/favicon.png" {
-		t.Errorf(`Invalid icon URL, got %q`, iconURL)
-	}
-
-	iconURL, err = generateIconURL("https://example.org/", "  ")
-	if err != nil {
-		t.Fatal(err)
+	if len(iconURLs) != 1 {
+		t.Fatalf(`Invalid number of icon URLs, got %d`, len(iconURLs))
 	}
 
-	if iconURL != "https://example.org/favicon.ico" {
-		t.Errorf(`Invalid icon URL, got %q`, iconURL)
+	if iconURLs[0] != "/static/img/favicon.ico" {
+		t.Errorf(`Invalid icon URL, got %q`, iconURLs[0])
 	}
 }