Browse Source

Merge pull request #1504 from Alkarex/new-favicons

Rewritten Favicon library using cURL
Alexandre Alapetite 9 years ago
parent
commit
b95a2c192e
6 changed files with 98 additions and 479 deletions
  1. 1 1
      CHANGELOG.md
  2. 0 43
      lib/Favicon/DataAccess.php
  3. 0 396
      lib/Favicon/Favicon.php
  4. 0 23
      lib/Favicon/FaviconDLType.php
  5. 96 14
      lib/favicons.php
  6. 1 2
      p/f.php

+ 1 - 1
CHANGELOG.md

@@ -23,7 +23,7 @@
 	* Improve English [#1465](https://github.com/FreshRSS/FreshRSS/pull/1465)
 * Misc.
 	* Fall back to article URL when the article GUID is empty [#1482](https://github.com/FreshRSS/FreshRSS/issues/1482)
-	* Update to version 1.2 of Favicon library [#1501](https://github.com/FreshRSS/FreshRSS/issues/1501) 
+	* Rewritten Favicon library using cURL [#1504](https://github.com/FreshRSS/FreshRSS/pull/1504)
 
 
 ## 2017-03-11 FreshRSS 1.6.3

+ 0 - 43
lib/Favicon/DataAccess.php

@@ -1,43 +0,0 @@
-<?php
-
-namespace Favicon;
-
-/**
- * DataAccess is a wrapper used to read/write data locally or remotly
- * Aside from SOLID principles, this wrapper is also useful to mock remote resources in unit tests
- * Note: remote access warning are silenced because we don't care if a website is unreachable
- **/
-class DataAccess {
-	public function retrieveUrl($url) {
-	    $this->set_context();
-	    return @file_get_contents($url);
-	}
-	
-	public function retrieveHeader($url) {
-	    $this->set_context();
-		$headers = @get_headers($url, 1);
-		return is_array($headers) ? array_change_key_case($headers) : array();
-	}
-	
-    public function saveCache($file, $data) {
-        file_put_contents($file, $data);
-    }
-    
-    public function readCache($file) {
-    	return file_get_contents($file);
-    }
-    
-    private function set_context() {
-        stream_context_set_default(
-            array(
-                'http' => array(
-                    'method' => 'GET',
-                    'follow_location' => 0,
-                    'max_redirects' => 1,
-                    'timeout' => 10,
-                    'header' => "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:20.0; Favicon; +https://github.com/ArthurHoaro/favicon) Gecko/20100101 Firefox/32.0\r\n",
-                )
-            )
-        );
-    }
-}

+ 0 - 396
lib/Favicon/Favicon.php

@@ -1,396 +0,0 @@
-<?php
-
-namespace Favicon;
-
-class Favicon
-{
-    protected static $TYPE_CACHE_URL = 'url';
-    protected static $TYPE_CACHE_IMG = 'img';
-    protected $url = '';
-    protected $cacheDir;
-    protected $cacheTimeout;
-    protected $dataAccess;
-
-    public function __construct($args = array())
-    {
-        if (isset($args['url'])) {
-            $this->url = $args['url'];
-        }
-        
-        $this->cacheDir = __DIR__ . '/../../resources/cache';
-        $this->cacheTimeout = 604800;
-        $this->dataAccess = new DataAccess();
-    }
-
-    /**
-     * Set cache settings:
-     *   - dir: cache directory
-     *   - timeout: in seconds
-     *
-     * @param array $args
-     */
-    public function cache($args = array()) {
-        if (isset($args['dir'])) {
-            $this->cacheDir = $args['dir'];
-        }
-
-        if (!empty($args['timeout'])) {
-            $this->cacheTimeout = $args['timeout'];
-        }
-    }
-
-    public static function baseUrl($url, $path = false)
-    {
-        $return = '';
-
-        if (!$url = parse_url($url)) {
-            return FALSE;
-        }
-
-        // Scheme
-        $scheme = isset($url['scheme']) ? strtolower($url['scheme']) : null;
-        if ($scheme != 'http' && $scheme != 'https') {
-
-            return FALSE;
-        }
-        $return .= "{$scheme}://";
-
-        // Username and password
-        if (isset($url['user'])) {
-            $return .= $url['user'];
-            if (isset($url['pass'])) {
-                $return .= ":{$url['pass']}";
-            }
-            $return .= '@';
-        }
-
-        // Hostname
-        if( !isset($url['host']) ) {
-            return FALSE;
-        }
-        
-        $return .= $url['host'];
-
-        // Port
-        if (isset($url['port'])) {
-            $return .= ":{$url['port']}";
-        }
-
-        // Path
-        if( $path && isset($url['path']) ) {
-            $return .= $url['path'];
-        }
-        $return .= '/';
-
-        return $return;    
-    }
-
-    public function info($url)
-    {
-        if(empty($url) || $url === false) {
-            return false;
-        }
-        
-        $max_loop = 5;
-        
-        // Discover real status by following redirects. 
-        $loop = TRUE;
-        while ($loop && $max_loop-- > 0) {
-            $headers = $this->dataAccess->retrieveHeader($url);
-            if (empty($headers)) {
-                return false;
-            }
-            $exploded = explode(' ', $headers[0]);
-            
-            if( !isset($exploded[1]) ) { 
-                return false;
-            }
-            list(,$status) = $exploded;
-            
-            switch ($status) {
-                case '301':
-                case '302':
-                    $url = isset($headers['location']) ? $headers['location'] : '';
-                    if (is_array($url)) {
-                        $url = end($url);
-                    }
-                    break;
-                default:
-                    $loop = FALSE;
-                    break;
-            }
-        }
-
-        return array('status' => $status, 'url' => $url);
-    }
-    
-    public function endRedirect($url) {
-        $out = $this->info($url);
-        return !empty($out['url']) ? $out['url'] : false;
-    }
-
-    /**
-     * Find remote (or cached) favicon
-     *
-     * @param string $url  to look for a favicon
-     * @param int    $type type of retrieval (FaviconDLType):
-     *                       - HOTLINK_URL: returns remote URL
-     *                       - DL_FILE_PATH: returns file path of the favicon downloaded locally
-     *                       - RAW_IMAGE: returns the favicon image binary string
-     *
-     * @return string|bool favicon URL, false if nothing was found
-     */
-    public function get($url = '', $type = FaviconDLType::HOTLINK_URL)
-    {
-        // URLs passed to this method take precedence.
-        if (!empty($url)) {
-            $this->url = $url;
-        }
-
-        // Get the base URL without the path for clearer concatenations.
-        $url = rtrim($this->baseUrl($this->url, true), '/');
-        $original = $url;
-        if (($favicon = $this->checkCache($original, self::$TYPE_CACHE_URL)) === false
-            && ! $favicon = $this->getFavicon($original, false)
-        ) {
-            $url = rtrim($this->endRedirect($this->baseUrl($this->url, false)), '/');
-            if (($favicon = $this->checkCache($url, self::$TYPE_CACHE_URL)) === false
-                && ! $favicon = $this->getFavicon($url)
-            ) {
-                $url = $original;
-            }
-        }
-
-        $this->saveCache($url, $favicon, self::$TYPE_CACHE_URL);
-
-        switch ($type) {
-            case FaviconDLType::DL_FILE_PATH:
-                return $this->getImage($url, $favicon, false);
-            case FaviconDLType::RAW_IMAGE:
-                return $this->getImage($url, $favicon, true);
-            case FaviconDLType::HOTLINK_URL:
-            default:
-                return empty($favicon) ? false : $favicon;
-        }
-    }
-    
-    private function getFavicon($url, $checkDefault = true) {
-        $favicon = false;
-        
-        if(empty($url)) {
-            return false;
-        }
-        
-        // Try /favicon.ico first.
-        if( $checkDefault ) {
-            $info = $this->info("{$url}/favicon.ico");
-            if ($info['status'] == '200') {
-                $favicon = $info['url'];
-            }
-        }
-
-        // See if it's specified in a link tag in domain url.
-        if (!$favicon) {
-            $favicon = trim($this->getInPage($url));
-        }
-        if (substr($favicon, 0, 2) === '//') {
-            $favicon = 'https:' . $favicon;
-        }
-        
-        // Make sure the favicon is an absolute URL.
-        if( $favicon && filter_var($favicon, FILTER_VALIDATE_URL) === false ) {
-            $favicon = $url . '/' . $favicon;
-        }
-
-        // Sometimes people lie, so check the status.
-        // And sometimes, it's not even an image. Sneaky bastards!
-        // If cacheDir isn't writable, that's not our problem
-        if ($favicon && is_writable($this->cacheDir) && extension_loaded('fileinfo') && !$this->checkImageMType($favicon)) {
-            $favicon = false;
-        }
-
-        return $favicon;
-    }
-
-    /**
-     * Find remote favicon and return it as an image
-     */
-    private function getImage($url, $faviconUrl = '', $image = false)
-    {
-        if (empty($faviconUrl)) {
-            return false;
-        }
-
-        $favicon = $this->checkCache($url, self::$TYPE_CACHE_IMG);
-        // Favicon not found in the cache
-        if( $favicon === false ) {
-            $favicon = $this->dataAccess->retrieveUrl($faviconUrl);
-            // Definitely not found
-            if (!$this->checkImageMTypeContent($favicon)) {
-                return false;
-            } else {
-                $this->saveCache($url, $favicon, self::$TYPE_CACHE_IMG);
-            }
-        }
-
-        if( $image ) {
-            return $favicon;
-        }
-        else
-            return self::$TYPE_CACHE_IMG . md5($url);
-    }
-
-    /**
-     * Display data as a PNG Favicon, then exit
-     * @param $data
-     */
-    private function displayFavicon($data) {
-        header('Content-Type: image/png');
-        header('Cache-Control: private, max-age=10800, pre-check=10800');
-        header('Pragma: private');
-        header('Expires: ' . date(DATE_RFC822,strtotime('7 day')));
-        echo $data;
-        exit;
-    }
-
-    private function getInPage($url) {
-        $html = $this->dataAccess->retrieveUrl("{$url}/");
-        preg_match('!<head.*?>.*</head>!ims', $html, $match);
-        
-        if(empty($match) || count($match) == 0) {
-            return false;
-        }
-        
-        $head = $match[0];
-        
-        $dom = new \DOMDocument();
-        // Use error suppression, because the HTML might be too malformed.
-        if (@$dom->loadHTML($head)) {
-            $links = $dom->getElementsByTagName('link');
-            foreach ($links as $link) {
-                if ($link->hasAttribute('rel') && strtolower($link->getAttribute('rel')) == 'shortcut icon') {
-                    return $link->getAttribute('href');
-                }
-            }
-            foreach ($links as $link) {
-                if ($link->hasAttribute('rel') && strtolower($link->getAttribute('rel')) == 'icon') {
-                    return $link->getAttribute('href');
-                }
-            }
-            foreach ($links as $link) {
-                if ($link->hasAttribute('href') && strpos($link->getAttribute('href'), 'favicon') !== FALSE) {
-                    return $link->getAttribute('href');
-                }
-            }
-        }
-        return false;
-    }
-
-    private function checkCache($url, $type) {
-        if ($this->cacheTimeout) {
-            $cache = $this->cacheDir . '/'. $type . md5($url);
-            if (file_exists($cache) && is_readable($cache)
-                && ($this->cacheTimeout === -1 || time() - filemtime($cache) < $this->cacheTimeout)
-            ) {
-                return $this->dataAccess->readCache($cache);
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Will save data in cacheDir if the directory writable and any previous cache is expired (cacheTimeout)
-     * @param $url
-     * @param $data
-     * @param $type
-     * @return string cache file path
-     */
-    private function saveCache($url, $data, $type) {
-        // Save cache if necessary
-        $cache = $this->cacheDir . '/'. $type . md5($url);
-        if ($this->cacheTimeout && !file_exists($cache)
-            || (is_writable($cache) && $this->cacheTimeout !== -1 && time() - filemtime($cache) > $this->cacheTimeout)
-        ) {
-            $this->dataAccess->saveCache($cache, $data);
-        }
-        return $cache;
-    }
-
-    private function checkImageMType($url) {
-        
-        $fileContent = $this->dataAccess->retrieveUrl($url);
-        
-        return $this->checkImageMTypeContent($fileContent);
-    }
-
-    private function checkImageMTypeContent($content) {
-        if(empty($content)) return false;
-
-        $isImage = true;
-        try {
-            $fInfo = finfo_open(FILEINFO_MIME_TYPE);
-            $isImage = strpos(finfo_buffer($fInfo, $content), 'image') !== false;
-            finfo_close($fInfo);
-        } catch (\Exception $e) {
-            error_log('Favicon checkImageMTypeContent error: ' . $e->getMessage());
-        }
-
-        return $isImage;
-    }
-    
-    /**
-     * @return mixed
-     */
-    public function getCacheDir()
-    {
-        return $this->cacheDir;
-    }
-
-    /**
-     * @param mixed $cacheDir
-     */
-    public function setCacheDir($cacheDir)
-    {
-        $this->cacheDir = $cacheDir;
-    }
-
-    /**
-     * @return mixed
-     */
-    public function getCacheTimeout()
-    {
-        return $this->cacheTimeout;
-    }
-
-    /**
-     * @param mixed $cacheTimeout
-     */
-    public function setCacheTimeout($cacheTimeout)
-    {
-        $this->cacheTimeout = $cacheTimeout;
-    }
-
-    /**
-     * @return string
-     */
-    public function getUrl()
-    {
-        return $this->url;
-    }
-
-    /**
-     * @param string $url
-     */
-    public function setUrl($url)
-    {
-        $this->url = $url;
-    }
-
-    /**
-     * @param DataAccess|\PHPUnit_Framework_MockObject_MockObject $dataAccess
-     */
-    public function setDataAccess($dataAccess)
-    {
-        $this->dataAccess = $dataAccess;
-    }
-}

+ 0 - 23
lib/Favicon/FaviconDLType.php

@@ -1,23 +0,0 @@
-<?php
-
-
-namespace Favicon;
-
-
-interface FaviconDLType
-{
-    /**
-     * Retrieve remote favicon URL.
-     */
-    const HOTLINK_URL = 0;
-
-    /**
-     * Retrieve downloaded favicon path (requires cache).
-     */
-    const DL_FILE_PATH = 1;
-
-    /**
-     * Retrieve the image content as a binary string.
-     */
-    const RAW_IMAGE = 2;
-}

+ 96 - 14
lib/favicons.php

@@ -1,22 +1,104 @@
 <?php
-
-include(LIB_PATH . '/Favicon/FaviconDLType.php');
-include(LIB_PATH . '/Favicon/DataAccess.php');
-include(LIB_PATH . '/Favicon/Favicon.php');
-
 $favicons_dir = DATA_PATH . '/favicons/';
 $default_favicon = PUBLIC_PATH . '/themes/icons/default_favicon.ico';
 
-function download_favicon($website, $dest) {
-	global $default_favicon;
+/**
+ * Tell whether a binary string looks like an image, based on its detected MIME type.
+ *
+ * Best-effort check: when the content cannot be inspected (fileinfo extension missing,
+ * or the fileinfo database cannot be opened) the content is accepted.
+ *
+ * @param string $content raw bytes to inspect
+ * @return bool false for empty content or a detected non-image MIME type, true otherwise
+ */
+function isImgMime($content) {
+	//Based on https://github.com/ArthurHoaro/favicon/blob/3a4f93da9bb24915b21771eb7873a21bde26f5d1/src/Favicon/Favicon.php#L311-L319
+	if($content == '') {
+		return false;
+	}
+	if (!extension_loaded('fileinfo')) {
+		return true;
+	}
+	// The finfo_* functions report failure by returning false rather than by throwing,
+	// so the result is checked explicitly instead of wrapping the calls in try/catch.
+	$fInfo = finfo_open(FILEINFO_MIME_TYPE);
+	if ($fInfo === false) {
+		return true;	//Cannot inspect: same leniency as when fileinfo is not loaded
+	}
+	$mime = finfo_buffer($fInfo, $content);
+	finfo_close($fInfo);
+	return is_string($mime) && strpos($mime, 'image') !== false;
+}
 
-	syslog(LOG_INFO, 'FreshRSS Favicon discovery GET ' . $website);
-	$favicon_getter = new \Favicon\Favicon();
-	$tmpPath = realpath(TMP_PATH);
-	$favicon_getter->setCacheDir($tmpPath);
-	$favicon_getter->setCacheTimeout(-1);
-	$favicon_path = $favicon_getter->get($website, \Favicon\FaviconDLType::DL_FILE_PATH);
+/**
+ * Download a resource over HTTP(S) with cURL, following redirects.
+ *
+ * @param string $url URL to fetch, passed by reference: a protocol-relative URL (`//host/path`)
+ *                    is rewritten to use `https:`, and after the transfer it is replaced by the
+ *                    effective URL reported by cURL when that URL is valid (possible redirect).
+ * @param array $curlOptions extra CURLOPT_* options, applied last so they override the defaults
+ * @return string the response body when the final HTTP status is 200, otherwise ''
+ */
+function downloadHttp(&$url, $curlOptions = array()) {
+	syslog(LOG_INFO, 'FreshRSS Favicon GET ' . $url);
+	if (substr($url, 0, 2) === '//') {
+		//Protocol-relative URL: prefix the URL itself with a scheme
+		$url = 'https:' . $url;
+	}
+	if ($url == '' || filter_var($url, FILTER_VALIDATE_URL) === false) {
+		return '';
+	}
+	$ch = curl_init($url);
+	if ($ch === false) {
+		return '';
+	}
+	curl_setopt_array($ch, array(
+			CURLOPT_FOLLOWLOCATION => true,
+			CURLOPT_MAXREDIRS => 10,
+			CURLOPT_RETURNTRANSFER => true,
+			CURLOPT_TIMEOUT => 15,
+			CURLOPT_USERAGENT => 'FreshRSS/' . FRESHRSS_VERSION . ' (' . PHP_OS . '; ' . FRESHRSS_WEBSITE . ')',
+		));
+	if (defined('CURLOPT_ENCODING')) {
+		curl_setopt($ch, CURLOPT_ENCODING, '');	//Enable all encodings
+	}
+	curl_setopt_array($ch, $curlOptions);
+	$response = curl_exec($ch);
+	$info = curl_getinfo($ch);
+	curl_close($ch);
+	if (!empty($info['url']) && (filter_var($info['url'], FILTER_VALIDATE_URL) !== false)) {
+		$url = $info['url'];	//Possible redirect
+	}
+	//curl_exec() returns false on failure: never leak that to callers expecting a string
+	return ($info['http_code'] == 200 && is_string($response)) ? $response : '';
+}
+
+/**
+ * Download the page at $url and return the first icon it declares that is a valid image.
+ *
+ * `<link>` elements are scanned for rel="shortcut icon" first, then rel="icon".
+ * Each matching href is downloaded (with the page as referer) and kept only
+ * if isImgMime() accepts the downloaded bytes.
+ *
+ * @param string $url page URL, passed by reference because downloadHttp() may update it
+ * @return string the icon bytes, or '' when no usable icon was found
+ */
+function searchFavicon(&$url) {
+	$document = new DOMDocument();
+	$page = downloadHttp($url);
+	if ($page == '' || !@$document->loadHTML($page, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING)) {
+		return '';
+	}
+	$linkNodes = $document->getElementsByTagName('link');
+	foreach (array('shortcut icon', 'icon') as $wantedRel) {
+		foreach ($linkNodes as $node) {
+			if (!$node->hasAttribute('rel') || !$node->hasAttribute('href')) {
+				continue;
+			}
+			if (strtolower(trim($node->getAttribute('rel'))) !== $wantedRel) {
+				continue;
+			}
+			$target = trim($node->getAttribute('href'));
+			if (substr($target, 0, 2) === '//') {
+				$target = 'https:' . $target;
+			}
+			if (filter_var($target, FILTER_VALIDATE_URL) === false) {
+				//Not an absolute URL: resolve it against the page URL
+				$target = SimplePie_IRI::absolutize($url, $target);
+			}
+			$icon = downloadHttp($target, array(
+					CURLOPT_REFERER => $url,
+				));
+			if (isImgMime($icon)) {
+				return $icon;
+			}
+		}
+	}
+	return '';
+}
 
-	return ($favicon_path != false && @rename($tmpPath . '/' . $favicon_path, $dest)) ||
+/**
+ * Find a favicon for a website and write it to $dest.
+ *
+ * Lookup order: icons declared by the page at $url, then icons declared by the
+ * site root page (when it differs from $url), then `favicon.ico` at the site root.
+ * When nothing usable is found or the write fails, the default favicon is copied instead.
+ *
+ * @param string $url website URL
+ * @param string $dest path of the file to write
+ * @return bool true when either the found icon or the default icon was written to $dest
+ */
+function download_favicon($url, $dest) {
+	global $default_favicon;
+	$url = trim($url);
+	$icon = searchFavicon($url);
+	if ($icon == '') {
+		$siteRoot = preg_replace('%^(https?://[^/]+).*$%i', '$1/', $url);
+		if ($siteRoot != $url) {
+			$url = $siteRoot;
+			$icon = searchFavicon($url);
+		}
+		if ($icon == '') {
+			//Last resort: the conventional location at the site root
+			$fallback = $siteRoot . 'favicon.ico';
+			$candidate = downloadHttp($fallback, array(
+					CURLOPT_REFERER => $url,
+				));
+			$icon = isImgMime($candidate) ? $candidate : '';
+		}
+	}
+	return ($icon != '' && file_put_contents($dest, $icon)) ||
 		@copy($default_favicon, $dest);
 }

+ 1 - 2
p/f.php

@@ -1,6 +1,6 @@
 <?php
-
 require('../constants.php');
+require(LIB_PATH . '/lib_rss.php');	//Includes class autoloader
 require(LIB_PATH . '/favicons.php');
 require(LIB_PATH . '/http-conditional.php');
 
@@ -15,7 +15,6 @@ function show_default_favicon($cacheSeconds = 3600) {
 	}
 }
 
-
 $id = isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : '0';
 if (!ctype_xdigit($id)) {
 	$id = '0';