获取域名 favicon(网站图标)的策略是什么


What is a strategy for obtaining domain favicons?

我有一个脚本,可以解析页面中的链接标记,但由于有多种方法下载页面(wget、file_get_contents、curl等),并且有多种方法可以包含favicon,因此脚本变得越来越大。

有简洁的方法吗?也许可以使用API?

以下是不断增长的脚本:

<?php
// Debug entry point: request this script directly with ?domain=test.com to run a
// lookup and print the diagnostics. Requests without a non-empty string "domain"
// parameter are ignored, so unrelated query strings (or ?domain[]=x) no longer
// trigger a lookup with a missing or non-string domain.
if (isset($_GET['domain']) && is_string($_GET['domain']) && $_GET['domain'] !== '')
{
    $obj = new FaviconFinder();
    $obj->invokeDebug($_GET);
}
/**
 * Finds, downloads and stores the favicon advertised by a domain's front page.
 *
 * Workflow (see invoke()): download the page, look for a <link rel="icon"> /
 * <link rel="shortcut icon"> tag, download the icon it points at, and save it
 * under $path_local1. Every step records a short code describing how it
 * succeeded; the two codes are concatenated into $pipe['method'].
 *
 * All input (the domain) and all downloaded data are treated as untrusted.
 */
class FaviconFinder
{
    // domain as passed to invoke(), and the host reached after redirects (leading "www." stripped)
    private $domain;
    private $real_domain;
    // the file and how it was obtained
    private $file_code = '0';
    private $file_page;
    // the favicon and how it was obtained
    private $favicon_code = 'z';
    private $file_favicon;
    private $ext;
    // paths local to server and on the internet (URL)
    private $path_local1 = "../../favicons/";
    private $path_local;
    private $path_internet;
    // Extensions a saved favicon may keep. Anything else (e.g. "php") is dropped so
    // that a remote site cannot choose an executable extension for the stored file.
    private static $allowed_ext = array('ico', 'png', 'gif', 'jpg', 'jpeg', 'svg', 'webp', 'bmp');
/****************************************************************************************************
invokeTest
****************************************************************************************************/
    /**
     * Debug helper: fetches the domain with wget into ../sites/temp.html and prints wget's output.
     *
     * @param array $pipe expects a string under 'domain'
     */
    public function invokeTest($pipe)
    {
        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? $pipe['domain'] : '';
        if ($domain === '')
        {
            return;
        }
        // escapeshellarg() stops shell injection; "--" stops a value that starts
        // with "-" from being parsed as a wget option.
        exec('wget -O ../sites/temp.html -- ' . escapeshellarg($domain) . ' 2>&1', $output);
        print_r($output);
    }
/****************************************************************************************************
invokeDebug
****************************************************************************************************/
    /**
     * Runs invoke() and echoes the collected state as HTML for manual debugging.
     * Every value is HTML-escaped because it comes from the request or a remote server.
     *
     * @param array $pipe expects a string under 'domain'
     */
    public function invokeDebug($pipe)
    {
        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? $pipe['domain'] : '';
        echo "<br><br> domain: " . $this->esc($domain) . "";
        $pipe = $this->invoke($pipe);
        echo "<br><br> real_domain: " . $this->esc($this->real_domain) . "";
        echo "<br><br> file_code | " . $this->esc($this->file_code);
        echo "<br><br> favicon_code | " . $this->esc($this->favicon_code);
        echo "<br><br> favicon_path | " . $this->esc($this->path_internet);
        echo "<br><br> favicon_file | " . $this->esc($this->file_favicon);
        echo "<br><br> favicon_file type | " . gettype($this->file_favicon);
        // cast: file_favicon is null/false when nothing was downloaded
        echo "<br><br> favicon_file length | " . strlen((string) $this->file_favicon);
        echo "<br><br> IMAGE: ";
        if ($this->file_favicon)
        {
            echo "<br><br> path_local | " . $this->esc($this->path_local) . "<br><br>";
            $file64 = base64_encode($this->file_favicon);
            echo "<img src= 'data:image/" . $this->esc($this->ext) . ";base64," . $file64 . "'></img>";
        }
        echo "<br><br>";
    }
/****************************************************************************************************
invoke
****************************************************************************************************/
    /**
     * Looks up the favicon for $pipe['domain'] and records the result in $pipe.
     *
     * Sets 'favicon' (icon URL, or the string 'NULL'), 'favicon_local' (saved file
     * name, or 'image_generic.png' when nothing was found or saved) and 'method'
     * (page code followed by favicon code).
     *
     * @param array $pipe expects a string under 'domain'
     * @return array $pipe with the keys above added
     */
    public function invoke( $pipe )
    {
        // start clean so a second lookup on the same object does not see stale results
        $this->resetState();
        $domain = (isset($pipe['domain']) && is_string($pipe['domain'])) ? trim($pipe['domain']) : '';
        $this->domain = $domain;
        if ( $domain !== '' && $this->pageFound($domain) && $this->linkFound() && $this->faviconFoundFromLink() )
        {
            $pipe = $this->saveFavicon($pipe);
            $pipe['favicon'] = $this->path_internet;
            // path_local stays null when the file could not be written
            $pipe['favicon_local'] = ($this->path_local !== null) ? $this->path_local : 'image_generic.png';
        } else {
            $pipe['favicon'] = 'NULL';
            $pipe['favicon_local'] = 'image_generic.png';
        }
        $pipe['method'] = $this->file_code . $this->favicon_code;
        return $pipe;
    }
    // puts every per-lookup property back to its initial value
    private function resetState()
    {
        $this->domain = null;
        $this->real_domain = null;
        $this->file_code = '0';
        $this->file_page = null;
        $this->favicon_code = 'z';
        $this->file_favicon = null;
        $this->ext = null;
        $this->path_local = null;
        $this->path_internet = null;
    }
/****************************************************************************************************
pageFound - uses the facade pattern to find a page and record how it was found
****************************************************************************************************/
    // tries curl first, then file_get_contents; true as soon as one returns a non-empty page
    private function pageFound ($domain)
    {
        return $this->pageFoundCurl($domain) || $this->pageFoundGet($domain);
    }
    // wget is another way to get past login page
    // https://stackoverflow.com/questions/1324421/how-to-get-past-the-login-page-with-wget
    // uses curl_exec to retrieve a page; the array key of the first URL variant
    // that yields content becomes file_code
    private function pageFoundCurl ($domain)
    {
        $types = array(
            "curl - 4"=>'https://www.' . $domain,
            "curl - 3"=>'http://www.' . $domain,
            "curl - 6"=>'https://' . $domain,
            "curl - 5"=>'http://' . $domain,
            // returned 302 errors for test.com
            "curl - 1"=>$domain,
            "curl - 2"=>'www.' . $domain
        );
        foreach ($types as $key => $value) {
            $this->file_page = $this->curlExec($value, true);
            if ($this->file_page)
            {
                $this->file_code = $key;
                return true;
            }
        }
        return false;
    }
    // uses file_get_contents to retrieve a page
    // The scheme-less variants ($domain, 'www.' . $domain) are intentionally not
    // tried here: without a scheme file_get_contents() opens a LOCAL file of that name.
    private function pageFoundGet( $domain )
    {
        $types = array(
            "file_get - 3"=>'http://www.' . $domain,
            "file_get - 4"=>'https://www.' . $domain,
            "file_get - 5"=>'http://' . $domain,
            "file_get - 6"=>'https://' . $domain
        );
        foreach ($types as $key => $value) {
            $this->file_page = $this->fileGetContents( $value );
            if ($this->file_page)
            {
                $this->file_code = $key;
                return true;
            }
        }
        return false;
    }
/****************************************************************************************************
linkFound
****************************************************************************************************/
    /**
     * Searches the downloaded page for an icon <link> tag and turns its href into
     * an absolute URL stored in path_internet. favicon_code records the href form:
     * 'a' protocol-relative (//host/..), 'b' root-relative (/..), 'c' absolute
     * (http..), 'd' any other relative path, 'g' no icon link found.
     *
     * The /favicon.ico fallback is not attempted here.
     *
     * @return bool true when an icon link was found
     */
    private function linkFound()
    {
        // real_domain is only known when curl fetched the page; otherwise use the requested domain
        $domain = $this->real_domain ? $this->real_domain : $this->domain;
        $regex = '#<link\s+(?=[^>]*rel=(?:\'|")(?:shortcut\s)?icon(?:\'|")\s*)(?:[^>]*href=(?:\'|")(.+?)(?:\'|")).*>#i';
        $link_found = preg_match( $regex , (string) $this->file_page, $matches );
        if ($link_found !== 1)
        {
            $this->path_internet = null;
            $this->favicon_code = 'g';
            return false;
        }
        // the pattern requires at least one character, so $path[0] always exists
        $path = $matches[1];
        // handles ( // ) - prefix test instead of $path[1], which is out of range for href="/"
        if ( strpos($path, '//') === 0 )
        {
            $this->favicon_code = 'a';
            $this->path_internet = 'http:' . $path;
        }
        // handles ( / )
        else if ( $path[0] === '/' )
        {
            $this->favicon_code = 'b';
            $this->path_internet = 'http://www.' . $domain . $path;
        }
        // handles ( http:// || https:// )
        else if ( substr($path, 0, 4) === 'http' )
        {
            $this->favicon_code = 'c';
            $this->path_internet = $path;
        }
        // relative path without a leading slash; resolved against the site root
        else
        {
            $this->favicon_code = 'd';
            $this->path_internet = 'http://www.' . $domain . '/' . $path;
        }
        return true;
    }
/****************************************************************************************************
faviconFound
****************************************************************************************************/
    // downloads the icon at path_internet into file_favicon; true on success
    private function faviconFoundFromLink ()
    {
        $this->file_favicon = $this->faviconFoundFacade( $this->path_internet );
        return $this->file_favicon ? true : false;
    }
    // same as faviconFoundFromLink() but for an explicitly given URL
    private function faviconFound ($default_location)
    {
        $this->file_favicon = $this->faviconFoundFacade( $default_location );
        return $this->file_favicon ? true : false;
    }
/****************************************************************************************************
More
****************************************************************************************************/
    // single switch point for the download strategy; currently curl
    private function faviconFoundFacade($url)
    {
        return $this->faviconFoundCurl($url) ;
    }
    // wget variant: writes the response to ../sites/temp.html and returns nothing
    private function faviconFoundExec($url)
    {
        // escapeshellarg() + "--": the URL comes from a remote page and must not reach the shell raw
        exec('wget -O ../sites/temp.html -- ' . escapeshellarg((string) $url) . ' 2>&1', $output);
    }
    // file_get_contents variant; goes through fileGetContents() so only http(s) URLs are opened
    private function faviconFoundGet($url)
    {
        return $this->fileGetContents( $url );
    }
    // make less than 20 bytes equate to false so I don't save bogus files
    // prisonexp.org does this
    // bestbuy.com does similar
    private function faviconFoundCurl($url)
    {
        $temp = $this->curlExec( $url, false );
        if ($temp === false || strlen($temp) < 20)
        {
            return false;
        }
        return $temp;
    }
/****************************************************************************************************
saveFavicon
****************************************************************************************************/
    /**
     * Writes the downloaded favicon below $path_local1 and records the file name
     * in path_local (null when the write fails).
     *
     * The file name is the host with every character outside [A-Za-z0-9_-]
     * replaced by "_", plus the lower-cased extension of the icon URL's path
     * (query string ignored) when that extension is in $allowed_ext.
     *
     * @param array $pipe passed through unchanged
     * @return array
     */
    public function saveFavicon( $pipe )
    {
        // PHP_URL_PATH drops any query parameters on the favicon link
        $url_path = parse_url((string) $this->path_internet, PHP_URL_PATH);
        $ext = is_string($url_path) ? strtolower(pathinfo($url_path, PATHINFO_EXTENSION)) : '';
        $this->ext = in_array($ext, self::$allowed_ext, true) ? $ext : '';
        // create a valid file name from the real domain (or the requested one if unknown)
        $host = $this->real_domain ? $this->real_domain : (string) $this->domain;
        $name = preg_replace('#[^A-Za-z0-9_-]#', '_', $host);
        if ($this->ext !== '')
        {
            $name = $name . "." . $this->ext;
        }
        // finally save it
        $written = file_put_contents($this->path_local1 . $name, $this->file_favicon);
        $this->path_local = ($written === false) ? null : $name;
        return $pipe;
    }
/****************************************************************************************************
helper and wrapper functions
****************************************************************************************************/
    // HTML-escapes a value for the debug output
    private function esc($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
    // curl_exec wrapper: returns the response body, or false on failure.
    // When $set is true and the transfer succeeded, real_domain is updated from
    // the final (post-redirect) URL.
    private function curlExec ($url, $set)
    {
        $curl = curl_init();
        curl_setopt_array($curl, array(
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
        ));
        $temp = curl_exec($curl);
        if ($set && $temp !== false)
        {
            $this->setRealDomain($curl);
        }
        curl_close($curl);
        return $temp;
    }
    // stores the host of the handle's effective URL, minus a leading "www.",
    // in real_domain; leaves real_domain untouched when no host can be determined
    private function setRealDomain ($curl)
    {
        $info = curl_getinfo( $curl );
        $host = isset($info['url']) ? parse_url($info['url'], PHP_URL_HOST) : null;
        if (!is_string($host) || $host === '')
        {
            return;
        }
        $this->real_domain = preg_replace('#^www\.(.+\.)#i', '$1', $host);
    }
    // deprecated as curl can do everything I need, just in case though
    // https://stackoverflow.com/questions/
    // 6009284/how-do-i-ignore-a-moved-header-with-file-get-contents-in-php
    // Returns the body, or false on failure. Anything that is not an http(s) URL
    // is refused, because file_get_contents() would otherwise read a local path.
    private function fileGetContents($value)
    {
        if (!is_string($value) || preg_match('#^https?://#i', $value) !== 1)
        {
            return false;
        }
        $opts = array(
            'http'=>array(
                'follow_location' => true,
                'max_redirects' => 20
            )
        );
        $context = stream_context_create($opts);
        return @file_get_contents( $value, false, $context );
    }
/****************************************************************************************************
removed
****************************************************************************************************/
    // Unused. For a host with a subdomain prefix (a.b.example.com) tries
    // http://www.<last two labels>/favicon.ico and, on success, records it with code 'f'.
    // NOTE(review): the original read an undefined $domain; the known host is
    // used here - confirm that was the intent before re-enabling.
    private function removed ()
    {
        $domain = $this->real_domain ? $this->real_domain : $this->domain;
        if (!is_string($domain)
            || preg_match('#(.*?)([^\.]*)(\.)([^\.]*)$#', $domain, $matches) !== 1
            || $matches[1] === '')
        {
            return false;
        }
        $main = $matches[2] . $matches[3] . $matches[4];
        $default_location = 'http://www.' . $main . '/favicon.ico';
        $this->file_favicon = $this->faviconFoundGet( $default_location );
        if ( $this->file_favicon )
        {
            $this->path_internet = $default_location;
            $this->favicon_code = 'f';
            return true;
        }
        return false;
    }
}

这里有一个正面的API。

使用 Google API 检查 favicon

favicons没有策略或API。解析HTML,查找:

<link rel="shortcut icon" href="...">

或者只是:

<link rel="icon" href="...">

并提取 href 属性的值。

如果不存在这样的标记(或者引用的图标不存在),则检查/favicon.ico(这就是1999年在Internet Explorer 5上开始的一切)。

此外,iOS(以及某些版本的Android)会搜索带有 rel="apple-touch-icon" 或 rel="apple-touch-icon-precomposed" 的额外<link>元素。

其他一切都只是猜测和臆断。

另请参阅:https://en.wikipedia.org/wiki/Favicon#History