PHP - cannot crawl to second-level depth

I want to crawl the link http://dl2.my98music.com/Data/ down to 4-5 levels deep and print all the links found there, but I cannot get past the first level: the link http://dl2.my98music.com/Data/ somehow turns into http://dl2.my98music.com/, with the part after the / being dropped automatically. How can I stop this from happening? (A minimal sketch of how the URL gets rebuilt follows the code below.)

class crawler
{
protected $_url;
protected $_depth;
protected $_host;
protected $_useHttpAuth = false;
protected $_user;
protected $_pass;
protected $_seen = array();
protected $_filter = array();
public function __construct($url, $depth = 5)
{
    $this->_url = $url;
    $this->_depth = $depth;
    $parse = parse_url($url);
    $this->_host = $parse['host'];
}
protected function _processAnchors($content, $url, $depth)
{
    $dom = new DOMDocument('1.0');
    @$dom->loadHTML($content);
    $anchors = $dom->getElementsByTagName('a');
    foreach ($anchors as $element) {
        $href = $element->getAttribute('href');
        if (0 !== strpos($href, 'http')) {
            $path = '/' . ltrim($href, '/');
            if (extension_loaded('http')) {
                $href = http_build_url($url, array('path' => $path));
            } else {
                $parts = parse_url($url);
                $href = $parts['scheme'] . '://';
                if (isset($parts['user']) && isset($parts['pass'])) {
                    $href .= $parts['user'] . ':' . $parts['pass'] . '@';
                }
                $href .= $parts['host'];
                if (isset($parts['port'])) {
                    $href .= ':' . $parts['port'];
                }
                $href .= $path;
            }
        }
        // Crawl only links that belong to the start domain
        $this->crawl_page($href, $depth - 1);
    }
}
protected function _getContent($url)
{
    $handle = curl_init($url);
    if ($this->_useHttpAuth) {
        curl_setopt($handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
        curl_setopt($handle, CURLOPT_USERPWD, $this->_user . ":" . $this->_pass);
    }
    // follows 302 redirects, but creates problems with authentication
//        curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
    // return the content
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
    /* Get the HTML or whatever is linked in $url. */
    $response = curl_exec($handle);
    // response total time
    $time = curl_getinfo($handle, CURLINFO_TOTAL_TIME);
    /* Check for 404 (file not found). */
    $httpCode = curl_getinfo($handle, CURLINFO_HTTP_CODE);
    curl_close($handle);
    return array($response, $httpCode, $time);
}
protected function _printResult($url, $depth, $httpcode, $time)
{
    ob_end_flush();
    $currentDepth = $this->_depth - $depth;
    $count = count($this->_seen);
    echo "N::$count,CODE::$httpcode,TIME::$time,DEPTH::$currentDepth URL::$url <br>";
    ob_start();
    flush();
}
protected function isValid($url, $depth)
{
    if (strpos($url, $this->_host) === false
        || $depth === 0
        || isset($this->_seen[$url])
    ) {
        return false;
    }
    foreach ($this->_filter as $excludePath) {
        if (strpos($url, $excludePath) !== false) {
            return false;
        }
    }
    return true;
}
public function crawl_page($url, $depth)
{
    if (!$this->isValid($url, $depth)) {
        return;
    }
    // add to the seen URL
    $this->_seen[$url] = true;
    // get Content and Return Code
    list($content, $httpcode, $time) = $this->_getContent($url);
    // print Result for current Page
    $this->_printResult($url, $depth, $httpcode, $time);
    // process subPages
    $this->_processAnchors($content, $url, $depth);
}
public function setHttpAuth($user, $pass)
{
    $this->_useHttpAuth = true;
    $this->_user = $user;
    $this->_pass = $pass;
}
public function addFilterPath($path)
{
    $this->_filter[] = $path;
}
public function run()
{
    $this->crawl_page($this->_url, $this->_depth);
}
}

$startURL = 'http://dl2.my98music.com/Data/';
$depth = 6;
$username = '';
$password = '';
$crawler = new crawler($startURL, $depth);
$crawler->setHttpAuth($username, $password);
// Exclude path with the following structure to be processed 
$crawler->addFilterPath('customer/account/login/referer');
$crawler->run();
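
For reference, here is a minimal sketch of what the fallback branch in _processAnchors() does to a relative link (the href value "Some%20Folder/" is only a made-up example, not taken from the actual page):

// sketch only: same path rebuilding as in _processAnchors()
$url  = 'http://dl2.my98music.com/Data/';
$href = 'Some%20Folder/';          // hypothetical relative href found on the page
$path = '/' . ltrim($href, '/');   // "/Some%20Folder/" - the /Data/ part is lost
$parts = parse_url($url);
echo $parts['scheme'] . '://' . $parts['host'] . $path;
// prints http://dl2.my98music.com/Some%20Folder/
// instead of http://dl2.my98music.com/Data/Some%20Folder/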

The error is in your _processAnchors(): you need the full path in $href.

It works after changing:

$path = '/' . ltrim($href, '/');

to:

$path = '/Data/' . ltrim($href, '/');

See: PHP: DomElement->getAttribute
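
A more general alternative (only a sketch, not tested against this site) is to resolve a relative href against the directory of the page currently being crawled, instead of hard-coding /Data/, so links keep resolving correctly at deeper levels:

// sketch: resolve a relative $href against the directory of the current $url
$parts = parse_url($url);
$base  = isset($parts['path']) ? $parts['path'] : '/';
if (substr($base, -1) !== '/') {
    // strip the file part, keep the directory (e.g. "/Data/index.html" -> "/Data/")
    $base = substr($base, 0, strrpos($base, '/') + 1);
}
// absolute paths are used as-is, relative paths are appended to the current directory
$path = (strpos($href, '/') === 0) ? $href : $base . $href;
$href = $parts['scheme'] . '://' . $parts['host']
      . (isset($parts['port']) ? ':' . $parts['port'] : '')
      . $path;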