curl_multi is very slow to check 100 URLs

I am using the following code to check for broken links among a given set of URLs, but the process is very slow. I need to speed it up as much as possible.

$url_list = array(
"http://goog528le.com",
"http://facebook.com",
"http://google.com", 
"http://youtube.com", 
"http://yahoo.com", 
"http://amazon.com",
"http://baidu.com", 
"http://wikipedia.org", 
"http://live.com",
"http://qq.com", 
"http://taobao.com", 
"http://google.co.in",
"http://twitter.com", 
"http://blogspot.com",
"http://yahoo.co.jp", 
"http://linkedin.com",
"http://bing.com",
"http://sina.com.cn"
, "http://yandex.ru");
// 1. multi handle
$mh = curl_multi_init();
$max_connections = 10;
$dead_urls = array();
$not_found_urls = array();
$working_urls = array();
// 2. add multiple URLs to the multi handle
for ($i = 0; $i < $max_connections; $i++) {
   add_url_to_multi_handle($mh, $url_list);
}
// 3. initial execution
do {
    $mrc = curl_multi_exec($mh, $active);
} while ($mrc == CURLM_CALL_MULTI_PERFORM);

// 4. main loop
while ($active && $mrc == CURLM_OK) {

// 5. there is activity
if (curl_multi_select($mh) != -1) {
    // 6. do work
    do {
        $mrc = curl_multi_exec($mh, $active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    // 7. is there info?
    if ($mhinfo = curl_multi_info_read($mh)) {
        // this means one of the requests was finished
        // 8. get the info on the curl handle
        $chinfo = curl_getinfo($mhinfo['handle']);
        // 9. dead link?
        if (!$chinfo['http_code']) {
            $dead_urls [] = $chinfo['url'];
            // 10. 404?
        } else if ($chinfo['http_code'] == 404) {
            $not_found_urls [] = $chinfo['url'];
            // 11. working
        } else {
            $working_urls [] = $chinfo['url'];
        }
        // 12. remove the handle
        curl_multi_remove_handle($mh, $mhinfo['handle']);
        curl_close($mhinfo['handle']);
        // 13. add a new url and do work
        if (add_url_to_multi_handle($mh, $url_list)) {
            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);
        }
    }
}

}

// 14. finished
curl_multi_close($mh);

echo "==Dead URLs=='n";
echo implode("'n", $dead_urls) . "'n'n";
echo "==404 URLs=='n";
echo implode("'n", $not_found_urls) . "'n'n";
echo "==Working URLs=='n";
echo implode("'n", $working_urls);
// 15. adds a url to the multi handle
function add_url_to_multi_handle($mh, $url_list)
{
static $index = 0;
// if we have another url to get
if (isset($url_list[$index]) && $url_list[$index]) {
    // new curl handle
    $ch = curl_init();
    // set the url
    curl_setopt($ch, CURLOPT_URL, $url_list[$index]);
    // to prevent the response from being outputted
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // follow redirections
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // do not need the body. this saves bandwidth and time
    curl_setopt($ch, CURLOPT_NOBODY, 1);
    // add it to the multi handle
    curl_multi_add_handle($mh, $ch);
    // increment so next url is used next time
    $index++;
    return true;
} else {
    // we are done adding new URLs
    return false;
}
}

The solution is to process each request as soon as it completes. This eliminates the CPU cycles wasted on busy waiting. It is also a good idea to keep a queue of cURL requests for maximum throughput: every time a request completes, a new one is added from the queue. By dynamically adding and removing handles, a constant number of downloads is kept in flight at all times, which gives you a way to control how many requests are sent simultaneously. The result is a faster and more efficient way to process a large number of cURL requests in parallel.

Source: onlineaspect.com

Here is a function for reference:

function rolling_curl($urls, $callback, $custom_options = null) {
    // make sure the rolling window isn't greater than the # of urls
    $rolling_window = 5;
    $rolling_window = (sizeof($urls) < $rolling_window) ? sizeof($urls) : $rolling_window;
    $master = curl_multi_init();
    $curl_arr = array();
    // add additional curl options here
    $std_options = array(CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS => 5);
    $options = ($custom_options) ? ($std_options + $custom_options) : $std_options;
    // start the first batch of requests
    for ($i = 0; $i < $rolling_window; $i++) {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        curl_setopt_array($ch,$options);
        curl_multi_add_handle($master, $ch);
    }
    do {
        while(($execrun = curl_multi_exec($master, $running)) == CURLM_CALL_MULTI_PERFORM);
        if($execrun != CURLM_OK)
            break;
        // a request was just completed -- find out which one
        while($done = curl_multi_info_read($master)) {
            $info = curl_getinfo($done['handle']);
            if ($info['http_code'] == 200)  {
                $output = curl_multi_getcontent($done['handle']);
                // request successful.  process output using the callback function.
                $callback($output);
                // start a new request (it's important to do this before removing the old one),
                // but only if there are URLs left in the queue
                if ($i < sizeof($urls)) {
                    $ch = curl_init();
                    $options[CURLOPT_URL] = $urls[$i++];  // increment i
                    curl_setopt_array($ch,$options);
                    curl_multi_add_handle($master, $ch);
                }
                // remove the curl handle that just completed
                curl_multi_remove_handle($master, $done['handle']);
            } else {
                // request failed.  add error handling.
            }
        }
    } while ($running);
    curl_multi_close($master);
    return true;
}
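
For reference, assuming the URL list from the question is in $url_list, the function could be called roughly like this. The counting callback and the extra timeout options are illustrative assumptions, not part of the original function:

$working = 0;
rolling_curl($url_list, function ($output) use (&$working) {
    // the callback only receives the response body of requests that returned HTTP 200
    $working++;
}, array(
    CURLOPT_NOBODY         => true, // a HEAD-style request is enough for link checking
    CURLOPT_CONNECTTIMEOUT => 5,    // assumed value: give up quickly on hosts that do not answer
    CURLOPT_TIMEOUT        => 10    // assumed value: cap the total time per request
));
echo "Working URLs: $working\n";

Note that, as written, rolling_curl() only passes the response body to the callback; for a broken-link checker you would probably extend it to also pass the curl_getinfo() data (URL and http_code) and to handle the non-200 branch.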

Hope this helps!