我正在制作一个应用程序,它可以从网站上抓取数据,根据需要对其进行格式化,并将其显示给用户。现在,该站点不允许跨站点脚本请求,所以我使用PHP的curl来检索页面。
-
使用浏览器,当你第一次访问时,该网站会给你一个cookie,要求你登录,在随后的请求中,它会给你你请求的实际页面。
-
使用PHP的curl时,网站每次都只返回要求我登录的页面。我猜它同时也向我的PHP服务器发送了一个cookie。
如何保存此cookie,并在后续请求中把它发送回去?
可以使用几个curl_setopt选项来设置cookie。
示例:
// Use ONE cookie file for both directions:
//  - CURLOPT_COOKIEJAR  is where curl writes the cookies it has received
//    (written when the handle is closed);
//  - CURLOPT_COOKIEFILE is where curl reads cookies to send with requests.
// A jar name of "-" would print the cookies to stdout instead of saving
// them, so the file read by CURLOPT_COOKIEFILE would never be populated.
$cookieFile = "c:/cookies/cookie.txt";
$ch=curl_init();
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
我修改了nabab的代码,尝试了一下,它完全符合我的要求:
/*
 * Log in with one POST, then submit a second POST on the SAME cURL handle so
 * that the session cookie set by the login response is sent with the second
 * request. Cookies are also persisted to $cookie so a later run can reuse them.
 *
 * After the script runs, $r holds the body of the last response, or false if
 * a transfer failed (a warning with the cURL error text is raised in that case).
 */
$loginData = array('username'=>'myuser', 'password'=>'mypassword');
$postData = array('url'=>'http://stackoverflow.com');
$loginURL = "http://stackoverflow.com/login.php";
$addURL = "http://stackoverflow.com/addUrl.php";
$curl_options = array(
    CURLOPT_RETURNTRANSFER => true, /* return web page */
    CURLOPT_HEADER => false, /* don't return headers */
    CURLOPT_FOLLOWLOCATION => true, /* follow redirects */
    CURLOPT_ENCODING => "", /* handle all encodings */
    CURLOPT_AUTOREFERER => true, /* set referer on redirect */
    CURLOPT_CONNECTTIMEOUT => 120, /* timeout on connect */
    CURLOPT_TIMEOUT => 120, /* timeout on response */
    CURLOPT_MAXREDIRS => 10, /* stop after 10 redirects */
    /* Keep TLS verification on: credentials are posted and redirects are
       followed, so an unverified HTTPS hop could leak them to an impostor. */
    CURLOPT_SSL_VERIFYHOST => 2, /* certificate name must match the host */
    CURLOPT_SSL_VERIFYPEER => true /* certificate chain must be trusted */
);
/* Relative path: resolved against the script's current working directory. */
$cookie = "cookie.txt";
$r = false;
if ( $ch = curl_init() )
{
    curl_setopt_array($ch, $curl_options);
    if ( $cookie )
    {
        /* COOKIEFILE loads cookies saved by a previous run (and enables the
           cookie engine); COOKIEJAR saves them again when the handle closes. */
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie);
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
        curl_setopt($ch, CURLOPT_POST, true);

        /* Step 1: log in. */
        curl_setopt($ch, CURLOPT_URL, $loginURL);
        curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($loginData) );
        $r = curl_exec($ch);
        if ( $r === false )
        {
            /* Do not send the second form if the login transfer itself failed. */
            trigger_error('Login request failed: ' . curl_error($ch), E_USER_WARNING);
        }
        else
        {
            /* Step 2: same handle, so the cookies from step 1 are sent along. */
            curl_setopt($ch, CURLOPT_URL, $addURL);
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postData) );
            $r = curl_exec($ch);
            if ( $r === false )
            {
                trigger_error('Add-URL request failed: ' . curl_error($ch), E_USER_WARNING);
            }
        }
    }
    curl_close($ch);
}
您必须使用cookie。我就是这么做的(我返回一个包含html内容和编码的数组,这对抓取很有用):
/*
 * Option set applied to the cURL handle created below.
 * NOTE(review): the request code reads these via self::$curl_options, so this
 * is presumably a static property of an enclosing class whose declaration is
 * not part of this excerpt -- confirm against the full source.
 */
$curl_options = array(
CURLOPT_RETURNTRANSFER => true, /* return web page */
CURLOPT_HEADER => false, /* don't return headers */
CURLOPT_FOLLOWLOCATION => true, /* follow redirects */
CURLOPT_ENCODING => "", /* handle all encodings */
CURLOPT_AUTOREFERER => true, /* set referer on redirect */
CURLOPT_CONNECTTIMEOUT => 120, /* timeout on connect */
CURLOPT_TIMEOUT => 120, /* timeout on response */
CURLOPT_MAXREDIRS => 10, /* stop after 10 redirects */
/* Both TLS checks are switched off here: host-name and peer-certificate
   verification are disabled for HTTPS transfers. */
CURLOPT_SSL_VERIFYHOST => 0,
CURLOPT_SSL_VERIFYPEER => 0
);
/* Fetch $url with the options above; $r receives the curl_exec() result.
   $url and $cookie are not defined in this excerpt -- presumably parameters
   of the enclosing method (TODO confirm). */
if ( $ch = curl_init($url) )
{
curl_setopt_array($ch,self::$curl_options);
/* Only the cookie JAR (output file) is configured, and only when $cookie is
   truthy.
   NOTE(review): no CURLOPT_COOKIEFILE is set, so a fresh handle created by a
   later call would not send the cookies saved here -- verify whether that is
   intended. */
if ( $cookie )
curl_setopt($ch,CURLOPT_COOKIEJAR,$cookie);
$r = curl_exec($ch);
curl_close($ch);
}
/* Closes a scope opened outside this excerpt (likely the enclosing method). */
}