我遇到了一个与 DOM 有关的奇怪错误。我想遍历文档中的每个 href,并在必要时将其替换为绝对路径。问题是,在我调用 $dom->setAttribute() 之后,getAttribute 返回的是更改后的值。但是,如果我调用 saveHTML(),或者用 getElementsByTagName 和 getAttribute 重新查询这些标签,值就会从 http://example.com/path.php?ccc 被截断为 http://example.com。
这是我的代码:
<?php
//include 'url_to_absolute.php';
/**
 * Turn an href found on a page into an absolute URL.
 *
 * Deliberately simple: $url is treated as the site root, so this is not a
 * full RFC 3986 resolver ("../" segments and a path inside $url are not
 * interpreted).
 *
 * @param string $url  Base URL of the site, e.g. "http://example.com".
 * @param string $href Raw href attribute value taken from the document.
 * @return string $href unchanged when it already has a scheme or is
 *                protocol-relative; otherwise $url joined with $href.
 */
function url_to_absolute($url, $href)
{
    $base = trim((string) $url);
    $link = trim((string) $href);

    // Links that already carry a scheme (http:, mailto:, ...) or start with
    // "//" must not get the base glued in front of them.
    if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $link) === 1) {
        return $link;
    }

    // Empty, query-only and fragment-only hrefs are appended as-is.
    if ($link === '' || $link[0] === '?' || $link[0] === '#') {
        return $base . $link;
    }

    // Join with exactly one slash between the base and the path.
    return rtrim($base, '/') . '/' . ltrim($link, '/');
}
$url = 'http://example.com';
//$url = $_GET["url"];

// Download the page; CURLOPT_RETURNTRANSFER makes curl_exec() return the
// body as a string instead of printing it.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$contents = curl_exec($ch);
$curlError = curl_error($ch);
// curl_close() requires the handle. The original call passed no argument
// and hid the resulting failure behind the @ operator.
curl_close($ch);

// curl_exec() returns false on failure, and loadHTML() rejects empty input,
// so stop here with a readable message instead of parsing nothing.
if (!is_string($contents) || $contents === '') {
    exit('Download failed: ' . $curlError);
}

$dom = new DOMDocument();
$dom->loadHTML($contents);

//change the urls to absolute
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor) {
    $href = $anchor->getAttribute('href');
    $abs = url_to_absolute($url, $href);
    $anchor->removeAttribute('href');
    $anchor->setAttribute('href', $abs);
    //changed
    $newhref = $anchor->getAttribute('href');
    echo "newhref = " . $newhref; //shows http://example.com/.... (good)
}

// Query the anchors again to see what the document actually holds now.
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor) {
    echo "new2 = " . $anchor->getAttribute('href'); //returns http://example.com only
}

//print output
echo $dom->saveHTML();
?>
试试下面这些 cURL 选项,并改用 curl_init($url):
<?php
//include 'url_to_absolute.php';
/**
 * Turn an href found on a page into an absolute URL.
 *
 * Deliberately simple: $url is treated as the site root, so this is not a
 * full RFC 3986 resolver ("../" segments and a path inside $url are not
 * interpreted).
 *
 * @param string $url  Base URL of the site, e.g. "http://example.com".
 * @param string $href Raw href attribute value taken from the document.
 * @return string $href unchanged when it already has a scheme or is
 *                protocol-relative; otherwise $url joined with $href.
 */
function url_to_absolute($url, $href)
{
    $base = trim((string) $url);
    $link = trim((string) $href);

    // Links that already carry a scheme (http:, mailto:, ...) or start with
    // "//" must not get the base glued in front of them.
    if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $link) === 1) {
        return $link;
    }

    // Empty, query-only and fragment-only hrefs are appended as-is.
    if ($link === '' || $link[0] === '?' || $link[0] === '#') {
        return $base . $link;
    }

    // Join with exactly one slash between the base and the path.
    return rtrim($base, '/') . '/' . ltrim($link, '/');
}
$url = 'http://example.com';
//$url = $_GET["url"];

// Download the page. RETURNTRANSFER hands the body back as a string and
// FOLLOWLOCATION follows HTTP redirects to the final document.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
$contents = curl_exec($ch);
$curlError = curl_error($ch);
// curl_close() requires the handle; the original call passed no argument.
curl_close($ch);

// curl_exec() returns false on failure, and loadHTML() rejects empty input,
// so stop here with a readable message instead of parsing nothing.
if (!is_string($contents) || $contents === '') {
    exit('Download failed: ' . $curlError);
}

$dom = new DOMDocument();
$dom->loadHTML($contents);
//$dom->saveHTMLFile('dom_doc_test.html');

//change the urls to absolute
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor) {
    $href = $anchor->getAttribute('href');
    $abs = url_to_absolute($url, $href);
    $anchor->removeAttribute('href');
    $anchor->setAttribute('href', $abs);
    //changed
    $newhref = $anchor->getAttribute('href') . '<br />';
    echo "newhref = " . $newhref; //shows http://example.com/.... (good)
}

// Query the anchors again to see what the document actually holds now.
$anchors = $dom->getElementsByTagName('a');
foreach ($anchors as $anchor) {
    echo "new2 = " . $anchor->getAttribute('href') . '<br />'; //returns http://example.com only
}

//print output
echo $dom->saveHTML();
?>
问题应该出在 url_to_absolute 函数里。我用的简易版 url_to_absolute 如下:
/**
 * Turn an href found on a page into an absolute URL.
 *
 * Deliberately simple: $url is treated as the site root, so this is not a
 * full RFC 3986 resolver ("../" segments and a path inside $url are not
 * interpreted).
 *
 * @param string $url  Base URL of the site, e.g. "http://example.com".
 * @param string $href Raw href attribute value taken from the document.
 * @return string $href unchanged when it already has a scheme or is
 *                protocol-relative; otherwise $url joined with $href.
 */
function url_to_absolute($url, $href)
{
    $base = trim((string) $url);
    $link = trim((string) $href);

    // Links that already carry a scheme (http:, mailto:, ...) or start with
    // "//" must not get the base glued in front of them.
    if (preg_match('~^(?:[a-z][a-z0-9+.\-]*:|//)~i', $link) === 1) {
        return $link;
    }

    // Empty, query-only and fragment-only hrefs are appended as-is.
    if ($link === '' || $link[0] === '?' || $link[0] === '#') {
        return $base . $link;
    }

    // Join with exactly one slash between the base and the path.
    return rtrim($base, '/') . '/' . ltrim($link, '/');
}
// Base that gets prefixed to every href below.
$url = 'http://example.com';

// Fixed two-anchor document, so the run needs no network access.
$dom = new DOMDocument();
$dom->loadHTML('<html><body><a href="/path.html?q=hello&a=bye"></a><a href="/path2.html?before=34&after=44"></a></body></html>');

// Pass 1: print the hrefs exactly as they were parsed.
foreach ($dom->getElementsByTagName('a') as $link) {
    echo 'href = ', $link->getAttribute('href'), '<br />';
}
echo '<br />';

// Pass 2: rewrite each href and read it straight back from the same node.
foreach ($dom->getElementsByTagName('a') as $link) {
    $absolute = url_to_absolute($url, $link->getAttribute('href'));
    $link->removeAttribute('href');
    $link->setAttribute('href', $absolute);
    echo 'newhref = ', $link->getAttribute('href'), '<br />';
}
echo '<br />';

// Pass 3: query the document again and print what each anchor holds now.
foreach ($dom->getElementsByTagName('a') as $link) {
    echo 'new2 = ', $link->getAttribute('href'), '<br />';
}
结果是:
href = /path.html?q=hello&a=bye
href = /path2.html?before=34&after=44
newhref = http://example.com/path.html?q=hello&a=bye
newhref = http://example.com/path2.html?before=34&after=44
new2 = http://example.com/path.html?q=hello&a=bye
new2 = http://example.com/path2.html?before=34&after=44