我想从 http://alsat-m.tv/category/5/nga-vendi 抓取文章标题,但没有成功。我已经尝试了下面的代码,请问有谁能帮我吗?下面的代码只提取标题的文本和链接。这段代码只对 http://www.programminghelp.com/ 有效,对其他网页不起作用,我不知道问题出在哪里。
<?php
/**
 * Fetch a page, print one link per post heading found in it, then the count.
 *
 * A post heading is an <h5> wrapping an
 * <a href="..." rel="bookmark" title="..."> element; the href value and the
 * anchor's inner content are extracted with a regular expression.
 *
 * @param string $url Address of the page to scan.
 * @return void
 */
function print_post_links($url)
{
    $html = file_get_contents($url);
    if ($html === false) {
        // The fetch failed (PHP has already raised a warning); scan an empty
        // document so the summary line still reports "0 posts found".
        $html = '';
    }

    preg_match_all(
        // "/" is the pattern delimiter, so the slashes of the closing tags
        // must be escaped as "\/". The s flag lets "." match newlines too.
        '/<h5><a href="(.*?)" rel="bookmark" title=".*?">(.*?)<\/a><\/h5>/s',
        $html,
        $posts, // will contain the article data
        PREG_SET_ORDER // formats data into an array of posts
    );

    // NOTE(review): $link and $title are echoed exactly as they appear in the
    // fetched page, i.e. as untrusted third-party HTML - confirm that is
    // acceptable wherever this output is displayed.
    foreach ($posts as $post) {
        $link = $post[1];
        $title = $post[2];
        echo "<a href='" . $link . "'>" . $title . "</a><br>";
    }
    echo "<p>" . count($posts) . " posts found</p>";
}

// Scan the bare host name first, then the "www." host name.
print_post_links("http://alsat-m.tv/");
print_post_links("http://www.alsat-m.tv/");
?>
下面是 Python 的解决方案:
import requests
from lxml import etree
# Download the site's RSS feed and print the title of every <item> in it.
# The timeout keeps the script from hanging forever on an unresponsive server.
xml = requests.get('http://alsat-m.tv/RssFeed', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as XML.
xml.raise_for_status()
tree = etree.fromstring(xml.content)
root = tree.find('channel')
if root is None:
    # No <channel> element in the document: there are no items to list.
    titles = []
else:
    # findtext() yields None for an item that has no <title> child rather
    # than raising AttributeError the way find('title').text would.
    titles = [x.findtext('title') for x in root.findall('item')]
# The parenthesised form is valid on Python 3 and prints the same list on
# Python 2, where the bare `print titles` statement was the only form used.
print(titles)