Amazon Scraper Script 适用于 XAMPP Windows,但不适用于 Linux 上的 PHP5


Amazon Scraper Script works on XAMPP Windows but not PHP5 Cli on Linux

我正在尝试使用以下代码抓取亚马逊ASIN代码:

<?php
class Scraper {
const BASE_URL = "http://www.amazon.com";
private $categoryFile = "";
private $outputFile = "";
private $catArray;
private $currentPage = NULL;
private $asin = array();
private $categoriesMatched = 0;
private $categoryProducts = array();
private $pagesMatched = 0;
private $totalPagesMatched = 0;
private $productsMatched = 0;
public function __construct($categoryFile, $outputFile) {
    $this->categoryFile = $categoryFile;
    $this->outputFile = $outputFile;
}
public function run() {
    $this->readCategories($this->categoryFile);
    $this->setupASINArray($this->asin);
    $x = 1;
    foreach ($this->catArray as $cat) {
        $this->categoryProducts["$x"] = 0;
        if ($this->currentPage == NULL) {
            $this->currentPage = $cat;
            $this->scrapeASIN($this->currentPage, $x);
            $this->pagesMatched++;
        }           
        if ($this->getNextPageLink($this->currentPage)) {
            do {
                // next page found
                $this->pagesMatched++;
                $this->scrapeASIN($this->currentPage, $x);
            } while ($this->getNextPageLink($this->currentPage));
        }
        echo "Category complete: $this->pagesMatched Pages" . "'n";
        $this->totalPagesMatched += $this->pagesMatched;
        $this->pagesMatched = 0;
        $this->writeASIN($this->outputFile, $x);
        $x++;
        $this->currentPage = NULL;
        $this->categoriesMatched++;

    }
    $this->returnStats();

}
private function readCategories($categoryFile) {
    $catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);
    $this->catArray = $catArray;
}
private function setupASINArray($asinArray) {
    $x = 0;
    foreach ($this->catArray as $cat) {
        $asinArray["$x"][0] = "$cat";
        $x++;
    }
    $this->asin = $asinArray;
}
private function getNextPageLink($currentPage) {
    $document = new DOMDocument();
    $html = file_get_contents($currentPage);
    @$document->loadHTML($html);
    $xpath = new DOMXPath($document);
    $element = $xpath->query("//a[@id='pagnNextLink']/@href");
    if ($element->length != 0) {
        $this->currentPage = self::BASE_URL . $element->item(0)->value;
        return true;
    } else {
        return false;
    }

}
private function scrapeASIN($currentPage, $catNo) {
    $html = file_get_contents($currentPage);
    $regex = '~(?:www'.)?ama?zo?n'.(?:com|ca|co'.uk|co'.jp|de|fr)/(?:exec/obidos/ASIN/|o/|gp/product/|(?:(?:[^"''/]*)/)?dp/|)(B[A-Z0-9]{9})(?:(?:/|'?|'#)(?:[^"'''s]*))?~isx';
    preg_match_all($regex, $html, $asin);
    foreach ($asin[1] as $match) {
        $this->asin[$catNo-1][] = $match;
    }   

}
private function writeASIN($outputFile, $catNo) {
    $fh = fopen($outputFile, "a+");
    $this->fixDupes($catNo);
    $this->productsMatched += (count($this->asin[$catNo-1]) - 1);
    $this->categoryProducts["$catNo"] = (count($this->asin[$catNo-1]) - 1);
    flock($fh, LOCK_EX);
    $x = 0;
    foreach ($this->asin[$catNo-1] as $asin) {
        fwrite($fh, "$asin" . "'n");
        $x++;
    }

    flock($fh, LOCK_UN);
    fclose($fh);
    $x -= 1;
    echo "$x ASIN codes written to file" . "'n";
}
private function fixDupes($catNo) {
    $this->asin[$catNo-1] = array_unique($this->asin[$catNo-1], SORT_STRING);
}
public function returnStats() {
    echo "Categories matched: " . $this->categoriesMatched . "'n";
    echo "Pages parsed: " . $this->totalPagesMatched . "'n";
    echo "Products parsed: " . $this->productsMatched . "'n";
    echo "Category breakdown:" . "'n";
    $x = 1;
    foreach ($this->categoryProducts as $catProds) {
        echo "Category $x had $catProds products" . "'n";
        $x++;
    }
}
}
$scraper = new Scraper($argv[1], $argv[2]);
$scraper->run();
?>

但它在Windows上的XAMPP上运行良好,但在Linux上则不能。关于为什么会这样的任何想法?有时它会抓取 0 个 ASIN 进行归档,有时它只抓取 1+ 页面类别中的 400 页。但是输出/功能在Windows/XAMPP中完全没问题。

任何想法将不胜感激!

干杯-布莱斯

因此,请尝试更改这种方式,以避免错误消息:

private function readCategories($categoryFile) {
if (file_exists($categoryFile)) {
    $catArray = file($categoryFile, FILE_IGNORE_NEW_LINES);
    $this->catArray = $catArray;
} else {
    echo "File ".$categoryFile.' not exists!';
    $this->catArray = array();
}
}