Boost Your PHP Crawling with PHPCreeper: A Complete Step‑by‑Step Guide
PHPCreeper is a high‑performance PHP crawler built on Workerman that leverages asynchronous I/O, multi‑process, distributed deployment and headless‑browser support; this guide covers installation via Composer, core architecture, producer/downloader/parser implementation, Redis configuration and how to start the service to fetch dynamic pages such as weather forecasts.
Overview
PHPCreeper (Chinese name "爬山虎") is a PHP‑based crawling engine that focuses on high efficiency and agile development. It simplifies complex crawling tasks, overcomes performance and scalability limits of traditional PHP crawlers, and fully utilizes multi‑process, distributed and isolated deployment environments. PHPCreeper also supports headless browsers to execute JavaScript for dynamic pages.
Core Architecture
The engine is built on Workerman, a high‑performance PHP socket server framework. PHPCreeper inherits all Workerman features and adds headless‑browser support, Linux‑like crontab scheduling, distributed deployment, flexible callbacks and third‑party middleware customization.
Features
Asynchronous I/O + multi‑process + distributed/isolated deployment + event‑driven model.
Micro‑kernel + plugin architecture for strong extensibility.
Support for headless browsers, custom timers and flexible context parameters.
Installation
Installation documentation: https://www.workerman.net/doc/webman/install.html
composer require blogdaren/webman-phpcreeper
Plugin repository:
https://www.workerman.net/plugin/39
Quick Start
Example scenario: simulate crawling the weather forecast for the next three days in Beijing.
Create Producer
File:
app/spider/TinywanProducer.php
<?php
declare(strict_types=1);
namespace app\spider;
use PHPCreeper\Kernel\PHPCreeper;
use Webman\PHPCreeper\Producer;
class TinywanProducer extends Producer
{
/**
 * Build the static weather-forecast task and enqueue it.
 *
 * Targets the hard-coded weather.com.cn page, attaches the extraction rules
 * and the per-task context, and passes the finished array to createTask().
 */
public function makeTask()
{
    // The download cache directory lives under the system temp directory.
    $cacheDirectory = sys_get_temp_dir() . '/DownloadCache4PHPCreeper/';

    // Field name => extraction spec. The spec layout is presumably
    // [selector, what to read, extra arguments, post-processing callback]
    // -- confirm against the PHPCreeper rule documentation.
    $extractionRules = [
        '日子' => [
            'div#7d ul.t.clearfix h1',
            'text',
            [],
            // Prefixes the extracted text with the current date.
            function ($field_name, $data) {
                return date('Y-m-d') . ' | ' . $data;
            },
        ],
        '天气' => ['div#7d ul.t.clearfix p.wea', 'text'],
        '温度' => ['div#7d ul.t.clearfix p.tem', 'text'],
    ];

    $this->createTask([
        'active' => true,
        'url' => 'http://www.weather.com.cn/weather/101010100.shtml',
        'rule' => $extractionRules,
        'rule_name' => '',
        'refer' => '',
        'type' => 'text',
        'method' => 'get',
        // Context that applies to this task only.
        'context' => [
            'cache_enabled' => true,
            'cache_directory' => $cacheDirectory,
            'allow_url_repeat' => true,
            'track_request_args' => true,
            'track_task_package' => true,
            'force_use_md5url_if_rulename_empty' => false,
            'force_use_old_style_multitask_args' => false,
            'headers' => [],
            'cookies' => [],
            'headless_browser' => ['headless' => false],
            'user_define_arg1' => 'user_define_value1',
            'user_define_arg2' => 'user_define_value2',
        ],
    ]);
}
/**
 * Build the toutiao.com task, whose context sets 'headless' => true, and
 * enqueue it via createTask().
 */
public function makeDynamicTask()
{
    $cacheDirectory = sys_get_temp_dir() . '/DownloadCache4PHPCreeper/';

    // Both fields read from the same anchor elements: one takes the
    // 'aria-label' value, the other the 'href' value.
    $hotListRules = [
        '今日头条热榜标题' => ['div.show-monitor ol li a', 'aria-label'],
        '今日头条热榜链接' => ['div.show-monitor ol li a', 'href'],
    ];

    $this->createTask([
        'url' => 'https://www.toutiao.com',
        'rule' => $hotListRules,
        'context' => [
            'cache_enabled' => true,
            'cache_directory' => $cacheDirectory,
            'headless_browser' => ['headless' => true],
        ],
    ]);
}
/**
 * Producer start hook: enqueues the static task and then the dynamic task,
 * once each.
 *
 * @param PHPCreeper $producer Not used by this implementation.
 */
public function onProducerStart(PHPCreeper $producer)
{
$this->makeTask();
$this->makeDynamicTask();
// Optional scheduling examples, left disabled. Timer and Crontab are not
// imported in this file; add the matching `use` statements before
// uncommenting either line.
// Timer::add(5, [$this, "makeTask"], [], true);
// new Crontab('*/5 * * * * *', function(){ $this->makeTask(); });
}
/** Producer stop hook (per its name); intentionally a no-op in this example. */
public function onProducerStop(PHPCreeper $producer) {}
/** Producer reload hook (per its name); intentionally a no-op in this example. */
public function onProducerReload(PHPCreeper $producer) {}
}
Create Downloader
File:
app/spider/TinywanDownloader.php
<?php
declare(strict_types=1);
namespace app\spider;
use PHPCreeper\Kernel\PHPCreeper;
use Webman\PHPCreeper\Downloader;
class TinywanDownloader extends Downloader
{
/**
 * Downloader start hook: registers the socket address list this downloader
 * should connect to.
 *
 * @param PHPCreeper $downloader The downloader instance being started.
 */
public function onDownloaderStart(PHPCreeper $downloader)
{
    // Port 8888 is the same port the 'myparser' process listens on in
    // process.php; keep the two in sync.
    $clientSocketAddresses = ['ws://127.0.0.1:8888'];

    $downloader->setClientSocketAddress($clientSocketAddresses);
}
// The hooks below are intentionally empty in this example. The one-line
// descriptions are inferred from the method names and signatures only;
// confirm exact call timing against the PHPCreeper documentation.

/** Downloader stop hook. */
public function onDownloaderStop(PHPCreeper $downloader) {}
/** Downloader reload hook. */
public function onDownloaderReload(PHPCreeper $downloader) {}
/** Receives a string named $parser_reply; presumably a reply sent back by the parser. */
public function onDownloaderMessage(PHPCreeper $downloader, string $parser_reply) {}
/** Receives the task array; presumably called before a download begins. */
public function onBeforeDownload(PHPCreeper $downloader, array $task) {}
/** Receives the task array; presumably called as a download starts. */
public function onStartDownload(PHPCreeper $downloader, array $task) {}
/** Receives the download data and the task array; presumably called after a download completes. */
public function onAfterDownload(PHPCreeper $downloader, array $download_data, array $task) {}
/** Presumably called when no task is available. */
public function onTaskEmpty(PHPCreeper $downloader) {}
/**
 * Hook for the headless-browser page-open step. The body is empty in this
 * example, so the method implicitly returns null.
 *
 * NOTE(review): $browser and $page are untyped and their concrete classes are
 * not visible in this file; confirm them before relying on their API.
 *
 * @param PHPCreeper $downloader The downloader instance.
 * @param mixed      $browser    Browser handle (type not shown here).
 * @param mixed      $page       Page handle (type not shown here).
 * @param string     $url        URL being opened.
 */
public function onHeadlessBrowserOpenPage(PHPCreeper $downloader, $browser, $page, string $url)
{
// Return false to abort, string for HTML, array for options, etc.
}
}
Create Parser
File:
app/spider/TinywanParser.php
<?php
declare(strict_types=1);
namespace app\spider;
use Webman\PHPCreeper\Parser;
class TinywanParser extends Parser
{
// The hooks below are intentionally empty in this example. Descriptions are
// inferred from the method names only; confirm call timing against the
// PHPCreeper documentation.

/** Parser start hook. */
public function onParserStart($parser) {}
/** Parser stop hook. */
public function onParserStop($parser) {}
/** Parser reload hook. */
public function onParserReload($parser) {}
/** Receives a connection and download data; presumably called for each message sent to the parser. */
public function onParserMessage($parser, $connection, $download_data) {}
/**
 * Receives a URL and returns it unchanged.
 *
 * NOTE(review): presumably called for each URL the parser discovers, with the
 * return value deciding what is done with it; confirm against the PHPCreeper
 * documentation before adding filtering here.
 *
 * @param mixed  $parser The parser instance (untyped here).
 * @param string $url    The URL passed in.
 * @return string The same URL.
 */
public function onParserFindUrl($parser, string $url)
{
return $url;
}
/**
 * Hook that receives the extracted fields. This example performs no work:
 * it returns null whether or not $fields is empty.
 *
 * @param mixed $parser        The parser instance (untyped here).
 * @param mixed $download_data The download data (unused here).
 * @param mixed $fields        The extracted fields.
 */
public function onParserExtractField($parser, $download_data, $fields)
{
    // Nothing to handle when extraction produced no fields.
    if (empty($fields)) {
        return;
    }

    // Example: pprint($fields[$parser->task['rule_name']]);
}
}
Configuration
Custom Process Configuration
File:
config/plugin/blogdaren/webman-phpcreeper/process.php
<?php
return [
// Each process receives the spider configuration through its constructor.
// The include path is anchored to this file's directory with __DIR__ so it
// does not depend on include_path or on the directory the service is
// launched from.
'myproducer' => [
    'handler' => \app\spider\TinywanProducer::class,
    'listen' => '',
    'count' => 1,
    'constructor' => [
        'config' => include __DIR__ . '/spider/global.php',
    ],
],
'mydownloader' => [
    'handler' => \app\spider\TinywanDownloader::class,
    'listen' => '',
    'count' => 1,
    'constructor' => [
        'config' => include __DIR__ . '/spider/global.php',
    ],
],
// Only the parser listens on a socket; the downloader connects to this
// port in TinywanDownloader::onDownloaderStart().
'myparser' => [
    'handler' => \app\spider\TinywanParser::class,
    'listen' => 'websocket://0.0.0.0:8888',
    'count' => 1,
    'constructor' => [
        'config' => include __DIR__ . '/spider/global.php',
    ],
],
];
Redis Configuration
File:
config/plugin/blogdaren/webman-phpcreeper/spider/database.php
<?php
return [
// Redis connection settings for the crawler.
'redis' => [
// NOTE(review): looks like a Docker service name; confirm it resolves in
// your environment or replace it with your Redis host.
'host' => 'dnmp-redis',
// NOTE(review): Redis listens on 6379 by default; 63789 may be a custom
// port mapping or a typo -- confirm.
'port' => 63789,
'database' => '0',
// NOTE(review): 'auth' is false while 'pass' is set; presumably the password
// is only used when 'auth' is true -- verify.
'auth' => false,
'pass' => 'guest',
'prefix' => 'PHPCreeper',
// NOTE(review): units are not shown here (presumably seconds), and 0
// presumably disables the read/write timeout -- confirm.
'connection_timeout' => 5,
'read_write_timeout' => 0,
],
];
Running in multi‑worker mode requires a reachable Redis service; otherwise a connection‑refused error will be reported.
Start Service
After confirming the configuration, start the Webman service (e.g., `php start.php start`, or whatever start command your project defines).
Sample output (weather data) after crawling:
Array
(
[0] => Array
(
[日子] => 2024-10-05 | 5日(今天)
[天气] => 多云转阴
[温度] => 21/13℃
)
[1] => Array
(
[日子] => 2024-10-05 | 6日(明天)
[天气] => 小雨
[温度] => 18/10℃
)
// ... more days ...
)
Signed-in readers can open the original source through BestHub's protected redirect.
This article has been distilled and summarized from source material, then republished for learning and reference. If you believe it infringes your rights, please contact us and we will review it promptly.
Open Source Tech Hub
Sharing cutting-edge internet technologies and practical AI resources.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.
