技术饭
PHP数据采集框架QueryList,使用案例~~~
QueryList不依赖任何框架和架构,它可以单独使用也可以引入到任意的PHP开发框架中去使用,如:Laravel、ThinkPHP;你可以使用它来构建简单的采集系统,也可以用它来构建高可用的分布式采集系统。它提供了丰富的基于CSS选择器的页面抽取API,完全模块化的设计,拥有强大的可扩展性。
URL:
http://www.huadongxww.net/list/news
采集规则:
{
"title": ["#content_list li h3 a","text"],
"pic": ["#content_list li .viewimg img","src"],
"desc": ["#content_list li p","text"],
"source": ["#content_list li .spanimg2","text"],
"publish_time": ["#content_list li .timeago","text"]
}
切片选择器:
例如:.posts>.post,可选
/**
 * [getcaijidata] Scrape a page with QueryList and return the cleaned-up rows.
 *
 * Only POST requests are processed; any other request falls through and
 * returns nothing. POST fields read by this action:
 *   - url   (required) absolute http/https URL of the page to scrape
 *   - rules (required) JSON object of QueryList rules, e.g.
 *                      {"title": ["#content_list li h3 a", "text"]}
 *   - range (optional) slice selector handed to QueryList, defaults to ''
 *
 * Per-row post-processing:
 *   - pic          relative image links are resolved against the page URL
 *   - source       the "来源" label (with optional colon) is stripped
 *   - publish_time the "发布时间"/"时间" labels are stripped and the value is
 *                  reformatted as "Y-m-d H:i:s" when it can be parsed
 *
 * @return mixed json() response with code 0 and msg on validation failure;
 *               on success an array with code 1, msg, data (var_export dump
 *               of the rows) and ysdata (encode()d JSON of the rows)
 */
public function getcaijidata() {
    // Only form submissions are handled.
    if (!$this->request->isPost()) {
        return;
    }

    $params = $this->request->post();

    // The target URL is required.
    if (empty($params['url']) || !is_string($params['url'])) {
        return json(['code' => 0, 'msg' => '请填写URL']);
    }

    // The URL comes straight from the client: insist on an absolute http(s)
    // address so the scheme/host lookups below are always defined and other
    // schemes are never fetched.
    $url = trim($params['url']);
    $urlParts = parse_url($url);
    if (
        $urlParts === false
        || empty($urlParts['host'])
        || !isset($urlParts['scheme'])
        || !in_array(strtolower($urlParts['scheme']), ['http', 'https'], true)
    ) {
        return json(['code' => 0, 'msg' => 'URL格式不正确']);
    }

    // The scraping rules are required.
    if (empty($params['rules']) || !is_string($params['rules'])) {
        return json(['code' => 0, 'msg' => '请填写规则']);
    }

    // The rules arrive HTML-escaped; decode them and make sure they really are
    // a JSON object/array before handing them to QueryList.
    $rules = json_decode(htmlspecialchars_decode($params['rules']), true);
    if (!is_array($rules)) {
        return json(['code' => 0, 'msg' => '规则格式不正确']);
    }

    // The slice selector is optional.
    $range = empty($params['range']) ? '' : $params['range'];

    // Fetch once; only fetch again (with explicit charset conversion) when the
    // page mentions GB2312.
    $query = QueryList::Query($url, $rules, $range);
    if (stripos((string) $query->html, 'gb2312') !== false) {
        $query = QueryList::Query($url, $rules, $range, 'UTF-8', 'gb2312', true);
    }
    $data = $query->data;
    if (!is_array($data)) {
        $data = [];
    }

    foreach ($data as $key => $value) {
        // Image: turn relative links into absolute ones.
        if (!empty($value['pic'])) {
            $data[$key]['pic'] = $this->absoluteCaijiUrl($value['pic'], $urlParts);
        }

        // Source: drop the leading label.
        if (!empty($value['source'])) {
            $data[$key]['source'] = $this->stripCaijiLabels($value['source'], ['来源']);
        }

        // Publish time: drop the label, then normalise the format. When the
        // remaining text is not a parseable date, keep it as scraped instead
        // of emitting a bogus epoch date.
        if (!empty($value['publish_time'])) {
            $rawTime = $this->stripCaijiLabels($value['publish_time'], ['发布时间', '时间']);
            $timestamp = strtotime($rawTime);
            $data[$key]['publish_time'] = $timestamp === false
                ? $rawTime
                : date('Y-m-d H:i:s', $timestamp);
        }
    }

    // NOTE(review): the success path returns a plain array while the error
    // paths return json(); kept as-is to preserve the return shape — confirm
    // whether this should be wrapped in json() as well.
    return [
        'code'   => 1,
        'msg'    => '获取数据成功',
        'data'   => var_export($data, true),
        'ysdata' => encode(json_encode($data)),
    ];
}

/**
 * Resolve a scraped link against the URL of the page it was found on.
 *
 * Links that already carry a scheme are returned untouched, protocol-relative
 * links ("//host/x") inherit the page scheme, root-relative links ("/x") are
 * appended to the page origin (including a non-default port), and any other
 * link is appended to the directory of the page path. Dot segments ("../")
 * are not collapsed.
 *
 * @param string $link     link as scraped from the page
 * @param array  $urlParts parse_url() result of the page URL; must contain scheme and host
 * @return string absolute URL, or the trimmed input when it is empty or already absolute
 */
private function absoluteCaijiUrl($link, array $urlParts)
{
    $link = trim($link);
    if ($link === '' || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link)) {
        return $link;
    }

    if (strpos($link, '//') === 0) {
        return $urlParts['scheme'] . ':' . $link;
    }

    $origin = $urlParts['scheme'] . '://' . $urlParts['host'];
    if (isset($urlParts['port'])) {
        $origin .= ':' . $urlParts['port'];
    }

    if ($link[0] === '/') {
        return $origin . $link;
    }

    // Relative to the directory of the page, e.g. /list/news -> /list/
    $path = isset($urlParts['path']) ? $urlParts['path'] : '/';
    $slash = strrpos($path, '/');
    $dir = $slash === false ? '/' : substr($path, 0, $slash + 1);

    return $origin . $dir . $link;
}

/**
 * Remove label words, together with a directly following colon, from a scraped value.
 *
 * Both the ASCII colon and the full-width colon are handled. The original code
 * stripped the ASCII-colon form twice in a row, so the full-width variant is
 * presumed to be what the second call was meant for — TODO confirm.
 *
 * @param string   $text   scraped value
 * @param string[] $labels labels to remove, longest first when one contains another
 * @return string value without the labels, trimmed of surrounding whitespace
 */
private function stripCaijiLabels($text, array $labels)
{
    foreach ($labels as $label) {
        $text = str_replace([$label . ':', $label . ':', $label], '', $text);
    }

    return trim($text);
}
文明上网理性发言!