PHP抓取网页获取特定信息
发布时间
阅读量:
阅读量
阅读目录
- 需求
- 编码实现
需求
比如,抓取博客网首页文章 标题 和 作者

编码实现
<?php
/** * 抓取网页
*/
function catch_html($url) {
$urlR = parse_url($url);
$domain = $urlR['scheme'].'://'.$urlR['host'].'/';
$headers=array(
"Accept: application/json, text/javascript, */*; q=0.01",
"Content-Type: application/x-www-form-urlencoded; charset=UTF-8",
"Origin: {$domain}",
"Referer: {$url}",
"User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
);
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);//设置抓取的url
curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);//指定头部参数
curl_setopt($curl, CURLOPT_HEADER, 0);//设置头文件的信息作为数据流输出
//设置获取的信息以文件流的形式返回,而不是直接输出。
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_ACCEPT_ENCODING, "gzip,deflate");
//重要!
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE); // https请求 不验证证书和hosts
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE);
curl_setopt($curl,CURLOPT_USERAGENT,"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"); //模拟浏览器代理
$data = curl_exec($curl);//执行命令
curl_close($curl);//关闭URL请求
$data = mb_convert_encoding($data, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');//使用该函数对结果进行转码
return $data;
}
/** * 正则匹配,获取标签内容
*/
function get_tag_data($html,$tag,$attr,$value=''){
$regex = $value ? "/<$tag.*?$attr=\"$value\".*?>(.*?)<\/$tag>/is" : "/<$tag.*?$attr=\".*?$value.*?\".*?>(.*?)<\/$tag>/is";
preg_match_all($regex,$html,$matches,PREG_PATTERN_ORDER);
return $matches[1];//返回值为数组 ,查找到的标签内的内容
}
$url = 'https://www.cnblogs.com/';
$html = catch_html($url);
//echo $html;die;
//匹配标题:<a class="post-item-title" href="https://www.cnblogs.com/lyl-star/p/15410719.html" target="_blank">数据库系统之实体完整性约束</a>
$titles = get_tag_data($html,'a','class','post-item-title');
//匹配作者:<a href="https://www.cnblogs.com/lyl-star/" class="post-item-author"><span>lyl-star</span></a>
$authors = get_tag_data($html,'a','class','post-item-author');
//['<span>lyl-star</span>'],
foreach($authors as &$author){
$author = str_replace('<span>','',$author);
$author = str_replace('</span>','',$author);
}
$list = [
'titles' => $titles,
'authors' => $authors,
];
echo '<pre>'; print_r($list); die;
AI助手


全部评论 (0)
还没有任何评论哟~
