java爬虫爬取高考院校信息以及图片
发布时间
阅读量:
阅读量
javaAPI爬取全国高校录取情况以及院校图片
首先,为了能够爬取全国所有高校的录取情况以及其他信息,需要做到以下步骤:
1找到要爬取的所有大学的url
2开始爬取院校的cid、院校主页url、cname存入数据库,便于后面进一步利用这些关系爬取其他信息
3利用数据库中cid拼接url的方式爬取院校录取批次信息
4利用数据库中cid拼接url找到所有图片再发送httpclient下载图片
爬取全国院校以及cid存入数据库
该页面是全国高校的主页:https://www.baokaodaxue.com/bkdx/search/college
进入该页面之后可以利用谷歌的开发者工具,找到这个块标签,然后提取出其中想要的元素
主要代码如下
//线程池
private static final ExecutorService executorService = Executors.newCachedThreadPool();
//阻塞队列,用于存放商品盒子li
private static final BlockingQueue<Element> queueLi = new ArrayBlockingQueue<Element>(100);
//阻塞队列,用于存放university
private static final BlockingQueue<University> queuePhone = new ArrayBlockingQueue<University>(100);
//爬取的首页
private String url ="https://www.baokaodaxue.com/bkdx/search/college?dq=&type=&bz=&kp=&keywd=&page=1";
//开始爬取
public void start() throws IOException {
final String sql = "insert into university(name,url,cid) values(?,?,?)";
for (int i = 0; i < 10; i++) {
executorService.execute(new Runnable() {
public void run() {
QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
while (true) {
try {
University university = queuePhone.take();
queryRunner.update(sql,university.getName(),university.getUrl(),university.getCid());
} catch (InterruptedException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
});
}
//创建10个消费者(解析队列中存放的li)
for (int i = 0; i < 10; i++) {
executorService.execute(new Runnable() {
public void run() {
//从队列中取出li进行解析
while (true) {
Element li = null;
try {
li = queueLi.take();
} catch (InterruptedException e) {
e.printStackTrace();
}
University university = parseLi(li);
if (university != null) {
queuePhone.offer(university);
}
}
}
});
}
//获取首页
CloseableHttpResponse indexRes = sendGet(url);
//解析结果
parseIndex(indexRes, 1);
}
//发送get请求,获取响应结果
public CloseableHttpResponse sendGet(String url) throws IOException {
//创建httpClient客户端
CloseableHttpClient httpClient = HttpClients.createDefault();
//创建请求对象,发送请求
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36");
CloseableHttpResponse response = httpClient.execute(httpGet);
return response;
}
//解析首页
public void parseIndex(CloseableHttpResponse indexRes, int page) throws IOException {
System.out.println("---第" + page + "页抓取完毕---");
//得到document对象
String indexHtml = EntityUtils.toString(indexRes.getEntity(), "UTF-8");
Document document = Jsoup.parse(indexHtml);
Elements lis= document.getElementsByClass("college-name");
//取出每个盒子置于队列中
for (Element li : lis) {
queueLi.offer(li);
}
if (++page <= 137) {
int index = 1+page-1;
String url ="https://www.baokaodaxue.com/bkdx/search/college?dq=&type=&bz=&kp=&keywd=&page=" + index ;
CloseableHttpResponse nextRes = sendGet(url);
parseIndex(nextRes, page);
}
}
//解析每个盒子,封装到phone并返回
public University parseLi(Element li) {
try {
University university = new University();
String url = li.select("a").attr("href");
String name = li.text();
// System.out.println(url);
int start=url.indexOf("=");
int end=url.length();
String cid=url.substring(start+1,end);
university.setName(name);
university.setUrl(url);
university.setCid(cid);
return university;
} catch (Exception e) {
//System.out.println("错误数据");
}
return null;
AI写代码
爬取各个院校的录取批次情况
根据之前的代码已经将 name、url、cid 存入数据库中了,此时只需要从数据库中取出每一个 cid,拼接 url 进行解析,取出想要的录取批次情况,稍加改造后代码如下
import utils.HttpClientUtils;
import utils.JdbcUtils;
import utils.MysqlUtils;
import java.beans.PropertyVetoException;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class SpiderBatach{
private static final ExecutorService executorService = Executors.newCachedThreadPool();
private static final BlockingQueue<UniversityInfo> queuePhone = new ArrayBlockingQueue<UniversityInfo>(100);
public void start() throws IOException, SQLException, PropertyVetoException {
String sql1 = "select cid from university";
QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
List<Map<String, Object>> list = queryRunner.query(sql1, new MapListHandler());
for (Map<String, Object> stringObjectMap : list) {
String res = stringObjectMap.toString();
int start = res.indexOf("=");
int end=res.indexOf("}");
String result = res.substring(start + 1, end);
String url1="https://www.baokaodaxue.com/bkdx/college/fenshu?wl=1&cid="+result+"&type=0&special=0&limit=3";
String json = HttpClientUtils.doGet(url1);
parseJson(json,result);
}
}
public void parseJson(String json,String result) throws PropertyVetoException {
List<UniversityInfo> universityList = new ArrayList<>();
Gson gson = new Gson();
Map map = gson.fromJson(json, Map.class);
ArrayList<Map> dataList = (ArrayList<Map>) map.get("extraData");
for (Map obj : dataList) {
String cid = result;
String year = (String) obj.get("year");
String batch = (String) obj.get("pici");
String up = (String) obj.get("gaofen");
String low = (String) obj.get("difen");
String avg = (String) obj.get("pjfen");
Double difvalue = (Double) obj.get("xc");
String lowlevel = (String) obj.get("zdwc");
UniversityInfo universityInfo = new UniversityInfo();
universityInfo.setCid(cid);
universityInfo.setYear(year);
universityInfo.setBatch(batch);
universityInfo.setUp(up);
universityInfo.setLow(low);
universityInfo.setAvg(avg);
universityInfo.setDifvalue(difvalue);
universityInfo.setLowlevel(lowlevel);
universityList.add(universityInfo);
}
MysqlUtils.SaveData(universityList);
}
}
AI写代码
爬取各个院校的校园风光图片
跟爬取批次相似,也是通过读取数据库中的cid,拼接url的方式,访问到各个院校的校园风光页面,找到图片所在的li,进行遍历写到本地磁盘,再将保存信息写入数据库,详细代码如下
import bean.UniversityPicture;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.MapListHandler;
import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import utils.HttpClientUtils;
import utils.JdbcUtils;
import utils.Picture2Mysql;
import java.beans.PropertyVetoException;
import java.io.*;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class SpiderPhoto {
private static final ExecutorService executorService = Executors.newCachedThreadPool();
public void start() throws IOException, SQLException, PropertyVetoException {
int x=0;
CloseableHttpClient httpclient = HttpClients.createDefault();
String sql1 = "select name,cid from university";
QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
List<Map<String, Object>> list = queryRunner.query(sql1, new MapListHandler());
for (Map<String, Object> stringObjectMap : list) {
x+=1;
System.out.println("第"+x+"所学校");
String name= (String) stringObjectMap.get("name");
String cid= (String) stringObjectMap.get("cid");
String url="https://www.baokaodaxue.com/bkdx/college/photos?cid="+cid;
String html = HttpClientUtils.doGet(url);
Document document = Jsoup.parse(html);
Elements elements = document.select(".view-picture-wrap>ul>li");
for(int i=1;i<elements.size();i++){
UniversityPicture pic=new UniversityPicture();
Element li = elements.get(i);
String url1 = li.select("a").attr("href");
HttpGet PicturehttpGet = new HttpGet(url1);
CloseableHttpResponse pictureResponse = httpclient.execute(PicturehttpGet);
HttpEntity pictureEntity = pictureResponse.getEntity();
InputStream inputStream = pictureEntity.getContent();
// 使用 common-io 下载图片到本地,注意图片名不能重复 ✔
FileUtils.copyToFile(inputStream, new File("F://img//" +cid+"//"+ cid + "_" + i + ".jpg"));
pictureResponse.close(); // pictureResponse关闭
pic.setCid(cid);
pic.setPicture("F://img//" +cid+"//"+ cid + "_" + i + ".jpg");
Picture2Mysql.SaveData(pic);
}
}
httpclient.close(); // httpClient关闭
}
}
AI写代码
以下是数据库的操作
public static void SaveData(List<UniversityInfo> universityList) throws PropertyVetoException {
ComboPooledDataSource comboPooledDataSource = new ComboPooledDataSource();
comboPooledDataSource.setDriverClass("com.mysql.jdbc.Driver");
comboPooledDataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf8");
comboPooledDataSource.setUser("root");
comboPooledDataSource.setPassword("123456");
JdbcTemplate jdbcTemplate = new JdbcTemplate(comboPooledDataSource);
String sql = "insert into universityinfo(cid,year,batch,up,low,avg,difvalue,lowlevel) values(?,?,?,?,?,?,?,?)";
if (universityList != null && universityList.size() > 0) {
for (int i = 0; i < universityList.size(); i++) {
UniversityInfo info = universityList.get(i);
// 保存数据库
jdbcTemplate.update(sql,info.getCid(),info.getYear(),info.getUp(),info.getLow(),info.getAvg(),info.getDifvalue(),info.getLowlevel());
}
}
}
}
public class JdbcUtils {
private static String driver;
private static String url;
private static String username;
private static String password;
private static DruidDataSource dataSource = new DruidDataSource();;
static {
try {
InputStream is = new FileInputStream("F:\ spiderWorkspace\ Spider\ src\ main\ resources\ jdbcconfig.properties");
Properties properties = new Properties();
properties.load(is);
driver = properties.getProperty("jdbc.driver");
url = properties.getProperty("jdbc.url");
username = properties.getProperty("jdbc.username");
password = properties.getProperty("jdbc.password");
} catch (FileNotFoundException e) {
System.out.println("配置文件不存在");
System.exit(0);
} catch (IOException e) {
System.out.println("配置文件有误");
System.exit(0);
}
}
public static DataSource getDataSource(){
dataSource.setDriverClassName(driver);
dataSource.setUrl(url);
dataSource.setUsername(username);
dataSource.setPassword(password);
return dataSource;
}
}
public class Picture2Mysql {
public static void SaveData(UniversityPicture pic) throws PropertyVetoException {
ComboPooledDataSource comboPooledDataSource = new ComboPooledDataSource();
comboPooledDataSource.setDriverClass("com.mysql.jdbc.Driver");
comboPooledDataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf8");
comboPooledDataSource.setUser("root");
comboPooledDataSource.setPassword("123456");
JdbcTemplate jdbcTemplate = new JdbcTemplate(comboPooledDataSource);
String sql = "insert into picture(cid,picture) values(?,?)";
// 保存数据库
jdbcTemplate.update(sql,pic.getCid(),pic.getPicture());
comboPooledDataSource.close();
}
}
AI写代码
这样就可以实现对全国高校录取批次的爬取以及院校内的图片爬取,可以在git上查看详细代码:java爬虫
全部评论 (0)
还没有任何评论哟~
