Advertisement

java爬虫爬取高考院校信息以及图片

阅读量:

javaAPI爬取全国高校录取情况以及院校图片

首先,为了能够爬取全国所有高校的录取情况以及其他信息,需要做到以下步骤:
1找到要爬取的所有大学的url
2开始爬取院校的cid、院校主页url、cname存入数据库,便于后面进一步利用这些关系爬取其他信息
3利用数据库中cid拼接url的方式爬取院校录取批次信息
4利用数据库中cid拼接url找到所有图片再发送httpclient下载图片

爬取全国院校以及cid存入数据库

该页面是全国高校的主页:https://www.baokaodaxue.com/bkdx/search/college
进入该页面之后可以利用谷歌的开发者工具,找到这个块标签,然后提取出其中想要的元素
主要代码如下

复制代码
     //Shared thread pool running the persister and parser workers
    private static final ExecutorService executorService = Executors.newCachedThreadPool();
    //Blocking queue holding the raw college <li> elements scraped from the listing pages
    private static final BlockingQueue<Element> queueLi = new ArrayBlockingQueue<Element>(100);
    //Blocking queue holding parsed University objects waiting to be written to the database
    private static final BlockingQueue<University> queuePhone = new ArrayBlockingQueue<University>(100);
    //Entry point of the crawl: page 1 of the national college listing
    private String url ="https://www.baokaodaxue.com/bkdx/search/college?dq=&type=&bz=&kp=&keywd=&page=1";
    //Start the crawl: spawn 10 persister workers and 10 parser workers, then
    //fetch the first listing page and walk through all pages.
    public void start() throws IOException {

        final String sql = "insert into university(name,url,cid) values(?,?,?)";
        //10 persister workers: each takes a University off queuePhone and inserts it.
        for (int i = 0; i < 10; i++) {
            executorService.execute(new Runnable() {
                public void run() {
                    QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
                    while (true) {
                        try {
                            University university = queuePhone.take();
                            queryRunner.update(sql, university.getName(), university.getUrl(), university.getCid());
                        } catch (InterruptedException e) {
                            //FIX: restore the interrupt flag and stop this worker
                            //instead of swallowing the interrupt and looping forever.
                            Thread.currentThread().interrupt();
                            return;
                        } catch (SQLException e) {
                            //A single failed insert should not kill the worker; log and continue.
                            e.printStackTrace();
                        }
                    }
                }
            });
        }
        //10 parser workers: each takes an <li> off queueLi, parses it into a University
        //and hands it to the persister queue.
        for (int i = 0; i < 10; i++) {
            executorService.execute(new Runnable() {
                public void run() {
                    while (true) {
                        Element li;
                        try {
                            li = queueLi.take();
                        } catch (InterruptedException e) {
                            //FIX: same as above — re-interrupt and exit cleanly.
                            Thread.currentThread().interrupt();
                            return;
                        }
                        University university = parseLi(li);
                        if (university != null) {
                            //offer() drops silently when the queue is full — best-effort by design.
                            queuePhone.offer(university);
                        }
                    }
                }
            });
        }
        //Fetch the first listing page and start the page-by-page parse.
        CloseableHttpResponse indexRes = sendGet(url);
        parseIndex(indexRes, 1);
    }
    //Send an HTTP GET to the given url and return the raw (still-open) response.
    //NOTE(review): a brand-new CloseableHttpClient is created on every call and is
    //never closed — this leaks connections; consider sharing one static client.
    //Callers are responsible for consuming and closing the returned response.
    public CloseableHttpResponse sendGet(String url) throws IOException {
        //Create the httpClient
        CloseableHttpClient httpClient = HttpClients.createDefault();
        //Build the request; the desktop-Chrome User-Agent avoids naive bot blocking
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36");
        CloseableHttpResponse response = httpClient.execute(httpGet);
        return response;
    }
    //Parse a listing page: queue every college element found on it, then fetch and
    //parse the remaining pages (the site has 137 listing pages — hard-coded bound).
    //FIX: replaced 137-deep recursion with a loop, removed the redundant
    //"index = 1+page-1" computation, and closed each response to stop leaking connections.
    public void parseIndex(CloseableHttpResponse indexRes, int page) throws IOException {
        CloseableHttpResponse res = indexRes;
        while (true) {
            System.out.println("---第" + page + "页抓取完毕---");
            //Read the body, then release the connection.
            String indexHtml = EntityUtils.toString(res.getEntity(), "UTF-8");
            res.close();
            Document document = Jsoup.parse(indexHtml);
            //Each college entry on the page carries the class "college-name".
            Elements lis = document.getElementsByClass("college-name");
            for (Element li : lis) {
                //offer() drops silently when the queue is full — best-effort, as before.
                queueLi.offer(li);
            }
            if (++page > 137) {
                return;
            }
            String url = "https://www.baokaodaxue.com/bkdx/search/college?dq=&type=&bz=&kp=&keywd=&page=" + page;
            res = sendGet(url);
        }
    }
    //Parse one college <li> element into a University (name, detail-page url, cid).
    //Returns null when the element does not have the expected shape — bad rows are skipped.
    public University parseLi(Element li) {
        try {
            University university = new University();
            String url = li.select("a").attr("href");
            String name = li.text();
            //  System.out.println(url);
            //The cid is everything after the first '=' in the href query string
            //(assumes the href looks like "...?cid=NNN" — TODO confirm against the site).
            int start=url.indexOf("=");
            int end=url.length();
            String cid=url.substring(start+1,end);
    
            university.setName(name);
            university.setUrl(url);
            university.setCid(cid);
    
            return university;
        } catch (Exception e) {
            //Deliberate best-effort: malformed rows are dropped silently.
        }
        return null;
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码

爬取各个院校的录取批次情况

根据之前的代码已经将name、url、cid存入数据库中了,此时只需要从数据库中取出每一个cid,拼接url进行解析,取出想要的录取批次情况,稍稍改造后代码如下

复制代码
    import utils.HttpClientUtils;
    import utils.JdbcUtils;
    import utils.MysqlUtils;
    import java.beans.PropertyVetoException;
    import java.io.IOException;
    import java.sql.SQLException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    public class SpiderBatach{
    //Shared worker pool (currently unused in this class; kept for interface compatibility)
    private static final ExecutorService executorService = Executors.newCachedThreadPool();
    //Queue of parsed admission records (currently unused in this class)
    private static final BlockingQueue<UniversityInfo> queuePhone = new ArrayBlockingQueue<UniversityInfo>(100);
    /**
     * For every cid stored in the university table, fetch that college's
     * admission-score JSON and persist the parsed records.
     */
    public void start() throws IOException, SQLException, PropertyVetoException {
        String sql1 = "select cid from university";
        QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
        List<Map<String, Object>> list = queryRunner.query(sql1, new MapListHandler());
        for (Map<String, Object> stringObjectMap : list) {
            //FIX: read the cid column directly. The old code substring-parsed
            //stringObjectMap.toString() ("{cid=...}"), which breaks as soon as the
            //value contains '=' or '}' and depends on Map's toString format.
            String result = String.valueOf(stringObjectMap.get("cid"));
            String url1="https://www.baokaodaxue.com/bkdx/college/fenshu?wl=1&cid="+result+"&type=0&special=0&limit=3";
            String json = HttpClientUtils.doGet(url1);
            parseJson(json,result);
        }
    }
    /**
     * Parse the score JSON for one college and save every admission record.
     * @param json   raw JSON response; records live under the "extraData" array
     * @param result the cid the records belong to
     */
    public void parseJson(String json,String result) throws PropertyVetoException {
        List<UniversityInfo> universityList = new ArrayList<>();
        Gson gson = new Gson();
        Map map = gson.fromJson(json, Map.class);
        ArrayList<Map> dataList = (ArrayList<Map>) map.get("extraData");
        //FIX: guard against colleges with no published score data (previously NPE'd).
        if (dataList == null) {
            return;
        }
        for (Map obj : dataList) {
            String cid = result;
            String year = (String) obj.get("year");
            String batch = (String) obj.get("pici");      //admission batch
            String up = (String) obj.get("gaofen");       //highest score
            String low = (String) obj.get("difen");       //lowest score
            String avg = (String) obj.get("pjfen");       //average score
            Double difvalue = (Double) obj.get("xc");     //score gap
            String lowlevel = (String) obj.get("zdwc");   //lowest rank
            UniversityInfo universityInfo = new UniversityInfo();
            universityInfo.setCid(cid);
            universityInfo.setYear(year);
            universityInfo.setBatch(batch);
            universityInfo.setUp(up);
            universityInfo.setLow(low);
            universityInfo.setAvg(avg);
            universityInfo.setDifvalue(difvalue);
            universityInfo.setLowlevel(lowlevel);
            universityList.add(universityInfo);
        }
        MysqlUtils.SaveData(universityList);
    }
    }
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码

爬取各个院校的校园风光图片

跟爬取批次相似,也是通过读取数据库中的cid,拼接url的方式,访问到各个院校的校园风光页面,找到图片所在的li,进行遍历写到本地磁盘,再将保存信息写入数据库,详细代码如下

复制代码
    import bean.UniversityPicture;
    import org.apache.commons.dbutils.QueryRunner;
    import org.apache.commons.dbutils.handlers.MapListHandler;
    import org.apache.commons.io.FileUtils;
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import utils.HttpClientUtils;
    import utils.JdbcUtils;
    import utils.Picture2Mysql;
    import java.beans.PropertyVetoException;
    import java.io.*;
    import java.sql.SQLException;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    
    public class SpiderPhoto {

    //Shared worker pool (currently unused in this class; kept for interface compatibility)
    private static final ExecutorService executorService = Executors.newCachedThreadPool();
    /**
     * For every university in the database, fetch its campus-photo page, download
     * each photo to F://img//&lt;cid&gt;// and record the saved path in the database.
     * FIX: the http client, each photo response and each input stream are now
     * closed via try-with-resources even when a download throws.
     */
    public void start() throws IOException, SQLException, PropertyVetoException {
        int x = 0;
        String sql1 = "select name,cid from university";
        QueryRunner queryRunner = new QueryRunner(JdbcUtils.getDataSource());
        List<Map<String, Object>> list = queryRunner.query(sql1, new MapListHandler());

        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            for (Map<String, Object> stringObjectMap : list) {
                x += 1;
                System.out.println("第" + x + "所学校");
                String cid = (String) stringObjectMap.get("cid");
                String url = "https://www.baokaodaxue.com/bkdx/college/photos?cid=" + cid;
                String html = HttpClientUtils.doGet(url);
                Document document = Jsoup.parse(html);
                Elements elements = document.select(".view-picture-wrap>ul>li");

                //Index starts at 1: the first <li> is skipped
                //(presumably not a photo entry — TODO confirm against the page markup).
                for (int i = 1; i < elements.size(); i++) {
                    UniversityPicture pic = new UniversityPicture();
                    Element li = elements.get(i);
                    String url1 = li.select("a").attr("href");

                    //cid + index keeps file names unique per school
                    String path = "F://img//" + cid + "//" + cid + "_" + i + ".jpg";
                    HttpGet pictureHttpGet = new HttpGet(url1);
                    try (CloseableHttpResponse pictureResponse = httpclient.execute(pictureHttpGet)) {
                        HttpEntity pictureEntity = pictureResponse.getEntity();
                        try (InputStream inputStream = pictureEntity.getContent()) {
                            //commons-io streams the image straight to disk
                            FileUtils.copyToFile(inputStream, new File(path));
                        }
                    }
                    pic.setCid(cid);
                    pic.setPicture(path);
                    Picture2Mysql.SaveData(pic);
                }
            }
        }
    }
    }
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码

以下是数据库的操作

复制代码
      /**
       * Persist a batch of admission records into the universityinfo table.
       * FIX: the insert has 8 placeholders but only 7 arguments were passed —
       * info.getBatch() was missing, so every insert failed; it is now included.
       * NOTE(review): a new connection pool is built per call; consider hoisting
       * it to a static field as Picture2Mysql could do.
       * @param universityList records to save; null or empty lists are a no-op
       */
      public static void SaveData(List<UniversityInfo> universityList) throws PropertyVetoException {
            ComboPooledDataSource comboPooledDataSource = new ComboPooledDataSource();
            comboPooledDataSource.setDriverClass("com.mysql.jdbc.Driver");
            comboPooledDataSource.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf8");
            comboPooledDataSource.setUser("root");
            comboPooledDataSource.setPassword("123456");
            JdbcTemplate jdbcTemplate = new JdbcTemplate(comboPooledDataSource);
            String sql = "insert into universityinfo(cid,year,batch,up,low,avg,difvalue,lowlevel) values(?,?,?,?,?,?,?,?)";
            if (universityList != null && !universityList.isEmpty()) {
                for (UniversityInfo info : universityList) {
                    //Save one record; argument order matches the column list above.
                    jdbcTemplate.update(sql, info.getCid(), info.getYear(), info.getBatch(), info.getUp(),
                            info.getLow(), info.getAvg(), info.getDifvalue(), info.getLowlevel());
                }
            }
        }
    }
    
    
    
    public class JdbcUtils {
    private static String driver;
    private static String url;
    private static String username;
    private static String password;
    private static DruidDataSource dataSource = new DruidDataSource();;
    static {
        try {
            InputStream is = new FileInputStream("F:\ spiderWorkspace\ Spider\ src\ main\ resources\ jdbcconfig.properties");
            Properties properties = new Properties();
            properties.load(is);
            driver = properties.getProperty("jdbc.driver");
            url = properties.getProperty("jdbc.url");
            username = properties.getProperty("jdbc.username");
            password = properties.getProperty("jdbc.password");
        } catch (FileNotFoundException e) {
            System.out.println("配置文件不存在");
            System.exit(0);
        } catch (IOException e) {
            System.out.println("配置文件有误");
            System.exit(0);
        }
    }
    public static DataSource getDataSource(){
        dataSource.setDriverClassName(driver);
        dataSource.setUrl(url);
        dataSource.setUsername(username);
        dataSource.setPassword(password);
        return dataSource;
    }
    }
    
    
    
    /**
     * Persists one picture record per call.
     * FIX: the original built a brand-new c3p0 connection pool for every insert
     * and closed it immediately afterwards — the pool is now created once and shared.
     */
    public class Picture2Mysql {

    private static final ComboPooledDataSource DATA_SOURCE = new ComboPooledDataSource();
    private static final JdbcTemplate JDBC_TEMPLATE;
    static {
        try {
            DATA_SOURCE.setDriverClass("com.mysql.jdbc.Driver");
        } catch (PropertyVetoException e) {
            throw new ExceptionInInitializerError(e);
        }
        DATA_SOURCE.setJdbcUrl("jdbc:mysql://localhost:3306/spider?characterEncoding=utf8");
        DATA_SOURCE.setUser("root");
        DATA_SOURCE.setPassword("123456");
        JDBC_TEMPLATE = new JdbcTemplate(DATA_SOURCE);
    }

    /**
     * Insert one picture row (cid, local file path).
     * @throws PropertyVetoException kept for caller compatibility (no longer thrown here)
     */
    public static void SaveData(UniversityPicture pic) throws PropertyVetoException {
        String sql = "insert into picture(cid,picture) values(?,?)";
        JDBC_TEMPLATE.update(sql, pic.getCid(), pic.getPicture());
    }

    }
    
    
    
    
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
    AI写代码

这样就可以实现对全国高校录取批次的爬取以及院校内的图片爬取,可以在git上查看详细代码:java爬虫

全部评论 (0)

还没有任何评论哟~