Advertisement

java 爬虫 腾讯新闻_Java代码---实现爬取腾讯新闻

阅读量:

环境准备:

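<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
    <version>1.1.21</version>
</dependency>

<dependency>
    <groupId>com.google.code.gson</groupId>
    <artifactId>gson</artifactId>
    <version>2.8.5</version>
</dependency>

<dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-jdbc</artifactId>
    <version>5.2.2.RELEASE</version>
</dependency>

<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.47</version>
</dependency>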

定义pojo接收

// Serialization version identifier for this POJO.
private static final long serialVersionUID = 1L;

// Row identifier. NOTE(review): presumably generated by the database, since the crawler never sets it; confirm.
private int id;

// Headline; the crawler fills it from the feed entry's "title" value.
private String title;

// Short summary; the crawler fills it from the feed entry's "intro" value.
private String intro;

// Link to the article; the crawler fills it from the feed entry's "url" value.
private String url;

// Publisher name; the crawler fills it from the feed entry's "source" value.
private String source;

// Publication timestamp; the crawler derives it from the feed entry's "publish_time" value.
private Date publishTime;

代码爬取数据:

// Shared JdbcTemplate; stays null until main assigns it after configuring the data source.
static JdbcTemplate jdbcTemplate = null;

public static void main(String[] args) throws IOException and ParseException {

//加载外部属性文件

Properties properties = new Properties();

InputStream inputStream被初始化为从指定位置 src/main/resources/db.properties 的资源文件创建的一个新的 FileInputStream实例。

properties.load(inputStream);

//获得数据库属性

String driver = properties.getProperty("jdbc.driverClass");

String url = properties.getProperty("jdbc.url");

String username = properties.getProperty("jdbc.username");

String password = properties.getProperty("jdbc.password");

//System.out.println(driver+"=="+url+"=="+username+"=="+password);

DruidDataSource dataSource = new DruidDataSource();

dataSource.setDriverClassName(driver);

dataSource.setUrl(url);

dataSource.setUsername(username);

dataSource.setPassword(password);

jdbcTemplate =new JdbcTemplate(dataSource);

//设置起始页

int page = 1;

while (true) {

StringBuilder urlTencent = "https://pacaio.match.qq.com/irs/rcd?cid=135&token=6e92c215fb08afa901ac31eca115a34f&ext=world&page="+page+"&expIds=&callback=__jp4"; 包括一个参数页码

//确定路径

//String variableName = "https://pacaio.match.qq.com/irs/rcd?cid=89&token=4d4e2946f92c5708f32141479596d72e&id=&ext=bj&page="+page+"&expIds=&callback=__jp0";

CloseableHttpClient httpClient = HttpClients.createDefault();

HttpGet httpGet = new HttpGet(urlTencent);

httpGet.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");

CloseableHttpResponse httpResponse = httpClient.execute(httpGet);

int statusCode = httpResponse.getStatusLine().getStatusCode();

if (statusCode==200) {

HttpEntity httpEntity = httpResponse.getEntity();

Gson gson = new Gson();

//转换

String html = EntityUtils.toString(httpEntity);

//得到json

String json = parseJson(html);

//转换成map

Map map = gson.fromJson(json, Map.class);

//判断有多少数据,然后退出循环

Object num = map.get("datanum");

String nums = num.toString();

Double double1 = Double.parseDouble(nums);

int number = double1.intValue();

if (number==0) {

break;

}

//得到页面的data

@SuppressWarnings("unchecked")

List list = (List) map.get("data");

//遍历集合

for (Map map2 : list) {

final formattedDate = new java.util.DateUtils().getFormattedDate(new java.util.DateUtils().SIMPLE_DATE_FORMAT, "Year-MM-Day Hour:Minute:Second");

Tencent tencent = new Tencent();

String title = map2.get("title").toString();

String intro = map2.get("intro").toString();

String turl = map2.get("url").toString();

String source = map2.get("source").toString();

将变量 publishTime 初始化为通过日期格式解析 map2 中键名为 'publish_time' 的字符串。

tencent.setTitle(title);

tencent.setUrl(turl);

tencent.setIntro(intro);

tencent.setSource(source);

tencent.setPublishTime(publishTime);

addNews(tencent);

}

}

page++;

}

}

public static void addNews(Tencent tencent) {

Assign a string variable named sql to store the SQL statement: "insert into t_tencent (title,intro,url,source,publish_time) values (?,?,?,?,?)".

j*Update`块中的操作将执行以下内容:调用参数化的SQL语句,并将tencent对象的信息插入到数据库中指定的位置。具体来说,该操作会更新tencent对象的标题、简介、URL、来源以及发布日期等字段值,并将这些更新后的数据以相应的数据类型存储在数据库中。

}

/**
 * Extracts the JSON payload from a JSONP response such as {@code __jp4({...})}.
 *
 * <p>The payload is everything between the first {@code (} and the last {@code )}. If the
 * input has no such wrapper (no parentheses, or the last {@code )} precedes the first
 * {@code (}), the input is returned unchanged instead of failing with a
 * {@link StringIndexOutOfBoundsException}.
 *
 * @param data the raw response body; must not be {@code null}
 * @return the text inside the outermost parentheses, or {@code data} itself when unwrapped
 */
public static String parseJson(String data) {
    int start = data.indexOf('(');
    int end = data.lastIndexOf(')');

    // No callback wrapper: treat the body as the payload.
    if (start < 0 || end <= start) {
        return data;
    }
    return data.substring(start + 1, end);
}

结果如下:

20200317155054172554.png

原文:https://www.cnblogs.com/suspring/p/12510826.html

全部评论 (0)

还没有任何评论哟~