Python crawler demo: scraping website news content and inserting it into a MySQL database
This morning I optimized the code again and added a partial cleaning pass over the scraped content.
For example: once the image tags and hyperlink markup are stripped from the news body, the target site's backlinks and images no longer show up in the stored content.
Note: there is still room to upgrade the code. When I get the chance I will optimize it so that article images are fetched and served from our own local server, which will make the result display much more cleanly. Once that is done I plan to share the updated source code.
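The cleaning itself is nothing more than calling BeautifulSoup's extract() on every img and a tag before the paragraphs are joined back together. Here is a minimal sketch of that step; the sample HTML fragment is made up for illustration, while the real demo below runs the same loop over driver.page_source:

from bs4 import BeautifulSoup

# Made-up HTML fragment standing in for a scraped article body
sample = ('<div class="article-content">'
          '<p>Intro <a href="http://example.com">external link</a></p>'
          '<p><img src="http://example.com/pic.jpg"/>Body text</p></div>')
soup = BeautifulSoup(sample, features='lxml')
# extract() removes each matched tag from the parse tree in place,
# so neither the backlink nor the remote image survives
for tag in soup(['img', 'a']):
    tag.extract()
print(''.join(str(p) for p in soup.find_all('p')))
# -> <p>Intro </p><p>Body text</p>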
import datetime
import pymysql
from selenium import webdriver
from lxml import etree
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

def get_page_source_html(driver, urlinfo):
    # Load the URL and return the page source parsed as an lxml etree
    driver.get(urlinfo)
    page_text = driver.page_source
    tree = etree.HTML(page_text)
    return tree

def get_page_source_etree(driver):
    # Parse the currently loaded page source into an lxml etree
    page_text = driver.page_source
    tree = etree.HTML(page_text)
    return tree

def get_list_a(etree, xpathinfo):
    # Return all nodes matched by the given XPath expression
    return etree.xpath(xpathinfo)

def get_news_title(etree, xpathino):
    return etree.xpath(xpathino)

def get_news_content(etree, xpathino):
    return etree.xpath(xpathino)

def get_news_publish(etree, xpathino):
    return etree.xpath(xpathino)

if __name__ == "__main__":
    # Create the browser object
    driver = webdriver.Chrome()
    url2 = "http://www.hnanseo.com/seojichu/page/2"
    targeturl = "http://www.hnanseo.com/seojichu"
    driver.get(url2)
    # //*[@id="menu-item-10"]/a
    # Click the "SEO basics" menu link
    # driver.find_element(By.XPATH, value='//*[@id="menu-item-5"]/a').click()
    # Collection of article URLs taken from the <a> tags
    list_a = []
    # xpath() returns a list of lxml element objects; to read an attribute you
    # still have to run another XPath expression against each element itself.
    a_list = get_list_a(get_page_source_etree(driver), '/html/body/section/div[2]/div/article/header/h3/a')
    sleep(1)
    for a in a_list:
        href = a.xpath('./@href')[0]
        list_a.append(href)
    print("Number of <a> tags collected on the current page: {0}".format(len(list_a)))
    sleep(1)
    # Iterate over list_a and scrape each article
    try:
        db = pymysql.Connect(
            user='root',
            password='',
            host='127.0.0.1', port=3306,
            db='mytest'
        )
        cursor = db.cursor()
        # Disclaimer paragraph appended to every stored article (kept in Chinese, it is site content)
        xuanyan = "<p>这是采集文字,我们尊重原作者的所有版权.站内发布该内容仅供大家学习,请勿商用.如果有疑问请联系手机:[15736771259]</p>"
        for newsurl in list_a:
            try:
                driver.get(newsurl)
                # title = driver.find_element(By.CLASS_NAME, value='article-title').text
                # content = driver.find_element(By.CLASS_NAME, value='article-content').text
                newscontent = driver.find_element(By.CLASS_NAME, value='article-content')
                # Wrap the HTML source with BeautifulSoup, then start extracting content
                soup = BeautifulSoup(driver.page_source, features='lxml')
                for s in soup('img'):
                    s.extract()
                for s in soup('a'):
                    s.extract()
                # The img and a tags have already been removed at this point
                tup1 = soup.article.findAll('p')
                paragraphs = []
                for x in tup1:
                    paragraphs.append(str(x))
                # Drop the last paragraph, then append the disclaimer
                content = ''.join(paragraphs[0:-1]) + xuanyan
                print(content)
                title = soup.h1.text
                keywords = title
                des = str(newscontent.text)[0:120]
                sql = 'insert into news (title,keywords,des,publish,author,content) values (%s,%s,%s,%s,%s,%s)'
                values = (title, keywords, des, str(datetime.datetime.now()), 'admin', content)
                cursor.execute(sql, values)
                db.commit()
                print("Title: {0} --- inserted into the database successfully".format(title))
            except Exception as ee:
                print('An exception occurred:', ee)
                continue
    except Exception as e:
        print('An exception occurred:', e)
        db.rollback()
    finally:
        cursor.close()
        db.close()
    sleep(2)
    driver.quit()
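One thing the demo assumes but never shows is the news table in the mytest database. Below is a hypothetical schema sketch that matches the column names used in the INSERT statement above; the types and lengths are my own guesses, so adjust them to your actual needs:

import pymysql

# Hypothetical schema for the `news` table used by the INSERT above;
# column names come from the demo, types and lengths are assumptions.
ddl = """
CREATE TABLE IF NOT EXISTS news (
    id       INT AUTO_INCREMENT PRIMARY KEY,
    title    VARCHAR(255) NOT NULL,
    keywords VARCHAR(255),
    des      VARCHAR(255),
    publish  VARCHAR(64),
    author   VARCHAR(64),
    content  LONGTEXT
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.Connect(user='root', password='', host='127.0.0.1', port=3306, db='mytest')
try:
    with db.cursor() as cursor:
        cursor.execute(ddl)
    db.commit()
finally:
    db.close()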