Python crawler demo: scraping website news content and inserting it into a MySQL database

This morning I optimized the code again and finished part of the content cleaning.

For example:

After stripping the image tags and hyperlink tags from the news body, the scraped articles no longer contain the source site's backlinks or embedded images.
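For reference, here is a minimal standalone sketch of that cleaning step with BeautifulSoup; the sample HTML and the helper function name are made up for illustration:

from bs4 import BeautifulSoup

def strip_links_and_images(html):
    # Remove every <img> and <a> tag so the source site's images and backlinks disappear
    soup = BeautifulSoup(html, features='lxml')
    for tag in soup(['img', 'a']):
        tag.extract()
    # Keep only the <p> paragraphs of the article body
    return ''.join(str(p) for p in soup.find_all('p'))

sample = '<article><p>First paragraph <a href="http://example.com">link</a></p><p><img src="x.jpg"/>Second</p></article>'
print(strip_links_and_images(sample))
# -> <p>First paragraph </p><p>Second</p>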

Note: there is room to upgrade this code later. When I get the chance I will optimize it so that the article content points to image files served from the local server, which will make the pages display more intuitively. Once that is done I plan to share the updated source code.

import datetime
from time import sleep

import pymysql
from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By


def get_page_source_html(driver, urlinfo):
    # Navigate to the given URL and return the page parsed as an lxml element tree
    driver.get(urlinfo)
    page_text = driver.page_source
    tree = etree.HTML(page_text)
    return tree


def get_page_source_etree(driver):
    # Parse whatever page the driver is currently on
    page_text = driver.page_source
    tree = etree.HTML(page_text)
    return tree


def get_list_a(tree, xpathinfo):
    # Return the <a> nodes matched by the XPath expression
    return tree.xpath(xpathinfo)


def get_news_title(tree, xpathinfo):
    return tree.xpath(xpathinfo)


def get_news_content(tree, xpathinfo):
    return tree.xpath(xpathinfo)


def get_news_publish(tree, xpathinfo):
    return tree.xpath(xpathinfo)


if __name__ == "__main__":
    # Create the browser object
    driver = webdriver.Chrome()
    url2 = "http://www.hnanseo.com/seojichu/page/2"
    targeturl = "http://www.hnanseo.com/seojichu"
    driver.get(url2)
    # //*[@id="menu-item-10"]/a
    # Click the "SEO basics" menu link (not needed when opening the list page directly)
    # driver.find_element(By.XPATH, value='//*[@id="menu-item-5"]/a').click()

    # Collected article URLs
    list_a = []
    # The XPath query returns a list of element objects; to pull a concrete attribute
    # out of each one you still have to run another XPath expression against the element itself.
    a_list = get_list_a(get_page_source_etree(driver), '/html/body/section/div[2]/div/article/header/h3/a')
    sleep(1)
    for a in a_list:
        href = a.xpath('./@href')[0]
        list_a.append(href)
    print("Collected {0} article links on the current page".format(len(list_a)))
    sleep(1)

    # Visit every URL in list_a and store the cleaned article in MySQL
    db = None
    cursor = None
    try:
        db = pymysql.Connect(
            user='root',
            password='',
            host='127.0.0.1', port=3306,
            db='mytest'
        )
        cursor = db.cursor()
        # Disclaimer paragraph appended to every stored article (kept in the original Chinese)
        xuanyan = "<p>这是采集文字,我们尊重原作者的所有版权.站内发布该内容仅供大家学习,请勿商用.如果有疑问请联系手机:[15736771259]</p>"
        for newsurl in list_a:
            try:
                driver.get(newsurl)
                # title = driver.find_element(By.CLASS_NAME, value='article-title').text
                # content = driver.find_element(By.CLASS_NAME, value='article-content').text
                newscontent = driver.find_element(By.CLASS_NAME, value='article-content')
                # Wrap the page source with BeautifulSoup, then extract the content
                soup = BeautifulSoup(driver.page_source, features='lxml')
                # Strip <img> and <a> tags so no images or backlinks to the source site survive
                for s in soup('img'):
                    s.extract()
                for s in soup('a'):
                    s.extract()
                tup1 = soup.article.find_all('p')
                paragraphs = []
                for x in tup1:
                    paragraphs.append(str(x))
                # Drop the last paragraph and append the disclaimer
                content = ''.join(paragraphs[0:-1]) + xuanyan
                print(content)
                title = soup.h1.text
                keywords = title
                des = str(newscontent.text)[0:120]
                sql = 'insert into news (title,keywords,des,publish,author,content) values (%s,%s,%s,%s,%s,%s)'
                values = (title, keywords, des, str(datetime.datetime.now()), 'admin', content)
                cursor.execute(sql, values)
                db.commit()
                print("Title: {0} --- inserted into the database successfully".format(title))
            except Exception as ee:
                print("An exception occurred:", ee)
                continue
    except Exception as e:
        print("An exception occurred:", e)
        if db:
            db.rollback()
    finally:
        if cursor:
            cursor.close()
        if db:
            db.close()

    sleep(2)
    driver.quit()
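The INSERT above assumes a news table already exists in the mytest database. The post does not show the actual schema, so the following is only a guess: the column names come from the INSERT statement, while the types and lengths are assumptions you may need to adjust.

import pymysql

# Assumed schema for the target table; column names match the INSERT above,
# types and lengths are guesses. Requires a running MySQL server with a mytest database.
CREATE_NEWS_TABLE = """
CREATE TABLE IF NOT EXISTS news (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    keywords VARCHAR(255),
    des VARCHAR(255),
    publish DATETIME(6),
    author VARCHAR(64),
    content MEDIUMTEXT
) DEFAULT CHARSET=utf8mb4
"""

db = pymysql.Connect(user='root', password='', host='127.0.0.1', port=3306, db='mytest')
try:
    with db.cursor() as cursor:
        cursor.execute(CREATE_NEWS_TABLE)
    db.commit()
finally:
    db.close()

Running this once before the crawler is enough; CREATE TABLE IF NOT EXISTS makes it safe to re-run.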

