Advertisement

网易云音乐搜索引擎 python+whoosh---(2)搜索引擎

阅读量:

参考资料:《whoosh+jieba:python下实现中文全文检索》、whoosh官方文档

1,根据数据库新建schema

只保留音乐库数据中我需要的部分,即下面图片中的黄色部分,把所有表格变成统一格式的记录,格式为:artist_id,artist_name,music_id,music_name,album_id,album_name,lyrics,comment_num,hot_num

基本思想:从musics表格中取出music_id,music_name,album_id,lyrics,comment_num,再从albums表格中根据album_id找到album_name和artist_id,最后从artists表格中根据artist_id找到artist_name。这样前面的8个字段就得到了,最后一个字段hot_num代表的是热门度,此热门度的初始值是comment_num,后续根据用户点击情况调整。

新建schema的代码:create_schema.py

复制代码
 # -*- coding:utf-8 -*-

    
 import sqlite3
    
 import re
    
 from whoosh.fields import Schema, STORED, ID, KEYWORD, TEXT, NUMERIC,NGRAMWORDS
    
 from whoosh.index import create_in
    
 from jieba.analyse import ChineseAnalyzer
    
 # Tokenizer shared by every Chinese full-text field of the schema.
 analyzer = ChineseAnalyzer()

 # Raw strings keep the Windows backslashes literal instead of relying on
 # "\c", "\p", "\w" ... happening not to be escape sequences.
 DB_PATH = r"D:\cloudmusic.db"  # here is my database
 INDEX_DIR = r"D:\课程学习\互联网信息搜索与挖掘\project\whoosh\indexdir"

 # Numeric markers of the form "(12,34)" and "[12,34]" that must be removed
 # from the lyrics. Compiled once, applied in this order.
 # NOTE: the brackets have to be escaped. An unescaped "[\d+,\d+]" is a
 # character class and would strip every digit, comma and plus sign.
 LYRICS_MARKERS = (
     re.compile(r"\(\d+,\d+\)"),
     re.compile(r"\[\d+,\d+\]"),
 )


 def _clean_lyrics(raw):
     """Return the lyrics with the "(n,m)" and "[n,m]" markers removed.

     A NULL lyrics column (None) is treated as an empty string.
     """
     text = raw or ''
     for marker in LYRICS_MARKERS:
         text = marker.sub('', text)
     return text


 def _to_unicode(value):
     """Return value as text suitable for a Whoosh TEXT field.

     UTF-8 byte strings are decoded, None becomes an empty string and
     values that are already text are returned unchanged.
     """
     if value is None:
         return u''
     if isinstance(value, bytes):
         return value.decode('utf-8')
     return value


 # One document per song: artist_id, artist_name, music_id, music_name,
 # album_id, album_name, lyrics, comment_num, hot_num.
 schema = Schema(artist_id=NUMERIC(stored=True),
                 artist_name=TEXT(stored=True, analyzer=analyzer),
                 music_id=NUMERIC(stored=True, unique=True),
                 music_name=TEXT(stored=True, analyzer=analyzer),
                 album_id=NUMERIC(stored=True),
                 album_name=TEXT(stored=True, analyzer=analyzer),
                 lyrics=TEXT(stored=True, analyzer=analyzer),
                 comment_num=NUMERIC(stored=True, sortable=True),
                 hot_num=NUMERIC(stored=True, sortable=True))

 conn = sqlite3.connect(DB_PATH)
 try:
     # TEXT columns come back as plain str and are decoded explicitly below.
     conn.text_factory = str
     cu = conn.cursor()
     cu.execute("SELECT MUSIC_ID,MUSIC_NAME,ALBUM_ID,LYRICS,COMMENT_NUMBER FROM musics")
     # Fetch everything up front: the same cursor is reused for the per-song
     # album/artist lookups inside the loop.
     musics = cu.fetchall()

     ix = create_in(INDEX_DIR, schema)  # here to create schema
     # The writer context manager commits on success and cancels on error.
     with ix.writer() as writer:
         for index, music in enumerate(musics, 1):
             music_id, music_name, album_id, lyrics, comment_number = music

             # Missing or non-positive comment counts are stored as 0.
             if not comment_number or comment_number < 0:
                 comment_number = 0

             # Fallback values used when the album or artist row is missing.
             artist_id = 0
             artist_name = ''
             album_name = ''

             # Bound parameters instead of string formatting. The ID is passed
             # as text, exactly as it would appear inside a quoted SQL literal.
             cu.execute("SELECT ALBUM_ID,ALBUM_NAME,ARTIST_ID FROM albums where ALBUM_ID=?",
                        (str(album_id),))
             album = cu.fetchone()
             if album is not None:
                 album_name = album[1]
                 artist_id = album[2]
                 cu.execute("SELECT ID,NAME FROM artists where ID=?",
                            (str(artist_id),))
                 artist = cu.fetchone()
                 if artist is not None:
                     artist_name = artist[1]

             writer.add_document(artist_id=artist_id,
                                 artist_name=_to_unicode(artist_name),
                                 music_id=music_id,
                                 music_name=_to_unicode(music_name),
                                 album_id=album_id,
                                 album_name=_to_unicode(album_name),
                                 lyrics=_to_unicode(_clean_lyrics(lyrics)),
                                 comment_num=comment_number,
                                 # hot_num starts out equal to the comment count.
                                 hot_num=comment_number)

             print(index)  # progress: number of songs processed so far
 finally:
     conn.close()

这是一个小的搜索例子,搜索歌手名“阿宝”的歌曲数目:search.py

复制代码
 # -*- coding:utf-8 -*-
 # The encoding declaration is required on Python 2 because this script
 # contains non-ASCII literals (the index path and the search term).
 import whoosh.index as index

 from whoosh import columns, fields, index, sorting
 from whoosh.qparser import QueryParser

 # Raw string keeps the Windows backslashes literal.
 INDEX_DIR = r"D:\课程学习\互联网信息搜索与挖掘\project\whoosh\indexdir"

 ix = index.open_dir(INDEX_DIR)

 # Order hits by comment count, highest first.
 facet = sorting.FieldFacet("comment_num", reverse=True)

 # Parse the artist name against the artist_name field of the index schema.
 searchwords = u"阿宝"
 qp = QueryParser("artist_name", schema=ix.schema)
 q = qp.parse(searchwords)

 # The context manager closes the searcher once the results have been used.
 with ix.searcher() as searcher:
     results = searcher.search(q, sortedby=facet)
     # Number of songs whose artist name matches the query.
     print(len(results))

下一篇是界面相关的设计和代码

喜欢本文请打赏,一毛两毛也是个意思,么么哒
支F宝账号:2363891614@qq.com

全部评论 (0)

还没有任何评论哟~