# 實(shí)踐項(xiàng)目————詩(shī)詞名句網(wǎng)《三國(guó)演義》小說(shuō)爬取
import os
import re
import time
import urllib.request
import urllib.parse

from bs4 import BeautifulSoup
def header():
    """Build a Request for the novel's index page with a browser User-Agent.

    Returns:
        urllib.request.Request for the Romance of the Three Kingdoms index.
    """
    # Index page of the novel on shicimingju.
    article_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
    # Spoof a desktop Chrome UA so the site serves the normal page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
    return urllib.request.Request(article_url, headers=headers)
# Send the request.
def main(request):
    """Open *request* through an explicitly built opener and return the response."""
    opener = urllib.request.build_opener(urllib.request.HTTPHandler())
    return opener.open(request)
# Download the content.
def download():
    """Crawl every chapter of the novel and append them all to one file.

    Fetches the index page, collects each chapter's title and link from the
    ``.book-mulu`` list, then requests every chapter page, extracts the text
    of ``.chapter_content`` and appends "title + body" to
    三國(guó)演義/三國(guó)演義.doc, sleeping 2 s between requests.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
    request = header()
    response = main(request).read()
    # Parse the index page with bs4.
    article_main_html = BeautifulSoup(response, "lxml")
    if not os.path.exists("三國(guó)演義"):
        os.mkdir("三國(guó)演義")
    # Collect chapter titles and their relative links, in document order.
    article_section = article_main_html.select(".book-mulu ul li a")
    section_title_ls = [section.text for section in article_section]
    section_url_ls = [section["href"] for section in article_section]

    # All chapters accumulate in one file (opened in append mode below).
    filename_path = os.path.join("三國(guó)演義", "三國(guó)演義" + ".doc")
    # BUGFIX: iterate over however many chapters the site actually lists
    # instead of the hard-coded range(0, 120).
    for num in range(len(section_title_ls)):
        section_title = section_title_ls[num]
        # Build the absolute chapter URL from the relative href.
        section_allurl = "http://www.shicimingju.com" + section_url_ls[num]
        section_request = urllib.request.Request(section_allurl, headers=headers)
        opener = urllib.request.build_opener(urllib.request.HTTPHandler())
        section_response = opener.open(section_request).read().decode("utf8")
        # Parse the chapter page and pull out its text body.
        article_soup = BeautifulSoup(section_response, "lxml")
        article_content = article_soup.select(".chapter_content")
        # Prefix the chapter body with its title.
        content = section_title + article_content[0].text
        # BUGFIX: report chapters 1-based (the old message printed num, 0-based).
        print("正在下載第%d章" % (num + 1))
        # "ab+" appends, so chapters stack up in a single document; the
        # redundant tf.close() inside the with-block was removed.
        with open(filename_path, "ab+") as tf:
            tf.write(content.encode("utf8"))
        # Throttle to avoid hammering the server.
        time.sleep(2)
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    download()
# 百度音樂(lè)爬取案例
import json
import os
import re
import time
import urllib.request
import urllib.parse

from bs4 import BeautifulSoup
# Object-oriented crawler for one result page of the Baidu Music search.
class BaiDuMusic(object):
    """Crawl one search-result page for a singer/keyword and download the songs.

    Each instance covers a single (singer, page) pair; downloadSong() fetches
    every listed song's mp3 and lrc into ./music.
    """

    def __init__(self, singer, page):
        # Search keyword and 1-based result-page number.
        self.singer = singer
        self.page = page
        # Desktop Chrome UA so the site serves the normal markup.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}

    def header(self):
        """Build the search-page Request for this singer and page."""
        # BUGFIX: the host in the URL was garbled ("http://music./search/song?").
        # NOTE(review): restored to music.taihe.com, Baidu Music's domain of
        # that era — confirm against the original tutorial source.
        url = "http://music.taihe.com/search/song?"
        data = {
            "s": "1",
            "key": self.singer,
            "jump": "0",
            "start": (self.page - 1) * 20,  # 20 results per page
            "size": "20",
            "third_type": "0",
        }
        # URL-encode the query parameters and append them to the base URL.
        singer_url = url + urllib.parse.urlencode(data)
        return urllib.request.Request(url=singer_url, headers=self.headers)

    def requset(self):
        """Send the search request and return the HTTP response.

        NOTE(review): the name is a typo of "request"; kept unchanged so any
        external callers keep working.
        """
        handler = urllib.request.HTTPHandler()
        opener = urllib.request.build_opener(handler)
        return opener.open(self.header())

    def paserSong(self):
        """Parse the search page; return a list of (song_name, song_id) tuples."""
        singer_soup = BeautifulSoup(self.requset(), "lxml")
        # Song entries are the <li> elements carrying a numeric data-albumid.
        pattern = re.compile(r'[\d]+')
        songs_info = singer_soup.find_all(name="li", attrs={"data-albumid": pattern})
        # Each <li> stores its song metadata as JSON in the "data-songitem" attribute.
        song_ls = [json.loads(li["data-songitem"]) for li in songs_info]
        song_info = [(item["songItem"]["sname"], item["songItem"]["sid"])
                     for item in song_ls]
        # e.g. [('只要平凡', 598740690), ('My Sunshine', 127018924), ...]
        return song_info

    def downloadSong(self):
        """Download every song (mp3 + lrc) of this result page into ./music."""
        if not os.path.exists('music'):
            os.mkdir('music')
        for song_info_name, song_info_id in self.paserSong():
            print("正在下載第%s頁(yè)的:%s的《%s》" % (self.page, self.singer, song_info_name))
            # JSONP API that returns the song's metadata, incl. download links.
            song_url = 'http://tingapi.ting.baidu.com/v1/restserver/ting?method=baidu.ting.song.play&format=jsonp&callback=jQuery17202741599001012014_1513517333931&songid=%s&_=1513517334915' % song_info_id
            request_song_detail = urllib.request.urlopen(song_url)
            # Strip the jQuery JSONP wrapper to recover the raw JSON payload.
            pattern_song = re.compile(r'\((.*)\)', re.S)
            json_song_info = pattern_song.findall(request_song_detail.read().decode("utf8"))
            # BUGFIX: parse the JSON once instead of twice.
            song_detail = json.loads(json_song_info[0])
            lrclink = song_detail["songinfo"]["lrclink"]
            file_link = song_detail["bitrate"]["file_link"]
            # File names: <song>_<singer>.mp3 / .lrc under ./music.
            song_path = os.path.join("music", song_info_name + "_%s.mp3" % self.singer)
            lrc_path = os.path.join("music", song_info_name + "_%s.lrc" % self.singer)
            try:
                # Download lyrics and audio.
                urllib.request.urlretrieve(lrclink, lrc_path)
                urllib.request.urlretrieve(file_link, song_path)
                # Throttle between downloads.
                time.sleep(1)
                print("《%s》下載完成" % song_info_name)
            except Exception:
                # Best effort: some songs are copyright/region locked.
                print("因版權(quán)受限無(wú)法下載")
# Prompt for the crawl parameters and run the crawler page by page.
def main():
    """Read a keyword and a page range from stdin, then crawl each page in turn."""
    singer = input("請(qǐng)輸入爬取的歌手或是歌名:")
    start_page = int(input("請(qǐng)輸入爬取的開(kāi)始頁(yè):"))
    end_page = int(input("請(qǐng)輸入爬取的終止頁(yè):"))
    for page in range(start_page, end_page + 1):
        baidumusic = BaiDuMusic(singer, page)
        baidumusic.downloadSong()
    # BUGFIX: the completion message was guarded by "if page > end_page + 1",
    # which can never be true inside the loop; print it once after the loop.
    print("%s歌手的所有歌曲都已下載完畢" % singer)
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    main()