# 實(shí)踐項(xiàng)目————詩(shī)詞名句網(wǎng)《三國(guó)演義》小說(shuō)爬取
import os
import re
import time
import urllib.request
import urllib.parse

from bs4 import BeautifulSoup
def header():
    """Build a Request for the novel's index page with a browser User-Agent.

    Returns:
        urllib.request.Request for the Romance of the Three Kingdoms index.
    """
    # Index page of the novel on shicimingju.
    article_url = "http://www.shicimingju.com/book/sanguoyanyi.html"
    # Spoof a desktop Chrome UA so the site serves the normal page.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
    return urllib.request.Request(article_url, headers=headers)
# Send the request.
def main(request):
    """Open *request* through an explicitly built opener and return the response."""
    opener = urllib.request.build_opener(urllib.request.HTTPHandler())
    return opener.open(request)
# Download the content.
def download():
    """Crawl every chapter of the novel and append them all to one file.

    Fetches the index page, collects each chapter's title and link from the
    ``.book-mulu`` list, then requests every chapter page, extracts the text
    of ``.chapter_content`` and appends "title + body" to
    三國(guó)演義/三國(guó)演義.doc, sleeping 2 s between requests.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
    request = header()
    response = main(request).read()
    # Parse the index page with bs4.
    article_main_html = BeautifulSoup(response, "lxml")
    if not os.path.exists("三國(guó)演義"):
        os.mkdir("三國(guó)演義")
    # Collect chapter titles and their relative links, in document order.
    article_section = article_main_html.select(".book-mulu ul li a")
    section_title_ls = [section.text for section in article_section]
    section_url_ls = [section["href"] for section in article_section]

    # All chapters accumulate in one file (opened in append mode below).
    filename_path = os.path.join("三國(guó)演義", "三國(guó)演義" + ".doc")
    # BUGFIX: iterate over however many chapters the site actually lists
    # instead of the hard-coded range(0, 120).
    for num in range(len(section_title_ls)):
        section_title = section_title_ls[num]
        # Build the absolute chapter URL from the relative href.
        section_allurl = "http://www.shicimingju.com" + section_url_ls[num]
        section_request = urllib.request.Request(section_allurl, headers=headers)
        opener = urllib.request.build_opener(urllib.request.HTTPHandler())
        section_response = opener.open(section_request).read().decode("utf8")
        # Parse the chapter page and pull out its text body.
        article_soup = BeautifulSoup(section_response, "lxml")
        article_content = article_soup.select(".chapter_content")
        # Prefix the chapter body with its title.
        content = section_title + article_content[0].text
        # BUGFIX: report chapters 1-based (the old message printed num, 0-based).
        print("正在下載第%d章" % (num + 1))
        # "ab+" appends, so chapters stack up in a single document; the
        # redundant tf.close() inside the with-block was removed.
        with open(filename_path, "ab+") as tf:
            tf.write(content.encode("utf8"))
        # Throttle to avoid hammering the server.
        time.sleep(2)
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    download()
# 百度音樂(lè)爬取案例
import json
import os
import re
import time
import urllib.request
import urllib.parse

from bs4 import BeautifulSoup
# Object-oriented crawler for one result page of the Baidu Music search.
class BaiDuMusic(object):
    """Crawl one search-result page for a singer/keyword and download the songs.

    Each instance covers a single (singer, page) pair; downloadSong() fetches
    every listed song's mp3 and lrc into ./music.
    """

    def __init__(self, singer, page):
        # Search keyword and 1-based result-page number.
        self.singer = singer
        self.page = page
        # Desktop Chrome UA so the site serves the normal markup.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}

    def header(self):
        """Build the search-page Request for this singer and page."""
        # BUGFIX: the host in the URL was garbled ("http://music./search/song?").
        # NOTE(review): restored to music.taihe.com, Baidu Music's domain of
        # that era — confirm against the original tutorial source.
        url = "http://music.taihe.com/search/song?"
        data = {
            "s": "1",
            "key": self.singer,
            "jump": "0",
            "start": (self.page - 1) * 20,  # 20 results per page
            "size": "20",
            "third_type": "0",
        }
        # URL-encode the query parameters and append them to the base URL.
        singer_url = url + urllib.parse.urlencode(data)
        return urllib.request.Request(url=singer_url, headers=self.headers)

    def requset(self):
        """Send the search request and return the HTTP response.

        NOTE(review): the name is a typo of "request"; kept unchanged so any
        external callers keep working.
        """
        handler = urllib.request.HTTPHandler()
        opener = urllib.request.build_opener(handler)
        return opener.open(self.header())

    def paserSong(self):
        """Parse the search page; return a list of (song_name, song_id) tuples."""
        singer_soup = BeautifulSoup(self.requset(), "lxml")
        # Song entries are the <li> elements carrying a numeric data-albumid.
        pattern = re.compile(r'[\d]+')
        songs_info = singer_soup.find_all(name="li", attrs={"data-albumid": pattern})
        # Each <li> stores its song metadata as JSON in the "data-songitem" attribute.
        song_ls = [json.loads(li["data-songitem"]) for li in songs_info]
        song_info = [(item["songItem"]["sname"], item["songItem"]["sid"])
                     for item in song_ls]
        # e.g. [('只要平凡', 598740690), ('My Sunshine', 127018924), ...]
        return song_info

    def downloadSong(self):
        """Download every song (mp3 + lrc) of this result page into ./music."""
        if not os.path.exists('music'):
            os.mkdir('music')
        for song_info_name, song_info_id in self.paserSong():
            print("正在下載第%s頁(yè)的:%s的《%s》" % (self.page, self.singer, song_info_name))
            # JSONP API that returns the song's metadata, incl. download links.
            song_url = 'http://tingapi.ting.baidu.com/v1/restserver/ting?method=baidu.ting.song.play&format=jsonp&callback=jQuery17202741599001012014_1513517333931&songid=%s&_=1513517334915' % song_info_id
            request_song_detail = urllib.request.urlopen(song_url)
            # Strip the jQuery JSONP wrapper to recover the raw JSON payload.
            pattern_song = re.compile(r'\((.*)\)', re.S)
            json_song_info = pattern_song.findall(request_song_detail.read().decode("utf8"))
            # BUGFIX: parse the JSON once instead of twice.
            song_detail = json.loads(json_song_info[0])
            lrclink = song_detail["songinfo"]["lrclink"]
            file_link = song_detail["bitrate"]["file_link"]
            # File names: <song>_<singer>.mp3 / .lrc under ./music.
            song_path = os.path.join("music", song_info_name + "_%s.mp3" % self.singer)
            lrc_path = os.path.join("music", song_info_name + "_%s.lrc" % self.singer)
            try:
                # Download lyrics and audio.
                urllib.request.urlretrieve(lrclink, lrc_path)
                urllib.request.urlretrieve(file_link, song_path)
                # Throttle between downloads.
                time.sleep(1)
                print("《%s》下載完成" % song_info_name)
            except Exception:
                # Best effort: some songs are copyright/region locked.
                print("因版權(quán)受限無(wú)法下載")
# Prompt for the crawl parameters and run the crawler page by page.
def main():
    """Read a keyword and a page range from stdin, then crawl each page in turn."""
    singer = input("請(qǐng)輸入爬取的歌手或是歌名:")
    start_page = int(input("請(qǐng)輸入爬取的開(kāi)始頁(yè):"))
    end_page = int(input("請(qǐng)輸入爬取的終止頁(yè):"))
    for page in range(start_page, end_page + 1):
        baidumusic = BaiDuMusic(singer, page)
        baidumusic.downloadSong()
    # BUGFIX: the completion message was guarded by "if page > end_page + 1",
    # which can never be true inside the loop; print it once after the loop.
    print("%s歌手的所有歌曲都已下載完畢" % singer)
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    main()