這些都是筆記,還缺少詳細整理,後續會更新。下面這種方式屬於入門階段,手動成分比較多。
首先安裝必要組件:
pip3 install requests
pip3 install beautifulsoup4
# Part 1: scrape the Autohome news listing.
#!/usr/bin/env python
# coding: utf-8
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

AUTOHOME_NEWS_URL = "https://www.autohome.com.cn/news/"
AUTOHOME_TIMEOUT = 10  # seconds; requests waits indefinitely unless told otherwise


def fetch_autohome_html(url=AUTOHOME_NEWS_URL):
    """Download *url* and return the decoded page text.

    Raises:
        requests.RequestException: on connection failure, timeout or an
            HTTP error status.
    """
    response = requests.get(url=url, timeout=AUTOHOME_TIMEOUT)
    response.raise_for_status()
    # Decode with the encoding detected from the body instead of a
    # hard-coded one (the original notes also experimented with "gbk").
    response.encoding = response.apparent_encoding
    return response.text


def autohome_image_filename(src):
    """Return the local file name to use for the image URL *src*.

    The text after the last "__" is used, exactly as the original script
    did. When the URL contains no "__", the last path segment is used
    instead of failing with IndexError.
    """
    _, separator, tail = src.rpartition("__")
    if separator:
        return tail
    return src.rsplit("/", maxsplit=1)[-1]


def download_autohome_image(src, page_url=AUTOHOME_NEWS_URL):
    """Save the image at *src* into the current working directory.

    Returns:
        The file name written, or None when no file name can be derived.

    Raises:
        requests.RequestException: if the download fails.
        OSError: if the file cannot be written.
    """
    filename = autohome_image_filename(src)
    if not filename:
        return None
    # urljoin resolves the protocol-relative "//host/path" form that the
    # original handled with 'https:' + src, and stays correct when src is
    # already absolute or is relative to the page.
    response = requests.get(url=urljoin(page_url, src), timeout=AUTOHOME_TIMEOUT)
    response.raise_for_status()
    with open(filename, "wb") as image_file:
        image_file.write(response.content)
    return filename


def crawl_autohome_news():
    """Print title, link and summary of every news item and save its image."""
    # 1. Download the page.
    try:
        html = fetch_autohome_html()
    except requests.RequestException as exc:
        print("Failed to download the news page:", exc)
        return

    # 2. Parse it. 'html.parser' ships with Python; 'lxml' is faster when
    #    it is installed.
    soup = BeautifulSoup(html, "html.parser")
    # Attribute-dict form of the lookup. The keyword form is
    #   soup.find(name='div', id='auto-channel-lazyload-article',
    #             class_='article-wrapper')
    # with a TRAILING underscore (class_), because "class" is a keyword.
    container = soup.find(
        name="div",
        attrs={"id": "auto-channel-lazyload-article", "class": "article-wrapper"},
    )
    if container is None:
        print("Article container not found; the page layout may have changed.")
        return

    for item in container.find_all(name="li"):
        headline = item.find(name="h3")
        if not headline:
            continue  # list entries without a headline are skipped
        print(headline.text)

        link = item.find("a")
        print(link.get("href") if link else None)

        summary = item.find(name="p")
        print(summary.text if summary else "")
        print("----->" * 20)

        image = item.find(name="img")
        src = image.get("src") if image else None
        if not src:
            continue  # nothing to download for this entry
        try:
            download_autohome_image(src)
        except (requests.RequestException, OSError) as exc:
            # One broken image must not abort the whole crawl.
            print("Failed to download image:", src, exc)


if __name__ == "__main__":
    crawl_autohome_news()

# From the loop's output onward, the results could just as well be written
# to a file or a database, depending on what is needed.
View Code
# Part 2: log in to Chouti and up-vote a link.
#!/usr/bin/env python
# coding: utf-8
import os

import requests

# NOTE(review): the host was truncated to "https://dig./..." in these notes,
# which is not a valid URL. "dig.chouti.com" is restored from the section
# title (Chouti) -- confirm before relying on it.
CHOUTI_BASE_URL = "https://dig.chouti.com"
CHOUTI_TIMEOUT = 10  # seconds

# Always send browser-like headers, and visit a normal page first; the closer
# the requests look to a real browser, the better.
CHOUTI_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}


def chouti_anonymous_cookies():
    """Visit the hot-list page and return the cookies it sets, as a dict.

    These cookies are not yet authorised; they are sent along with the
    login request and reused afterwards.
    """
    response = requests.get(
        url=CHOUTI_BASE_URL + "/all/hot/recent/1",
        headers=CHOUTI_HEADERS,
        timeout=CHOUTI_TIMEOUT,
    )
    response.raise_for_status()
    return response.cookies.get_dict()


def chouti_login(phone, password, cookies):
    """Post the credentials together with the pre-login *cookies*.

    Returns the login response. The cookies carried by this response are
    deliberately not used by the caller: as in the original script, later
    requests keep sending the cookies obtained before logging in.
    """
    response = requests.post(
        url=CHOUTI_BASE_URL + "/login",
        data={
            "phone": phone,
            "password": password,
            "oneMonth": "1",
        },
        headers=CHOUTI_HEADERS,
        cookies=cookies,
        timeout=CHOUTI_TIMEOUT,
    )
    response.raise_for_status()
    return response


def chouti_vote(link_id, cookies):
    """Up-vote the link *link_id* and return the response body text.

    A successful reply looks like:
    {"result":{"code":"9999", "message":"推薦成功", "data":{"jid":"cdu_53074732774","likedTime":"1530752755154000","lvCount":"21","nick":"aabbccdd","uvCount":"1","voteTime":"小于1分鐘前"}}}
    """
    response = requests.post(
        url=CHOUTI_BASE_URL + "/link/vote",
        params={"linksId": link_id},
        headers=CHOUTI_HEADERS,
        cookies=cookies,
        timeout=CHOUTI_TIMEOUT,
    )
    response.raise_for_status()
    return response.text


def chouti_login_and_vote():
    """Run the whole flow: fetch cookies, log in, vote, print the reply."""
    # Credentials can be supplied through the environment; the fallbacks are
    # the placeholder values from the original notes.
    phone = os.environ.get("CHOUTI_PHONE", "8618912600100")
    password = os.environ.get("CHOUTI_PASSWORD", "wodemima")
    try:
        # 1. Visit a page first to obtain the (unauthorised) cookies.
        cookies = chouti_anonymous_cookies()
        # 2. Log in, sending those cookies. Mind the site's anti-crawler
        #    measures.
        chouti_login(phone, password, cookies)
        # 3. Vote, still with the cookies from step 1.
        print(chouti_vote("20630611", cookies))
    except requests.RequestException as exc:
        print("Chouti request failed:", exc)


if __name__ == "__main__":
    chouti_login_and_vote()
# A few small requests / bs4 snippets.
#!/usr/bin/env python
# coding: utf-8
import re  # only needed by the commented-out digit extraction below

import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth
from requests.auth import HTTPDigestAuth  # alternative scheme, see github_user_demo

# Quick reference -- worth memorising:
#   requests.get(url="http://www.baidu.com")
#       same as requests.request(method="get", url="xxx")
#   requests.post(url="http://www.baidu.com")
#       same as requests.request(method="post", url="xxx")
# Keyword arguments:
#   url      address
#   params   parameters passed in the URL (query string)
#   headers  request headers
#   cookies  cookies
#   data     request body data

SNIPPET_TIMEOUT = 10  # seconds
SNIPPET_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}


def baidu_result_summary(keyword="王歷宏"):
    """Search Baidu for *keyword* and return the result-count text.

    Returns:
        The text of the ``<span class="nums_text">`` element, or None when
        the page contains no such element.

    Raises:
        requests.RequestException: if the request fails.
    """
    response = requests.get(
        url="https://www.baidu.com/s",
        # params is percent-encoded into the URL: https://www.baidu.com/s?wd=...
        params={"wd": keyword},
        headers=SNIPPET_HEADERS,
        timeout=SNIPPET_TIMEOUT,
    )
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    span = soup.find(name='span', attrs={"class": "nums_text"})
    if span is None:
        return None
    # To keep only the digits:
    #   "".join(re.findall(r"\d+", span.text))
    return span.text


# Sending JSON:
#   requests.post(
#       url="http://www.baidu.com",
#       # json={'name': 'alex', 'passwd': '123456'},
#       headers={},
#       cookies={},
#       # When it is unclear whether the server expects form data or a
#       # JSON payload, serialise explicitly (requires "import json"):
#       data=json.dumps({'name': 'alex', 'pwd': '123456'}),
#   )

# Topics still to be written up: file upload, timeout, allow_redirects.


def github_user_demo(user="abc@163.com", password="11223344"):
    """Call the GitHub user API with HTTP basic auth; return the body text.

    Digest auth is used the same way with a different class:
        requests.get(url, auth=HTTPDigestAuth(user, password))

    Raises:
        requests.RequestException: if the request itself fails. An HTTP
            error status is not raised, so the error body can be inspected.
    """
    response = requests.get(
        'https://api.github.com/user',
        auth=HTTPBasicAuth(user, password),
        timeout=SNIPPET_TIMEOUT,
    )
    return response.text


# Proxies:
#   proxies = {
#       "http": "61.172.249.96:80",
#       "https": "http://61.185.219.126:3128",
#   }
#   ret = requests.get("http://www./Proxy", proxies=proxies)
#       (the host name was lost in the original notes)
#   proxies2 = {"http://10.20.1.128": "http://10.10.1.10:5323"}
#
# Proxy dictionary together with a user name and password:
#   from requests.auth import HTTPProxyAuth
#   proxy_dict = {
#       'http': '77.75.105.165',
#       'https': '77.75.105.166',
#   }
#   auth = HTTPProxyAuth('username', 'mypwd')
#   r = requests.get("http://www.google.com", proxies=proxy_dict, auth=auth)


def run_requests_snippets():
    """Run both demos and print their results."""
    try:
        summary = baidu_result_summary()
    except requests.RequestException as exc:
        print("Baidu request failed:", exc)
    else:
        if summary is None:
            print("No 'nums_text' element found in the Baidu response.")
        else:
            print(summary)

    try:
        print(github_user_demo())
    except requests.RequestException as exc:
        print("GitHub request failed:", exc)


if __name__ == "__main__":
    run_requests_snippets()
# The homework as handed in: log in to GitHub and print the profile
# settings. (Author's note: it still had quite a few problems.)
#!/usr/bin/env python
# coding: utf-8
from getpass import getpass

import requests
from bs4 import BeautifulSoup

GITHUB_TIMEOUT = 10  # seconds
GITHUB_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
}


def github_login(session, username, password):
    """Log *session* in to GitHub through the web form.

    A Session is used so that every cookie set along the way -- including
    those set on the redirect that follows the login POST -- is kept and
    sent with later requests. Copying only ``response.cookies`` of the final
    response, as the original did, drops the cookies set on the redirect.

    Raises:
        requests.RequestException: if a request fails.
        RuntimeError: if the login page has no authenticity_token field.
    """
    # 1. Open the login page to obtain the first cookies and the form token.
    login_page = session.get("https://github.com/login", timeout=GITHUB_TIMEOUT)
    login_page.raise_for_status()
    token_input = BeautifulSoup(login_page.text, features='lxml').find(
        name="input", attrs={"name": "authenticity_token"}
    )
    if token_input is None:
        raise RuntimeError("authenticity_token not found on the login page")

    # 2. Submit the login form.
    response = session.post(
        "https://github.com/session",
        data={
            "commit": "Sign in",
            # The notes contained "?" here, a mis-encoded check mark.
            "utf8": "✓",
            "authenticity_token": token_input.get("value"),
            "login": username,
            "password": password,
        },
        timeout=GITHUB_TIMEOUT,
    )
    response.raise_for_status()


def _input_value(soup, name):
    """Return the value attribute of ``<input name=...>``, or None."""
    node = soup.find(name="input", attrs={"name": name})
    return node.get("value") if node is not None else None


def _textarea_text(soup, name):
    """Return the text inside ``<textarea name=...>``, or None.

    A textarea keeps its content as child text, not in a value attribute.
    """
    node = soup.find(name="textarea", attrs={"name": name})
    return node.text if node is not None else None


def _selected_option(soup, name):
    """Return the selected option of ``<select name=...>``, or None.

    ``select.get("option")`` looks up an *attribute* called "option" and is
    therefore always None; the chosen entry is the child <option> carrying
    the "selected" attribute.
    """
    select = soup.find(name="select", attrs={"name": name})
    if select is None:
        return None
    option = select.find("option", selected=True)
    if option is None:
        return None
    return option.get("value") or option.get_text(strip=True)


def github_profile(session):
    """Fetch the profile settings page and return its fields as a dict.

    Fields missing from the page are reported as None.
    NOTE(review): all fields being None presumably means the login did not
    succeed -- confirm against the returned page.
    """
    # 3. The assignment asks for personal info, so open the settings page.
    page = session.get("https://github.com/settings/profile", timeout=GITHUB_TIMEOUT)
    page.raise_for_status()
    # 4. Locate the profile fields.
    soup = BeautifulSoup(page.text, features='lxml')
    return {
        "name": _input_value(soup, "user[profile_name]"),
        "email": _selected_option(soup, "user[profile_email]"),
        "bio": _textarea_text(soup, "user[profile_bio]"),
        "url": _input_value(soup, "user[profile_blog]"),
        "company": _input_value(soup, "user[profile_company]"),
        "location": _input_value(soup, "user[profile_location]"),
    }


def github_profile_homework():
    """Prompt for credentials, log in and print the profile information."""
    username = input("請輸入github賬號:")
    # getpass keeps the password from being echoed to the terminal.
    pwd = getpass("請輸入github密碼:")
    print("請稍等幾秒... ")

    with requests.Session() as session:
        session.headers.update(GITHUB_HEADERS)
        try:
            github_login(session, username, pwd)
            info = github_profile(session)
        except (requests.RequestException, RuntimeError) as exc:
            print("GitHub request failed:", exc)
            return

    print('Name: ', info["name"])
    print('Public email: ', info["email"])
    print('Bio: ', info["bio"])
    print('URL: ', info["url"])
    print('Company: ', info["company"])
    print('Location: ', info["location"])


if __name__ == "__main__":
    github_profile_homework()

# The API route below was also tried; it returns the data directly as a
# dictionary-like JSON document:
#   from requests.auth import HTTPBasicAuth
#   res = requests.get(
#       'https://api.github.com/user',
#       auth=HTTPBasicAuth(username, pwd)
#   )
#   print(res.text)
以下是老師給的指導意見,真是非常好的反饋:
1. 請了解下 Python 的 PEP 8 規範。
2. 你的請求頭一定要寫完整,不要這么暴露你的爬蟲請求,這種行為是不好的習慣。
3. 你代碼的注釋寫在文檔里最好了。
4. 你每個請求一定要 try 一下,這在爬蟲里很重要,你要保證你的爬蟲穩定運行。
5. 你的代碼應該封裝成函數。
6. 你寫任何項目的時候注意下項目結構哈。
7. 同學作業寫的很好了,其實生產中 bs4 還是不多的,pyquery 或者路徑獲取的方式用的很多。
|
|