The Scrapy framework is arguably the most powerful crawler framework out there. Its strength lies in its high extensibility and low coupling, which make it easy for users to modify and extend it.
It ships with several built-in spider templates — scrapy.Spider, RedisSpider, CrawlSpider and RedisCrawlSpider (a deep distributed crawler) — which respectively provide the internal logic for ordinary spiders, distributed spiders and deep (rule-following) spiders. Below we study them from the source code and from their usage.

scrapy.Spider

Source code:

'''
Base class for Scrapy spiders
See documentation in docs/topics/spiders.rst
'''
import logging
import warnings

from scrapy import signals
from scrapy.http import Request
from scrapy.utils.trackref import object_ref
from scrapy.utils.url import url_is_from_spider
from scrapy.utils.deprecate import create_deprecated_class
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.deprecate import method_is_overridden
class Spider(object_ref):
    '''Base class for scrapy spiders. All spiders must inherit from this
    class.
    '''

    name = None
    custom_settings = None

    def __init__(self, name=None, **kwargs):
        if name is not None:
            self.name = name
        elif not getattr(self, 'name', None):
            raise ValueError('%s must have a name' % type(self).__name__)
        self.__dict__.update(kwargs)
        if not hasattr(self, 'start_urls'):
            self.start_urls = []

    @property
    def logger(self):
        logger = logging.getLogger(self.name)
        return logging.LoggerAdapter(logger, {'spider': self})

    def log(self, message, level=logging.DEBUG, **kw):
        '''Log the given message at the given log level

        This helper wraps a log call to the logger within the spider, but you
        can use it directly (e.g. Spider.logger.info('msg')) or use any other
        Python logger too.
        '''
        self.logger.log(level, message, **kw)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        spider._set_crawler(crawler)
        return spider

    def set_crawler(self, crawler):
        warnings.warn('set_crawler is deprecated, instantiate and bound the '
                      'spider to this crawler with from_crawler method '
                      'instead.',
                      category=ScrapyDeprecationWarning, stacklevel=2)
        assert not hasattr(self, 'crawler'), 'Spider already bounded to a ' \
                                             'crawler'
        self._set_crawler(crawler)

    def _set_crawler(self, crawler):
        self.crawler = crawler
        self.settings = crawler.settings
        crawler.signals.connect(self.close, signals.spider_closed)
    def start_requests(self):
        cls = self.__class__
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead (see %s.%s)." % (
                    cls.__module__, cls.__name__
                ),
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)
    def make_requests_from_url(self, url):
        ''' This method is deprecated. '''
        return Request(url, dont_filter=True)

    def parse(self, response):
        raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {}, priority='spider')

    @classmethod
    def handles_request(cls, request):
        return url_is_from_spider(request.url, cls)

    @staticmethod
    def close(spider, reason):
        closed = getattr(spider, 'closed', None)
        if callable(closed):
            return closed(reason)

    def __str__(self):
        return '<%s %r at 0x%0x>' % (type(self).__name__, self.name, id(self))

    __repr__ = __str__


BaseSpider = create_deprecated_class('BaseSpider', Spider)


class ObsoleteClass(object):
    def __init__(self, message):
        self.message = message

    def __getattr__(self, name):
        raise AttributeError(self.message)
spiders = ObsoleteClass(
    "'from scrapy.spider import spiders' no longer works - use "
    "'from scrapy.spiderloader import SpiderLoader' and instantiate "
    "it with your project settings"
)
# Top-level imports
from scrapy.spiders.crawl import CrawlSpider, Rule
from scrapy.spiders.feed import XMLFeedSpider, CSVFeedSpider
from scrapy.spiders.sitemap import SitemapSpider
The members to pay attention to are name (the spider's name), start_urls (the list of start URLs to crawl), allowed_domains (the domains crawling is restricted to) and start_requests (the method that starts the crawl). name, start_urls and allowed_domains are attributes that are generated when the project is created and usually only need small adjustments. start_requests is the entry point of the crawl: by default it iterates over start_urls and yields a Request for each URL. When a site requires logging in, this method can be overridden, as in the sketch below.
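For sites that require logging in, start_requests can be overridden so that the first request submits the login form instead of iterating start_urls. A minimal sketch, assuming a hypothetical login endpoint and form field names (none of them come from the source above):

import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login_demo'                                  # hypothetical spider, for illustration
    start_urls = ['http://example.com/profile']          # assumed page that needs a login

    def start_requests(self):
        # submit the login form first, instead of requesting start_urls directly
        yield scrapy.FormRequest(
            'http://example.com/login',                  # assumed login endpoint
            formdata={'user': 'me', 'pass': 'secret'},   # assumed form field names
            callback=self.after_login,
        )

    def after_login(self, response):
        # once the session cookie is set, crawl the real start urls
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        self.logger.info('Fetched %s after logging in', response.url)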
CrawlSpider

A deep crawler: driven by link-extraction rules, it automatically extracts the links on a page that match the rules, requests and parses them, and keeps following links deeper and deeper.

Source code:

'''
This modules implements the CrawlSpider which is the recommended spider to use
for scraping typical web sites that requires crawling pages.

See documentation in docs/topics/spiders.rst
'''
import copy
import six

from scrapy.http import Request, HtmlResponse
from scrapy.utils.spider import iterate_spider_output
from scrapy.spiders import Spider


def identity(x):
    return x
class Rule(object):

    def __init__(self, link_extractor, callback=None, cb_kwargs=None, follow=None,
                 process_links=None, process_request=identity):
        self.link_extractor = link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request
        if follow is None:
            self.follow = False if callback else True
        else:
            self.follow = follow
class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        return []

    def process_results(self, response, results):
        return results

    def _build_request(self, rule, link):
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
    def _compile_rules(self):
        def get_method(method):
            if callable(method):
                return method
            elif isinstance(method, six.string_types):
                return getattr(self, method, None)

        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule.callback = get_method(rule.callback)
            rule.process_links = get_method(rule.process_links)
            rule.process_request = get_method(rule.process_request)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider

    def set_crawler(self, crawler):
        super(CrawlSpider, self).set_crawler(crawler)
        self._follow_links = crawler.settings.getbool('CRAWLSPIDER_FOLLOW_LINKS', True)
CrawlSpider inherits from Spider, so it keeps the common attributes and methods, and adds a rules attribute (a collection of link-extraction rules). The difference is that CrawlSpider implements the parse method internally, so parse must not be overridden in a CrawlSpider subclass:

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

It also exposes an overrideable method, parse_start_url(response).

rules
rules contains one or more Rule objects. Each Rule defines a particular behaviour for crawling the site. If multiple rules match the same link, the first one, according to the order in which they are defined in this attribute, will be used.

class scrapy.spiders.Rule(
    link_extractor,
    callback=None,
    cb_kwargs=None,
    follow=None,
    process_links=None,
    process_request=None
)
link_extractor: a Link Extractor object that defines which links to extract (Link Extractor objects are described below).

callback: a callable, or the name of a method on the spider, called for each link extracted by link_extractor; the callback receives a response as its first argument.

Note: when writing crawl rules, avoid using parse as the callback. CrawlSpider uses the parse method to implement its own logic, so overriding parse breaks the spider.

follow: a boolean specifying whether the links extracted from responses matched by this rule should be followed. If callback is None, follow defaults to True; otherwise it defaults to False.

process_links: the name of a function on the spider that is called with the list of links extracted by link_extractor; it is mainly used for filtering.

process_request: the name of a function on the spider that is called for every request extracted by this rule (used to filter requests). A short sketch using process_links follows below.
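A minimal sketch of process_links (the spider name, domain, URL pattern and filter method below are made up for illustration): the rule hands its extracted link list to a spider method that can drop unwanted links before any Request is built.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class FilteredSpider(CrawlSpider):
    name = 'filtered_demo'                    # hypothetical spider
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # follow category pages, but let drop_logout() filter the link list first
        Rule(LinkExtractor(allow=r'/category/'),
             process_links='drop_logout',
             follow=True),
    )

    def drop_logout(self, links):
        # remove any link that would log the session out
        return [link for link in links if 'logout' not in link.url]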
Link Extractors

class scrapy.linkextractors.LinkExtractor

The purpose of a Link Extractor is simple: to extract links. Each LinkExtractor has a single public method, extract_links(), which receives a Response object and returns scrapy.link.Link objects. A Link Extractor is instantiated once, and its extract_links method is then called repeatedly, once per response, to extract links (a short usage sketch follows the parameter list below).

class scrapy.linkextractors.LinkExtractor(
    allow=(),
    deny=(),
    allow_domains=(),
    deny_domains=(),
    deny_extensions=None,
    restrict_xpaths=(),
    tags=('a', 'area'),
    attrs=('href',),
    canonicalize=True,
    unique=True,
    process_value=None
)
Main parameters:

allow: only URLs matching the regular expression(s) given here are extracted; if empty, all links match.

deny: URLs matching this regular expression (or list of regular expressions) are excluded and will never be extracted.

allow_domains: only links within these domains are extracted.

deny_domains: links within these domains are never extracted.

restrict_xpaths: XPath expressions that, together with allow, restrict the regions of the page links are extracted from.
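A short usage sketch of extract_links() as described above; the URL and HTML body are made up purely to exercise the call:

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# a hand-built response, standing in for one the engine would normally pass in
body = b'<html><body><a href="/Items/1">item 1</a> <a href="/about">about</a></body></html>'
response = HtmlResponse(url='http://example.com/', body=body, encoding='utf-8')

extractor = LinkExtractor(allow=r'Items/')      # instantiate once
for link in extractor.extract_links(response):  # call once per response
    print(link.url, link.text)                  # e.g. http://example.com/Items/1 item 1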
Example:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class TestSpider(CrawlSpider):
    name = 'Test'
    allowed_domains = ['Test.com']
    start_urls = ['http://Test.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_test', follow=True),
    )

    def parse_test(self, response):
        items = {}
        ············
        return items
RedisSpider and RedisCrawlSpider

Scrapy-redis provides the following four components:

Scheduler:
The Scrapy component directly tied to the "queue of requests to crawl" is the Scheduler: it enqueues new requests (pushes them onto the Scrapy queue) and pops the next request to be crawled. It organizes the pending queue as a dictionary keyed by priority, for example:

{
    priority 0: queue 0,
    priority 1: queue 1,
    priority 2: queue 2,
}
The priority carried by each request decides which queue it is pushed into, and when dequeuing, queues with smaller priority values are served first. Managing this priority-keyed dictionary of queues requires the Scheduler to provide a whole set of methods, but in the distributed setup the original Scheduler can no longer be used, so the scheduler component from Scrapy-redis is used instead.
Duplication Filter:

Scrapy implements request deduplication with a set: the fingerprint of every request already sent is stored in a set, and each new request's fingerprint is checked against it. If the fingerprint is already in the set, the request has been sent before; if not, processing continues. The core of this check looks like this:

def request_seen(self, request):
    # self.fingerprints is the set of fingerprints seen so far
    fp = self.request_fingerprint(request)
    # this is the core deduplication check
    if fp in self.fingerprints:
        return True
    self.fingerprints.add(fp)
    if self.file:
        self.file.write(fp + os.linesep)
In scrapy-redis deduplication is done by the Duplication Filter component, which takes advantage of the fact that a redis set never stores duplicates. The scrapy-redis scheduler receives requests from the engine, stores each request's fingerprint in a redis set to check for duplicates, and pushes the requests that are not duplicates into the redis request queue. When the engine asks for a request (on behalf of the spider), the scheduler pops a request from the redis request queue according to priority and returns it to the engine, which hands it to the spider to process.

Item Pipeline:
The engine passes the Items scraped (and returned by the spider) to the Item Pipeline; the scrapy-redis Item Pipeline stores them in the redis items queue. With this modified Item Pipeline it is easy to pull items out of the items queue by key, which makes it possible to run a separate cluster of item-processing workers. A sketch of the settings used to enable these components follows.
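A minimal sketch of the settings.py entries typically used to switch a project over to these scrapy-redis components (the redis URL below assumes a local instance and is not taken from the text above):

# settings.py -- scrapy-redis related entries only

# replace the default scheduler and dupefilter with the redis-backed ones
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# keep the request queue and fingerprint set in redis between runs
SCHEDULER_PERSIST = True

# store scraped items in the redis items queue
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# address of the shared redis instance (assumed local here)
REDIS_URL = 'redis://localhost:6379'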
Base Spider

The original scrapy Spider class is no longer used directly. The rewritten RedisSpider inherits from both Spider and RedisMixin, where RedisMixin is the class that reads URLs from redis. When a spider derived from RedisSpider starts, setup_redis is called; it connects to the redis database and hooks up two signals: one fired when the spider is idle, which calls spider_idle — this calls schedule_next_requests to keep the spider alive and raises the DontCloseSpider exception; and one fired when an item is scraped, which calls item_scraped — this also calls schedule_next_requests to fetch the next request.
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from . import connection, defaults
from .utils import bytes_to_str
class RedisMixin(object):
    '''Mixin class to implement reading urls from a redis queue.'''
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        '''Returns a batch of start requests from redis.'''
        return self.next_requests()
    def setup_redis(self, crawler=None):
        '''Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        '''
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError('crawler is required')

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError('redis_key must not be empty')
        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError('redis_batch_size must be an integer')

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                         self.__dict__)
        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        '''Returns a request to be scheduled or none.'''
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        # XXX: Do we need to use a timeout here?
        found = 0
        # TODO: Use redis pipeline execution.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)
        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
    def make_request_from_data(self, data):
        '''Returns a Request instance from data coming from Redis.

        By default, ``data`` is an encoded URL. You can override this method to
        provide your own message decoding.

        Parameters
        ----------
        data : bytes
            Message from redis.

        '''
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)
    def schedule_next_requests(self):
        '''Schedules a request if available'''
        # TODO: While there is capacity, schedule a batch of redis requests.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        '''Schedules a request if available, otherwise waits.'''
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider
class RedisSpider(RedisMixin, Spider):
    '''Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from..
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: '<spider.name>:start_urls')
        Default Redis key where to fetch start URLs from..
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: False)
        Use SET operations to retrieve messages from the redis queue. If False,
        the messages are retrieve using the LPOP command.
    REDIS_ENCODING : str (default: 'utf-8')
        Default encoding to use when decoding messages from redis queue.

    '''

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
class RedisCrawlSpider(RedisMixin, CrawlSpider):
    '''Spider that reads urls from redis queue when idle.

    Attributes
    ----------
    redis_key : str (default: REDIS_START_URLS_KEY)
        Redis key where to fetch start URLs from..
    redis_batch_size : int (default: CONCURRENT_REQUESTS)
        Number of messages to fetch from redis on each attempt.
    redis_encoding : str (default: REDIS_ENCODING)
        Encoding to use when decoding messages from redis queue.

    Settings
    --------
    REDIS_START_URLS_KEY : str (default: '<spider.name>:start_urls')
        Default Redis key where to fetch start URLs from..
    REDIS_START_URLS_BATCH_SIZE : int (deprecated by CONCURRENT_REQUESTS)
        Default number of messages to fetch from redis on each attempt.
    REDIS_START_URLS_AS_SET : bool (default: True)
        Use SET operations to retrieve messages from the redis queue.
    REDIS_ENCODING : str (default: 'utf-8')
        Default encoding to use when decoding messages from redis queue.

    '''

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisCrawlSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
The scrapy_redis package provides not only RedisSpider but also RedisCrawlSpider, which combines distributed crawling with deep (rule-based) crawling; the remaining redis distributed components will be covered one by one later. These redis distributed spiders add a redis_key attribute, which names the redis key the spider reads its start URLs from (a sketch of pushing a start URL into this key follows the example below).

Example:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
class TestSpider(RedisCrawlSpider):
    name = 'test'
    allowed_domains = ['www.']
    redis_key = 'testspider:start_urls'

    rules = [
        # follow the links to each page of the listing
        Rule(link_extractor=LinkExtractor(allow=('/?page=\d+'))),
        # follow the links to each company's detail page and parse them
        Rule(link_extractor=LinkExtractor(allow=('/\d+')), callback='parse_item')
    ]
    def parse_item(self, response):
        ······
        return item
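Once such a spider is running and sitting idle, the crawl is kicked off by pushing a start URL into the key named by redis_key. A minimal sketch with the redis Python client, assuming a local redis instance and the testspider:start_urls key from the example above:

import redis

# connect to the same redis instance the spider uses (assumed to be local)
r = redis.StrictRedis(host='localhost', port=6379)

# seed the start-URL list; the idle RedisCrawlSpider will pick it up
# (this assumes the default REDIS_START_URLS_AS_SET = False; use sadd for a set)
r.lpush('testspider:start_urls', 'http://www.example.com/')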
Further configuration options are not covered here; later posts will analyze more of these components in depth.