Python: Building a Dynamically Configurable Crawler with Scrapy
May 26, 2015
This article picks up where the previous post left off. In the last post we showed how to run a Scrapy spider programmatically; this time we'll show how to crawl data from many different websites by maintaining a set of per-site crawl rules.
The concrete goal is this: a Rule table in the database holds one row per website, and each row describes how to crawl that site (allowed domains, start URLs, link-extraction rules, and the XPath expressions for the fields to extract).
We use SQLAlchemy to map the database. The structure of the Rule table is as follows:
```python
from sqlalchemy import Column, String, DateTime, Integer
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Rule(Base):
    __tablename__ = 'rules'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    allow_domains = Column(String)
    start_urls = Column(String)
    next_page = Column(String)
    allow_url = Column(String)
    extract_from = Column(String)
    title_xpath = Column(String)
    body_xpath = Column(String)
    publish_time_xpath = Column(String)
    source_site_xpath = Column(String)
    enable = Column(Integer)
```
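The run script later in this post imports `DBSession` from `model.config`, a module that isn't shown here. A minimal sketch of what it might contain is below; the SQLite connection URL is an assumption for illustration, not the original project's configuration:

```python
# model/config.py -- hypothetical sketch; the real project may use a different database
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Connection URL is an assumption; swap in MySQL/PostgreSQL as needed
engine = create_engine('sqlite:///spider.db')

# Session factory; calling DBSession() yields a session for querying Rule rows
DBSession = sessionmaker(bind=engine)
```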
Next, we rewrite our spider as `DeepSpider`, in a module named `deep_spider.py` (matching the import in the run script below):
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor


class Article(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    body = scrapy.Field()
    publish_time = scrapy.Field()
    source_site = scrapy.Field()


class DeepSpider(CrawlSpider):
    name = "Deep"

    def __init__(self, rule):
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allow_domains.split(",")
        self.start_urls = rule.start_urls.split(",")
        rule_list = []
        # Rule for following the "next page" links
        if rule.next_page:
            rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page)))
        # Rule for extracting article links
        rule_list.append(Rule(LinkExtractor(
            allow=[rule.allow_url],
            restrict_xpaths=[rule.extract_from]),
            callback='parse_item'))
        self.rules = tuple(rule_list)
        super(DeepSpider, self).__init__()

    def parse_item(self, response):
        self.log('Hi, this is an article page! %s' % response.url)
        article = Article()
        article["url"] = response.url
        title = response.xpath(self.rule.title_xpath).extract()
        article["title"] = title[0] if title else ""
        body = response.xpath(self.rule.body_xpath).extract()
        article["body"] = '\n'.join(body) if body else ""
        publish_time = response.xpath(self.rule.publish_time_xpath).extract()
        article["publish_time"] = publish_time[0] if publish_time else ""
        source_site = response.xpath(self.rule.source_site_xpath).extract()
        article["source_site"] = source_site[0] if source_site else ""
        return article
```
Note that `self.rules` must be assigned before `super(DeepSpider, self).__init__()` is called, because `CrawlSpider` compiles the rules during its own initialization; setting them afterwards would have no effect.
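To make the field semantics concrete, here is a hypothetical rule for an imaginary news site; the domain, URLs, regex, and XPath expressions are all invented for illustration:

```python
# Hypothetical example row -- site, URLs and XPaths are made up
from model.config import DBSession
from model.rule import Rule

rule = Rule(
    name='example_news',
    allow_domains='news.example.com',              # becomes spider.allowed_domains
    start_urls='http://news.example.com/list',     # becomes spider.start_urls
    next_page='//a[@class="next"]',                # XPath of the "next page" link
    allow_url=r'/article/\d+\.html',               # regex article URLs must match
    extract_from='//div[@class="article-list"]',   # region to extract links from
    title_xpath='//h1/text()',
    body_xpath='//div[@class="content"]//text()',
    publish_time_xpath='//span[@class="time"]/text()',
    source_site_xpath='//span[@class="source"]/text()',
    enable=1,
)

session = DBSession()
session.add(rule)
session.commit()
```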
To run multiple spiders at the same time, we need to tweak the run script from the previous post slightly:
```python
# -*- coding: utf-8 -*-
from spiders.deep_spider import DeepSpider
from model.config import DBSession
from model.rule import Rule

# scrapy api
from scrapy import signals, log
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings

RUNNING_CRAWLERS = []


def spider_closing(spider):
    """Activates on spider closed signal"""
    log.msg("Spider closed: %s" % spider, level=log.INFO)
    RUNNING_CRAWLERS.remove(spider)
    if not RUNNING_CRAWLERS:
        reactor.stop()


log.start(loglevel=log.DEBUG)
settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")

db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    crawler = Crawler(settings)
    spider = DeepSpider(rule)  # instantiate every spider using rule
    RUNNING_CRAWLERS.append(spider)

    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

# blocks process so always keep as the last statement
reactor.run()
```
We query the enabled rules from the database and, for each rule, instantiate a `DeepSpider` with its own `Crawler`. The `spider_closed` signal is connected to `spider_closing`, so the reactor is stopped once the last running spider has finished.
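This script targets the Scrapy 0.24-era API (`scrapy.contrib`, `scrapy.log`, a manually driven `Crawler` and reactor). As an aside, on Scrapy 1.x and later the same per-rule loop could be written with `CrawlerProcess`; the sketch below assumes `DeepSpider`'s imports have been updated to `scrapy.spiders.CrawlSpider` and `scrapy.linkextractors.LinkExtractor`, and is not part of the original project:

```python
# Sketch only: equivalent loop on Scrapy 1.x+, which manages the reactor itself
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({"USER_AGENT": "Mozilla/5.0 ..."})
for rule in db.query(Rule).filter(Rule.enable == 1):
    process.crawl(DeepSpider, rule)  # rule is passed to DeepSpider.__init__
process.start()                      # blocks until every spider has finished
```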
Run this script and every enabled site is crawled within a single process.
Now we can add rules for hundreds or even thousands of websites to the Rule table and crawl them all without writing a single extra line of code. You could of course build a web front end to maintain the Rule table. The rules don't have to live in a database either; they could come from anywhere, such as a configuration file, as sketched below.
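For example, if the rules lived in a JSON file instead of the database, the run script's query could be replaced with something like the following. The file name and the use of a namedtuple are assumptions for illustration; the field names mirror the Rule columns:

```python
# Hypothetical alternative: load rules from rules.json instead of the database.
import json
from collections import namedtuple

FileRule = namedtuple('FileRule', [
    'name', 'allow_domains', 'start_urls', 'next_page', 'allow_url',
    'extract_from', 'title_xpath', 'body_xpath',
    'publish_time_xpath', 'source_site_xpath', 'enable',
])

with open('rules.json') as f:
    rules = [FileRule(**item) for item in json.load(f)]

enabled_rules = [r for r in rules if r.enable == 1]
```

Since `DeepSpider` only reads attributes off the rule object, any object exposing these fields (a namedtuple, a plain class) can stand in for the SQLAlchemy row.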
I've only been working with Scrapy for a short while, so if I've misunderstood anything or there is a better solution, please let me know 🙂
You can find the complete project for this article on GitHub.
Original post: http://wuchong.me/blog/2015/05/22/running-scrapy-dynamic-and-configurable/