Tips:
1. Works for both http and https pages.
2. Run the following command from the top-level tutorial directory to see the result: scrapy crawl zdaye
3. Scrapy is not part of the Python standard library; install it before use: pip install scrapy
Run scrapy startproject tutorial to create a new Scrapy project. This generates a tutorial directory with the following contents:
tutorial/
    scrapy.cfg            # project configuration file
    tutorial/             # the project's Python module; you will add your code here
        __init__.py
        items.py          # the project's items file
        pipelines.py      # the project's pipelines file
        settings.py       # the project's settings file
        spiders/          # directory where spider code lives
            __init__.py
            ...
Write the spider: create a new file zdaye_spider.py under the tutorial/spiders/ directory.
import scrapy


class KdlSpider(scrapy.spiders.Spider):
    name = "zdaye"

    def start_requests(self):
        url = "https://example.com"
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        print(response.status)
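The parse callback above only prints the HTTP status code, which is enough to verify that requests go out through the proxies. As a sketch, the same callback could also extract something from the page (the XPath below is an illustrative assumption; any HTML page has a <title>):

import scrapy


class KdlSpider(scrapy.spiders.Spider):
    name = "zdaye"

    def start_requests(self):
        yield scrapy.Request("https://example.com", callback=self.parse)

    def parse(self, response):
        # the status code confirms the request succeeded through the proxy
        print(response.status)
        # illustrative extraction: print the page <title>
        print(response.xpath("//title/text()").get())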
Add a custom extension: create a new file myextend.py under the tutorial/ directory. To use it, you only need to replace api_url with your own proxy-extraction API and adjust the extraction interval in the time.sleep call.
import time
import threading

import requests
from scrapy import signals

# API for fetching proxy IPs
api_url = 'http://www.***.com/ShortProxy/GetIP/?api=1234567890&akey=8a17ca305f683620&count=10&timespan=3&type=3'

foo = True


class Proxy:
    def __init__(self):
        self._proxy_list = requests.get(api_url).json().get('data').get('proxy_list')

    @property
    def proxy_list(self):
        return self._proxy_list

    @proxy_list.setter
    def proxy_list(self, value):
        self._proxy_list = value


pro = Proxy()
print(pro.proxy_list)


class MyExtend:
    def __init__(self, crawler):
        self.crawler = crawler
        # Bind the custom methods to Scrapy signals so the extension starts
        # and stops together with the spider engine.
        # Scrapy signals docs: https://www.osgeo.cn/scrapy/topics/signals.html
        # Scrapy extensions docs: https://www.osgeo.cn/scrapy/topics/extensions.html
        crawler.signals.connect(self.start, signals.engine_started)
        crawler.signals.connect(self.close, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start(self):
        # refresh the proxy list in a background thread
        t = threading.Thread(target=self.extract_proxy)
        t.start()

    def extract_proxy(self):
        while foo:
            pro.proxy_list = requests.get(api_url).json().get('data').get('proxy_list')
            # fetch a fresh batch of IPs every 15 seconds
            time.sleep(15)

    def close(self):
        global foo
        foo = False
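The extension assumes the API returns JSON containing a data.proxy_list field. The exact format depends on your proxy provider; the structure assumed by the code above is roughly the following (the values are illustrative placeholders, not real data):

# Illustrative (assumed) response body of api_url; if your provider returns a
# different structure, adjust the .get('data').get('proxy_list') chain above.
example_response = {
    "data": {
        "proxy_list": [
            "111.111.111.111:8888",   # each entry is an "ip:port" string
            "222.222.222.222:8080",
        ]
    }
}
# pro.proxy_list then holds this list of "ip:port" strings for the middleware.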
Add the ProxyDownloaderMiddleware (the proxy middleware) to middlewares.py. Replace the placeholders in the code: username (your username) and password (your password).
from scrapy import signals
from .myextend import pro
import random

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class TutorialSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TutorialDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ProxyDownloaderMiddleware:

    def process_request(self, request, spider):
        proxy = random.choice(pro.proxy_list)

        # username/password authorization
        username = "username"
        password = "password"
        request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": proxy}

        # whitelisted terminal-IP authorization
        # request.meta['proxy'] = "http://%(proxy)s/" % {"proxy": proxy}

        return None
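Before enabling the middleware, it can help to confirm that a single proxy from the list actually works outside Scrapy. A minimal standalone sketch using requests (the proxy placeholder and the test URL are illustrative assumptions; take one real "ip:port" entry from pro.proxy_list):

# Standalone sanity check (sketch) of one proxy with username/password authorization.
import requests

username = "username"    # replace with your username
password = "password"    # replace with your password
proxy = "ip:port"        # replace with one entry from pro.proxy_list

proxies = {
    "http": "http://%s:%s@%s/" % (username, password, proxy),
    "https": "http://%s:%s@%s/" % (username, password, proxy),
}

resp = requests.get("https://example.com", proxies=proxies, timeout=10)
print(resp.status_code)  # 200 means the proxy and credentials were accepted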
Enable the ProxyDownloaderMiddleware proxy middleware and the custom extension in settings.py:
BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.ProxyDownloaderMiddleware': 100,
}

LOG_LEVEL = 'WARNING'

EXTENSIONS = {
    'tutorial.myextend.MyExtend': 300,
}
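Optionally, instead of hard-coding username and password inside ProxyDownloaderMiddleware, they can be read from settings.py through from_crawler. A sketch of that variant (PROXY_USER and PROXY_PASSWORD are assumed custom setting names, not built-in Scrapy settings):

# settings.py (assumed custom setting names)
PROXY_USER = "username"
PROXY_PASSWORD = "password"


# middlewares.py: variant of ProxyDownloaderMiddleware that reads the settings
import random

from .myextend import pro


class ProxyDownloaderMiddleware:
    def __init__(self, user, pwd):
        self.user = user
        self.pwd = pwd

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        return cls(crawler.settings.get('PROXY_USER'),
                   crawler.settings.get('PROXY_PASSWORD'))

    def process_request(self, request, spider):
        proxy = random.choice(pro.proxy_list)
        request.meta['proxy'] = "http://%s:%s@%s/" % (self.user, self.pwd, proxy)
        return None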