How to use Scrapy with MongoDB for deduplication and download optimization in Python

def process_item(self, item, spider):
    url = item['file_url']
    name = item['name']

    # composite-key dedup: insert only if this (url, name) pair is unseen
    result = self.post.find_one({"url": url, "name": name})
    if not result:
        self.post.insert_one({"url": url, "name": name})
    return item

def file_path(self, request, response=None, info=None):
    return request.meta.get('filename', '')

Here's my situation: I want to deduplicate on the composite key (url, name), insert into the database when the pair isn't there yet, and then download. But the download is handled inside the framework. What I want is to trigger the download right after the database insert. How do I invoke that download function?

def get_media_requests(self, item, info):
    file_url = item['file_url']
    meta = {'filename': item['name']}
    yield Request(url=file_url, meta=meta)


1 Reply

Core approach: use Scrapy's DupeFilter + Item Pipeline for MongoDB deduplication and download optimization

Here is a complete, runnable example that includes a dedup filter and MongoDB storage:

# settings.py configuration
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scrapy_db'
DUPEFILTER_CLASS = 'myproject.dupefilters.MongoDupeFilter'
ITEM_PIPELINES = {
    'myproject.pipelines.MongoPipeline': 300,
}

# dupefilters.py - MongoDB-backed dedup filter
import hashlib
from datetime import datetime

from pymongo import MongoClient
from scrapy.dupefilters import BaseDupeFilter

class MongoDupeFilter(BaseDupeFilter):
    def __init__(self, mongo_uri, mongo_db):
        self.client = MongoClient(mongo_uri)
        self.db = self.client[mongo_db]
        self.collection = self.db['seen_urls']
        self.collection.create_index('url_hash', unique=True)
    
    @classmethod
    def from_settings(cls, settings):
        return cls(
            mongo_uri=settings.get('MONGO_URI'),
            mongo_db=settings.get('MONGO_DATABASE')
        )
    
    def request_seen(self, request):
        url_hash = hashlib.sha1(request.url.encode()).hexdigest()
        if self.collection.find_one({'url_hash': url_hash}):
            return True
        self.collection.insert_one({
            'url_hash': url_hash,
            'url': request.url,
            'timestamp': datetime.now()
        })
        return False
    
    def close(self, reason):
        self.client.close()
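
One design note: this filter hashes only request.url, so two requests that differ in HTTP method or body count as duplicates. Scrapy's default RFPDupeFilter fingerprints method, URL, and body together; if that distinction matters for your crawl, fold those fields into the hash as well.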

# pipelines.py - MongoDB storage pipeline
import pymongo
from itemadapter import ItemAdapter

class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
    
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )
    
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
    
    def close_spider(self, spider):
        self.client.close()
    
    def process_item(self, item, spider):
        collection_name = spider.name + '_items'
        self.db[collection_name].insert_one(ItemAdapter(item).asdict())
        return item
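
If you also want the question's composite-key (url + name) dedup enforced at the storage layer, one option is to replace the insert with an upsert keyed on those two fields. A minimal sketch of that variant of process_item, assuming items carry the 'file_url' and 'name' fields from the question:

# hypothetical variant of MongoPipeline.process_item: upsert on the composite key
def process_item(self, item, spider):
    data = ItemAdapter(item).asdict()
    self.db[spider.name + '_items'].update_one(
        {'file_url': data.get('file_url'), 'name': data.get('name')},  # match key
        {'$set': data},  # refresh the stored fields instead of inserting a duplicate
        upsert=True,
    )
    return item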

# spider example
import scrapy
from scrapy.linkextractors import LinkExtractor

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://example.com']
    
    def parse(self, response):
        # extract data
        item = {
            'title': response.css('title::text').get(),
            'url': response.url
        }
        yield item
        
        # follow links on the page
        le = LinkExtractor()
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

Key points:

  1. Dedup mechanism: MongoDupeFilter stores SHA1 fingerprints of each URL and relies on a MongoDB unique index for fast lookups.
  2. Storage layout: MongoPipeline writes each spider's items to its own collection (named after the spider), which keeps data from different crawls separate.
  3. Simple configuration: you only need to set the MongoDB connection in settings.py and enable the two components.
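
Back to the original question (downloading right after the database insert): in Scrapy you normally don't invoke the downloader yourself. Instead, gate the download inside a FilesPipeline subclass, so the dedup check, the insert, and the download request all happen in one place. A minimal sketch, not a drop-in for the code above; the URI, database/collection names, and the compound index are assumptions, and as usual FilesPipeline must be registered in ITEM_PIPELINES with FILES_STORE set:

# files_pipeline.py - sketch: dedup-gated file downloads (names are assumptions)
import pymongo
from pymongo.errors import DuplicateKeyError
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

class DedupFilesPipeline(FilesPipeline):
    def open_spider(self, spider):
        super().open_spider(spider)
        self.client = pymongo.MongoClient('mongodb://localhost:27017')  # assumed URI
        self.post = self.client['scrapy_db']['seen_files']              # assumed names
        # a compound unique index enforces the (url, name) composite key
        self.post.create_index([('url', 1), ('name', 1)], unique=True)

    def get_media_requests(self, item, info):
        url, name = item['file_url'], item['name']
        try:
            # the insert doubles as the dedup check thanks to the unique index
            self.post.insert_one({'url': url, 'name': name})
        except DuplicateKeyError:
            return  # (url, name) already seen: skip the download entirely
        yield Request(url, meta={'filename': name})

    def file_path(self, request, response=None, info=None, *, item=None):
        return request.meta.get('filename', '')

The key idea is that get_media_requests is the hook where the download decision is made: yielding a Request schedules the file download, while returning without yielding skips it for that item.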

Download optimization tips:

  • Tune the concurrency parameters in settings.py (AutoThrottle, sketched below, can complement these):
CONCURRENT_REQUESTS = 32  # raise the number of concurrent requests
DOWNLOAD_DELAY = 0.25     # small delay to avoid getting banned
RETRY_TIMES = 2           # retries on failure
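
Scrapy also ships an AutoThrottle extension that adapts the delay to observed server latency, which can be gentler than a fixed DOWNLOAD_DELAY. A possible addition to settings.py, with illustrative values:

# settings.py - optional AutoThrottle configuration (values are illustrative)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0         # initial download delay
AUTOTHROTTLE_MAX_DELAY = 10.0          # ceiling for the adaptive delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 8.0  # average concurrent requests per remote site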

One-sentence summary: use MongoDB as the dedup store, let Scrapy handle the crawling logic, and tune the concurrency settings.
