How to use the Scrapy framework with MongoDB for deduplication and download optimization in Python
def process_item(self, item, spider):
    url = item['file_url']
    name = item['name']
    # dedup on the (url, name) composite key: only insert when no record exists
    result = self.post.find_one({"url": url, "name": name})
    if result:
        pass  # already in the database, skip
    else:
        self.post.insert_one({"url": url, "name": name})
    return item
def file_path(self, request, response=None, info=None):
    return request.meta.get('filename', '')
Here is my situation: I want to dedup on the (url, name) composite key, insert into the database when the pair is not there yet, and then download. But the download is handled by the framework. What I want is to trigger the download directly after the database insert — how do I invoke that download function?
from scrapy import Request

def get_media_requests(self, item, info):
    file_url = item['file_url']
    meta = {'filename': item['name']}
    yield Request(url=file_url, meta=meta)
1 Reply
Core approach: Scrapy's DupeFilter + Item Pipeline for MongoDB deduplication and download optimization
Here is a complete example you can run directly, covering the dedup filter and MongoDB storage optimization:
# settings.py configuration
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'scrapy_db'
DUPEFILTER_CLASS = 'myproject.dupefilters.MongoDupeFilter'
ITEM_PIPELINES = {
'myproject.pipelines.MongoPipeline': 300,
}
# dupefilters.py - MongoDB dedup filter
from datetime import datetime
import hashlib
from pymongo import MongoClient
from scrapy.dupefilters import BaseDupeFilter

class MongoDupeFilter(BaseDupeFilter):
    def __init__(self, mongo_uri, mongo_db):
        self.client = MongoClient(mongo_uri)
        self.db = self.client[mongo_db]
        self.collection = self.db['seen_urls']
        self.collection.create_index('url_hash', unique=True)

    @classmethod
    def from_settings(cls, settings):
        return cls(
            mongo_uri=settings.get('MONGO_URI'),
            mongo_db=settings.get('MONGO_DATABASE')
        )

    def request_seen(self, request):
        # fingerprint the URL and bail out if it has been seen before
        url_hash = hashlib.sha1(request.url.encode()).hexdigest()
        if self.collection.find_one({'url_hash': url_hash}):
            return True
        self.collection.insert_one({
            'url_hash': url_hash,
            'url': request.url,
            'timestamp': datetime.now()
        })
        return False

    def close(self, reason):
        self.client.close()
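Because of the unique index, two concurrent inserts of the same fingerprint cannot both succeed: the second raises DuplicateKeyError. A minimal standalone check of that behaviour (assuming a local MongoDB at the URI from settings.py; collection and field names match the filter above):

# sanity_check.py - not part of the Scrapy project, just a quick verification
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError

client = MongoClient('mongodb://localhost:27017')
seen = client['scrapy_db']['seen_urls']
seen.create_index('url_hash', unique=True)

seen.insert_one({'url_hash': 'abc123', 'url': 'http://example.com'})
try:
    seen.insert_one({'url_hash': 'abc123', 'url': 'http://example.com'})
except DuplicateKeyError:
    print('duplicate rejected by the unique index')
client.close()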
# pipelines.py - MongoDB storage pipeline
import pymongo
from itemadapter import ItemAdapter

class MongoPipeline:
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        collection_name = spider.name + '_items'
        self.db[collection_name].insert_one(ItemAdapter(item).asdict())
        return item
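If you also want the stored items deduplicated on the (url, name) composite key from the question, an upsert does the check-and-insert as one atomic operation: $setOnInsert only writes when the filter matches nothing. A sketch of an alternative process_item (the 'files' collection name is an assumption):

    def process_item(self, item, spider):
        # atomic dedup on the composite key: the document is written
        # only if no record with this (url, name) pair exists yet
        self.db['files'].update_one(
            {'url': item['file_url'], 'name': item['name']},
            {'$setOnInsert': ItemAdapter(item).asdict()},
            upsert=True,
        )
        return item

A unique compound index on ('url', 'name') would enforce the same guarantee at the database level.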
# spider example
import scrapy
from scrapy.linkextractors import LinkExtractor

class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = ['http://example.com']

    def parse(self, response):
        # extract data from the page
        item = {
            'title': response.css('title::text').get(),
            'url': response.url
        }
        yield item
        # follow links; MongoDupeFilter drops any URL already seen
        le = LinkExtractor()
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)
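With the three files in place, start the crawl as usual from the project root (assuming the project is named myproject, as in the settings above):

scrapy crawl myspider

Requests whose URL fingerprint is already in the seen_urls collection are dropped before they ever reach the downloader.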
Key points:
- Dedup mechanism: MongoDupeFilter stores SHA1 URL fingerprints and uses MongoDB's unique index for efficient deduplication
- Storage optimization: MongoPipeline writes each spider's items to its own collection (named after the spider), keeping data tidy
- Simple configuration: just set the MongoDB connection in settings.py and enable the two components
Download optimization tips:
- Tune the concurrency parameters in settings.py:
CONCURRENT_REQUESTS = 32   # more concurrent requests
DOWNLOAD_DELAY = 0.25      # small delay to avoid bans
RETRY_TIMES = 2            # retry failed downloads
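Back to the original question: with a FilesPipeline you never call the download function yourself. Scrapy calls get_media_requests() for every item and downloads whatever Requests it yields, so "download right after the insert" means doing the dedup check and the insert inside get_media_requests() and yielding a Request only for new records. A sketch along those lines (URI, database and collection names are illustrative; FILES_STORE must be set and the class registered in ITEM_PIPELINES):

# files_pipeline.py - dedup-aware file download (sketch)
from pymongo import MongoClient
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline

class DedupFilesPipeline(FilesPipeline):
    def open_spider(self, spider):
        super().open_spider(spider)
        self.client = MongoClient('mongodb://localhost:27017')  # assumed URI
        self.post = self.client['scrapy_db']['files']           # assumed names

    def get_media_requests(self, item, info):
        key = {'url': item['file_url'], 'name': item['name']}
        if self.post.find_one(key) is None:
            # record the new composite key, then let the framework download it
            self.post.insert_one(key)
            yield Request(item['file_url'], meta={'filename': item['name']})

    def file_path(self, request, response=None, info=None):
        return request.meta.get('filename', '')

Duplicates never yield a Request, so they are never downloaded; the insert and the download request happen together in one place, which is exactly the "insert, then download" flow asked about.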
In one sentence: use MongoDB as the dedup store, let Scrapy handle the crawling and downloading, and tune the concurrency settings.

