https://github.com/intohole/xspider 是在重复造轮子！但让我们借此一起熟悉一下抓取框架。

xspider 简单 python 抓取框架

xspider

抓取单线程
简单 api 使用
xpath/css/json 提取器
多种队列
架构代码逻辑清晰，可以了解 spider 抓取过程
It's easy to crawl web pages and extract data from them.

main.py:
from xspider.spider.spider import BaseSpider
from xspider.filters import urlfilter
from kuailiyu import KuaiLiYu
if __name__ == "__main__":
    # Configure a spider named "kuailiyu" that starts from the site's front
    # page and hands every fetched page to the KuaiLiYu page processor.
    spider = BaseSpider(
        name="kuailiyu",
        page_processor=KuaiLiYu(),
        allow_site=["kuailiyu.cyzone.cn"],
        start_urls=["http://kuailiyu.cyzone.cn/"],
    )
    # Register a regex URL filter covering article pages and paginated index
    # pages. NOTE(review): presumably only matching URLs are followed --
    # confirm against xspider's UrlRegxFilter implementation.
    spider.url_filters.append(urlfilter.UrlRegxFilter([
        "kuailiyu.cyzone.cn/article/[0-9]*.html$",
        "kuailiyu.cyzone.cn/index_[0-9]+.html$",
    ]))
    spider.start()
kuailiyu.py:
from xspider import processor
from xspider.selector import xpath_selector
from xspider import model
class KuaiLiYu(processor.PageProcessor.PageProcessor):
    """Page processor that records the title and URL of each page."""

    def __init__(self):
        super(KuaiLiYu, self).__init__()
        # XPath selecting the text node(s) of the document's <title> element.
        title_xpath = "//title/text()"
        self.title_extractor = xpath_selector.XpathSelector(path=title_xpath)

    def process(self, page, spider):
        """Build the field container for one fetched page.

        Args:
            page: The fetched page; its ``url`` attribute is read and the
                page itself is passed to the title selector.
            spider: The spider that fetched the page (not used here).

        Returns:
            A ``Fileds`` container with ``"title"`` and ``"url"`` entries.
        """
        record = model.fileds.Fileds()
        # NOTE(review): the type returned by find() (string vs. list of
        # matches) is defined by xspider's XpathSelector -- confirm there.
        title = self.title_extractor.find(page)
        record["title"] = title
        record["url"] = page.url
        return record

抓取部分有以下工程代码

Python爬虫小框架如何使用？一起来造作吧！

ionicwang 1楼

助攻 https://github.com/howie6879/talonspider

nodeper 2楼

import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

class SimpleCrawler:
    """Minimal crawler built on ``requests`` and BeautifulSoup.

    It can fetch a page, parse it with a small selector mini-language,
    extract absolute links, and walk a site depth-first up to a depth limit.
    """

    def __init__(self, base_url, headers=None):
        """Create a crawler.

        Args:
            base_url: URL that relative links are resolved against in
                ``extract_links`` (a trailing slash is optional).
            headers: Optional dict of request headers. When omitted or
                empty, a desktop-browser ``User-Agent`` is sent.
        """
        self.base_url = base_url
        self.headers = headers or {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        # A single session is shared by every request this crawler makes.
        self.session = requests.Session()

    def fetch_page(self, url, params=None):
        """Fetch ``url`` and return its body as text.

        Args:
            url: Address to request.
            params: Optional query-string parameters.

        Returns:
            The decoded response text, or ``None`` when the request fails
            (network error, timeout after 10 seconds, or HTTP error status).
            The failure is printed rather than raised.
        """
        try:
            response = self.session.get(url, headers=self.headers, params=params, timeout=10)
            response.raise_for_status()
            # Decode with the encoding detected from the body instead of
            # relying on the (possibly missing) charset header.
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException as e:
            print(f"请求失败: {e}")
            return None

    def parse_html(self, html, selector=None):
        """Parse ``html`` and optionally query it.

        Args:
            html: Markup to parse.
            selector: How to query the document:
                * starts with ``.`` or ``#`` -- treated as a CSS selector;
                * starts with ``re:`` -- the remainder is a regular
                  expression applied to the raw markup;
                * anything else -- treated as a tag name;
                * ``None``/empty -- no query is run.

        Returns:
            ``None`` when ``html`` is empty; the parsed ``BeautifulSoup``
            document when no selector is given; otherwise the list of
            matches (elements, or ``re.findall`` results for ``re:``).
        """
        if not html:
            return None

        soup = BeautifulSoup(html, 'html.parser')

        if selector:
            # CSS selector
            if selector.startswith(('.', '#')):
                return soup.select(selector)
            # Regular expression against the raw markup
            elif selector.startswith('re:'):
                pattern = selector[3:]
                return re.findall(pattern, html)
            # Fallback: look up by tag name
            else:
                return soup.find_all(selector)
        return soup

    def _resolve_link(self, href):
        """Turn one ``href`` value into an absolute http(s) URL, or ``None``."""
        href = href.strip()
        # Empty and same-page fragment links lead nowhere new.
        if not href or href.startswith('#'):
            return None
        # Normalise the base to end with exactly one slash so a path-relative
        # href is appended below it (and no "//" appears in the result).
        # urljoin also handles root-relative ("/x") and protocol-relative
        # ("//host/x") references.
        absolute = urljoin(self.base_url.rstrip('/') + '/', href)
        # Drop anything that is not fetchable over HTTP (mailto:, tel:,
        # javascript:, data:, ...).
        if not absolute.lower().startswith(('http://', 'https://')):
            return None
        return absolute

    def extract_links(self, html, pattern=None):
        """Collect the absolute http(s) links found in ``html``.

        Args:
            html: Markup to scan. ``None`` or an empty string yields ``[]``.
            pattern: Optional regular expression; when given, only links for
                which ``re.search(pattern, link)`` matches are kept.

        Returns:
            A list of absolute URLs in document order (duplicates are kept).
        """
        if not html:
            return []

        soup = BeautifulSoup(html, 'html.parser')
        links = []

        for anchor in soup.find_all('a', href=True):
            url = self._resolve_link(anchor['href'])
            if url is None:
                continue
            if not pattern or re.search(pattern, url):
                links.append(url)

        return links

    def crawl(self, start_url, max_depth=2, callback=None, max_links_per_page=5):
        """Crawl depth-first from ``start_url``.

        Args:
            start_url: First page to fetch (depth 0).
            max_depth: Deepest level that is still fetched; links are only
                followed from pages shallower than this.
            callback: Optional ``callback(url, html)`` invoked for every
                page that was fetched successfully with a non-empty body.
            max_links_per_page: How many links to follow from each page, to
                keep the crawl bounded. ``None`` follows all of them.
        """
        visited = set()

        def _crawl(url, depth):
            if depth > max_depth or url in visited:
                return

            visited.add(url)
            print(f"正在爬取: {url} (深度: {depth})")

            html = self.fetch_page(url)
            if not html:
                # Fetch failed or the page is empty: nothing to hand to the
                # callback and nothing to extract links from.
                return
            if callback:
                callback(url, html)

            # Follow links from every level above the depth limit.
            if depth < max_depth:
                links = self.extract_links(html)
                for link in links[:max_links_per_page]:
                    _crawl(link, depth + 1)

        _crawl(start_url, 0)

# Usage example
if __name__ == "__main__":
    # Step 1: build a crawler whose relative links resolve against httpbin.
    crawler = SimpleCrawler("https://httpbin.org")

    # Step 2: callback that crawl() invokes with each fetched page.
    def process_data(url, html):
        """Print the page title and the start of its first three paragraphs."""
        document = BeautifulSoup(html, 'html.parser')
        if document.title:
            title = document.title.string
        else:
            title = "无标题"
        print(f"页面标题: {title}")

        # Show at most three <p> elements, 50 characters of text each.
        first_paragraphs = crawler.parse_html(html, 'p')[:3]
        for number, paragraph in enumerate(first_paragraphs, start=1):
            print(f"段落{number}: {paragraph.get_text()[:50]}...")

    # Step 3: crawl the sample page and the pages it links to directly.
    crawler.crawl("https://httpbin.org/html", max_depth=1, callback=process_data)

    # Step 4: fetch and parse a single page without crawling.
    page_html = crawler.fetch_page("https://httpbin.org/html")
    if page_html:
        # Look up <h1> elements by tag name.
        h1_tags = crawler.parse_html(page_html, 'h1')
        print(f"\n找到 {len(h1_tags)} 个h1标签")

        # Pull the title out of the raw markup with a regular expression.
        title_matches = crawler.parse_html(page_html, 're:<title>(.*?)</title>')
        print(f"正则提取标题: {title_matches}")

这个框架的核心设计思路：