Python中如何使用BeautifulSoup和正则表达式爬取网站并解决常见问题

1、代码一的问题 
from urllib.request import urlopen 
from bs4 import BeautifulSoup  
import re 
def getLinks(articleUrl):
    """Fetch a CCB home page and return ``span`` tags from its copyright block.

    The page URL is ``http://www.ccb.com/cn/home`` + ``articleUrl``. The parsed
    document is printed for debugging, then the ``div`` with class
    ``copy-right_text`` is searched for ``span`` tags.

    Raises AttributeError if that ``div`` is not present in the parsed tree,
    because ``find`` then returns None.
    """
    html = urlopen("http://www.ccb.com/cn/home" + articleUrl)

    bsObj = BeautifulSoup(html, 'lxml')
    print("bsObj=", bsObj)
    # NOTE(review): the compiled pattern is findAll's second positional
    # argument; presumably bs4 treats that as an attribute (CSS class) filter
    # rather than a text filter -- confirm against the bs4 documentation.
    # NOTE(review): the quotes inside the pattern were typographic quotes in
    # the pasted text and are assumed to have been plain single quotes.
    return bsObj.find("div", {"class": "copy-right_text"}).findAll("span", re.compile("^'手机网站：'(.)*$"))
links = getLinks("/indexv3.html")
print(links)
上面的代码一是用于爬“ http://www.ccb.com/cn/home/indexv3.html ” 这个网址底部“手机网站”栏位显示的网址，打印 BeautifulSoup(html,‘lxml’) 返回的对象时发现“手机网站”这几个字并未出现。
通过查看网站源码，发现程序未显示的内容都位于网站接近末尾处的这句注释之后：“<!--底栏下面不可跳转部分-->”，这句注释之后的网站源码无法被 BeautifulSoup.find 搜索到，
请问这是为什么呢？要如何才能查到呢？感谢！
2、代码二的问题
from urllib.request import urlopen
from bs4 import BeautifulSoup

import re
def getLinks(articleUrl):
    """Fetch a CCB home page and return anchors from its language selector.

    The page URL is ``http://www.ccb.com/cn/home`` + ``articleUrl``. The
    ``div`` with class ``Language_select`` is printed for debugging and then
    searched for ``a`` tags whose ``href`` matches the compiled pattern.

    Raises AttributeError if that ``div`` is not present in the parsed tree,
    because ``find`` then returns None.
    """
    html = urlopen("http://www.ccb.com/cn/home" + articleUrl)
    bsObj = BeautifulSoup(html, 'lxml')
    print('bsObj.find=', bsObj.find("div", {"class": "Language_select"}))
    # NOTE(review): the pattern literal was garbled in the pasted text and is
    # reconstructed here. The ``href=`` filter is presumably matched against
    # the attribute value only, so a pattern that also contains the closing
    # quote and the link text (">繁体) would never match -- confirm against
    # the bs4 documentation.
    return bsObj.find("div", {"class": "Language_select"}).findAll("a", href=re.compile('"(http://.*)">繁体'))
# Run the scraper against the home page and show what it extracted.
links = getLinks("/indexv3.html")
print('links=', links)
上面的代码二是用于爬“ http://www.ccb.com/cn/home/indexv3.html ” 这个网址底部繁体网站的域名，程序输出如下：
bsObj.find=
http://fjt.ccb.com ">繁体 /http://en.ccb.com/en/home/indexv3.html ">ENGLISH
links= []
输出中的 http://fjt.ccb.com 就是希望提取的结果，但是为何最终打印 links 却没有内容呢？恳请指点！感谢！

caililin 1楼

帖子回复：

要高效爬取网站，结合BeautifulSoup和正则表达式是常见做法。这里给你一个完整的实战示例，包含常见问题的解决方案：

import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time

class WebCrawler:
    """Helper that fetches pages with requests and mines them with bs4 and regexes.

    A single ``requests.Session`` (with a browser-like User-Agent) is created
    in ``__init__`` and reused by ``fetch_page``. The remaining methods are
    pure helpers that do not touch instance state.
    """

    # Page-like links: path ends in .html/.htm/.php, optionally followed by a
    # query string or fragment (e.g. "view.php?id=3").
    _PAGE_LINK_RE = re.compile(r'\.(?:html|htm|php)(?:[?#].*)?$')
    # CSS classes that mark an article container.
    _ARTICLE_CLASS_RE = re.compile(r'article|post')
    # The TLD class is [A-Za-z]; a "|" inside a character class is a literal.
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    # 11-digit mainland-China mobile number, not embedded in a longer digit run.
    _PHONE_RE = re.compile(r'(?<!\d)1[3-9]\d{9}(?!\d)')
    _WHITESPACE_RE = re.compile(r'\s+')
    _CONTROL_CHARS_RE = re.compile(r'[\x00-\x1f\x7f-\x9f]')
    _SPACE_RUN_RE = re.compile(r' {2,}')

    def __init__(self):
        """Create the shared HTTP session and set its User-Agent header."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def fetch_page(self, url, retries=3):
        """Return the decoded body of ``url``, retrying on request errors.

        Args:
            url: Address to GET.
            retries: Maximum number of attempts.

        Returns:
            The response text, or None when ``retries`` is 0 (no attempt made).

        Raises:
            requests.RequestException: re-raised from the final failed attempt.
        """
        for attempt in range(retries):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
            except requests.RequestException:
                if attempt == retries - 1:
                    raise
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
            else:
                # Without a declared charset requests falls back to ISO-8859-1
                # for text responses, which garbles e.g. Chinese pages; use the
                # encoding detected from the body instead.
                content_type = response.headers.get('Content-Type', '')
                if 'charset' not in content_type.lower():
                    response.encoding = response.apparent_encoding
                return response.text
        return None

    def parse_with_bs4(self, html):
        """Parse ``html`` and collect page links and article titles.

        Returns:
            A dict with:
              'links'    - href values that look like .html/.htm/.php pages,
              'articles' - stripped text of the first h1 (else h2) inside each
                           div whose class matches "article" or "post",
              'soup'     - the BeautifulSoup object, kept for further queries.
        """
        soup = BeautifulSoup(html, 'html.parser')

        links = [
            a_tag['href']
            for a_tag in soup.find_all('a', href=True)
            if self._PAGE_LINK_RE.search(a_tag['href'])
        ]

        articles = []
        for article in soup.find_all('div', class_=self._ARTICLE_CLASS_RE):
            title = article.find('h1') or article.find('h2')
            if title:
                articles.append(title.get_text(strip=True))

        return {
            'links': links,
            'articles': articles,
            'soup': soup
        }

    def extract_with_regex(self, text, pattern=None):
        """Extract emails, phone numbers and custom matches from ``text``.

        Args:
            text: String to search.
            pattern: Optional extra regex; when falsy no custom search is run.

        Returns:
            A dict with 'emails', 'phones' and 'custom' lists. 'custom' holds
            whatever ``re.findall(pattern, text)`` yields.
        """
        return {
            'emails': self._EMAIL_RE.findall(text),
            'phones': self._PHONE_RE.findall(text),
            'custom': re.findall(pattern, text) if pattern else []
        }

    def resolve_relative_urls(self, base_url, relative_urls):
        """Return each URL in ``relative_urls`` joined against ``base_url``."""
        return [urljoin(base_url, url) for url in relative_urls]

    def clean_text(self, text):
        """Normalize whitespace and drop control characters from ``text``.

        Returns an empty string for empty or None input.
        """
        if not text:
            return ''
        # Collapse whitespace first so newlines/tabs become separators rather
        # than being deleted by the control-character pass below.
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._CONTROL_CHARS_RE.sub('', text)
        # Removing a control character that sat between two spaces leaves a
        # double space behind; collapse those again.
        text = self._SPACE_RUN_RE.sub(' ', text)
        return text.strip()

# 使用示例
def main():
    """Run the crawler end to end against a sample URL and print a summary.

    Steps: fetch the page, parse it with BeautifulSoup, run the regex
    extractors over the page text, resolve the first five links to absolute
    URLs, and clean the first article title. Any exception is reported on
    stdout instead of propagating, since this is the script's top level.
    """
    crawler = WebCrawler()

    try:
        # 1. Fetch the page.
        url = "https://example.com"
        html = crawler.fetch_page(url)
        if not html:
            return

        # 2. Parse with BeautifulSoup.
        parsed_data = crawler.parse_with_bs4(html)
        print(f"找到链接数: {len(parsed_data['links'])}")
        print(f"找到文章数: {len(parsed_data['articles'])}")

        # 3. Run the regex extractors over the visible text (custom: dates).
        text_content = parsed_data['soup'].get_text()
        regex_data = crawler.extract_with_regex(text_content, r'\d{4}-\d{2}-\d{2}')
        print("正则提取结果:", regex_data)

        # 4. Resolve relative URLs.
        full_urls = crawler.resolve_relative_urls(url, parsed_data['links'][:5])
        print("前5个完整URL:", full_urls)

        # 5. Clean a sample of extracted text.
        sample_text = parsed_data['articles'][0] if parsed_data['articles'] else ""
        cleaned = crawler.clean_text(sample_text)
        print("清理后的文本:", cleaned)

    except Exception as e:
        print(f"爬取失败: {e}")

if __name__ == "__main__":
    main()

关键点说明：

BeautifulSoup负责结构解析：用find_all()和CSS选择器定位元素，比正则更适合处理HTML结构
正则处理模式化数据：用正则提取邮箱、电话、日期等有固定模式的内容
常见问题解决：
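- 编码问题：向BeautifulSoup传入字节内容（如response.content）时它会自动检测编码；若传入response.text，则需先确认requests的response.encoding正确（必要时设为response.apparent_encoding）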
- 相对URL：用urljoin转换
- 请求失败：加入重试机制
- 反爬：设置User-Agent
- 文本清理：移除多余空白和特殊字符

总结建议：用BeautifulSoup处理结构，正则处理模式化数据。

eggper 2楼

自己顶一下