Python中如何获取自定义标签的内容？

<txpdiv> 这种自定义的 html 标签，标签的 id 还是通过动态生成的，要怎么获取内容？
CSS selector 和 XPath 都要用到该节点的 id，而这个 id 每次刷新都不一样。
Python中如何获取自定义标签的内容？

import re
from html.parser import HTMLParser

# 方法1：使用正则表达式（适合简单场景）
def get_custom_tags_regex(html, tag_name):
    """Extract the inner content of every ``<tag_name>`` element with a regex.

    The opening tag may carry attributes (for example a dynamically
    generated ``id``); elements are selected by tag name only, and the name
    is matched case-insensitively.

    Limitations: regular expressions do not really parse HTML. Elements of
    the same name nested inside each other, or attribute values containing
    ``>``, are not handled correctly; use an HTML parser for such input.

    Args:
        html: The HTML text to search.
        tag_name: Name of the tag whose content should be extracted.

    Returns:
        A list with the raw inner text of each matching element, in document
        order. Markup nested inside an element is returned as-is.
    """
    # Escape the name so characters such as "." or "-" are matched literally.
    name = re.escape(tag_name)
    # After the name allow either ">" directly or whitespace followed by
    # attributes; requiring whitespace keeps "<custom>" from matching
    # "<customx>".
    pattern = rf'<{name}(?:\s[^>]*)?>(.*?)</{name}\s*>'
    return re.findall(pattern, html, re.DOTALL | re.IGNORECASE)

# 方法2：使用HTMLParser（标准库，更健壮）
class CustomTagParser(HTMLParser):
    """Collect the text found inside every occurrence of one tag.

    Elements are selected by tag name only; their attributes (such as an
    ``id``) are ignored. Text from child elements is included, the child
    markup itself is not. Elements that contain no text produce no entry.

    Usage: create the parser with the tag name, call ``feed(html)``, then
    read ``results`` (or call ``get_results()``).
    """

    def __init__(self, target_tag):
        """Create a parser that captures text inside ``target_tag`` elements.

        Args:
            target_tag: Name of the tag to capture; letter case is ignored.
        """
        super().__init__()
        self.target_tag = target_tag
        # HTMLParser passes tag names to the handlers in lower case, so the
        # comparison must use a lower-cased copy of the requested name.
        self._target_name = target_tag.lower()
        # Number of currently open target elements; lets a target element
        # nested inside another one be captured as part of the outer element.
        self._nesting_depth = 0
        self.in_target_tag = False
        self.current_data = []
        self.results = []

    def handle_starttag(self, tag, attrs):
        """Start (or deepen) capturing when a target element opens."""
        if tag == self._target_name:
            self._nesting_depth += 1
            self.in_target_tag = True

    def handle_endtag(self, tag):
        """Finish capturing when the outermost target element closes."""
        if tag != self._target_name or self._nesting_depth == 0:
            # Unrelated tag, or a stray closing tag with nothing open.
            return
        self._nesting_depth -= 1
        if self._nesting_depth:
            # Still inside an outer target element; keep accumulating.
            return
        self.in_target_tag = False
        if self.current_data:
            self.results.append(''.join(self.current_data))
            self.current_data = []

    def handle_data(self, data):
        """Accumulate text while inside a target element."""
        if self.in_target_tag:
            self.current_data.append(data)

    def get_results(self):
        """Return the list of captured texts, in document order."""
        return self.results

def get_custom_tags_htmlparser(html, tag_name):
    """Extract the text of every ``tag_name`` element using HTMLParser.

    Builds a ``CustomTagParser`` for ``tag_name``, feeds it ``html`` and
    returns the list of results the parser collected.
    """
    extractor = CustomTagParser(tag_name)
    extractor.feed(html)
    collected = extractor.get_results()
    return collected

# 方法3：使用BeautifulSoup（第三方库，最方便）
try:
    from bs4 import BeautifulSoup
except ImportError:
    # beautifulsoup4 is not installed: leave the name bound to None so that
    # callers can test for availability before using this method.
    get_custom_tags_bs4 = None
else:
    def get_custom_tags_bs4(html, tag_name):
        """Extract the text of every ``tag_name`` element using BeautifulSoup.

        Requires the third-party ``beautifulsoup4`` package. The document is
        parsed with the 'html.parser' backend and the result of
        ``get_text(strip=False)`` is returned for each matching element.
        """
        matches = BeautifulSoup(html, 'html.parser').find_all(tag_name)
        return [element.get_text(strip=False) for element in matches]

# 示例用法
if __name__ == "__main__":
    # Sample document: plain text, a self-closing child tag and a nested
    # element inside the custom tag.
    html_content = """
    <html>
        <body>
            <custom>这是第一个自定义标签的内容</custom>
            <div>普通div</div>
            <custom>这是第二个<br/>自定义标签的内容</custom>
            <custom><span>嵌套内容</span></custom>
        </body>
    </html>
    """

    tag_name = "custom"

    # (heading, extractor) pairs, run in order; the BeautifulSoup variant is
    # only included when the optional import succeeded.
    demos = [
        ("=== 方法1：正则表达式 ===", get_custom_tags_regex),
        ("\n=== 方法2：HTMLParser ===", get_custom_tags_htmlparser),
    ]
    if get_custom_tags_bs4:
        demos.append(("\n=== 方法3：BeautifulSoup ===", get_custom_tags_bs4))

    for heading, extract in demos:
        print(heading)
        for i, content in enumerate(extract(html_content, tag_name), 1):
            print(f"标签{i}: {content}")

三种方法对比：

正则表达式：简单快速，但处理复杂HTML（如嵌套标签、属性）容易出错
HTMLParser：Python标准库，健壮性好，能处理复杂情况
BeautifulSoup：需要安装第三方库，但API最友好，功能最强大

建议： 简单需求用正则，标准项目用HTMLParser，复杂解析用BeautifulSoup。对于 id 动态变化的自定义标签，HTMLParser 和 BeautifulSoup 都是按标签名查找，不依赖 id，因此 id 每次刷新不同也不影响提取。