How do I fix this bug I hit while extracting tag content in Python?

# -*- coding: utf-8 -*-

import requests
import re
import time
from urllib.parse import quote
from save import get_Mysql

from lxml import etree
import json


class mySpider(object):
    def __init__(self, dbname, mykey, mycity):
        self.dbname = dbname
        self.key = mykey
        self.city = mycity
        self.start_url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&sm=0&p=1".format(quote(self.key), quote(self.city))
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8,mt;q=0.6",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "sou.zhaopin.com",
            "Referer": "http://www.zhaopin.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36"
        }
        # self.mysql = get_Mysql(self.dbname, self.key, self.city)
        # self.mysql.create_table()

    # Recursively fetch page data
    def get_one_html(self, url):
        data = {}
        html = requests.get(url, headers=self.headers)
        infos = etree.HTML(html.text)
        selectors = infos.xpath('//*[@id="newlist_list_content_table"]/table')
        for i in range(len(selectors)):
            selector = selectors[i]
            # Monthly salary
            data["job_gz"] = selector.xpath('//tr[1]/td[4]/text()')
            print(data["job_gz"])

            # Posting link
            data["job_link"] = re.findall(r'http://jobs.zhaopin.com/\d+.htm', html.text)

            # Job title
            data['job_name'] = re.findall(r'http://jobs.zhaopin.com/\d+.htm" target="_blank">(.*?)</a>', html.text, re.S)

            # Company name
            data["gsmc"] = [gsmc for gsmc in selector.xpath('//td[@class="gsmc"]/a[1]/text()')]

            # Company link
            data["gs_link"] = selector.xpath('//td[@class="gsmc"]/a[1]/@href')

            # Work location
            data["job_dd"] = selector.xpath('//td[@class="gzdd"]/text()')

            # Education / experience requirement
            data["xlyq"] = selector.xpath('//li[@class="newlist_deatil_two"]/span[4]/text()')

            # Company size
            data["gsgm"] = selector.xpath('//li[@class="newlist_deatil_two"]/span[3]/text()')

            # Company type
            data["gsxz"] = selector.xpath('//li[@class="newlist_deatil_two"]/span[2]/text()')

    def main(self):
        self.get_one_html(self.start_url)
        # try:
        #     self.get_one_html(self.start_url)
        # except Exception as e:
        #     print(e)
        # finally:
        #     self.mysql.close_table()


if __name__ == '__main__':
    start = time.time()
    s = mySpider('51job', '北京', 'java')
    s.main()
    end = time.time()
    print("Elapsed: {:.2f} s".format(end - start))

---------------------------------------------- divider ------------------------------------------------------
The code above is the full source. The bug shows up here:

selectors = infos.xpath('//*[@id="newlist_list_content_table"]/table')
for i in range(len(selectors)):
    selector = selectors[i]
    # Monthly salary
    data["job_gz"] = selector.xpath('//tr[1]/td[4]/text()')
    print(data["job_gz"])

What I want is to extract the text under each tag one loop iteration at a time, but a single pass through the loop already pulls out all of them. The data I get is:
{'job_gz': ['10001-15000', '10000-20000', '15000-30000', '10000-18000', '面议', '10001-15000', '8001-10000', '6001-8000', '12000-20000', '8001-10000', '10001-15000', '10001-15000', '4001-6000', '15001-20000', '6001-8000', '面议', '6001-8000', '6001-8000', '15001-20000', '8001-10000', '10001-15000', '面议', '6001-8000', '8001-10000', '6001-8000', '15000-25000', '15000-20000', '12000-20000', '15001-20000', '6001-8000', '6001-8000', '20001-30000', '6001-8000', '10001-15000', '6001-8000', '6001-8000', '6001-8000', '8001-10000', '6001-8000', '15000-25000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '8001-10000', '2500-5000', '6001-8000']}
The value for the key is a list, but what I want is one key-value pair per posting, matched one-to-one. I can't tell where the logic went wrong. Please help me debug.


1 Reply

Problem analysis: common problems when extracting HTML/XML tag content in Python include:

  1. Encoding issues producing garbled text
  2. Inaccurate tag-nesting or attribute matching
  3. Dynamically loaded content that never gets handled
  4. Sloppy regular-expression matching

Your bug, though, is a fifth classic: an XPath that starts with // is absolute, so even when you call selector.xpath(...) on a single <table> element, lxml evaluates the expression against the whole document. Every iteration therefore returns the salaries of all rows. Making the inner path relative fixes it, as the sketch right below shows.
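
A minimal sketch of that fix on the lxml side, keeping the question's structure (the container id and the gsmc/gzdd class names come from the asker's code; the rest of the page layout is assumed):

from lxml import etree

def parse_jobs(html_text):
    """Return one dict per job posting instead of one dict of lists."""
    root = etree.HTML(html_text)
    records = []
    for table in root.xpath('//*[@id="newlist_list_content_table"]/table'):
        record = {}
        # The leading "." makes the path relative to the current <table>;
        # a bare "//tr[1]/td[4]" is absolute and scans the whole document
        # on every iteration, which is why one pass returned every salary.
        salary = table.xpath('.//tr[1]/td[4]/text()')
        record['job_gz'] = salary[0].strip() if salary else ''
        company = table.xpath('.//td[@class="gsmc"]/a[1]/text()')
        record['gsmc'] = company[0].strip() if company else ''
        place = table.xpath('.//td[@class="gzdd"]/text()')
        record['job_dd'] = place[0].strip() if place else ''
        records.append(record)
    return records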

Solution:

# Recommended combination: BeautifulSoup + requests
import requests
from bs4 import BeautifulSoup
import re

def extract_tag_content(html, tag_name, attrs=None):
    """
    Safely extract tag content

    Args:
        html: an HTML string, or a URL to fetch
        tag_name: the tag name
        attrs: attribute dict, e.g. {'class': 'content'}
    """
    # Handle encoding issues
    if isinstance(html, str) and html.startswith('http'):
        response = requests.get(html)
        response.encoding = response.apparent_encoding  # auto-detect the encoding
        soup = BeautifulSoup(response.text, 'html.parser')
    else:
        soup = BeautifulSoup(html, 'html.parser')

    # Extract the matching elements
    if attrs:
        elements = soup.find_all(tag_name, attrs=attrs)
    else:
        elements = soup.find_all(tag_name)

    results = []
    for elem in elements:
        # Get the text and clean it up
        text = elem.get_text(strip=True, separator=' ')
        if text:
            results.append(text)

    return results

# Example usage
html_content = """
<div class="post">
    <h1>Title</h1>
    <p class="content">This is the body text</p>
    <p>Another paragraph</p>
</div>
"""

# Extract a specific tag
print(extract_tag_content(html_content, 'p', {'class': 'content'}))
# Output: ['This is the body text']

# Handling nested tags
def extract_nested_content(html, selector):
    """Handle complex structures with a CSS selector"""
    soup = BeautifulSoup(html, 'html.parser')
    return [elem.get_text(strip=True) for elem in soup.select(selector)]

# Example: extract all p tags under div.post
print(extract_nested_content(html_content, 'div.post p'))
# Output: ['This is the body text', 'Another paragraph']
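
Applied to the question's page, the same row-by-row idea with BeautifulSoup might look like this (a sketch; again, only the container id and the gsmc/gzdd classes are taken from the asker's code, everything else about the layout is assumed):

def parse_zhaopin(html):
    soup = BeautifulSoup(html, 'html.parser')
    records = []
    container = soup.find(id='newlist_list_content_table')
    if container is None:
        return records
    # One <table> per posting, so build one dict per <table>
    for table in container.find_all('table'):
        record = {}
        company = table.find('td', class_='gsmc')
        record['gsmc'] = company.get_text(strip=True) if company else ''
        place = table.find('td', class_='gzdd')
        record['job_dd'] = place.get_text(strip=True) if place else ''
        records.append(record)
    return records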

Key points:

  • Avoid parsing HTML with regular expressions; use a dedicated parsing library
  • Handle encoding explicitly, especially on Chinese-language sites
  • Use CSS selectors for complex nested structures
  • Dynamically loaded content may require Selenium (see the sketch below)
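
A minimal sketch of the Selenium route, assuming Chrome and a working Selenium install (only needed when the data is absent from the initial HTML response):

from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
try:
    driver.get('http://sou.zhaopin.com/')
    # page_source is the DOM after JavaScript has run
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    print(soup.title.get_text(strip=True))
finally:
    driver.quit()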

One-sentence advice: use BeautifulSoup instead of regular expressions and get encoding and selectors right, and that solves most of these problems; for your specific bug, the minimal fix is simply making the inner XPath relative with a leading dot.
