How can I implement a Baidu search-result URL crawler in Python that crawls a specified range of pages 1 to 10? (Right now it can only crawl one fixed page.)

#coding=utf-8
import urllib2
import urllib
import sys
import re
#from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
import time

#url = "href = \"http://www.baidu.com/link?url=bu4fsa-txw7aHhz0LEu-Ej8ON__uS6btmV_mo7nI2O0_qKtfc-3rJHSyXnYOINHSgDASX4R1V6GcjE2UBGFdjZ9ahmEbG2gsGGW6MVW7pQm\""
#print url

# Match Baidu's redirect links in the results page HTML
pattern = re.compile(r"href = \"(http://www.baidu.com/link\?url=.+?)\"")
#rehh = re.findall(pattern, url)
#for i in rehh:
#    print i

with open('data.txt', 'a+') as f:
    key_word = []
    with open('key_word.txt', 'r') as kf:
        for line in kf:
            # pn=0 -> only the first page of results is ever requested
            request = urllib2.Request('http://www.baidu.com/s?wd=' + line.decode('gbk').encode('utf-8') + '&pn=0')
            response = urllib2.urlopen(request)

            #print response.read()
            #pattern = re.compile(r"href = \"(.+?)\"")
            rehh = re.findall(pattern, response.read())
            for i in rehh:
                request2 = urllib2.Request(i)
                response2 = urllib2.urlopen(request2)

                # geturl() gives the final URL after Baidu's redirect
                print response2.geturl()
                f.write(response2.geturl())
                f.write('\n')

# redundant: the with blocks above already close both files
f.close()
kf.close()



4 replies

The formatting is painful to read.


import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import re

class BaiduSearchCrawler:
    def __init__(self, user_agent=None):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': user_agent or 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.base_url = "https://www.baidu.com/s"
        
    def search(self, keyword, pages=1, start_page=1, delay=1):
        """
        搜索关键词并获取多页结果
        
        Args:
            keyword: 搜索关键词
            pages: 要爬取的页数(默认1页)
            start_page: 起始页码(默认从第1页开始)
            delay: 请求延迟(秒),避免被封
        
        Returns:
            list: 包含所有搜索结果URL的列表
        """
        all_results = []
        
        for page in range(start_page, start_page + pages):
            try:
                # Compute Baidu's pagination offset
                pn = (page - 1) * 10
                
                params = {
                    'wd': keyword,
                    'pn': pn,
                    'rn': 10,  # results per page
                    'ie': 'utf-8'
                }
                
                # Send the search request
                response = self.session.get(
                    self.base_url, 
                    params=params, 
                    headers=self.headers,
                    timeout=10
                )
                response.encoding = 'utf-8'
                
                if response.status_code == 200:
                    page_results = self._parse_results(response.text)
                    all_results.extend(page_results)
                    print(f"Page {page} done, {len(page_results)} results collected")
                    
                    # Pause between pages to avoid hammering the server
                    if page < start_page + pages - 1:
                        time.sleep(delay)
                else:
                    print(f"Page {page} request failed, status code: {response.status_code}")
                    
            except Exception as e:
                print(f"Error crawling page {page}: {str(e)}")
                continue
        
        return all_results
    
    def _parse_results(self, html_content):
        """解析HTML内容,提取搜索结果URL"""
        soup = BeautifulSoup(html_content, 'html.parser')
        results = []
        
        # Main containers for Baidu search results
        result_divs = soup.find_all('div', class_='result')
        
        # If the classic result divs are missing, try the newer container class
        if not result_divs:
            result_divs = soup.find_all('div', class_='c-container')
        
        for div in result_divs:
            # Find the h3 tag that holds the result link
            h3_tag = div.find('h3')
            if h3_tag:
                link_tag = h3_tag.find('a')
                if link_tag and link_tag.get('href'):
                    # Resolve Baidu's redirect link
                    real_url = self._get_real_url(link_tag['href'])
                    if real_url:
                        results.append(real_url)
        
        return results
    
    def _get_real_url(self, baidu_url):
        """解析百度跳转链接获取真实URL"""
        try:
            # Baidu redirect link: follow redirects to find the real target
            if baidu_url.startswith('http://www.baidu.com/link?url='):
                response = self.session.head(baidu_url, allow_redirects=True, timeout=5)
                return response.url
            # Already a direct link: return as-is
            elif baidu_url.startswith('http'):
                return baidu_url
        except requests.RequestException:
            # network error while resolving the link: give up on it
            pass
        return None

# Usage example
if __name__ == "__main__":
    # Create a crawler instance
    crawler = BaiduSearchCrawler()
    
    # Search a keyword and crawl the first 3 pages of results
    keyword = "Python programming"
    results = crawler.search(keyword, pages=3, start_page=1, delay=2)
    
    # Print the results
    print(f"\nCollected {len(results)} search results in total:")
    for i, url in enumerate(results, 1):
        print(f"{i}. {url}")
    
    # Save to a file
    with open('search_results.txt', 'w', encoding='utf-8') as f:
        for url in results:
            f.write(url + '\n')
    print("\nResults saved to search_results.txt")

Key changes:

  1. Pagination parameter: Baidu uses pn to control paging, with 10 results per page, so page N corresponds to pn = (N-1) * 10 (see the quick sketch after this list)

  2. Page loop: the pages parameter controls how many pages to crawl and start_page sets the first page

  3. Delay: time.sleep(delay) between pages avoids being blocked for requesting too fast

  4. Error handling: each page has its own try/except, so one failed page does not abort the others

  5. URL resolution: Baidu's redirect links are followed to recover the real result URLs
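
A quick standalone check of the pn formula from point 1 (just a sketch; it only prints the request URLs that would be generated, and the keyword is an arbitrary placeholder):

from urllib.parse import urlencode

# Page N of 10-result pages starts at offset pn = (N - 1) * 10.
for page in range(1, 11):
    params = {'wd': 'example keyword', 'pn': (page - 1) * 10, 'ie': 'utf-8'}
    print(f"page {page}: https://www.baidu.com/s?{urlencode(params)}")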

Usage:

# Crawl pages 2 through 5 (4 pages in total)
crawler.search("keyword", pages=4, start_page=2)

# Crawl the first 10 pages
crawler.search("keyword", pages=10)

One-line advice: set the delay sensibly so you do not trigger the anti-crawling measures.
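
If a fixed delay still triggers blocking, one common variation (a sketch, separate from the class above) is to randomize the pause between pages:

import random
import time

def polite_sleep(low=1.0, high=3.0):
    """Sleep for a random interval so request timing looks less mechanical."""
    time.sleep(random.uniform(low, high))

# e.g. call polite_sleep() between pages instead of time.sleep(delay)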

pn=0; pn stands for page number.
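
For completeness, a minimal sketch that applies this directly to the script in the question (assuming Python 2 / urllib2 and the same key_word.txt and data.txt layout): wrap the fixed pn=0 request in a loop over pn = 0, 10, ..., 90 to cover pages 1 through 10.

#coding=utf-8
import re
import urllib
import urllib2

pattern = re.compile(r"href = \"(http://www.baidu.com/link\?url=.+?)\"")

with open('data.txt', 'a+') as f:
    with open('key_word.txt', 'r') as kf:
        for line in kf:
            word = urllib.quote(line.strip().decode('gbk').encode('utf-8'))
            for page in range(1, 11):                 # pages 1..10
                pn = (page - 1) * 10                  # Baidu page offset
                url = 'http://www.baidu.com/s?wd=' + word + '&pn=' + str(pn)
                html = urllib2.urlopen(urllib2.Request(url)).read()
                for link in re.findall(pattern, html):
                    # Follow Baidu's redirect link to get the real URL
                    real = urllib2.urlopen(urllib2.Request(link)).geturl()
                    print real
                    f.write(real + '\n')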
