How do I implement a Baidu search-result URL crawler in Python that can crawl a specified range of pages (1 to 10)? At the moment it only crawls one fixed page.
#coding=utf-8
import urllib2
import urllib
import sys
import re
#from selenium import webdriver
#from selenium.webdriver.common.keys import Keys
import time

#url = 'href = "http://www.baidu.com/link?url=bu4fsa-txw7aHhz0LEu-Ej8ON__uS6btmV_mo7nI2O0_qKtfc-3rJHSyXnYOINHSgDASX4R1V6GcjE2UBGFdjZ9ahmEbG2gsGGW6MVW7pQm"'
#print url
pattern = re.compile(r'href = "(http://www\.baidu\.com/link\?url=.+?)"')
#rehh = re.findall(pattern, url)
#for i in rehh:
#    print i

with open('data.txt', 'a+') as f:
    key_word = []
    with open('key_word.txt', 'r') as kf:
        for line in kf:
            # pn is hard-coded to 0 here, so only the first result page is fetched
            request = urllib2.Request('http://www.baidu.com/s?wd=' + line.decode('gbk').encode('utf-8') + '&pn=0')
            response = urllib2.urlopen(request)
            #print response.read()
            #pattern = re.compile(r"href = \"(.+?)\"")
            rehh = re.findall(pattern, response.read())
            for i in rehh:
                request2 = urllib2.Request(i)
                response2 = urllib2.urlopen(request2)
                print response2.geturl()
                f.write(response2.geturl())
                f.write('\n')
    f.close()
    kf.close()
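For context before the answers: the script above is stuck on one page because of the hard-coded &pn=0. A minimal sketch of the fix in the same Python 2 / urllib2 style, reusing the question's regex (whether Baidu's HTML still matches that pattern, plus the placeholder keyword and the 1-to-10 page range, are assumptions):

# Sketch only (Python 2): fetch pages 1..10 for one keyword by stepping the pn offset
import urllib
import urllib2
import re

keyword = 'python'  # placeholder keyword (assumption)
pattern = re.compile(r'href = "(http://www\.baidu\.com/link\?url=.+?)"')

for page in range(1, 11):      # pages 1 to 10
    pn = (page - 1) * 10       # Baidu's pn is a result offset: 0, 10, 20, ...
    url = 'http://www.baidu.com/s?wd=' + urllib.quote(keyword) + '&pn=' + str(pn)
    html = urllib2.urlopen(urllib2.Request(url)).read()
    for link in re.findall(pattern, html):
        print urllib2.urlopen(link).geturl()  # follow the Baidu redirect to get the real URL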
That code formatting is painful to read.
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time
import re

class BaiduSearchCrawler:
    def __init__(self, user_agent=None):
        self.session = requests.Session()
        self.headers = {
            'User-Agent': user_agent or 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.base_url = "https://www.baidu.com/s"

    def search(self, keyword, pages=1, start_page=1, delay=1):
        """
        Search a keyword and fetch multiple result pages.

        Args:
            keyword: search keyword
            pages: number of pages to crawl (default 1)
            start_page: first page to crawl (default 1)
            delay: delay between requests in seconds, to avoid getting blocked

        Returns:
            list: all result URLs collected across the requested pages
        """
        all_results = []
        for page in range(start_page, start_page + pages):
            try:
                # Baidu pagination parameter: 10 results per page
                pn = (page - 1) * 10
                params = {
                    'wd': keyword,
                    'pn': pn,
                    'rn': 10,  # results per page
                    'ie': 'utf-8'
                }
                # Send the request
                response = self.session.get(
                    self.base_url,
                    params=params,
                    headers=self.headers,
                    timeout=10
                )
                response.encoding = 'utf-8'
                if response.status_code == 200:
                    page_results = self._parse_results(response.text)
                    all_results.extend(page_results)
                    print(f"Page {page} crawled, {len(page_results)} results found")
                    # Delay between pages to avoid sending requests too quickly
                    if page < start_page + pages - 1:
                        time.sleep(delay)
                else:
                    print(f"Page {page} request failed, status code: {response.status_code}")
            except Exception as e:
                print(f"Error crawling page {page}: {str(e)}")
                continue
        return all_results

    def _parse_results(self, html_content):
        """Parse the HTML and extract the result URLs."""
        soup = BeautifulSoup(html_content, 'html.parser')
        results = []
        # Main container for Baidu search results
        result_divs = soup.find_all('div', class_='result')
        # Fall back to the newer container class if nothing was found
        if not result_divs:
            result_divs = soup.find_all('div', class_='c-container')
        for div in result_divs:
            # The result link lives inside an h3 tag
            h3_tag = div.find('h3')
            if h3_tag:
                link_tag = h3_tag.find('a')
                if link_tag and link_tag.get('href'):
                    # Resolve Baidu redirect links
                    real_url = self._get_real_url(link_tag['href'])
                    if real_url:
                        results.append(real_url)
        return results

    def _get_real_url(self, baidu_url):
        """Resolve a Baidu redirect link to the real target URL."""
        try:
            # Baidu redirect link: follow the redirect to find the real URL
            if baidu_url.startswith('http://www.baidu.com/link?url='):
                response = self.session.head(baidu_url, allow_redirects=True, timeout=5)
                return response.url
            # Direct links are returned as-is
            elif baidu_url.startswith('http'):
                return baidu_url
        except:
            pass
        return None

# Usage example
if __name__ == "__main__":
    # Create a crawler instance
    crawler = BaiduSearchCrawler()

    # Search a keyword and crawl the first 3 pages
    keyword = "Python编程"
    results = crawler.search(keyword, pages=3, start_page=1, delay=2)

    # Print the results
    print(f"\nGot {len(results)} search results in total:")
    for i, url in enumerate(results, 1):
        print(f"{i}. {url}")

    # Save to a file
    with open('search_results.txt', 'w', encoding='utf-8') as f:
        for url in results:
            f.write(url + '\n')
    print("\nResults saved to search_results.txt")
Key changes:
- Pagination parameter: Baidu uses the pn parameter to control paging, with 10 results per page, so page N corresponds to pn = (N - 1) * 10 (a quick check of this mapping follows this list).
- Multi-page loop: the pages parameter controls how many pages to crawl, and start_page sets the first page.
- Delay mechanism: time.sleep(delay) avoids getting blocked for requesting too fast.
- Error handling: each page has its own try/except, so one failed page does not stop the others.
- URL resolution: Baidu redirect links are resolved to the real target URLs.
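To make the pn mapping concrete, here is a tiny standalone check (plain Python 3, no network access; the 1-to-10 page range is just an example):

# page -> pn mapping used by the crawler above: 10 results per page, pn is the result offset
for page in range(1, 11):
    pn = (page - 1) * 10
    print(f"page {page} -> pn={pn}")  # page 1 -> pn=0, page 2 -> pn=10, ..., page 10 -> pn=90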
Usage:

# Crawl pages 2 through 5 (4 pages in total)
crawler.search("keyword", pages=4, start_page=2)

# Crawl the first 10 pages
crawler.search("keyword", pages=10)
One-line tip: set the delay parameter sensibly so you don't trigger Baidu's anti-crawling measures.
Or use selenium.
The pn=0 in your URL is the paging parameter (pn = page number); change it to request other pages.
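A minimal sketch of that suggestion, assuming selenium 4 with Chrome available locally; the "h3 a" selector, the example keyword, and the 1-to-10 page range are assumptions and may need adjusting to Baidu's current markup:

# Sketch: page through Baidu results with selenium by stepping pn (10 results per page)
from urllib.parse import quote
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()  # assumes a Chrome driver is available locally
try:
    for page in range(1, 11):  # pages 1 to 10
        pn = (page - 1) * 10   # pn is the result offset for that page
        driver.get(f"https://www.baidu.com/s?wd={quote('Python编程')}&pn={pn}")
        for a in driver.find_elements(By.CSS_SELECTOR, "h3 a"):  # assumed result-link selector
            print(a.get_attribute("href"))  # Baidu redirect URL; resolve separately if the real URL is needed
        time.sleep(1)  # small delay between pages
finally:
    driver.quit()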

