How do I handle a site that re-issues the sessionid via Set-Cookie on every page when paginating in a Python scraper?

#!/usr/bin/env python3
import requests
import os
import execjs, json, time

class qimingpian(object):

    def __init__(self):
        self.s = requests.session()
        self.js_file = os.path.join(os.getcwd(), "js_decrypt.js")

    def get_content(self):
        cookies = {}
        cookies_str = 'Hm_lvt_d1cdd45a1d449d32c7b4dbab4915de60=1532161260; Hm_lpvt_d1cdd45a1d449d32c7b4dbab4915de60=1532161260; gr_user_id=0ac1c623-6d25-4c89-b0eb-beaccb4ed35c; time_token=1532254367533; unionid=ETXncbCRyisjw/hr0zeTaonhpvkz/81ntwbBWAKYE4wdmhbtHCwxkjwb+0gjVdRzeJWqqIs6kiQsM8IbOYgM5A==; Hm_lvt_1e712c5331439bcf163b46f3d208f00b=1532161262,1532252857,1532254027,1532254368; Hm_lpvt_1e712c5331439bcf163b46f3d208f00b=1532254368; userinfo={%22nickname%22:%22Wing%E3%80%82%22%2C%22headimgurl%22:%22http://thirdwx.qlogo.cn/mmopen/vi_32/Q0j4TwGTfTJzmBzIeVHkjp6IVAl3uWAgB4FYIC96KygBjBvY2qAHycK1OctdAcODsWMh8zJia3j9GCBOzR5Truw/132%22%2C%22coin%22:%2250%22%2C%22applySubmit%22:%220%22%2C%22team_flag%22:%220%22%2C%22team_uuid%22:%22%22%2C%22vip_out_date%22:%22%22%2C%22usernum%22:%22226256331%22%2C%22team_enterprise%22:%220%22%2C%22enterprise_coin%22:%220%22%2C%22is_admin%22:%220%22%2C%22is_manager%22:%220%22%2C%22first_shenqing%22:%220%22%2C%22phone%22:%2213161346498%22%2C%22apply_phone%22:%2213161346498%22%2C%22scope%22:%22qmp%22%2C%22apply_state%22:3%2C%22liyou%22:%22%22%2C%22is_certify%22:1%2C%22ip%22:%22106.37.197.194%22%2C%22person_role%22:%22%22%2C%22claim_type%22:0%2C%22expireinfo%22:false%2C%22inneruser%22:false%2C%22apply_pro_state%22:3%2C%22person_id%22:%22%22}'
        for line in cookies_str.split(';'):  # split the copied Cookie header on ';'
            # maxsplit=1: cut each pair into exactly two pieces at the first '='
            name, value = line.strip().split('=', 1)
            cookies[name] = value  # store the pair in the cookies dict
        url = 'http://pdf.api.qimingpian.com/t/getFileByPage1'
        # note: don't hardcode Content-Length; requests computes it per request
        headers = {
            "Referer": "http://vip.qimingpian.com/",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
            "Host": "pdf.api.qimingpian.com",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "en-US,en;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "http://vip.qimingpian.com",
        }
        for i in range(1, 101):
            form_data = {
                "page": str(i),  # bug fix: the original sent the literal string "i" on every page
                "num": "40",
                "w": "",
                "ptype": "qmp_pc",
                "version": "2.0",
                "unionid": "ETXncbCRyisjw/hr0zeTaonhpvkz/81ntwbBWAKYE4wdmhbtHCwxkjwb+0gjVdRzeJWqqIs6kiQsM8IbOYgM5A==",
                "jtype": "vip",
                "time_token": "1532254367533",
            }
            response = self.s.post(url=url, data=form_data, headers=headers, cookies=cookies)
            print(self.s.cookies)
            print(response.headers)
            print(response.text)
            json_data = json.loads(response.text)
            with open(self.js_file, 'r') as f:
                _js = f.read()
            data = execjs.compile(_js).call('n', json_data['data1'])
            print(data)
            for item in data['items']:
                name = item['name']
                report_source = item['report_source']
                update_time = item['update_time']
                # renamed from `url`: the original reassigned `url` here, so the
                # next page's POST went to the report's own URL instead of the API
                report_url = item['url']
                print(name)
                print(report_source)
                print(update_time)
                print(report_url)
                print('\n')

if __name__ == '__main__':
    qimingpian().get_content()

The code is short, and the JS encryption is already cracked. The problem now: this site re-issues the sessionid via Set-Cookie on every page request. I'm using a requests session, so the cookie should be updated dynamically, so why does it still fail? As it stands I can only fetch the first page; page 2 returns the following error:

<RequestsCookieJar[<Cookie PHPSESSID=3khddv90nbg11lu1ia8eld8ol3 for pdf.api.qimingpian.com/>]>
{'Content-Type': 'text/html', 'Connection': 'keep-alive', 'Content-Length': '254', 'Via': 'kunlun6.cn24[,0]', 'Timing-Allow-Origin': '*', 'Date': 'Sun, 22 Jul 2018 16:49:17 GMT', 'EagleId': '7ae1224615322781579372751e', 'Server': 'Tengine', 'X-Tengine-Error': 'non-existent domain'}

<html>
<head><title>403 Forbidden</title></head>
<body bgcolor="white">
403 Forbidden
You don't have permission to access the URL on this server.
Powered by Tengine
</body>

The data is at http://vip.qimingpian.com/#/finos/investment/ireport, under 创投数据 → 报告库 (the report library). I can't see where I've gone wrong.
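One thing worth checking (a hedged sketch, not a confirmed fix): passing cookies=cookies on every post() re-sends the static dict copied from the browser on each request. Seeding the Session's jar once and then letting the session manage cookies on its own ensures that a rotated PHPSESSID from Set-Cookie is what goes out on the next request. seed_session and the cookie values are illustrative:

import requests

def seed_session(cookie_str):
    """Build a Session whose jar is seeded once from a copied Cookie header."""
    s = requests.Session()
    for pair in cookie_str.split(';'):
        name, value = pair.strip().split('=', 1)
        s.cookies.set(name, value)  # lives in the jar, not in per-request kwargs
    return s

s = seed_session('PHPSESSID=abc123; unionid=xyz')  # illustrative values
# From here on, do NOT pass cookies= to s.post()/s.get(): the jar sends the
# stored cookies and absorbs every Set-Cookie, so a server-rotated PHPSESSID
# is used automatically on the following request.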



2 Replies

A sessionid that gets reset on every page turn is a classic anti-scraping mechanism. The core idea is to keep the conversation consistent: let requests.Session() manage the cookies automatically.

import requests
from time import sleep

def crawl_paginated_data(base_url, total_pages):
    """
    Crawl paginated data, handling the sessionid-rotation problem.

    Args:
        base_url: base URL containing a page-number placeholder
        total_pages: total number of pages to crawl
    """
    # Key point: create a Session object so cookies are handled automatically
    session = requests.Session()

    # Set common request headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
    }
    session.headers.update(headers)

    all_data = []

    for page in range(1, total_pages + 1):
        try:
            # Build the URL for this page
            url = base_url.format(page=page)

            # Send every request through the same session
            response = session.get(url, timeout=10)
            response.raise_for_status()  # check for HTTP errors

            # Process the response payload
            data = process_response(response)
            all_data.extend(data)

            print(f"Fetched page {page}, {len(data)} records")

            # Pause between requests to avoid hammering the server
            sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Request for page {page} failed: {e}")
            # Retry logic could be added here (see the sketch below)
            continue

    return all_data

def process_response(response):
    """
    Extract the data you need from the response.
    Implement according to the actual page structure.
    """
    # Example: return an empty list; in practice, parse HTML/JSON here
    return []

# Usage example
if __name__ == "__main__":
    # Example URL; replace with the real address
    base_url = "https://example.com/data?page={page}"

    data = crawl_paginated_data(base_url, total_pages=5)
    print(f"Total records crawled: {len(data)}")

Key points:

  1. Session object: requests.Session() persists cookies across requests and handles Set-Cookie automatically
  2. Request headers: a complete set of headers lowers the odds of being flagged as a bot
  3. Exception handling: catch network errors so the program stays robust (a retry sketch follows this list)
  4. Request spacing: sleep between requests to avoid tripping rate limits

If the site has heavier anti-scraping measures (e.g. token validation), the request parameters may need extra handling.
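On point 3, one way to flesh out the retry logic mentioned in the code comment is requests' own HTTPAdapter combined with urllib3's Retry. A minimal sketch; the retry count, backoff factor, and status list are illustrative, not tuned for this site:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(
    total=3,                           # illustrative: at most 3 retries per request
    backoff_factor=1,                  # exponential backoff between attempts
    status_forcelist=[500, 502, 503],  # also retry on these HTTP statuses
    allowed_methods=["GET", "POST"],   # urllib3 >= 1.26; older versions call this method_whitelist
)
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))

# Requests made through this session are now retried transparently:
# response = session.get(url, timeout=10)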

In short: use a Session object to maintain the conversation.


Access-Control-Allow-Origin: *
Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
Connection: keep-alive
Content-Type: application/json;charset=UTF-8
Date: Sun, 22 Jul 2018 17:24:14 GMT
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Pragma: no-cache
Server: nginx/1.4.4
Set-Cookie: PHPSESSID=g9o7ulf9399oldina7h8jvkef3; path=/
Transfer-Encoding: chunked
X-Powered-By: PHP/5.5.7
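
These headers show the server issuing a fresh PHPSESSID on every response. A quick hedged check that a bare Session really does absorb the rotation (the form data is trimmed to the page field for illustration; the real request needs the full payload and headers from the question):

import requests

s = requests.Session()
url = 'http://pdf.api.qimingpian.com/t/getFileByPage1'
for page in range(1, 4):
    r = s.post(url, data={'page': str(page)})  # trimmed, illustrative payload
    # If the response carried Set-Cookie: PHPSESSID=..., the session's jar
    # now holds the new value and will send it on the next iteration.
    print(page, s.cookies.get('PHPSESSID'), r.headers.get('Set-Cookie'))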
