Python爬取知乎时如何追踪答案的更新？

想爬取知乎问题下的所有答案并维护，如果有新答案添加就更新数据库。但是个人动态里面是没有对于你关注问题下某个答案修改的通知的。所以在经过一段时间后数据库里面一些答案与最新的答案就会有一些差别。
我想的是每天对已经采集的问题查询是否有新答案，这个比较好解决；但如果某个答案被答主修改了，总不能把已经采集的答案和最新的答案逐个进行比对吧。

yuanlaile 1楼

定期全部重新爬

gougou168 2楼作者

对于追踪知乎答案的更新，核心思路是定期抓取并对比历史数据。这里提供一个基于 requests 调用知乎 JSON 接口的简单实现方案（代码中虽然导入了 BeautifulSoup，但实际并未用它解析页面）：首次抓取时记录每个答案的ID和内容哈希，后续通过对比ID列表和内容哈希来判断答案的新增、修改和删除。

import requests
import hashlib
import json
import time
from bs4 import BeautifulSoup

# Zhihu question ID
QUESTION_ID = '12345678'  # replace with the target question ID
# File that stores the history data
HISTORY_FILE = 'zhihu_history.json'

def fetch_answers(question_id, *, timeout=10):
    """Fetch every answer of a Zhihu question through the v4 answers API.

    Pages through the endpoint 20 answers at a time until the response
    reports ``paging.is_end``.

    Args:
        question_id: Question identifier interpolated into the API URL.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts, one per answer, with the keys ``id`` (exactly as
        returned by the API), ``hash`` (MD5 hex digest of the full answer
        content) and ``content`` (the first 100 characters of the content).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        KeyError: If the response JSON lacks the expected fields.
    """
    url = f'https://www.zhihu.com/api/v4/questions/{question_id}/answers'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    limit = 20
    offset = 0

    answers = []
    while True:
        params = {
            'include': 'data[*].content',
            'limit': limit,
            'offset': offset,
        }
        # Without a timeout a stalled connection would block the poller forever.
        response = requests.get(
            url, headers=headers, params=params, timeout=timeout
        )
        # Fail with a clear HTTP error instead of a KeyError further down.
        response.raise_for_status()
        data = response.json()

        items = data['data']
        for item in items:
            content = item['content']
            answers.append({
                'id': item['id'],
                # The hash covers the full content and is what later runs compare.
                'hash': hashlib.md5(content.encode()).hexdigest(),
                'content': content[:100],  # keep only a 100-character preview
            })

        # Defensive stop: an empty page that is not flagged as the end would
        # otherwise make this loop run forever.
        if data['paging']['is_end'] or not items:
            break
        offset += limit

    return answers

def load_history():
    """Return the saved history mapping, or an empty dict if no file exists yet."""
    try:
        fh = open(HISTORY_FILE, 'r', encoding='utf-8')
    except FileNotFoundError:
        # First run: nothing has been recorded so far.
        return {}
    with fh:
        return json.load(fh)

def save_history(history):
    """Write the history mapping to HISTORY_FILE as indented UTF-8 JSON."""
    out = open(HISTORY_FILE, 'w', encoding='utf-8')
    with out:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        json.dump(history, out, indent=2, ensure_ascii=False)

def check_updates():
    """Compare the live answers of QUESTION_ID with the stored snapshot.

    Loads the history file, fetches the current answers, reports the
    differences, and then overwrites the stored snapshot for QUESTION_ID
    with the current ``{answer id: content hash}`` mapping.

    Returns:
        A list of human-readable messages, one per detected change: an
        answer that is new, one whose content hash changed, or one that is
        no longer present. The list is empty when nothing changed.
    """
    history = load_history()
    history_answers = history.get(QUESTION_ID, {})

    # JSON object keys are always strings, so IDs read back from the history
    # file are str. Normalise the freshly fetched IDs to str as well;
    # otherwise, if the API returns numeric IDs, every answer would look
    # both "new" and "deleted" on each run after the first.
    current_dict = {
        str(ans['id']): ans['hash'] for ans in fetch_answers(QUESTION_ID)
    }

    updates = []
    for answer_id, content_hash in current_dict.items():
        if answer_id not in history_answers:
            updates.append(f"新增答案: {answer_id}")
        elif history_answers[answer_id] != content_hash:
            updates.append(f"答案更新: {answer_id}")

    # Anything recorded earlier but missing from the fresh fetch was removed.
    updates.extend(
        f"答案删除: {old_id}"
        for old_id in history_answers
        if old_id not in current_dict
    )

    # Persist the new snapshot so the next run diffs against it.
    history[QUESTION_ID] = current_dict
    save_history(history)

    return updates

# Entry point: poll forever, printing a report after every pass.
if __name__ == '__main__':
    while True:
        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] 开始检查更新...")
        changes = check_updates()

        if not changes:
            print("无更新")
        else:
            print("检测到更新:")
            for change in changes:
                print(f"  - {change}")

        # Wait one hour before the next pass.
        time.sleep(3600)

关键点说明：