Python中如何抓取Gmail所有邮件的内容

Python中如何抓取Gmail所有邮件的内容

6 回复

看你发帖记录,你这样是怎么维护/开发爬虫系统的?


要抓取Gmail所有邮件内容,最可靠的方法是使用Gmail API。首先需要在Google Cloud Console创建项目并启用Gmail API,然后获取OAuth 2.0凭据。下面是完整的实现代码:

import os
import pickle
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# OAuth scopes requested during authorization; gmail.readonly is the only
# scope this script asks for.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

def authenticate_gmail():
    """Build a Gmail v1 service object, reusing cached credentials if possible.

    Credentials are cached in ``token.pickle`` in the current working
    directory. When no usable cached credentials exist, they are either
    refreshed (expired but with a refresh token) or obtained through the
    local-server OAuth flow driven by ``credentials.json`` and ``SCOPES``;
    the result is then written back to ``token.pickle``.

    Returns:
        The object returned by ``build('gmail', 'v1', credentials=...)``.
    """
    token_path = 'token.pickle'
    credentials = None

    # Reuse credentials stored by a previous run.
    # NOTE: pickle.load executes whatever the file contains, so this is only
    # safe while token.pickle is a file this script wrote itself.
    if os.path.exists(token_path):
        with open(token_path, 'rb') as cached:
            credentials = pickle.load(cached)

    # Fast path: the cached credentials are still valid.
    if credentials and credentials.valid:
        return build('gmail', 'v1', credentials=credentials)

    refreshable = credentials and credentials.expired and credentials.refresh_token
    if refreshable:
        credentials.refresh(Request())
    else:
        # No refreshable credentials: run the interactive OAuth flow.
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        credentials = oauth_flow.run_local_server(port=0)

    # Persist the new or refreshed credentials for the next run.
    with open(token_path, 'wb') as cached:
        pickle.dump(credentials, cached)

    return build('gmail', 'v1', credentials=credentials)

def get_all_emails(service):
    """Fetch every message in the mailbox and return the extracted summaries.

    The message list is paginated: each ``messages().list`` response carries
    at most one page of message ids plus an optional ``nextPageToken``. This
    function keeps requesting pages until no token is returned, so the result
    covers the whole mailbox instead of only the first page.

    Args:
        service: Gmail service object as returned by ``authenticate_gmail``.

    Returns:
        A list with one ``extract_email_content`` result per message. If any
        request or extraction fails, the error is printed and an empty list
        is returned (messages collected before the failure are discarded).
    """
    try:
        all_emails = []
        page_token = None

        while True:
            list_kwargs = {'userId': 'me'}
            if page_token:
                # Continue from where the previous page ended.
                list_kwargs['pageToken'] = page_token
            results = service.users().messages().list(**list_kwargs).execute()

            # The list response only carries ids; fetch each message in full.
            for msg in results.get('messages', []):
                message = service.users().messages().get(
                    userId='me', id=msg['id'], format='full').execute()
                all_emails.append(extract_email_content(message))

            page_token = results.get('nextPageToken')
            if not page_token:
                break

        return all_emails

    except Exception as e:
        print(f"获取邮件时出错: {e}")
        return []

def _header_value(headers, name, default):
    """Return the value of the first header called *name*, ignoring case."""
    wanted = name.lower()
    return next(
        (h['value'] for h in headers if h.get('name', '').lower() == wanted),
        default,
    )


def _find_plain_text_data(parts):
    """Return the first non-empty text/plain body data, searching depth-first.

    Returns None when no such part exists.
    """
    for part in parts:
        if part.get('mimeType') == 'text/plain':
            data = part.get('body', {}).get('data')
            if data:
                return data
        # Container parts (e.g. multipart/alternative) keep their children
        # in a nested 'parts' list.
        nested = _find_plain_text_data(part.get('parts', []))
        if nested:
            return nested
    return None


def _decode_body_data(data):
    """Decode a base64url-encoded body string into text."""
    import base64

    # Re-add any missing '=' padding so the decoder does not reject the input.
    padded = data + '=' * (-len(data) % 4)
    # Replace undecodable bytes rather than raising, so a single message in
    # another charset does not make the whole extraction fail.
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')


def extract_email_content(message):
    """Extract id, subject, sender, date and a body preview from a message.

    Args:
        message: Message dict with an ``id`` key and a ``payload`` dict that
            may contain ``headers``, ``body`` and (for multipart messages)
            ``parts``.

    Returns:
        A dict with keys ``id``, ``subject``, ``sender``, ``date`` and
        ``body``. Missing headers fall back to placeholder strings. ``body``
        is the first text/plain part found (nested parts included) for
        multipart messages, or the payload's own body otherwise; it is an
        empty string when nothing is found and is cut to 500 characters plus
        ``'...'`` when longer.

    Raises:
        KeyError: if *message* has no ``id`` key.
    """
    payload = message.get('payload', {})
    headers = payload.get('headers', [])

    subject = _header_value(headers, 'Subject', '无主题')
    sender = _header_value(headers, 'From', '未知发件人')
    date = _header_value(headers, 'Date', '未知日期')

    if 'parts' in payload:
        data = _find_plain_text_data(payload['parts'])
    else:
        data = payload.get('body', {}).get('data')
    body = _decode_body_data(data) if data else ''

    return {
        'id': message['id'],
        'subject': subject,
        'sender': sender,
        'date': date,
        # Keep only the first 500 characters of the body.
        'body': body[:500] + '...' if len(body) > 500 else body
    }

def main():
    """Authenticate, download the messages and print a summary of each one."""
    gmail = authenticate_gmail()

    print("正在获取邮件...")
    fetched = get_all_emails(gmail)

    divider = "-" * 50
    print(f"\n共获取到 {len(fetched)} 封邮件:")
    print(divider)

    for number, item in enumerate(fetched, start=1):
        # One entry per message: subject, sender, date and a 100-character
        # preview of the body, followed by a divider line.
        summary_lines = (
            f"{number}. 主题: {item['subject']}",
            f"   发件人: {item['sender']}",
            f"   日期: {item['date']}",
            f"   内容预览: {item['body'][:100]}...",
            divider,
        )
        for line in summary_lines:
            print(line)

if __name__ == '__main__':
    main()

使用前需要:

  1. 访问Google Cloud Console创建项目
  2. 启用Gmail API
  3. 创建OAuth 2.0客户端ID并下载credentials.json文件
  4. 安装所需库:pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client

代码会处理认证流程,首次运行会打开浏览器进行授权。获取的邮件信息包括主题、发件人、日期和正文(正文只保留前500个字符)。

建议使用Gmail API的批量获取和分页功能处理大量邮件。

为什么要爬,pop 收取不就好了

本地搭个邮件接收服务器,Gmail 设置转发,Python 直接读本地接收到的邮件。

明显假简历

回到顶部