Python中如何抓取Gmail所有邮件的内容
Python中如何抓取Gmail所有邮件的内容
6 回复
看你发帖记录,你这样是怎么维护/开发爬虫系统的?
要抓取Gmail所有邮件内容,最可靠的方法是使用Gmail API。首先需要在Google Cloud Console创建项目并启用Gmail API,然后获取OAuth 2.0凭据。下面是完整的实现代码:
import os
import pickle
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
# OAuth scopes requested for the Gmail API (read-only mailbox access)
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
def authenticate_gmail():
    """Authenticate against Gmail and return a Gmail API service object.

    Credentials cached in ``token.pickle`` are reused when they are still
    valid. Otherwise they are refreshed (when expired and a refresh token is
    available) or obtained through the local-server OAuth flow driven by
    ``credentials.json``, and the resulting credentials are written back to
    ``token.pickle`` for the next run.
    """
    token_path = 'token.pickle'
    creds = None

    # Load previously saved credentials, if any.
    # NOTE(review): pickle.load executes arbitrary code from the file, so
    # token.pickle must only ever be a file this script wrote itself.
    if os.path.exists(token_path):
        with open(token_path, 'rb') as token_file:
            creds = pickle.load(token_file)

    # Fast path: cached credentials are still usable.
    if creds and creds.valid:
        return build('gmail', 'v1', credentials=creds)

    refreshable = creds and creds.expired and creds.refresh_token
    if refreshable:
        creds.refresh(Request())
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = oauth_flow.run_local_server(port=0)

    # Persist the new or refreshed credentials for later runs.
    with open(token_path, 'wb') as token_file:
        pickle.dump(creds, token_file)

    return build('gmail', 'v1', credentials=creds)
def get_all_emails(service):
    """Fetch every message in the mailbox and return the extracted contents.

    The message list is requested page by page: each response may carry a
    ``nextPageToken``, which is sent back as ``pageToken`` until no further
    token is returned. Requesting the list only once would silently stop
    after the first page of results.

    Args:
        service: Gmail API service object, as returned by
            ``authenticate_gmail``.

    Returns:
        A list with one dict per message, as produced by
        ``extract_email_content``. If any request fails, the error is
        printed and an empty list is returned.
    """
    all_emails = []
    page_token = None
    try:
        while True:
            # Only send pageToken once the API has actually handed one out.
            list_kwargs = {'userId': 'me'}
            if page_token:
                list_kwargs['pageToken'] = page_token
            results = service.users().messages().list(**list_kwargs).execute()

            for msg in results.get('messages', []):
                # The list call yields ids only; fetch the full message.
                message = service.users().messages().get(
                    userId='me', id=msg['id'], format='full').execute()
                all_emails.append(extract_email_content(message))

            page_token = results.get('nextPageToken')
            if not page_token:
                break
        return all_emails
    except Exception as e:
        # Top-level best-effort boundary: report the failure and give up.
        print(f"获取邮件时出错: {e}")
        return []
def _header_value(headers, name, default):
    """Return the value of the first header called *name* (case-insensitive)."""
    wanted = name.lower()
    return next(
        (h.get('value', default) for h in headers
         if h.get('name', '').lower() == wanted),
        default,
    )


def _decode_body_data(data):
    """Decode a base64url body string into text."""
    import base64

    # Restore any stripped '=' padding so the decoder does not reject it.
    padded = data + '=' * (-len(data) % 4)
    # Bodies are not guaranteed to be UTF-8; replace undecodable bytes
    # instead of failing the whole message.
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='replace')


def _find_plain_text(parts):
    """Depth-first search of *parts* for the first non-empty text/plain body."""
    for part in parts:
        if part.get('mimeType') == 'text/plain':
            data = part.get('body', {}).get('data')
            if data:
                return _decode_body_data(data)
        # Multipart containers keep their children under a nested 'parts' list.
        nested = part.get('parts')
        if nested:
            text = _find_plain_text(nested)
            if text:
                return text
    return ''


def extract_email_content(message):
    """Extract subject, sender, date and a body preview from a Gmail message.

    Args:
        message: Message dict in ``format='full'`` shape, i.e. with an ``id``
            and a ``payload`` holding ``headers`` plus either ``parts`` or a
            ``body`` with base64url ``data``.

    Returns:
        A dict with keys ``id``, ``subject``, ``sender``, ``date`` and
        ``body``. Missing headers fall back to placeholder strings, and the
        body is cut to 500 characters followed by ``'...'`` when longer.

    Raises:
        KeyError: If *message* has no ``id``.
    """
    payload = message.get('payload') or {}
    headers = payload.get('headers', [])

    # Header names are case-insensitive, so do not rely on exact casing.
    subject = _header_value(headers, 'Subject', '无主题')
    sender = _header_value(headers, 'From', '未知发件人')
    date = _header_value(headers, 'Date', '未知日期')

    # Multipart mail: look for text/plain at any nesting depth.
    # Single-part mail: the payload's own body is the content.
    if 'parts' in payload:
        body = _find_plain_text(payload['parts'])
    else:
        data = payload.get('body', {}).get('data')
        body = _decode_body_data(data) if data else ''

    # Keep only the first 500 characters as a preview.
    preview = (body[:500] + '...') if len(body) > 500 else body
    return {
        'id': message['id'],
        'subject': subject,
        'sender': sender,
        'date': date,
        'body': preview,
    }
def main():
    """Authenticate, download the mailbox and print a summary of each message."""
    # Build the authenticated Gmail service.
    gmail = authenticate_gmail()

    # Download the messages.
    print("正在获取邮件...")
    emails = get_all_emails(gmail)

    # Report what was fetched, one separator-delimited entry per message.
    separator = "-" * 50
    print(f"\n共获取到 {len(emails)} 封邮件:")
    print(separator)
    for index, mail in enumerate(emails, start=1):
        print(f"{index}. 主题: {mail['subject']}")
        print(f" 发件人: {mail['sender']}")
        print(f" 日期: {mail['date']}")
        preview = mail['body'][:100]
        print(f" 内容预览: {preview}...")
        print(separator)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
使用前需要:
- 访问Google Cloud Console创建项目
- 启用Gmail API
- 创建OAuth 2.0客户端ID并下载credentials.json文件
- 安装所需库:
pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client
代码会处理认证流程,首次运行会打开浏览器进行授权。获取的邮件内容包括主题、发件人、日期和正文内容。
建议使用Gmail API的批量获取和分页功能处理大量邮件。
为什么要爬,pop 收取不就好了
gmail api
本地搭个邮件接收服务器,gmail 设置转发,python 直接读本地接收到的邮件。
明显假简历

