learn-spider/spider/mail_qq.py

120 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import argparse
from playwright.sync_api import sync_playwright
def save_login_state(auth_file, keyword):
    """Open mail.qq.com in a visible browser, wait for login, and save the session.

    Login is detected through three independent signals: a 200 response from
    a URL containing ``/login_jump``, a navigation to a URL containing
    ``home/index?sid``, and the appearance of ``keyword`` in the page text.
    The storage state is written even when no signal fired, after a warning,
    so a login that was missed by the detectors is not lost.

    Args:
        auth_file: Path of the JSON file the Playwright storage state is
            written to. Its parent directory is created when missing.
        keyword: Text expected in ``document.body.innerText`` once the user
            is logged in.
    """
    # os.path.dirname() returns "" for a bare file name and os.makedirs("")
    # raises, so only create the directory when there is one.
    auth_dir = os.path.dirname(auth_file)
    if auth_dir:
        os.makedirs(auth_dir, exist_ok=True)

    with sync_playwright() as p:
        # headless=False shows the browser window so the user can scan the
        # login code.
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            page = context.new_page()
            login_success = False

            def on_response(response):
                """Flag success when the login-jump request returns HTTP 200."""
                nonlocal login_success
                if '/login_jump' in response.url and response.status == 200:
                    print("[网络监听] 检测到登录API响应成功")
                    print(f" - 请求地址: {response.url}")
                    print(f" - 状态码: {response.status}")
                    login_success = True

            def on_url_change(frame):
                """Flag success when the page URL looks like the mailbox home."""
                nonlocal login_success
                current_url = page.url
                if 'home/index?sid' in current_url:
                    print("[URL监听] 检测到页面已跳转")
                    print(f" - 当前URL: {current_url}")
                    login_success = True

            # Other page events: 'request', 'requestfailed', 'requestfinished'.
            page.on('response', on_response)
            page.on('framenavigated', on_url_change)
            page.goto('https://mail.qq.com')

            try:
                # Pass the keyword as an argument instead of formatting it
                # into the script, so quotes or backslashes in it cannot
                # break the expression. Timeout: 300000 ms = 5 minutes.
                page.wait_for_function(
                    'keyword => document.body.innerText.includes(keyword)',
                    arg=keyword,
                    timeout=300000,
                )
                print(f"[文本监听] 检测到'{keyword}'文字,登录确认成功!")
                # The keyword being visible is itself a login confirmation.
                login_success = True
            except Exception as e:
                # Best effort: a timeout or page error must not prevent the
                # state from being saved below.
                print(f"[警告] 未检测到'{keyword}'文字,可能页面结构有变化或登录超时")
                print(f" 错误信息: {e}")

            if login_success:
                print("已确认登录成功!")
            else:
                print("\n⚠️ 未明确检测到登录成功标志,但仍将保存当前状态")
                print(" 如果登录成功,状态应该是有效的")

            # Pause 5000 ms = 5 seconds before the state is captured.
            page.wait_for_timeout(5000)
            context.storage_state(path=auth_file)
            print(f"✅ 登录状态已保存到: {auth_file}")
        finally:
            # Close the browser even when a step above raised.
            browser.close()
            print("浏览器已关闭")
# 1. Extract cookies from the auth.json file saved by Playwright
def extract_cookies_from_auth(auth_file):
    """Read a Playwright auth.json file and return its cookies as a dict.

    Args:
        auth_file: Path to the auth.json file, read as UTF-8 JSON.

    Returns:
        dict: Cookie name mapped to cookie value, in a form usable as a
        ``requests`` cookies dictionary. Empty when the file has no
        ``cookies`` entry. If a name occurs more than once, the last
        occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a cookie entry lacks ``name`` or ``value``.
    """
    # Imported here so the function does not depend on a module-level
    # ``import json`` being present.
    import json

    with open(auth_file, 'r', encoding='utf-8') as f:
        auth_data = json.load(f)

    # Only the name and value of each saved cookie are kept.
    return {
        cookie['name']: cookie['value']
        for cookie in auth_data.get('cookies', [])
    }
def crawl_with_saved_state(auth_file):
    """Open mail.qq.com with a saved login state and print the mail subjects.

    Args:
        auth_file: Path to the Playwright storage-state JSON file that is
            loaded into the new browser context.
    """
    with sync_playwright() as playwright:
        # Launched with a visible window; the original note says headless
        # mode could be used at this point.
        chromium = playwright.chromium.launch(headless=False)
        # Build the context from the saved login state and open a page in it.
        page = chromium.new_context(storage_state=auth_file).new_page()

        page.goto('https://mail.qq.com')
        print("当前URL:", page.url)

        # Wait for the network to go idle, then 2 more seconds.
        page.wait_for_load_state('networkidle')
        page.wait_for_timeout(2000)

        # The page content could also be parsed with BeautifulSoup.
        for subject in page.locator('.mail-subject').all():
            print(subject.text_content())

        # Keep the window open for 20 seconds before closing.
        page.wait_for_timeout(20000)
        chromium.close()
def start(account):
    """Crawl with the saved login state for ``account``, or log in to create it.

    The state file is ``./auth/mail/<account>.json``. When it exists the
    mailbox is crawled with it; otherwise an interactive login is started
    and its state is saved to that path.

    Args:
        account: User name that selects the state file.

    Raises:
        SystemExit: With exit code 1 when ``account`` is empty.
    """
    # Guard clause: validate before doing anything else. SystemExit is raised
    # directly because the exit() helper is injected by the site module and
    # is not available in every interpreter setup.
    if not account:
        print(f"用户名{account}")
        print("请输入用户名")
        raise SystemExit(1)

    print(f"用户名{account}")
    auth_file_path = "./auth/mail"
    file_path = f"{auth_file_path}/{account}.json"
    if os.path.exists(file_path):
        crawl_with_saved_state(file_path)
    else:
        # "收件箱" (Inbox) is the text that marks a logged-in page.
        save_login_state(file_path, "收件箱")