# learn-spider/spider/mail_qq.py

import argparse
import json
import os

from playwright.sync_api import sync_playwright


def save_login_state(auth_file, keyword):
    """Open a visible browser, wait for a manual login, and save the session.

    The page at https://mail.qq.com is opened in a headed Chromium window so
    the user can log in by hand. Three signals are observed: a 200 response
    whose URL contains ``/login_jump``, a navigation to a URL containing
    ``home/index?sid``, and the appearance of ``keyword`` in the page text.
    The storage state is written to ``auth_file`` regardless of whether a
    login signal was seen.

    Args:
        auth_file: Path of the JSON file that receives the Playwright storage
            state. Its parent directory is created when it has one.
        keyword: Text whose presence in ``document.body.innerText`` is treated
            as confirmation that login finished.
    """
    with sync_playwright() as p:
        # dirname() is "" for a bare file name, and os.makedirs("") raises.
        auth_dir = os.path.dirname(auth_file)
        if auth_dir:
            os.makedirs(auth_dir, exist_ok=True)
        # headless=False shows the browser window so the user can log in.
        browser = p.chromium.launch(headless=False)
        try:
            context = browser.new_context()
            page = context.new_page()
            login_success = False

            def on_response(response):
                """Flag success when the login-jump request returns 200."""
                nonlocal login_success
                if '/login_jump' in response.url and response.status == 200:
                    print("[网络监听] 检测到登录API响应成功")
                    print(f"  - 请求地址: {response.url}")
                    print(f"  - 状态码: {response.status}")
                    login_success = True

            def on_url_change(frame):
                """Flag success when the page URL looks like the mailbox home."""
                nonlocal login_success
                current_url = page.url
                if 'home/index?sid' in current_url:
                    print("[URL监听] 检测到页面已跳转")
                    print(f"  - 当前URL: {current_url}")
                    login_success = True

            # Other available events: 'request', 'requestfailed',
            # 'requestfinished'.
            page.on('response', on_response)
            page.on('framenavigated', on_url_change)

            page.goto('https://mail.qq.com')
            try:
                # Pass the keyword as an argument instead of splicing it into
                # the JavaScript source, so quotes or backslashes in it cannot
                # break the predicate. Timeout is in milliseconds (5 minutes).
                page.wait_for_function(
                    'kw => !!document.body'
                    ' && document.body.innerText.includes(kw)',
                    arg=keyword,
                    timeout=300000,
                )
                print(f"[文本监听] 检测到'{keyword}'文字，登录确认成功！")
            except Exception as e:
                # Best effort: a timeout here must not prevent saving state.
                print(f"[警告] 未检测到'{keyword}'文字，可能页面结构有变化或登录超时")
                print(f"  错误信息: {e}")

            if login_success:
                print("已确认登录成功！")
            else:
                print("\n⚠️ 未明确检测到登录成功标志，但仍将保存当前状态")
                print("   如果登录成功，状态应该是有效的")

            page.wait_for_timeout(5000)  # 5000 ms = 5 seconds
            context.storage_state(path=auth_file)
            print(f"✅ 登录状态已保存到: {auth_file}")
        finally:
            # Close the browser even when navigation or saving fails.
            browser.close()
            print("浏览器已关闭")


# 1. Extract cookies from the auth.json saved by Playwright
def extract_cookies_from_auth(auth_file):
    """Load a Playwright storage-state file and return its cookies as a dict.

    Args:
        auth_file: Path to the auth.json file.

    Returns:
        dict: Mapping of cookie name to cookie value, built from the
        ``cookies`` list of the file (empty when that key is absent). When a
        name occurs more than once, the last entry wins.

    Raises:
        KeyError: If a cookie entry lacks ``name`` or ``value``.
    """
    with open(auth_file, 'r', encoding='utf-8') as f:
        auth_data = json.load(f)

    # Only name and value are kept; other fields of each entry are dropped.
    return {
        cookie['name']: cookie['value']
        for cookie in auth_data.get('cookies', [])
    }


def crawl_with_saved_state(auth_file):
    """Reopen QQ Mail with a stored session and print the listed subjects.

    A Chromium context is created from the storage state in ``auth_file``,
    https://mail.qq.com is loaded, and the text of every element matching
    ``.mail-subject`` is printed. The window then stays open for 20 seconds
    before the browser is closed.

    Args:
        auth_file: Path to a storage-state JSON file to restore.
    """
    with sync_playwright() as playwright:
        # Restore the previously saved login state. The window is still shown
        # (headless=False), although headless mode would be possible here.
        chromium = playwright.chromium.launch(headless=False)
        session = chromium.new_context(storage_state=auth_file)
        mailbox = session.new_page()

        mailbox.goto('https://mail.qq.com')
        print("当前URL:", mailbox.url)

        # Wait for network activity to settle, then another 2 seconds.
        mailbox.wait_for_load_state('networkidle')
        mailbox.wait_for_timeout(2000)

        # The page content could alternatively be parsed with BeautifulSoup.
        for subject in mailbox.locator('.mail-subject').all():
            print(subject.text_content())

        # Keep the window open for 20 seconds before shutting down.
        mailbox.wait_for_timeout(20000)

        chromium.close()

def start(account):
    """Crawl with a saved session for ``account``, or record a new login.

    The storage-state file is ``./auth/mail/<account>.json``. When it exists
    the saved session is reused for crawling; otherwise an interactive login
    is started and its state is saved to that path.

    Args:
        account: Account name used as the state file's base name.

    Raises:
        SystemExit: With exit code 1 when ``account`` is empty or ``None``.
    """
    print(f"用户名{account}")
    if not account:
        print("请输入用户名")
        # Raise SystemExit directly: the exit() helper comes from the site
        # module and is not guaranteed to exist outside interactive use.
        raise SystemExit(1)
    auth_file_path = "./auth/mail"
    file_path = f"{auth_file_path}/{account}.json"
    if os.path.exists(file_path):
        crawl_with_saved_state(file_path)
    else:
        save_login_state(file_path, "收件箱")