# learn-spider/spider/china_net_request.py

import requests
import time
import re
from bs4 import BeautifulSoup

BASE_URL = 'https://caigou.chinatelecom.com.cn'

def fetch_all_pages(page_size=10, total_divisor=100, timeout=30):
    """Collect announcement list entries from the portal, one page at a time.

    POSTs to ``/portal/base/announcementJoin/queryListNew`` with an increasing
    ``pageNum`` and accumulates the ``data.pageInfo.list`` entries of every
    response. Paging stops when a page comes back empty, or as soon as the
    number of collected entries reaches ``total / total_divisor``, where
    ``total`` is the ``data.pageInfo.total`` value of the latest response.

    Args:
        page_size: Value sent as ``pageSize`` in each request.
        total_divisor: Divisor applied to the reported total to compute the
            stop threshold. The default of 100 keeps the original cutoff
            (stop after roughly 1% of the reported total); pass 1 to keep
            paging until the reported total is reached.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        A list with the entries of all fetched pages, in fetch order.

    Raises:
        ValueError: If ``total_divisor`` is not positive.
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    if total_divisor <= 0:
        raise ValueError("total_divisor must be positive")

    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
        'Cookie': 'Secure; sag_agent_cookie='
    }

    all_data = []
    page = 1

    while True:
        print(f"正在获取第 {page} 页...")
        params = {
            "pageNum": page,
            "pageSize": page_size,
            "type": "e2no",
            "provinceCode": "",
            "noticeSummary": ""
        }
        response = requests.post(
            list_url, json=params, headers=headers, timeout=timeout
        )
        # Fail loudly on an HTTP error instead of parsing an error body.
        response.raise_for_status()
        result = response.json()

        # Each level may be absent or explicitly null in the payload;
        # `or` covers both cases, unlike a .get() default.
        payload = result.get('data') or {}
        page_info = payload.get('pageInfo') or {}
        items = page_info.get('list') or []
        total = page_info.get('total') or 0

        if not items:
            break
        all_data.extend(items)
        print(f"  获取 {len(all_data) and len(items)} 条，累计 {len(all_data)} 条, 总共 {total} 条")
        if len(all_data) >= total / total_divisor:
            break
        page += 1
        # Short pause between consecutive list requests.
        time.sleep(0.5)

    print(f"\n总共获取 {len(all_data)} 条数据")
    return all_data

# Detail link, e.g.:
# /DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
# Detail API endpoint: https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
def get_detail(item, timeout=30):
    """Fetch the detail text of one announcement and return it as plain text.

    POSTs the entry's ``docId``, ``securityViewCode`` and ``docTitle`` to
    ``/portal/base/tenderannouncement/view`` and passes the ``data.context``
    field of the JSON response through ``clean_html_tag``.

    Args:
        item: A mapping providing ``docId``, ``securityViewCode`` and
            ``docTitle`` (read with ``.get``, so missing keys are sent as
            null).
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The cleaned detail text, or an empty string when the response has no
        ``data`` object or no ``context`` value.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
    }
    params = {
        "type": "TenderAnnouncement",
        "id": item.get('docId'),
        "securityViewCode": item.get('securityViewCode'),
        "title": item.get('docTitle')
    }
    response = requests.post(
        detail_url, headers=headers, json=params, timeout=timeout
    )
    # Fail loudly on an HTTP error instead of parsing an error body.
    response.raise_for_status()
    # 'data' may be absent or null; fall back to an empty mapping so the
    # lookup below yields None rather than raising AttributeError.
    payload = response.json().get('data') or {}
    return clean_html_tag(payload.get('context'))

def clean_html_tag(html_text):
    """Strip HTML markup from *html_text* and normalize its whitespace.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    reduced to its text content; every run of whitespace is then collapsed
    to a single space and the result is trimmed at both ends.

    Args:
        html_text: HTML source; any falsy value (``None``, ``""``) is
            accepted.

    Returns:
        The cleaned text, or ``""`` when *html_text* is falsy.
    """
    if not html_text:
        return ""
    plain = BeautifulSoup(html_text, 'html.parser').get_text()
    # \s+ already matches newlines, carriage returns and tabs, so a single
    # substitution replaces the separate per-character replacements.
    return re.sub(r'\s+', ' ', plain).strip()

# Script body: download the announcement list, then print the cleaned detail
# text of every entry, pausing 0.5 s between consecutive detail requests.
# NOTE(review): these statements run whenever the module is loaded, including
# on import; consider an `if __name__ == "__main__":` guard.
data = fetch_all_pages()
for item in data:
    print(get_detail(item))
    time.sleep(0.5)