"""Fetch announcement listings from the portal at BASE_URL and print their text.

Run as a script, this pages through the announcement list endpoint, then
requests the detail record of every collected item and prints its content
with HTML markup and redundant whitespace removed.
"""
import re
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://caigou.chinatelecom.com.cn'

# Seconds to wait on a request before giving up; without a timeout a stalled
# connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Pause (seconds) inserted between consecutive requests.
REQUEST_DELAY = 0.5

# Headers sent with every request. The list endpoint additionally sends a
# Cookie header (see fetch_all_pages).
_COMMON_HEADERS = {
    'Content-Type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Referer': f'{BASE_URL}/',
}

# Compiled once: matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r'\s+')


def fetch_all_pages(page_size=10, sample_divisor=100):
    """Page through the announcement list endpoint and collect its items.

    Pages are requested one after another, starting at page 1, until a page
    comes back empty or the number of collected items reaches
    ``total / sample_divisor``, where ``total`` is the count reported by the
    server.

    Args:
        page_size: Number of items requested per page.
        sample_divisor: Divisor applied to the reported total to obtain the
            stop threshold. The default of 100 stops after roughly 1% of the
            reported items; pass 1 to keep going until the reported total is
            reached.

    Returns:
        A list with the raw item dicts, in the order the server returned them.

    Raises:
        ValueError: If ``sample_divisor`` is not positive.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    if sample_divisor <= 0:
        raise ValueError("sample_divisor must be positive")

    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
    headers = {**_COMMON_HEADERS, 'Cookie': 'Secure; sag_agent_cookie='}

    all_data = []
    page = 1

    while True:
        print(f"正在获取第 {page} 页...")
        params = {
            "pageNum": page,
            "pageSize": page_size,
            "type": "e2no",
            "provinceCode": "",
            "noticeSummary": "",
        }
        response = requests.post(
            list_url, json=params, headers=headers, timeout=REQUEST_TIMEOUT
        )
        response.raise_for_status()
        result = response.json()

        # `or` guards against keys that are present but null in the payload;
        # dict.get's default only covers keys that are absent.
        data = result.get('data') or {}
        page_info = data.get('pageInfo') or {}
        items = page_info.get('list') or []
        total = page_info.get('total') or 0

        if not items:
            break

        all_data.extend(items)
        print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total} 条")

        if len(all_data) >= total / sample_divisor:
            break

        page += 1
        time.sleep(REQUEST_DELAY)

    print(f"\n总共获取 {len(all_data)} 条数据")
    return all_data


# Detail page link, for reference:
#   /DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
# Detail endpoint:
#   https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
def get_detail(item: dict) -> str:
    """Request the detail record for one list item and return its plain text.

    Args:
        item: A list item as returned by ``fetch_all_pages``. Its ``docId``,
            ``securityViewCode`` and ``docTitle`` entries are forwarded to
            the detail endpoint.

    Returns:
        The ``context`` field of the response with HTML tags stripped and
        whitespace collapsed, or an empty string when the response carries
        no ``data`` / ``context``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
    params = {
        "type": "TenderAnnouncement",
        "id": item.get('docId'),
        "securityViewCode": item.get('securityViewCode'),
        "title": item.get('docTitle'),
    }
    response = requests.post(
        detail_url, headers=_COMMON_HEADERS, json=params, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()

    # A missing or null `data` must not raise AttributeError on `.get`.
    data = response.json().get('data') or {}
    return clean_html_tag(data.get('context'))


def clean_html_tag(html_text) -> str:
    """Strip HTML tags from *html_text* and collapse all whitespace.

    Args:
        html_text: HTML markup; ``None`` or an empty string is accepted.

    Returns:
        The text content with every run of whitespace (including newlines
        and tabs) replaced by a single space and the ends trimmed. Returns
        an empty string for falsy input.
    """
    if not html_text:
        return ""

    text = BeautifulSoup(html_text, 'html.parser').get_text()
    # One substitution covers spaces, tabs, CR and LF alike, so no separate
    # per-character replacement is needed afterwards.
    return _WHITESPACE_RUN.sub(' ', text).strip()


def main():
    """Fetch the announcement list and print each item's cleaned detail text."""
    for item in fetch_all_pages():
        print(get_detail(item))
        time.sleep(REQUEST_DELAY)


if __name__ == "__main__":
    main()