learn-spider/spider/china_net_request.py

93 lines
3.1 KiB
Python

import requests
import time
import re
from bs4 import BeautifulSoup
# Root URL (scheme + host) that fetch_all_pages and get_detail prefix to
# their endpoint paths.
BASE_URL = 'https://caigou.chinatelecom.com.cn'
def fetch_all_pages(*, stop_divisor=100, timeout=30):
    """Page through the announcement list endpoint and collect its items.

    POSTs a JSON query to ``/portal/base/announcementJoin/queryListNew``
    one page at a time (10 items per page) and accumulates the entries
    found under ``data.pageInfo.list`` in each response. Paging stops when
    a page comes back empty, or once the number of collected items reaches
    ``total / stop_divisor``, where ``total`` is ``data.pageInfo.total``.
    Progress is printed to stdout and the loop sleeps 0.5 s between pages.

    NOTE(review): despite the function name, the default
    ``stop_divisor=100`` stops after about 1% of the reported total, which
    is exactly what the previously hard-coded ``total / 100`` did. This
    looks like a cap for trial runs -- confirm. Pass ``stop_divisor=1`` to
    keep paging until ``total`` items have been collected.

    Args:
        stop_divisor: Positive number the reported total is divided by to
            obtain the stop threshold.
        timeout: Timeout in seconds handed to ``requests.post`` so that a
            stalled connection raises instead of blocking forever.

    Returns:
        A list with the items of every fetched page, in page order.
    """
    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
        'Cookie': 'Secure; sag_agent_cookie=',
    }
    all_data = []
    page = 1
    page_size = 10

    while True:
        print(f"正在获取第 {page} 页...")
        params = {
            "pageNum": page,
            "pageSize": page_size,
            "type": "e2no",
            "provinceCode": "",
            "noticeSummary": "",
        }
        response = requests.post(
            list_url, json=params, headers=headers, timeout=timeout
        )
        result = response.json()

        # `or` fallbacks also cover keys that are present but null, which a
        # plain .get(key, default) would pass through as None.
        data = result.get('data') or {}
        page_info = data.get('pageInfo') or {}
        items = page_info.get('list') or []
        total = page_info.get('total') or 0

        if not items:
            break

        all_data.extend(items)
        print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total}")

        # Stop once the collected count reaches the configured share of the
        # reported total (1/100 by default).
        if len(all_data) >= total / stop_divisor:
            break

        page += 1
        time.sleep(0.5)

    print(f"\n总共获取 {len(all_data)} 条数据")
    return all_data
# Detail link, for example:
#   /DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
# Endpoint that get_detail POSTs to for the announcement body:
#   https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
def get_detail(item, *, timeout=30):
    """Fetch one announcement's detail and return it as plain text.

    POSTs a JSON body built from the list item's ``docId``,
    ``securityViewCode`` and ``docTitle`` to
    ``/portal/base/tenderannouncement/view``, reads ``data.context`` from
    the JSON response and passes it through ``clean_html_tag``.

    Args:
        item: Mapping for one list entry; it is read with ``.get`` for the
            keys ``docId``, ``securityViewCode`` and ``docTitle``.
        timeout: Timeout in seconds handed to ``requests.post`` so that a
            stalled connection raises instead of blocking forever.

    Returns:
        The cleaned text of the announcement, or ``""`` when the response
        carries no ``data`` object or no ``context`` value.
    """
    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
    }
    params = {
        "type": "TenderAnnouncement",
        "id": item.get('docId'),
        "securityViewCode": item.get('securityViewCode'),
        "title": item.get('docTitle'),
    }
    response = requests.post(
        detail_url, headers=headers, json=params, timeout=timeout
    )
    # A missing or null 'data' object must not raise AttributeError on the
    # chained lookup; fall back to an empty mapping instead.
    data = response.json().get('data') or {}
    return clean_html_tag(data.get('context'))
def clean_html_tag(html_text):
    """Strip HTML tags and collapse whitespace into single spaces.

    The markup is parsed with BeautifulSoup's ``html.parser``, its text
    content is extracted, every run of whitespace (spaces, newlines,
    carriage returns, tabs, ...) is replaced by one space, and leading and
    trailing whitespace is removed.

    Args:
        html_text: HTML markup to clean; any falsy value (``None``, ``""``)
            is accepted.

    Returns:
        The cleaned text, or ``""`` when ``html_text`` is falsy.
    """
    if not html_text:
        return ""
    text = BeautifulSoup(html_text, 'html.parser').get_text()
    # \s+ already matches \n, \r and \t, so no separate replace pass for
    # those characters is needed after this substitution.
    return re.sub(r'\s+', ' ', text).strip()
if __name__ == "__main__":
    # Crawl only when executed as a script, so that importing this module
    # for its functions does not trigger any network requests.
    data = fetch_all_pages()
    for item in data:
        print(get_detail(item))
        # Pause 0.5 s between consecutive detail requests.
        time.sleep(0.5)