93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
import requests
|
|
import time
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Scheme and host of the target site; the API paths used below are appended to it.
BASE_URL = "https://caigou.chinatelecom.com.cn"
|
|
|
|
def fetch_all_pages(*, page_size=10, delay=0.5, timeout=30):
    """Download every page of the announcement list and return all entries.

    Posts to the ``queryListNew`` endpoint page by page, starting at page 1,
    and accumulates ``data.pageInfo.list`` from each JSON response. Paging
    stops when a page comes back empty or when the number of accumulated
    entries reaches the ``data.pageInfo.total`` reported by the server.
    Progress is printed to stdout.

    Args:
        page_size: Number of entries requested per page.
        delay: Seconds to sleep between consecutive page requests.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        list: The list entries, in the order the server returned them.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If a response body is not valid JSON.
    """
    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
        # NOTE(review): the cookie value is blank here; confirm whether the
        # endpoint needs a real ``sag_agent_cookie`` value.
        'Cookie': 'Secure; sag_agent_cookie='
    }

    all_data = []
    page = 1

    while True:
        print(f"正在获取第 {page} 页...")
        params = {
            "pageNum": page,
            "pageSize": page_size,
            "type":"e2no",
            "provinceCode":"",
            "noticeSummary":""
        }
        # Without a timeout a stalled connection would block forever.
        response = requests.post(
            list_url, json=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        result = response.json() or {}

        # ``or {}`` also covers an explicit JSON ``null`` for these keys.
        data = result.get('data') or {}
        page_info = data.get('pageInfo') or {}
        items = page_info.get('list') or []
        total = page_info.get('total', 0)

        if not items:
            break

        all_data.extend(items)
        print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total} 条")

        # Stop once everything the server reports has been collected. When no
        # total is reported, keep paging until an empty page is returned.
        if total and len(all_data) >= total:
            break

        page += 1
        time.sleep(delay)

    print(f"\n总共获取 {len(all_data)} 条数据")
    return all_data
|
|
|
|
# Example detail-page link (path and query string):
#   /DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
# Detail API endpoint:
#   https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
|
|
def get_detail(item, *, timeout=30):
    """Fetch one announcement's detail and return its body as plain text.

    Posts the entry's ``docId``, ``securityViewCode`` and ``docTitle`` to the
    ``tenderannouncement/view`` endpoint, reads ``data.context`` from the JSON
    response and passes it through ``clean_html_tag``.

    Args:
        item: Mapping for one list entry; the keys ``docId``,
            ``securityViewCode`` and ``docTitle`` are read with ``.get``.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        str: The cleaned text, or ``""`` when the response carries no
        ``data`` / ``context``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
    }
    params = {
        "type":"TenderAnnouncement",
        "id":item.get('docId'),
        "securityViewCode": item.get('securityViewCode'),
        "title": item.get('docTitle')
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.post(
        detail_url, headers=headers, json=params, timeout=timeout
    )
    response.raise_for_status()

    # A missing or null ``data`` object yields an empty result instead of
    # raising AttributeError on ``None.get``.
    payload = response.json() or {}
    detail = payload.get('data') or {}
    return clean_html_tag(detail.get('context'))
|
|
|
|
def clean_html_tag(html_text):
    """Strip HTML markup and collapse all whitespace to single spaces.

    Args:
        html_text: HTML fragment as a string; ``None`` or an empty string is
            accepted.

    Returns:
        str: The text content of ``html_text`` with every run of whitespace
        replaced by one space and leading/trailing whitespace removed.
        Returns ``""`` for falsy input.
    """
    if not html_text:
        return ""

    plain = BeautifulSoup(html_text, 'html.parser').get_text()
    # r'\s+' already matches '\n', '\r' and '\t', so none of them can remain
    # after this substitution and no per-character replacement is needed.
    return re.sub(r'\s+', ' ', plain).strip()
|
|
|
|
# Script driver: download the full listing, then print the cleaned detail
# text of each entry, pausing between detail requests.
data = fetch_all_pages()
for item in data:
    detail_text = get_detail(item)
    print(detail_text)
    time.sleep(0.5)
|
|
|