first commit

This commit is contained in:
mshe 2026-05-27 11:25:56 +08:00
commit a607545324
16 changed files with 440 additions and 0 deletions

0
.idea/.gitignore vendored Normal file
View File

View File

@ -0,0 +1,14 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="1">
<item index="0" class="java.lang.String" itemvalue="flask" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

21
.idea/learn-spider.iml Normal file
View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="Flask">
<option name="enabled" value="true" />
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
<option name="TEMPLATE_FOLDERS">
<list>
<option value="$MODULE_DIR$/templates" />
</list>
</option>
</component>
</module>

7
.idea/misc.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.9 (learn-spider)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/learn-spider.iml" filepath="$PROJECT_DIR$/.idea/learn-spider.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

136
.idea/workspace.xml Normal file
View File

@ -0,0 +1,136 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="29e8f12f-1d00-4e65-8b7a-d266f481ff29" name="更改" comment="" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Flask Main" />
<option value="Python Script" />
</list>
</option>
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 5
}</component>
<component name="ProjectId" id="3DjvhMu4Ipdz8hnE7N4fLfhY0AN" />
<component name="ProjectViewState">
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"Flask 服务器.learn-spider.executor": "Run",
"Python.china_net.executor": "Run",
"RunOnceActivity.ShowReadmeOnStart": "true",
"last_opened_file_path": "/Users/mshe/developer/code/python-workspace/learn-spider/spider",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "settings.sync",
"vue.rearranger.settings.migration": "true"
}
}]]></component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/spider" />
</key>
</component>
<component name="RunManager" selected="Python.china_net">
<configuration name="china_net" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="learn-spider" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/spider" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/spider/china_net.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="learn-spider" type="Python.FlaskServer">
<module name="learn-spider" />
<option name="target" value="$PROJECT_DIR$/app.py" />
<option name="targetType" value="PATH" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="launchJavascriptDebuger" value="false" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.china_net" />
</list>
</recent_temporary>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-js-predefined-d6986cc7102b-5c90d61e3bab-JavaScript-PY-242.23339.19" />
<option value="bundled-python-sdk-0029f7779945-399fe30bd8c1-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-242.23339.19" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="应用程序级" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="默认任务">
<changelist id="29e8f12f-1d00-4e65-8b7a-d266f481ff29" name="更改" comment="" />
<created>1778808412351</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1778808412351</updated>
<workItem from="1778808425415" duration="1833000" />
<workItem from="1779096652209" duration="2278000" />
<workItem from="1779170746673" duration="525000" />
<workItem from="1779673152455" duration="4561000" />
<workItem from="1779779192025" duration="4366000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<default-breakpoints>
<breakpoint type="python-exception">
<properties notifyOnTerminate="true" exception="BaseException">
<option name="notifyOnTerminate" value="true" />
</properties>
</breakpoint>
</default-breakpoints>
</breakpoint-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/learn_spider$china_net.coverage" NAME="china_net 覆盖结果" MODIFIED="1779780853999" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/spider" />
<SUITE FILE_PATH="coverage/learn_spider$learn_spider.coverage" NAME="learn-spider 覆盖结果" MODIFIED="1778808456145" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="" />
</component>
</project>

Binary file not shown.

12
app.py Normal file
View File

@ -0,0 +1,12 @@
from flask import Flask
# Application object that the route below is registered on.
app = Flask(__name__)


@app.route('/')
def hello_world():
    """Handle requests for the site root and return a fixed greeting."""
    greeting = 'Hello World!'
    return greeting


if __name__ == '__main__':
    # Start Flask's built-in server only when this file is run directly.
    app.run()

32
requirements.txt Normal file
View File

@ -0,0 +1,32 @@
# HTTP请求
requests==2.31.0
httpx==0.27.0 # 支持HTTP/2，比requests更快
# HTML解析
beautifulsoup4==4.12.3
lxml==5.1.0 # 更快的解析器，比html.parser快很多
parsel==1.9.0 # Scrapy的解析库
# 异步爬虫
aiohttp==3.9.5 # 异步HTTP客户端
aiofiles==23.2.1 # 异步文件操作
# 模拟浏览器（对付Vue/React等SPA）
playwright==1.42.0 # 推荐,现代浏览器自动化
selenium==4.18.1 # 经典方案
# 代理和反爬
fake-useragent==1.5.1 # 随机User-Agent
requests-html==0.10.0 # 支持JS渲染（基于pyppeteer）
# 数据存储
pymongo==4.6.1 # MongoDB
redis==5.0.1 # Redis
pymysql==1.1.0 # MySQL
# 数据处理
pandas==2.2.1 # 数据分析
numpy==1.26.4 # 科学计算
# 爬虫框架
scrapy==2.11.1 # 重量级爬虫框架

33
spider/baidu.py Normal file
View File

@ -0,0 +1,33 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
def crawl_vue_app(url='http://localhost:8080/', screenshot_path='vue_app.png',
                  headless=False, settle_ms=2000):
    """Render a JavaScript-driven page in Chromium and print a summary of it.

    The page is loaded with Playwright, its title and the text inside the
    ``<div id="app">`` root element (first 500 characters) are printed, and a
    screenshot is written to disk.

    Args:
        url: Address of the page to open.
        screenshot_path: File the screenshot is written to.
        headless: Passed to ``chromium.launch``; ``False`` shows the browser
            window.
        settle_ms: Extra wait, in milliseconds, after the network goes idle,
            for asynchronous data that loads slowly.

    Returns:
        None. Results are printed and the screenshot is saved as a side effect.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        # try/finally guarantees the browser is closed even if a step fails.
        try:
            page = browser.new_page()
            page.goto(url)
            # Wait for the page to finish loading (no network activity).
            page.wait_for_load_state('networkidle')
            page.wait_for_timeout(settle_ms)

            html = page.content()
            title = page.title()
            print(f"标题: {title}")

            # Parse the rendered HTML and read the text inside the app root.
            soup = BeautifulSoup(html, 'html.parser')
            app_div = soup.find('div', id='app')
            if app_div:
                text = app_div.get_text(strip=True)
                print(f"应用内容: {text[:500]}")

            page.screenshot(path=screenshot_path)
            print("截图已保存")
        finally:
            browser.close()
if __name__ == '__main__':
    # Run the crawl only when this file is executed as a script.
    crawl_vue_app()

48
spider/china_net.py Normal file
View File

@ -0,0 +1,48 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
def crawl_vue_app(url='https://caigou.chinatelecom.com.cn/', headless=True):
    """Open the procurement portal in Chromium and print the announcement rows.

    The function loads ``url``, clicks the first element whose text matches
    "更多", then the element matching "招标公告", and prints the text content
    of every ``.el-table__row`` element found on the resulting page.

    Args:
        url: Address of the portal's landing page.
        headless: Passed to ``chromium.launch``; ``True`` runs without a
            visible browser window.

    Returns:
        None. Row text is printed as a side effect.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=headless)
        # try/finally guarantees the browser is closed even if a click or
        # navigation step raises.
        try:
            page = browser.new_page()
            page.goto(url)
            # Wait for the page to finish loading, plus 2 s for slow async data.
            page.wait_for_load_state('networkidle')
            page.wait_for_timeout(2000)

            # Click the first "更多" match, then wait for the next view to settle.
            page.get_by_text("更多").nth(0).click()
            page.wait_for_load_state('networkidle')
            page.wait_for_timeout(2000)

            # Click the "招标公告" entry and give the table 2 s to render.
            page.get_by_text("招标公告").click()
            page.wait_for_timeout(2000)

            for row in page.locator('.el-table__row').all():
                print(row.text_content())

            # Final 5 s pause before the browser is closed.
            page.wait_for_timeout(5000)
        finally:
            browser.close()
if __name__ == '__main__':
    # Run the crawl only when this file is executed as a script.
    crawl_vue_app()

View File

@ -0,0 +1,92 @@
import requests
import time
import re
from bs4 import BeautifulSoup
BASE_URL = 'https://caigou.chinatelecom.com.cn'
def fetch_all_pages(limit_divisor=100, timeout=30):
    """Page through the announcement list endpoint and collect the records.

    Each request POSTs a JSON body asking for one page of 10 records and reads
    the records from ``data.pageInfo.list`` in the JSON response. Paging stops
    when a page comes back empty or when the number of collected records
    reaches ``total / limit_divisor``, where ``total`` is the count reported by
    the server.

    Args:
        limit_divisor: Divisor applied to the reported total to decide when to
            stop. The default of 100 keeps the original behaviour of stopping
            after about 1% of the reported total; pass 1 to fetch every page.
            Must be non-zero.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of the record dicts returned by the endpoint, in page order.

    Raises:
        requests.RequestException: On network failure, timeout, an HTTP error
            status, or a response body that is not valid JSON.
    """
    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
        'Cookie': 'Secure; sag_agent_cookie='
    }
    all_data = []
    page = 1
    page_size = 10
    while True:
        print(f"正在获取第 {page} 页...")
        params = {
            "pageNum": page,
            "pageSize": page_size,
            "type": "e2no",
            "provinceCode": "",
            "noticeSummary": ""
        }
        # A timeout keeps a stalled connection from hanging the crawl forever.
        response = requests.post(list_url, json=params, headers=headers,
                                 timeout=timeout)
        response.raise_for_status()
        result = response.json()
        # "or" fallbacks also cover keys that are present but null.
        data = result.get('data') or {}
        page_info = data.get('pageInfo') or {}
        items = page_info.get('list') or []
        total = page_info.get('total') or 0
        if not items:
            break
        all_data.extend(items)
        print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total}")
        if len(all_data) >= total / limit_divisor:
            break
        page += 1
        # Pause between pages to avoid hammering the server.
        time.sleep(0.5)
    print(f"\n总共获取 {len(all_data)} 条数据")
    return all_data
# Example detail-page link:
#/DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
# Detail API endpoint (the URL that get_detail posts to):
# https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
def get_detail(item, timeout=30):
    """Fetch one announcement's detail body and return it as plain text.

    Args:
        item: A list record; its ``docId``, ``securityViewCode`` and
            ``docTitle`` values are sent in the request body.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``data.context`` field of the response with HTML tags removed and
        whitespace collapsed, or ``""`` when that field is missing or empty.

    Raises:
        requests.RequestException: On network failure, timeout, an HTTP error
            status, or a response body that is not valid JSON.
    """
    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
    }
    params = {
        "type": "TenderAnnouncement",
        "id": item.get('docId'),
        "securityViewCode": item.get('securityViewCode'),
        "title": item.get('docTitle')
    }
    response = requests.post(detail_url, headers=headers, json=params,
                             timeout=timeout)
    response.raise_for_status()
    # A missing or null "data" object yields an empty result instead of an
    # AttributeError on None.
    data = response.json().get('data') or {}
    return clean_html_tag(data.get('context'))
def clean_html_tag(html_text):
    """Strip HTML tags and collapse all whitespace into single spaces.

    Args:
        html_text: HTML markup as a string; may be ``None`` or empty.

    Returns:
        The text content with every run of whitespace (including newlines,
        carriage returns and tabs) replaced by one space and the ends trimmed.
        Returns ``""`` for ``None`` or empty input.
    """
    if not html_text:
        return ""
    text = BeautifulSoup(html_text, 'html.parser').get_text()
    # \s+ already matches \n, \r and \t, so no separate replace() pass is needed.
    return re.sub(r'\s+', ' ', text).strip()
# Script body: fetch the announcement list, then print each record's detail text.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this also
# runs whenever the module is imported.
data = fetch_all_pages()
for item in data:
    detail_text = get_detail(item)
    print(detail_text)
    # Pause between detail requests.
    time.sleep(0.5)

25
spider/mail_qq.py Normal file
View File

@ -0,0 +1,25 @@
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
def crawl_vue_app(storage_path='auth.json'):
    """Open a visible browser for a manual login and save the session state.

    Two pages are opened: one at https://mail.qq.com and one, in a separately
    created browser context, at https://caigou.chinatelecom.com.cn/. After the
    user presses Enter in the terminal, the storage state of that second
    context is written to ``storage_path``.

    Args:
        storage_path: File the context's storage state is written to.

    Returns:
        None. The storage-state file is written as a side effect.
    """
    with sync_playwright() as p:
        # headless=False shows the browser window so the user can log in.
        browser = p.chromium.launch(headless=False)
        # try/finally guarantees the browser is closed even if input() is
        # interrupted or a navigation fails.
        try:
            # NOTE(review): browser.new_page() puts this page in its own
            # context, so a login done here is not included in the
            # storage_state saved below - confirm this page is needed.
            page = browser.new_page()
            page.goto('https://mail.qq.com')

            context = browser.new_context()
            page = context.new_page()
            page.goto('https://caigou.chinatelecom.com.cn/')

            input("请在浏览器中完成登录,然后按 Enter 继续...")
            context.storage_state(path=storage_path)
            print(f"登录状态已保存到 {storage_path}")
        finally:
            browser.close()
if __name__ == '__main__':
    # Run the login capture only when this file is executed as a script.
    crawl_vue_app()

BIN
spider/vue_app.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 585 KiB