first commit
This commit is contained in:
commit
a607545324
|
|
@ -0,0 +1,14 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredPackages">
|
||||
<value>
|
||||
<list size="1">
|
||||
<item index="0" class="java.lang.String" itemvalue="flask" />
|
||||
</list>
|
||||
</value>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="Flask">
|
||||
<option name="enabled" value="true" />
|
||||
</component>
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TemplatesService">
|
||||
<option name="TEMPLATE_CONFIGURATION" value="Jinja2" />
|
||||
<option name="TEMPLATE_FOLDERS">
|
||||
<list>
|
||||
<option value="$MODULE_DIR$/templates" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
</module>
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.9 (learn-spider)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
|
||||
</project>
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/learn-spider.iml" filepath="$PROJECT_DIR$/.idea/learn-spider.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="AutoImportSettings">
|
||||
<option name="autoReloadType" value="SELECTIVE" />
|
||||
</component>
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="29e8f12f-1d00-4e65-8b7a-d266f481ff29" name="更改" comment="" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Flask Main" />
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="ProjectColorInfo">{
|
||||
"associatedIndex": 5
|
||||
}</component>
|
||||
<component name="ProjectId" id="3DjvhMu4Ipdz8hnE7N4fLfhY0AN" />
|
||||
<component name="ProjectViewState">
|
||||
<option name="showLibraryContents" value="true" />
|
||||
</component>
|
||||
<component name="PropertiesComponent"><![CDATA[{
|
||||
"keyToString": {
|
||||
"Flask 服务器.learn-spider.executor": "Run",
|
||||
"Python.china_net.executor": "Run",
|
||||
"RunOnceActivity.ShowReadmeOnStart": "true",
|
||||
"last_opened_file_path": "/Users/mshe/developer/code/python-workspace/learn-spider/spider",
|
||||
"node.js.detected.package.eslint": "true",
|
||||
"node.js.detected.package.tslint": "true",
|
||||
"node.js.selected.package.eslint": "(autodetect)",
|
||||
"node.js.selected.package.tslint": "(autodetect)",
|
||||
"nodejs_package_manager_path": "npm",
|
||||
"settings.editor.selected.configurable": "settings.sync",
|
||||
"vue.rearranger.settings.migration": "true"
|
||||
}
|
||||
}]]></component>
|
||||
<component name="RecentsManager">
|
||||
<key name="CopyFile.RECENT_KEYS">
|
||||
<recent name="$PROJECT_DIR$/spider" />
|
||||
</key>
|
||||
</component>
|
||||
<component name="RunManager" selected="Python.china_net">
|
||||
<configuration name="china_net" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
||||
<module name="learn-spider" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/spider" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/spider/china_net.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="learn-spider" type="Python.FlaskServer">
|
||||
<module name="learn-spider" />
|
||||
<option name="target" value="$PROJECT_DIR$/app.py" />
|
||||
<option name="targetType" value="PATH" />
|
||||
<option name="ENV_FILES" value="" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="" />
|
||||
<option name="IS_MODULE_SDK" value="false" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="launchJavascriptDebuger" value="false" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.china_net" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
<component name="SharedIndexes">
|
||||
<attachedChunks>
|
||||
<set>
|
||||
<option value="bundled-js-predefined-d6986cc7102b-5c90d61e3bab-JavaScript-PY-242.23339.19" />
|
||||
<option value="bundled-python-sdk-0029f7779945-399fe30bd8c1-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-242.23339.19" />
|
||||
</set>
|
||||
</attachedChunks>
|
||||
</component>
|
||||
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="应用程序级" UseSingleDictionary="true" transferred="true" />
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="默认任务">
|
||||
<changelist id="29e8f12f-1d00-4e65-8b7a-d266f481ff29" name="更改" comment="" />
|
||||
<created>1778808412351</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1778808412351</updated>
|
||||
<workItem from="1778808425415" duration="1833000" />
|
||||
<workItem from="1779096652209" duration="2278000" />
|
||||
<workItem from="1779170746673" duration="525000" />
|
||||
<workItem from="1779673152455" duration="4561000" />
|
||||
<workItem from="1779779192025" duration="4366000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TypeScriptGeneratedFilesManager">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<breakpoint-manager>
|
||||
<default-breakpoints>
|
||||
<breakpoint type="python-exception">
|
||||
<properties notifyOnTerminate="true" exception="BaseException">
|
||||
<option name="notifyOnTerminate" value="true" />
|
||||
</properties>
|
||||
</breakpoint>
|
||||
</default-breakpoints>
|
||||
</breakpoint-manager>
|
||||
</component>
|
||||
<component name="com.intellij.coverage.CoverageDataManagerImpl">
|
||||
<SUITE FILE_PATH="coverage/learn_spider$china_net.coverage" NAME="china_net 覆盖结果" MODIFIED="1779780853999" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/spider" />
|
||||
<SUITE FILE_PATH="coverage/learn_spider$learn_spider.coverage" NAME="learn-spider 覆盖结果" MODIFIED="1778808456145" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="" />
|
||||
</component>
|
||||
</project>
|
||||
Binary file not shown.
|
|
@ -0,0 +1,12 @@
|
|||
from flask import Flask
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route('/')
def hello_world():
    """Handle requests to the root URL by returning a fixed greeting string."""
    return 'Hello World!'
|
||||
|
||||
|
||||
# Run the Flask app only when this file is executed directly, not when imported.
if __name__ == '__main__':
    app.run()
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
# HTTP请求
|
||||
requests==2.31.0
|
||||
httpx==0.27.0 # 支持HTTP/2,比requests更快
|
||||
|
||||
# HTML解析
|
||||
beautifulsoup4==4.12.3
|
||||
lxml==5.1.0 # 更快的解析器,比html.parser快很多
|
||||
parsel==1.9.0 # Scrapy的解析库
|
||||
|
||||
# 异步爬虫
|
||||
aiohttp==3.9.5 # 异步HTTP客户端
|
||||
aiofiles==23.2.1 # 异步文件操作
|
||||
|
||||
# 模拟浏览器(对付Vue/React等SPA)
|
||||
playwright==1.42.0 # 推荐,现代浏览器自动化
|
||||
selenium==4.18.1 # 经典方案
|
||||
|
||||
# 代理和反爬
|
||||
fake-useragent==1.5.1 # 随机User-Agent
|
||||
requests-html==0.10.0 # 支持JS渲染(基于pyppeteer)
|
||||
|
||||
# 数据存储
|
||||
pymongo==4.6.1 # MongoDB
|
||||
redis==5.0.1 # Redis
|
||||
pymysql==1.1.0 # MySQL
|
||||
|
||||
# 数据处理
|
||||
pandas==2.2.1 # 数据分析
|
||||
numpy==1.26.4 # 科学计算
|
||||
|
||||
# 爬虫框架
|
||||
scrapy==2.11.1 # 重量级爬虫框架
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
from playwright.sync_api import sync_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def crawl_vue_app():
    """Render the locally served Vue app in Chromium and dump what it shows.

    Loads http://localhost:8080/ in a visible browser window, prints the page
    title, prints up to the first 500 characters of text found inside
    ``<div id="app">`` (when that element exists), and writes a screenshot to
    ``vue_app.png`` in the current working directory.
    """
    with sync_playwright() as playwright:
        # headless=False keeps the browser window visible during the run.
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()

        tab.goto('http://localhost:8080/')
        # Let network traffic settle, then allow two more seconds for any
        # slow asynchronous data to finish rendering.
        tab.wait_for_load_state('networkidle')
        tab.wait_for_timeout(2000)

        rendered = tab.content()
        heading = tab.title()
        print(f"标题: {heading}")

        # Look up the Vue root element in the rendered markup.
        root = BeautifulSoup(rendered, 'html.parser').find('div', id='app')
        if root:
            visible = root.get_text(strip=True)
            print(f"应用内容: {visible[:500]}")

        tab.screenshot(path='vue_app.png')
        print("截图已保存")
        chromium.close()
|
||||
|
||||
# Run the crawl only when this file is executed directly, not when imported.
if __name__ == "__main__":
    crawl_vue_app()
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
from playwright.sync_api import sync_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def crawl_vue_app():
    """Print the text of every table row shown on the tender announcement list.

    Opens https://caigou.chinatelecom.com.cn/ in headless Chromium, clicks the
    first element whose text matches "更多", then the element matching
    "招标公告", and prints ``text_content()`` of each element matching the
    ``.el-table__row`` selector.
    """
    with sync_playwright() as playwright:
        # headless=True: the browser runs without a visible window.
        chromium = playwright.chromium.launch(headless=True)
        tab = chromium.new_page()

        tab.goto('https://caigou.chinatelecom.com.cn/')
        # Let network traffic settle, then allow two more seconds for slow
        # asynchronous data.
        tab.wait_for_load_state('networkidle')
        tab.wait_for_timeout(2000)

        more_link = tab.get_by_text("更多").nth(0)
        more_link.click()
        tab.wait_for_load_state('networkidle')
        tab.wait_for_timeout(2000)

        tab.get_by_text("招标公告").click()
        tab.wait_for_timeout(2000)

        for row in tab.locator('.el-table__row').all():
            print(row.text_content())

        # Pause five seconds before shutting the browser down.
        tab.wait_for_timeout(5000)
        chromium.close()
|
||||
|
||||
# Run the crawl only when this file is executed directly, not when imported.
if __name__ == "__main__":
    crawl_vue_app()
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
import requests
|
||||
import time
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
BASE_URL = 'https://caigou.chinatelecom.com.cn'
|
||||
|
||||
def fetch_all_pages(*, page_size=10, sample_divisor=100, timeout=30):
    """Page through the announcement list endpoint and return the collected items.

    Args:
        page_size: Number of items requested per page.
        sample_divisor: Paging stops once the number of collected items
            reaches ``total / sample_divisor``, where ``total`` is the count
            reported by the server. The default of 100 stops at roughly 1% of
            the reported total; pass 1 to keep paging until everything has
            been collected.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the entries of ``data.pageInfo.list`` from every page
        that was fetched, in the order received.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
    # NOTE(review): advertising "br" relies on a Brotli decoder being
    # installed alongside requests; confirm, or drop Accept-Encoding so
    # requests only advertises what it can decode.
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
        'Cookie': 'Secure; sag_agent_cookie=',
    }

    all_data = []
    page = 1

    while True:
        print(f"正在获取第 {page} 页...")
        params = {
            "pageNum": page,
            "pageSize": page_size,
            "type": "e2no",
            "provinceCode": "",
            "noticeSummary": "",
        }
        # Without a timeout a stalled connection would block the loop forever.
        response = requests.post(
            list_url, json=params, headers=headers, timeout=timeout
        )
        response.raise_for_status()
        result = response.json()

        # `or` also covers explicit JSON nulls, which .get(key, default)
        # would pass through as None.
        data = result.get('data') or {}
        page_info = data.get('pageInfo') or {}
        items = page_info.get('list') or []
        total = page_info.get('total') or 0

        if not items:
            break

        all_data.extend(items)
        print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total} 条")

        if len(all_data) >= total / sample_divisor:
            break

        page += 1
        time.sleep(0.5)  # brief pause between page requests

    print(f"\n总共获取 {len(all_data)} 条数据")
    return all_data
|
||||
|
||||
# 详情链接
|
||||
#/DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
|
||||
# https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
|
||||
def get_detail(item, *, timeout=30):
    """Fetch one tender announcement and return its body as plain text.

    Args:
        item: A mapping from the list endpoint; its ``docId``,
            ``securityViewCode`` and ``docTitle`` values are sent to the
            detail endpoint.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``data.context`` field of the response with HTML removed by
        ``clean_html_tag``; an empty string when that field is missing or
        null.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Referer': 'https://caigou.chinatelecom.com.cn/',
    }
    params = {
        "type": "TenderAnnouncement",
        "id": item.get('docId'),
        "securityViewCode": item.get('securityViewCode'),
        "title": item.get('docTitle'),
    }
    response = requests.post(
        detail_url, headers=headers, json=params, timeout=timeout
    )
    response.raise_for_status()
    # A missing or null "data" object yields no content instead of an
    # AttributeError; clean_html_tag turns the resulting None into "".
    data = response.json().get('data') or {}
    return clean_html_tag(data.get('context'))
|
||||
|
||||
def clean_html_tag(html_text):
    """Strip HTML tags and collapse whitespace to single spaces.

    Args:
        html_text: HTML markup; ``None`` or an empty string is accepted.

    Returns:
        The text content of the markup with every run of whitespace
        (including newlines, carriage returns and tabs) replaced by one
        space and with leading/trailing whitespace removed. Returns ``""``
        for falsy input.
    """
    if not html_text:
        return ""
    text = BeautifulSoup(html_text, 'html.parser').get_text()
    # \s+ already matches \n, \r and \t, so no separate replace pass is needed.
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
# Guarded so that importing this module (e.g. to reuse clean_html_tag) does
# not start a network crawl; running the file as a script behaves as before.
if __name__ == "__main__":
    data = fetch_all_pages()
    for item in data:
        print(get_detail(item))
        time.sleep(0.5)  # brief pause between detail requests
|
||||
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
from playwright.sync_api import sync_playwright
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def crawl_vue_app():
    """Open the procurement site for a manual login and save the session state.

    Launches a visible Chromium window, navigates to
    https://caigou.chinatelecom.com.cn/ in a fresh browser context, blocks on
    console input until the user presses Enter, then writes that context's
    storage state to ``auth.json`` in the current working directory.
    """
    with sync_playwright() as playwright:
        # headless=False keeps the browser window visible so the user can log in.
        chromium = playwright.chromium.launch(headless=False)

        # NOTE(review): this page is not created from the context whose state
        # is saved below, so anything done on mail.qq.com presumably does not
        # end up in auth.json - confirm whether this visit is still needed.
        first_tab = chromium.new_page()
        first_tab.goto('https://mail.qq.com')

        session = chromium.new_context()
        login_tab = session.new_page()
        login_tab.goto('https://caigou.chinatelecom.com.cn/')

        input("请在浏览器中完成登录,然后按 Enter 继续...")
        session.storage_state(path='auth.json')
        print("登录状态已保存到 auth.json")
        chromium.close()
|
||||
|
||||
|
||||
# Run the login helper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    crawl_vue_app()
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 585 KiB |
Loading…
Reference in New Issue