commit a6075453248a4dc50db1ffd360682912018d2b6b
Author: mshe <666666666@666666666.666666666>
Date: Wed May 27 11:25:56 2026 +0800
first commit
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..a86e35a
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/learn-spider.iml b/.idea/learn-spider.iml
new file mode 100644
index 0000000..1e32a71
--- /dev/null
+++ b/.idea/learn-spider.iml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..f1fe18c
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..a16fa14
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
new file mode 100644
index 0000000..067e537
--- /dev/null
+++ b/.idea/workspace.xml
@@ -0,0 +1,136 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {
+ "associatedIndex": 5
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1778808412351
+
+
+ 1778808412351
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/__pycache__/app.cpython-39.pyc b/__pycache__/app.cpython-39.pyc
new file mode 100644
index 0000000..cb21463
Binary files /dev/null and b/__pycache__/app.cpython-39.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..5d20a01
--- /dev/null
+++ b/app.py
@@ -0,0 +1,12 @@
+from flask import Flask
+
+# Application object; the route decorator below registers views on it.
+app = Flask(__name__)
+
+
+@app.route("/")
+def hello_world():
+    """Answer requests for the root URL with a fixed plain-text greeting."""
+    return "Hello World!"
+
+
+if __name__ == "__main__":
+    # Start the server only when this file is executed as a script.
+    app.run()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f11f711
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,32 @@
+# HTTP请求
+requests==2.31.0
+httpx==0.27.0 # 支持HTTP/2,比requests更快
+
+# HTML解析
+beautifulsoup4==4.12.3
+lxml==5.1.0 # 更快的解析器,比html.parser快很多
+parsel==1.9.0 # Scrapy的解析库
+
+# 异步爬虫
+aiohttp==3.9.5 # 异步HTTP客户端
+aiofiles==23.2.1 # 异步文件操作
+
+# 模拟浏览器(对付Vue/React等SPA)
+playwright==1.42.0 # 推荐,现代浏览器自动化
+selenium==4.18.1 # 经典方案
+
+# 代理和反爬
+fake-useragent==1.5.1 # 随机User-Agent
+requests-html==0.10.0 # 支持JS渲染(基于pyppeteer)
+
+# 数据存储
+pymongo==4.6.1 # MongoDB
+redis==5.0.1 # Redis
+pymysql==1.1.0 # MySQL
+
+# 数据处理
+pandas==2.2.1 # 数据分析
+numpy==1.26.4 # 科学计算
+
+# 爬虫框架
+scrapy==2.11.1 # 重量级爬虫框架
\ No newline at end of file
diff --git a/spider/baidu.py b/spider/baidu.py
new file mode 100644
index 0000000..4fee651
--- /dev/null
+++ b/spider/baidu.py
@@ -0,0 +1,33 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+
+def crawl_vue_app(url='http://localhost:8080/', headless=False,
+                  screenshot_path='vue_app.png'):
+    """Render a client-side (Vue) page in Chromium and dump what it shows.
+
+    Prints the page title and the first 500 characters of the text found
+    inside ``<div id="app">`` (when that element exists), then saves a
+    screenshot of the page.
+
+    Args:
+        url: Page to open. Defaults to the local dev server.
+        headless: Launch Chromium without a visible window when True.
+        screenshot_path: File the screenshot is written to.
+    """
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=headless)
+        try:
+            page = browser.new_page()
+            page.goto(url)
+            # Wait for the network to go idle so async-loaded data is rendered.
+            page.wait_for_load_state('networkidle')
+            # Extra grace period in case some requests finish late.
+            page.wait_for_timeout(2000)
+
+            html = page.content()
+            title = page.title()
+            print(f"标题: {title}")
+
+            # Extract the text under the application's root element.
+            soup = BeautifulSoup(html, 'html.parser')
+            app_div = soup.find('div', id='app')
+            if app_div:
+                text = app_div.get_text(strip=True)
+                print(f"应用内容: {text[:500]}")
+
+            page.screenshot(path=screenshot_path)
+            print("截图已保存")
+        finally:
+            # Close the browser even if navigation or parsing raised.
+            browser.close()
+
+
+if __name__ == "__main__":
+    crawl_vue_app()
\ No newline at end of file
diff --git a/spider/china_net.py b/spider/china_net.py
new file mode 100644
index 0000000..3ffc08b
--- /dev/null
+++ b/spider/china_net.py
@@ -0,0 +1,48 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+
+def crawl_vue_app(url='https://caigou.chinatelecom.com.cn/', headless=True):
+    """Open the procurement portal, navigate to the tender list, print its rows.
+
+    The page is driven through a real browser: the first element with the
+    text "更多" is clicked, then the element with the text "招标公告", and
+    finally the text of every ``.el-table__row`` element is printed.
+
+    Args:
+        url: Portal landing page to start from.
+        headless: Launch Chromium without a visible window when True.
+
+    Returns:
+        list: The text content of each table row, in page order.
+    """
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=headless)
+        try:
+            page = browser.new_page()
+            page.goto(url)
+            # Let the initial requests finish, plus a short grace period.
+            page.wait_for_load_state('networkidle')
+            page.wait_for_timeout(2000)
+
+            # Several elements carry this text; the first one is used.
+            page.get_by_text("更多").nth(0).click()
+            page.wait_for_load_state('networkidle')
+            page.wait_for_timeout(2000)
+
+            page.get_by_text("招标公告").click()
+            # Give the table time to re-render after switching tabs.
+            page.wait_for_timeout(2000)
+
+            rows = [
+                row.text_content()
+                for row in page.locator('.el-table__row').all()
+            ]
+            for row_text in rows:
+                print(row_text)
+            return rows
+        finally:
+            # Close the browser even if a click or wait raised.
+            browser.close()
+
+
+if __name__ == "__main__":
+    crawl_vue_app()
\ No newline at end of file
diff --git a/spider/china_net_request.py b/spider/china_net_request.py
new file mode 100644
index 0000000..c94661a
--- /dev/null
+++ b/spider/china_net_request.py
@@ -0,0 +1,92 @@
+import requests
+import time
+import re
+from bs4 import BeautifulSoup
+
+# Scheme and host shared by every portal endpoint in this module.
+BASE_URL = 'https://caigou.chinatelecom.com.cn'
+
+
+def fetch_all_pages(page_size=10, limit_divisor=100, timeout=15):
+    """Page through the announcement list API and collect the returned items.
+
+    Args:
+        page_size: Number of records requested per page.
+        limit_divisor: Paging stops once ``total / limit_divisor`` items have
+            been collected, where ``total`` is the count reported by the
+            server. The default of 100 caps the run at 1% of the total; pass
+            1 to fetch every page. Must be non-zero.
+        timeout: Seconds to wait for each HTTP response.
+
+    Returns:
+        list: The accumulated entries of ``data.pageInfo.list``.
+
+    Raises:
+        requests.RequestException: On a timeout, a connection failure or an
+            HTTP error status.
+    """
+    list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew'
+    headers = {
+        'Content-Type': 'application/json',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': '*/*',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Referer': 'https://caigou.chinatelecom.com.cn/',
+        'Cookie': 'Secure; sag_agent_cookie='
+    }
+
+    all_data = []
+    page = 1
+
+    while True:
+        print(f"正在获取第 {page} 页...")
+        params = {
+            "pageNum": page,
+            "pageSize": page_size,
+            "type": "e2no",
+            "provinceCode": "",
+            "noticeSummary": ""
+        }
+        # Without a timeout a stalled connection would block forever.
+        response = requests.post(
+            list_url, json=params, headers=headers, timeout=timeout
+        )
+        response.raise_for_status()
+        result = response.json()
+        # `or` also covers explicit JSON nulls, which dict.get's default
+        # argument would pass through as None.
+        data = result.get('data') or {}
+        page_info = data.get('pageInfo') or {}
+        items = page_info.get('list') or []
+        total = page_info.get('total') or 0
+        if not items:
+            break
+        all_data.extend(items)
+        print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total} 条")
+        if len(all_data) >= total / limit_divisor:
+            break
+        page += 1
+        # Pause between pages to keep the request rate low.
+        time.sleep(0.5)
+    print(f"\n总共获取 {len(all_data)} 条数据")
+    return all_data
+
+# Detail link (example of the page URL shown in the browser):
+#/DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8
+# Endpoint that serves the detail body:
+# https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view
+def get_detail(item, timeout=15):
+    """Fetch one announcement's detail and return it as plain text.
+
+    Args:
+        item: A list entry (dict) providing ``docId``, ``securityViewCode``
+            and ``docTitle``.
+        timeout: Seconds to wait for the HTTP response.
+
+    Returns:
+        str: The ``data.context`` HTML with tags stripped and whitespace
+        collapsed; an empty string when the response carries no such field.
+
+    Raises:
+        requests.RequestException: On a timeout, a connection failure or an
+            HTTP error status.
+    """
+    detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view"
+    headers = {
+        'Content-Type': 'application/json',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': '*/*',
+        'Accept-Encoding': 'gzip, deflate, br',
+        'Accept-Language': 'zh-CN,zh;q=0.9',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'Referer': 'https://caigou.chinatelecom.com.cn/',
+    }
+    params = {
+        "type": "TenderAnnouncement",
+        "id": item.get('docId'),
+        "securityViewCode": item.get('securityViewCode'),
+        "title": item.get('docTitle')
+    }
+    response = requests.post(
+        detail_url, headers=headers, json=params, timeout=timeout
+    )
+    response.raise_for_status()
+    # A missing or null `data` must not raise AttributeError.
+    data = response.json().get('data') or {}
+    return clean_html_tag(data.get('context'))
+
+def clean_html_tag(html_text):
+    """Strip HTML tags and collapse all whitespace to single spaces.
+
+    Args:
+        html_text: HTML fragment; may be ``None`` or empty.
+
+    Returns:
+        str: The visible text with every run of whitespace (spaces, tabs,
+        newlines) replaced by one space and the ends trimmed; ``""`` for
+        falsy input.
+    """
+    if not html_text:
+        return ""
+    text = BeautifulSoup(html_text, 'html.parser').get_text()
+    # `\s+` already matches newlines, carriage returns and tabs, so no
+    # separate replacement of those characters is needed afterwards.
+    return re.sub(r'\s+', ' ', text).strip()
+
+# Driver: download the announcement list, then print each detail as text.
+data = fetch_all_pages()
+for item in data:
+    detail_text = get_detail(item)
+    print(detail_text)
+    # Short pause between detail requests.
+    time.sleep(0.5)
+
diff --git a/spider/mail_qq.py b/spider/mail_qq.py
new file mode 100644
index 0000000..580f4ab
--- /dev/null
+++ b/spider/mail_qq.py
@@ -0,0 +1,25 @@
+from playwright.sync_api import sync_playwright
+from bs4 import BeautifulSoup
+
+def crawl_vue_app(url='https://caigou.chinatelecom.com.cn/',
+                  state_path='auth.json'):
+    """Open a visible browser for a manual login and save the session state.
+
+    The user logs in by hand in the opened window and presses Enter in the
+    terminal; the context's storage state is then written to ``state_path``.
+
+    Args:
+        url: Page to open for the login.
+        state_path: File the storage state is written to.
+    """
+    with sync_playwright() as p:
+        # A visible window is required because the login is done by hand.
+        browser = p.chromium.launch(headless=False)
+        try:
+            # Only this context is saved below, so it is the only one opened.
+            # NOTE(review): the file name suggests mail.qq.com was the
+            # intended target - pass `url` explicitly if so.
+            context = browser.new_context()
+            page = context.new_page()
+            page.goto(url)
+
+            input("请在浏览器中完成登录,然后按 Enter 继续...")
+            context.storage_state(path=state_path)
+            print(f"登录状态已保存到 {state_path}")
+        finally:
+            # Close the browser even if navigation or saving raised.
+            browser.close()
+
+
+if __name__ == "__main__":
+    crawl_vue_app()
\ No newline at end of file
diff --git a/spider/vue_app.png b/spider/vue_app.png
new file mode 100644
index 0000000..6236991
Binary files /dev/null and b/spider/vue_app.png differ