commit a6075453248a4dc50db1ffd360682912018d2b6b Author: mshe <666666666@666666666.666666666> Date: Wed May 27 11:25:56 2026 +0800 first commit diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..a86e35a --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,14 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/learn-spider.iml b/.idea/learn-spider.iml new file mode 100644 index 0000000..1e32a71 --- /dev/null +++ b/.idea/learn-spider.iml @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..f1fe18c --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..a16fa14 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000..067e537 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,136 @@ + + + + + + + + + + + { + "associatedIndex": 5 +} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1778808412351 + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/__pycache__/app.cpython-39.pyc b/__pycache__/app.cpython-39.pyc new file mode 100644 index 0000000..cb21463 Binary files /dev/null and b/__pycache__/app.cpython-39.pyc differ diff --git a/app.py b/app.py new file mode 100644 index 0000000..5d20a01 --- /dev/null +++ b/app.py @@ -0,0 +1,12 @@ +from flask import Flask + +app = Flask(__name__) + + +@app.route('/') +def hello_world(): # put application's code here + return 'Hello World!' + + +if __name__ == '__main__': + app.run() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f11f711 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,32 @@ +# HTTP请求 +requests==2.31.0 +httpx==0.27.0 # 支持HTTP/2,比requests更快 + +# HTML解析 +beautifulsoup4==4.12.3 +lxml==5.1.0 # 更快的解析器,比html.parser快很多 +parsel==1.9.0 # Scrapy的解析库 + +# 异步爬虫 +aiohttp==3.9.5 # 异步HTTP客户端 +aiofiles==23.2.1 # 异步文件操作 + +# 模拟浏览器(对付Vue/React等SPA) +playwright==1.42.0 # 推荐,现代浏览器自动化 +selenium==4.18.1 # 经典方案 + +# 代理和反爬 +fake-useragent==1.5.1 # 随机User-Agent +requests-html==0.10.0 # 支持JS渲染(基于pyppeteer) + +# 数据存储 +pymongo==4.6.1 # MongoDB +redis==5.0.1 # Redis +pymysql==1.1.0 # MySQL + +# 数据处理 +pandas==2.2.1 # 数据分析 +numpy==1.26.4 # 科学计算 + +# 爬虫框架 +scrapy==2.11.1 # 重量级爬虫框架 \ No newline at end of file diff --git a/spider/baidu.py b/spider/baidu.py new file mode 100644 index 0000000..4fee651 --- /dev/null +++ b/spider/baidu.py @@ -0,0 +1,33 @@ +from playwright.sync_api import sync_playwright +from bs4 import BeautifulSoup + +def crawl_vue_app(): + with sync_playwright() as p: + # 启动浏览器(headless=False 可以看到浏览器窗口) + browser = p.chromium.launch(headless=False) + page = browser.new_page() + # 访问页面 + page.goto('http://localhost:8080/') + # 等待页面加载完成 + page.wait_for_load_state('networkidle') + # 额外的等待时间(如果某些异步数据加载较慢) + page.wait_for_timeout(2000) # 等待2秒 + # 获取页面内容 + html = page.content() + title = page.title() + print(f"标题: {title}") + # 解析内容 + soup = BeautifulSoup(html, 'html.parser') + # 获取 Vue 应用的根元素内的文本 + app_div = soup.find('div', id='app') + if app_div: + # 获取所有可见文本 + text = app_div.get_text(strip=True) + print(f"应用内容: {text[:500]}") + # 截图 + page.screenshot(path='vue_app.png') + print("截图已保存") + browser.close() + +if __name__ == "__main__": + crawl_vue_app() \ No newline at end of file diff --git a/spider/china_net.py b/spider/china_net.py new file mode 100644 index 0000000..3ffc08b --- /dev/null +++ b/spider/china_net.py @@ -0,0 +1,48 @@ +from playwright.sync_api import sync_playwright +from bs4 import BeautifulSoup + +def crawl_vue_app(): + with sync_playwright() as p: + # 启动浏览器(headless=False 可以看到浏览器窗口) + browser = p.chromium.launch(headless=True) + page = browser.new_page() + # 访问页面 + page.goto('https://caigou.chinatelecom.com.cn/') + # 等待页面加载完成 + page.wait_for_load_state('networkidle') + # 额外的等待时间(如果某些异步数据加载较慢) + page.wait_for_timeout(2000) # 等待2秒 + page.get_by_text("更多").nth(0).click() + page.wait_for_load_state('networkidle') + page.wait_for_timeout(2000) # 等待2秒 + page.get_by_text("招标公告").click() + page.wait_for_timeout(2000) # 等待2秒 + items = page.locator('.el-table__row').all() + for item in items: + print(item.text_content()) + + # # 获取页面内容 + # html = page.content() + # title = page.title() + + # print(f"标题: {title}") + + # # 解析内容 + # soup = BeautifulSoup(html, 'html.parser') + + # # 获取 Vue 应用的根元素内的文本 + # app_div = soup.find('div', id='app') + # if app_div: + # # 获取所有可见文本 + # text = app_div.get_text(strip=True) + # print(f"应用内容: {text}") + # # print(f"应用内容: {text[:500]}") + + # 截图 + # page.screenshot(path='vue_app.png') + # print("截图已保存") + page.wait_for_timeout(5000) # 等待2秒 + browser.close() + +if __name__ == "__main__": + crawl_vue_app() \ No newline at end of file diff --git a/spider/china_net_request.py b/spider/china_net_request.py new file mode 100644 index 0000000..c94661a --- /dev/null +++ b/spider/china_net_request.py @@ -0,0 +1,92 @@ +import requests +import time +import re +from bs4 import BeautifulSoup + +BASE_URL = 'https://caigou.chinatelecom.com.cn' + +def fetch_all_pages(): + list_url = f'{BASE_URL}/portal/base/announcementJoin/queryListNew''' + headers = { + 'Content-Type': 'application/json', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive', + 'Referer': 'https://caigou.chinatelecom.com.cn/', + 'Cookie': 'Secure; sag_agent_cookie=' + } + + all_data = [] + page = 1 + page_size = 10 + + while True: + print(f"正在获取第 {page} 页...") + params = { + "pageNum": page, + "pageSize": page_size, + "type":"e2no", + "provinceCode":"", + "noticeSummary":"" + } + response = requests.post(list_url, json=params, headers=headers) + result = response.json() + data = result.get('data', {}) + pageInfo = data.get('pageInfo', {}) + items = pageInfo.get('list', []) + total = pageInfo.get('total', 0) + if not items: + break + all_data.extend(items) + print(f" 获取 {len(items)} 条,累计 {len(all_data)} 条, 总共 {total} 条") + if len(all_data) >= total /100: + break + page += 1 + time.sleep(0.5) + print(f"\n总共获取 {len(all_data)} 条数据") + return all_data + +# 详情链接 +#/DeclareDetails?id=177118231561666&type=1&docTypeCode=TenderAnnouncement&securityViewCode=2f06d88f0032ae9e828be0f7767674c8 +# https://caigou.chinatelecom.com.cn/portal/base/tenderannouncement/view +def get_detail(item): + detail_url = f"{BASE_URL}/portal/base/tenderannouncement/view" + headers = { + 'Content-Type': 'application/json', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive', + 'Referer': 'https://caigou.chinatelecom.com.cn/', + } + params = { + "type":"TenderAnnouncement", + "id":item.get('docId'), + "securityViewCode": item.get('securityViewCode'), + "title": item.get('docTitle') + } + response = requests.post(detail_url, headers=headers, json=params) + context = response.json().get('data').get('context') + return clean_html_tag(context) + +def clean_html_tag(html_text): + """综合清理 HTML 标签、多余空格和换行""" + if not html_text: + return "" + soup = BeautifulSoup(html_text, 'html.parser') + text = soup.get_text() + text = re.sub(r'\s+', ' ', text) + text = text.strip() + text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') + return text + +data = fetch_all_pages() +for item in data: + print(get_detail(item)) + time.sleep(0.5) + diff --git a/spider/mail_qq.py b/spider/mail_qq.py new file mode 100644 index 0000000..580f4ab --- /dev/null +++ b/spider/mail_qq.py @@ -0,0 +1,25 @@ +from playwright.sync_api import sync_playwright +from bs4 import BeautifulSoup + +def crawl_vue_app(): + with sync_playwright() as p: + # 启动浏览器(headless=False 可以看到浏览器窗口) + browser = p.chromium.launch(headless=False) + page = browser.new_page() + # 访问页面 + page.goto('https://mail.qq.com') + + context = browser.new_context() + page = context.new_page() + + # 访问页面 + page.goto('https://caigou.chinatelecom.com.cn/') + + input("请在浏览器中完成登录,然后按 Enter 继续...") + context.storage_state(path='auth.json') + print("登录状态已保存到 auth.json") + browser.close() + + +if __name__ == "__main__": + crawl_vue_app() \ No newline at end of file diff --git a/spider/vue_app.png b/spider/vue_app.png new file mode 100644 index 0000000..6236991 Binary files /dev/null and b/spider/vue_app.png differ