当前位置: 首页 > news >正文

借助 GitHub Workflow 定时获取博客状态

由这篇博文启发,我们可以通过 GitHub 的工作流让代码跑在 GitHub 上,并将获得的数据存在仓库中。

首先是写 Python 脚本,这部分比较简单。不难找到几个关键的请求链接,它们都位于 https://www.cnblogs.com/XuYueming/ajax/ 之下,分别是 GetPostStat、blog-stats、news 和 sidecolumn.aspx。后三个用 requests 请求后用 BeautifulSoup 解析即可,GetPostStat 则直接返回 JSON。最后通过 json 保存下来。

"""Snapshot public cnblogs statistics for one blog into a timestamped JSON file.

Each run fetches four AJAX endpoints under ``BASE_URL`` and writes the
combined result to ``OUTPUT_PATH / cnblogs_snapshot_<UTC timestamp>.json``.
"""

import json
from datetime import datetime, UTC
from pathlib import Path

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.cnblogs.com/XuYueming/ajax"
OUTPUT_PATH = Path("data")
INTERESTED_BLOGS = [18313014, 18397758]
# Seconds to wait on any single HTTP request, so an unresponsive server
# cannot hang the (unattended) run forever.
REQUEST_TIMEOUT = 10


def _digits_to_int(text: str) -> int:
    """Return the integer formed by the digit characters of *text*, or 0 if none."""
    digits = "".join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else 0


def fetch_html(url: str) -> BeautifulSoup:
    """GET *url* and return the response body parsed with ``html.parser``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def extract_int_by_id(soup: BeautifulSoup, element_id: str) -> int:
    """Return the digits inside the element with id *element_id* as an int.

    Returns 0 when the element is missing or contains no digits.
    """
    el = soup.find(id=element_id)
    if not el:
        return 0
    return _digits_to_int(el.get_text(strip=True))


def fetch_blog_info(blog_ids: list[int]) -> list[dict]:
    """POST *blog_ids* to ``GetPostStat`` and return the decoded JSON reply.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    resp = requests.post(
        f"{BASE_URL}/GetPostStat",
        data=json.dumps(blog_ids),
        headers={"Content-Type": "application/json; charset=utf-8"},
        # The original call had no timeout, unlike fetch_html().
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return resp.json()


def fetch_blog_stats() -> dict:
    """Return post, article, comment and view counts from ``blog-stats``."""
    soup = fetch_html(f"{BASE_URL}/blog-stats")
    return {
        "post_count": extract_int_by_id(soup, "stats_post_count"),
        "article_count": extract_int_by_id(soup, "stats_article_count"),
        "comment_count": extract_int_by_id(soup, "stats-comment_count"),
        "view_count": extract_int_by_id(soup, "stats-total-view-count"),
    }


def _link_count(profile_div, css_class: str) -> int:
    """Return the number shown by the ``<a class=css_class>`` link, or 0.

    Only an all-digit link text is accepted; anything else yields 0.
    """
    tag = profile_div.find("a", class_=css_class)
    if not tag:
        return 0
    text = tag.get_text(strip=True)
    return int(text) if text.isdigit() else 0


def fetch_news() -> dict:
    """Return nickname, account age and fan/follow counts from ``news``.

    Falls back to empty strings and zeros when the profile block is absent.
    """
    soup = fetch_html(f"{BASE_URL}/news")
    profile_div = soup.find(id="profile_block")
    if not profile_div:
        return {"nickname": "", "join_age": "", "fans": 0, "follow": 0}

    # The first two links of the profile block are read as nickname and age.
    a_tags = profile_div.find_all("a")
    nickname = a_tags[0].get_text(strip=True) if len(a_tags) > 0 else ""
    join_age = a_tags[1].get_text(strip=True) if len(a_tags) > 1 else ""

    return {
        "nickname": nickname,
        "join_age": join_age,
        "fans": _link_count(profile_div, "follower-count"),
        # NOTE(review): "folowing-count" is spelled exactly as in the original
        # script; presumably it mirrors the class name in the served markup.
        # Verify against the live page before "correcting" it.
        "follow": _link_count(profile_div, "folowing-count"),
    }


def fetch_sidecolumn() -> dict:
    """Return the score/rank block of ``sidecolumn.aspx``.

    The result is ``{"score_rank": {"score": int, "rank": int}}``, or
    ``{"score_rank": {}}`` when the block is not present.
    """
    soup = fetch_html(f"{BASE_URL}/sidecolumn.aspx")

    # Only score/rank is collected. Earlier revisions also had (disabled)
    # parsers for #sidebar_recentposts, #sidebar_toptags, #sidebar_categories,
    # #sidebar_postarchive and #sidebar_recentcomments.
    score_rank = {}
    ul = soup.select_one("#sidebar_scorerank ul")
    if ul:
        score_li = ul.find("li", class_="liScore")
        rank_li = ul.find("li", class_="liRank")
        # _digits_to_int() yields 0 for digit-less text instead of raising
        # ValueError from int("").
        score_rank = {
            "score": _digits_to_int(score_li.get_text()) if score_li else 0,
            "rank": _digits_to_int(rank_li.get_text()) if rank_li else 0,
        }
    return {"score_rank": score_rank}


def main() -> None:
    """Fetch every data source and write one JSON snapshot under OUTPUT_PATH."""
    # One clock reading for both the payload and the file name, so the two
    # can never disagree.
    now = datetime.now(UTC)
    snapshot = {
        # isoformat() on an aware datetime already ends in "+00:00";
        # appending "Z" as before produced the invalid "...+00:00Z".
        "fetched_at": now.isoformat(),
        "blog_stats": fetch_blog_stats(),
        "news": fetch_news(),
        "sidecolumn": fetch_sidecolumn(),
        # NOTE(review): the trailing colon in this key looks like a typo; it
        # is kept so new snapshots stay comparable with ones already saved.
        "interested_blogs:": fetch_blog_info(INTERESTED_BLOGS),
    }

    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
    output_file = OUTPUT_PATH / f"cnblogs_snapshot_{now:%Y%m%d_%H%M%S}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(snapshot, f, ensure_ascii=False, indent=2)
    print("blog snapshot saved:", output_file)


if __name__ == "__main__":
    main()

保存的 json 长这样:

{"fetched_at": "2025-11-26T14:07:43.823701+00:00Z","blog_stats": {"post_count": 130,"article_count": 0,"comment_count": 76,"view_count": 10033},"news": {"nickname": "XuYueming","join_age": "1年9个月","fans": 24,"follow": 5},"sidecolumn": {"score_rank": {"score": 12067,"rank": 111361}},"interested_blogs:": [{"postId": 18313014,"viewCount": 704,"feedbackCount": 1,"diggCount": 10,"buryCount": 0},{"postId": 18397758,"viewCount": 1080,"feedbackCount": 15,"diggCount": 11,"buryCount": 0}]
}

然后我们要通过 GitHub workflow 把生成的数据提交到仓库。新建 .github/workflows/fetch_blog_stats.yml,里面就可以自定义流程了:

# Runs the snapshot script on a schedule and commits the new data file.
name: Fetch Blog Stats

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  schedule:
    # Once a day at 00:00 (GitHub evaluates schedule crons in UTC).
    - cron: '0 0 * * *'

jobs:
  run-python:
    runs-on: ubuntu-latest
    # The last step pushes a commit, so the job's GITHUB_TOKEN needs write
    # access to the repository contents.
    permissions:
      contents: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Keep the token in the local git config so `git push` can use it.
          persist-credentials: true

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run Python script
        run: python scripts/fetch_blog_stats.py

      - name: Commit and push changes
        run: |
          git config --global user.name "github-actions[bot]"
          git config --global user.email "github-actions[bot]@users.noreply.github.com"
          git add data/
          # Skip only when nothing is staged; a genuine commit or push
          # failure now fails the step instead of being reported as
          # "No changes to commit".
          if git diff --cached --quiet; then
            echo "No changes to commit"
          else
            git commit -m "chore: auto-update by workflow"
            git push origin HEAD
          fi

不要忘记在 settings 里把 Workflow permissions 改成 Read and write permissions,第一次忘开导致工作流失败了。

然后就结束了,可以在我的 github 仓库查看,该项目在 MIT 协议下开源。

http://www.zskr.cn/news/62769.html

相关文章:

  • 零九CDN — 国内网站加速 + DD/CC 攻击防御解决方案
  • 体育赛事多元竞技价值 球类奥运亚运实战提升
  • 2025 年美发会员管理软件最新推荐榜,技术实力与市场口碑深度解析的优质品牌合集理发店会员系统推荐
  • 2025年口碑好的湖北塑料桶实力厂家排行榜
  • 2025年口碑好、服务佳、资质全的食品包装设计专业公司推荐
  • 西城区离婚律师事务所推荐:专注婚姻家事法律服务机构盘点
  • 2025上海留学中介哪家靠谱
  • 2025 最新推荐!学术论文AI写作工具排行榜:核心优势与使用场景评测
  • 华为设备接入视频汇聚平台EasyCVR不上线问题排查实录
  • 治疗妇科炎症的药有哪些?科学认识与合理应对指南
  • 白带异常中成药有哪些?女性健康调理常用药物盘点
  • 【SQL练习】找出每一科都是前30%的学生ID
  • 2025年鸿容AI智能办公鼠标年度排名:深度测评5大AI鼠标
  • 1127
  • VMware Ubuntu虚拟机安装 备忘录
  • 2025 年 11 月常州宠物医院权威推荐榜:市区天宁区专业诊疗与暖心服务口碑之选
  • TikTok广告开户投放服务商TOP7实力榜单发布
  • 2025年现浇混凝土企业推荐,楼板现浇/现浇楼梯/现浇别墅搭建/现浇楼板/现浇钢筋混凝土楼梯/现浇混凝土公司哪个好哪家好
  • Actix-Web中间件开发
  • 磨砂膏里的颗粒会伤害鸡皮肤吗?2025年安全评测与产品推荐
  • F037 vue+neo4j 编程语言知识图谱可视化分析系统vue+flask+neo4j - 指南
  • 监控摄像头方案商推荐:涂鸦智能如何破解企业转型痛点
  • 涂鸦智能:窗帘电机的智能解决方案,助力企业抢占市场先机
  • Qt 获取容器Vector中的最大值和最小值
  • vue3项目解析token信息
  • 2025中国AI云计算TOP10权威推荐榜单!靠谱智算云厂商推荐榜!
  • 交叉编译hostop
  • EF Core 深入学习
  • 2025年五大靠谱纸桶包装设备制造商推荐,专业纸桶包装设备厂
  • 2025年深圳USB充电器外壳厂家推荐:安全环保充电器外壳厂