feat: 实现微信公众号新闻和视频同步服务
- 使用 draft API 同步文章(适配个人订阅号) - 使用 material API 同步视频(含详情获取) - 自动建表(videos)、UPSERT 已有 articles 表 - 同步删除:微信端删除的素材自动从数据库移除 - APScheduler 定时调度,支持 --once 手动触发 - Docker + docker-compose 部署配置 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
159
sync.py
Normal file
159
sync.py
Normal file
@@ -0,0 +1,159 @@
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from db import Database
|
||||
from wechat import WeChatClient, WeChatError
|
||||
|
||||
logger = logging.getLogger(__name__)


class SyncService:
    """Mirror WeChat Official Account drafts and video materials into the DB.

    The service only orchestrates: all WeChat traffic goes through the
    ``wechat`` client and all persistence through the ``db`` object passed
    to the constructor.

    Attributes:
        page_size: Number of items requested per paginated list call.
        page_delay: Seconds to sleep between consecutive pages.
    """

    page_size = 20
    page_delay = 0.5

    def __init__(self, wechat: "WeChatClient", db: "Database"):
        self.wechat = wechat
        self.db = db

    def run_sync(self) -> None:
        """Run one full sync pass: articles ("news") first, then videos.

        Each material type is synced independently. A failure in one is
        logged and recorded in the sync state, and never prevents the next
        type from running or propagates to the caller.
        """
        logger.info("=== Sync started ===")
        for material_type in ("news", "video"):
            sync_key = f"wechat_{material_type}_sync"
            try:
                self.db.update_sync_state(sync_key, {"status": "syncing", "count": 0})
                if material_type == "news":
                    count = self._sync_published_articles()
                else:
                    count = self._sync_materials(material_type)
                self.db.update_sync_state(sync_key, {
                    "status": "idle",
                    "count": count,
                    "last_sync": datetime.now(tz=timezone.utc).isoformat(),
                })
                logger.info("Sync %s completed, %d items processed", material_type, count)
            except Exception as e:
                logger.error("Sync %s failed: %s", material_type, e, exc_info=True)
                self._record_sync_error(sync_key, e)
        logger.info("=== Sync finished ===")

    def _record_sync_error(self, sync_key: str, error: Exception) -> None:
        """Persist an "error" sync state without letting a DB failure escape."""
        try:
            self.db.update_sync_state(sync_key, {
                "status": "error",
                "error": str(error),
                "last_sync": datetime.now(tz=timezone.utc).isoformat(),
            })
        except Exception:
            # The database may be the very thing that failed; recording the
            # error is best-effort so the remaining sync types still run.
            logger.exception("Could not record error state for %s", sync_key)

    # --- Articles (draft API) ---

    def _sync_published_articles(self) -> int:
        """Sync articles using the draft API (personal subscription account).

        Pages through every draft, upserts its articles, then asks the
        database to delete articles whose draft was not seen in this pass.

        Returns:
            Number of drafts processed without error.
        """
        offset = 0
        processed = 0
        seen_media_ids = set()

        while True:
            batch = self.wechat.batch_get_drafts(
                offset=offset, count=self.page_size, no_content=0
            )
            items = batch.get("item") or []
            if not items:
                break
            total = batch.get("total_count", 0)

            for item in items:
                try:
                    # Record the id before syncing so a draft that fails to
                    # upsert is still treated as present and not deleted.
                    seen_media_ids.add(item["media_id"])
                    self._sync_draft_item(item)
                    processed += 1
                except Exception as e:
                    logger.error("Error processing draft %s: %s",
                                 item.get("media_id", "?"), e)

            # Fall back to the real page length so a missing or zero
            # item_count can never leave the offset stuck on the same page.
            offset += batch.get("item_count") or len(items)
            if offset >= total:
                break
            time.sleep(self.page_delay)

        # Delete articles no longer in drafts.
        # NOTE(review): draft media_ids are passed here while articles are
        # stored under "<media_id>_<idx>" keys; confirm that
        # delete_missing_articles matches on the media_id prefix.
        self.db.delete_missing_articles(seen_media_ids)
        return processed

    @staticmethod
    def _clip_or_none(value, limit: int):
        """Return ``value`` truncated to ``limit`` characters, or None if empty."""
        return value[:limit] if value else None

    def _sync_draft_item(self, item: dict) -> None:
        """Upsert one article row per news item contained in a draft.

        Args:
            item: One entry of the draft list; must contain "media_id".

        Raises:
            KeyError: If ``item`` has no "media_id".
        """
        media_id = item["media_id"]
        update_time = item.get("update_time", 0)
        # The draft's update time (taken as a UTC calendar date) is shared by
        # every article in the draft, so compute it once.
        publish_date = (
            datetime.fromtimestamp(update_time, tz=timezone.utc).date()
            if update_time else None
        )
        # Tolerate explicit nulls as well as missing keys.
        news_items = (item.get("content") or {}).get("news_item") or []

        for idx, news in enumerate(news_items):
            self.db.upsert_article({
                "wechat_article_id": f"{media_id}_{idx}",
                "title": (news.get("title") or "")[:200],
                "content": news.get("content", ""),
                "cover_url": self._clip_or_none(news.get("thumb_url"), 500),
                "author": self._clip_or_none(news.get("author"), 100),
                "publish_date": publish_date,
                "source_url": self._clip_or_none(news.get("url"), 1000),
            })

    # --- Materials (video) ---

    def _sync_materials(self, material_type: str) -> int:
        """Sync permanent materials of ``material_type`` via the material API.

        Only "video" items are persisted; other types are counted only.

        Args:
            material_type: Material kind; also selects the
                "<material_type>_count" key of the material-count response.

        Returns:
            Number of items processed without error.
        """
        counts = self.wechat.get_material_count()
        total = counts.get(f"{material_type}_count", 0)
        logger.info("Total %s materials: %d", material_type, total)

        if total == 0:
            # NOTE(review): the deletion pass is skipped here, so rows are
            # not removed when WeChat reports zero materials. Confirm this
            # is the intended safeguard.
            return 0

        offset = 0
        processed = 0
        seen_media_ids = set()
        listing_complete = True

        while offset < total:
            batch = self.wechat.batch_get_materials(
                material_type, offset, count=self.page_size
            )
            items = batch.get("item") or []
            if not items:
                # The listing ran dry before the reported total was reached.
                # Stop instead of re-requesting the same offset forever.
                logger.warning(
                    "%s listing ended at offset %d of %d; skipping deletion pass",
                    material_type, offset, total,
                )
                listing_complete = False
                break

            for item in items:
                try:
                    seen_media_ids.add(item["media_id"])
                    if material_type == "video":
                        self._sync_video_item(item)
                    processed += 1
                except Exception as e:
                    logger.error("Error processing %s item %s: %s",
                                 material_type, item.get("media_id", "?"), e)

            # Fall back to the real page length so the offset always advances.
            offset += batch.get("item_count") or len(items)
            if offset < total:
                time.sleep(self.page_delay)

        # Delete items that exist in DB but no longer on WeChat. This runs
        # only when the listing was complete, so a truncated listing cannot
        # remove rows that still exist remotely.
        if listing_complete and material_type == "video":
            self.db.delete_missing_videos(seen_media_ids)

        return processed

    def _sync_video_item(self, item: dict) -> None:
        """Upsert one video, fetching its detail record when the DB asks for it.

        Args:
            item: One entry of the material list; must contain "media_id".

        Raises:
            KeyError: If ``item`` has no "media_id".
        """
        media_id = item["media_id"]
        update_time = item.get("update_time", 0)
        wechat_time = (
            datetime.fromtimestamp(update_time, tz=timezone.utc)
            if update_time else None
        )

        # NOTE(review): the detail fields stay None when the detail fetch is
        # skipped or fails; confirm that upsert_video does not overwrite
        # previously stored values with None.
        video = {
            "media_id": media_id,
            "name": item.get("name", ""),
            "url": item.get("url", ""),
            "title": None,
            "description": None,
            "down_url": None,
            "wechat_update_time": wechat_time,
        }

        if self.db.should_fetch_video_detail(media_id, update_time):
            try:
                detail = self.wechat.get_material(media_id)
                video["title"] = detail.get("title")
                video["description"] = detail.get("description")
                video["down_url"] = detail.get("down_url")
            except WeChatError as e:
                # Detail is optional: keep the list-level fields and move on.
                logger.warning("Failed to fetch video detail %s: %s", media_id, e)

        self.db.upsert_video(video)
|
||||
Reference in New Issue
Block a user