From 4c33d5d01999b7e5089944b3b858cedaee258910 Mon Sep 17 00:00:00 2001 From: Zongxia Li Date: Sun, 26 Apr 2026 22:50:21 -0400 Subject: [PATCH 1/6] feat(news): add GitHub fetcher and icons --- public/icons/news/github-white.svg | 1 + public/icons/news/github.svg | 1 + server/scripts/research-news/search_github.py | 491 ++++++++++++++++++ 3 files changed, 493 insertions(+) create mode 100644 public/icons/news/github-white.svg create mode 100644 public/icons/news/github.svg create mode 100755 server/scripts/research-news/search_github.py diff --git a/public/icons/news/github-white.svg b/public/icons/news/github-white.svg new file mode 100644 index 00000000..f606ce01 --- /dev/null +++ b/public/icons/news/github-white.svg @@ -0,0 +1 @@ + diff --git a/public/icons/news/github.svg b/public/icons/news/github.svg new file mode 100644 index 00000000..3fde195f --- /dev/null +++ b/public/icons/news/github.svg @@ -0,0 +1 @@ + diff --git a/server/scripts/research-news/search_github.py b/server/scripts/research-news/search_github.py new file mode 100755 index 00000000..47ecef81 --- /dev/null +++ b/server/scripts/research-news/search_github.py @@ -0,0 +1,491 @@ +#!/usr/bin/env python3 +""" +GitHub repository news search. + +Surfaces: +1. Trending repositories (daily / weekly / monthly), via the unofficial trending + API mirror with a fallback to the official Search API sorted by stars. +2. Newly created repositories matching the user's research domains, via the + GitHub Search API (created:>YYYY-MM-DD). + +Auth: +- Optional `GITHUB_TOKEN` (or `GH_TOKEN`) env var lifts the search rate limit + from 10 req/min (unauth) to 30 req/min (auth). Functional without a token. + +Output JSON shape mirrors search_arxiv.py / search_huggingface.py so the +existing UI pipeline picks it up unchanged. 
+""" + +from __future__ import annotations + +import argparse +import json +import logging +import math +import os +import ssl +import sys +import time +import urllib.parse +import urllib.request +from datetime import datetime, timedelta, timezone +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger(__name__) + +try: + import requests + HAS_REQUESTS = True +except ImportError: + HAS_REQUESTS = False + +try: + import certifi + CERTIFI_CA_BUNDLE = certifi.where() +except ImportError: + CERTIFI_CA_BUNDLE = None + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from scoring_utils import ( + SCORE_MAX, + calculate_relevance_score, + calculate_recency_score, + calculate_quality_score, + calculate_recommendation_score, +) + +# --------------------------------------------------------------------------- +# Config +# --------------------------------------------------------------------------- +GITHUB_API = "https://api.github.com" +TRENDING_MIRROR = "https://ghapi.huchen.dev/repositories" + +# 1000+ stars on a freshly created repo is exceptional → max popularity score. 
+GH_STARS_FULL_SCORE = 1000 + + +def get_token() -> Optional[str]: + return os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN") or None + + +def build_ssl_context() -> ssl.SSLContext: + if CERTIFI_CA_BUNDLE and os.path.exists(CERTIFI_CA_BUNDLE): + return ssl.create_default_context(cafile=CERTIFI_CA_BUNDLE) + return ssl.create_default_context() + + +def http_get_json( + url: str, + headers: Optional[Dict[str, str]] = None, + timeout: int = 30, +): + headers = headers or {} + if HAS_REQUESTS: + kwargs = {"headers": headers, "timeout": timeout} + if CERTIFI_CA_BUNDLE and os.path.exists(CERTIFI_CA_BUNDLE): + kwargs["verify"] = CERTIFI_CA_BUNDLE + resp = requests.get(url, **kwargs) + resp.raise_for_status() + return resp.json() + + req = urllib.request.Request(url, headers=headers) + with urllib.request.urlopen(req, timeout=timeout, context=build_ssl_context()) as resp: + return json.loads(resp.read().decode("utf-8")) + + +# --------------------------------------------------------------------------- +# Fetchers +# --------------------------------------------------------------------------- +def gh_search_repos( + query: str, + max_pages: int = 1, + per_page: int = 25, + sort: str = "stars", +) -> List[Dict]: + """Search GitHub repositories via /search/repositories.""" + headers = { + "Accept": "application/vnd.github+json", + "User-Agent": "ResearchNews-GitHubFetcher/1.0", + "X-GitHub-Api-Version": "2022-11-28", + } + token = get_token() + if token: + headers["Authorization"] = f"Bearer {token}" + + items: List[Dict] = [] + for page in range(1, max_pages + 1): + params = urllib.parse.urlencode({ + "q": query, + "sort": sort, + "order": "desc", + "per_page": per_page, + "page": page, + }) + url = f"{GITHUB_API}/search/repositories?{params}" + try: + data = http_get_json(url, headers=headers) + page_items = data.get("items", []) or [] + items.extend(page_items) + if len(page_items) < per_page: + break + except Exception as exc: + logger.warning("[GH] search page 
%d failed: %s", page, exc) + break + return items + + +def fetch_trending(since: str = "weekly", language: Optional[str] = None) -> List[Dict]: + """Fetch GitHub trending repos. Falls back to recent star-sorted search.""" + since = since if since in {"daily", "weekly", "monthly"} else "weekly" + + params = {"since": since} + if language: + params["language"] = language + url = TRENDING_MIRROR + "?" + urllib.parse.urlencode(params) + + try: + data = http_get_json( + url, + headers={"User-Agent": "ResearchNews-GitHubFetcher/1.0"}, + timeout=15, + ) + if isinstance(data, list) and data: + logger.info("[GH] trending mirror returned %d repos", len(data)) + return data + logger.info("[GH] trending mirror returned empty payload, falling back to search") + except Exception as exc: + logger.info("[GH] trending mirror unavailable (%s), falling back to search", exc) + + days = {"daily": 2, "weekly": 7, "monthly": 30}.get(since, 7) + since_date = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d") + fallback_query = f"created:>{since_date}" + if language: + fallback_query += f" language:{language}" + return gh_search_repos(fallback_query, max_pages=1, per_page=25, sort="stars") + + +# --------------------------------------------------------------------------- +# Normalization +# --------------------------------------------------------------------------- +def _parse_iso(value: str) -> Optional[datetime]: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")) + except (TypeError, ValueError): + return None + + +def normalize_search_repo(item: Dict) -> Dict: + pushed_at = item.get("pushed_at") or item.get("created_at") or "" + owner = item.get("owner") or {} + license_obj = item.get("license") or {} + return { + "id": item.get("full_name") or item.get("name") or item.get("html_url") or "", + "title": item.get("full_name") or item.get("name") or "", + "summary": item.get("description") or "", + "authors_str": 
owner.get("login", ""), + "published": pushed_at, + "published_date": _parse_iso(pushed_at), + "stars": item.get("stargazers_count") or 0, + "forks": item.get("forks_count") or 0, + "watchers": item.get("watchers_count") or 0, + "language": item.get("language") or "", + "topics": item.get("topics") or [], + "categories": item.get("topics") or [], + "html_url": item.get("html_url") or "", + "owner_avatar": owner.get("avatar_url") or "", + "license": license_obj.get("spdx_id") or None, + "mode": "search", + } + + +def normalize_trending_repo(item: Dict) -> Dict: + """Normalize a repo entry returned by the trending mirror.""" + if "stargazers_count" in item or "owner" in item: + return normalize_search_repo(item) + + name = item.get("name") or "" + author = item.get("author") or "" + full_name = f"{author}/{name}" if author and name else (name or item.get("repo") or "") + return { + "id": full_name, + "title": full_name, + "summary": item.get("description") or "", + "authors_str": author, + "published": "", + "published_date": None, + "stars": item.get("stars") or item.get("currentPeriodStars") or 0, + "forks": item.get("forks") or 0, + "watchers": 0, + "language": item.get("language") or "", + "topics": [], + "categories": [], + "html_url": item.get("url") or item.get("html_url") or ( + f"https://github.com/{full_name}" if full_name else "" + ), + "owner_avatar": item.get("avatar") or "", + "license": None, + "mode": "trending", + } + + +# --------------------------------------------------------------------------- +# Scoring +# --------------------------------------------------------------------------- +def calculate_popularity_stars(stars: int) -> float: + """Logarithmic popularity from star count. 
1000+ stars = max.""" + if stars <= 0: + return 0.0 + if stars >= GH_STARS_FULL_SCORE: + return SCORE_MAX + # log-scale so 10/100/1000 map to ~0.33/0.66/1.0 of SCORE_MAX + return min( + math.log10(stars + 1) / math.log10(GH_STARS_FULL_SCORE) * SCORE_MAX, + SCORE_MAX, + ) + + +def score_repos( + repos: List[Dict], + config: Optional[Dict], +) -> Tuple[List[Dict], int]: + domains = (config or {}).get("research_domains", {}) + excluded = (config or {}).get("excluded_keywords", []) + has_domains = bool(domains) + + scored: List[Dict] = [] + filtered = 0 + + for repo in repos: + if has_domains: + relevance, matched_domain, matched_keywords = calculate_relevance_score( + { + "title": repo["title"], + "summary": repo["summary"], + "categories": repo["categories"], + }, + domains, + excluded, + ) + if relevance == 0: + # Trending repos are interesting even if they don't match a + # domain — keep them with a soft floor so they rank lower. + if repo.get("mode") == "trending": + relevance = 0.5 + matched_domain = "trending" + matched_keywords = [] + else: + filtered += 1 + continue + else: + relevance = 1.5 + matched_domain = "trending" if repo.get("mode") == "trending" else "github" + matched_keywords = [] + + recency = calculate_recency_score(repo.get("published_date")) + popularity = calculate_popularity_stars(repo.get("stars", 0)) + quality = calculate_quality_score(repo.get("summary", "")) + + # Light bonuses for curated metadata. 
+ if repo.get("topics"): + quality = min(quality + 0.3, SCORE_MAX) + if repo.get("license"): + quality = min(quality + 0.2, SCORE_MAX) + + final_score = calculate_recommendation_score( + relevance, recency, popularity, quality + ) + + scored.append({ + "id": repo["id"], + "title": repo["title"], + "authors": repo.get("authors_str", ""), + "abstract": repo.get("summary", "") or "(no description)", + "published": repo.get("published", ""), + "categories": (repo.get("topics") or [])[:5], + "relevance_score": round(relevance, 2), + "recency_score": round(recency, 2), + "popularity_score": round(popularity, 2), + "quality_score": round(quality, 2), + "final_score": final_score, + "matched_domain": matched_domain, + "matched_keywords": matched_keywords, + "link": repo.get("html_url", ""), + "source": "github", + "engagement": { + "likes": repo.get("stars", 0), + "comments": repo.get("forks", 0), + }, + # GitHub-specific extras (consumed by NewsItemCard's GH branch) + "stars": repo.get("stars", 0), + "forks": repo.get("forks", 0), + "language": repo.get("language", ""), + "license": repo.get("license"), + "owner_avatar": repo.get("owner_avatar", ""), + "mode": repo.get("mode", "search"), + }) + + scored.sort(key=lambda x: x["final_score"], reverse=True) + return scored, filtered + + +# --------------------------------------------------------------------------- +# Query construction +# --------------------------------------------------------------------------- +def build_search_queries(config: Optional[Dict]) -> List[str]: + """Build GitHub search queries from research_domains. + + Returns one query per domain. Each query OR-combines that domain's top + keywords. Falls back to a sensible LLM/AI default when no domains exist. 
+ """ + if not config: + return ["llm OR transformer OR foundation-model"] + + domains = config.get("research_domains") or {} + queries: List[str] = [] + for _name, dom in domains.items(): + kws = dom.get("keywords") or [] + if not kws: + continue + # GitHub treats space-separated terms as AND; OR them explicitly. + clauses = [] + for kw in kws[:4]: + kw_clean = kw.strip() + if not kw_clean: + continue + clauses.append(f'"{kw_clean}"' if " " in kw_clean else kw_clean) + if clauses: + queries.append(" OR ".join(clauses)) + if not queries: + queries = ["llm OR transformer OR foundation-model"] + return queries + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +def main() -> int: + parser = argparse.ArgumentParser( + description="Fetch and score trending / recent GitHub repositories." + ) + parser.add_argument("--config", type=str, default=None, + help="Path to research interests JSON config") + parser.add_argument("--output", type=str, required=True, + help="Output JSON file path") + parser.add_argument("--top-n", type=int, default=10, + help="Number of top repos to return") + parser.add_argument("--language", type=str, default="", + help="Optional language filter (e.g. 
python, typescript)") + parser.add_argument("--time-window", type=str, default="weekly", + choices=["daily", "weekly", "monthly"], + help="Trending time window") + parser.add_argument("--include-trending", type=str, default="true", + help="Whether to include trending repos (true/false)") + parser.add_argument("--max-search-pages", type=int, default=1, + help="Pages of search results per query (≤3 recommended)") + args = parser.parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", + stream=sys.stderr, + ) + + config: Optional[Dict] = None + if args.config: + try: + with open(args.config, "r", encoding="utf-8-sig") as f: + config = json.load(f) + logger.info("[GH] loaded config from %s", args.config) + except Exception as exc: + logger.warning("[GH] failed to load config %s: %s", args.config, exc) + + language = (args.language or (config or {}).get("language") or "").strip() or None + time_window = args.time_window + include_trending = (args.include_trending or "true").lower() in {"true", "1", "yes", "on"} + + if get_token(): + logger.info("[GH] using GITHUB_TOKEN for higher rate limits") + else: + logger.info("[GH] no GITHUB_TOKEN set — using unauthenticated rate limits") + + repos: List[Dict] = [] + seen_ids: set = set() + + # 1) Trending repos + if include_trending: + logger.info( + "[GH] fetching trending (since=%s, language=%s)", + time_window, language or "any", + ) + trending_raw = fetch_trending(since=time_window, language=language) + for entry in trending_raw: + r = normalize_trending_repo(entry) + if r["id"] and r["id"] not in seen_ids: + seen_ids.add(r["id"]) + repos.append(r) + logger.info("[GH] trending: %d repos collected", len(repos)) + + # 2) Per-domain search for newly created repos + queries = build_search_queries(config) + days_recent = 30 + since_date = (datetime.now(timezone.utc) - timedelta(days=days_recent)).strftime("%Y-%m-%d") + + for q in queries: + full_q = q + 
if language: + full_q += f" language:{language}" + full_q += f" created:>{since_date}" + logger.info("[GH] search: %s", full_q) + try: + search_items = gh_search_repos( + full_q, + max_pages=max(1, args.max_search_pages), + per_page=20, + sort="stars", + ) + except Exception as exc: + logger.warning("[GH] search failed: %s", exc) + search_items = [] + added = 0 + for item in search_items: + r = normalize_search_repo(item) + if r["id"] and r["id"] not in seen_ids: + seen_ids.add(r["id"]) + repos.append(r) + added += 1 + logger.info("[GH] +%d new repos", added) + # Be polite to the API, especially without a token. + time.sleep(1.0 if not get_token() else 0.4) + + scored, filtered = score_repos(repos, config) + logger.info( + "[GH] scored %d repos (%d filtered out)", len(scored), filtered + ) + top = scored[: args.top_n] + + output = { + "top_papers": top, + "total_found": len(repos), + "total_filtered": filtered, + "search_date": datetime.now().strftime("%Y-%m-%d"), + } + + with open(args.output, "w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=2, default=str) + + logger.info("[GH] saved %d repos to %s", len(top), args.output) + for i, r in enumerate(top, 1): + logger.info( + " %d. 
%s ⭐%s (score %s)", + i, r["title"][:60], r["stars"], r["final_score"], + ) + + print(json.dumps(output, ensure_ascii=True, indent=2, default=str)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From e738372400a51a627744b8ca8ef41fdfd6578536 Mon Sep 17 00:00:00 2001 From: Zongxia Li Date: Sun, 26 Apr 2026 22:50:26 -0400 Subject: [PATCH 2/6] feat(news): expand HuggingFace fetcher to models/datasets/spaces --- .../research-news/search_huggingface.py | 359 ++++++++++++++++-- 1 file changed, 318 insertions(+), 41 deletions(-) diff --git a/server/scripts/research-news/search_huggingface.py b/server/scripts/research-news/search_huggingface.py index 0e62be3a..f401abcb 100644 --- a/server/scripts/research-news/search_huggingface.py +++ b/server/scripts/research-news/search_huggingface.py @@ -48,9 +48,29 @@ # HuggingFace API configuration # --------------------------------------------------------------------------- HF_DAILY_PAPERS_URL = "https://huggingface.co/api/daily_papers" +HF_MODELS_URL = "https://huggingface.co/api/models" +HF_DATASETS_URL = "https://huggingface.co/api/datasets" +HF_SPACES_URL = "https://huggingface.co/api/spaces" -# Popularity: 50+ upvotes = max score (SCORE_MAX) +# Popularity: 50+ upvotes = max score (SCORE_MAX) for papers HF_UPVOTES_FULL_SCORE = 50 +# Popularity: 200+ likes = max score for repos (models/datasets/spaces) +HF_REPO_LIKES_FULL_SCORE = 200 +# Popularity: 100k+ downloads contributes a small bonus +HF_DOWNLOADS_FULL_SCORE = 100_000 + +VALID_MODES = ("papers", "models", "datasets", "spaces") + + +def hf_auth_headers(extra: Optional[Dict[str, str]] = None) -> Dict[str, str]: + """Build request headers, attaching `HF_TOKEN` / `HUGGINGFACE_TOKEN` if set.""" + headers = {"User-Agent": "ResearchNews-HFFetcher/1.0"} + if extra: + headers.update(extra) + token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") + if token: + headers["Authorization"] = f"Bearer {token}" + return headers def build_ssl_context() -> 
ssl.SSLContext: @@ -318,6 +338,208 @@ def score_papers( return scored, total_filtered +# --------------------------------------------------------------------------- +# HuggingFace Hub repos (models / datasets / spaces) +# --------------------------------------------------------------------------- +def fetch_hub_repos(kind: str, limit: int = 50, sort: str = "likes7d") -> List[Dict]: + """ + Fetch entries from the HuggingFace Hub API for a given repo kind. + + Args: + kind: One of "models", "datasets", "spaces". + limit: Number of entries to request. + sort: "likes7d" (trending), "likes", "downloads", "lastModified". + + Returns: + List of raw JSON entries (empty on failure). + """ + if kind not in {"models", "datasets", "spaces"}: + return [] + url = { + "models": HF_MODELS_URL, + "datasets": HF_DATASETS_URL, + "spaces": HF_SPACES_URL, + }[kind] + params = urllib.parse.urlencode({ + "sort": sort, + "direction": "-1", + "limit": limit, + "full": "true", + }) + try: + data = http_get_json(f"{url}?{params}", headers=hf_auth_headers(), timeout=30) + if isinstance(data, list): + logger.info("[HF/%s] fetched %d entries", kind, len(data)) + return data + return [] + except Exception as exc: + logger.warning("[HF/%s] fetch failed: %s", kind, exc) + return [] + + +def normalize_hub_repo(entry: Dict, kind: str) -> Optional[Dict]: + """Normalize a HF Hub repo (model/dataset/space) into the internal shape.""" + repo_id = entry.get("id") or entry.get("modelId") + if not repo_id: + return None + + author = entry.get("author") or (repo_id.split("/")[0] if "/" in repo_id else "") + summary = entry.get("description") or "" + likes = entry.get("likes", 0) or 0 + downloads = entry.get("downloads", 0) or 0 + pipeline_tag = entry.get("pipeline_tag", "") or "" + tags = entry.get("tags", []) or [] + + last_modified = entry.get("lastModified") or entry.get("createdAt") or "" + last_dt = None + if last_modified: + try: + last_dt = datetime.fromisoformat(last_modified.replace("Z", "+00:00")) 
+ except (ValueError, TypeError): + pass + + if kind == "models": + link = f"https://huggingface.co/{repo_id}" + else: + link = f"https://huggingface.co/{kind}/{repo_id}" + + # Build a friendly category list: pipeline tag first, then a few tags + # (skip noisy license: prefixes). + categories: List[str] = [] + if pipeline_tag: + categories.append(pipeline_tag) + for t in tags: + if not t or ":" in t: + continue + if t in categories: + continue + categories.append(t) + if len(categories) >= 6: + break + + return { + "id": f"{kind}:{repo_id}", + "title": repo_id, + "summary": summary, + "authors_str": author, + "published": last_modified, + "published_date": last_dt, + # We reuse "upvotes" so it flows through the existing scoring path. + "upvotes": likes, + "downloads": downloads, + "thumbnail": "", + "num_comments": 0, + "submitted_by_name": author, + "submitted_by_avatar": "", + "organization": author, + "categories": categories, + "tags": tags, + "pipeline_tag": pipeline_tag, + "kind": kind, + "link": link, + "source": "huggingface", + } + + +def calculate_repo_popularity_score(likes: int, downloads: int = 0) -> float: + """Combine HF Hub likes (primary) + downloads (bonus) into a 0..SCORE_MAX score.""" + if likes <= 0 and downloads <= 0: + return 0.0 + likes_part = min(likes / HF_REPO_LIKES_FULL_SCORE * SCORE_MAX, SCORE_MAX) + download_part = 0.0 + if downloads > 0: + import math as _math + download_part = min( + _math.log10(downloads + 1) / _math.log10(HF_DOWNLOADS_FULL_SCORE) * 1.5, + 1.5, + ) + # 70% likes, plus a small download bonus, capped at SCORE_MAX + return min(likes_part * 0.7 + download_part, SCORE_MAX) + + +def score_hub_repos( + repos: List[Dict], + config: Optional[Dict], + kind: str, +) -> Tuple[List[Dict], int]: + """ + Score HF Hub repos. If config has research_domains, repos with relevance 0 + are kept with a soft floor (these are trending entries; we don't want to + drop everything just because the user's keywords are narrow). 
+ """ + domains = (config or {}).get("research_domains", {}) + excluded_keywords = (config or {}).get("excluded_keywords", []) + has_domains = bool(domains) + + scored: List[Dict] = [] + + for repo in repos: + if has_domains: + relevance, matched_domain, matched_keywords = calculate_relevance_score( + { + "title": repo["title"], + "summary": repo["summary"], + "categories": repo["categories"], + }, + domains, + excluded_keywords, + ) + if relevance == 0: + relevance = 0.5 + matched_domain = f"hf_{kind}" + matched_keywords = [] + else: + relevance = 1.0 + matched_domain = f"hf_{kind}" + matched_keywords = [] + + recency = calculate_recency_score(repo.get("published_date")) + popularity = calculate_repo_popularity_score( + repo.get("upvotes", 0), repo.get("downloads", 0), + ) + quality = calculate_quality_score(repo.get("summary", "")) + if repo.get("pipeline_tag"): + quality = min(quality + 0.3, SCORE_MAX) + + final_score = calculate_recommendation_score( + relevance, recency, popularity, quality, + ) + + repo_id_clean = repo["id"].split(":", 1)[-1] + scored.append({ + "id": repo["id"], + "title": repo["title"], + "authors": repo.get("authors_str", ""), + "abstract": repo.get("summary") or f"{kind.capitalize()} on Hugging Face Hub.", + "published": repo.get("published", ""), + "categories": repo.get("categories", []), + "relevance_score": round(relevance, 2), + "recency_score": round(recency, 2), + "popularity_score": round(popularity, 2), + "quality_score": round(quality, 2), + "final_score": final_score, + "matched_domain": matched_domain, + "matched_keywords": matched_keywords, + "link": repo["link"], + "source": "huggingface", + "media_urls": [], + "engagement": { + "likes": repo.get("upvotes", 0), + "comments": 0, + "downloads": repo.get("downloads", 0), + }, + "submitted_by": repo.get("submitted_by_name", ""), + "organization": repo.get("organization", ""), + # HF Hub-specific extras (consumed by the HF card branch) + "kind": kind, + "pipeline_tag": 
repo.get("pipeline_tag", ""), + "downloads": repo.get("downloads", 0), + "hub_id": repo_id_clean, + }) + + return scored, 0 + + def main(): """Main entry point.""" import argparse @@ -349,6 +571,21 @@ def main(): default=10, help="Number of top papers to return", ) + parser.add_argument( + "--modes", + type=str, + default="papers", + help=( + "Comma-separated list of HF Hub modes to fetch. " + "Valid: papers, models, datasets, spaces. Default: papers." + ), + ) + parser.add_argument( + "--per-mode-limit", + type=int, + default=40, + help="Number of raw entries to fetch per non-paper mode before scoring.", + ) args = parser.parse_args() @@ -359,78 +596,118 @@ def main(): stream=sys.stderr, ) - # Config is optional — without it we show all daily papers + # Config is optional — without it we show all daily papers / trending repos config = None if args.config: logger.info("Loading config from: %s", args.config) config = load_research_config(args.config) else: - logger.info("No config provided — showing all HuggingFace Daily Papers") + logger.info("No config provided — using trending defaults") + + # Parse and validate modes + requested_modes = [ + m.strip().lower() for m in (args.modes or "papers").split(",") if m.strip() + ] + modes = [m for m in requested_modes if m in VALID_MODES] + if not modes: + modes = ["papers"] + logger.info("[HF] active modes: %s", ", ".join(modes)) + + if os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN"): + logger.info("[HF] using HF token for higher rate limits") + + aggregated: List[Dict] = [] + total_found = 0 + total_filtered = 0 + seen_ids: set = set() + + # ---- Papers (Daily Papers API) ---- + if "papers" in modes: + logger.info("[HF/papers] fetching Daily Papers...") + raw_entries = fetch_daily_papers() + papers = [] + for entry in raw_entries: + normalized = normalize_paper(entry) + if normalized: + papers.append(normalized) + logger.info( + "[HF/papers] normalized %d papers from %d raw entries", + len(papers), 
len(raw_entries), + ) + scored_papers, paper_filtered = score_papers(papers, config) + total_found += len(papers) + total_filtered += paper_filtered + for p in scored_papers: + if p["id"] in seen_ids: + continue + seen_ids.add(p["id"]) + # Tag papers with kind so the UI can render uniformly. + p.setdefault("kind", "papers") + aggregated.append(p) + logger.info( + "[HF/papers] kept %d (filtered %d)", + len(scored_papers), paper_filtered, + ) - # Fetch daily papers from HuggingFace - logger.info("Fetching HuggingFace Daily Papers...") - raw_entries = fetch_daily_papers() + # ---- Hub repos (models / datasets / spaces) ---- + for kind in ("models", "datasets", "spaces"): + if kind not in modes: + continue + raw_repos = fetch_hub_repos(kind, limit=args.per_mode_limit, sort="likes7d") + normalized_repos: List[Dict] = [] + for entry in raw_repos: + r = normalize_hub_repo(entry, kind) + if r: + normalized_repos.append(r) + scored_repos, _ = score_hub_repos(normalized_repos, config, kind) + total_found += len(normalized_repos) + for r in scored_repos: + if r["id"] in seen_ids: + continue + seen_ids.add(r["id"]) + aggregated.append(r) + logger.info( + "[HF/%s] kept %d repos", kind, len(scored_repos), + ) - if not raw_entries: - logger.warning("No papers returned from HuggingFace API") + if not aggregated: + logger.warning("[HF] no entries collected across modes: %s", modes) output = { "top_papers": [], "total_found": 0, "total_filtered": 0, "search_date": datetime.now().strftime("%Y-%m-%d"), + "modes": modes, } with open(args.output, "w", encoding="utf-8") as f: json.dump(output, f, ensure_ascii=False, indent=2) print(json.dumps(output, ensure_ascii=True, indent=2)) return 0 - # Normalize entries - papers = [] - for entry in raw_entries: - normalized = normalize_paper(entry) - if normalized: - papers.append(normalized) - - logger.info("Normalized %d papers from %d raw entries", len(papers), len(raw_entries)) + aggregated.sort(key=lambda x: x.get("final_score", 0), 
reverse=True) + top = aggregated[: args.top_n] - # Score (and optionally filter if config has domains) - scored_papers, total_filtered = score_papers(papers, config) - - logger.info( - "Scored %d papers (%d filtered out by relevance/exclusion)", - len(scored_papers), - total_filtered, - ) - - # Take top N - top_papers = scored_papers[: args.top_n] - - # Build output output = { - "top_papers": top_papers, - "total_found": len(papers), + "top_papers": top, + "total_found": total_found, "total_filtered": total_filtered, "search_date": datetime.now().strftime("%Y-%m-%d"), + "modes": modes, } - # Save to file with open(args.output, "w", encoding="utf-8") as f: json.dump(output, f, ensure_ascii=False, indent=2, default=str) logger.info("Results saved to: %s", args.output) - logger.info("Top %d papers:", len(top_papers)) - for i, p in enumerate(top_papers, 1): + logger.info("Top %d entries (modes=%s):", len(top), modes) + for i, p in enumerate(top, 1): + kind_tag = p.get("kind", "papers") logger.info( - " %d. %s... (Score: %s, Upvotes-based popularity: %s)", - i, - p["title"][:60], - p["final_score"], - p["popularity_score"], + " %d. [%s] %s... 
(Score: %s, Pop: %s)", + i, kind_tag, p["title"][:54], p["final_score"], p["popularity_score"], ) - # Also output to stdout print(json.dumps(output, ensure_ascii=True, indent=2, default=str)) - return 0 From 8b6fa83ffc98a193cad41a08bfaea20d0ff61dd9 Mon Sep 17 00:00:00 2001 From: Zongxia Li Date: Sun, 26 Apr 2026 22:50:30 -0400 Subject: [PATCH 3/6] feat(news): wire GitHub/HF/WeChat sources in UI, routes, and i18n --- public/icons/news/wechat.svg | 1 + server/routes/news.js | 120 ++++- server/scripts/research-news/search_wechat.py | 500 ++++++++++++++++++ .../news-dashboard/view/NewsDashboard.tsx | 10 +- .../news-dashboard/view/NewsItemCard.tsx | 296 ++++++++++- .../news-dashboard/view/SourceFilterBar.tsx | 8 +- .../news-dashboard/view/SourceIcon.tsx | 2 + .../view/SourceSettingsDialog.tsx | 257 +++++++++ .../news-dashboard/view/UnifiedFeed.tsx | 8 + .../view/useNewsDashboardData.ts | 4 +- src/i18n/locales/en/news.json | 72 ++- src/i18n/locales/ko/news.json | 72 ++- src/i18n/locales/zh-CN/news.json | 72 ++- 13 files changed, 1397 insertions(+), 25 deletions(-) create mode 100644 public/icons/news/wechat.svg create mode 100644 server/scripts/research-news/search_wechat.py diff --git a/public/icons/news/wechat.svg b/public/icons/news/wechat.svg new file mode 100644 index 00000000..0a9b32a4 --- /dev/null +++ b/public/icons/news/wechat.svg @@ -0,0 +1 @@ + diff --git a/server/routes/news.js b/server/routes/news.js index 1c4e23cb..4902c561 100644 --- a/server/routes/news.js +++ b/server/routes/news.js @@ -172,13 +172,19 @@ const SOURCE_REGISTRY = { requiresCredentials: false, }, huggingface: { - label: 'HuggingFace Daily Papers', + label: 'HuggingFace', script: 'research-news/search_huggingface.py', configFile: 'news-config-huggingface.json', resultsFile: 'news-results-huggingface.json', defaultConfig: { research_domains: {}, top_n: 30, + // Comma-separated list of HF Hub modes to fetch. + // Valid: papers, models, datasets, spaces. 
+ modes: 'papers,models,datasets,spaces', + per_mode_limit: 40, + // Optional HuggingFace token (hf_xxx). Overrides HF_TOKEN env var when set. + api_token: '', }, requiresCredentials: false, }, @@ -219,6 +225,56 @@ const SOURCE_REGISTRY = { }, requiresCredentials: false, }, + github: { + label: 'GitHub', + script: 'research-news/search_github.py', + configFile: 'news-config-github.json', + resultsFile: 'news-results-github.json', + defaultConfig: { + research_domains: { + 'Large Language Models': { + keywords: ['llm', 'large language model', 'transformer', 'foundation model'], + arxiv_categories: [], + priority: 5, + }, + 'AI Agents': { + keywords: ['agent', 'autonomous', 'multi-agent', 'orchestration'], + arxiv_categories: [], + priority: 4, + }, + }, + top_n: 12, + // GitHub-specific + language: '', // optional: python, typescript, ... + time_window: 'weekly', // daily | weekly | monthly + include_trending: true, + max_search_pages: 1, + // Optional GitHub token (ghp_xxx). Overrides GITHUB_TOKEN env var when set. + api_token: '', + }, + requiresCredentials: false, + }, + wechat: { + label: 'WeChat 公众号', + script: 'research-news/search_wechat.py', + configFile: 'news-config-wechat.json', + resultsFile: 'news-results-wechat.json', + defaultConfig: { + research_domains: {}, + top_n: 12, + // RSSHub instance — public default, configurable in Settings. + instance_url: 'https://rsshub.app', + // Comma-separated WeChat 公众号 routes/IDs. Examples: + // wechat/ce/huxiu_com + // https://rsshub.app/wechat/ce/ifanr + // huxiu_com (bare ID → wechat/ce/) + accounts: '', + // Optional ?key=... for private RSSHub instances. + access_key: '', + per_account_limit: 20, + }, + requiresCredentials: false, + }, }; async function ensureDataDir() { @@ -373,12 +429,74 @@ async function handleSearch(sourceName, req, res) { args.push('--keywords', config.keywords); } + if (sourceName === 'huggingface') { + const modes = (typeof config.modes === 'string' && config.modes.trim()) + ? 
config.modes.trim() + : 'papers'; + args.push('--modes', modes); + if (Number.isFinite(config.per_mode_limit) && config.per_mode_limit > 0) { + args.push('--per-mode-limit', String(config.per_mode_limit)); + } + } + + if (sourceName === 'github') { + if (typeof config.language === 'string' && config.language.trim()) { + args.push('--language', config.language.trim()); + } + const timeWindow = ['daily', 'weekly', 'monthly'].includes(config.time_window) + ? config.time_window + : 'weekly'; + args.push('--time-window', timeWindow); + args.push('--include-trending', config.include_trending === false ? 'false' : 'true'); + if (Number.isFinite(config.max_search_pages) && config.max_search_pages > 0) { + args.push('--max-search-pages', String(Math.min(3, config.max_search_pages))); + } + } + + if (sourceName === 'wechat') { + const instance = (typeof config.instance_url === 'string' && config.instance_url.trim()) + ? config.instance_url.trim() + : 'https://rsshub.app'; + args.push('--instance', instance); + + // Accounts may be stored as a string (comma/newline-separated) or array. + let accountsList = []; + if (Array.isArray(config.accounts)) { + accountsList = config.accounts.map((a) => String(a).trim()).filter(Boolean); + } else if (typeof config.accounts === 'string') { + accountsList = config.accounts + .split(/[\n,]/) + .map((a) => a.trim()) + .filter(Boolean); + } + if (accountsList.length > 0) { + args.push('--accounts', accountsList.join(',')); + } + + if (typeof config.access_key === 'string' && config.access_key.trim()) { + args.push('--access-key', config.access_key.trim()); + } + if (Number.isFinite(config.per_account_limit) && config.per_account_limit > 0) { + args.push('--per-account-limit', String(config.per_account_limit)); + } + } + // Build env — pass credentials if required. 
// Strip __PYVENV_LAUNCHER__ so uv-installed Python CLIs invoked by the // search scripts find the correct stdlib (macOS Python framework sets this // variable and it confuses child interpreters with a different version). const env = { ...process.env }; delete env.__PYVENV_LAUNCHER__; + + // UI-supplied API tokens override env vars. Stored in plain JSON config + // (same trust model as the rest of news settings), and only applied to the + // child process — never echoed back over the API. + if (sourceName === 'github' && typeof config.api_token === 'string' && config.api_token.trim()) { + env.GITHUB_TOKEN = config.api_token.trim(); + } + if (sourceName === 'huggingface' && typeof config.api_token === 'string' && config.api_token.trim()) { + env.HF_TOKEN = config.api_token.trim(); + } if (entry.requiresCredentials) { try { const credValue = credentialsDb.getActiveCredential(req.user.id, entry.credentialType); diff --git a/server/scripts/research-news/search_wechat.py b/server/scripts/research-news/search_wechat.py new file mode 100644 index 00000000..38744211 --- /dev/null +++ b/server/scripts/research-news/search_wechat.py @@ -0,0 +1,500 @@ +#!/usr/bin/env python3 +""" +WeChat 公众号 news search via RSSHub (or any RSS instance). + +Fetches RSS feeds from a configurable RSSHub instance, normalizes WeChat +公众号 articles into the same shape used by the rest of the news pipeline, +and scores them against the user's research_domains. + +Why RSSHub: +- WeChat doesn't expose a public read API for articles you don't own. +- RSSHub is open source, self-hostable, and exposes WeChat 公众号 articles + as standard RSS/Atom — no QR scan, no cookie scraping. +- The instance URL is user-configurable, so users can point at their own + Docker-hosted RSSHub when the public one is rate-limited. 
+
+Usage:
+    python search_wechat.py \\
+        --instance https://rsshub.app \\
+        --accounts wechat/ce/huxiu_com,wechat/ce/ifanr \\
+        --config research_interests.json \\
+        --output wechat_results.json \\
+        --top-n 12
+
+Routes:
+    Each entry in --accounts is either:
+    - A relative RSSHub path: wechat/ce/huxiu_com
+    - A full URL: https://rsshub.app/wechat/ce/huxiu_com
+    - A bare ID: huxiu_com (treated as wechat/ce/huxiu_com)
+
+Auth:
+    --access-key foo → appended as ?key=foo to every request (some private
+    RSSHub instances require this).
+"""
+
+from __future__ import annotations
+
+import argparse
+import html as html_mod
+import json
+import logging
+import os
+import re
+import ssl
+import sys
+import time
+import urllib.parse
+import urllib.request
+from datetime import datetime, timezone
+from typing import Dict, List, Optional, Tuple
+from xml.etree import ElementTree as ET
+
+logger = logging.getLogger(__name__)
+
+# requests and certifi are optional; the urllib / system-CA code paths below
+# are used when they are not installed.
+try:
+    import requests
+    HAS_REQUESTS = True
+except ImportError:
+    HAS_REQUESTS = False
+
+try:
+    import certifi
+    CERTIFI_CA_BUNDLE = certifi.where()
+except ImportError:
+    CERTIFI_CA_BUNDLE = None
+
+# Make the sibling scoring_utils module importable when run as a script.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from scoring_utils import (
+    SCORE_MAX,
+    calculate_relevance_score,
+    calculate_recency_score,
+    calculate_quality_score,
+    calculate_recommendation_score,
+)
+
+DEFAULT_INSTANCE = "https://rsshub.app"
+DEFAULT_TIMEOUT = 25
+USER_AGENT = "ResearchNews-WeChatFetcher/1.0 (RSSHub-compatible)"
+TAG_RE = re.compile(r"<[^>]+>")
+WHITESPACE_RE = re.compile(r"\s+")
+
+
+# ---------------------------------------------------------------------------
+# HTTP
+# ---------------------------------------------------------------------------
+def build_ssl_context() -> ssl.SSLContext:
+    """
+    Build the TLS context used by the urllib fallback.
+
+    Uses the certifi CA bundle when certifi is installed and the bundle file
+    exists; otherwise returns the interpreter's default context.
+    """
+    if CERTIFI_CA_BUNDLE and os.path.exists(CERTIFI_CA_BUNDLE):
+        return ssl.create_default_context(cafile=CERTIFI_CA_BUNDLE)
+    return ssl.create_default_context()
+
+
+def http_get_text(
+    url: str,
+    headers: Optional[Dict[str, str]] =
None,
+    timeout: int = DEFAULT_TIMEOUT,
+) -> str:
+    """
+    Fetch ``url`` and return the response body decoded to text.
+
+    Uses ``requests`` when it is importable, otherwise falls back to urllib.
+    A default User-Agent is always sent; caller-supplied ``headers`` override
+    it. HTTP errors and timeouts propagate to the caller as exceptions.
+    """
+    headers = {"User-Agent": USER_AGENT, **(headers or {})}
+    if HAS_REQUESTS:
+        kwargs = {"headers": headers, "timeout": timeout}
+        if CERTIFI_CA_BUNDLE and os.path.exists(CERTIFI_CA_BUNDLE):
+            kwargs["verify"] = CERTIFI_CA_BUNDLE
+        resp = requests.get(url, **kwargs)
+        resp.raise_for_status()
+        # RSSHub returns UTF-8 by default. requests reports ISO-8859-1 for
+        # text/* responses that declare no charset, which garbles Chinese
+        # text, so in that case (or when no encoding is known) use the
+        # detected encoding instead of the header-derived one.
+        if not resp.encoding or resp.encoding.lower() == "iso-8859-1":
+            resp.encoding = resp.apparent_encoding or "utf-8"
+        return resp.text
+
+    req = urllib.request.Request(url, headers=headers)
+    with urllib.request.urlopen(req, timeout=timeout, context=build_ssl_context()) as resp:
+        raw = resp.read()
+        # get_content_charset() parses the Content-Type parameters properly
+        # (quoted values, mixed case) instead of splitting the header by hand.
+        encoding = resp.headers.get_content_charset() or "utf-8"
+    try:
+        return raw.decode(encoding, errors="replace")
+    except LookupError:
+        # The server declared a charset Python does not know; fall back to
+        # UTF-8 rather than failing the whole feed.
+        return raw.decode("utf-8", errors="replace")
+
+
+# ---------------------------------------------------------------------------
+# Route normalization
+# ---------------------------------------------------------------------------
+def normalize_account_to_url(account: str, instance: str, access_key: str = "") -> Optional[str]:
+    """
+    Resolve a user-provided account spec to a fully-qualified URL.
+
+    Returns None when the spec is empty, whitespace-only, or only slashes.
+
+    Examples:
+        ("wechat/ce/huxiu_com", "https://rsshub.app", "")
+            → "https://rsshub.app/wechat/ce/huxiu_com"
+        ("https://rsshub.app/wechat/ce/huxiu_com", ...)
+            → unchanged
+        ("huxiu_com", ...)
+            → "https://rsshub.app/wechat/ce/huxiu_com" (bare-ID heuristic)
+    """
+    # Strip before the emptiness check so a whitespace-only spec is rejected
+    # instead of turning into "<instance>/wechat/ce/".
+    account = (account or "").strip()
+    if not account:
+        return None
+
+    instance = (instance or DEFAULT_INSTANCE).rstrip("/")
+
+    if account.startswith(("http://", "https://")):
+        url = account
+    else:
+        path = account.lstrip("/")
+        if not path:
+            # The spec consisted only of slashes — there is no route to fetch.
+            return None
+        # Bare ID heuristic: no slash → assume wechat/ce/<id> (chuansongme proxy,
+        # the most stable RSSHub WeChat route as of 2026).
+        if "/" not in path:
+            path = f"wechat/ce/{path}"
+        url = f"{instance}/{path}"
+
+    if access_key:
+        sep = "&" if "?" in url else "?"
+        url = f"{url}{sep}key={urllib.parse.quote(access_key, safe='')}"
+    return url
+
+
+# ---------------------------------------------------------------------------
+# RSS parsing
+# ---------------------------------------------------------------------------
+ATOM_NS = "{http://www.w3.org/2005/Atom}"
+DC_NS = "{http://purl.org/dc/elements/1.1/}"
+CONTENT_NS = "{http://purl.org/rss/1.0/modules/content/}"
+
+# A handful of tolerated date formats; RFC 822 covered by parsedate_to_datetime,
+# everything else handled below.
+ISO_FORMATS = [
+    "%Y-%m-%dT%H:%M:%S%z",
+    "%Y-%m-%dT%H:%M:%SZ",
+    "%Y-%m-%d %H:%M:%S",
+    "%Y-%m-%d",
+]
+
+
+def _parse_date(value: str) -> Optional[datetime]:
+    """
+    Parse a feed timestamp into a timezone-aware datetime.
+
+    Tries, in order: ISO 8601 via ``datetime.fromisoformat``, RFC 822 via
+    ``email.utils.parsedate_to_datetime``, then the ``ISO_FORMATS`` strptime
+    patterns. A result without an offset is treated as UTC, so every parsing
+    path returns an aware value (previously only the RFC 822 path did, and
+    mixing naive with aware datetimes raises TypeError on comparison or
+    subtraction).
+
+    Returns None for empty or unparseable input.
+    """
+    if not value:
+        return None
+    value = value.strip()
+    if not value:
+        return None
+
+    parsed: Optional[datetime] = None
+
+    try:
+        # Python ISO parser handles "2026-04-26T12:34:56+08:00" since 3.11
+        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
+    except (TypeError, ValueError):
+        pass
+
+    if parsed is None:
+        # RFC 822 (e.g. "Sat, 26 Apr 2026 12:34:56 +0800")
+        try:
+            from email.utils import parsedate_to_datetime
+            parsed = parsedate_to_datetime(value)
+        except (TypeError, ValueError):
+            pass
+
+    if parsed is None:
+        for fmt in ISO_FORMATS:
+            try:
+                parsed = datetime.strptime(value, fmt)
+                break
+            except ValueError:
+                continue
+
+    if parsed is not None and parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=timezone.utc)
+    return parsed
+
+
+def _strip_html(text: str, max_len: int = 320) -> str:
+    """
+    Reduce an HTML fragment to a plain-text excerpt.
+
+    Tags are replaced by spaces, HTML entities are decoded, whitespace runs
+    collapse to one space, and the result is cut to at most ``max_len``
+    characters (the last one being "…" when truncation happened).
+    """
+    if not text:
+        return ""
+    no_tags = TAG_RE.sub(" ", text)
+    decoded = html_mod.unescape(no_tags)
+    cleaned = WHITESPACE_RE.sub(" ", decoded).strip()
+    if len(cleaned) > max_len:
+        cleaned = cleaned[: max_len - 1].rstrip() + "…"
+    return cleaned
+
+
+def parse_rss(xml_text: str) -> Tuple[str, List[Dict]]:
+    """
+    Parse RSS 2.0 or Atom feed. Returns (channel_title, list_of_entries).
+    Each entry: {title, link, summary, published, published_date, author}.
+
+    Returns ("", []) when the XML cannot be parsed or the document is
+    neither RSS (a <channel> child of the root) nor Atom (<feed> root).
+    """
+    try:
+        # NOTE(security): xml.etree is not hardened against maliciously
+        # constructed XML (e.g. entity-expansion bombs). The feed comes from a
+        # user-configured instance URL; consider defusedxml if that instance
+        # cannot be trusted.
+        root = ET.fromstring(xml_text)
+    except ET.ParseError as exc:
+        logger.warning("[WeChat] feed parse error: %s", exc)
+        return "", []
+
+    entries: List[Dict] = []
+
+    # RSS 2.0
+    channel = root.find("channel")
+    if channel is not None:
+        channel_title = (channel.findtext("title") or "").strip()
+        for item in channel.findall("item"):
+            title = (item.findtext("title") or "").strip()
+            link = (item.findtext("link") or "").strip()
+            description = item.findtext("description") or ""
+            content_encoded = item.findtext(f"{CONTENT_NS}encoded") or ""
+            # Prefer the full body when the feed provides content:encoded.
+            body = content_encoded or description
+            pub = item.findtext("pubDate") or item.findtext(f"{DC_NS}date") or ""
+            author = (
+                item.findtext("author")
+                or item.findtext(f"{DC_NS}creator")
+                or channel_title
+            ).strip()
+            entries.append({
+                "title": title,
+                "link": link,
+                "summary": _strip_html(body),
+                "published": pub,
+                "published_date": _parse_date(pub),
+                "author": author,
+            })
+        return channel_title, entries
+
+    # Atom 1.0
+    if root.tag.endswith("}feed") or root.tag == "feed":
+        channel_title = (root.findtext(f"{ATOM_NS}title") or "").strip()
+        for entry in root.findall(f"{ATOM_NS}entry"):
+            title = (entry.findtext(f"{ATOM_NS}title") or "").strip()
+            # An entry may carry several <link> elements (self, enclosure,
+            # replies, ...). The article URL is the rel="alternate" one, and
+            # a missing rel means "alternate"; only fall back to the first
+            # other href when no alternate link exists.
+            link = ""
+            for link_el in entry.findall(f"{ATOM_NS}link"):
+                href = (link_el.get("href") or "").strip()
+                if not href:
+                    continue
+                if link_el.get("rel", "alternate") == "alternate":
+                    link = href
+                    break
+                link = link or href
+            summary_text = (
+                entry.findtext(f"{ATOM_NS}content")
+                or entry.findtext(f"{ATOM_NS}summary")
+                or ""
+            )
+            pub = (
+                entry.findtext(f"{ATOM_NS}updated")
+                or entry.findtext(f"{ATOM_NS}published")
+                or ""
+            )
+            author_el = entry.find(f"{ATOM_NS}author")
+            author = ""
+            if author_el is not None:
+                author = (author_el.findtext(f"{ATOM_NS}name") or "").strip()
+            entries.append({
+                "title": title,
+                "link": link,
+                "summary": _strip_html(summary_text),
+                "published": pub,
+                "published_date": _parse_date(pub),
+                "author": author or channel_title,
+            })
+        return channel_title, entries
+
+    logger.warning("[WeChat] unrecognized feed format (root=%s)", root.tag)
+    return "", []
+
+
+# ---------------------------------------------------------------------------
+# Scoring
+# ---------------------------------------------------------------------------
+def score_articles(
+    articles: List[Dict],
+    config: Optional[Dict],
+) -> Tuple[List[Dict], int]:
+    """
+    Rank WeChat articles and return (scored_articles, filtered_count).
+
+    When research domains are configured, articles whose relevance score is 0
+    are dropped and counted in ``filtered_count``; without domains every
+    article gets a neutral relevance. RSS carries no engagement metrics, so
+    popularity is a fixed constant and relevance, recency and quality decide
+    the ordering. The returned list is sorted by ``final_score`` descending.
+    """
+    settings = config or {}
+    domains = settings.get("research_domains", {}) or {}
+    excluded = settings.get("excluded_keywords", []) or []
+
+    ranked: List[Dict] = []
+    dropped = 0
+
+    for article in articles:
+        title = article.get("title", "")
+        summary = article.get("summary", "")
+
+        if domains:
+            relevance, matched_domain, matched_keywords = calculate_relevance_score(
+                {"title": title, "summary": summary, "categories": []},
+                domains,
+                excluded,
+            )
+            if relevance == 0:
+                dropped += 1
+                continue
+        else:
+            # Neutral relevance when the user has not configured any domain.
+            relevance, matched_domain, matched_keywords = 1.5, "wechat", []
+
+        recency = calculate_recency_score(article.get("published_date"))
+
+        # Titled posts get a small quality bonus; image-only posts often come
+        # through RSSHub with an empty title and simply miss out on it.
+        quality = calculate_quality_score(summary)
+        if title:
+            quality = min(quality + 0.2, SCORE_MAX)
+
+        # Fixed soft floor: with no engagement data, popularity must not drag
+        # every final score down.
+        popularity = 1.0
+
+        ranked.append({
+            "id": article.get("link") or title,
+            "title": title,
+            "authors": article.get("author", ""),
+            "abstract": summary or "(no excerpt)",
+            "published": article.get("published", ""),
+            "categories": [],
+            "relevance_score": round(relevance, 2),
+            "recency_score": round(recency, 2),
+            "popularity_score": round(popularity, 2),
+            "quality_score": round(quality, 2),
+            "final_score": calculate_recommendation_score(
+                relevance, recency, popularity, quality
+            ),
+            "matched_domain": matched_domain,
+            "matched_keywords": matched_keywords,
+            "link": article.get("link", ""),
+            "source": "wechat",
+            "engagement": {},
+            # WeChat-specific extras (consumed by NewsItemCard's wechat branch)
+            "account_name": article.get("account_name", ""),
+            "account_route": article.get("account_route", ""),
+        })
+
+    ranked = sorted(ranked, key=lambda row: row["final_score"], reverse=True)
+    return ranked, dropped
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Fetch and score WeChat 公众号 articles via RSSHub."
+    )
+    parser.add_argument("--config", type=str, default=None,
+                        help="Path to research interests JSON config")
+    parser.add_argument("--output", type=str, required=True,
+                        help="Output JSON file path")
+    parser.add_argument("--top-n", type=int, default=12,
+                        help="Number of top articles to return")
+    parser.add_argument("--instance", type=str, default=DEFAULT_INSTANCE,
+                        help="RSSHub instance base URL")
+    parser.add_argument("--accounts", type=str, default="",
+                        help="Comma-separated WeChat 公众号 routes/IDs")
+    parser.add_argument("--access-key", type=str, default="",
+                        help="Optional RSSHub access key (?key=...)")
+    parser.add_argument("--per-account-limit", type=int, default=20,
+                        help="Max articles to keep per feed before scoring")
+    args = parser.parse_args()
+
+    # Logs go to stderr; stdout is reserved for the JSON result.
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s [%(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+        stream=sys.stderr,
+    )
+
+    config: Optional[Dict] = None
+    if args.config:
+        try:
+            with open(args.config, "r", encoding="utf-8-sig") as f:
+                config = json.load(f)
+            logger.info("[WeChat] loaded config from %s", args.config)
+        except Exception as exc:
+            # Best effort: a broken config only disables domain filtering.
+            logger.warning("[WeChat] failed to load config %s: %s", args.config, exc)
+
+    # --accounts wins; otherwise fall back to the accounts saved in the config.
+    accounts_raw = (args.accounts or (config or {}).get("accounts") or "")
+    if isinstance(accounts_raw, list):
+        account_specs = [str(a).strip() for a in accounts_raw if str(a).strip()]
+    else:
+        # The saved config value may be newline- as well as comma-separated
+        # (the server accepts both), so split on either.
+        account_specs = [
+            s.strip() for s in re.split(r"[\n,]", str(accounts_raw)) if s.strip()
+        ]
+
+    if not account_specs:
+        logger.warning(
+            "[WeChat] no accounts configured — pass --accounts or set "
+            "config.accounts. Returning empty results."
+        )
+        output = {
+            "top_papers": [],
+            "total_found": 0,
+            "total_filtered": 0,
+            "search_date": datetime.now().strftime("%Y-%m-%d"),
+            "instance": args.instance,
+            "accounts": [],
+        }
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(output, f, ensure_ascii=False, indent=2)
+        print(json.dumps(output, ensure_ascii=True, indent=2))
+        return 0
+
+    instance = (args.instance or DEFAULT_INSTANCE).rstrip("/")
+    access_key = (args.access_key or (config or {}).get("access_key") or "").strip()
+    # A negative limit would silently slice from the end of the feed.
+    per_account_limit = max(0, args.per_account_limit)
+
+    logger.info("[WeChat] instance=%s, %d accounts", instance, len(account_specs))
+
+    all_articles: List[Dict] = []
+    seen_links: set = set()
+
+    for spec in account_specs:
+        url = normalize_account_to_url(spec, instance, access_key)
+        if not url:
+            continue
+        # Log the key-less form of the URL so the access key never reaches
+        # the logs.
+        logger.info("[WeChat] fetching: %s", normalize_account_to_url(spec, instance))
+        try:
+            xml_text = http_get_text(url)
+        except Exception as exc:
+            # HTTP client errors can embed the full request URL; mask the key
+            # (both URL-quoted and raw forms) before logging.
+            message = str(exc)
+            if access_key:
+                message = message.replace(
+                    urllib.parse.quote(access_key, safe=""), "***"
+                ).replace(access_key, "***")
+            logger.warning("[WeChat] fetch failed: %s", message)
+            time.sleep(1.0)
+            continue
+
+        channel_title, entries = parse_rss(xml_text)
+        if not entries:
+            logger.info("[WeChat] no entries (channel=%r)", channel_title)
+            continue
+
+        added = 0
+        for entry in entries[:per_account_limit]:
+            link = entry.get("link") or ""
+            # De-duplicate across feeds by link, or by title when no link.
+            dedup_key = link or entry.get("title", "")
+            if dedup_key in seen_links:
+                continue
+            seen_links.add(dedup_key)
+            entry["account_name"] = channel_title or spec
+            entry["account_route"] = spec
+            all_articles.append(entry)
+            added += 1
+        logger.info(
+            "[WeChat] +%d articles from %r", added, channel_title or spec,
+        )
+        # Be polite to public RSSHub instances.
+ time.sleep(0.6) + + scored, filtered = score_articles(all_articles, config) + logger.info( + "[WeChat] scored %d articles (%d filtered)", len(scored), filtered, + ) + top = scored[: args.top_n] + + output = { + "top_papers": top, + "total_found": len(all_articles), + "total_filtered": filtered, + "search_date": datetime.now().strftime("%Y-%m-%d"), + "instance": instance, + "accounts": account_specs, + } + with open(args.output, "w", encoding="utf-8") as f: + json.dump(output, f, ensure_ascii=False, indent=2, default=str) + + logger.info("[WeChat] saved %d articles to %s", len(top), args.output) + for i, art in enumerate(top, 1): + logger.info( + " %d. %s... (score=%s)", + i, (art["title"] or "(no title)")[:60], art["final_score"], + ) + + print(json.dumps(output, ensure_ascii=True, indent=2, default=str)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/components/news-dashboard/view/NewsDashboard.tsx b/src/components/news-dashboard/view/NewsDashboard.tsx index b603713f..c131874b 100644 --- a/src/components/news-dashboard/view/NewsDashboard.tsx +++ b/src/components/news-dashboard/view/NewsDashboard.tsx @@ -8,11 +8,13 @@ import UnifiedFeed from './UnifiedFeed'; import { useNewsDashboardData } from './useNewsDashboardData'; import type { NewsSourceKey } from './useNewsDashboardData'; -const ALL_SOURCES: NewsSourceKey[] = ['arxiv', 'huggingface', 'x', 'xiaohongshu']; +const ALL_SOURCES: NewsSourceKey[] = ['arxiv', 'huggingface', 'github', 'wechat', 'x', 'xiaohongshu']; const SOURCE_LABEL_KEYS: Record = { arxiv: 'sources.arxiv', huggingface: 'sources.huggingface', + github: 'sources.github', + wechat: 'sources.wechatShort', x: 'sources.x', xiaohongshu: 'sources.xiaohongshuShort', }; @@ -20,6 +22,8 @@ const SOURCE_LABEL_KEYS: Record = { const SOURCE_STAT_ACCENTS: Record = { arxiv: 'bg-rose-100 text-rose-700 dark:bg-rose-950/50 dark:text-rose-300', huggingface: 'bg-yellow-100 text-yellow-700 dark:bg-yellow-950/50 dark:text-yellow-300', + 
github: 'bg-violet-100 text-violet-700 dark:bg-violet-950/50 dark:text-violet-300', + wechat: 'bg-emerald-100 text-emerald-700 dark:bg-emerald-950/50 dark:text-emerald-300', x: 'bg-gray-200 text-gray-700 dark:bg-gray-800/50 dark:text-gray-300', xiaohongshu: 'bg-red-100 text-red-600 dark:bg-red-950/50 dark:text-red-300', }; @@ -143,6 +147,10 @@ export default function NewsDashboard() { · HuggingFace · + GitHub + · + {t('sources.wechat')} + · X · {t('sources.xiaohongshu')} diff --git a/src/components/news-dashboard/view/NewsItemCard.tsx b/src/components/news-dashboard/view/NewsItemCard.tsx index 90bd20bd..4050f6c7 100644 --- a/src/components/news-dashboard/view/NewsItemCard.tsx +++ b/src/components/news-dashboard/view/NewsItemCard.tsx @@ -10,6 +10,10 @@ import { Bookmark, ArrowUp, FileText, + GitFork, + Download, + Scale, + TrendingUp, } from 'lucide-react'; import { useState } from 'react'; import { useTranslation } from 'react-i18next'; @@ -34,14 +38,67 @@ export type NewsItem = { pdf_link?: string; source?: string; // Social-specific fields - engagement?: { likes?: number; retweets?: number; reposts?: number; replies?: number; comments?: number; collects?: number; impressions?: number }; + engagement?: { + likes?: number; + retweets?: number; + reposts?: number; + replies?: number; + comments?: number; + collects?: number; + impressions?: number; + downloads?: number; + }; avatar_url?: string; media_urls?: string[]; // HuggingFace-specific fields submitted_by?: string; organization?: string; + // HF Hub repo extras (kind: papers | models | datasets | spaces) + kind?: 'papers' | 'models' | 'datasets' | 'spaces' | string; + pipeline_tag?: string; + downloads?: number; + hub_id?: string; + // GitHub-specific extras + stars?: number; + forks?: number; + language?: string; + license?: string | null; + owner_avatar?: string; + mode?: string; + // WeChat-specific extras + account_name?: string; + account_route?: string; }; +const LANGUAGE_COLORS: Record = { + python: 
'bg-blue-500',
+  typescript: 'bg-sky-500',
+  javascript: 'bg-yellow-400',
+  rust: 'bg-orange-600',
+  go: 'bg-cyan-500',
+  java: 'bg-red-500',
+  'c++': 'bg-pink-500',
+  cpp: 'bg-pink-500',
+  c: 'bg-gray-500',
+  ruby: 'bg-red-600',
+  swift: 'bg-orange-500',
+  kotlin: 'bg-purple-500',
+  shell: 'bg-emerald-500',
+  cuda: 'bg-green-600',
+};
+
+// Tailwind background class for a repository language dot; lookup is
+// case-insensitive, with a neutral class for unknown or missing languages.
+function langColor(lang?: string): string {
+  if (!lang) return 'bg-muted';
+  return LANGUAGE_COLORS[lang.toLowerCase()] || 'bg-muted-foreground/40';
+}
+
+// Compact count for badges: 950 → "950", 1234 → "1.2k", 45678 → "46k",
+// 2500000 → "2.5M". Missing or non-finite input renders as "0".
+function formatCompact(n?: number): string {
+  if (n == null || !Number.isFinite(n)) return '0';
+  if (n < 1000) return String(n);
+  // Switch to "M" at 999_500: toFixed(0) rounds anything above that up to
+  // "1000", which used to render as "1000k" instead of "1M".
+  if (n < 999_500) return (n / 1000).toFixed(n < 10_000 ? 1 : 0).replace(/\.0$/, '') + 'k';
+  return (n / 1_000_000).toFixed(1).replace(/\.0$/, '') + 'M';
+}
+
 const VIDEO_EXTENSIONS = /\.(mp4|webm|mov|m3u8)(\?|$)/i;
 
 function isVideoUrl(url: string): boolean {
@@ -116,6 +173,8 @@ export default function NewsItemCard({ item, index, sourceKey }: { item: NewsIte
   const sourceBadgeLabel = sourceKey === 'arxiv' ? 'arXiv'
     : sourceKey === 'huggingface' ? 'HF'
+    : sourceKey === 'github' ? 'GitHub'
+    : sourceKey === 'wechat' ? '公众号'
     : sourceKey === 'x' ? 'X'
     : 'XHS';
 
@@ -154,11 +213,244 @@ export default function NewsItemCard({ item, index, sourceKey }: { item: NewsIte
   );
 }
 
-  // HuggingFace: paper card styled like HF Daily Papers
+  // WeChat 公众号: article card with account name, excerpt, link
+  if (sourceKey === 'wechat') {
+    const accountName = item.account_name || item.authors || '';
+    return (
+
+
+ {/* Header: account chip + title + score */} +
+ + + +
+ + {item.title || t('card.untitled')} + + {accountName && ( +

{accountName}

+ )} +
+
+ + {item.final_score?.toFixed(1) ?? '—'} +
+
+ + {/* Excerpt */} + {item.abstract && item.abstract !== '(no excerpt)' && ( +

{item.abstract}

+ )} + + {/* Footer: published + open link */} +
+ {item.published && ( + {item.published.slice(0, 10)} + )} + + {t('card.openInWechat')} + +
+
+
+ ); + } + + // GitHub: repo card with stars, forks, language, topics + if (sourceKey === 'github') { + const stars = item.stars ?? item.engagement?.likes ?? 0; + const forks = item.forks ?? item.engagement?.comments ?? 0; + const isTrending = item.mode === 'trending'; + return ( +
+
+ {/* Header: avatar + repo name + score */} +
+ {item.owner_avatar ? ( + + ) : ( + + + + )} +
+ + {item.title} + + {item.authors && ( +

{item.authors}

+ )} +
+
+ + {item.final_score?.toFixed(1) ?? '—'} +
+
+ + {/* Trending pill */} + {isTrending && ( + + {t('card.trending')} + + )} + + {/* Description */} + {item.abstract && item.abstract !== '(no description)' && ( +

{item.abstract}

+ )} + + {/* Topics */} + {item.categories && item.categories.length > 0 && ( +
+ {item.categories.slice(0, 4).map((topic) => ( + + {topic} + + ))} +
+ )} + + {/* Footer: language + stars + forks + license */} +
+ {item.language && ( + + + {item.language} + + )} + + {formatCompact(stars)} + + + {formatCompact(forks)} + + {item.license && ( + + {item.license} + + )} + + GitHub + +
+
+
+ ); + } + + // HuggingFace: paper card OR Hub repo card depending on `kind` if (sourceKey === 'huggingface') { + const isHubRepo = item.kind === 'models' || item.kind === 'datasets' || item.kind === 'spaces'; const thumbnailUrl = item.media_urls?.[0]; const upvotes = item.engagement?.likes ?? 0; const comments = item.engagement?.comments ?? 0; + const downloads = item.downloads ?? item.engagement?.downloads ?? 0; + + // Hub repo card: compact layout with kind badge, pipeline tag, downloads + if (isHubRepo) { + const kindBadge: Record = { + models: { label: t('card.kindModel'), className: 'bg-amber-100 text-amber-700 dark:bg-amber-950/40 dark:text-amber-300' }, + datasets: { label: t('card.kindDataset'), className: 'bg-emerald-100 text-emerald-700 dark:bg-emerald-950/40 dark:text-emerald-300' }, + spaces: { label: t('card.kindSpace'), className: 'bg-fuchsia-100 text-fuchsia-700 dark:bg-fuchsia-950/40 dark:text-fuchsia-300' }, + }; + const badge = kindBadge[item.kind as string]; + + return ( +
+
+
+
+ + {formatCompact(upvotes)} +
+
+ {badge && ( + + {badge.label} + + )} + + {item.title} + + {item.organization && ( +

{item.organization}

+ )} +
+
+ + {item.final_score?.toFixed(1) ?? '—'} +
+
+ + {item.abstract && item.abstract !== `${(item.kind ?? '').toString().replace(/^\w/, (c) => c.toUpperCase())} on Hugging Face Hub.` && ( +

{item.abstract}

+ )} + + {item.categories && item.categories.length > 0 && ( +
+ {item.pipeline_tag && ( + + {item.pipeline_tag} + + )} + {item.categories.filter((c) => c !== item.pipeline_tag).slice(0, 4).map((tag) => ( + + {tag} + + ))} +
+ )} + +
+ {downloads > 0 && ( + + {formatCompact(downloads)} + + )} + + {formatCompact(upvotes)} + + + HF + +
+
+
+ ); + } + + // Daily Paper card (original layout) return (
{/* Thumbnail */} diff --git a/src/components/news-dashboard/view/SourceFilterBar.tsx b/src/components/news-dashboard/view/SourceFilterBar.tsx index d66b222c..4a6f8ba4 100644 --- a/src/components/news-dashboard/view/SourceFilterBar.tsx +++ b/src/components/news-dashboard/view/SourceFilterBar.tsx @@ -11,6 +11,8 @@ import type { NewsSourceKey, SourceInfo } from './useNewsDashboardData'; const SOURCE_LABEL_KEYS: Record = { arxiv: 'sources.arxiv', huggingface: 'sources.huggingface', + github: 'sources.github', + wechat: 'sources.wechat', x: 'sources.x', xiaohongshu: 'sources.xiaohongshu', }; @@ -18,6 +20,8 @@ const SOURCE_LABEL_KEYS: Record = { const SOURCE_INACTIVE_COLORS: Record = { arxiv: 'bg-transparent text-rose-800/60 hover:bg-rose-100/50 dark:text-rose-400/60 dark:hover:bg-rose-950/30', huggingface: 'bg-transparent text-yellow-800/60 hover:bg-yellow-100/50 dark:text-yellow-400/60 dark:hover:bg-yellow-950/30', + github: 'bg-transparent text-violet-800/60 hover:bg-violet-100/50 dark:text-violet-300/60 dark:hover:bg-violet-950/30', + wechat: 'bg-transparent text-emerald-800/60 hover:bg-emerald-100/50 dark:text-emerald-300/60 dark:hover:bg-emerald-950/30', x: 'bg-transparent text-gray-600/60 hover:bg-gray-200/50 dark:text-gray-400/60 dark:hover:bg-gray-800/30', xiaohongshu: 'bg-transparent text-red-600/60 hover:bg-red-100/50 dark:text-red-400/60 dark:hover:bg-red-950/30', }; @@ -25,11 +29,13 @@ const SOURCE_INACTIVE_COLORS: Record = { const SOURCE_ACTIVE_COLORS: Record = { arxiv: 'bg-rose-600 text-white shadow-md ring-2 ring-rose-600/30 hover:bg-rose-700 dark:bg-rose-700 dark:ring-rose-500/30 dark:hover:bg-rose-600', huggingface: 'bg-yellow-500 text-white shadow-md ring-2 ring-yellow-500/30 hover:bg-yellow-600 dark:bg-yellow-600 dark:ring-yellow-400/30 dark:hover:bg-yellow-500', + github: 'bg-violet-600 text-white shadow-md ring-2 ring-violet-600/30 hover:bg-violet-700 dark:bg-violet-700 dark:ring-violet-500/30 dark:hover:bg-violet-600', + wechat: 'bg-emerald-600 
text-white shadow-md ring-2 ring-emerald-600/30 hover:bg-emerald-700 dark:bg-emerald-700 dark:ring-emerald-500/30 dark:hover:bg-emerald-600', x: 'bg-gray-800 text-white shadow-md ring-2 ring-gray-800/30 hover:bg-gray-900 dark:bg-gray-600 dark:ring-gray-500/30 dark:hover:bg-gray-500', xiaohongshu: 'bg-red-500 text-white shadow-md ring-2 ring-red-500/30 hover:bg-red-600 dark:bg-red-600 dark:ring-red-400/30 dark:hover:bg-red-500', }; -const ALL_SOURCES: NewsSourceKey[] = ['arxiv', 'huggingface', 'x', 'xiaohongshu']; +const ALL_SOURCES: NewsSourceKey[] = ['arxiv', 'huggingface', 'github', 'wechat', 'x', 'xiaohongshu']; export default function SourceFilterBar({ activeSource, diff --git a/src/components/news-dashboard/view/SourceIcon.tsx b/src/components/news-dashboard/view/SourceIcon.tsx index d3caa18e..64259d80 100644 --- a/src/components/news-dashboard/view/SourceIcon.tsx +++ b/src/components/news-dashboard/view/SourceIcon.tsx @@ -3,6 +3,8 @@ import type { NewsSourceKey } from './useNewsDashboardData'; const SOURCE_ICONS: Record = { arxiv: { light: '/icons/news/arxiv.svg' }, huggingface: { light: '/icons/news/huggingface.svg' }, + github: { light: '/icons/news/github.svg', dark: '/icons/news/github-white.svg' }, + wechat: { light: '/icons/news/wechat.svg' }, x: { light: '/icons/news/x-black.png', dark: '/icons/news/x-white.png' }, xiaohongshu: { light: '/icons/news/xiaohongshu.png' }, }; diff --git a/src/components/news-dashboard/view/SourceSettingsDialog.tsx b/src/components/news-dashboard/view/SourceSettingsDialog.tsx index 6720c3b8..c22861a0 100644 --- a/src/components/news-dashboard/view/SourceSettingsDialog.tsx +++ b/src/components/news-dashboard/view/SourceSettingsDialog.tsx @@ -1,5 +1,8 @@ import { Cookie, + Eye, + EyeOff, + KeyRound, Loader2, Plus, QrCode, @@ -24,10 +27,47 @@ const ARXIV_CATEGORIES = [ const SOURCE_TITLE_KEYS: Record = { arxiv: 'settings.arxivTitle', huggingface: 'settings.huggingfaceTitle', + github: 'settings.githubTitle', + wechat: 
'settings.wechatTitle', x: 'settings.xTitle', xiaohongshu: 'settings.xiaohongshuTitle', }; +const HF_MODE_OPTIONS: Array<{ value: 'papers' | 'models' | 'datasets' | 'spaces'; labelKey: string }> = [ + { value: 'papers', labelKey: 'settings.hfModePapers' }, + { value: 'models', labelKey: 'settings.hfModeModels' }, + { value: 'datasets', labelKey: 'settings.hfModeDatasets' }, + { value: 'spaces', labelKey: 'settings.hfModeSpaces' }, +]; + +const GITHUB_LANGUAGES = [ + { value: '', label: 'Any' }, + { value: 'python', label: 'Python' }, + { value: 'typescript', label: 'TypeScript' }, + { value: 'javascript', label: 'JavaScript' }, + { value: 'rust', label: 'Rust' }, + { value: 'go', label: 'Go' }, + { value: 'cuda', label: 'CUDA' }, + { value: 'c++', label: 'C++' }, + { value: 'java', label: 'Java' }, +]; + +const GITHUB_TIME_WINDOWS: Array<{ value: 'daily' | 'weekly' | 'monthly'; labelKey: string }> = [ + { value: 'daily', labelKey: 'settings.githubWindowDaily' }, + { value: 'weekly', labelKey: 'settings.githubWindowWeekly' }, + { value: 'monthly', labelKey: 'settings.githubWindowMonthly' }, +]; + +function parseModes(value: unknown): Array<'papers' | 'models' | 'datasets' | 'spaces'> { + if (typeof value !== 'string') return ['papers']; + return value + .split(',') + .map((s) => s.trim().toLowerCase()) + .filter((s): s is 'papers' | 'models' | 'datasets' | 'spaces' => + s === 'papers' || s === 'models' || s === 'datasets' || s === 'spaces' + ); +} + // eslint-disable-next-line @typescript-eslint/no-explicit-any type AnyConfig = Record; type XhsLoginMethod = 'browser' | 'qrcode'; @@ -111,6 +151,52 @@ function EditableNumberInput({ ); } +function TokenInput({ + value, + onChange, + placeholder, + envVarLabel, +}: { + value: string; + onChange: (next: string) => void; + placeholder: string; + envVarLabel: string; +}) { + const { t } = useTranslation('news'); + const [reveal, setReveal] = useState(false); + return ( +
+ +
+ onChange(e.target.value)} + placeholder={placeholder} + autoComplete="off" + spellCheck={false} + className="w-full rounded-lg border border-border/50 bg-background px-3 py-2 pr-10 text-sm font-mono focus:border-primary/40 focus:outline-none focus:ring-1 focus:ring-primary/20" + /> + +
+

{t('settings.tokenHelp')}

+
+ ); +} + function shouldPreferXhsQrLogin() { if (IS_PLATFORM) return true; if (typeof window === 'undefined') return false; @@ -466,6 +552,177 @@ export default function SourceSettingsDialog({ )} {sourceKey === 'xiaohongshu' && } + {sourceKey === 'github' && ( +
+

{t('settings.authentication')}

+

{t('settings.githubAuthDescription')}

+ updateField('api_token', next)} + placeholder="ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + envVarLabel="GITHUB_TOKEN" + /> +
+ )} + {sourceKey === 'huggingface' && ( +
+

{t('settings.authentication')}

+

{t('settings.hfAuthDescription')}

+ updateField('api_token', next)} + placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + envVarLabel="HF_TOKEN" + /> +
+ )} + + {/* HuggingFace: mode toggles */} + {sourceKey === 'huggingface' && ( +
+ +

{t('settings.hfModesHelp')}

+
+ {HF_MODE_OPTIONS.map((opt) => { + const active = parseModes(config.modes).includes(opt.value); + return ( + + ); + })} +
+
+ )} + + {/* GitHub: language + time window + trending toggle */} + {sourceKey === 'github' && ( +
+
+ + +
+
+ + +
+
+
+ +

{t('settings.githubIncludeTrendingHelp')}

+
+ +
+
+ )} + + {/* WeChat 公众号: RSSHub instance + accounts list + access key */} + {sourceKey === 'wechat' && ( + <> +
+

{t('settings.wechatRsshubTitle')}

+

{t('settings.wechatRsshubDescription')}

+
+ + updateField('instance_url', e.target.value)} + placeholder="https://rsshub.app" + spellCheck={false} + className="w-full rounded-lg border border-border/50 bg-background px-3 py-2 text-sm font-mono focus:border-primary/40 focus:outline-none focus:ring-1 focus:ring-primary/20" + /> +

{t('settings.wechatInstanceHelp')}

+
+ updateField('access_key', next)} + placeholder={t('settings.wechatAccessKeyPlaceholder')} + envVarLabel="?key=…" + /> +
+
+ +

{t('settings.wechatAccountsHelp')}

+