From ce19567a861d440b9d28219bda035e774dff3ba6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:31:39 +0000 Subject: [PATCH 01/24] Initial plan From c6693bdf0bc14eed638ad6c427bd89228bef97d1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:37:51 +0000 Subject: [PATCH 02/24] Update URL sourcing to sitemap Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- README.md | 11 +- blogger_register/blogger_register.py | 160 +++++++++++++++++++-------- 2 files changed, 118 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 07025ab..7373bde 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,11 @@ ### 背景・目的 -Bloggerで公開した記事のURLをGoogle Indexing APIに自動通知し、インデックス登録作業を自動化するためのバッチプログラムです。Firestoreで各URLの通知日時を管理し、結果をメールで通知します。 +サイトマップから取得した記事URLをGoogle Indexing APIに自動通知し、インデックス登録作業を自動化するためのバッチプログラムです。Firestoreで各URLの通知日時を管理し、結果をメールで通知します。 ### 機能一覧 -- Blogger APIから全記事URLを取得し、FirestoreのURLを事前取得したキャッシュと突き合わせて登録・管理します。 +- サイトマップから全記事URLを取得し、FirestoreのURLを事前取得したキャッシュと突き合わせて登録・管理します。 - Firestoreで各URLの通知日時を管理し、通知日時が古い順に指定件数だけURLを抽出してGoogle Indexing APIへ通知します。 - 通知結果をHTML形式でまとめ、指定アドレスへメール送信します。 - 各種設定値は環境変数で管理します。 @@ -18,8 +18,7 @@ Bloggerで公開した記事のURLをGoogle Indexing APIに自動通知し、イ | 変数名 | 用途 | | ---- | ---- | -| BLOGGER_INDEX_REGIST_API_KEY | Blogger APIキー | -| BLOG_ID | 対象ブログのID | +| SITEMAP_URL | サイトマップURL(通常は sitemap.xml など) | | MAIL_FROM | 送信元メールアドレス(Gmail) | | MAIL_PASSWORD | 送信元メールアドレスのアプリパスワード | | MAIL_TO | 通知先メールアドレス | @@ -42,7 +41,7 @@ Bloggerで公開した記事のURLをGoogle Indexing APIに自動通知し、イ ### 入力 -- Blogger APIから取得した記事URL +- サイトマップから取得した記事URL - Firestoreコレクション `url_notifications` ### 出力 @@ -73,7 +72,7 @@ python blogger_register/blogger_register.py 1. 環境変数から各種設定値を取得 2. Google認証セッションを初期化 -3. Blogger APIから記事URL一覧をFirestoreに登録 +3. サイトマップから記事URL一覧をFirestoreに登録 4. Firestoreから通知日時が古い順に指定件数だけURLを抽出 5. Google Indexing APIへ通知し、結果をFirestoreに反映 6. 全通知結果をHTMLメールで送信 diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 72c8258..fa8888d 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -5,6 +5,7 @@ """ import base64 +import gzip import os import smtplib import time @@ -12,11 +13,12 @@ from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from typing import Any, TypedDict +from xml.etree import ElementTree as ET import google.auth +import requests from google.auth.transport.requests import AuthorizedSession from google.cloud import firestore -from googleapiclient.discovery import build # 定数定義 SCOPES: list[str] = ["https://www.googleapis.com/auth/indexing"] @@ -34,8 +36,7 @@ class EnvVars(TypedDict): - blogger_api_key: str - blog_id: str + sitemap_url: str mail_from: str mail_password: str mail_to: str @@ -58,8 +59,7 @@ def get_env_vars() -> EnvVars: EnvironmentError: 必須環境変数が未設定の場合 """ env = { - "blogger_api_key": os.environ.get("BLOGGER_INDEX_REGIST_API_KEY"), - "blog_id": os.environ.get("BLOG_ID"), + "sitemap_url": os.environ.get("SITEMAP_URL"), "mail_from": os.environ.get("MAIL_FROM"), "mail_password": os.environ.get("MAIL_PASSWORD"), "mail_to": os.environ.get("MAIL_TO"), @@ -218,49 +218,118 @@ def send_indexing_notification( return success, response.status_code, response.text -def register_blog_urls_to_firestore(blog_id: str, api_key: str) -> None: - """Blogger APIからブログ投稿URL一覧を取得し、Firestoreに登録する。 +def decode_sitemap_content(content: bytes, url: str) -> bytes: + """Sitemapコンテンツを必要に応じてデコードする。 Args: - blog_id (str): ブログID - api_key (str): APIキー + content (bytes): 取得したSitemapのバイト列 + url (str): SitemapのURL + + Returns: + bytes: デコード済みのSitemap """ - has_last_sent = build_last_sent_cache(FIRESTORE_BATCH_LIMIT) + if url.lower().endswith(".gz"): + return gzip.decompress(content) + return content + - service = build("blogger", "v3", developerKey=api_key) - page_token: str | None = None +def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: + """Sitemap XMLからURLと子Sitemap URLを抽出する。 + + Args: + content (bytes): XMLコンテンツ + + Returns: + tuple[list[str], list[str]]: (URLリスト, 子Sitemap URLリスト) + """ + root = ET.fromstring(content) # noqa: S314 + namespace = "" + if root.tag.startswith("{"): + namespace = root.tag.split("}")[0] + "}" + + if root.tag.endswith("urlset"): + urls = [ + loc.text.strip() + for loc in root.findall(f".//{namespace}url/{namespace}loc") + if loc.text + ] + return urls, [] + if root.tag.endswith("sitemapindex"): + sitemap_urls = [ + loc.text.strip() + for loc in root.findall(f".//{namespace}sitemap/{namespace}loc") + if loc.text + ] + return [], sitemap_urls + return [], [] + + +def fetch_sitemap_urls(sitemap_url: str) -> list[str]: + """サイトマップからURL一覧を取得する。 + + Args: + sitemap_url (str): 取得対象のサイトマップURL + + Returns: + list[str]: 取得したURL一覧 + """ + pending_sitemaps = [sitemap_url] + visited_sitemaps: set[str] = set() + seen_urls: set[str] = set() + collected_urls: list[str] = [] + + while pending_sitemaps: + current_url = pending_sitemaps.pop() + if current_url in visited_sitemaps: + continue + visited_sitemaps.add(current_url) + response = requests.get(current_url, timeout=30) + response.raise_for_status() + content = decode_sitemap_content(response.content, current_url) + urls, sitemap_urls = extract_sitemap_entries(content) + for url in urls: + if url not in seen_urls: + seen_urls.add(url) + collected_urls.append(url) + pending_sitemaps.extend( + [ + child_url + for child_url in sitemap_urls + if child_url not in visited_sitemaps + ], + ) + + return collected_urls + + +def register_sitemap_urls_to_firestore(sitemap_url: str) -> None: + """サイトマップからURL一覧を取得し、Firestoreに登録する。 + + Args: + sitemap_url (str): サイトマップURL + """ + has_last_sent = build_last_sent_cache(FIRESTORE_BATCH_LIMIT) batch = db.batch() # pending_doc_ids はバッチ確定前の重複追加を防ぐために利用 pending_doc_ids: set[str] = set() - while True: - posts_response: dict[str, Any] = ( - service.posts().list(blogId=blog_id, pageToken=page_token).execute() - ) - for post in posts_response.get("items", []): - url: str | None = post.get("url") - if not url: - # URL フィールドが存在しない投稿はスキップ - continue - doc_id = encode_doc_id(url) - - last_sent_exists = has_last_sent.get(doc_id, False) - if not last_sent_exists and doc_id not in pending_doc_ids: - doc_ref = db.collection("url_notifications").document(doc_id) - batch.set( - doc_ref, - {"url": url, "last_sent": INITIAL_TIMESTAMP}, - merge=True, - ) - pending_doc_ids.add(doc_id) - print(f"FirestoreにURL登録: {url}") - - if len(pending_doc_ids) >= FIRESTORE_BATCH_LIMIT: - batch = commit_pending_batch(batch, pending_doc_ids, has_last_sent) - - page_token = posts_response.get("nextPageToken") - if not page_token: - break + sitemap_urls = fetch_sitemap_urls(sitemap_url) + for url in sitemap_urls: + doc_id = encode_doc_id(url) + + last_sent_exists = has_last_sent.get(doc_id, False) + if not last_sent_exists and doc_id not in pending_doc_ids: + doc_ref = db.collection("url_notifications").document(doc_id) + batch.set( + doc_ref, + {"url": url, "last_sent": INITIAL_TIMESTAMP}, + merge=True, + ) + pending_doc_ids.add(doc_id) + print(f"FirestoreにURL登録: {url}") + + if len(pending_doc_ids) >= FIRESTORE_BATCH_LIMIT: + batch = commit_pending_batch(batch, pending_doc_ids, has_last_sent) batch = commit_pending_batch(batch, pending_doc_ids, has_last_sent) @@ -326,7 +395,7 @@ def build_summary_email_body_html(results: list[NotificationResult]) -> str: def main(request: Any) -> tuple[dict[str, Any], int]: # noqa: ANN401, ARG001 """Cloud Functionsのエントリポイント。 - Blogger APIからURLを取得しFirestoreに登録後、未送信・古い通知をAPIに送信し更新する。 + サイトマップからURLを取得しFirestoreに登録後、未送信・古い通知をAPIに送信し更新する。 Args: request (Any): HTTPリクエストオブジェクト(Cloud Functions仕様) @@ -347,12 +416,9 @@ def main(request: Any) -> tuple[dict[str, Any], int]: # noqa: ANN401, ARG001 authed_session = AuthorizedSession(credentials) print(f"認証セッションの取得に成功しました。スコープ: {SCOPES}") - # Blogger APIからURL一覧をFirestoreに登録 - print("Blogger APIからURL一覧を取得し、Firestoreに登録します。") - register_blog_urls_to_firestore( - blog_id=env["blog_id"], - api_key=env["blogger_api_key"], - ) + # サイトマップからURL一覧をFirestoreに登録 + print("サイトマップからURL一覧を取得し、Firestoreに登録します。") + register_sitemap_urls_to_firestore(env["sitemap_url"]) # Firestoreから送信待ちURLを取得 print(f"Firestoreから送信待ちのURLを最大{BATCH_SIZE}件取得します。") From 9b9826f84fe6d93a4d070a06410b2e8bcd00e668 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:39:05 +0000 Subject: [PATCH 03/24] Harden sitemap parsing Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- README.md | 1 + blogger_register/blogger_register.py | 28 +++++++++++++++++++++++++--- requirements.txt | 1 + 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7373bde..c74269f 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ python blogger_register/blogger_register.py | cachetools | 5.5.2 | Apache License 2.0 | | certifi | 2025.6.15 | Mozilla Public License 2.0 | | charset-normalizer | 3.4.2 | MIT License | +| defusedxml | 0.7.1 | PSF License | | google-api-core | 2.25.1 | Apache License 2.0 | | google-api-python-client | 2.175.0 | Apache License 2.0 | | google-auth | 2.40.3 | Apache License 2.0 | diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index fa8888d..d586c3d 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -13,10 +13,11 @@ from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from typing import Any, TypedDict -from xml.etree import ElementTree as ET +from urllib.parse import urlparse import google.auth import requests +from defusedxml import ElementTree from google.auth.transport.requests import AuthorizedSession from google.cloud import firestore @@ -31,6 +32,7 @@ FIRESTORE_BATCH_LIMIT = 500 INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) +MAX_SITEMAP_COUNT = 100 db = firestore.Client() @@ -233,6 +235,21 @@ def decode_sitemap_content(content: bytes, url: str) -> bytes: return content +def ensure_https_url(url: str) -> None: + """Sitemap URLがHTTPSかを検証する。 + + Args: + url (str): 検証対象のURL + + Raises: + ValueError: HTTPSではない場合 + """ + parsed = urlparse(url) + if parsed.scheme.lower() != "https": + message = f"HTTPS以外のサイトマップURLは許可されていません: {url}" + raise ValueError(message) + + def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: """Sitemap XMLからURLと子Sitemap URLを抽出する。 @@ -242,7 +259,7 @@ def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: Returns: tuple[list[str], list[str]]: (URLリスト, 子Sitemap URLリスト) """ - root = ET.fromstring(content) # noqa: S314 + root = ElementTree.fromstring(content) namespace = "" if root.tag.startswith("{"): namespace = root.tag.split("}")[0] + "}" @@ -273,17 +290,22 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: Returns: list[str]: 取得したURL一覧 """ + ensure_https_url(sitemap_url) pending_sitemaps = [sitemap_url] visited_sitemaps: set[str] = set() seen_urls: set[str] = set() collected_urls: list[str] = [] while pending_sitemaps: + if len(visited_sitemaps) >= MAX_SITEMAP_COUNT: + message = "サイトマップ取得数が上限を超えたため処理を中断します。" + raise RuntimeError(message) current_url = pending_sitemaps.pop() if current_url in visited_sitemaps: continue + ensure_https_url(current_url) visited_sitemaps.add(current_url) - response = requests.get(current_url, timeout=30) + response = requests.get(current_url, timeout=30, verify=True) response.raise_for_status() content = decode_sitemap_content(response.content, current_url) urls, sitemap_urls = extract_sitemap_entries(content) diff --git a/requirements.txt b/requirements.txt index 5da6b46..684e665 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ cachetools==5.5.2 certifi==2025.6.15 charset-normalizer==3.4.2 +defusedxml==0.7.1 google-api-core==2.25.1 google-api-python-client==2.175.0 google-auth==2.40.3 From de71cd29871b8ecfbb6ec92410552b32baebf3e9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:40:00 +0000 Subject: [PATCH 04/24] Improve sitemap fetch robustness Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index d586c3d..57358e3 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -305,21 +305,28 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: continue ensure_https_url(current_url) visited_sitemaps.add(current_url) - response = requests.get(current_url, timeout=30, verify=True) - response.raise_for_status() + try: + response = requests.get(current_url, timeout=30, verify=True) + response.raise_for_status() + except requests.RequestException as exc: + message = f"サイトマップの取得に失敗しました: {current_url}" + raise RuntimeError(message) from exc content = decode_sitemap_content(response.content, current_url) - urls, sitemap_urls = extract_sitemap_entries(content) + try: + urls, sitemap_urls = extract_sitemap_entries(content) + except ElementTree.ParseError as exc: + message = f"サイトマップXMLの解析に失敗しました: {current_url}" + raise RuntimeError(message) from exc for url in urls: if url not in seen_urls: seen_urls.add(url) collected_urls.append(url) - pending_sitemaps.extend( - [ - child_url - for child_url in sitemap_urls - if child_url not in visited_sitemaps - ], - ) + for child_url in sitemap_urls: + if ( + child_url not in visited_sitemaps + and child_url not in pending_sitemaps + ): + pending_sitemaps.append(child_url) return collected_urls From 36b9d04df11037b2226a0011bcfda5f89b07f54f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:40:33 +0000 Subject: [PATCH 05/24] Add sitemap request headers Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 57358e3..7d36bed 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -306,15 +306,20 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: ensure_https_url(current_url) visited_sitemaps.add(current_url) try: - response = requests.get(current_url, timeout=30, verify=True) + response = requests.get( + current_url, + timeout=30, + verify=True, + headers={"User-Agent": "BloggerRegister/1.0 (sitemap fetcher)"}, + ) response.raise_for_status() - except requests.RequestException as exc: + except (requests.RequestException, ValueError) as exc: message = f"サイトマップの取得に失敗しました: {current_url}" raise RuntimeError(message) from exc content = decode_sitemap_content(response.content, current_url) try: urls, sitemap_urls = extract_sitemap_entries(content) - except ElementTree.ParseError as exc: + except (ElementTree.ParseError, ValueError) as exc: message = f"サイトマップXMLの解析に失敗しました: {current_url}" raise RuntimeError(message) from exc for url in urls: From 291324cbfb9a9c4176b451b979dd007aa4e7842b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:41:04 +0000 Subject: [PATCH 06/24] Refine sitemap traversal Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 7d36bed..39d66e3 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -33,6 +33,7 @@ INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) MAX_SITEMAP_COUNT = 100 +USER_AGENT = "BloggerRegister/1.0 (sitemap fetcher)" db = firestore.Client() @@ -300,7 +301,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: if len(visited_sitemaps) >= MAX_SITEMAP_COUNT: message = "サイトマップ取得数が上限を超えたため処理を中断します。" raise RuntimeError(message) - current_url = pending_sitemaps.pop() + current_url = pending_sitemaps.pop(0) if current_url in visited_sitemaps: continue ensure_https_url(current_url) @@ -310,7 +311,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: current_url, timeout=30, verify=True, - headers={"User-Agent": "BloggerRegister/1.0 (sitemap fetcher)"}, + headers={"User-Agent": USER_AGENT}, ) response.raise_for_status() except (requests.RequestException, ValueError) as exc: From a7a7616bfba02cf33b1af34f0fe1f45661c699ca Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:41:50 +0000 Subject: [PATCH 07/24] Tighten sitemap XML parsing Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 39d66e3..6e25d09 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -17,7 +17,7 @@ import google.auth import requests -from defusedxml import ElementTree +from defusedxml import ElementTree as DefusedElementTree from google.auth.transport.requests import AuthorizedSession from google.cloud import firestore @@ -260,10 +260,10 @@ def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: Returns: tuple[list[str], list[str]]: (URLリスト, 子Sitemap URLリスト) """ - root = ElementTree.fromstring(content) + root = DefusedElementTree.fromstring(content) namespace = "" if root.tag.startswith("{"): - namespace = root.tag.split("}")[0] + "}" + namespace = root.tag.partition("}")[0] + "}" if root.tag.endswith("urlset"): urls = [ @@ -320,7 +320,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: content = decode_sitemap_content(response.content, current_url) try: urls, sitemap_urls = extract_sitemap_entries(content) - except (ElementTree.ParseError, ValueError) as exc: + except (DefusedElementTree.ParseError, ValueError) as exc: message = f"サイトマップXMLの解析に失敗しました: {current_url}" raise RuntimeError(message) from exc for url in urls: From 0c7bed5cc31919bf02246c7eebc5acf45fc45755 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:43:20 +0000 Subject: [PATCH 08/24] Refactor sitemap helpers Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 85 +++++++++++++++++++++------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 6e25d09..8a48a6d 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -15,6 +15,7 @@ from typing import Any, TypedDict from urllib.parse import urlparse +import certifi import google.auth import requests from defusedxml import ElementTree as DefusedElementTree @@ -251,6 +252,19 @@ def ensure_https_url(url: str) -> None: raise ValueError(message) +def is_https_url(url: str) -> bool: + """URLがHTTPSかどうかを判定する。 + + Args: + url (str): 判定対象のURL + + Returns: + bool: HTTPSならTrue + """ + parsed = urlparse(url) + return parsed.scheme.lower() == "https" + + def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: """Sitemap XMLからURLと子Sitemap URLを抽出する。 @@ -282,6 +296,53 @@ def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: return [], [] +def fetch_sitemap_content(sitemap_url: str) -> bytes: + """サイトマップのバイナリコンテンツを取得する。 + + Args: + sitemap_url (str): 取得対象のサイトマップURL + + Returns: + bytes: 取得したコンテンツ + """ + ensure_https_url(sitemap_url) + try: + response = requests.get( + sitemap_url, + timeout=30, + verify=certifi.where(), + headers={"User-Agent": USER_AGENT}, + ) + response.raise_for_status() + except requests.SSLError as exc: + message = f"サイトマップのSSL検証に失敗しました: {sitemap_url}" + raise RuntimeError(message) from exc + except (requests.RequestException, ValueError) as exc: + message = f"サイトマップの取得に失敗しました: {sitemap_url}" + raise RuntimeError(message) from exc + return decode_sitemap_content(response.content, sitemap_url) + + +def parse_sitemap_content( + content: bytes, + sitemap_url: str, +) -> tuple[list[str], list[str]]: + """サイトマップのXMLを解析してURLを抽出する。 + + Args: + content (bytes): 解析対象のXMLバイト列 + sitemap_url (str): 解析対象のURL + + Returns: + tuple[list[str], list[str]]: (URLリスト, 子Sitemap URLリスト) + """ + try: + return extract_sitemap_entries(content) + except (DefusedElementTree.ParseError, ValueError) as exc: + message = f"サイトマップXMLの解析に失敗しました: {sitemap_url}" + raise RuntimeError(message) from exc + + def fetch_sitemap_urls(sitemap_url: str) -> list[str]: """サイトマップからURL一覧を取得する。 @@ -291,7 +352,6 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: Returns: list[str]: 取得したURL一覧 """ - ensure_https_url(sitemap_url) pending_sitemaps = [sitemap_url] visited_sitemaps: set[str] = set() seen_urls: set[str] = set() @@ -304,26 +364,13 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: current_url = pending_sitemaps.pop(0) if current_url in visited_sitemaps: continue - ensure_https_url(current_url) visited_sitemaps.add(current_url) - try: - response = requests.get( - current_url, - timeout=30, - verify=True, - headers={"User-Agent": USER_AGENT}, - ) - response.raise_for_status() - except (requests.RequestException, ValueError) as exc: - message = f"サイトマップの取得に失敗しました: {current_url}" - raise RuntimeError(message) from exc - content = decode_sitemap_content(response.content, current_url) - try: - urls, sitemap_urls = extract_sitemap_entries(content) - except (DefusedElementTree.ParseError, ValueError) as exc: - message = f"サイトマップXMLの解析に失敗しました: {current_url}" - raise RuntimeError(message) from exc + content = fetch_sitemap_content(current_url) + urls, sitemap_urls = parse_sitemap_content(content, current_url) for url in urls: + if not is_https_url(url): + print(f"HTTPS以外のURLをスキップしました: {url}") + continue if url not in seen_urls: seen_urls.add(url) collected_urls.append(url) From 2b2ee1b55d36d86741f36516288af6f7e3a45081 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:43:56 +0000 Subject: [PATCH 09/24] Adjust sitemap parsing loop Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 8a48a6d..ae89c1f 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -281,14 +281,14 @@ def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: if root.tag.endswith("urlset"): urls = [ - loc.text.strip() + loc.text for loc in root.findall(f".//{namespace}url/{namespace}loc") if loc.text ] return urls, [] if root.tag.endswith("sitemapindex"): sitemap_urls = [ - loc.text.strip() + loc.text for loc in root.findall(f".//{namespace}sitemap/{namespace}loc") if loc.text ] @@ -358,12 +358,12 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: collected_urls: list[str] = [] while pending_sitemaps: - if len(visited_sitemaps) >= MAX_SITEMAP_COUNT: - message = "サイトマップ取得数が上限を超えたため処理を中断します。" - raise RuntimeError(message) current_url = pending_sitemaps.pop(0) if current_url in visited_sitemaps: continue + if len(visited_sitemaps) >= MAX_SITEMAP_COUNT: + message = "サイトマップ取得数が上限を超えたため処理を中断します。" + raise RuntimeError(message) visited_sitemaps.add(current_url) content = fetch_sitemap_content(current_url) urls, sitemap_urls = parse_sitemap_content(content, current_url) From 7cd8f1189ef19be1e5d548e663963337720e6440 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:44:32 +0000 Subject: [PATCH 10/24] Handle sitemap decode errors Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index ae89c1f..f1e3be2 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -233,7 +233,11 @@ def decode_sitemap_content(content: bytes, url: str) -> bytes: bytes: デコード済みのSitemap """ if url.lower().endswith(".gz"): - return gzip.decompress(content) + try: + return gzip.decompress(content) + except OSError as exc: + message = f"サイトマップの解凍に失敗しました: {url}" + raise RuntimeError(message) from exc return content @@ -338,7 +342,7 @@ def parse_sitemap_content( """ try: return extract_sitemap_entries(content) - except (DefusedElementTree.ParseError, ValueError) as exc: + except (DefusedElementTree.ParseError, ValueError, UnicodeError) as exc: message = f"サイトマップXMLの解析に失敗しました: {sitemap_url}" raise RuntimeError(message) from exc From 12722b0947055575da73cbb6d78e49bde191acb3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:45:07 +0000 Subject: [PATCH 11/24] Document sitemap constants Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index f1e3be2..5c2f395 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -33,8 +33,8 @@ FIRESTORE_BATCH_LIMIT = 500 INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) -MAX_SITEMAP_COUNT = 100 -USER_AGENT = "BloggerRegister/1.0 (sitemap fetcher)" +MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 +USER_AGENT = "BloggerRegister/1.0 (sitemap fetcher)" # サーバー識別用 db = firestore.Client() From 9b773261e76d9a50dc202336118b51e17bdc46ef Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:45:36 +0000 Subject: [PATCH 12/24] Update sitemap user agent Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 5c2f395..c509ded 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -34,7 +34,7 @@ INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 -USER_AGENT = "BloggerRegister/1.0 (sitemap fetcher)" # サーバー識別用 +USER_AGENT = "SitemapIndexer/1.0 (sitemap fetcher)" # サーバー識別用 db = firestore.Client() From d6f416252b9d341550e3095e1a6bad579d8be4cf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 15:46:34 +0000 Subject: [PATCH 13/24] Enhance sitemap handling Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index c509ded..6d1be03 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -9,6 +9,7 @@ import os import smtplib import time +from collections import deque from datetime import UTC, datetime, timedelta from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText @@ -222,7 +223,11 @@ def send_indexing_notification( return success, response.status_code, response.text -def decode_sitemap_content(content: bytes, url: str) -> bytes: +def decode_sitemap_content( + content: bytes, + url: str, + content_encoding: str | None = None, +) -> bytes: """Sitemapコンテンツを必要に応じてデコードする。 Args: @@ -232,6 +237,12 @@ def decode_sitemap_content(content: bytes, url: str) -> bytes: Returns: bytes: デコード済みのSitemap """ + if content_encoding and "gzip" in content_encoding.lower(): + try: + return gzip.decompress(content) + except OSError as exc: + message = f"サイトマップの解凍に失敗しました: {url}" + raise RuntimeError(message) from exc if url.lower().endswith(".gz"): try: return gzip.decompress(content) @@ -324,7 +335,11 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: except (requests.RequestException, ValueError) as exc: message = f"サイトマップの取得に失敗しました: {sitemap_url}" raise RuntimeError(message) from exc - return decode_sitemap_content(response.content, sitemap_url) + return decode_sitemap_content( + response.content, + sitemap_url, + response.headers.get("Content-Encoding"), + ) def parse_sitemap_content( @@ -356,13 +371,13 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: Returns: list[str]: 取得したURL一覧 """ - pending_sitemaps = [sitemap_url] + pending_sitemaps = deque([sitemap_url]) visited_sitemaps: set[str] = set() seen_urls: set[str] = set() collected_urls: list[str] = [] while pending_sitemaps: - current_url = pending_sitemaps.pop(0) + current_url = pending_sitemaps.popleft() if current_url in visited_sitemaps: continue if len(visited_sitemaps) >= MAX_SITEMAP_COUNT: @@ -373,7 +388,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: urls, sitemap_urls = parse_sitemap_content(content, current_url) for url in urls: if not is_https_url(url): - print(f"HTTPS以外のURLをスキップしました: {url}") + print(f"警告: HTTPS以外のURLをスキップしました: {url}") continue if url not in seen_urls: seen_urls.add(url) From 1573240df2b3d8697ac3e1ba54e5b49468e10508 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:28:48 +0000 Subject: [PATCH 14/24] Address sitemap review feedback Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- README.md | 2 ++ blogger_register/blogger_register.py | 44 +++++++++++++++------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index c74269f..ade1a49 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,8 @@ | INITIAL_TIMESTAMP | 新規URL登録時のlast_sent初期値 | Unix epoch (1970年1月1日) | | MIN_NOTIFY_INTERVAL_DAYS | 通知間隔の最小日数。この日数以内に通知されたURLはバッチ対象から除外される(0以下=制限なし) | 0 | | FIRESTORE_BATCH_LIMIT | Firestoreの取得・書き込みバッチ上限 | 500 | +| MAX_SITEMAP_COUNT | サイトマップの取得上限(循環参照対策) | 100 | +| USER_AGENT | サイトマップ取得時のUser-Agent(Noneで送信しない) | None | ## 入出力 diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 6d1be03..2c821fc 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -35,7 +35,7 @@ INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 -USER_AGENT = "SitemapIndexer/1.0 (sitemap fetcher)" # サーバー識別用 +USER_AGENT = None db = firestore.Client() @@ -252,37 +252,37 @@ def decode_sitemap_content( return content -def ensure_https_url(url: str) -> None: - """Sitemap URLがHTTPSかを検証する。 +def is_https_url(url: str) -> bool: + """URLがHTTPSかどうかを判定する。 Args: - url (str): 検証対象のURL + url (str): 判定対象のURL - Raises: - ValueError: HTTPSではない場合 + Returns: + bool: HTTPSならTrue """ parsed = urlparse(url) - if parsed.scheme.lower() != "https": - message = f"HTTPS以外のサイトマップURLは許可されていません: {url}" - raise ValueError(message) + return parsed.scheme.lower() == "https" -def is_https_url(url: str) -> bool: - """URLがHTTPSかどうかを判定する。 +def normalize_sitemap_url(url: str) -> str: + """サイトマップURLを正規化する。 Args: - url (str): 判定対象のURL + url (str): 正規化対象のURL Returns: - bool: HTTPSならTrue + str: 正規化後のURL """ - parsed = urlparse(url) - return parsed.scheme.lower() == "https" + return url.strip() def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: """Sitemap XMLからURLと子Sitemap URLを抽出する。 + - rootが urlset の場合: url/loc から URLリストを取得 + - rootが sitemapindex の場合: sitemap/loc から 子Sitemap URLを取得 + Args: content (bytes): XMLコンテンツ @@ -296,14 +296,14 @@ def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: if root.tag.endswith("urlset"): urls = [ - loc.text + normalize_sitemap_url(loc.text) for loc in root.findall(f".//{namespace}url/{namespace}loc") if loc.text ] return urls, [] if root.tag.endswith("sitemapindex"): sitemap_urls = [ - loc.text + normalize_sitemap_url(loc.text) for loc in root.findall(f".//{namespace}sitemap/{namespace}loc") if loc.text ] @@ -320,13 +320,15 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: Returns: bytes: 取得したコンテンツ """ - ensure_https_url(sitemap_url) + if not sitemap_url: + message = "サイトマップURLが空のため取得できません。" + raise ValueError(message) try: response = requests.get( sitemap_url, timeout=30, verify=certifi.where(), - headers={"User-Agent": USER_AGENT}, + headers={"User-Agent": USER_AGENT} if USER_AGENT else None, ) response.raise_for_status() except requests.SSLError as exc: @@ -371,7 +373,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: Returns: list[str]: 取得したURL一覧 """ - pending_sitemaps = deque([sitemap_url]) + pending_sitemaps = deque([normalize_sitemap_url(sitemap_url)]) visited_sitemaps: set[str] = set() seen_urls: set[str] = set() collected_urls: list[str] = [] @@ -388,7 +390,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: urls, sitemap_urls = parse_sitemap_content(content, current_url) for url in urls: if not is_https_url(url): - print(f"警告: HTTPS以外のURLをスキップしました: {url}") + print(f"警告: HTTPS以外のURLを登録対象外としました: {url}") continue if url not in seen_urls: seen_urls.add(url) From f495d30683da76ecd4e96f71c1c9411d55c9cf47 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:30:14 +0000 Subject: [PATCH 15/24] Clarify sitemap defaults Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 2c821fc..ab8f882 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -35,7 +35,7 @@ INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 -USER_AGENT = None +USER_AGENT = None # Noneの場合はUser-Agentを送信しない db = firestore.Client() @@ -266,13 +266,13 @@ def is_https_url(url: str) -> bool: def normalize_sitemap_url(url: str) -> str: - """サイトマップURLを正規化する。 + """サイトマップURLの前後空白を除去する。 Args: url (str): 正規化対象のURL Returns: - str: 正規化後のURL + str: 前後空白を除去したURL """ return url.strip() @@ -328,7 +328,7 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: sitemap_url, timeout=30, verify=certifi.where(), - headers={"User-Agent": USER_AGENT} if USER_AGENT else None, + headers={"User-Agent": USER_AGENT} if USER_AGENT else {}, ) response.raise_for_status() except requests.SSLError as exc: From 62647489392170524e509e807c5aa58c21b95523 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:31:28 +0000 Subject: [PATCH 16/24] Set default User-Agent Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- README.md | 2 +- blogger_register/blogger_register.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ade1a49..f0a0bd5 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ | MIN_NOTIFY_INTERVAL_DAYS | 通知間隔の最小日数。この日数以内に通知されたURLはバッチ対象から除外される(0以下=制限なし) | 0 | | FIRESTORE_BATCH_LIMIT | Firestoreの取得・書き込みバッチ上限 | 500 | | MAX_SITEMAP_COUNT | サイトマップの取得上限(循環参照対策) | 100 | -| USER_AGENT | サイトマップ取得時のUser-Agent(Noneで送信しない) | None | +| USER_AGENT | サイトマップ取得時のUser-Agent | "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" | ## 入出力 diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index ab8f882..e3362db 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -35,7 +35,9 @@ INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 -USER_AGENT = None # Noneの場合はUser-Agentを送信しない +USER_AGENT = ( + "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" +) # Noneの場合はUser-Agentを送信しない db = firestore.Client() @@ -323,6 +325,8 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: if not sitemap_url: message = "サイトマップURLが空のため取得できません。" raise ValueError(message) + if not is_https_url(sitemap_url): + print(f"警告: HTTPS以外のサイトマップURLを取得します: {sitemap_url}") try: response = requests.get( sitemap_url, From 8044b098c7d6732254daad78f9209ca944340310 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:32:08 +0000 Subject: [PATCH 17/24] Refine sitemap normalization Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index e3362db..63b3188 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -35,9 +35,7 @@ INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 -USER_AGENT = ( - "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" -) # Noneの場合はUser-Agentを送信しない +USER_AGENT = "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" db = firestore.Client() @@ -275,8 +273,15 @@ def normalize_sitemap_url(url: str) -> str: Returns: str: 前後空白を除去したURL + + Raises: + ValueError: 空文字の場合 """ - return url.strip() + normalized = url.strip() + if not normalized: + message = "サイトマップURLが空のため取得できません。" + raise ValueError(message) + return normalized def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: @@ -322,14 +327,12 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: Returns: bytes: 取得したコンテンツ """ - if not sitemap_url: - message = "サイトマップURLが空のため取得できません。" - raise ValueError(message) - if not is_https_url(sitemap_url): - print(f"警告: HTTPS以外のサイトマップURLを取得します: {sitemap_url}") + normalized_url = normalize_sitemap_url(sitemap_url) + if not is_https_url(normalized_url): + print(f"警告: HTTPS以外のサイトマップURLを取得します: {normalized_url}") try: response = requests.get( - sitemap_url, + normalized_url, timeout=30, verify=certifi.where(), headers={"User-Agent": USER_AGENT} if USER_AGENT else {}, From 973fccf0ef09c9f9a030646f9b2db03e20297f34 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:32:41 +0000 Subject: [PATCH 18/24] Harden sitemap entry parsing Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 63b3188..706d9f8 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -302,18 +302,24 @@ def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: namespace = root.tag.partition("}")[0] + "}" if root.tag.endswith("urlset"): - urls = [ - normalize_sitemap_url(loc.text) - for loc in root.findall(f".//{namespace}url/{namespace}loc") - if loc.text - ] + urls: list[str] = [] + for loc in root.findall(f".//{namespace}url/{namespace}loc"): + if not loc.text: + continue + try: + urls.append(normalize_sitemap_url(loc.text)) + except ValueError: + continue return urls, [] if root.tag.endswith("sitemapindex"): - sitemap_urls = [ - normalize_sitemap_url(loc.text) - for loc in root.findall(f".//{namespace}sitemap/{namespace}loc") - if loc.text - ] + sitemap_urls: list[str] = [] + for loc in root.findall(f".//{namespace}sitemap/{namespace}loc"): + if not loc.text: + continue + try: + sitemap_urls.append(normalize_sitemap_url(loc.text)) + except ValueError: + continue return [], sitemap_urls return [], [] @@ -335,7 +341,7 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: normalized_url, timeout=30, verify=certifi.where(), - headers={"User-Agent": USER_AGENT} if USER_AGENT else {}, + headers={"User-Agent": USER_AGENT}, ) response.raise_for_status() except requests.SSLError as exc: From bbcd74b6bbd2cd78ab6c5e6535c4a779f1536707 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:33:11 +0000 Subject: [PATCH 19/24] Clarify sitemap docstrings Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 706d9f8..76ab091 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -266,7 +266,7 @@ def is_https_url(url: str) -> bool: def normalize_sitemap_url(url: str) -> str: - """サイトマップURLの前後空白を除去する。 + """サイトマップURLの前後空白を除去し、空なら例外を投げる。 Args: url (str): 正規化対象のURL @@ -287,8 +287,9 @@ def normalize_sitemap_url(url: str) -> str: def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: """Sitemap XMLからURLと子Sitemap URLを抽出する。 - - rootが urlset の場合: url/loc から URLリストを取得 - - rootが sitemapindex の場合: sitemap/loc から 子Sitemap URLを取得 + - rootが urlset の場合: url/loc からURLリストを取得 + - rootが sitemapindex の場合: sitemap/loc から子Sitemap URLを取得 + - URLは前後空白を除去し、空の値は除外する Args: content (bytes): XMLコンテンツ From c791a06b2c7e2e074ebf788f42fc93027148b136 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 1 Feb 2026 16:34:00 +0000 Subject: [PATCH 20/24] Use module logger Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 76ab091..3ba9975 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -6,6 +6,7 @@ import base64 import gzip +import logging import os import smtplib import time @@ -37,6 +38,7 @@ MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 USER_AGENT = "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" +logger = logging.getLogger(__name__) db = firestore.Client() @@ -268,6 +270,8 @@ def is_https_url(url: str) -> bool: def normalize_sitemap_url(url: str) -> str: """サイトマップURLの前後空白を除去し、空なら例外を投げる。 + HTTPSの検証は呼び出し元で行う。 + Args: url (str): 正規化対象のURL @@ -336,7 +340,7 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: """ normalized_url = normalize_sitemap_url(sitemap_url) if not is_https_url(normalized_url): - print(f"警告: HTTPS以外のサイトマップURLを取得します: {normalized_url}") + logger.warning("HTTPS以外のサイトマップURLを取得します: %s", normalized_url) try: response = requests.get( normalized_url, @@ -404,7 +408,7 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: urls, sitemap_urls = parse_sitemap_content(content, current_url) for url in urls: if not is_https_url(url): - print(f"警告: HTTPS以外のURLを登録対象外としました: {url}") + logger.warning("HTTPS以外のURLを登録対象外としました: %s", url) continue if url not in seen_urls: seen_urls.add(url) From 28b16cdf6919705dadb7c97f86ce5a48f42a51e0 Mon Sep 17 00:00:00 2001 From: Shinoryo Date: Mon, 2 Feb 2026 09:30:12 +0900 Subject: [PATCH 21/24] Add defusedxml license file and update README for User-Agent and license details --- LICENSES/defusedxml==0.7.1 | 48 ++++++++++++++++++++++++++++++++++++++ README.md | 4 ++-- 2 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 LICENSES/defusedxml==0.7.1 diff --git a/LICENSES/defusedxml==0.7.1 b/LICENSES/defusedxml==0.7.1 new file mode 100644 index 0000000..96a7f9b --- /dev/null +++ b/LICENSES/defusedxml==0.7.1 @@ -0,0 +1,48 @@ +PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2 +-------------------------------------------- + +1. This LICENSE AGREEMENT is between the Python Software Foundation +("PSF"), and the Individual or Organization ("Licensee") accessing and +otherwise using this software ("Python") in source or binary form and +its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, PSF +hereby grants Licensee a nonexclusive, royalty-free, world-wide +license to reproduce, analyze, test, perform and/or display publicly, +prepare derivative works, distribute, and otherwise use Python +alone or in any derivative version, provided, however, that PSF's +License Agreement and PSF's notice of copyright, i.e., "Copyright (c) +2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 Python Software Foundation; +All Rights Reserved" are retained in Python alone or in any derivative +version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. diff --git a/README.md b/README.md index f0a0bd5..2d57164 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ | MIN_NOTIFY_INTERVAL_DAYS | 通知間隔の最小日数。この日数以内に通知されたURLはバッチ対象から除外される(0以下=制限なし) | 0 | | FIRESTORE_BATCH_LIMIT | Firestoreの取得・書き込みバッチ上限 | 500 | | MAX_SITEMAP_COUNT | サイトマップの取得上限(循環参照対策) | 100 | -| USER_AGENT | サイトマップ取得時のUser-Agent | "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" | +| USER_AGENT | サイトマップ取得時のUser-Agent | "SitemapIndexer/1.0 ()" | ## 入出力 @@ -119,7 +119,7 @@ python blogger_register/blogger_register.py | cachetools | 5.5.2 | Apache License 2.0 | | certifi | 2025.6.15 | Mozilla Public License 2.0 | | charset-normalizer | 3.4.2 | MIT License | -| defusedxml | 0.7.1 | PSF License | +| defusedxml | 0.7.1 | Python Software Foundation License 2.0 | | google-api-core | 2.25.1 | Apache License 2.0 | | google-api-python-client | 2.175.0 | Apache License 2.0 | | google-auth | 2.40.3 | Apache License 2.0 | From b036beacc7b12a990d628d832a791494f4676ea5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Feb 2026 00:34:59 +0000 Subject: [PATCH 22/24] Adjust sitemap normalization and logging Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- README.md | 2 -- blogger_register/blogger_register.py | 37 +++++++++++++++++----------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 2d57164..0272a06 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,6 @@ | INITIAL_TIMESTAMP | 新規URL登録時のlast_sent初期値 | Unix epoch (1970年1月1日) | | MIN_NOTIFY_INTERVAL_DAYS | 通知間隔の最小日数。この日数以内に通知されたURLはバッチ対象から除外される(0以下=制限なし) | 0 | | FIRESTORE_BATCH_LIMIT | Firestoreの取得・書き込みバッチ上限 | 500 | -| MAX_SITEMAP_COUNT | サイトマップの取得上限(循環参照対策) | 100 | -| USER_AGENT | サイトマップ取得時のUser-Agent | "SitemapIndexer/1.0 ()" | ## 入出力 diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 3ba9975..15ba664 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -6,7 +6,6 @@ import base64 import gzip -import logging import os import smtplib import time @@ -15,7 +14,7 @@ from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from typing import Any, TypedDict -from urllib.parse import urlparse +from urllib.parse import urlparse, urlunparse import certifi import google.auth @@ -35,10 +34,7 @@ FIRESTORE_BATCH_LIMIT = 500 INITIAL_TIMESTAMP = datetime(1970, 1, 1, tzinfo=UTC) # 新規URL用の初期タイムスタンプ MIN_NOTIFY_INTERVAL_DAYS: int = 0 # 通知間隔の最小日数(0以下=制限なし) -MAX_SITEMAP_COUNT = 100 # 過剰なサイトマップ循環取得を防ぐ上限 -USER_AGENT = "SitemapIndexer/1.0 (https://github.com/Shinoryo/BloggerRegister)" -logger = logging.getLogger(__name__) db = firestore.Client() @@ -268,15 +264,17 @@ def is_https_url(url: str) -> bool: def normalize_sitemap_url(url: str) -> str: - """サイトマップURLの前後空白を除去し、空なら例外を投げる。 + """URLの空白除去と正規化を行う。 - HTTPSの検証は呼び出し元で行う。 + - 前後空白の除去 + - クエリパラメータとフラグメントの除外 + - 末尾の/を統一 (ルート以外は除外) Args: url (str): 正規化対象のURL Returns: - str: 前後空白を除去したURL + str: 正規化後のURL Raises: ValueError: 空文字の場合 @@ -285,7 +283,20 @@ def normalize_sitemap_url(url: str) -> str: if not normalized: message = "サイトマップURLが空のため取得できません。" raise ValueError(message) - return normalized + parsed = urlparse(normalized) + path = parsed.path.rstrip("/") + if not path: + path = "/" + return urlunparse( + ( + parsed.scheme, + parsed.netloc, + path, + parsed.params, + "", + "", + ), + ) def extract_sitemap_entries(content: bytes) -> tuple[list[str], list[str]]: @@ -340,13 +351,12 @@ def fetch_sitemap_content(sitemap_url: str) -> bytes: """ normalized_url = normalize_sitemap_url(sitemap_url) if not is_https_url(normalized_url): - logger.warning("HTTPS以外のサイトマップURLを取得します: %s", normalized_url) + print(f"警告: HTTPS以外のサイトマップURLを取得します: {normalized_url}") try: response = requests.get( normalized_url, timeout=30, verify=certifi.where(), - headers={"User-Agent": USER_AGENT}, ) response.raise_for_status() except requests.SSLError as exc: @@ -400,15 +410,12 @@ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: current_url = pending_sitemaps.popleft() if current_url in visited_sitemaps: continue - if len(visited_sitemaps) >= MAX_SITEMAP_COUNT: - message = "サイトマップ取得数が上限を超えたため処理を中断します。" - raise RuntimeError(message) visited_sitemaps.add(current_url) content = fetch_sitemap_content(current_url) urls, sitemap_urls = parse_sitemap_content(content, current_url) for url in urls: if not is_https_url(url): - logger.warning("HTTPS以外のURLを登録対象外としました: %s", url) + print(f"警告: HTTPS以外のURLを登録対象外としました: {url}") continue if url not in seen_urls: seen_urls.add(url) From 8ad5c884c5ee5213cdf7d65415691c1a915c92ab Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Feb 2026 00:36:07 +0000 Subject: [PATCH 23/24] Clarify URL normalization rationale Co-authored-by: Shinoryo <78910349+Shinoryo@users.noreply.github.com> --- blogger_register/blogger_register.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 15ba664..465fb01 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -268,7 +268,7 @@ def normalize_sitemap_url(url: str) -> str: - 前後空白の除去 - クエリパラメータとフラグメントの除外 - - 末尾の/を統一 (ルート以外は除外) + - 末尾の/を統一 (ルート以外は除外して重複を抑制) Args: url (str): 正規化対象のURL From e3d9dea52d4039f2ad4e04e2631286c4067c2c7a Mon Sep 17 00:00:00 2001 From: Shinoryo <78910349+Shinoryo@users.noreply.github.com> Date: Mon, 2 Feb 2026 09:48:47 +0900 Subject: [PATCH 24/24] Update blogger_register/blogger_register.py Add the content_encoding parameter in the Args section Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- blogger_register/blogger_register.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/blogger_register/blogger_register.py b/blogger_register/blogger_register.py index 465fb01..771c3f9 100644 --- a/blogger_register/blogger_register.py +++ b/blogger_register/blogger_register.py @@ -231,6 +231,8 @@ def decode_sitemap_content( Args: content (bytes): 取得したSitemapのバイト列 url (str): SitemapのURL + content_encoding (str | None): レスポンスヘッダーのContent-Encoding値。 + "gzip"などが指定されている場合はgzipとして解凍を試みる。 Returns: bytes: デコード済みのSitemap