From 811e3106d1a6bd444f1554494f2b6359758fb684 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Dec 2025 04:18:28 +0000 Subject: [PATCH 1/6] Initial plan From b564bb12a4a4593b57f5c54b9f975c76623ee80a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Dec 2025 04:24:31 +0000 Subject: [PATCH 2/6] Add HTML fetching capability for articles Co-authored-by: njt <66792+njt@users.noreply.github.com> --- =4.12.0 | 0 database.py | 56 ++++++++++++++++++++++++++ forcible.py | 51 ++++++++++++++++++++++++ html_fetcher.py | 102 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 5 files changed, 210 insertions(+) create mode 100644 =4.12.0 create mode 100644 html_fetcher.py diff --git a/=4.12.0 b/=4.12.0 new file mode 100644 index 0000000..e69de29 diff --git a/database.py b/database.py index afc59f7..ded397f 100644 --- a/database.py +++ b/database.py @@ -36,6 +36,7 @@ def _init_schema(self): published_date TEXT, fetched_date TEXT NOT NULL, content TEXT, + raw_html TEXT, data TEXT, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP @@ -140,6 +141,61 @@ def update_article_data(self, article_id: int, data: Dict[str, Any]): self.conn.commit() + def update_article_html(self, article_id: int, raw_html: str): + """ + Update article raw HTML content. + + Args: + article_id: Article ID + raw_html: Raw HTML content of the article + """ + cursor = self.conn.cursor() + updated_at = datetime.now(UTC).isoformat() + + cursor.execute(''' + UPDATE articles + SET raw_html = ?, updated_at = ? + WHERE id = ? + ''', (raw_html, updated_at, article_id)) + + self.conn.commit() + + def get_articles_without_html(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Get articles that don't have raw HTML fetched yet. + + Args: + limit: Maximum number of articles to return (optional) + + Returns: + List of article dictionaries + """ + cursor = self.conn.cursor() + + query = ''' + SELECT * FROM articles + WHERE raw_html IS NULL + ORDER BY published_date DESC + ''' + + if limit: + query += ' LIMIT ?' + cursor.execute(query, (limit,)) + else: + cursor.execute(query) + + articles = [] + for row in cursor.fetchall(): + article = dict(row) + if article['data']: + try: + article['data'] = json.loads(article['data']) + except json.JSONDecodeError: + article['data'] = None + articles.append(article) + + return articles + def get_last_scrape_time(self, source_name: str) -> Optional[str]: """ Get the last scrape time for a source. diff --git a/forcible.py b/forcible.py index 79804e0..dd28995 100755 --- a/forcible.py +++ b/forcible.py @@ -14,6 +14,7 @@ from database import Database from rnz_ingester import RNZIngester from llm_processor import LLMProcessor +from html_fetcher import HTMLFetcher def cmd_fetch(args): @@ -43,6 +44,47 @@ def cmd_fetch(args): sys.exit(1) +def cmd_fetch_html(args): + """Fetch full HTML content for articles.""" + try: + config = Config(args.config) + db = Database(config.get_database_path()) + + fetcher = HTMLFetcher(db) + + # Get articles without HTML + articles_to_fetch = db.get_articles_without_html(limit=args.limit) + + if not articles_to_fetch: + print("No articles need HTML fetching.") + db.close() + return + + print(f"Fetching HTML for {len(articles_to_fetch)} article(s)...\n") + + # Progress callback + def progress_callback(current, total, headline): + print(f"[{current}/{total}] Fetching: {headline[:60]}...") + + # Fetch HTML + success_count = fetcher.fetch_all_missing_html( + limit=args.limit, + progress_callback=progress_callback + ) + + db.close() + print(f"\nFetch complete! Successfully fetched {success_count} article(s).") + + except FileNotFoundError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error during HTML fetch: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + + def cmd_list(args): """List articles from the database.""" try: @@ -345,6 +387,15 @@ def main(): ) parser_fetch.set_defaults(func=cmd_fetch) + # fetch-html command + parser_fetch_html = subparsers.add_parser('fetch-html', help='Fetch full HTML content for articles') + parser_fetch_html.add_argument( + '--limit', + type=int, + help='Maximum number of articles to fetch (default: all without HTML)' + ) + parser_fetch_html.set_defaults(func=cmd_fetch_html) + # list command parser_list = subparsers.add_parser('list', help='List articles') parser_list.add_argument('--source', help='Filter by source') diff --git a/html_fetcher.py b/html_fetcher.py new file mode 100644 index 0000000..c762022 --- /dev/null +++ b/html_fetcher.py @@ -0,0 +1,102 @@ +""" +HTML content fetcher for news articles. +""" +import requests +from typing import Optional +from bs4 import BeautifulSoup + + +class HTMLFetcher: + """Handles fetching and storing raw HTML content from article URLs.""" + + def __init__(self, database): + """ + Initialize the HTML fetcher. + + Args: + database: Database instance + """ + self.db = database + self.headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + + def fetch_html(self, url: str) -> Optional[str]: + """ + Fetch raw HTML content from a URL. + + Args: + url: Article URL + + Returns: + Raw HTML content or None if fetch fails + """ + try: + response = requests.get(url, headers=self.headers, timeout=30) + response.raise_for_status() + + # Parse HTML to remove script and style tags + soup = BeautifulSoup(response.text, 'html.parser') + + # Remove script and style elements + for script in soup(['script', 'style']): + script.decompose() + + # Return the cleaned HTML as string + return str(soup) + + except requests.RequestException as e: + print(f"Error fetching HTML from {url}: {e}") + return None + except Exception as e: + print(f"Unexpected error fetching HTML from {url}: {e}") + return None + + def fetch_article_html(self, article_id: int, url: str) -> bool: + """ + Fetch HTML for a specific article and store it in the database. + + Args: + article_id: Article ID + url: Article URL + + Returns: + True if successful, False otherwise + """ + html = self.fetch_html(url) + if html: + self.db.update_article_html(article_id, html) + return True + return False + + def fetch_all_missing_html(self, limit: Optional[int] = None, progress_callback: Optional[callable] = None) -> int: + """ + Fetch HTML for all articles that don't have it yet. + + Args: + limit: Maximum number of articles to process (optional) + progress_callback: Optional callback function for progress updates + + Returns: + Number of articles successfully fetched + """ + articles = self.db.get_articles_without_html(limit=limit) + + if not articles: + return 0 + + success_count = 0 + total = len(articles) + + for i, article in enumerate(articles): + article_id = article['id'] + url = article['url'] + headline = article['headline'] + + if progress_callback: + progress_callback(i + 1, total, headline) + + if self.fetch_article_html(article_id, url): + success_count += 1 + + return success_count diff --git a/requirements.txt b/requirements.txt index cf850e3..89e1d49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ requests>=2.31.0 openai>=1.3.0 python-dateutil>=2.8.2 pydantic>=2.0.0 +beautifulsoup4>=4.12.0 From c81b0298737400fad326ee01ba6e60154ffb2ea5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Dec 2025 04:26:17 +0000 Subject: [PATCH 3/6] Update README and remove unwanted file Co-authored-by: njt <66792+njt@users.noreply.github.com> --- =4.12.0 | 0 README.md | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) delete mode 100644 =4.12.0 diff --git a/=4.12.0 b/=4.12.0 deleted file mode 100644 index e69de29..0000000 diff --git a/README.md b/README.md index 447b788..2f8d486 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,22 @@ Fetch only from Radio New Zealand: python forcible.py fetch --source rnz ``` +### Fetch Full Article HTML + +Fetch full HTML content for articles that don't have it yet: + +```bash +python forcible.py fetch-html +``` + +Fetch HTML for a limited number of articles: + +```bash +python forcible.py fetch-html --limit 10 +``` + +This command fetches the raw HTML content from article URLs and stores it in the database. The HTML is cleaned of script and style tags but preserves the article structure for later processing. + ### List Articles List recent articles: @@ -150,6 +166,7 @@ This displays the article headline, content, and structured LLM analysis includi - **config.py**: Configuration management (supports both INI and JSON formats) - **database.py**: SQLite database interface for storing articles - **rnz_ingester.py**: Radio New Zealand RSS feed ingester +- **html_fetcher.py**: HTML content fetcher for retrieving full article content - **llm_processor.py**: LLM-based article analysis with structured outputs - **forcible.py**: Command-line interface @@ -162,7 +179,8 @@ This displays the article headline, content, and structured LLM analysis includi - `headline`: Article headline - `published_date`: Publication date (ISO format) - `fetched_date`: Date fetched from source -- `content`: Article content/summary +- `content`: Article content/summary from RSS feed +- `raw_html`: Full raw HTML content fetched from article URL - `data`: JSON field for LLM analysis results (facts, relevance, PR probability, etc.) - `created_at`: Record creation timestamp - `updated_at`: Last update timestamp From d8eff5aef57a995c1215d8047a3746c1c61dfb73 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Dec 2025 04:29:04 +0000 Subject: [PATCH 4/6] Address code review feedback - update user agent and move imports Co-authored-by: njt <66792+njt@users.noreply.github.com> --- forcible.py | 3 +-- html_fetcher.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/forcible.py b/forcible.py index dd28995..d7c482d 100755 --- a/forcible.py +++ b/forcible.py @@ -7,6 +7,7 @@ import argparse import sys import json +import traceback from pathlib import Path from datetime import datetime, UTC @@ -80,7 +81,6 @@ def progress_callback(current, total, headline): sys.exit(1) except Exception as e: print(f"Error during HTML fetch: {e}", file=sys.stderr) - import traceback traceback.print_exc() sys.exit(1) @@ -286,7 +286,6 @@ def progress_callback(current, total, headline): sys.exit(1) except Exception as e: print(f"Error during processing: {e}", file=sys.stderr) - import traceback traceback.print_exc() sys.exit(1) diff --git a/html_fetcher.py b/html_fetcher.py index c762022..19158cd 100644 --- a/html_fetcher.py +++ b/html_fetcher.py @@ -18,7 +18,7 @@ def __init__(self, database): """ self.db = database self.headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' } def fetch_html(self, url: str) -> Optional[str]: @@ -32,7 +32,7 @@ def fetch_html(self, url: str) -> Optional[str]: Raw HTML content or None if fetch fails """ try: - response = requests.get(url, headers=self.headers, timeout=30) + response = requests.get(url, headers=self.headers, timeout=30, allow_redirects=True) response.raise_for_status() # Parse HTML to remove script and style tags From d974c7992cbc39d31ddad930270350d04dc555ad Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Dec 2025 07:07:16 +0000 Subject: [PATCH 5/6] Extract article content instead of raw HTML to minimize tokens Co-authored-by: njt <66792+njt@users.noreply.github.com> --- README.md | 4 +-- html_fetcher.py | 78 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2f8d486..238ff78 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ Fetch HTML for a limited number of articles: python forcible.py fetch-html --limit 10 ``` -This command fetches the raw HTML content from article URLs and stores it in the database. The HTML is cleaned of script and style tags but preserves the article structure for later processing. +This command fetches article content from URLs and extracts only the essential text (paragraphs, headings, links) to minimize token usage for LLM processing. Navigation, ads, and other non-content elements are removed. ### List Articles @@ -180,7 +180,7 @@ This displays the article headline, content, and structured LLM analysis includi - `published_date`: Publication date (ISO format) - `fetched_date`: Date fetched from source - `content`: Article content/summary from RSS feed -- `raw_html`: Full raw HTML content fetched from article URL +- `raw_html`: Extracted article content (text, headings, links) from article URL - `data`: JSON field for LLM analysis results (facts, relevance, PR probability, etc.) - `created_at`: Record creation timestamp - `updated_at`: Last update timestamp diff --git a/html_fetcher.py b/html_fetcher.py index 19158cd..8267624 100644 --- a/html_fetcher.py +++ b/html_fetcher.py @@ -23,27 +23,89 @@ def __init__(self, database): def fetch_html(self, url: str) -> Optional[str]: """ - Fetch raw HTML content from a URL. + Fetch and extract article content from a URL. + + Extracts only the main article content (paragraphs, headings, links) + to minimize token usage for LLM processing. Args: url: Article URL Returns: - Raw HTML content or None if fetch fails + Extracted article content or None if fetch fails """ try: response = requests.get(url, headers=self.headers, timeout=30, allow_redirects=True) response.raise_for_status() - # Parse HTML to remove script and style tags + # Parse HTML soup = BeautifulSoup(response.text, 'html.parser') - # Remove script and style elements - for script in soup(['script', 'style']): - script.decompose() + # Remove unwanted elements + for element in soup(['script', 'style', 'nav', 'header', 'footer', + 'aside', 'iframe', 'noscript', 'form']): + element.decompose() + + # Try to find the main article content + # Common article containers + article_content = None + for selector in ['article', 'main', '[role="main"]', '.article-content', + '.post-content', '.entry-content', '#content']: + article_content = soup.select_one(selector) + if article_content: + break + + # If no article container found, use body + if not article_content: + article_content = soup.body if soup.body else soup + + # Extract text content with structure + extracted = [] + + # Extract title if available + title = soup.find('h1') + if title: + extracted.append(f"# {title.get_text(strip=True)}\n") + + # Extract paragraphs, headings, and lists from the article content + for element in article_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'blockquote']): + text = element.get_text(strip=True) + if text: # Only include non-empty elements + tag = element.name + + if tag == 'p': + extracted.append(text) + elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + level = int(tag[1]) + extracted.append(f"\n{'#' * level} {text}\n") + elif tag in ['ul', 'ol']: + # Extract list items + for li in element.find_all('li', recursive=False): + li_text = li.get_text(strip=True) + if li_text: + extracted.append(f"- {li_text}") + elif tag == 'blockquote': + extracted.append(f"> {text}") + + extracted.append("") # Add blank line between elements + + # Extract external links + links = [] + for a in article_content.find_all('a', href=True): + href = a['href'] + link_text = a.get_text(strip=True) + # Only include external links (http/https) + if href.startswith('http') and link_text: + links.append(f"[{link_text}]({href})") + + # Combine content + content = "\n".join(extracted).strip() + + # Add links section if there are external links + if links: + content += "\n\n## External Links\n" + "\n".join(links) - # Return the cleaned HTML as string - return str(soup) + return content if content else None except requests.RequestException as e: print(f"Error fetching HTML from {url}: {e}") From 74f541e30d3a98e915c754f231aac81b45ab4791 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 14 Dec 2025 07:10:34 +0000 Subject: [PATCH 6/6] Fix code review issues: update docstring, prevent duplicate h1, clean blank lines Co-authored-by: njt <66792+njt@users.noreply.github.com> --- html_fetcher.py | 66 +++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/html_fetcher.py b/html_fetcher.py index 8267624..528aa01 100644 --- a/html_fetcher.py +++ b/html_fetcher.py @@ -7,7 +7,11 @@ class HTMLFetcher: - """Handles fetching and storing raw HTML content from article URLs.""" + """Handles fetching and extracting article content from URLs. + + Extracts essential content (text, headings, links) while removing + navigation, ads, and other non-content elements to minimize token usage. + """ def __init__(self, database): """ @@ -61,33 +65,43 @@ def fetch_html(self, url: str) -> Optional[str]: # Extract text content with structure extracted = [] - - # Extract title if available - title = soup.find('h1') - if title: - extracted.append(f"# {title.get_text(strip=True)}\n") + seen_title = False # Extract paragraphs, headings, and lists from the article content for element in article_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'blockquote']): text = element.get_text(strip=True) - if text: # Only include non-empty elements - tag = element.name - - if tag == 'p': - extracted.append(text) - elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - level = int(tag[1]) - extracted.append(f"\n{'#' * level} {text}\n") - elif tag in ['ul', 'ol']: - # Extract list items - for li in element.find_all('li', recursive=False): - li_text = li.get_text(strip=True) - if li_text: - extracted.append(f"- {li_text}") - elif tag == 'blockquote': - extracted.append(f"> {text}") + if not text: # Skip empty elements + continue + + tag = element.name + + if tag == 'p': + extracted.append(text) + extracted.append("") # Blank line after paragraph + elif tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + # Skip duplicate h1 (title) if we've already seen one + if tag == 'h1' and seen_title: + continue + if tag == 'h1': + seen_title = True - extracted.append("") # Add blank line between elements + level = int(tag[1]) + # Add blank line before heading (except first one) + if extracted: + extracted.append("") + extracted.append(f"{'#' * level} {text}") + extracted.append("") # Blank line after heading + elif tag in ['ul', 'ol']: + # Extract list items + for li in element.find_all('li', recursive=False): + li_text = li.get_text(strip=True) + if li_text: + extracted.append(f"- {li_text}") + extracted.append("") # Blank line after list + elif tag == 'blockquote': + extracted.append(f"> {text}") + extracted.append("") # Blank line after quote + # Extract external links links = [] @@ -98,9 +112,13 @@ def fetch_html(self, url: str) -> Optional[str]: if href.startswith('http') and link_text: links.append(f"[{link_text}]({href})") - # Combine content + # Combine content and clean up excessive blank lines content = "\n".join(extracted).strip() + # Remove consecutive blank lines (replace multiple \n\n with just \n\n) + while "\n\n\n" in content: + content = content.replace("\n\n\n", "\n\n") + # Add links section if there are external links if links: content += "\n\n## External Links\n" + "\n".join(links)