diff --git a/README.md b/README.md index 447b788..238ff78 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,22 @@ Fetch only from Radio New Zealand: python forcible.py fetch --source rnz ``` +### Fetch Full Article HTML + +Fetch full HTML content for articles that don't have it yet: + +```bash +python forcible.py fetch-html +``` + +Fetch HTML for a limited number of articles: + +```bash +python forcible.py fetch-html --limit 10 +``` + +This command fetches article content from URLs and extracts only the essential text (paragraphs, headings, links) to minimize token usage for LLM processing. Navigation, ads, and other non-content elements are removed. + ### List Articles List recent articles: @@ -150,6 +166,7 @@ This displays the article headline, content, and structured LLM analysis includi - **config.py**: Configuration management (supports both INI and JSON formats) - **database.py**: SQLite database interface for storing articles - **rnz_ingester.py**: Radio New Zealand RSS feed ingester +- **html_fetcher.py**: HTML content fetcher for retrieving full article content - **llm_processor.py**: LLM-based article analysis with structured outputs - **forcible.py**: Command-line interface @@ -162,7 +179,8 @@ This displays the article headline, content, and structured LLM analysis includi - `headline`: Article headline - `published_date`: Publication date (ISO format) - `fetched_date`: Date fetched from source -- `content`: Article content/summary +- `content`: Article content/summary from RSS feed +- `raw_html`: Extracted article content (text, headings, links) from article URL - `data`: JSON field for LLM analysis results (facts, relevance, PR probability, etc.) 
- `created_at`: Record creation timestamp - `updated_at`: Last update timestamp diff --git a/database.py b/database.py index afc59f7..ded397f 100644 --- a/database.py +++ b/database.py @@ -36,6 +36,7 @@ def _init_schema(self): published_date TEXT, fetched_date TEXT NOT NULL, content TEXT, + raw_html TEXT, data TEXT, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP @@ -140,6 +141,61 @@ def update_article_data(self, article_id: int, data: Dict[str, Any]): self.conn.commit() + def update_article_html(self, article_id: int, raw_html: str): + """ + Update article raw HTML content. + + Args: + article_id: Article ID + raw_html: Raw HTML content of the article + """ + cursor = self.conn.cursor() + updated_at = datetime.now(UTC).isoformat() + + cursor.execute(''' + UPDATE articles + SET raw_html = ?, updated_at = ? + WHERE id = ? + ''', (raw_html, updated_at, article_id)) + + self.conn.commit() + + def get_articles_without_html(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: + """ + Get articles that don't have raw HTML fetched yet. + + Args: + limit: Maximum number of articles to return (optional) + + Returns: + List of article dictionaries + """ + cursor = self.conn.cursor() + + query = ''' + SELECT * FROM articles + WHERE raw_html IS NULL + ORDER BY published_date DESC + ''' + + if limit: + query += ' LIMIT ?' + cursor.execute(query, (limit,)) + else: + cursor.execute(query) + + articles = [] + for row in cursor.fetchall(): + article = dict(row) + if article['data']: + try: + article['data'] = json.loads(article['data']) + except json.JSONDecodeError: + article['data'] = None + articles.append(article) + + return articles + def get_last_scrape_time(self, source_name: str) -> Optional[str]: """ Get the last scrape time for a source. 
diff --git a/forcible.py b/forcible.py index 79804e0..d7c482d 100755 --- a/forcible.py +++ b/forcible.py @@ -7,6 +7,7 @@ import argparse import sys import json +import traceback from pathlib import Path from datetime import datetime, UTC @@ -14,6 +15,7 @@ from database import Database from rnz_ingester import RNZIngester from llm_processor import LLMProcessor +from html_fetcher import HTMLFetcher def cmd_fetch(args): @@ -43,6 +45,46 @@ def cmd_fetch(args): sys.exit(1) +def cmd_fetch_html(args): + """Fetch full HTML content for articles.""" + try: + config = Config(args.config) + db = Database(config.get_database_path()) + + fetcher = HTMLFetcher(db) + + # Get articles without HTML + articles_to_fetch = db.get_articles_without_html(limit=args.limit) + + if not articles_to_fetch: + print("No articles need HTML fetching.") + db.close() + return + + print(f"Fetching HTML for {len(articles_to_fetch)} article(s)...\n") + + # Progress callback + def progress_callback(current, total, headline): + print(f"[{current}/{total}] Fetching: {headline[:60]}...") + + # Fetch HTML + success_count = fetcher.fetch_all_missing_html( + limit=args.limit, + progress_callback=progress_callback + ) + + db.close() + print(f"\nFetch complete! 
class HTMLFetcher:
    """Handles fetching and extracting article content from URLs.

    Extracts essential content (text, headings, lists, quotes, links) as
    Markdown-like text while removing navigation, ads, and other
    non-content elements to minimize token usage.
    """

    # Elements that never carry article text; removed before extraction.
    _NOISE_TAGS = ('script', 'style', 'nav', 'header', 'footer',
                   'aside', 'iframe', 'noscript', 'form')
    # Selectors tried in order to locate the main article container.
    _CONTAINER_SELECTORS = ('article', 'main', '[role="main"]', '.article-content',
                            '.post-content', '.entry-content', '#content')
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    _LIST_TAGS = ('ul', 'ol')
    _BLOCK_TAGS = ('p',) + _HEADING_TAGS + _LIST_TAGS + ('blockquote',)

    def __init__(self, database, timeout: float = 30):
        """
        Initialize the HTML fetcher.

        Args:
            database: Database instance (must provide
                ``get_articles_without_html`` and ``update_article_html``)
            timeout: Seconds to wait for each HTTP request
        """
        self.db = database
        self.timeout = timeout
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }

    @staticmethod
    def _normalize_text(raw: str) -> str:
        """Collapse every run of whitespace in *raw* to a single space."""
        return " ".join(raw.split())

    @staticmethod
    def _collapse_blank_lines(text: str) -> str:
        """Reduce runs of blank lines in *text* to a single blank line."""
        while "\n\n\n" in text:
            text = text.replace("\n\n\n", "\n\n")
        return text

    def _find_article_root(self, soup):
        """Return the most likely article container, falling back to <body>."""
        for selector in self._CONTAINER_SELECTORS:
            node = soup.select_one(selector)
            if node is not None:
                return node
        return soup.body if soup.body else soup

    @classmethod
    def _already_captured(cls, element, root) -> bool:
        """Tell whether an enclosing block below *root* already emits *element*'s text."""
        for ancestor in element.parents:
            if ancestor is root:
                break
            if ancestor.name == 'blockquote':
                return True
            if ancestor.name == 'li':
                owner = ancestor.parent
                # Only direct <li> children of a rendered list are emitted.
                if owner is not None and owner is not root and owner.name in cls._LIST_TAGS:
                    return True
        return False

    def _render_blocks(self, root) -> list:
        """Convert the block-level elements under *root* to Markdown-like lines."""
        lines = []
        seen_title = False

        for element in root.find_all(list(self._BLOCK_TAGS)):
            # find_all() is recursive, so a <p> inside a <blockquote>/<li>
            # (or a nested list) would otherwise be emitted a second time.
            if self._already_captured(element, root):
                continue

            # get_text(strip=True) glues fragments together with no separator
            # ("Hello <a>world</a>" -> "Helloworld"); normalize whitespace on
            # the unstripped text instead.
            text = self._normalize_text(element.get_text())
            if not text:  # Skip empty elements
                continue

            tag = element.name

            if tag == 'p':
                lines.append(text)
                lines.append("")  # Blank line after paragraph
            elif tag in self._HEADING_TAGS:
                if tag == 'h1':
                    # Keep only the first h1 (the title)
                    if seen_title:
                        continue
                    seen_title = True
                # Blank line before heading (except at the very start)
                if lines:
                    lines.append("")
                lines.append(f"{'#' * int(tag[1])} {text}")
                lines.append("")  # Blank line after heading
            elif tag in self._LIST_TAGS:
                for li in element.find_all('li', recursive=False):
                    li_text = self._normalize_text(li.get_text())
                    if li_text:
                        lines.append(f"- {li_text}")
                lines.append("")  # Blank line after list
            else:  # blockquote
                lines.append(f"> {text}")
                lines.append("")  # Blank line after quote

        return lines

    def _collect_links(self, root) -> list:
        """Return unique ``[text](url)`` entries for absolute http(s) links under *root*."""
        links = []
        for a in root.find_all('a', href=True):
            href = a['href'].strip()
            link_text = self._normalize_text(a.get_text())
            # Match the scheme exactly; a bare 'http' prefix would also
            # accept things like 'httpfoo:'.
            if link_text and href.startswith(('http://', 'https://')):
                links.append(f"[{link_text}]({href})")
        # Drop repeated links while keeping first-seen order.
        return list(dict.fromkeys(links))

    def fetch_html(self, url: str) -> Optional[str]:
        """
        Fetch and extract article content from a URL.

        Extracts only the main article content (paragraphs, headings, lists,
        quotes, links) to minimize token usage for LLM processing.

        Args:
            url: Article URL

        Returns:
            Extracted article content or None if the fetch fails or the
            page yields no content
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.timeout, allow_redirects=True)
            response.raise_for_status()

            # When the Content-Type header carries no charset, requests
            # assumes ISO-8859-1 for text/*, which garbles UTF-8 pages.
            # Hand the raw bytes to BeautifulSoup so it can honour a
            # <meta charset>, and only force the header charset if one exists.
            content_type = response.headers.get('content-type', '')
            declared = response.encoding if 'charset' in content_type.lower() else None
            soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared)

            # Remove unwanted elements
            for element in soup(list(self._NOISE_TAGS)):
                element.decompose()

            root = self._find_article_root(soup)

            content = self._collapse_blank_lines(
                "\n".join(self._render_blocks(root)).strip()
            )

            # Add links section if there are external links
            links = self._collect_links(root)
            if links:
                content += "\n\n## External Links\n" + "\n".join(links)

            return content if content else None

        except requests.RequestException as e:
            print(f"Error fetching HTML from {url}: {e}")
            return None
        except Exception as e:
            # Best-effort boundary: one malformed page must not abort a batch.
            print(f"Unexpected error fetching HTML from {url}: {e}")
            return None

    def fetch_article_html(self, article_id: int, url: str) -> bool:
        """
        Fetch content for a specific article and store it in the database.

        Args:
            article_id: Article ID
            url: Article URL

        Returns:
            True if content was extracted and stored, False otherwise
        """
        html = self.fetch_html(url)
        if not html:
            return False
        self.db.update_article_html(article_id, html)
        return True

    def fetch_all_missing_html(self, limit: Optional[int] = None, progress_callback: Optional[callable] = None) -> int:
        """
        Fetch content for all articles that don't have it yet.

        Articles whose fetch fails are left untouched, so they are returned
        again by the next ``get_articles_without_html`` call.

        Args:
            limit: Maximum number of articles to process (optional)
            progress_callback: Optional callback invoked before each fetch as
                ``progress_callback(current, total, headline)`` with a
                1-based ``current``

        Returns:
            Number of articles successfully fetched
        """
        articles = self.db.get_articles_without_html(limit=limit)

        if not articles:
            return 0

        success_count = 0
        total = len(articles)

        for position, article in enumerate(articles, start=1):
            if progress_callback:
                progress_callback(position, total, article['headline'])

            if self.fetch_article_html(article['id'], article['url']):
                success_count += 1

        return success_count