diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..3facbd0
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,35 @@
+# Changelog
+
+## [Unreleased] - 2025-12-06
+
+### Fixed
+- Fixed scraping functionality to work with modern DownDetector website structure
+- Improved bot detection bypass with updated headers and session handling
+- Added backward compatibility with original HTML structure
+
+### Changed
+- Updated User-Agent to modern Chrome version (120.0.0.0)
+- Added comprehensive HTTP headers (Accept, Accept-Language, Sec-Fetch-*, etc.)
+- Implemented session-based requests for better connection handling
+- Added timeout (10 seconds) to prevent hanging requests
+
+### Added
+- Multi-strategy HTML parsing:
+  - Strategy 1: Original selector for backward compatibility
+  - Strategy 2: Direct entry-title class lookup (most common)
+  - Strategy 3: Find elements with "status" or "entry-title" in class names
+  - Strategy 4: Fallback to h1 with status-related keywords
+- Better error messages for different failure scenarios
+- Response status validation with `raise_for_status()`
+- Timeout handling for connection issues
+
+### Technical Details
+
+The scraper now attempts multiple strategies to find the status message, making it more resilient to website changes:
+
+1. **Backward Compatibility**: Tries the original `div#company > div.h2.entry-title` selector first
+2. **Modern Structure**: Looks for `.entry-title` class on h1, h2, h3, or div elements
+3. **Class Pattern Matching**: Searches for elements with "status" or "entry-title" in their class names
+4. **Keyword-Based Fallback**: Uses h1 tags containing status keywords (problem, issue, outage, down, working, reports, no problems)
+
+This multi-layered approach ensures the scraper works with both old and new website structures.
diff --git a/Main.py b/Main.py
index 11c952e..7bbef7a 100644
--- a/Main.py
+++ b/Main.py
@@ -1,4 +1,5 @@
 from src.Scraper import URLInstance, menu, check_connection
+import src.Errors as Errors
 
 # Checking the Internet Connection on Startup
 print("Checking Internet Connection..")
@@ -22,7 +23,12 @@
 option = int(input("Enter the Option Number: "))
 
 if option == 1:
-    instance.get_status()
+    try:
+        instance.get_status()
+    except Errors.InvalidServiceName as e:
+        print(f"Service Error: {e}")
+    except Errors.NetworkError as e:
+        print(f"Network Error: {e}")
 elif option == 2:
     instance.open_url()
 elif option == 3:
diff --git a/README.md b/README.md
index 6eff795..a149fd9 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,16 @@
 
 > *It scrapes the status message from DownDetector's Website and shows it in the terminal. So you don't need to open up the Website for checking the status of a service.*
 
+### Recent Updates (December 2025)
+
+The scraper has been updated to work with the modern DownDetector website structure:
+- ✅ Improved bot detection bypass with modern browser headers
+- ✅ Multi-strategy HTML parsing for resilience against website changes
+- ✅ Better error handling and timeout support
+- ✅ Backward compatible with older website structures
+
+See [CHANGELOG.md](CHANGELOG.md) for detailed changes.
+
 ## CONTRIBUTING
 
 > *For details on how to contribute to this project see [this](https://github.com/aaryanrr/DownDetector-CLI/blob/main/CONTRIBUTING.md)*
diff --git a/src/Errors.py b/src/Errors.py
index f6f97cb..1aa3345 100644
--- a/src/Errors.py
+++ b/src/Errors.py
@@ -3,3 +3,8 @@
 class InvalidServiceName(Exception):
     # Raised when the Service Name entered is Invalid
     pass
+
+
+class NetworkError(Exception):
+    # Raised when there are network/connection issues
+    pass
diff --git a/src/Scraper.py b/src/Scraper.py
index 39e3da9..c9da606 100644
--- a/src/Scraper.py
+++ b/src/Scraper.py
@@ -1,39 +1,134 @@
 import webbrowser
 
 import requests
-from requests.exceptions import ConnectionError
 from bs4 import BeautifulSoup
 
 import src.Errors as Errors
 
+# Constants for HTTP headers to mimic real browser behavior
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 
-# Class for thw URL Instance
+DEFAULT_HEADERS = {
+    "User-Agent": USER_AGENT,
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.9",
+    # "br" is deliberately not advertised: requests only decodes Brotli when an optional brotli package is installed
+    "Accept-Encoding": "gzip, deflate",
+    "DNT": "1",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+    "Sec-Fetch-Dest": "document",
+    "Sec-Fetch-Mode": "navigate",
+    "Sec-Fetch-Site": "none",
+    "Cache-Control": "max-age=0",
+    "Referer": "https://www.google.com/"
+}
+
+# Seconds to wait for a response before giving up on a request
+REQUEST_TIMEOUT = 10
+
+# Minimum length for status text to be considered valid
+MIN_STATUS_TEXT_LENGTH = 5
+
+# Placeholder texts that are never a real status message
+NON_STATUS_TEXTS = ('status', 'info', 'information')
+
+# Keywords that make an <h1> heading look like a status message ('problem' also matches "no problems")
+STATUS_KEYWORDS = ('problem', 'issue', 'outage', 'down', 'working', 'reports')
+
+
+# Class for the URL Instance
 class URLInstance(object):
     url = "https://downdetector.com/status/"
 
     def __init__(self, service_name):
         self.url = f"{self.url}" + service_name
+        # Create a session for better connection handling and bot protection bypass
+        self.session = requests.Session()
 
     # Scrape the Status of the Service from the Page
     def get_status(self):
-        header = {
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-                          "Chrome/74.0.3729.169 Safari/537.36",
-            'referer': 'https://www.google.com/'
-        }
+        """Fetch the status page and print the status message found on it.
+
+        Raises:
+            Errors.InvalidServiceName: If the page does not exist (HTTP 404)
+                or no status message can be found on it.
+            Errors.NetworkError: If the request fails for any other reason.
+        """
         try:
-            page = requests.get(self.url, headers=header)
-            soup = BeautifulSoup(page.content, 'html5lib')
-            status = soup.find('div', attrs={'id': 'company'})
-            text = status.find('div', attrs={'class': 'h2 entry-title'})
-            print(text.text.strip())
-
-        except AttributeError:
-            # Expecting AttributeError if the Name given is Invalid
-            # A NoneType object won't have the attribute .text as used above
-            raise Errors.InvalidServiceName("Name of the Service is Invalid!")
-        else:
-            pass
+            response = self.session.get(self.url, headers=DEFAULT_HEADERS, timeout=REQUEST_TIMEOUT)
+            response.raise_for_status()
+        except requests.exceptions.HTTPError as e:
+            # A missing page means the service name is wrong, not that the network is down
+            if e.response is not None and e.response.status_code == 404:
+                raise Errors.InvalidServiceName(self._not_found_message()) from e
+            raise Errors.NetworkError(f"DownDetector returned an error: {e}") from e
+        except requests.exceptions.RequestException as e:
+            raise Errors.NetworkError(
+                f"Could not fetch the page ({e}). Please check your internet connection and try again."
+            ) from e
+
+        # Parse outside the try block so InvalidServiceName reaches the caller without being reported twice
+        soup = BeautifulSoup(response.content, 'html5lib')
+        status_text = self._find_status_text(soup)
+        if not status_text:
+            raise Errors.InvalidServiceName(self._not_found_message())
+        print(status_text)
+
+    # Builds the error message used when no status can be found for the service
+    def _not_found_message(self):
+        return (
+            f"Unable to find status for '{self.url.split('/')[-1]}'. "
+            "Please verify the service name is correct (e.g., 'facebook', 'twitter', 'instagram'). "
+            "Visit https://downdetector.com to confirm the service exists."
+        )
+
+    # Tries each parsing strategy in order and returns the first non-empty status text (None if nothing matches)
+    @staticmethod
+    def _find_status_text(soup):
+        status_text = None
+
+        # Strategy 1: Try original selector for backward compatibility
+        company_div = soup.find('div', attrs={'id': 'company'})
+        if company_div:
+            text_elem = company_div.find('div', attrs={'class': 'h2 entry-title'})
+            if text_elem:
+                status_text = text_elem.text.strip()
+
+        # Strategy 2: Try to find entry-title class directly (more common in modern design)
+        if not status_text:
+            entry_title = soup.find(['h1', 'h2', 'h3', 'div'], class_='entry-title')
+            if entry_title:
+                status_text = entry_title.text.strip()
+
+        # Strategy 3: Try to find status-related elements with common class patterns
+        if not status_text:
+            candidates = soup.find_all(['div', 'span', 'h1', 'h2', 'h3'])
+            status_elem = next((elem for elem in candidates if URLInstance._is_status_element(elem)), None)
+            if status_elem:
+                status_text = status_elem.text.strip()
+
+        # Strategy 4: Look for the page title as fallback
+        if not status_text:
+            page_title = soup.find('h1')
+            if page_title:
+                text = page_title.text.strip()
+                text_lower = text.lower()
+                # Only use if it looks like a status message (contains certain keywords)
+                if any(keyword in text_lower for keyword in STATUS_KEYWORDS):
+                    status_text = text
+
+        return status_text or None
+
+    # Checks whether an element has a "status"/"entry-title" class and carries meaningful text
+    @staticmethod
+    def _is_status_element(elem):
+        class_names = elem.get('class', [])
+        if not any('status' in str(c).lower() or 'entry-title' in str(c).lower() for c in class_names):
+            return False
+        text = elem.text.strip()
+        # Filter out very short text or common non-status text
+        return len(text) > MIN_STATUS_TEXT_LENGTH and text.lower() not in NON_STATUS_TEXTS
 
     # Prints the URL for the Status Page of the Service
     def get_url(self):
@@ -52,11 +147,20 @@ def get_base_url(cls):
 # Function to Check the Internet Connection
 def check_connection():
+    """Check that DownDetector is reachable; exit with status 1 on a connection failure or timeout."""
     try:
-        requests.get("https://downdetector.com")
-    except ConnectionError:
+        response = requests.get("https://downdetector.com", headers={"User-Agent": USER_AGENT}, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+    except requests.exceptions.ConnectionError:
         print("This program requires an active Internet Connection!")
+        raise SystemExit(1)
+    except requests.exceptions.Timeout:
+        print("Connection timeout. Please check your internet connection!")
+        raise SystemExit(1)
+    except requests.exceptions.RequestException as e:
+        print(f"Connection check failed: {e}")
+        print("Proceeding anyway...")
     else:
         print("All Good 👍")
 
 
 # Function for the Menu