From 3b458c543f788c2c4372494efd4900d2ab066041 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 6 Dec 2025 10:02:53 +0000 Subject: [PATCH 1/7] Initial plan From a85251d4d2daca8da71de6a3722867d9803c89f3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 6 Dec 2025 10:08:03 +0000 Subject: [PATCH 2/7] Update scraper to handle modern DownDetector HTML and improve bot protection bypass Co-authored-by: aaryanrr <73213670+aaryanrr@users.noreply.github.com> --- src/Scraper.py | 101 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 14 deletions(-) diff --git a/src/Scraper.py b/src/Scraper.py index 39e3da9..88bcb27 100644 --- a/src/Scraper.py +++ b/src/Scraper.py @@ -13,27 +13,89 @@ class URLInstance(object): def __init__(self, service_name): self.url = f"{self.url}" + service_name + # Create a session for better connection handling and bot protection bypass + self.session = requests.Session() # Scrape the Status of the Service from the Page def get_status(self): - header = { - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/74.0.3729.169 Safari/537.36", - 'referer': 'https://www.google.com/' + # Updated headers to mimic a real browser more closely + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Cache-Control": "max-age=0", + "Referer": "https://www.google.com/" } + try: - page = requests.get(self.url, headers=header) - soup = BeautifulSoup(page.content, 'html5lib') - status = soup.find('div', attrs={'id': 'company'}) - text = status.find('div', attrs={'class': 'h2 entry-title'}) - print(text.text.strip()) + # Use session for better connection handling + response = self.session.get(self.url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.content, 'html5lib') + + # Try multiple selectors to find the status message (more robust approach) + status_text = None + + # Strategy 1: Try original selector for backward compatibility + company_div = soup.find('div', attrs={'id': 'company'}) + if company_div: + text_elem = company_div.find('div', attrs={'class': 'h2 entry-title'}) + if text_elem: + status_text = text_elem.text.strip() + + # Strategy 2: Try to find entry-title class directly (more common in modern design) + if not status_text: + entry_title = soup.find(['h1', 'h2', 'h3', 'div'], class_='entry-title') + if entry_title: + status_text = entry_title.text.strip() + + # Strategy 3: Try to find status-related elements with common class patterns + if not status_text: + # Look for elements with "status" in class name + for elem in soup.find_all(['div', 'span', 'h1', 'h2', 'h3']): + class_names = elem.get('class', []) + if any('status' in str(c).lower() or 'entry-title' in str(c).lower() for c in class_names): + text = elem.text.strip() + # Filter out very short text or common non-status text + if text and len(text) > 5 and not text.lower() in ['status', 'info', 'information']: + status_text = text + break + + # Strategy 4: Look for the page title as fallback + if not status_text: + page_title = soup.find('h1') + if page_title: + text = page_title.text.strip() + # Only use if it looks like a status message (contains certain keywords) + if any(keyword in text.lower() for keyword in ['problem', 'issue', 'outage', 'down', 'working', 'reports', 'no problems']): + status_text = text + + if status_text: + print(status_text) + else: + raise Errors.InvalidServiceName( + "Could not find status information. The service name may be invalid or the page structure has changed." + ) except AttributeError: # Expecting AttributeError if the Name given is Invalid # A NoneType object won't have the attribute .text as used above raise Errors.InvalidServiceName("Name of the Service is Invalid!") - else: - pass + except requests.exceptions.RequestException as e: + print(f"Error fetching the page: {e}") + raise Errors.InvalidServiceName("Could not fetch the page. Please check your internet connection and try again.") + except Exception as e: + print(f"Unexpected error: {e}") + raise # Prints the URL for the Status Page of the Service def get_url(self): @@ -51,12 +113,23 @@ def get_base_url(cls): # Function to Check the Internet Connection def check_connection(): + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + } try: - requests.get("https://downdetector.com") + response = requests.get("https://downdetector.com", headers=headers, timeout=10) + response.raise_for_status() + print("All Good 👍") except ConnectionError: print("This program requires an active Internet Connection!") - else: - print("All Good 👍") + quit() + except requests.exceptions.Timeout: + print("Connection timeout. Please check your internet connection!") + quit() + except Exception as e: + print(f"Connection check failed: {e}") + print("Proceeding anyway...") # Function for the Menu From 89c08813533a21c7b7c1ce9f816187625757a2cd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 6 Dec 2025 10:09:34 +0000 Subject: [PATCH 3/7] Add CHANGELOG and update README with recent improvements Co-authored-by: aaryanrr <73213670+aaryanrr@users.noreply.github.com> --- CHANGELOG.md | 35 +++++++++++++++++++++++++++++++++++ README.md | 10 ++++++++++ 2 files changed, 45 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3facbd0 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,35 @@ +# Changelog + +## [Unreleased] - 2025-12-06 + +### Fixed +- Fixed scraping functionality to work with modern DownDetector website structure +- Improved bot detection bypass with updated headers and session handling +- Added backward compatibility with original HTML structure + +### Changed +- Updated User-Agent to modern Chrome version (120.0.0.0) +- Added comprehensive HTTP headers (Accept, Accept-Language, Sec-Fetch-*, etc.) +- Implemented session-based requests for better connection handling +- Added timeout (10 seconds) to prevent hanging requests + +### Added +- Multi-strategy HTML parsing: + - Strategy 1: Original selector for backward compatibility + - Strategy 2: Direct entry-title class lookup (most common) + - Strategy 3: Find elements with "status" or "entry-title" in class names + - Strategy 4: Fallback to h1 with status-related keywords +- Better error messages for different failure scenarios +- Response status validation with `raise_for_status()` +- Timeout handling for connection issues + +### Technical Details + +The scraper now attempts multiple strategies to find the status message, making it more resilient to website changes: + +1. **Backward Compatibility**: Tries the original `div#company > div.h2.entry-title` selector first +2. **Modern Structure**: Looks for `.entry-title` class on h1, h2, h3, or div elements +3. **Class Pattern Matching**: Searches for elements with "status" or "entry-title" in their class names +4. **Keyword-Based Fallback**: Uses h1 tags containing status keywords (problem, issue, outage, down, working, reports, no problems) + +This multi-layered approach ensures the scraper works with both old and new website structures. diff --git a/README.md b/README.md index 6eff795..a149fd9 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,16 @@ > *It scrapes the status message from DownDetector's Website and shows it in the terminal. So you don't need to open up the Website for checking the status of a service.* +### Recent Updates (December 2025) + +The scraper has been updated to work with the modern DownDetector website structure: +- ✅ Improved bot detection bypass with modern browser headers +- ✅ Multi-strategy HTML parsing for resilience against website changes +- ✅ Better error handling and timeout support +- ✅ Backward compatible with older website structures + +See [CHANGELOG.md](CHANGELOG.md) for detailed changes. + ## CONTRIBUTING > *For details on how to contribute to this project see [this](https://github.com/aaryanrr/DownDetector-CLI/blob/main/CONTRIBUTING.md)* From d88a054d5502e5d683ab507413dc4d8ccb508570 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 6 Dec 2025 10:12:23 +0000 Subject: [PATCH 4/7] Address code review comments - refactor for better maintainability Co-authored-by: aaryanrr <73213670+aaryanrr@users.noreply.github.com> --- src/Scraper.py | 61 ++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/src/Scraper.py b/src/Scraper.py index 88bcb27..84d17c9 100644 --- a/src/Scraper.py +++ b/src/Scraper.py @@ -5,8 +5,26 @@ import src.Errors as Errors +# Constants for HTTP headers to mimic real browser behavior +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" -# Class for thw URL Instance +DEFAULT_HEADERS = { + "User-Agent": USER_AGENT, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Cache-Control": "max-age=0", + "Referer": "https://www.google.com/" +} + + +# Class for the URL Instance class URLInstance(object): url = "https://downdetector.com/status/" @@ -18,26 +36,9 @@ def __init__(self, service_name): # Scrape the Status of the Service from the Page def get_status(self): - # Updated headers to mimic a real browser more closely - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Accept-Encoding": "gzip, deflate, br", - "DNT": "1", - "Connection": "keep-alive", - "Upgrade-Insecure-Requests": "1", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Cache-Control": "max-age=0", - "Referer": "https://www.google.com/" - } - try: # Use session for better connection handling - response = self.session.get(self.url, headers=headers, timeout=10) + response = self.session.get(self.url, headers=DEFAULT_HEADERS, timeout=10) response.raise_for_status() soup = BeautifulSoup(response.content, 'html5lib') @@ -61,22 +62,28 @@ def get_status(self): # Strategy 3: Try to find status-related elements with common class patterns if not status_text: # Look for elements with "status" in class name - for elem in soup.find_all(['div', 'span', 'h1', 'h2', 'h3']): + def is_valid_status_element(elem): class_names = elem.get('class', []) if any('status' in str(c).lower() or 'entry-title' in str(c).lower() for c in class_names): text = elem.text.strip() # Filter out very short text or common non-status text - if text and len(text) > 5 and not text.lower() in ['status', 'info', 'information']: - status_text = text - break + if text and len(text) > 5 and text.lower() not in ['status', 'info', 'information']: + return True + return False + + status_elem = next((elem for elem in soup.find_all(['div', 'span', 'h1', 'h2', 'h3']) + if is_valid_status_element(elem)), None) + if status_elem: + status_text = status_elem.text.strip() # Strategy 4: Look for the page title as fallback if not status_text: page_title = soup.find('h1') if page_title: text = page_title.text.strip() + text_lower = text.lower() # Only use if it looks like a status message (contains certain keywords) - if any(keyword in text.lower() for keyword in ['problem', 'issue', 'outage', 'down', 'working', 'reports', 'no problems']): + if any(keyword in text_lower for keyword in ['problem', 'issue', 'outage', 'down', 'working', 'reports', 'no problems']): status_text = text if status_text: @@ -113,12 +120,8 @@ def get_base_url(cls): # Function to Check the Internet Connection def check_connection(): - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36" - } try: - response = requests.get("https://downdetector.com", headers=headers, timeout=10) + response = requests.get("https://downdetector.com", headers={"User-Agent": USER_AGENT}, timeout=10) response.raise_for_status() print("All Good 👍") except ConnectionError: From d3314d5f54cc5c171ff8524b5ad40e9b4eaa4428 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 6 Dec 2025 10:16:54 +0000 Subject: [PATCH 5/7] Address final code review comments - add NetworkError exception and extract constants Co-authored-by: aaryanrr <73213670+aaryanrr@users.noreply.github.com> --- Main.py | 8 +++++++- src/Errors.py | 5 +++++ src/Scraper.py | 11 +++++------ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/Main.py b/Main.py index 11c952e..7bbef7a 100644 --- a/Main.py +++ b/Main.py @@ -1,4 +1,5 @@ from src.Scraper import URLInstance, menu, check_connection +import src.Errors as Errors # Checking the Internet Connection on Startup print("Checking Internet Connection..") @@ -22,7 +23,12 @@ option = int(input("Enter the Option Number: ")) if option == 1: - instance.get_status() + try: + instance.get_status() + except Errors.InvalidServiceName as e: + print(f"Service Error: {e}") + except Errors.NetworkError as e: + print(f"Network Error: {e}") elif option == 2: instance.open_url() elif option == 3: diff --git a/src/Errors.py b/src/Errors.py index f6f97cb..1aa3345 100644 --- a/src/Errors.py +++ b/src/Errors.py @@ -3,3 +3,8 @@ class InvalidServiceName(Exception): # Raised when the Service Name entered is Invalid pass + + +class NetworkError(Exception): + # Raised when there are network/connection issues + pass diff --git a/src/Scraper.py b/src/Scraper.py index 84d17c9..0308206 100644 --- a/src/Scraper.py +++ b/src/Scraper.py @@ -23,6 +23,9 @@ "Referer": "https://www.google.com/" } +# Minimum length for status text to be considered valid +MIN_STATUS_TEXT_LENGTH = 5 + # Class for the URL Instance class URLInstance(object): @@ -67,7 +70,7 @@ def is_valid_status_element(elem): if any('status' in str(c).lower() or 'entry-title' in str(c).lower() for c in class_names): text = elem.text.strip() # Filter out very short text or common non-status text - if text and len(text) > 5 and text.lower() not in ['status', 'info', 'information']: + if text and len(text) > MIN_STATUS_TEXT_LENGTH and text.lower() not in ['status', 'info', 'information']: return True return False @@ -93,13 +96,9 @@ def is_valid_status_element(elem): "Could not find status information. The service name may be invalid or the page structure has changed." ) - except AttributeError: - # Expecting AttributeError if the Name given is Invalid - # A NoneType object won't have the attribute .text as used above - raise Errors.InvalidServiceName("Name of the Service is Invalid!") except requests.exceptions.RequestException as e: print(f"Error fetching the page: {e}") - raise Errors.InvalidServiceName("Could not fetch the page. Please check your internet connection and try again.") + raise Errors.NetworkError("Could not fetch the page. Please check your internet connection and try again.") except Exception as e: print(f"Unexpected error: {e}") raise From 0a12ffffbc54b95a6d92858779754b526c1ee86f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 6 Dec 2025 10:19:04 +0000 Subject: [PATCH 6/7] Fix ConnectionError import to use specific requests exception Co-authored-by: aaryanrr <73213670+aaryanrr@users.noreply.github.com> --- src/Scraper.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Scraper.py b/src/Scraper.py index 0308206..5dd2119 100644 --- a/src/Scraper.py +++ b/src/Scraper.py @@ -1,6 +1,5 @@ import webbrowser import requests -from requests.exceptions import ConnectionError from bs4 import BeautifulSoup import src.Errors as Errors @@ -123,7 +122,7 @@ def check_connection(): response = requests.get("https://downdetector.com", headers={"User-Agent": USER_AGENT}, timeout=10) response.raise_for_status() print("All Good 👍") - except ConnectionError: + except requests.exceptions.ConnectionError: print("This program requires an active Internet Connection!") quit() except requests.exceptions.Timeout: From e16e998bdbfab6295a922863a103060bd117360a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 7 Dec 2025 20:12:15 +0000 Subject: [PATCH 7/7] Improve error message to be more helpful and actionable for users Co-authored-by: aaryanrr <73213670+aaryanrr@users.noreply.github.com> --- src/Scraper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Scraper.py b/src/Scraper.py index 5dd2119..c9da606 100644 --- a/src/Scraper.py +++ b/src/Scraper.py @@ -92,7 +92,9 @@ def is_valid_status_element(elem): print(status_text) else: raise Errors.InvalidServiceName( - "Could not find status information. The service name may be invalid or the page structure has changed." + f"Unable to find status for '{self.url.split('/')[-1]}'. " + "Please verify the service name is correct (e.g., 'facebook', 'twitter', 'instagram'). " + "Visit https://downdetector.com to confirm the service exists." ) except requests.exceptions.RequestException as e: