akshayutture · akshayutture · Apr 4, 2024 · augmentcode · Dec 11, 2025 · coderabbitai
diff --git a/crawler.py b/crawler.py
@@ -0,0 +1,46 @@
+# Simple web crawler
+
+    import requests
+    from bs4 import BeautifulSoup
+    import csv
-    import requests
-    from bs4 import BeautifulSoup
-    import csv
+    import csv
+    import requests
+    from bs4 import BeautifulSoup
-    import requests
-    from bs4 import BeautifulSoup
-    import csv
+    import csv
+    import requests
+    from bs4 import BeautifulSoup
+
+    # initialize the data structure in which to
+    # store the scraped data
+    products = []
+
+    # initialize the list of discovered urls
+    # with the first page to visit
+    urls = ["https://scrapeme.live/shop/"]
+
+    # every url that has ever been queued, including the ones already
+    # popped; without it, pages that link to each other are re-queued
+    # forever and the loop never terminates
+    seen_urls = set(urls)
+
+    # until all pages have been visited
+    while urls:
+        # get the page to visit from the list
+        current_url = urls.pop()
+
+        # crawling logic; a failed or timed-out request skips this page
+        # instead of aborting the whole crawl
+        try:
+            response = requests.get(current_url, timeout=10)
+        except requests.RequestException as e:
+            print(f"Request error: {e}")
+            continue
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        link_elements = soup.select("a[href]")
+
+        for link_element in link_elements:
+            url = link_element["href"]
+            # queue shop urls only the first time they are discovered
+            if "https://scrapeme.live/shop" in url and url not in seen_urls:
+                seen_urls.add(url)
+                urls.append(url)
+
-    while len(urls) != 0:
-        # get the page to visit from the list
-        current_url = urls.pop()
-    
-        # crawling logic
-        response = requests.get(current_url)
-        soup = BeautifulSoup(response.content, "html.parser")
-    
-        link_elements = soup.select("a[href]")
-    
-        for link_element in link_elements:
-            url = link_element["href"]
-            if "https://scrapeme.live/shop" in url:
-                urls.append(url)
-    
+    # every url that has ever been queued, including the ones already
+    # popped; checking only the pending list would re-queue visited pages
+    seen_urls = set(urls)
+    while urls:
+        # get the page to visit from the list
+        current_url = urls.pop()
+
+        # crawling logic
+        response = requests.get(current_url)
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        link_elements = soup.select("a[href]")
+
+        for link_element in link_elements:
+            url = link_element["href"]
+            # queue shop urls only the first time they are discovered
+            if "https://scrapeme.live/shop" in url and url not in seen_urls:
+                seen_urls.add(url)
+                urls.append(url)
-    while len(urls) != 0:
-        # get the page to visit from the list
-        current_url = urls.pop()
-    
-        # crawling logic
-        response = requests.get(current_url)
-        soup = BeautifulSoup(response.content, "html.parser")
-    
-        link_elements = soup.select("a[href]")
-    
-        for link_element in link_elements:
-            url = link_element["href"]
-            if "https://scrapeme.live/shop" in url:
-                urls.append(url)
-    
+    # every url that has ever been queued, including the ones already
+    # popped; checking only the pending list would re-queue visited pages
+    seen_urls = set(urls)
+    while urls:
+        # get the page to visit from the list
+        current_url = urls.pop()
+
+        # crawling logic
+        response = requests.get(current_url)
+        soup = BeautifulSoup(response.content, "html.parser")
+
+        link_elements = soup.select("a[href]")
+
+        for link_element in link_elements:
+            url = link_element["href"]
+            # queue shop urls only the first time they are discovered
+            if "https://scrapeme.live/shop" in url and url not in seen_urls:
+                seen_urls.add(url)
+                urls.append(url)
+        # treat the page as a product page only when all three elements
+        # are present; select_one() returns None when nothing matches,
+        # so any other page is skipped instead of crashing
+        image_element = soup.select_one(".wp-post-image")
+        name_element = soup.select_one(".product_title")
+        price_element = soup.select_one(".price")
+        elements = (image_element, name_element, price_element)
+        if any(element is None for element in elements):
+            continue
+
+        product = {}
+        product["url"] = current_url
+        product["image"] = image_element.get("src")
+        # get_text() returns the element's text; ".text" is a property,
+        # so calling ".text()" would raise TypeError
+        product["name"] = name_element.get_text()
+        product["price"] = price_element.get_text()
+
+        products.append(product)
-    while len(urls) != 0:
-        # get the page to visit from the list
-        current_url = urls.pop()
-    
-        # crawling logic
-        response = requests.get(current_url)
-        soup = BeautifulSoup(response.content, "html.parser")
-    
-        link_elements = soup.select("a[href]")
-    
-        for link_element in link_elements:
-            url = link_element["href"]
-            if "https://scrapeme.live/shop" in url:
-                urls.append(url)
-    
-        # if current_url is product page
-        product = {}
-        product["url"] = current_url
-        product["image"] = soup.select_one(".wp-post-image")["src"]
-        product["name"] = soup.select_one(".product_title").text()
-        product["price"] = soup.select_one(".price")
-    
-        products.append(product)
+    # every url that has ever been queued, so pages that link to each
+    # other are fetched only once
+    seen_urls = set(urls)
+    while urls:
+        # get the page to visit from the list
+        current_url = urls.pop()
+
+        # crawling logic; a failed or timed-out request skips this page
+        try:
+            response = requests.get(current_url, timeout=10)
+            soup = BeautifulSoup(response.content, "html.parser")
+        except requests.RequestException as e:
+            print(f"Request error: {e}")
+            continue
+
+        for link_element in soup.select("a[href]"):
+            url = link_element["href"]
+            # queue shop urls only the first time they are discovered
+            if "https://scrapeme.live/shop" in url and url not in seen_urls:
+                seen_urls.add(url)
+                urls.append(url)
+
+        # if current_url is product page
+        product = {}
+        product["url"] = current_url
+        try:
+            product["image"] = soup.select_one(".wp-post-image")["src"]
+            product["name"] = soup.select_one(".product_title").get_text()
+            product["price"] = soup.select_one(".price").get_text()
+        except (AttributeError, TypeError, KeyError) as e:
+            # select_one() returns None when nothing matches: subscripting
+            # None raises TypeError, calling a method on it AttributeError;
+            # a matched image without "src" raises KeyError
+            print(f"Data extraction error: {e}")
+            continue
+
+        products.append(product)
-    while len(urls) != 0:
-        # get the page to visit from the list
-        current_url = urls.pop()
-    
-        # crawling logic
-        response = requests.get(current_url)
-        soup = BeautifulSoup(response.content, "html.parser")
-    
-        link_elements = soup.select("a[href]")
-    
-        for link_element in link_elements:
-            url = link_element["href"]
-            if "https://scrapeme.live/shop" in url:
-                urls.append(url)
-    
-        # if current_url is product page
-        product = {}
-        product["url"] = current_url
-        product["image"] = soup.select_one(".wp-post-image")["src"]
-        product["name"] = soup.select_one(".product_title").text()
-        product["price"] = soup.select_one(".price")
-    
-        products.append(product)
+    # every url that has ever been queued, so pages that link to each
+    # other are fetched only once
+    seen_urls = set(urls)
+    while urls:
+        # get the page to visit from the list
+        current_url = urls.pop()
+
+        # crawling logic; a failed or timed-out request skips this page
+        try:
+            response = requests.get(current_url, timeout=10)
+            soup = BeautifulSoup(response.content, "html.parser")
+        except requests.RequestException as e:
+            print(f"Request error: {e}")
+            continue
+
+        for link_element in soup.select("a[href]"):
+            url = link_element["href"]
+            # queue shop urls only the first time they are discovered
+            if "https://scrapeme.live/shop" in url and url not in seen_urls:
+                seen_urls.add(url)
+                urls.append(url)
+
+        # if current_url is product page
+        product = {}
+        product["url"] = current_url
+        try:
+            product["image"] = soup.select_one(".wp-post-image")["src"]
+            product["name"] = soup.select_one(".product_title").get_text()
+            product["price"] = soup.select_one(".price").get_text()
+        except (AttributeError, TypeError, KeyError) as e:
+            # select_one() returns None when nothing matches: subscripting
+            # None raises TypeError, calling a method on it AttributeError;
+            # a matched image without "src" raises KeyError
+            print(f"Data extraction error: {e}")
+            continue
+
+        products.append(product)
+
+    # initialize the CSV output file; newline='' lets the csv module
+    # control the line endings itself, and the explicit encoding keeps
+    # the output independent of the platform default
+    with open('products.csv', 'w', newline='', encoding='utf-8') as csv_file:
+        writer = csv.writer(csv_file)
+
+        # populating the CSV, one row per scraped product
+        for product in products:
+            writer.writerow(product.values())
-    with open('products.csv', 'w') as csv_file:
-        writer = csv.writer(csv_file)
-    
-        # populating the CSV
-        for product in products:
-            writer.writerow(product.values())
+    # write every scraped product as one CSV row; newline='' leaves the
+    # line endings to the csv module
+    with open('products.csv', 'w', newline='') as csv_file:
+        rows = (product.values() for product in products)
+        csv.writer(csv_file).writerows(rows)
-    with open('products.csv', 'w') as csv_file:
-        writer = csv.writer(csv_file)
-    
-        # populating the CSV
-        for product in products:
-            writer.writerow(product.values())
+    # write every scraped product as one CSV row; newline='' leaves the
+    # line endings to the csv module
+    with open('products.csv', 'w', newline='') as csv_file:
+        rows = (product.values() for product in products)
+        csv.writer(csv_file).writerows(rows)