From 1fcdbe3809ea7912a711b373a19e5f22669815bb Mon Sep 17 00:00:00 2001
From: akshayutture <akshay.utture@gmail.com>
Date: Wed, 3 Apr 2024 17:58:54 -0700
Subject: [PATCH] Create crawler.py

---
 crawler.py | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 crawler.py

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..0f6fd7e
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,46 @@
+# Simple web crawler: walks the scrapeme.live shop and saves products to CSV
+
+import requests
+from bs4 import BeautifulSoup
+import csv
+
+# initialize the data structure where to store the scraped data
+products = []
+
+# initialize the list of discovered urls with the first page to visit
+urls = ["https://scrapeme.live/shop/"]
+# every url ever queued; without it pages re-queue each other forever
+seen = set(urls)
+
+# until all pages have been visited, take the next page from the list
+while urls:
+    current_url = urls.pop()
+
+    # crawling logic: the timeout keeps a stalled server from hanging the run
+    try:
+        response = requests.get(current_url, timeout=10)
+    except requests.RequestException:
+        continue  # skip unreachable pages instead of losing the whole crawl
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    for link_element in soup.select("a[href]"):
+        url = link_element["href"]
+        # stay inside the shop (prefix, not substring) and queue each page once
+        if url.startswith("https://scrapeme.live/shop") and url not in seen:
+            seen.add(url)
+            urls.append(url)
+
+    # only product pages have all three elements; skip listing pages
+    image = soup.select_one(".wp-post-image")
+    name = soup.select_one(".product_title")
+    price = soup.select_one(".price")
+    if image is None or name is None or price is None:
+        continue
+    product = {"url": current_url, "image": image.get("src", "")}
+    product["name"] = name.get_text(strip=True)
+    product["price"] = price.get_text(strip=True)
+    products.append(product)
+
+# newline="" is what the csv module requires to avoid blank rows on Windows
+with open("products.csv", "w", newline="", encoding="utf-8") as csv_file:
+    csv.writer(csv_file).writerows(product.values() for product in products)