"""Simple web crawler.

Starting from the shop front page, follow every link that stays under the
shop URL, collect the URL, image, name and price of each product page that
is found, and write the results to ``products.csv`` (one row per product,
columns in the order url, image, name, price, no header row).
"""

import csv
import logging
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# First page to visit.
START_URL = "https://scrapeme.live/shop/"
# Only links that start with this prefix are followed.
SHOP_PREFIX = "https://scrapeme.live/shop"
# Where the scraped rows are written.
OUTPUT_PATH = "products.csv"
# Column order of the CSV rows; matches the keys of a product record.
FIELDNAMES = ("url", "image", "name", "price")
# Seconds to wait for the server before giving up on a page, so that one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def extract_links(soup, base_url):
    """Return the absolute in-scope URLs linked from a parsed page.

    Args:
        soup: Parsed page (``BeautifulSoup`` object).
        base_url: URL the page was loaded from; relative hrefs are
            resolved against it.

    Returns:
        A list of absolute URLs (fragment removed) that start with
        ``SHOP_PREFIX``. The list may contain duplicates.
    """
    links = []
    for anchor in soup.select("a[href]"):
        # Resolve relative hrefs and strip "#fragment" so that the same page
        # is always represented by the same string.
        url, _fragment = urldefrag(urljoin(base_url, anchor["href"]))
        # Prefix match rather than substring match: a foreign URL that merely
        # contains the shop URL (e.g. in its query string) must not be
        # followed.
        if url.startswith(SHOP_PREFIX):
            links.append(url)
    return links


def extract_product(soup, page_url):
    """Return the product record for a page, or ``None`` if it has none.

    A page without a ``.product_title`` element is treated as a non-product
    page and yields ``None``.

    Args:
        soup: Parsed page (``BeautifulSoup`` object).
        page_url: URL the page was requested from; stored in the record.

    Returns:
        A dict with the keys listed in ``FIELDNAMES``, or ``None``. A missing
        image or price is recorded as an empty string.
    """
    title = soup.select_one(".product_title")
    if title is None:
        return None

    image = soup.select_one(".wp-post-image")
    price = soup.select_one(".price")
    return {
        "url": page_url,
        "image": image.get("src", "") if image is not None else "",
        # get_text() returns the element's text; storing the element itself
        # would put raw HTML into the CSV.
        "name": title.get_text(strip=True),
        "price": price.get_text(strip=True) if price is not None else "",
    }


def crawl(start_url=START_URL):
    """Crawl the shop starting at *start_url* and return the products found.

    Pages that cannot be fetched (network error, timeout, HTTP error status)
    are logged and skipped instead of aborting the crawl.

    Args:
        start_url: First page to visit.

    Returns:
        A list of product records as produced by ``extract_product``.
    """
    products = []
    pending = [start_url]
    # Every URL that has ever been queued. Without this, pages that link to
    # each other would be re-queued forever and the crawl would never end.
    seen = {start_url}

    with requests.Session() as session:
        while pending:
            current_url = pending.pop()

            try:
                response = session.get(current_url, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                logger.warning("Skipping %s: %s", current_url, exc)
                continue

            soup = BeautifulSoup(response.content, "html.parser")

            # response.url is the final URL after redirects, which is the
            # correct base for resolving relative links on the page.
            for url in extract_links(soup, response.url):
                if url not in seen:
                    seen.add(url)
                    pending.append(url)

            product = extract_product(soup, current_url)
            if product is not None:
                products.append(product)

    return products


def write_products(products, path=OUTPUT_PATH):
    """Write *products* to *path* as CSV, one row per product.

    Args:
        products: Iterable of product records (dicts keyed by
            ``FIELDNAMES``).
        path: Destination file; overwritten if it exists.
    """
    # newline="" hands line-ending control to the csv module (otherwise
    # blank rows appear on Windows); the explicit encoding keeps the output
    # independent of the platform default.
    with open(path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows(
            [product[field] for field in FIELDNAMES] for product in products
        )


def main():
    """Crawl the shop and save every product found to ``products.csv``."""
    write_products(crawl())


if __name__ == "__main__":
    main()