Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Simple web crawler

import requests

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All code in this file is indented at the top level (e.g., the imports), which will raise IndentationError: unexpected indent and prevent the script from running.

🤖 Was this useful? React with 👍 or 👎

from bs4 import BeautifulSoup
import csv
Comment on lines +3 to +5

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider grouping imports according to PEP 8 standards.

- import requests
- from bs4 import BeautifulSoup
- import csv
+ import csv
+ import requests
+ from bs4 import BeautifulSoup

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
import requests
from bs4 import BeautifulSoup
import csv
import csv
import requests
from bs4 import BeautifulSoup


# initialize the data structure where to
# store the scraped data
products = []

# initialize the list of discovered urls
# with the first page to visit
urls = ["https://scrapeme.live/shop/"]

# until all pages have been visited
while len(urls) != 0:
# get the page to visit from the list
current_url = urls.pop()

# crawling logic
response = requests.get(current_url)
soup = BeautifulSoup(response.content, "html.parser")

link_elements = soup.select("a[href]")

for link_element in link_elements:
url = link_element["href"]
if "https://scrapeme.live/shop" in url:
urls.append(url)

Comment on lines +16 to +30

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

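Consider checking for duplicate URLs before appending to avoid re-visiting the same page. Note that checking only the pending list, as shown below, is the minimal step: a page that has already been popped and crawled can still be re-queued, so a separate set of visited URLs is needed to guarantee that the crawl terminates.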

+ if url not in urls:
      urls.append(url)

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
while len(urls) != 0:
# get the page to visit from the list
current_url = urls.pop()
# crawling logic
response = requests.get(current_url)
soup = BeautifulSoup(response.content, "html.parser")
link_elements = soup.select("a[href]")
for link_element in link_elements:
url = link_element["href"]
if "https://scrapeme.live/shop" in url:
urls.append(url)
while len(urls) != 0:
# get the page to visit from the list
current_url = urls.pop()
# crawling logic
response = requests.get(current_url)
soup = BeautifulSoup(response.content, "html.parser")
link_elements = soup.select("a[href]")
for link_element in link_elements:
url = link_element["href"]
if "https://scrapeme.live/shop" in url:
if url not in urls:
urls.append(url)

# if current_url is product page
product = {}
product["url"] = current_url
product["image"] = soup.select_one(".wp-post-image")["src"]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The “if current_url is product page” extraction runs for every visited URL; on non-product pages select_one(...) can be None, so ...["src"] / later attribute access will raise at runtime.

🤖 Was this useful? React with 👍 or 👎

product["name"] = soup.select_one(".product_title").text()

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

soup.select_one(".product_title").text() will crash because BeautifulSoup’s Tag.text is a property (and get_text() is the method), so calling .text() attempts to call a string.

🤖 Was this useful? React with 👍 or 👎

product["price"] = soup.select_one(".price")

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ensure that the text content of the "price" element is extracted instead of storing the Tag object itself. Note that `.text` is a property (the equivalent method is `get_text()`), so it must not be called as `.text()`.

- product["price"] = soup.select_one(".price")
+ product["price"] = soup.select_one(".price").text

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
product["price"] = soup.select_one(".price")
product["price"] = soup.select_one(".price").text


products.append(product)
Comment on lines +16 to +38

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

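Add error handling for network requests and data extraction. Note that when an element is missing, `select_one(...)` returns `None` and subscripting it (`None["src"]`) raises `TypeError`, so the extraction handler should catch `TypeError` as well as `AttributeError`.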

+ try:
      response = requests.get(current_url)
      soup = BeautifulSoup(response.content, "html.parser")
+ except requests.RequestException as e:
+     print(f"Request error: {e}")
+     continue
+ try:
      product["image"] = soup.select_one(".wp-post-image")["src"]
      product["name"] = soup.select_one(".product_title").text()
      product["price"] = soup.select_one(".price")
+ except AttributeError as e:
+     print(f"Data extraction error: {e}")
+     continue

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
while len(urls) != 0:
# get the page to visit from the list
current_url = urls.pop()
# crawling logic
response = requests.get(current_url)
soup = BeautifulSoup(response.content, "html.parser")
link_elements = soup.select("a[href]")
for link_element in link_elements:
url = link_element["href"]
if "https://scrapeme.live/shop" in url:
urls.append(url)
# if current_url is product page
product = {}
product["url"] = current_url
product["image"] = soup.select_one(".wp-post-image")["src"]
product["name"] = soup.select_one(".product_title").text()
product["price"] = soup.select_one(".price")
products.append(product)
while len(urls) != 0:
# get the page to visit from the list
current_url = urls.pop()
# crawling logic
try:
response = requests.get(current_url)
soup = BeautifulSoup(response.content, "html.parser")
except requests.RequestException as e:
print(f"Request error: {e}")
continue
link_elements = soup.select("a[href]")
for link_element in link_elements:
url = link_element["href"]
if "https://scrapeme.live/shop" in url:
urls.append(url)
# if current_url is product page
product = {}
product["url"] = current_url
try:
product["image"] = soup.select_one(".wp-post-image")["src"]
product["name"] = soup.select_one(".product_title").text()
product["price"] = soup.select_one(".price")
except AttributeError as e:
print(f"Data extraction error: {e}")
continue
products.append(product)


# initialize the CSV output file
with open('products.csv', 'w') as csv_file:
writer = csv.writer(csv_file)

# populating the CSV
for product in products:
writer.writerow(product.values())
Comment on lines +41 to +46

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Specify `newline=''` in the `open()` call: the csv writer emits its own `\r\n` line endings, and without this parameter they are translated again on Windows, producing a blank line between rows.

- with open('products.csv', 'w') as csv_file:
+ with open('products.csv', 'w', newline='') as csv_file:

Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
with open('products.csv', 'w') as csv_file:
writer = csv.writer(csv_file)
# populating the CSV
for product in products:
writer.writerow(product.values())
with open('products.csv', 'w', newline='') as csv_file:
writer = csv.writer(csv_file)
# populating the CSV
for product in products:
writer.writerow(product.values())