-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_parser.py
More file actions
37 lines (27 loc) · 1.17 KB
/
html_parser.py
File metadata and controls
37 lines (27 loc) · 1.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from typing import List, Set, Tuple
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from logger_config import setup_logger
from product_page_classifier import ProductPageClassifier
from product_url_analyser import is_dead_end_url
class HTMLParser:
    """Parse a fetched HTML page: flag it as a product page and collect
    same-domain child links for further crawling.

    Holds one shared logger and one reusable ``ProductPageClassifier``
    instance across all ``parse_html`` calls.
    """

    def __init__(self):
        self.logger = setup_logger()
        # NOTE: attribute name kept as-is (typo "Classifer") so external
        # callers that reference it keep working.
        self.productPageClassifer = ProductPageClassifier()

    def parse_html(self, page_url: str, html: str, seed_domain: str) -> Tuple[Set[str], Set[str]]:
        """Classify ``page_url`` and extract its same-domain outgoing links.

        Args:
            page_url: Absolute URL the HTML was fetched from; used both to
                resolve relative hrefs and as the candidate product-page URL.
            html: Raw HTML document text.
            seed_domain: Netloc (``host[:port]``) that a link must match
                exactly to be kept as a child URL.

        Returns:
            ``(child_urls, product_urls)`` — two sets of absolute URLs.
            Child URLs are fragment-stripped and have any trailing slash
            removed; ``product_urls`` contains at most ``page_url`` itself.
            (Fixed: the previous annotation claimed ``List`` while the
            method has always returned sets.)
        """
        soup = BeautifulSoup(html, "lxml")
        child_urls: Set[str] = set()
        product_urls: Set[str] = set()

        # Classify the page itself; the classifier logs through our logger.
        result = self.productPageClassifer.analyze(soup, page_url, None, self.logger.info)
        if result["is_product_page"]:
            product_urls.add(page_url)

        for tag in soup.find_all("a", href=True):
            url = urljoin(page_url, tag["href"])
            parsed = urlparse(url)
            # Stay on the seed domain. NOTE(review): this comparison is
            # case-sensitive and includes any port — confirm the caller
            # normalizes seed_domain the same way urlparse does.
            if parsed.netloc != seed_domain:
                continue
            # Drop the fragment and trailing slash so duplicate links collapse.
            normalized = parsed._replace(fragment="").geturl().rstrip("/")
            if not is_dead_end_url(parsed.path):
                child_urls.add(normalized)

        return child_urls, product_urls