-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path65.py
More file actions
27 lines (17 loc) · 852 Bytes
/
65.py
File metadata and controls
27 lines (17 loc) · 852 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
"""Problem 65: Write a program links.py that takes URL of a webpage as argument
and prints all the URLs linked from that webpage."""
import urllib.request
import os
import sys
import re
# Page fetched when no URL is supplied on the command line.
DEFAULT_URL = 'http://www.bbc.com/'

# An absolute http(s) URL: the scheme, at least one word character, then
# everything up to the first character that cannot be part of a URL embedded
# in HTML (whitespace, a quote, or an angle bracket).  Stopping at whitespace
# and '<'/'>' keeps trailing markup out of URLs that appear in unquoted
# attributes or in running text.
URL_PATTERN = re.compile(r'https?://\w[^\s"\'<>]*')


def decode_body(raw, charset):
    """Decode the raw page bytes into text without ever raising.

    raw     -- the response body as bytes.
    charset -- the encoding name declared by the server, or None/'' if the
               server declared none (UTF-8 is assumed in that case).

    Undecodable bytes are replaced with U+FFFD rather than aborting the
    program, and an encoding name Python does not know falls back to UTF-8.
    """
    try:
        return raw.decode(charset or 'utf-8', errors='replace')
    except LookupError:
        # The server announced a charset Python has no codec for.
        return raw.decode('utf-8', errors='replace')


def find_urls(html_text):
    """Return every absolute http(s) URL in html_text, in document order.

    Duplicates are kept, so the result mirrors each occurrence in the page.
    """
    return [match.group(0) for match in URL_PATTERN.finditer(html_text)]


def fetch_html(url):
    """Download url and return its body as a string.

    The response is used as a context manager so the connection is closed
    even if reading fails.  Errors raised by urllib.request.urlopen
    propagate to the caller.
    """
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset()
    return decode_body(raw, charset)


def main(argv):
    """Print a numbered list of the URLs found on the requested page.

    argv -- the command line as in sys.argv; argv[1], when present, is the
            page to scan, otherwise DEFAULT_URL is used.
    """
    url = argv[1] if len(argv) > 1 else DEFAULT_URL
    for number, link in enumerate(find_urls(fetch_html(url))):
        print("{0} -- {1}".format(number, link))


if __name__ == "__main__":
    main(sys.argv)