-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsingleURLfetch.py
More file actions
41 lines (32 loc) · 1.07 KB
/
singleURLfetch.py
File metadata and controls
41 lines (32 loc) · 1.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Fetch a single page with pycurl, collect the href of every <a> element with
# BeautifulSoup, and stage the resulting URLs in a Redis list keyed by the
# site URL.  The list is printed and then deleted again, so the script leaves
# no key behind in Redis db 2.
import io

import pycurl
import redis
from bs4 import BeautifulSoup

foosite = 'http://www.gentoo.org'

# pycurl hands the response body to WRITEFUNCTION as bytes; io.BytesIO accepts
# that on both Python 2 and Python 3, unlike the Python-2-only StringIO module.
b = io.BytesIO()

c = pycurl.Curl()
try:
    c.setopt(pycurl.URL, foosite)
    # A header name with nothing after the colon asks libcurl to omit that
    # header, so no Accept header is sent.
    c.setopt(pycurl.HTTPHEADER, ["Accept:"])
    c.setopt(pycurl.WRITEFUNCTION, b.write)
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    c.perform()
finally:
    # Release the libcurl handle even when the transfer fails.
    c.close()

souper = BeautifulSoup(b.getvalue(), 'lxml')
r = redis.Redis(host='localhost', port=6379, db=2)

for link in souper.find_all('a'):
    resolve = link.get('href')
    if resolve is None:
        # Anchors without an href (e.g. <a name="...">) carry no URL.  Skip
        # them instead of letting them abort the whole harvest.
        continue
    try:
        if '//' in resolve:
            # Already has a '//' separator: store the href unchanged.
            r.lpush(foosite, resolve)
        elif resolve.rfind('/') <= 1:
            # No '/' beyond index 1 (e.g. "/", "#top", "mailto:..."): not a
            # path this script can attach to the site URL.
            print ("bad http URL found", foosite, resolve)
        else:
            # Site-relative path: prefix it with the site URL.
            # NOTE(review): plain concatenation assumes the href starts with
            # '/'; hrefs like "docs/index.html" yield a malformed URL.
            r.lpush(foosite, foosite + resolve)
    except redis.RedisError:
        # Redis is unreachable or rejected the command; further pushes would
        # fail the same way, so stop harvesting.
        print('error in resolve', foosite, resolve)
        break

print(r.lrange(foosite, 0, -1))
r.delete(foosite)