-
Notifications
You must be signed in to change notification settings - Fork 46
Expand file tree
/
Copy pathwebb.py
More file actions
438 lines (369 loc) · 13.7 KB
/
Copy pathwebb.py
File metadata and controls
438 lines (369 loc) · 13.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
# Version 0.9 [First version of the library]
# Under Apache License Version 2.0
# @Hardik Vasa
#Import Libraries
import time #For Delay calculations
import sys #for system related information
from subprocess import Popen, PIPE
import socket
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
#Get IP of a website from the URL
def get_ip(url):
    """Print the IPv4 address that a host name or URL resolves to.

    Args:
        url: Either a bare host name ("example.com") or a full URL
            ("http://example.com/page"). Only the host part is resolved.

    Raises:
        socket.gaierror: If the host name cannot be resolved.
    """
    # socket.gethostbyname() accepts host names only, so a URL has to be
    # reduced to its host first. urlparse recognises the host only when it is
    # introduced by "//", hence the two different extraction paths.
    if "://" in url or url.startswith("//"):
        host = urlparse(url).hostname
    else:
        host = url.split("/", 1)[0]
    ip = socket.gethostbyname(host or url)
    print(ip)
#Traceroute to a website
def traceroute(url):
    """Run the system traceroute tool against a host and print its output.

    Args:
        url: Host name or IP address handed to the traceroute command.

    Raises:
        OSError: If the traceroute executable is not available.
    """
    # The tool is called "tracert" on Windows and "traceroute" elsewhere.
    command = 'tracert' if sys.platform.startswith('win') else 'traceroute'
    p = Popen([command, url], stdout=PIPE)
    try:
        # readline() returns an empty bytes object once the pipe is closed.
        for line in iter(p.stdout.readline, b''):
            # Decode instead of str(line), which would print the b'...' repr
            # on Python 3.
            print(line.decode('utf-8', 'replace').rstrip('\r\n'))
    finally:
        p.stdout.close()
        p.wait()  # reap the child so it does not linger as a zombie
#Downloading entire Web Document (Raw Page Content) for the crawler
def download_page(url):
    """Download the raw content of a web document.

    Args:
        url: Address of the document to fetch.

    Returns:
        The page content as a string. On Python 3 the body is decoded as
        UTF-8 (undecodable bytes are replaced) and newline characters are
        removed; on Python 2 the body is returned unchanged. If the download
        fails for any reason the error is printed and the string
        "Page Not found" is returned.
    """
    user_agent = ("Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 "
                  "(KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17")
    if sys.version_info >= (3, 0):  # Python 3.0 or above
        import urllib.request as url_lib
    else:  # Python 2.x
        import urllib2 as url_lib
    try:
        request = url_lib.Request(url, headers={'User-Agent': user_agent})
        response = url_lib.urlopen(request)
        try:
            raw = response.read()
        finally:
            response.close()
    except Exception as e:
        # Best effort: callers go on to search the returned text, so hand back
        # a harmless sentinel string on every Python version instead of None.
        print(str(e))
        return "Page Not found"
    if isinstance(raw, str):
        return raw  # Python 2: read() already returns str
    # Decode rather than str(raw), which would yield the b'...' repr text.
    return raw.decode('utf-8', 'replace').replace('\n', '')
#Extract the title tag
def title(url):
    """Download a page and print the text inside its <title> element.

    Prints an empty line when the page could not be downloaded or when it has
    no complete <title>...</title> element.

    Args:
        url: Address of the page to inspect.
    """
    page = download_page(url)
    page_title = ""
    if page:  # download_page may hand back nothing on failure
        start_title = page.find("<title")
        if start_title != -1:
            # Skip over any attributes to the end of the opening tag.
            end_start_title = page.find(">", start_title + 1)
            stop_title = page.find("</title>", end_start_title + 1)
            # find() returns -1 for "missing"; slicing with it would print
            # unrelated page content, so only slice on a complete match.
            if end_start_title != -1 and stop_title != -1:
                page_title = page[end_start_title + 1:stop_title]
    print(page_title)
#Finding 'Next Link' on a given web page for crawler
def get_next_link(s):
    """Find the first '<a href' link in a piece of HTML.

    Args:
        s: HTML text to search.

    Returns:
        A tuple (link, end_quote). link is the text between the first pair of
        double quotes following '<a href' and end_quote is the index of the
        closing quote, so that s[end_quote:] is the unscanned remainder.
        ("no_links", 0) is returned when there is no anchor or when the
        quotes around its target are missing or unterminated.
    """
    start_link = s.find("<a href")
    if start_link == -1:  # No anchor left in the text
        return "no_links", 0
    start_quote = s.find('"', start_link)
    if start_quote == -1:
        return "no_links", 0
    end_quote = s.find('"', start_quote + 1)
    if end_quote == -1:
        # No double quote remains anywhere after this point, so there is
        # nothing further that could be extracted either.
        return "no_links", 0
    return str(s[start_quote + 1:end_quote]), end_quote
#Getting all links of a page, in document order, for crawler
def get_all_links(page):
    """Collect the target of every '<a href="...">' anchor in a page.

    Args:
        page: HTML text of the page. None or an empty string yields no links.

    Returns:
        List of link strings in the order they appear. Scanning stops at the
        first anchor whose double-quoted target is missing or unterminated.
    """
    links = []
    if not page:
        return links
    # Scan with a moving offset instead of re-slicing the page after every
    # hit, which copied the remaining text each time.
    position = 0
    while True:
        start_link = page.find("<a href", position)
        if start_link == -1:
            break
        start_quote = page.find('"', start_link)
        if start_quote == -1:
            break
        end_quote = page.find('"', start_quote + 1)
        if end_quote == -1:
            break
        links.append(str(page[start_quote + 1:end_quote]))
        position = end_quote  # continue right after the extracted target
    return links
#Check for URL extension so crawler does not crawl images and text files
def extension_scan(url):
    """Tell whether a URL points at an image or plain-text file.

    Only the path component is inspected, case-insensitively, so a matching
    fragment elsewhere in the URL (for instance ".txt" inside a host name)
    does not count, while a query string after the file name does not hide
    the extension.

    Args:
        url: URL to check.

    Returns:
        1 if the URL path ends with one of the skipped extensions, else 0.
    """
    skipped = ('.png', '.jpg', '.jpeg', '.gif', '.tif', '.tiff', '.txt')
    path = urlparse(url).path.lower().rstrip('/')
    return 1 if path.endswith(skipped) else 0
#URL parsing for incomplete or duplicate URLs for users
def url_normalize(url, seed_page):
    """Turn a link found on a page into a canonical absolute URL.

    The link is lower-cased, its fragment and query string are dropped,
    trailing slashes are removed and it is made absolute. Links whose host
    equals the seed host (ignoring a leading "www.") are rewritten to use the
    seed's host spelling.

    NOTE: a scheme-less link that does not start with "/" is treated as a
    bare host name ("example.com/page"), not as a path relative to the page.

    Args:
        url: Link as found in the page.
        seed_page: Site root the link belongs to, e.g. "http://www.site.com".

    Returns:
        The normalised URL, or the string "Invalid URL" when the link points
        outside the seed site.
    """
    url = url.lower()
    seed_page = seed_page.lower()
    seed = urlparse(seed_page)

    # Fragment first: anything after '#', including a '?', is not part of
    # the resource address.
    for marker in ("#", "?"):
        cut = url.find(marker)
        if cut != -1:
            url = url[:cut]

    if not urlparse(url).scheme:
        if url.startswith("//"):  # protocol-relative link
            url = (seed.scheme or "http") + ":" + url
        elif url == "" or url.startswith("/"):  # site-rooted path
            url = seed_page.rstrip("/") + url
        else:  # bare host name
            url = "http://" + url

    # Never eat into the "://" separator itself.
    while url.endswith("/") and not url.endswith("://"):
        url = url[:-1]

    parsed = urlparse(url)
    host = parsed.netloc
    seed_host = seed.netloc
    bare_host = host[4:] if host.startswith("www.") else host
    bare_seed = seed_host[4:] if seed_host.startswith("www.") else seed_host
    if not host or bare_host != bare_seed:
        return "Invalid URL"

    # Swap in the seed's host so "site.com" and "www.site.com" collapse to
    # one spelling, whatever the scheme length is.
    prefix_length = len(parsed.scheme) + len("://")
    return url[:prefix_length] + seed_host + url[prefix_length + len(host):]
#URL parsing for incomplete or duplicate URLs for crawler
def url_parse(url, seed_page):
    """Normalise a crawled link and flag links that leave the seed site.

    The link is lower-cased, spaces become "%20", its fragment and query
    string are dropped, trailing slashes are removed and it is made absolute.
    Links whose host equals the seed host (ignoring a leading "www.") are
    rewritten to use the seed's lower-cased host spelling.

    NOTE: a scheme-less link that does not start with "/" is treated as a
    bare host name ("example.com/page"), not as a path relative to the page.

    Args:
        url: Link as found in the page.
        seed_page: Site root being crawled, e.g. "http://www.site.com".

    Returns:
        A tuple (url, flag): the normalised URL and 0 when it belongs to the
        seed site, or 1 when it points elsewhere.
    """
    url = url.lower().replace(' ', '%20')
    # Host names are case-insensitive and the link is lower-cased, so the
    # seed has to be lower-cased too for the comparison below to be fair.
    seed_page = seed_page.lower()
    seed = urlparse(seed_page)

    # Fragment first: anything after '#', including a '?', is not part of
    # the resource address.
    for marker in ("#", "?"):
        cut = url.find(marker)
        if cut != -1:
            url = url[:cut]

    if not urlparse(url).scheme:
        if url.startswith("//"):  # protocol-relative link
            url = (seed.scheme or "http") + ":" + url
        elif url == "" or url.startswith("/"):  # site-rooted path
            url = seed_page.rstrip("/") + url
        else:  # bare host name
            url = "http://" + url

    # Never eat into the "://" separator itself.
    while url.endswith("/") and not url.endswith("://"):
        url = url[:-1]

    parsed = urlparse(url)
    host = parsed.netloc
    seed_host = seed.netloc
    bare_host = host[4:] if host.startswith("www.") else host
    bare_seed = seed_host[4:] if seed_host.startswith("www.") else seed_host
    if not host or bare_host != bare_seed:
        return (url, 1)

    # Swap in the seed's host so "site.com" and "www.site.com" collapse to
    # one spelling, whatever the scheme length is.
    prefix_length = len(parsed.scheme) + len("://")
    url = url[:prefix_length] + seed_host + url[prefix_length + len(host):]
    return (url, 0)
#Finding 'Next Link' on a given web page for users
def find_next_link(s):
    """Locate the first '<a href' link in a piece of HTML.

    Args:
        s: HTML text to search.

    Returns:
        A tuple (link, end_quote): the text between the first pair of double
        quotes after '<a href', and the index of the closing quote so that
        s[end_quote:] is what remains to be scanned. ("no_links", 0) is
        returned when no anchor is present or when its double-quoted target
        is missing or never closed.
    """
    anchor_at = s.find("<a href")
    if anchor_at == -1:  # No anchor left in the text
        return "no_links", 0
    opening = s.find('"', anchor_at)
    closing = s.find('"', opening + 1) if opening != -1 else -1
    if closing == -1:
        # Slicing with -1 would return unrelated text and a negative offset;
        # without another double quote nothing more can be extracted anyway.
        return "no_links", 0
    return str(s[opening + 1:closing]), closing
#Getting all links as list with the help of 'get_next_links' for users
def find_all_links_as_list(url):
    """Download a page and return the targets of all its anchors.

    Args:
        url: Address of the page to download.

    Returns:
        List of link strings in document order; empty when the page could
        not be downloaded or contains no links.
    """
    page = download_page(url)
    links = []
    # "while page" also covers a failed download that produced no text.
    while page:
        link, end_link = find_next_link(page)
        # A non-positive offset would not move forward through the page (or
        # would index from its end), so treat it as the end of the scan.
        if link == "no_links" or end_link <= 0:
            break
        links.append(link)
        page = page[end_link:]
    return links
#Get all the links from the find_all_links_as_list function and print it in order for users
def find_all_links(*arg):
    """Print every link found on a page.

    Args:
        *arg: arg[0] is the page URL ("http://" is prepended when it has no
            scheme). An optional arg[1] equal to "absolute" prints each link
            after passing it through url_normalize; any other second value
            prints "Invalid Second Argument".
    """
    url = arg[0]
    if not urlparse(url).scheme:
        url = "http://" + url
    parts = urlparse(url)
    seed_page = parts.scheme + '://' + parts.netloc
    print(seed_page)
    found = find_all_links_as_list(url)

    # No mode given: print the links exactly as they appear in the page.
    if len(arg) <= 1:
        for link in found:
            print(link)
        return

    if arg[1] != "absolute":
        print("Invalid Second Argument")
        return

    for link in found:
        print(url_normalize(link, seed_page))
#Main Crawl function that calls all the above function and crawls the entire site sequentially
def web_crawl(*arg):
    """Crawl a site breadth-first starting from a seed URL, printing progress.

    Args:
        *arg: arg[0] is the seed URL. An optional arg[1] is the delay in
            seconds to sleep before each page download.

    Returns:
        An empty string, once no URLs are left to crawl.
    """
    # The seed root never changes, so work it out once.
    a = urlparse(arg[0])
    seed_page = a.scheme + "://" + a.netloc

    to_crawl = [arg[0]]  # URLs waiting to be visited
    crawled = []         # URLs already visited, in visiting order
    visited = set()      # same content as 'crawled', for fast lookups
    i = 0                # number of iterations

    while to_crawl:
        urll = to_crawl.pop(0)
        urll, flag = url_parse(urll, seed_page)
        flag2 = extension_scan(urll)
        # flag == 1: outside the seed domain; flag2 == 1: image or text file.
        if flag != 1 and flag2 != 1 and urll not in visited:
            print("\n" + urll)
            if len(arg) > 1:
                time.sleep(arg[1])
            page = download_page(urll)
            # A failed download may yield no text at all.
            new_links = get_all_links(page) if page else []
            crawled.append(urll)
            visited.add(urll)

            # Drop duplicates from the queue, keeping the first occurrence
            # of each link so the crawl order stays stable.
            queued = set()
            unique = []
            for link in to_crawl + new_links:
                if link not in queued:
                    queued.add(link)
                    unique.append(link)
            to_crawl = unique
        i = i + 1
        print("Iteration No. = " + str(i))
        print("Pages to Crawl = " + str(len(to_crawl)))
        print("Pages Crawled = " + str(len(crawled)))
    return ''
#Finding 'Next Image' from the given raw page for users (image search)
def get_next_image_link(s):
    """Extract the next image URL from raw image-search result markup.

    Args:
        s: Raw page text to search.

    Returns:
        A tuple (link, end_content): the text between 'imgurl=' and the next
        '&', and the index of that '&' so that s[end_content:] is the
        unscanned remainder. ("no_links", 0) is returned when the 'rg_di'
        marker, the 'imgurl=' parameter or the terminating '&' is missing.
    """
    if s.find('rg_di') == -1:  # No result entries left
        return "no_links", 0
    start_line = s.find('"class="rg_di"')
    start_content = s.find('imgurl=', start_line + 1)
    if start_content == -1:
        # Without this check the slice below would be computed from -1 and
        # could return an empty link with offset 0, which never advances.
        return "no_links", 0
    end_content = s.find('&', start_content + 1)
    if end_content == -1:
        return "no_links", 0
    content_raw = str(s[start_content + len('imgurl='):end_content])
    return content_raw, end_content
#Getting all links with the help of 'get_next_image_link'
def get_all_image_links(page):
    """Collect every image URL from raw image-search result markup.

    Args:
        page: Raw page text. None or an empty string yields no links.

    Returns:
        List of image URL strings in the order they appear.
    """
    items = []
    while page:
        item, end_content = get_next_image_link(page)
        # A non-positive offset would leave 'page' unchanged (or index from
        # its end) and could loop forever, so stop scanning.
        if item == "no_links" or end_content <= 0:
            break
        items.append(item)
        page = page[end_content:]
    return items
############## Download Google Images ############
#Download Image Links
def download_google_images(search_keyword):
    """Search Google Images and record the image links that were found.

    For every keyword the links are printed and appended to 'output.txt' in
    the current working directory.

    Args:
        search_keyword: A single search phrase, or a list/tuple of phrases.
            For a list each record written to the file is prefixed with its
            1-based position.
    """
    is_batch = isinstance(search_keyword, (list, tuple))
    keywords = search_keyword if is_batch else [search_keyword]

    for number, keyword in enumerate(keywords, 1):
        if is_batch:
            print("Item no.: " + str(number) + " -->" + " Item name = " + str(keyword))
            record_prefix = str(number) + ': '
        else:
            print("Item name = " + str(keyword))
            record_prefix = ''

        search = keyword.replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        # A failed download may yield no text at all; scan an empty page then.
        raw_html = download_page(url) or ""
        items = get_all_image_links(raw_html)
        print("Image Links = " + str(items))
        print("Total Image Links = " + str(len(items)))
        print("\n")

        # Append one record per keyword; 'with' closes the file even if the
        # write fails.
        with open('output.txt', 'a') as info:
            info.write(record_prefix + str(keyword) + ": " + str(items) + "\n\n\n")
######### Images Download From a Webpage #########
#Finding 'Next Image Link' for get_all_images
def get_next_images_link(s):
    """Find the source of the next '<img' tag in a piece of HTML.

    Both double- and single-quoted 'src' values are understood. A 'src'
    whose value is not quoted is skipped and the search continues with the
    following '<img' tag.

    Args:
        s: HTML text to search.

    Returns:
        A tuple (link, end_link): the quoted 'src' value and the index of its
        closing quote, so that s[end_link:] is the unscanned remainder.
        ("no_links", 0) is returned when no usable image source is left.
    """
    position = 0
    while True:
        start_line = s.find("<img", position)
        if start_line == -1:  # No image tag left
            return "no_links", 0
        start_link = s.find('src=', start_line)
        if start_link == -1:
            return "no_links", 0
        value_at = start_link + len('src=')
        quote = s[value_at:value_at + 1]
        if quote in ('"', "'"):
            end_link = s.find(quote, value_at + 1)
            if end_link == -1:  # value never closed
                return "no_links", 0
            return str(s[value_at + 1:end_link]), end_link
        # Unquoted value: its end cannot be located reliably, move on.
        position = value_at
#Getting all image links with the help of 'get_next_links' for get_all_images
def get_all_images_links(url):
    """Download a page and return the source of every image on it.

    Args:
        url: Address of the page to download.

    Returns:
        List of image source strings in document order; empty when the page
        could not be downloaded or has no images.
    """
    page = download_page(url)
    links = []
    # "while page" also covers a failed download that produced no text.
    while page:
        link, end_link = get_next_images_link(page)
        # A non-positive offset would not move forward through the page (or
        # would index from its end), so treat it as the end of the scan.
        if link == "no_links" or end_link <= 0:
            break
        links.append(link)
        page = page[end_link:]
    return links
#Download all images in hard disk
def get_all_images(*arg):
    """Print the image links of a page and optionally download the images.

    Args:
        *arg: arg[0] is the page URL. When an optional arg[1] equals
            "download", every image is saved to the current working
            directory as "img <index>.jpg"; a failed download is reported
            and skipped.
    """
    url = arg[0]
    # urllib.URLopener only exists on Python 2; urlretrieve is available on
    # both major versions under different module paths.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    links = get_all_images_links(url)
    print(links)
    if len(arg) > 1 and arg[1] == "download":
        s = urlparse(url)
        seed_page = s.scheme + '://' + s.netloc
        for index, raw_link in enumerate(links):
            # The "outside the seed site" flag is ignored on purpose: images
            # are fetched wherever they are hosted.
            link, _ = url_parse(raw_link, seed_page)
            print("downloading --> " + link)
            try:
                urlretrieve(link, "img " + str(index) + ".jpg")
            except Exception as e:
                # Best effort: report the failure and carry on with the rest.
                print("download failed --> " + link + " (" + str(e) + ")")
########## End ##########