-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProcessor.py
More file actions
41 lines (36 loc) · 1.32 KB
/
Processor.py
File metadata and controls
41 lines (36 loc) · 1.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import urllib2
import os
from bs4 import BeautifulSoup
from urlparse import urlparse
class Processor:
    """Download the images of a news page into a per-article folder.

    The article id is read from the last path segment of the page URL
    (``<id>-<rest>``).  Images are collected from the
    ``<div id="news-id-<id>">`` element of the page and saved under
    ``<folder>/<id>/``.
    """

    def __init__(self, folder):
        """Remember *folder* as the root directory for all downloads."""
        self._folder = folder

    def ProcessUrl(self, url):
        """Parse the page at *url* and download every image it lists.

        Returns a dict with two keys: ``fullFolder`` (the directory the
        images were saved to) and ``urls`` (the image URLs found).
        """
        news_id = self.ParseId(url)
        urls = self.ParsePage(url, news_id)
        fullFolder = self.DownloadUrls(urls, news_id)
        return dict(fullFolder=fullFolder, urls=urls)

    def ParseId(self, url):
        """Return the integer article id encoded in *url*.

        The id is the text before the first ``-`` of the last path
        segment; query string and fragment are ignored.

        Raises:
            ValueError: if that text is not an integer.
        """
        path = urlparse(url).path
        # Strip trailing slashes first: ".../123-title/" would otherwise
        # yield an empty last segment and fail in int().
        last_segment = path.rstrip("/").split("/")[-1]
        return int(last_segment.split("-")[0])

    def ParsePage(self, url, id):
        """Return the ``src`` of every ``<img>`` in the article container.

        The container is the ``<div>`` whose id is ``news-id-<id>``.  An
        empty list is returned when the page has no such element, and
        ``<img>`` tags without a ``src`` attribute are skipped.
        """
        response = urllib2.urlopen(url)
        try:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(response, "html.parser")
        finally:
            # The document is fully read by the constructor; release the
            # connection whether or not parsing succeeded.
            response.close()
        container = soup.find("div", {"id": "news-id-%i" % id})
        if container is None:
            return []
        return [
            img["src"]
            for img in container.find_all("img")
            if img.has_attr("src")
        ]

    def DownloadUrls(self, urls, id):
        """Save each URL in *urls* under ``<folder>/<id>/``.

        Files are named after the last component of their URL; a file
        that already exists locally is not downloaded again.  Each URL is
        printed as it is processed.

        Returns the path of the target directory.
        """
        fullFolder = os.path.join(self._folder, str(id))
        if not os.path.exists(fullFolder):
            # makedirs also creates missing parents of the root folder.
            os.makedirs(fullFolder)
        for url in urls:
            print(url)
            local = os.path.join(fullFolder, os.path.basename(url))
            if os.path.exists(local):
                continue
            response = urllib2.urlopen(url)
            try:
                # Read the whole payload before touching the disk: a
                # failed transfer must not leave an empty or partial file
                # behind, since the exists() check above would then skip
                # it on every later run.
                data = response.read()
            finally:
                response.close()
            with open(local, "wb") as local_file:
                local_file.write(data)
        return fullFolder