-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathcomicScraper.py
More file actions
executable file
·145 lines (110 loc) · 2.99 KB
/
comicScraper.py
File metadata and controls
executable file
·145 lines (110 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
'''
Taylor King
telltaylor13@gmail.com
Purpose: Grab links from an index page of getcomics.info
Outputs: A links.txt file that contains a list of links to copy
for jdownloader
TODO:
allow input of a search url and get_link results by page
Add support for pages like https://getcomics.info/other-comics/sex-criminals-001-010-tpb-free-get_link/
To run:
source mods/bin/activate
python3 comicScraper.py
'''
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import os.path
import os
# Path of the output file; write_links() appends one entry per line to it.
rawlinks = "links.txt"
# Entries gathered by get_link(), each a string of the form "<href> <title>".
set_links = set()
def index_page(url):
    """Scrape one index page and dispatch every post link found on it.

    Fetches ``url``, looks inside each ``<h1 class="post-title">`` element
    for anchors, and passes every href to ``week_page`` when it contains the
    substring "week", or to ``get_link`` otherwise.

    Args:
        url: Address of the index page to fetch.
    """
    print("Index Page")
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for heading in soup.findAll('h1', {'class': 'post-title'}):
        for anchor in heading.findAll('a'):
            href_value = anchor.get('href')
            # An anchor without an href attribute yields None, and the
            # substring test below would raise TypeError on it.
            if href_value is None:
                continue
            if "week" in href_value:
                week_page(href_value)
            else:
                get_link(href_value)
def _post_title(soup):
    """Return the page's heading formatted for the links file, or None.

    The heading is the first ``<h2>`` inside ``<section class="post-contents">``.
    The text "The Story – " is removed and spaces are replaced by underscores.
    None is returned when the section or its heading is absent.
    """
    section = soup.find('section', {'class': 'post-contents'})
    heading = section.h2 if section is not None else None
    if heading is None:
        return None
    return heading.text.replace("The Story – ", "").replace(" ", "_")


def get_link(url):
    """Collect the download link(s) of one post page into ``set_links``.

    If the page has an anchor titled "Download Now", its href is the only
    link taken. Otherwise every ``rel="noopener noreferrer"`` anchor whose
    first ``<span>`` reads "Main Server" contributes its href. Each link is
    stored as "<href> <title>" and the title is printed once per stored link.

    Args:
        url: Address of the post page to fetch.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    testtag = soup.find('a', {'title': 'Download Now'})
    if testtag is not None:
        candidates = [testtag.get('href')]
    else:
        candidates = []
        for tag in soup.findAll('a', {'rel': 'noopener noreferrer'}):
            span = tag.find('span')
            if span is not None and span.text == "Main Server":
                candidates.append(tag.get('href'))

    # Anchors lacking an href give None, which cannot be concatenated below.
    links = [link for link in candidates if link is not None]
    if not links:
        return

    # The title is the same for every link on the page, so look it up once.
    titletext = _post_title(soup)
    if titletext is None:
        print("Skipping " + url + ": no post title found")
        return

    for link in links:
        set_links.add(link + " " + titletext)
        print(titletext)
def week_page(url):
    """Follow every outbound post link listed on a weekly round-up page.

    Fetches ``url`` and calls ``get_link`` with the href of each
    ``rel="noopener noreferrer"`` anchor that actually has an href.

    Args:
        url: Address of the weekly page to fetch.
    """
    print("Week Page")
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for anchor in soup.findAll('a', {'rel': 'noopener noreferrer'}):
        target = anchor.get('href')
        if target is None:
            # Nothing to follow for an anchor without an href.
            continue
        get_link(target)
def write_links(linkset):
    """Append every entry of ``linkset`` to the links file, one per line.

    Args:
        linkset: Iterable of strings; each is written followed by a newline.
    """
    print("Writing links")
    # Append-only ("a"): the file is never read back here. UTF-8 is set
    # explicitly because scraped titles may contain non-ASCII characters
    # that the platform's default encoding cannot always represent.
    with open(rawlinks, "a", encoding="utf-8") as dataFile:
        dataFile.writelines(link + '\n' for link in linkset)
# Upper bound for the index pages to scrape: range(1, n) visits pages
# 1 .. n-1, so n must be at least 2 for anything to be fetched.
n = 2
base_url = "https://getcomics.info"
# Suffix appended to every index URL.
query = ""
for i in range(1, n):
    # Page 1 is the site root; later pages live under /page/<i>.
    if i == 1:
        url = base_url + query
    else:
        url = base_url + "/page/" + str(i) + query
    print(url)
    index_page(url)
    # NOTE(review): assumed to run once per page (flush, then start the next
    # page with an empty set) -- confirm against the intended behavior.
    write_links(set_links)
    set_links.clear()