ComicScraper/comicScraper.py at master · Gink3/ComicScraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
'''
	Taylor King
	telltaylor13@gmail.com

	Purpose: Grab links from an index page of getcomics.info
	Outputs: A links.txt file that contains a list of links to copy
	for jdownloader


	TODO:
		allow input of a search url and get_link results by page
		Add support for pages like https://getcomics.info/other-comics/sex-criminals-001-010-tpb-free-get_link/

	To run:
		source mods/bin/activate
		python3 comicScraper.py


'''
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import os.path
import os

# Path of the output file that write_links() appends to.
rawlinks = "links.txt"
# Entries of the form "<href> <title>" collected by get_link(); the driver
# loop at the bottom of the file writes them out and then clears the set.
set_links = set()


def index_page(url):
	"""Scrape one index (listing) page and dispatch every post link on it.

	Downloads ``url``, looks at each ``<h1 class="post-title">`` heading and
	follows the anchors inside it. Links whose URL contains the substring
	"week" are handed to ``week_page``; every other link goes to ``get_link``.

	Args:
		url: Address of the index page to download.

	Returns:
		None. Results are accumulated by the functions this one calls.
	"""
	print("Index Page")
	response = requests.get(url)
	soup = BeautifulSoup(response.text, 'html.parser')
	for heading in soup.find_all('h1', {'class': 'post-title'}):
		for anchor in heading.find_all('a'):
			href_value = anchor.get('href')
			# An <a> without an href yields None; `"week" in None` would raise
			# TypeError and abort the whole scrape, so skip such anchors.
			if not href_value:
				continue
			if "week" in href_value:
				week_page(href_value)
			else:
				get_link(href_value)


def get_link(url):
	"""Collect the download link(s) of a single comic page into ``set_links``.

	Two page layouts are handled:

	* "red button" pages, which carry an ``<a title="Download Now">`` anchor;
	* collection pages, where each entry is an
	  ``<a rel="noopener noreferrer">`` anchor wrapping a ``<span>`` whose
	  text is exactly "Main Server".

	Every link found is stored in the module-level ``set_links`` as
	``"<href> <title>"`` and the title is printed once per stored link.
	Pages that yield no usable link are ignored; pages that have links but
	no recognisable title are reported and skipped instead of crashing.

	Args:
		url: Address of the comic page to download.

	Returns:
		None.
	"""
	response = requests.get(url)
	soup = BeautifulSoup(response.text, 'html.parser')

	button = soup.find('a', {'title': 'Download Now'})
	if button is not None:
		links = [button.get('href')]
	else:
		links = []
		for tag in soup.find_all('a', {'rel': 'noopener noreferrer'}):
			span = tag.find('span')
			if span is not None and span.text == "Main Server":
				links.append(tag.get('href'))

	# Anchors without an href give None, which cannot be concatenated below.
	links = [link for link in links if link]
	if not links:
		return

	# The title is the same for every link on the page, so look it up once.
	titletext = _page_title(soup)
	if titletext is None:
		print("No title found, skipping " + url)
		return

	for link in links:
		set_links.add(link + " " + titletext)
		print(titletext)


def _page_title(soup):
	"""Return the cleaned post title of a parsed comic page, or None if absent.

	The title is the text of the first ``<h2>`` inside
	``<section class="post-contents">`` with the "The Story – " prefix removed
	and spaces replaced by underscores.
	"""
	section = soup.find('section', {'class': 'post-contents'})
	heading = section.h2 if section is not None else None
	if heading is None:
		return None
	return heading.text.replace("The Story – ", "").replace(" ", "_")


def week_page(url):
	"""Download a "week" page and pass each qualifying link to ``get_link``.

	Every ``<a rel="noopener noreferrer">`` anchor on the page is examined;
	anchors that carry an ``href`` are forwarded to ``get_link`` in document
	order, anchors without one are ignored.

	Args:
		url: Address of the week page to download.

	Returns:
		None.
	"""
	print("Week Page")
	page = requests.get(url)
	parsed = BeautifulSoup(page.text, 'html.parser')
	for anchor in parsed.findAll('a', {'rel': 'noopener noreferrer'}):
		href = anchor.get('href')
		# Nothing to follow when the anchor has no target.
		if href is None:
			continue
		get_link(href)


def write_links(linkset):
	"""Append every entry of ``linkset`` to the links file, one per line.

	The target path is the module-level ``rawlinks``. Existing file content
	is kept; new entries are added at the end.

	Args:
		linkset: Iterable of strings to write; each gets a trailing newline.

	Returns:
		None.
	"""
	print("Writing links")
	# Plain append mode is enough because nothing is read back, and an
	# explicit UTF-8 encoding keeps non-ASCII titles from failing on
	# platforms whose default encoding cannot represent them.
	with open(rawlinks, "a", encoding="utf-8") as dataFile:
		dataFile.writelines(link + '\n' for link in linkset)


# Number of newest index pages to scrape (1 = front page only).
# The loop below visits pages 1..n inclusive, so n is the real page count.
n = 1

base_url = "https://getcomics.info"
# Suffix appended to every page URL; empty for the plain listing.
# NOTE(review): presumably reserved for a search query (see TODO in the
# module docstring) — confirm before relying on it.
query = ""
for i in range(1, n + 1):
	# The front page has no /page/<i>/ segment; later pages live at
	# <base_url>/page/<i>, e.g. https://getcomics.info/page/3
	if i == 1:
		url = base_url + query
	else:
		url = base_url + "/page/" + str(i) + query
	print(url)
	index_page(url)
	# Flush after every page so a failure on a later page does not lose the
	# links already gathered, then start the next page with an empty set.
	write_links(set_links)
	set_links.clear()