-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
260 lines (239 loc) · 10.3 KB
/
script.py
File metadata and controls
260 lines (239 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
import time
import os
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import argparse
from dotenv import load_dotenv
import sys
# Settings
CONFLUENCE_BASE_URL = "https://dokumentacja-inpost.atlassian.net/wiki/spaces/PL/overview" # <- Change to your URL
DOWNLOAD_DIR = os.path.abspath("downloads")
# Create folder for files
os.makedirs(DOWNLOAD_DIR, exist_ok=True)
# Selenium configuration (Chrome): save downloads into DOWNLOAD_DIR without prompting
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": DOWNLOAD_DIR,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
})
# chrome_options.add_argument("--headless=new")
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--allow-insecure-localhost')
# Browser initialization (side effect: Chrome is launched at import time)
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(browser, 30)
# Load environment variables from .env file
load_dotenv()
# NOTE(review): .env is loaded AFTER the browser has started. The URL override
# below still works (it is read here), but browser options cannot be driven
# from .env this way — confirm this ordering is intentional.
CONFLUENCE_BASE_URL = os.getenv("CONFLUENCE_BASE_URL", CONFLUENCE_BASE_URL)
# Colored labels
class Color:
    """ANSI escape codes used to colorize console log output.

    Attribute names double as log-level names: Logger.log looks them up
    with getattr(Color, level, ...).
    """
    INFO = '\033[94m'     # blue
    SUCCESS = '\033[92m'  # green
    WARNING = '\033[93m'  # yellow
    ERROR = '\033[91m'    # red
    END = '\033[0m'       # reset colors
class Logger:
    """Minimal logger: prints a colorized line and appends the plain line to a file."""
    def __init__(self, logfile):
        # Path of the text file that receives every log line.
        self.logfile = logfile
    def log(self, msg, level="INFO"):
        # Pick the ANSI color for the level; unknown levels fall back to INFO (blue).
        color = getattr(Color, level, Color.INFO)
        line = f"[{level}] {msg}"
        print(f"{color}{line}{Color.END}")
        # Open in append mode per call so lines are flushed immediately and the
        # file handle is never held open between log calls.
        with open(self.logfile, 'a', encoding='utf-8') as f:
            f.write(line + '\n')
# Module-wide logger; log.txt lives next to this script.
logger = Logger(os.path.join(os.path.dirname(__file__), 'log.txt'))
def cprint(msg, level="INFO"):
    # Convenience wrapper: log msg at the given level via the module logger.
    logger.log(msg, level)
# Function to download all links to subpages
def get_all_page_links(start_url, visited=None):
    """Recursively crawl *start_url*, collecting links to pages in the PL
    space and downloading a DOC export for each link found.

    Args:
        start_url: URL of the page to start crawling from.
        visited: set of URLs already crawled (internal recursion state).

    Returns:
        list of all page URLs discovered, including those found on subpages.
    """
    if visited is None:
        visited = set()
    # Cycle guard: never re-enter the page we are about to crawl.
    visited.add(start_url)
    cprint(f"Visiting: {start_url}", "INFO")
    browser.get(start_url)
    time.sleep(2)  # allow the page and its navigation tree to render
    links = set()
    # Find all links to subpages and chapters
    for a in browser.find_elements(By.CSS_SELECTOR, "a[href*='/pages/viewpage.action?pageId='], a[href*='/wiki/spaces/PL/pages/']"):
        href = a.get_attribute("href")
        # Filter only links to pages in this space
        if href and '/pages/' in href and '/wiki/spaces/PL/' in href and href not in visited:
            cprint(f"[LINK] Found: {href}", "INFO")
            links.add(href)
    all_links = set(links)
    # Mark the newly found links as visited BEFORE recursing so sibling
    # branches do not re-crawl them.  BUG FIX: the original updated `visited`
    # and then tested `if link not in visited` inside the loop, which was
    # always false — the recursion and the download call were dead code.
    visited.update(links)
    # Recursively get links from subpages, then export each page.
    for link in links:
        cprint(f"[RECUR] Entering subpage: {link}", "INFO")
        sub_links = get_all_page_links(link, visited)
        all_links.update(sub_links)
        download_doc_for_page(link)
    return list(all_links)
# Function to download DOC for a single page
def download_doc_for_page(page_url):
    """Export one Confluence page to Word via the `exportword` endpoint and
    wait (up to 60 s) for the file to appear in DOWNLOAD_DIR.

    Args:
        page_url: full URL of the Confluence page to export.
    """
    import re
    cprint(f"Processing: {page_url}", "INFO")
    try:
        # Extract the numeric page id.  Two patterns cover every URL shape
        # seen here: the original tried five, but the last three were all
        # subsumed by r'/pages/(\d+)'.
        match = re.search(r'pageId=(\d+)', page_url) or re.search(r'/pages/(\d+)', page_url)
        if not match:
            cprint(f"Could not find pageId in URL: {page_url}", "ERROR")
            return
        # Slug (last URL segment) is used to recognize the downloaded file.
        slug = page_url.rstrip('/').split('/')[-1]
        page_id = match.group(1)
        export_url = f"https://dokumentacja-inpost.atlassian.net/wiki/exportword?pageId={page_id}"
        cprint(f"Opening: {export_url}", "INFO")
        browser.get(export_url)
        # Detect permission / missing-page error pages (Polish and English variants).
        if "Nie masz uprawnień" in browser.page_source or "You do not have permission" in browser.page_source:
            cprint(f"No permission to download: {export_url}", "ERROR")
            return
        if "Nie znaleziono strony" in browser.page_source or "Page Not Found" in browser.page_source:
            cprint(f"Page not found: {export_url}", "ERROR")
            return
        # Poll the download directory until a new .doc/.docx containing the
        # slug shows up, or the timeout elapses.
        timeout = 60  # seconds
        start_time = time.time()
        before_files = set(os.listdir(DOWNLOAD_DIR))
        file_downloaded = False
        while time.time() - start_time < timeout:
            new_files = set(os.listdir(DOWNLOAD_DIR)) - before_files
            doc_files = [f for f in new_files
                         if slug in f and f.lower().endswith(('.doc', '.docx'))]
            if doc_files:
                cprint(f"Downloaded: {doc_files[-1]}", "SUCCESS")
                file_downloaded = True
                break
            time.sleep(1)
        if not file_downloaded:
            cprint(f"File not downloaded for: {export_url}", "ERROR")
    except Exception as e:
        # Best-effort: log and keep going so one bad page does not stop the crawl.
        cprint(f"Failed to download: {page_url} ({e})", "ERROR")
def expand_all_chevrons():
    # Expand all menu subtrees (chevrons) in the Confluence navigation tree.
    # Repeatedly scans for collapsed chevron icons, clicks each new one via
    # JavaScript, and stops after three consecutive scans that find nothing new.
    expanded_html = set()        # outerHTML of chevrons already clicked
    no_new_chevrons_count = 0    # consecutive scans that found no new chevrons
    max_no_new_chevrons = 3      # give up after this many empty scans in a row
    while True:
        chevrons = browser.find_elements(By.CSS_SELECTOR, '[data-testid="chevron-down"], [data-testid="chevron-right"]')
        new_chevrons = []
        for c in chevrons:
            try:
                chevron_html = c.get_attribute('outerHTML')
                aria_expanded = c.get_attribute('aria-expanded')
            except Exception:
                # Element went stale while the tree re-rendered; skip it.
                continue
            # Only consider chevrons we have not clicked and that are not
            # already expanded.
            if chevron_html not in expanded_html and aria_expanded != "true":
                new_chevrons.append(c)
        if not new_chevrons:
            no_new_chevrons_count += 1
            if no_new_chevrons_count >= max_no_new_chevrons:
                break
            time.sleep(0.2)
            continue
        no_new_chevrons_count = 0
        # Click them one by one; after each click wait until the spinner disappears.
        for chevron in new_chevrons:
            try:
                chevron_html = chevron.get_attribute('outerHTML')
                browser.execute_script("arguments[0].scrollIntoView();", chevron)
                browser.execute_script("arguments[0].click();", chevron)
                expanded_html.add(chevron_html)
                try:
                    WebDriverWait(browser, 10).until_not(
                        lambda d: d.find_elements(By.CSS_SELECTOR, '[data-testid="tree-item-spinner"]')
                    )
                except Exception:
                    # Spinner never appeared or wait timed out — keep going.
                    pass
            except Exception:
                continue
        time.sleep(0.1)  # Short wait for the tree to update
# Function to collect links (without downloading)
def collect_all_links(start_url, visited=None):
    """Recursively collect all PL-space page links reachable from *start_url*.

    Unlike get_all_page_links, this only gathers URLs; nothing is downloaded.

    Args:
        start_url: URL to start crawling from.
        visited: set of URLs already crawled (internal recursion state).

    Returns:
        list of all discovered page URLs.
    """
    if visited is None:
        visited = set()
    visited.add(start_url)  # cycle guard for the entry page
    cprint(f"Visiting: {start_url}", "INFO")
    browser.get(start_url)
    time.sleep(2)
    # Expand the whole navigation tree so every page link is in the DOM.
    expand_all_chevrons()
    time.sleep(1)
    links = set()
    for a in browser.find_elements(By.CSS_SELECTOR, "a[href*='/pages/viewpage.action?pageId='], a[href*='/wiki/spaces/PL/pages/']"):
        href = a.get_attribute("href")
        if href and '/pages/' in href and '/wiki/spaces/PL/' in href and href not in visited:
            cprint(f"[LINK] Found: {href}", "INFO")
            links.add(href)
    all_links = set(links)
    # Mark new links visited BEFORE recursing, then recurse on each one.
    # BUG FIX: the original tested `if link not in visited` AFTER updating
    # `visited` with those same links, so it never recursed into subpages.
    visited.update(links)
    for link in links:
        sub_links = collect_all_links(link, visited)
        all_links.update(sub_links)
    return list(all_links)
def verify_downloaded_files(links, download_dir):
    """Check which links already have a matching .doc/.docx in *download_dir*.

    A link counts as downloaded when its slug (last URL segment) appears in a
    document file name, treating ' ', '+' and '_' as interchangeable
    separators.  As a fallback heuristic, the existence of any document
    modified within the last 10 minutes marks a link as downloaded.

    Args:
        links: iterable of page URLs.
        download_dir: directory where the Word exports were saved.

    Returns:
        dict mapping each link to 'downloaded' or 'not_downloaded'.
    """
    def _norm(s):
        # Collapse the three separator styles into one so slug and file name
        # compare equal regardless of which form Confluence used.
        # BUG FIX: the original chained .replace(' ', '+').replace(' ', '_'),
        # making the second replace a no-op and the variants inconsistent.
        return s.lower().replace('+', ' ').replace('_', ' ')

    # Snapshot the directory once instead of re-listing it for every link.
    doc_files = [f for f in os.listdir(download_dir)
                 if f.lower().endswith(('.doc', '.docx'))]
    # Fallback heuristic, computed once: newest document created recently.
    recent_doc_exists = False
    if doc_files:
        latest = max((os.path.join(download_dir, f) for f in doc_files),
                     key=os.path.getmtime)
        recent_doc_exists = (time.time() - os.path.getmtime(latest)) < 600

    statuses = {}
    for link in links:
        slug = _norm(link.rstrip('/').split('/')[-1])
        found = any(slug in _norm(f) for f in doc_files)
        if not found and recent_doc_exists:
            # A freshly downloaded file probably belongs to this link even
            # if its name does not contain the slug.
            found = True
        statuses[link] = 'downloaded' if found else 'not_downloaded'
    return statuses
# Export links to CSV with flag
def export_links_to_csv(links, filename, statuses=None):
    """Write one (url, status) row per link to *filename* as UTF-8 CSV.

    Links missing from *statuses* — or all links, when *statuses* is falsy —
    are flagged 'nie_zebrane' ("not collected").
    """
    lookup = statuses if statuses else {}
    rows = [(url, lookup.get(url, 'nie_zebrane')) for url in links]
    with open(filename, 'w', newline='', encoding='utf-8') as fh:
        out = csv.writer(fh)
        out.writerow(['url', 'status'])
        out.writerows(rows)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--retry-failed', action='store_true', help='Download only those that failed')
    args = parser.parse_args()
    cprint(f"Starting link collection from: {CONFLUENCE_BASE_URL}", "INFO")
    try:
        # Crawl the space, record per-link download status, dump to CSV.
        all_links = collect_all_links(CONFLUENCE_BASE_URL)
        statuses = verify_downloaded_files(all_links, DOWNLOAD_DIR)
        export_links_to_csv(all_links, 'confluence_links.csv', statuses)
        cprint(f"Collected {len(all_links)} links. Statuses saved in confluence_links.csv", "SUCCESS")
        # Downloading files for all or only undownloaded.
        # BUG FIX: verify_downloaded_files() reports 'downloaded' /
        # 'not_downloaded'; the original compared against the Polish
        # 'pobrano', which never matches, so --retry-failed re-downloaded
        # every link instead of only the failed ones.
        if args.retry_failed:
            to_download = [l for l in all_links if statuses.get(l) != 'downloaded']
        else:
            to_download = all_links
        if to_download:
            cprint(f"Downloading {len(to_download)} files...", "INFO")
            for link in to_download:
                try:
                    download_doc_for_page(link)
                except Exception as e:
                    cprint(f"Failed to download: {link} ({e})", "ERROR")
        else:
            cprint("No files to download.", "WARNING")
    finally:
        # Always release the Chrome instance, even if crawling raised.
        browser.quit()
    cprint("Finished.", "SUCCESS")