-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathData_Collection.py
More file actions
104 lines (85 loc) · 3.67 KB
/
Data_Collection.py
File metadata and controls
104 lines (85 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import os
import random
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm


def Login_operation(username="zzuli", password="zzuli"):
    """Start Chrome, log in to fenqubiao.com and open the journal list page.

    After submitting the login form the function follows the
    ``/Connect/NewVersion.aspx`` link and then the
    ``/Macro/Journal?name=计算机科学`` link.

    Args:
        username: Text typed into the ``Username`` field.
        password: Text typed into the ``Password`` field.

    Returns:
        The Chrome ``WebDriver`` left on the journal list page. The caller
        owns it and is responsible for calling ``quit()``.

    Raises:
        Exception: Any error raised while driving the page is re-raised
            after the already-started browser has been closed.
    """
    # Bundled chromedriver / Chrome binaries, relative to the working directory.
    chromedriver_path = r"Driver/chromedriver-win64/chromedriver.exe"
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r"Driver/chrome-win64/chrome.exe"

    try:
        # Selenium 4 style: the driver path is supplied through a Service
        # object (the ``executable_path`` keyword was removed in 4.10).
        driver = webdriver.Chrome(
            service=Service(chromedriver_path), options=chrome_options
        )
    except TypeError:
        # Selenium 3 does not know the ``service`` keyword.
        driver = webdriver.Chrome(
            executable_path=chromedriver_path, options=chrome_options
        )

    try:
        driver.get("https://www.fenqubiao.com/Default.aspx")

        # Fill in and submit the login form.
        username_box = driver.find_element(By.ID, 'Username')
        username_box.send_keys(username)
        password_box = driver.find_element(By.ID, 'Password')
        password_box.send_keys(password)
        driver.find_element(By.ID, 'login_button').click()

        # Give the post-login page time to load before looking for links.
        time.sleep(5)

        # Navigate: new-version entry page, then the Computer Science list.
        driver.find_element(
            By.XPATH, "//a[@href='/Connect/NewVersion.aspx']"
        ).click()
        driver.find_element(
            By.XPATH, "//a[@href='/Macro/Journal?name=计算机科学']"
        ).click()
    except Exception:
        # Do not leave an orphaned browser process behind on failure.
        driver.quit()
        raise
    return driver
# Journal records appended by get_Collection() and not yet written to disk;
# the main script dumps this list to a JSON file and then resets it.
original_data = []
def get_Collection(driver: WebDriver, url):
    """Scrape one journal page and append its record to ``original_data``.

    The record is a dict built from the page's ``box-body`` element:

    * ``'分区'``: list with the CSS ``::before`` ``content`` value of every
      ``<span>`` in the first table cell that contains any span (empty list
      when no cell has one).
    * one ``label -> value`` entry per table row, taken from the stripped
      text of the row's first and second ``<td>``.

    Args:
        driver: Selenium driver used to load and inspect the page.
        url: Address of the journal page to scrape.

    Returns:
        None. The record is appended to the module-level ``original_data``.
    """
    driver.get(url)
    # NOTE: implicitly_wait() does not pause the script; it only sets how
    # long (here a random 1-3 s) later find_element calls keep polling for
    # an element that is not present yet.
    driver.implicitly_wait(random.randint(1, 3))

    info_box = driver.find_element(By.CLASS_NAME, "box-body")
    rows = info_box.find_elements(By.TAG_NAME, "tr")
    cells = info_box.find_elements(By.TAG_NAME, "td")

    # Read the pseudo-element of the span element itself (passed in as
    # arguments[0]) instead of re-selecting it by class name, which breaks
    # on multi-class or empty class attributes.
    before_content_js = (
        "return window.getComputedStyle(arguments[0], '::before')"
        ".getPropertyValue('content');"
    )

    # Local and reset on every call so a page without any span cell cannot
    # inherit the previous journal's value.
    partition = []
    for cell in cells:
        spans = cell.find_elements(By.TAG_NAME, "span")
        if spans:
            partition = [
                driver.execute_script(before_content_js, span) for span in spans
            ]
            break  # only the first cell that holds spans is used

    journal_info = {'分区': partition}
    for row in rows:
        columns = row.find_elements(By.TAG_NAME, "td")
        if len(columns) < 2:
            # Rows without a label/value pair of <td> cells carry no data.
            continue
        journal_info[columns[0].text.strip()] = columns[1].text.strip()

    original_data.append(journal_info)
if __name__ == '__main__':
    # Script entry point: log in, scrape every URL listed in links.json and
    # write the collected records to data/data0.json, data/data1.json, ...
    BATCH_SIZE = 50  # number of journal records stored per output file

    driver = Login_operation()
    try:
        with open('links.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The output directory may not exist on a fresh checkout.
        os.makedirs('data', exist_ok=True)

        batch_index = 0
        progress = tqdm(data.items(), total=len(data), desc="Processing links")
        for count, (link, journal) in enumerate(progress, start=1):
            print(link, journal)
            get_Collection(driver, link)

            # Flush when a batch is full, and once more after the last link
            # so a trailing partial batch is not lost.
            is_last = count == len(data)
            if len(original_data) >= BATCH_SIZE or (is_last and original_data):
                with open('data/data{}.json'.format(batch_index), 'w', encoding='utf-8') as f:
                    json.dump(original_data, f, ensure_ascii=False, indent=4)
                batch_index += 1
                # Start a fresh list so written batches do not stay in memory.
                original_data = []
    finally:
        # Always close the browser, even when scraping fails part-way.
        driver.quit()