-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDecisionTree.py
More file actions
87 lines (67 loc) · 2.68 KB
/
DecisionTree.py
File metadata and controls
87 lines (67 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import sys
import os
import random
import tldextract
import numpy
import scipy.stats
import sklearn.tree
import sklearn.feature_selection
# Seed Python's built-in `random` module so that any use of it is repeatable
# between runs.
random.seed(0)
# url -> set of every resource listed for that url (filled by readResources).
site_resources = {}
# (as_root, url) -> set of the resources kept for that page load (filled by
# readResources); featureSelection later replaces each set with a numeric row.
page_loads = {}
def featureSelection():
    """Turn every page load's resource set into a fixed-width numeric row.

    Operates in place on the module-level ``page_loads`` mapping, whose values
    are expected to be collections of resource strings:

    1. Each resource is encoded as an integer and each row is padded to the
       size of the largest resource set.
    2. ``sklearn.feature_selection.VarianceThreshold`` drops low-variance
       columns, and every value in ``page_loads`` is replaced by the matching
       row of the array returned by ``fit_transform``.

    Does nothing when ``page_loads`` is empty. Returns None.
    """
    import zlib  # local import so this function does not rely on file-level edits

    if not page_loads:
        # No page loads were read; max() below would raise on an empty sequence.
        return

    def encode(resource):
        # The built-in hash() is salted per process for str, so it yields
        # different features on every run; crc32 is stable across runs.
        return zlib.crc32(resource.encode('utf-8'))

    width = max(len(resources) for resources in page_loads.values())
    padding = encode('')
    for key in page_loads:
        # Sets iterate in arbitrary order; sorting pins each resource to a
        # reproducible column position.
        row = [encode(resource) for resource in sorted(page_loads[key])]
        row.extend([padding] * (width - len(row)))
        page_loads[key] = row

    # .8 * (1 - .8) == 0.16 is p * (1 - p) for p = 0.8.
    # NOTE(review): the columns here are hash codes rather than booleans, so
    # this cut-off mostly removes (near-)constant columns -- confirm intended.
    selector = sklearn.feature_selection.VarianceThreshold(threshold=(.8 * (1 - .8)))
    data = selector.fit_transform(list(page_loads.values()))
    # Dict order is unchanged since the matrix was built, so row idx belongs
    # to the idx-th key.
    for idx, key in enumerate(page_loads):
        page_loads[key] = data[idx]
def fitTree():
    """Fit a decision tree on ``page_loads`` and print its training accuracy.

    Every key of the module-level ``page_loads`` mapping contributes one
    sample: the mapped value is the feature vector and ``key[1]`` (the url,
    given the ``(as_root, url)`` keys built by ``readResources``) is the class
    label.

    The classifier is evaluated on the same samples it was fitted on, so the
    first printed number is training accuracy, not a held-out estimate. The
    second printed number is the depth of the fitted tree. Returns None.
    """
    labels = [key[1] for key in page_loads]
    features = [page_loads[key] for key in page_loads]

    # Seeding Python's `random` module does not affect scikit-learn, which
    # permutes candidate features at each split using its own random state;
    # fix it here so repeated runs build the same tree.
    dtc = sklearn.tree.DecisionTreeClassifier(random_state=0)
    dtc.fit(features, labels)

    # One batched predict call instead of one call per sample.
    predictions = dtc.predict(features)
    correct = sum(1 for guess, truth in zip(predictions, labels) if guess == truth)

    print(correct / float(len(features)))
    print(dtc.tree_.max_depth)
def _resource_selected(criteria, url_domain, resource_domain):
    """Return True when a resource passes the ``criteria`` filter."""
    if criteria is None:
        return True
    if criteria == 'TLD':
        return url_domain == resource_domain
    if criteria == '~TLD':
        return url_domain != resource_domain
    # Any other criteria value selects nothing.
    return False


def readResources(base_dir,criteria=None):
    """Load per-site resource lists from ``base_dir`` into the module mappings.

    Every immediate sub-directory of ``base_dir`` is walked recursively. For
    each file found, the file name up to its last ``.`` is taken as the url,
    and the first comma-separated field of every line is taken as a resource.

    Side effects on module-level state:
      * ``site_resources[url]`` receives every resource of that url.
      * ``page_loads[(as_root, url)]`` receives the resources that pass
        ``criteria``, where ``as_root`` is the directory containing the file.

    Args:
        base_dir: directory whose sub-directories hold the resource files.
            Files placed directly in ``base_dir`` are ignored.
        criteria: ``None`` keeps every resource; ``'TLD'`` keeps resources
            whose ``tldextract`` ``domain`` equals the url's; ``'~TLD'`` keeps
            those whose ``domain`` differs. Any other value keeps nothing
            (the ``page_loads`` entry is still created, but stays empty).

    Returns None.
    """
    # Only the first level of base_dir is needed to find the sub-directories;
    # the inner walk handles recursion. A missing base_dir yields nothing.
    _top, as_dirs, _top_files = next(os.walk(base_dir), (base_dir, [], []))
    for as_dir in as_dirs:
        for as_root, _dirs, site_files in os.walk(os.path.join(base_dir, as_dir)):
            for site_file in site_files:
                # Strip the extension; a name without any '.' is used whole
                # instead of losing its last character.
                stem, dot, _ext = site_file.rpartition('.')
                url = stem if dot else site_file
                url_domain = tldextract.extract(url).domain
                page_key = (as_root, url)
                # as_root already starts with base_dir, so joining base_dir
                # again would duplicate it whenever base_dir is relative.
                with open(os.path.join(as_root, site_file), 'r') as f:
                    for line in f:
                        resource = line.strip().split(',')[0]
                        resource_domain = tldextract.extract(resource).domain
                        site_resources.setdefault(url, set()).add(resource)
                        selected = page_loads.setdefault(page_key, set())
                        if _resource_selected(criteria, url_domain, resource_domain):
                            selected.add(resource)
if __name__ == "__main__":
    # Script entry point: read the resource files under the directory given as
    # the first command-line argument, build the feature rows, then fit and
    # report on the decision tree.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s <base_dir>" % sys.argv[0])
    readResources(sys.argv[1])
    featureSelection()
    fitTree()