% Commit.tex -- Malicious PDF File Detection: Commitment Document
\documentclass[10pt, a4paper]{article}
\usepackage{xcolor}
\pagestyle{headings}
\usepackage{setspace}
\setstretch{0.9}
%----
\begin{document}
\renewcommand{\thepage}{\arabic{page}}% Arabic page numbers
\pagecolor{yellow!20}
% Page 1
% Document title, our names, and our supervisors.
\begingroup
  \centering
  \large \underline{\textbf{Malicious PDF File Detection - Commitment Document}}\\
  \bigbreak
  \large \textit{Shir Bentabou \qquad Alexey Titov}\\
  \bigbreak
  \large \textit{Supervisors: Dr.~Amit Dvir and Dr.~Ran Dubin}\\
\endgroup
\section{\large Accomplished Tasks:}
\begin{itemize}
% 1
\item Hand in project abstract.
% 2
\item Hand in project proposal to supervisors.
% 3
\item Hand in project poster.
% 4
\item Gain in-depth knowledge of the PDF file structure, features, and fields.
% 5
\item Study phishing, URLs, and JavaScript usage in PDF files.
% 6
\item Review methods from previous research.
% 7
\item Build the plan and schedule for our project: phases, tasks in each phase, deadlines for each task.
% 8
\item Research existing tools for our usage in the project.
% 9
\item \underline{First phase} – Researching and creating our work tools:
	\begin{itemize}
	% 9.1
	\item Extracting telemetry.
	% 9.2
	\item Extracting text from a picture.
	% 9.3
	\item Extracting text from a PDF file (using PDFMiner).
	% 9.4
	\item Extracting URLs (using pyPDF).
	% 9.5
	\item Extracting URLs from JS in the file tags (using peePDF).
	% 9.6
	\item Extracting preview of a PDF file (using PIL + pdf2image).
	\end{itemize}
% 10
\item \underline{Second phase} – Creating an image-based classification machine:
	\begin{itemize}
	% 10.1
	\item Research vector features.
	% 10.2
	\item Building the feature vector.
	% 10.3
	\item Applying machine learning algorithms on the feature vector.
	\end{itemize}
% 11
\item \underline{Third phase} – Creating a text-based classification machine:
	\begin{itemize}
	% 11.1
	\item Research vector building methods.
	% 11.2
	\item Applying the vector methods on our samples.
	% 11.3
	\item Applying machine learning algorithms on the text vector.
	% 11.4
	\item Applying a deep learning method on the text vector.
	\end{itemize}
% 12
\item \underline{Fourth phase} – Creating a classification machine based on PDF tags, JS, URLs, objects and streams:
	\begin{itemize}
	% 12.1
	\item Researching the features that will build the vector for this machine in each one of the four parts: PDF tags, JS, URLs, objects and streams.
	% 12.2
	\item Research existing tools for the extraction of the features chosen (JAST, Analyze PDF, peePDF).
	% 12.3
	\item Extraction of the features from samples.
	% 12.4
	\item Building the feature vector.
	% 12.5
	\item Applying machine learning algorithms on the feature vector.
	% 12.6
	\item Applying a deep learning method on the feature vector.
	\end{itemize}
\end{itemize}


% Page 2
\newpage
\section{\large To Be Accomplished:}
\begin{itemize}
% 1
\item Prepare project day presentation.
% 2
\item Write the project book.
% 3
\item \underline{Fifth phase} – Creating an ensemble machine:
	\begin{itemize}
	% 3.1
	\item Combining the three machines into an ensemble machine.
	% 3.2
	\item Determining the overall classification method for the ensemble machine.
	% 3.3
	\item Applying machine learning algorithms to the ensemble machine:
		\begin{itemize}
		% 3.3.1
		\item Random Forest, AdaBoost (Adaptive Boosting), Gradient Tree Boosting, XGBoost.
		\end{itemize}
	\end{itemize}
% 4
\item \underline{Improvement phase} -- Deciding on the aim of the improvement phase and on a numeric success rate for each classifier and for the ensemble machine.
% 5 Phases
\item Improvements for each phase:
	\begin{itemize}
	% 5.1
	\item Second phase:
		\begin{itemize}
		% 5.1.1
		\item Try to improve picture classification in two ways:
			\begin{itemize}
			% 5.1.1.1
			\item Applying additional vector building methods (such as near similar image matching).
			% 5.1.1.2
			\item Applying additional machine learning algorithms on the vectors.
			\end{itemize}
		\end{itemize}
	% 5.2
	\item Third phase:
		\begin{itemize}
		% 5.2.1
		\item Try to improve text classification in the following way:
			\begin{itemize}
			% 5.2.1.1
			\item Applying additional machine learning algorithms on the different vector building methods (word2vec, TF-IDF) to achieve better results.
			\end{itemize}
		\end{itemize}
	% 5.3
	\item Fourth phase:
		\begin{itemize}
		% 5.3.1
		\item Improve feature selection in the following ways:
			\begin{itemize}
			% 5.3.1.1
			\item Random choice method.
			% 5.3.1.2
			\item Summing features method.
			% 5.3.1.3
			\item Combining features as new features in the vector.
			\end{itemize}
		% 5.3.2
		\item Applying additional machine learning algorithms on the vectors.
		\end{itemize}
	\end{itemize}
% 6
\item Overall improvements:
	\begin{itemize}
	% 6.1
	\item Applying iterative retraining methods on the machines.
	\end{itemize}
\end{itemize}

\end{document}