-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmulan_processing.cpp
More file actions
99 lines (80 loc) · 2.74 KB
/
mulan_processing.cpp
File metadata and controls
99 lines (80 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
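/*
[Converting a Mulan dataset into the data format required by MLC_SVM]
1. Read the XML file and count the labels.
2. Read the ARFF header, count the attributes (labels + features), and derive the number of features from it.
3. Read the ARFF data section line by line, reshape each line into the format MLC_SVM requires, and write it to the new data file.
Supplement to step 3 - how a data line is reshaped:
3.1 Split the line into strings.
3.2 Format and join the feature strings of the single example.
3.3 Build the sparse label string.
3.4 Reassemble the strings and write them to the new file.
*/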
#include "data_processing.h"
/**
 * Counts the label declarations in a label XML file.
 *
 * The file is scanned line by line. A line is counted when the six characters
 * that follow its first character are exactly "label " (for example a line
 * beginning with `<label name=...`). Lines that are too short to hold that
 * text, including blank lines, are ignored.
 *
 * @param filePath path of the XML file to scan.
 * @return the number of matching lines, or 0 when the file cannot be opened
 *         (a message is printed to standard output in that case).
 */
int countLabelsFromXml(std::string filePath) {
    std::ifstream infile(filePath);
    if (!infile.is_open()) {
        std::cout << filePath << " open failed" << std::endl;
        return 0;
    }

    int labelsCount = 0;
    std::string fileLine;
    while (std::getline(infile, fileLine)) {
        // Check the length first: substr(1, 6) throws std::out_of_range on an
        // empty line, whereas this bounded compare() is never out of range.
        if (fileLine.size() >= 7 && fileLine.compare(1, 6, "label ") == 0) labelsCount++;
    }
    return labelsCount;
}
void transferDataFromArff(string arffFilePath, int labelsCount, string newDataFilePath) {
ifstream infile;
ofstream outfile;
string fileLine;
vector<string> attributeValues;
string formativeFeatures="";
string formativeLabels="";
int attributesCount = 0;
int featuresCount;
int exampleCount=1;
int i;
infile.open(arffFilePath);
if (!infile.is_open()) cout << arffFilePath << " open failed" << endl;
outfile.open(newDataFilePath);
if (!outfile.is_open()) cout << newDataFilePath << " create failed" << endl;
//count attributes
while (getline(infile, fileLine)) {
if (fileLine.compare("@data") == 0) break;
else {
if (!fileLine.empty()&&fileLine.substr(1, 9).compare("attribute") == 0) attributesCount++;
}
}
//calculate the number of features
featuresCount = attributesCount - labelsCount;
//read the data line and transfer
while (getline(infile, fileLine)) {
//split the data line
attributeValues.clear();
SplitString(fileLine, attributeValues, ",");
//format the feature string and create entire new one
formativeFeatures = "";
for (i = 0; i < featuresCount; i++) {
formativeFeatures += to_string(i + 1) + ":" + attributeValues[i]+" ";
}
formativeFeatures.pop_back();
//create sparse label string
formativeLabels = "";
for (i = 0; i < labelsCount; i++) {
if (attributeValues[i + featuresCount] == "1") formativeLabels += to_string(i + 1) + ",";
}
formativeLabels.pop_back();
//write to the new data file
outfile << "example" << exampleCount << " " << formativeLabels << " " << formativeFeatures << endl;
exampleCount++;
}
infile.close();
outfile.close();
}
/**
 * Program entry point.
 *
 * Counts the labels declared in the Corel5k XML file, converts the Corel5k
 * training ARFF file into the text data file next to it, and finally runs the
 * shell command "pause".
 */
int main() {
    const std::string labelDefinitionPath = "./Datasets/Mulan/Corel5k.xml";
    const std::string sourceArffPath = "./Datasets/Mulan/Corel5k-train.arff";
    const std::string convertedDataPath = "./Datasets/Mulan/Corel5k-train.txt";

    // The label count has to be known before the ARFF columns can be split
    // into features and labels.
    const int declaredLabels = countLabelsFromXml(labelDefinitionPath);
    transferDataFromArff(sourceArffPath, declaredLabels, convertedDataPath);

    system("pause");
}