-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cpp
More file actions
175 lines (124 loc) · 7.81 KB
/
main.cpp
File metadata and controls
175 lines (124 loc) · 7.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iostream>

#include <emmintrin.h>
#include <immintrin.h>

#include "fileRead.h"
// Byte tables that blendingAVX loads into 256-bit registers.
// Every table is 32 bytes; each row below corresponds to one 128-bit lane.
// In the shuffle tables an index of 255 has its top bit set, which makes
// _mm256_shuffle_epi8 write a zero byte. That instruction works per lane and
// only looks at the low four bits of an index, so 16..31 in the second row
// address the same in-lane positions as 0..15.
namespace avxBlending {
    // Widens the low 8 bytes of each lane (two 4-byte pixels) to 16-bit words.
    constexpr unsigned char mask1256[32] = {
         0, 255,  1, 255,  2, 255,  3, 255,  4, 255,  5, 255,  6, 255,  7, 255,
        16, 255, 17, 255, 18, 255, 19, 255, 20, 255, 21, 255, 22, 255, 23, 255
    };

    // Widens the high 8 bytes of each lane (the other two pixels) to 16-bit words.
    constexpr unsigned char mask3478[32] = {
         8, 255,  9, 255, 10, 255, 11, 255, 12, 255, 13, 255, 14, 255, 15, 255,
        24, 255, 25, 255, 26, 255, 27, 255, 28, 255, 29, 255, 30, 255, 31, 255
    };

    // Takes the high byte of each 16-bit colour word (i.e. word >> 8) and packs
    // two pixels into the low 8 bytes of each lane; alpha bytes and the upper
    // half of the lane are zeroed.
    constexpr unsigned char packed1256[32] = {
        1, 3, 5, 255, 9, 11, 13, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        1, 3, 5, 255, 9, 11, 13, 255, 255, 255, 255, 255, 255, 255, 255, 255
    };

    // Same packing, but the two pixels land in the high 8 bytes of each lane.
    constexpr unsigned char packed3478[32] = {
        255, 255, 255, 255, 255, 255, 255, 255, 1, 3, 5, 255, 9, 11, 13, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 1, 3, 5, 255, 9, 11, 13, 255
    };

    // Byte pairs {0, 1}: read as little-endian 16-bit words each one is 256,
    // the value the front alpha is subtracted from.
    constexpr unsigned char minuend[32] = {
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
    };

    // 255 in the fourth byte of every 4-byte pixel, 0 elsewhere; added to the
    // packed result so the output alpha byte becomes 0xff.
    constexpr unsigned char alpha[32] = {
        0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255,
        0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255
    };
}
// clock() readings taken at entry and exit of the two pixel-blending routines;
// main() prints the differences.
unsigned int clockStart{0U};     // written at the start of blending(char *, char *)
unsigned int clockEnd{0U};       // written after its pixel loops finish
unsigned int clockStartAVX{0U};  // written at the start of blendingAVX
unsigned int clockEndAVX{0U};    // written after its pixel loops finish
// Alpha-blends the foreground image onto the background image in place
// (scalar reference implementation).
//
// Both arguments point at complete in-memory BMP files. The pixel-data offset
// is read from header offset 0x0a, the width from 0x12 and the height from
// 0x16. Pixels are treated as 4 bytes each: three colour channels followed by
// an alpha byte. The foreground's first row/column is placed at background
// pixel (300, 250); the part that does not fit inside the background's
// width/height is clipped, so nothing is written past a row or past the image.
//
// Side effects: overwrites the covered background pixels (their alpha byte is
// forced to 0xff) and stores clock() at entry/exit in clockStart / clockEnd.
//
// Returns backFile, i.e. the start of the (now modified) background file.
//
// NOTE(review): the buffer sizes are not passed in, so the header values are
// trusted to describe memory that really exists behind both pointers.
char *blending(char *backFile, char *frontFile) {
    clockStart = clock();

    // The header fields sit at offsets that are not 4-byte aligned, so copy
    // them out with memcpy instead of dereferencing a casted int pointer.
    const auto readInt32 = [](const char *src) {
        std::int32_t value = 0;
        std::memcpy(&value, src, sizeof(value));
        return value;
    };

    // Geometry is kept in long long so byte offsets cannot overflow int.
    const long long backBufferStart = readInt32(backFile + 0x0a);
    const long long frontBufferStart = readInt32(frontFile + 0x0a);
    const long long backWidth = readInt32(backFile + 0x12);
    const long long backHeightField = readInt32(backFile + 0x16);
    // Only the magnitude of the height field is used for clipping.
    const long long backHeight = backHeightField < 0 ? -backHeightField : backHeightField;
    const long long frontWidth = readInt32(frontFile + 0x12);
    const long long frontHeight = readInt32(frontFile + 0x16);

    const long long startPosX = 300;
    const long long startPosY = 250;
    const long long bytesPerPixel = 4;

    // Part of the foreground that actually overlaps the background.
    const long long visibleWidth = std::min(frontWidth, backWidth - startPosX);
    const long long visibleHeight = std::min(frontHeight, backHeight - startPosY);
    // No rows are touched (and no row pointers formed) when nothing overlaps.
    const long long rows = visibleWidth > 0 ? visibleHeight : 0;

    unsigned char *backPixels = reinterpret_cast<unsigned char *>(backFile) + backBufferStart;
    const unsigned char *frontPixels = reinterpret_cast<const unsigned char *>(frontFile) + frontBufferStart;

    for (long long row = 0; row < rows; ++row) {
        unsigned char *backRow = backPixels + ((startPosY + row) * backWidth + startPosX) * bytesPerPixel;
        const unsigned char *frontRow = frontPixels + row * frontWidth * bytesPerPixel;

        for (long long col = 0; col < visibleWidth; ++col) {
            unsigned char *backRGBA = backRow + col * bytesPerPixel;
            const unsigned char *frontRGBA = frontRow + col * bytesPerPixel;
            const int frontAlpha = frontRGBA[3];

            // out = (front * a + back * (256 - a)) / 256 for the three colour bytes.
            for (int channel = 0; channel < 3; ++channel) {
                backRGBA[channel] = static_cast<unsigned char>(
                    (frontRGBA[channel] * frontAlpha + backRGBA[channel] * (256 - frontAlpha)) >> 8);
            }
            backRGBA[3] = 0xff;
        }
    }

    clockEnd = clock();
    return backFile;
}
// AVX2 version of the alpha blend: composites the foreground image onto the
// background image in place, eight pixels (32 bytes) per vector iteration,
// with a scalar loop for the remaining 0..7 pixels of each row.
//
// Both arguments point at complete in-memory BMP files. The pixel-data offset
// is read from header offset 0x0a, the width from 0x12 and the height from
// 0x16. Pixels are treated as 4 bytes each: three colour channels followed by
// an alpha byte. The foreground's first row/column is placed at background
// pixel (300, 250); the part that does not fit inside the background's
// width/height is clipped, so nothing is written past a row or past the image.
//
// Side effects: overwrites the covered background pixels (their alpha byte is
// forced to 0xff) and stores clock() at entry/exit in clockStartAVX /
// clockEndAVX.
//
// Returns backFile, i.e. the start of the (now modified) background file.
//
// NOTE(review): the buffer sizes are not passed in, so the header values are
// trusted to describe memory that really exists behind both pointers.
char *blendingAVX(char *backFile, char *frontFile) {
    clockStartAVX = clock();

    // The header fields sit at offsets that are not 4-byte aligned, so copy
    // them out with memcpy instead of dereferencing a casted int pointer.
    const auto readInt32 = [](const char *src) {
        std::int32_t value = 0;
        std::memcpy(&value, src, sizeof(value));
        return value;
    };

    // Geometry is kept in long long so byte offsets cannot overflow int.
    const long long backBufferStart = readInt32(backFile + 0x0a);
    const long long frontBufferStart = readInt32(frontFile + 0x0a);
    const long long backWidth = readInt32(backFile + 0x12);
    const long long backHeightField = readInt32(backFile + 0x16);
    // Only the magnitude of the height field is used for clipping.
    const long long backHeight = backHeightField < 0 ? -backHeightField : backHeightField;
    const long long frontWidth = readInt32(frontFile + 0x12);
    const long long frontHeight = readInt32(frontFile + 0x16);

    const long long startPosX = 300;
    const long long startPosY = 250;
    const long long bytesPerPixel = 4;
    const long long pixelsPerVector = 8;

    // Part of the foreground that actually overlaps the background.
    const long long visibleWidth = std::min(frontWidth, backWidth - startPosX);
    const long long visibleHeight = std::min(frontHeight, backHeight - startPosY);
    // No rows are touched (and no row pointers formed) when nothing overlaps.
    const long long rows = visibleWidth > 0 ? visibleHeight : 0;

    const __m256i mask1256 = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(avxBlending::mask1256));
    const __m256i mask3478 = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(avxBlending::mask3478));
    const __m256i packed1256 = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(avxBlending::packed1256));
    const __m256i packed3478 = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(avxBlending::packed3478));
    const __m256i minuend = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(avxBlending::minuend));
    const __m256i alpha = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(avxBlending::alpha));

    unsigned char *backPixels = reinterpret_cast<unsigned char *>(backFile) + backBufferStart;
    const unsigned char *frontPixels = reinterpret_cast<const unsigned char *>(frontFile) + frontBufferStart;

    for (long long row = 0; row < rows; ++row) {
        unsigned char *backRow = backPixels + ((startPosY + row) * backWidth + startPosX) * bytesPerPixel;
        const unsigned char *frontRow = frontPixels + row * frontWidth * bytesPerPixel;

        long long col = 0;

        // Vector part: runs while a full group of eight pixels is left, so a
        // row whose width is a multiple of eight never needs the scalar tail.
        for (; col + pixelsPerVector <= visibleWidth; col += pixelsPerVector) {
            unsigned char *backPtr = backRow + col * bytesPerPixel;
            const unsigned char *frontPtr = frontRow + col * bytesPerPixel;

            __m256i back = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(backPtr));
            __m256i front = _mm256_loadu_si256(reinterpret_cast<const __m256i_u *>(frontPtr));

            // Widen bytes to 16-bit words: pixels 1, 2, 5, 6 in one register,
            // pixels 3, 4, 7, 8 in the other.
            __m256i front1256 = _mm256_shuffle_epi8(front, mask1256);
            __m256i front3478 = _mm256_shuffle_epi8(front, mask3478);
            __m256i back1256 = _mm256_shuffle_epi8(back, mask1256);
            __m256i back3478 = _mm256_shuffle_epi8(back, mask3478);

            // Broadcast each pixel's alpha word over all four words of that pixel.
            __m256i frontAlpha1256 = _mm256_shufflehi_epi16(front1256, 0xff);
            __m256i frontAlpha3478 = _mm256_shufflehi_epi16(front3478, 0xff);
            frontAlpha1256 = _mm256_shufflelo_epi16(frontAlpha1256, 0xff);
            frontAlpha3478 = _mm256_shufflelo_epi16(frontAlpha3478, 0xff);

            // alpha * front
            front1256 = _mm256_mullo_epi16(front1256, frontAlpha1256);
            front3478 = _mm256_mullo_epi16(front3478, frontAlpha3478);

            // 256 - alpha
            __m256i backAlpha1256 = _mm256_sub_epi16(minuend, frontAlpha1256);
            __m256i backAlpha3478 = _mm256_sub_epi16(minuend, frontAlpha3478);

            // (256 - alpha) * back
            back1256 = _mm256_mullo_epi16(back1256, backAlpha1256);
            back3478 = _mm256_mullo_epi16(back3478, backAlpha3478);

            // (256 - alpha) * back + alpha * front; at most 255 * 256, fits in 16 bits.
            __m256i out1256 = _mm256_add_epi16(back1256, front1256);
            __m256i out3478 = _mm256_add_epi16(back3478, front3478);

            // Keep the high byte of every colour word (the >> 8) and pack back
            // to 4-byte pixels; each register fills a different half of a lane.
            out1256 = _mm256_shuffle_epi8(out1256, packed1256);
            out3478 = _mm256_shuffle_epi8(out3478, packed3478);

            // The halves do not overlap and the alpha bytes are zero, so plain
            // additions merge them and set alpha to 0xff without any carry.
            out1256 = _mm256_add_epi64(out1256, out3478);
            out1256 = _mm256_add_epi64(out1256, alpha);

            _mm256_storeu_si256(reinterpret_cast<__m256i_u *>(backPtr), out1256);
        }

        // Scalar tail, same formula as the vector path.
        for (; col < visibleWidth; ++col) {
            unsigned char *backRGBA = backRow + col * bytesPerPixel;
            const unsigned char *frontRGBA = frontRow + col * bytesPerPixel;
            const int frontAlpha = frontRGBA[3];

            for (int channel = 0; channel < 3; ++channel) {
                backRGBA[channel] = static_cast<unsigned char>(
                    (frontRGBA[channel] * frontAlpha + backRGBA[channel] * (256 - frontAlpha)) >> 8);
            }
            backRGBA[3] = 0xff;
        }
    }

    clockEndAVX = clock();
    return backFile;
}
// Reads two image files, blends the foreground onto the background and writes
// the result to mergeFileName.
//
// backFileName   path of the background image
// frontFileName  path of the foreground image
// mergeFileName  path of the output file; backFileSize bytes are written
// opt            'o' selects blendingAVX, any other value the scalar blending
//
// If either input could not be loaded, or is shorter than the header fields
// the blending routines read, an error is printed to stderr and nothing is
// written.
//
// NOTE(review): the buffers returned by readTextFromFile are not released
// here; how they are allocated is not visible from this file, so ownership
// should be confirmed against fileRead.h.
void blending(const char *backFileName, const char *frontFileName, const char *mergeFileName, const char opt = 0) {
    // The last header field the blending routines read is a 4-byte value at 0x16.
    constexpr size_t minHeaderSize = 0x16 + 4;

    size_t backFileSize = 0;
    char *backFile = readTextFromFile(backFileName, &backFileSize);
    if (backFile == nullptr || backFileSize < minHeaderSize) {
        fprintf(stderr, "blending: cannot read background image '%s'\n", backFileName);
        return;
    }

    size_t frontFileSize = 0;
    char *frontFile = readTextFromFile(frontFileName, &frontFileSize);
    if (frontFile == nullptr || frontFileSize < minHeaderSize) {
        fprintf(stderr, "blending: cannot read foreground image '%s'\n", frontFileName);
        return;
    }

    char *merge = nullptr;
    if (opt == 'o') {
        merge = blendingAVX(backFile, frontFile);
    } else {
        merge = blending(backFile, frontFile);
    }

    // Blending happens in place, so the output has the background file's size.
    writeFile(mergeFileName, merge, backFileSize);
}
// Blends ../AskhatCat.bmp onto ../Table.bmp once with the scalar routine and
// once with the AVX2 routine, writes both results and prints the time each
// pixel loop took plus their ratio.
int main() {
    const char *backFileName = "../Table.bmp";
    const char *frontFileName = "../AskhatCat.bmp";
    const char *mergeFileName = "../merge.bmp";
    const char *mergeAVXFileName = "../mergeAVX.bmp";

    blending(backFileName, frontFileName, mergeFileName);
    blending(backFileName, frontFileName, mergeAVXFileName, 'o');

    // Elapsed clock() ticks recorded by the two blending routines.
    const double scalarTicks = static_cast<double>(clockEnd - clockStart);
    const double avxTicks = static_cast<double>(clockEndAVX - clockStartAVX);

    // clock() counts in units of 1 / CLOCKS_PER_SEC seconds; that constant is
    // platform-defined, so use it rather than assuming one million.
    const double ticksPerSecond = static_cast<double>(CLOCKS_PER_SEC);

    printf("blending: %g\n", scalarTicks / ticksPerSecond);
    printf("AVX blending: %g\n", avxTicks / ticksPerSecond);
    printf("speed up: %g\n", scalarTicks / avxTicks);
    return 0;
}