-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcompute.cpp
More file actions
46 lines (39 loc) · 1.45 KB
/
compute.cpp
File metadata and controls
46 lines (39 loc) · 1.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// $ g++ -std=c++11 -O3 -march=native compute.cpp && ./a.out
// $ g++ -std=c++11 -O3 -march=native compute.cpp -ftree-vectorize -fopt-info-vec-optimized && ./a.out -n 1000000000
#include <stdio.h>
#include <math.h>
#include "utils.h"
// Assumed CPU clock frequency in Hz (cycles per second). main() multiplies the
// elapsed time in seconds by this value to report cycles per evaluation.
// NOTE(review): hard-coded; presumably must match the machine under test.
#define CLOCK_FREQ 3.3e9
// One benchmark step, applied in place: replaces the value pointed to by A
// with (*A) * B + C. Because the result overwrites the input, every call
// depends on the output of the call before it.
//
// The commented-out lines are alternative operations that can be swapped in
// for the multiply-add.
void compute_fn(double* A, double B, double C) {
  const double previous = *A;
  *A = previous * B + C;
  // *A = C / previous;
  // *A = sqrt(previous);
  // *A = sin(previous);
}
// Benchmark driver.
//
// Reads the iteration count from the "-n" command-line option, applies
// compute_fn to a single double that many times, and prints the elapsed time,
// the estimated cycles per evaluation (using CLOCK_FREQ), and the achieved
// Gflop/s (each evaluation is counted as 2 floating-point operations).
//
// The final value of A, converted to int, is returned as the exit status so
// that the loop's result is actually used.
// NOTE(review): presumably this is what stops the optimizer from discarding
// the loop; it also means the exit status is normally non-zero.
int main(int argc, char** argv) {
  Timer t;
  long repeat = read_option<long>("-n", argc, argv);
  double A = 1.5;
  double B = 1./2;
  double C = 2.;

  t.tic();
  for (long i = 0; i < repeat; i++) compute_fn(&A, B, C);
  // Read the timer exactly once. Calling toc() again for each report would
  // fold the time spent in the earlier printf calls into the later figures
  // and make the three numbers inconsistent with each other.
  // (Assumes toc() yields the seconds elapsed since tic() as a double, as the
  // "%f seconds" output implies.)
  const double elapsed = t.toc();

  printf("%f seconds\n", elapsed);
  printf("%f cycles/eval\n", elapsed*CLOCK_FREQ/repeat);
  printf("%f Gflop/s\n", 2*repeat/1e9/elapsed);
  return static_cast<int>(A);
}
// Synopsis
//
// By design, each evaluation depends on the result of the previous one, so
// only one fused-multiply-accumulate instruction can execute at a time, i.e.
// this computation cannot be vectorized or pipelined. Therefore, this example
// can be used to measure the latency of operations.
//
// * Compare the observed latency with the expected latency for _mm256_fmadd_pd
// instruction for your architecture from this link:
// (https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pd&expand=2508)
//
// * Try replacing the mult-add operation with some other computation, like
// division, sqrt, sin, cos etc. to measure the latency of those operations and
// compare it with the latency of mult-add.