Commit 60aa19e5 authored by Manuel Rodrigues's avatar Manuel Rodrigues
Browse files

Initial commit: For loops poor auto-vec pattern

parents
# Toolchain.
# CC is assigned unconditionally: make predefines CC, so "?=" would never
# select icc. It can still be overridden on the command line (make CC=...).
CC := icc
# Note: this replaces make's default RM ("rm -f") with a recursive remove,
# because the wipe target deletes whole directories.
RM := rm -fr

.PHONY: vadd matmul all clean clean-bin wipe

# Absolute directory containing this makefile, resolved once at parse time.
# Simple (:=) assignment pins $(MAKEFILE_LIST) to its value at this point and
# avoids forking a shell every time one of the directory variables is used.
root_dir := $(dir $(realpath $(lastword $(MAKEFILE_LIST))))

# All three keep a trailing slash; the rules concatenate file names directly.
SRCDIR := $(root_dir)src/
BINDIR := $(root_dir)bin/
OBJDIR := $(root_dir)obj/

# Create the output directories while the makefile is being parsed, so they
# exist before any recipe runs. mkdir -p is a no-op when they already exist.
$(shell mkdir -p $(OBJDIR) $(BINDIR))

# ICC FLAGS
# -std=c99                     C99 (the kernels declare loop indices in the for statement)
# -unroll0                     disable loop unrolling
# -qopt-report-phase=vec,loop  report on the vec and loop phases
# -qopt-report=5               report detail level
# -qopt-report-file=stderr     send the report to stderr
CFLAGS := -std=c99 -unroll0 -qopt-report-phase=vec,loop -qopt-report=5 -qopt-report-file=stderr
# ---------------------------------------------------------------------------
# Build rules
#
# main.c selects its kernels with -DVADD / -DMATMUL, so every executable needs
# its own compilation of main.c. Each variant therefore gets a dedicated
# object (main.all.o, main.vadd.o, main.matmul.o) instead of sharing a single
# main.o that had to be deleted after every build. This makes
# "make vadd matmul" and "make -j" work, and unchanged objects are reused.
# ---------------------------------------------------------------------------

# User-facing goals ("all" is the first rule and therefore the default goal).
all: $(BINDIR)pattern.all
vadd: $(BINDIR)pattern.vadd
matmul: $(BINDIR)pattern.matmul

# Kernel selection is attached to the object that consumes it, so the result
# does not depend on which goal happened to trigger the compilation.
$(OBJDIR)main.all.o: CC_FLAG := -DVADD -DMATMUL
$(OBJDIR)main.vadd.o: CC_FLAG := -DVADD
$(OBJDIR)main.matmul.o: CC_FLAG := -DMATMUL

# What goes into each executable.
$(BINDIR)pattern.all: $(OBJDIR)main.all.o $(OBJDIR)vadd.o $(OBJDIR)matmul.o
$(BINDIR)pattern.vadd: $(OBJDIR)main.vadd.o $(OBJDIR)vadd.o
$(BINDIR)pattern.matmul: $(OBJDIR)main.matmul.o $(OBJDIR)matmul.o

# Link straight into the bin directory; the target is the real output file,
# so make can tell when the executable is already up to date.
$(BINDIR)pattern.all $(BINDIR)pattern.vadd $(BINDIR)pattern.matmul:
	@mkdir -p $(@D)
	$(CC) -o $@ $^

# Kernels: compiled with the optimization-report flags in CFLAGS.
$(OBJDIR)vadd.o $(OBJDIR)matmul.o: $(OBJDIR)%.o: $(SRCDIR)%.c $(SRCDIR)%.h
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c -o $@ $<

# Driver: compiled only with the kernel-selection macros (no CFLAGS), once per
# variant. $< is main.c; the headers are listed so that editing them rebuilds.
$(OBJDIR)main.all.o $(OBJDIR)main.vadd.o $(OBJDIR)main.matmul.o: $(SRCDIR)main.c $(SRCDIR)vadd.h $(SRCDIR)matmul.h
	@mkdir -p $(@D)
	$(CC) $(CC_FLAG) -c -o $@ $<
# Remove the intermediate object files; executables in the bin directory stay.
clean:
	$(RM) $(OBJDIR)*.o

# Remove the built executables (pattern.all, pattern.vadd, pattern.matmul).
# Must use BINDIR: make variable names are case-sensitive, and the previously
# used $(bindir) is undefined, which made this delete pattern.* in the current
# directory instead of the bin directory.
clean-bin:
	$(RM) $(BINDIR)pattern.*

# Remove the object and binary directories themselves. They are recreated the
# next time make parses this makefile.
wipe:
	$(RM) $(OBJDIR)
	$(RM) $(BINDIR)
Overview
--------
This is a synthetic application that highlights the pattern of having **for loops** in a program that can be executed using vector instructions. The idea is that the compiler can optimize the machine code automatically, generating the corresponding vector instructions so that the code executes efficiently.
In this application we explore 2 kernels: vector addition & matrix multiplication.
The vector addition is characterized as a simple **for loop** that iterates over arrays and performs the addition operation element wise.
The matrix multiplication is implemented as a series of nested **for loops** that iterate over the rows and columns of the matrices to perform the matrix multiplication.
By compiling the application and looking into the information provided by the compiler we can assess the level of optimization (or lack thereof) that the compiler was able to apply automatically.
Getting started
---------------
Prerequisites
-------------
To build and run this kernel you will need:
* Intel C compiler;
* Extrae & Paraver (Only if you want to trace your execution)
Tested with
-----------
Intel C compiler versions:
- 17.0.4
- 19.0.5.281
Intel C compiler flags used
--------------------------
- `-unroll0`: Explicitly tells the compiler to disable loop unrolling.
- `-qopt-report-phase=vec,loop`: Tells the compiler to generate the optimization report for the vec and loop phases.
- `-qopt-report=5`: Tells the compiler to generate an optimization report. '5' specifies the level of the detail in the report (5 is the maximum value).
- `-qopt-report-file=stderr`: Output of the optimization report goes to stderr.
Building and running the kernel
-------------------------------
This application can be compiled with the provided Makefile. Just type in a console:
- `make all` - Compiles both kernels and creates the executable (in the bin directory): pattern.all
- `make vadd` - Compiles the vadd kernel and creates the executable (in the bin directory): pattern.vadd
- `make matmul` - Compiles the matmul kernel and creates the executable (in the bin directory): pattern.matmul
To run the application, navigate to the bin directory and run the corresponding executable: e.g. `./pattern.all`
Documentation
-------------
To know more about the issues and features that this application provides, please visit:
[POP-Co-design: Sequential loops pattern](https://co-design.pop-coe.eu/patterns/sequential-loops.html)
[Vectorization Essentials](https://software.intel.com/content/www/us/en/develop/articles/vectorization-essential.html)
[Vectorization Essentials, Utilizing Full Vectors](https://software.intel.com/content/www/us/en/develop/articles/utilizing-full-vectors.html)
// Driver configuration: which kernels are built into main() is selected at
// compile time with -DVADD and/or -DMATMUL.
#include <stdio.h>
#ifdef VADD
// The header is included before the size macro below is defined, so the
// parameter named n in the vadd() prototype is not macro-expanded.
#include "vadd.h"
#define n 134217728 // Number of elements in each of the arrays a, b and c (2^27)
#endif
#ifdef MATMUL
// Same ordering concern: the matmul() prototype uses L, M and N as parameter
// names, so the header must come before these macros.
#include "matmul.h"
// Matrix dimensions passed to matmul(A, B, C, L, M, N).
#define L 512
#define M 512
#define N 512
#endif
/*
 * Driver for the kernels selected at compile time.
 *
 * With -DVADD it allocates three arrays of n doubles and calls vadd() once.
 * With -DMATMUL it allocates the matrices A (LxM), B (MxN) and C (LxN) and
 * calls matmul() once. Both sections may be enabled together; with neither
 * macro defined the program does nothing.
 *
 * All buffers are zero-filled with calloc(): the kernels then read defined
 * values, and C satisfies matmul()'s requirement that it start at zero. The
 * results are not inspected; the program only reports that each kernel ran.
 *
 * Returns 0 on success, or EXIT_FAILURE if a buffer cannot be allocated.
 * The command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;
#ifdef VADD
    {
        /* Data buffers: c = a + b, n elements each. */
        double *a = (double *)calloc(n, sizeof(double));
        double *b = (double *)calloc(n, sizeof(double));
        double *c = (double *)calloc(n, sizeof(double));
        if (a == NULL || b == NULL || c == NULL) {
            fprintf(stderr, "vadd: failed to allocate buffers\n");
            /* free(NULL) is a no-op, so partial allocations are released safely. */
            free(a);
            free(b);
            free(c);
            return EXIT_FAILURE;
        }
        /* Execution of the vector addition operation. */
        vadd(c, a, b, n);
        printf("vadd completed!\n");
        free(a);
        free(b);
        free(c);
    }
#endif
#ifdef MATMUL
    {
        /*
         * Matrices sized to match matmul()'s contract C(LxN) := A(LxM)·B(MxN):
         * A holds L*M elements, B holds M*N and C holds L*N.
         */
        double *A = (double *)calloc((size_t)L * M, sizeof(double));
        double *B = (double *)calloc((size_t)M * N, sizeof(double));
        double *C = (double *)calloc((size_t)L * N, sizeof(double));
        if (A == NULL || B == NULL || C == NULL) {
            fprintf(stderr, "matmul: failed to allocate matrices\n");
            free(A);
            free(B);
            free(C);
            return EXIT_FAILURE;
        }
        /* Execution of the matrix multiplication operation. */
        matmul(A, B, C, L, M, N);
        printf("matmul completed!\n");
        free(A);
        free(B);
        free(C);
    }
#endif
    return 0;
}
#include "matmul.h"
// Matrix product accumulated into C:  C(LxN) += A(LxM) · B(MxN).
// All three matrices are flat arrays indexed row by row.
// The result is added to whatever C already holds, so the caller must zero C
// beforehand to obtain the plain product.
// The plain row/column/inner loop nest is kept as is; it is the loop pattern
// this example exists to show.
void matmul(const double* A, const double* B, double* C, const int L, const int M, const int N){
    for(int row = 0; row < L; row++){
        for(int col = 0; col < N; col++){
            for(int inner = 0; inner < M; inner++){
                C[row*N + col] += A[row*M + inner] * B[inner*N + col];
            }
        }
    }
}
/* Include guard. The macro is MATMUL_H, not MATMUL: main.c tests MATMUL to
 * decide whether the matmul kernel is built in. */
#ifndef MATMUL_H
#define MATMUL_H

/* Kept in the header: main.c gets its allocation functions through it. */
#include <stdlib.h>

/*
 * Matrix product accumulated into C:  C(LxN) += A(LxM) · B(MxN).
 *
 * A, B and C are flat arrays indexed row by row. The product is added to the
 * existing contents of C, so C must be zeroed by the caller to obtain the
 * plain product.
 *
 * Note: include this header before defining any macros named L, M or N,
 * otherwise the parameter names below are macro-expanded.
 */
void matmul(const double* A, const double* B, double* C, const int L, const int M, const int N);

#endif /* MATMUL_H */
#include "vadd.h"
// Element-wise vector addition: writes a[i] + b[i] into c[i] for the first
// n elements. Nothing is written when n is zero or negative.
void vadd(double *c, double *a, double *b, int n){
    for(int idx = 0; idx < n; idx++){
        c[idx] = a[idx] + b[idx];
    }
}
/* Include guard. The macro is VADD_H, not VADD: main.c tests VADD to decide
 * whether the vadd kernel is built in. */
#ifndef VADD_H
#define VADD_H

/* Kept in the header: main.c gets its allocation functions through it. */
#include <stdlib.h>

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for the first n elements.
 *
 * Note: include this header before defining any macro named n, otherwise the
 * parameter name below is macro-expanded.
 */
void vadd(double *c, double *a, double *b, int n);

#endif /* VADD_H */
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment