I have made a program of LU Decomposition using Gauss Elimination method in Intel Array Building Block but it is running quite slow.
For a 512 x 512 matrix time for my serial and parallel programs are
Serial execution : 76 ms
Parallel Execution using Arbb : 230 ms
My Arbb code is
#include<arbb.hpp>
#include<stdio.h>
using namespace arbb;
void decompose(arbb::dense<arbb::f32, 2>& source, const arbb::i32 dim)
{
_for(i32 k=0, k<dim-1, k++)
{
arbb::dense<f32, 2> pivot_col = section(source, k, 1, k+1, dim-k-1);
pivot_col = pivot_col / source(k, k);
source = replace(source, k, 1, k+1, dim-k-1, pivot_col);
arbb::dense<f32, 2> pivot_col_tmp(dim-k-1, dim-k-1);
_for(i32 col=0, col<dim-k-1, col++)
{
pivot_col_tmp = replace(pivot_col_tmp, col, 1, 0, dim-k-1, pivot_col);
}_end_for;
arbb::dense<f32, 2> pivot_row(dim-k-1, dim-k-1);
_for(i32 row=0, row<dim-k-1, row++)
{
pivot_row = replace(pivot_row, 0, dim-k-1, row, 1, section(source, k+1, dim-k-1, k, 1));
}_end_for;
arbb::dense<f32, 2> sum(dim-k-1, dim-k-1);
sum = pivot_col_tmp * pivot_row;
arbb::dense<f32, 2> temp_source(dim-k-1, dim-k-1);
temp_source = section(source, k+1, dim-k-1, k+1, dim-k-1);
temp_source = temp_source - sum;
source = replace(source, k+1, dim-k-1, k+1, dim-k-1, temp_source);
}_end_for;
}
If some one finds mistake in it, Please figure it out

