3D engine

271 帖子 / 0 全新
最新文章
如需更全面地了解编译器优化,请参阅优化注意事项

Quote:

shaynox s. wrote:

Here's my function for matrix rotation:

#define		REPERE		    (PITCH * (WIDTH/2 - 1)) + (BPP * (LENGTH/2 - 1))
#define		BPP			    4
#define		LENGTH		    800     // x
#define		WIDTH		    600     // y
#define		DEGTORAD(angle) angle * 0.017453292  // angle * PI/180
#define		RAPPORT			(LENGTH/WIDTH)
#define		FOCALE          800
#define		PROFONDEUR      2000


float   rotation_x, rotation_y, rotation_z;

void        put_pixel(float x, float y, float z, int pixel)
{
    float   x_end;
    float   y_end;
    float   z_end;
    int     *ptr_pixel;

    float   cx = cos(DEGTORAD(rotation_x));
    float   cy = cos(DEGTORAD(rotation_y));
    float   cz = cos(DEGTORAD(rotation_z));
    float   sx = sin(DEGTORAD(rotation_x));
    float   sy = sin(DEGTORAD(rotation_y));
    float   sz = sin(DEGTORAD(rotation_z));

    x_end = x * ((cy * cz))                       - y * ((cy * sz))                        - z * (sy)     ;
    y_end = x * ((cx * sz)      - (sx * sy * cz)) + y * ((sx * sy * sz) + (cx * cz))       - z * (sx * cy);
    z_end = x * ((cx * sy * cz) + (sx * sz))      + y * ((sx * cz)      - (cx * sy * sz )) + z * (cx * cy);

    x = x_end / RAPPORT;
    y = y_end;
    z = z_end;

    x = (x * FOCALE) / (z + PROFONDEUR);
    y = (y * FOCALE) / (z + PROFONDEUR);

    ptr_pixel = screen->pixels + REPERE - screen->pitch * (int)y + (int)x * BPP;

    if (ptr_pixel > screen->pixels && ptr_pixel < (screen->pixels + LENGTH*WIDTH * BPP))
        if ((x <= LENGTH/2 && x >= -LENGTH/2) && (y <= WIDTH/2 && y >= -WIDTH/2))
            *(ptr_pixel) = pixel;
}

 

Very nice:)

Which 3D graphics book are you using for all those equations?

Do you really need parametrized macro #define it is prone to the errors. I think that best option could be usage of C++ with its constexpr keyword  and put all those macro values in header file with static linkage. Then compiler could calculate them at the compile time. 

Regarding ptr_pixel pointer declaration I usually tend to initialize it to NULL.

And i initialyse vesa mode only in 16 bits, this code is in 32 bit mode:

		
		; disable the interrupts
			cli				
			
		; Charge la GDT
			lgdt	[GDT64]		
	
	; *1:	
		; switch to protected mode		
			mov		eax, cr0 		
			or		al, 0x1		; PE = 1
			mov		cr0, eax
			
		; Désactiver la Pagination
			mov		eax, cr0 		
			and		eax, 01111111_11111111_11111111_11111111b ; PG = 0
			mov		cr0, eax
		jmp		(CODE32_SELECTOR-GDT64):KERNEL32
	; *

[BITS 32]
KERNEL32:

 

previous code*

 

Quote:

shaynox s. wrote:

no, i just do that:

			mov		edi, [PhysBasePtr]
		; REPERE + -(PITCH * y + x * BPP)
			mov		[edi + REPERE + eax], ebx

 

So you are writing directly to Video RAM?

Thanks ^^

I just unroll matrix rotation Rx*Ry*Rz:

Rotation matrix on x:																						Vecteur:
  _________________________ _________________________ _________________________ _________________________        _________________________ 
 |						   |						 |						   |						 |      |                         |
 | 			  1 		   | 		    0 			 | 		      0 		   |			 0           |      |            x            |
 |_________________________|_________________________|_________________________|_________________________|      |_________________________|
 |						   |						 |						   |						 |      |                         |
 | 			  0 		   |        cos(phi_x)       |       -sin(phi_x)       | 		     0 			 |      |            y            |
 |_________________________|_________________________|_________________________|_________________________|  *   |_________________________|
 |						   |						 |						   |					     |      |                         |
 | 			  0 		   |        sin(phi_x)       |        cos(phi_x)       |			 0			 |      |            z            |
 |_________________________|_________________________|_________________________|_________________________|      |_________________________|
 |						   |						 |						   |					     |      |                         |
 |			  0		       |			 0			 |			  0			   |		     1  	     |      |          color          |
 |_________________________|_________________________|_________________________|_________________________|      |_________________________|
	
			*Result:
				x' = x
				y' = y.cos(phi) + -z.sin(phi)
				   = y.cos(phi) - z.sin(phi)
				z' = y.sin(phi) + z.cos(phi)
Rotation matrix on y:																						    Vecteur:
  _________________________ _________________________ _________________________ _________________________        _________________________ 
 |						   |						 |						   |						 |      |                         |
 |        cos(phi_y)       | 		    0 			 |        -sin(phi_y)      | 		    0 			 |      |            x            |
 |_________________________|_________________________|_________________________|_________________________|      |_________________________|
 |						   |						 |						   |						 |      |                         |
 | 			  0 		   |			1	         | 			  0 		   | 		    0 			 |      |            y            |
 |_________________________|_________________________|_________________________|_________________________|  *   |_________________________|
 |						   |						 |						   |						 |      |                         |
 |        sin(phi_y)       | 		    0 			 |        cos(phi_y)       | 		    0 			 |      |            z            |
 |_________________________|_________________________|_________________________|_________________________|      |_________________________|
 |						   |					     |						   |						 |      |                         |
 |			  0			   |			0	         |			  0			   |			1			 |      |          color          |
 |_________________________|_________________________|_________________________|_________________________|      |_________________________|

			*Result:
				x' = x.cos(phi) + -z.sin(phi)
				   = x.cos(phi) - z.sin(phi)
				y' = y
				z' = x.sin(phi) + z.cos(phi)
Rotation matrix on z:																					        Vecteur:
  _________________________ _________________________ _________________________ _________________________        _________________________
 |						   |						 |						    |						 |      |      	                  |
 |       cos(phi_z)        |       -sin(phi_z)       |			 0			    |			0            |      |            x            |
 |_________________________|_________________________|__________________________|________________________|      |_________________________|
 |						   |						 |						    |						 |      |                         |
 |       sin(phi_z)        |        cos(phi_z)       | 			 0 			    | 		    0 			 |      |            y            |
 |_________________________|_________________________|__________________________|________________________|  *   |_________________________|
 |						   |						 |						    |						 |      |                         |
 |			  0			   | 		    0 			 |			 1			    |			0			 |      |            z            |
 |_________________________|_________________________|__________________________|________________________|      |_________________________|
 |						   |						 |						    |						 |      |                         |
 |			  0			   | 		    0 			 |			 0			    |			1			 |      |          color          |
 |_________________________|_________________________|__________________________|________________________|      |_________________________|


			*Result:
				x' = x.cos(phi) + -y.sin(phi)
				   = x.cos(phi) - y.sin(phi)
				y' = x.sin(phi) + y.cos(phi)
				z' = z
Rotation matrix on x,y and z:																														   Vecteur:
 ___________________________________ ___________________________________ ___________________________________ ___________________________________        _________________________
|						            |						            |						            |						            |      |      	                 |
|       cos(phi_y) * cos(phi_z)     |       cos(phi_y) * -sin(phi_z)    |            -sin(phi_y)            |			  0                     |      |            x            |
|___________________________________|___________________________________|___________________________________|___________________________________|      |_________________________|
|						            |						            |						            |						            |      |                         |
| -sin(phi_x)*sin(phi_y)*cos(phi_z) |  sin(phi_x)*sin(phi_y)*sin(phi_z) |      -sin(phi_x)*cos(phi_y)       | 		      0 			        |      |            y            |
|    + cos(phi_x) * sin(phi_z)      |     + cos(phi_x)*cos(phi_z)       |						            |						            |      |                         | 
|___________________________________|___________________________________|___________________________________|___________________________________|  *   |_________________________|
|						            |						            |						            |						            |      |                         |
|  cos(phi_x)*sin(phi_y)*cos(phi_z) | cos(phi_x)*sin(phi_y)*-sin(phi_z) |      cos(phi_x)*cos(phi_y)        |			  0			            |      |            z            |
|    + sin(phi_x)*sin(phi_z)        |    + sin(phi_x)*cos(phi_z)        |						            |						            |      |                         |
|___________________________________|___________________________________|___________________________________|___________________________________|      |_________________________|
|						            |						            |						            |						            |      |                         |
|			  0			            | 		    0 			            |			 0			            |			  1			            |      |          color          |
|___________________________________|___________________________________|___________________________________|___________________________________|      |_________________________|

			*Result:
				x' = x.(cos(phi_y) * cos(phi_z)) - y.(cos(phi_y) * sin(phi_z)) - z.sin(phi_y)
				y' = x.(cos(phi_x) * sin(phi_z) - sin(phi_x) * sin(phi_y) * cos(phi_z)) + y.(sin(phi_x) * sin(phi_y) * sin(phi_z) + cos(phi_x) * cos(phi_z)) - z.(sin(phi_x) * cos(phi_y))
				z' = x.(cos(phi_x) * sin(phi_y) * cos(phi_z) + sin(phi_x) * sin(phi_z)) + y.(cos(phi_x) * sin(phi_y) * -sin(phi_z) + sin(phi_x) * cos(phi_z)) + z.(cos(phi_x) * cos(phi_y))		
				   = x.(cos(phi_x) * sin(phi_y) * cos(phi_z) + sin(phi_x) * sin(phi_z)) + y.(sin(phi_x) * cos(phi_z) - cos(phi_x) * sin(phi_y) * sin(phi_z) ) + z.(cos(phi_x) * cos(phi_y))		

no, i wrote on RAM, exemple if i have 1Go of RAM, the LFB_ptr (linear framebuffer) would equal to 0x4000_0000 = 1_073_741_824 = 1Go (and i don't get it, normally, point outside of ram cause a reset of CPU no ?

        ; Algo: LFB = 0x40000000+((RAM_SIZE_Go-1)*0x40000000)
            ;
            ; IF RAM == 1 Go { LFB = 0x040000000 }

And maybe gpu scan this portion of memory for transfer data into it's ram (vram), do you have any idea how ?

For parametrized macro #define, indeed it's not needed, i change it now, just it was for more easier reading.

 

 Asm code is from my kernel.asm ^^

pixel_ptr is intialyse by, well i send u all project, C (code block project) and asm (nasm project) with notepad++ like editor:

you need to install SDL, and for asm program, you need to change value of cx/bx by video number with 800*600 resolution. change only field of mode number.

(you can change for higher resolution, but you will need change some constant data: rapport, ect, i don't remember other but doesn't matter for now ^^)

				; Return VBE Mode Information
					mov		ax, 0x4F01
					mov		cx, 0x115			; Mode number
					mov		di, ModeInfoBlock   ; Pointer to ModeInfoBlock structure
					int		0x10
			; Set VBE Mode
				mov		ax, 0x4F02
				mov		bx, 0_1_0_0_0_0_0_1_0_0_0_1_0_1_0_1b
						; 	| | | | | | | `-`-`-`-`-`-`-`-`----- Mode number: 0x115 = 800 * 600
						; 	| | | | | `-`---------------------- Reserved (must be 0)
						;   | | | | `------------------------- 0 = Use current default refresh rate, 1 = Use user specified CRTC values for refresh rate
						; 	| | `-`-------------------------- Reserved for VBE/AF (must be 0)
						; 	| `----------------------------- 0 = Use windowed frame buffer model, 1 = Use linear/flat frame buffer model
						; 	`------------------------------ 0 = Clear display memory, 1 = Don't clear display memory
				mov		di, CRTCInfoBlock
				int		0x10

C project:

下载application/zip test_equations.zip

ASM project: Some comment are in french.

Ffor run it, go to 2.MAKE, run batch, then open disk (small icon) and choose one USB key empty, then select Removable Disk 1 (remove all other usb key for let only the test key), deselect open as readonly.

Then copy all data from HackOS.bin to your usb key at start offset (0), save it, and boot your usb key on another PC or your actual PC.

下载application/zip NASM.zip

For found mode number, if it's don't work (0x115), run the program "list mode vesa.exe" who list all video mode number, include in zip.

"list mode vesa.exe" is 16-bit program.

.>>And maybe gpu scan this portion of memory for transfer data into it's ram (vram), do you have any idea how ?>>>

Usually VBE  is used during the BIOS phase of pc boot process. I only suppose that at early stage of Windows boot  process so called bootvid.sys  driver is used and this driver probably calls int 0x10 services. After loading miniport and display driver(Windows) probably all the memory transactions are done through DMA engines which read/write so called Memory-Mapped I/O where hundreds of GPU registers are mapped to. I think that GPU Video BIOS firmware is setting up those memory regions for later usage by the display driver. Probably display driver working directly with the DirectX kernel driver is writing vertex data, fonts and bitmaps into those regions which are later read by GPU Command Processor Unit(ATI/AMD) and directly send to GPU scheduler.

Please have a look at those freely available AMD GPU docs.

http://www.x.org/docs/AMD/old/

 

 

If you are interested I have a library of elementary and special functions implemented in Java and in C partly vectorized. They are fast mainly because I pre-calculated their coefficients with the help of Mathematica 8 and used Horner Scheme for result convergence. Moreover I have also library of various integrators also in multithreaded version(Windows) only. You will need this when trying to calculate numerically BRDF functions.

Thank you very much for uploading your source code I really appreciate it.

>>>and i don't get it, normally, point outside of ram cause a reset of CPU no ?>>>

IIRC it will cause some fault when the virtual address cannot be mapped to physical one.

 

>>>he target is intel i7 cpu core, and here the sample of asm code make by gcc>>>

What are your GCC settings? Can you post it here? 

IIRC x87 floating point code path is reserved for CPU's older than Pentium 4 and for 32-bit executable. So 64-bit executable by ABI should be compiled without x87 machine code.

Please read following blog: https://software.intel.com/en-us/blogs/2012/09/26/gcc-x86-performance-hints

the options of gcc are:

-ffast-math
-fforce-addr    
-masm=intel
-march=corei7-avx
-O2e

and yes i use windows xp 32 bit, i think i will migrate on 64 bit.

I run my kernel on flat memory IA-32 mode (don't work in IA-32e mode :/ only can paging 1Go memory max:

			; 	PML4E.P(0) = PML4E.R/W(1) = PML4E.U/S(2) = 1
		   ;|; 	PML4E.PWT(3) = PML4E.PCD(4) = PML4E.A(5) = PML4E.PS(7) = 0
			;	PML4E.addr.PDPTE(51:12) =              [                     0x02                 ] * 0x1000
				mov		qword [0x1000], 00000000_00000_000_00000000_00000000_00000000_00000000_0010_0000_00000111b
		    ;															  32
		   ;|;	
			;	PDPTE.P(0) = PDPTE.R/W(1) = PDPTE.U/S(2) = PDPTE.A(5) = PDPTE.D(6) = PDPTE.PS(7) = PDPTE.G(8) = PDPTE.PAT(12) = 1
		   ;|;	PDPTE.PWT(3) = PDPTE.PCD(4) = 0
			;	PDPTE.addr.1Go(51:30) =   			   [          0           ]
				mov		qword [0x2000], 00000000_00000_000_00000000_00000000_00_000000_00000000_00010001_11100111b
			;															  32
		   ;|;  

yes why not, your library contain asm in line for calculate ? what is BRDF functions ?

(what do  you think about my ASCII-art documentation in my asm source ? ^^ )

-O2 *

when i put -m32 -mfpmath=sse, it's crash :/

it's work if i remove: -fforce-addr and -ffast-math :/

so here's my new final option:

-masm=intel
-m32

-mfpmath=sse -Ofast -flto

-march=corei7-avx

But ironically, it's fall down at 20 fps :o, originaly option run under 70 fps for 103_997 vertex.

But the asm code made by gcc is fill with

_put_pixel:
LFB68:
	.file 1 "C:/Documents and Settings/shilupyox/Mes documents/test_equations/main.c"
	.loc 1 224 0
	.cfi_startproc
LVL0:
	push	esi
	.cfi_def_cfa_offset 8
	.cfi_offset 6, -8
	push	ebx
	.cfi_def_cfa_offset 12
	.cfi_offset 3, -12
	sub	esp, 68
	.cfi_def_cfa_offset 80
	.loc 1 230 0
	movss	xmm3, DWORD PTR _rotation_x
	cvtps2pd	xmm3, xmm3
	mulsd	xmm3, QWORD PTR LC0
	movsd	QWORD PTR [esp], xmm3
	movsd	QWORD PTR [esp+32], xmm3
	call	_cos
LVL1:
	.loc 1 231 0
	movss	xmm2, DWORD PTR _rotation_y
	cvtps2pd	xmm2, xmm2
	mulsd	xmm2, QWORD PTR LC0
	.loc 1 230 0
	fstp	QWORD PTR [esp+16]
LVL2:
	movsd	xmm5, QWORD PTR [esp+16]
	unpcklpd	xmm5, xmm5
	cvtpd2ps	xmm5, xmm5
	movss	DWORD PTR [esp+16], xmm5
	.loc 1 231 0
	movsd	QWORD PTR [esp], xmm2
	movsd	QWORD PTR [esp+40], xmm2
	call	_cos
LVL3:
	.loc 1 232 0
	movss	xmm0, DWORD PTR _rotation_z
	cvtps2pd	xmm0, xmm0
	mulsd	xmm0, QWORD PTR LC0
	.loc 1 231 0
	fstp	QWORD PTR [esp+24]
	movsd	xmm1, QWORD PTR [esp+24]
	unpcklpd	xmm1, xmm1
	cvtpd2ps	xmm1, xmm1
	movss	DWORD PTR [esp+60], xmm1
LVL4:
	.loc 1 232 0
	movsd	QWORD PTR [esp], xmm0
	movsd	QWORD PTR [esp+48], xmm0
	call	_cos
LVL5:
	.loc 1 233 0
	movsd	xmm3, QWORD PTR [esp+32]
	.loc 1 232 0
	fstp	QWORD PTR [esp+24]
LVL6:
	movsd	xmm4, QWORD PTR [esp+24]
	.loc 1 233 0
	movsd	QWORD PTR [esp], xmm3
	.loc 1 232 0
	movddup	xmm5, xmm4
	cvtpd2ps	xmm5, xmm5
	movss	DWORD PTR [esp+24], xmm5
	.loc 1 233 0
	call	_sin
LVL7:
	.loc 1 234 0
	movsd	xmm2, QWORD PTR [esp+40]
	.loc 1 233 0
	fstp	QWORD PTR [esp+32]
LVL8:
	movsd	xmm1, QWORD PTR [esp+32]
	.loc 1 234 0
	movsd	QWORD PTR [esp], xmm2
	.loc 1 233 0
	movddup	xmm5, xmm1
	cvtpd2ps	xmm5, xmm5
	movss	DWORD PTR [esp+32], xmm5
	.loc 1 234 0
	call	_sin
LVL9:
	.loc 1 235 0
	movsd	xmm0, QWORD PTR [esp+48]
	.loc 1 234 0
	fstp	QWORD PTR [esp+40]
	movsd	xmm2, QWORD PTR [esp+40]
	.loc 1 235 0
	movsd	QWORD PTR [esp], xmm0
	.loc 1 234 0
	movddup	xmm7, xmm2
	cvtpd2ps	xmm7, xmm7
	movss	DWORD PTR [esp+56], xmm7
LVL10:
	.loc 1 235 0
	call	_sin
LVL11:
	.loc 1 237 0
	movss	xmm4, DWORD PTR [esp+80]
	.loc 1 235 0
	fstp	QWORD PTR [esp+40]
	.loc 1 238 0
	movss	xmm5, DWORD PTR [esp+84]
	.loc 1 237 0
	addss	xmm4, DWORD PTR _translate_x
	.loc 1 235 0
	movsd	xmm1, QWORD PTR [esp+40]
	.loc 1 239 0
	movss	xmm3, DWORD PTR [esp+88]
	.loc 1 237 0
	movaps	xmm7, xmm4
	.loc 1 235 0
	movddup	xmm2, xmm1
	cvtpd2ps	xmm2, xmm2
LVL12:
	.loc 1 240 0
	movss	xmm1, DWORD PTR [esp+60]
	movss	xmm4, DWORD PTR [esp+24]
LVL13:
	movaps	xmm0, xmm1
	.loc 1 238 0
	addss	xmm5, DWORD PTR _translate_y
LVL14:
	.loc 1 240 0
	mulss	xmm4, xmm1
	.loc 1 239 0
	addss	xmm3, DWORD PTR _translate_z
LVL15:
	.loc 1 240 0
	movss	DWORD PTR [esp+40], xmm7
	mulss	xmm0, xmm2
	.loc 1 241 0
	movss	xmm6, DWORD PTR [esp+32]
	.loc 1 251 0
	mov	ecx, DWORD PTR _screen
	.loc 1 240 0
	mulss	xmm4, xmm7
	.loc 1 241 0
	movss	xmm7, DWORD PTR [esp+56]
LVL16:
	.loc 1 240 0
	mulss	xmm0, xmm5
	.loc 1 241 0
	mulss	xmm6, xmm7
	.loc 1 251 0
	mov	ebx, DWORD PTR [ecx+20]
	movzx	ecx, WORD PTR [ecx+16]
	.loc 1 240 0
	subss	xmm4, xmm0
LVL17:
	.loc 1 241 0
	movss	DWORD PTR [esp+56], xmm6
LVL18:
	movaps	xmm0, xmm6
	movss	xmm6, DWORD PTR [esp+24]
	mulss	xmm0, xmm2
	mulss	xmm6, DWORD PTR [esp+16]
	addss	xmm0, xmm6
	movss	xmm6, DWORD PTR [esp+32]
	mulss	xmm6, xmm1
	mulss	xmm0, xmm5
	.loc 1 242 0
	mulss	xmm1, DWORD PTR [esp+16]
	.loc 1 241 0
	mulss	xmm6, xmm3
	.loc 1 242 0
	mulss	xmm1, xmm3
	.loc 1 241 0
	subss	xmm0, xmm6
LVL19:
	.loc 1 242 0
	movss	xmm6, DWORD PTR [esp+16]
	mulss	xmm6, xmm7
LVL20:
	addss	xmm1, DWORD PTR LC2
	.loc 1 240 0
	mulss	xmm7, xmm3
LVL21:
	.loc 1 242 0
	movss	xmm3, DWORD PTR [esp+24]
	mulss	xmm3, xmm6
	.loc 1 240 0
	subss	xmm4, xmm7
LVL22:
	.loc 1 242 0
	movss	xmm7, DWORD PTR [esp+32]
	mulss	xmm6, xmm2
	mulss	xmm7, xmm2
	.loc 1 241 0
	mulss	xmm2, DWORD PTR [esp+16]
LVL23:
	.loc 1 248 0
	mulss	xmm4, DWORD PTR LC1
LVL24:
	.loc 1 242 0
	addss	xmm3, xmm7
	movss	xmm7, DWORD PTR [esp+24]
	mulss	xmm3, DWORD PTR [esp+40]
	addss	xmm1, xmm3
	movss	xmm3, DWORD PTR [esp+32]
	mulss	xmm3, xmm7
	subss	xmm3, xmm6
	mulss	xmm3, xmm5
	.loc 1 248 0
	addss	xmm1, xmm3
	.loc 1 241 0
	movss	xmm3, DWORD PTR [esp+56]
	mulss	xmm3, xmm7
	.loc 1 248 0
	divss	xmm4, xmm1
LVL25:
	.loc 1 241 0
	subss	xmm2, xmm3
	mulss	xmm2, DWORD PTR [esp+40]
	addss	xmm0, xmm2
LVL26:
	.loc 1 249 0
	mulss	xmm0, DWORD PTR LC1
LVL27:
	divss	xmm0, xmm1
LVL28:
	.loc 1 251 0
	cvttss2si	edx, xmm4
	lea	edx, [958396+edx*4]
	cvttss2si	esi, xmm0
	imul	ecx, esi
	sub	edx, ecx
	add	edx, ebx
LVL29:
	.loc 1 253 0
	cmp	ebx, edx
	jae	L1
	.loc 1 253 0 is_stmt 0 discriminator 1
	add	ebx, 1920000
	cmp	edx, ebx
	jae	L1
	.loc 1 254 0 is_stmt 1
	comiss	xmm4, DWORD PTR LC3
	ja	L1
	comiss	xmm4, DWORD PTR LC4
	jb	L1
	.loc 1 254 0 is_stmt 0 discriminator 1
	comiss	xmm0, DWORD PTR LC5
	ja	L1
	comiss	xmm0, DWORD PTR LC6
	jb	L1
	.loc 1 255 0 is_stmt 1
	mov	eax, DWORD PTR [esp+92]
	mov	DWORD PTR [edx], eax
	.p2align 4,,10
L1:
	.loc 1 256 0
	add	esp, 68
	.cfi_def_cfa_offset 12
	pop	ebx
	.cfi_restore 3
	.cfi_def_cfa_offset 8
	pop	esi
	.cfi_restore 6
	.cfi_def_cfa_offset 4
	ret

i would say onyl one word, WTF

gcc use only first packed simple precision for calculate what i want, haha, i understand know why he fall down unders 20 fps, maybe only human intelligence can use smid instruction in smart way.

I ear about built in, but it's still slow, beaucause gcc still create an environnement for each function: stack.

in summary gcc see SMID register as 32 bit space oO

it's for those kind of reason i hate high level language x_x and programming in assembly

I try to build my matrix rotation in windows executable format, but it's crash ^^, i know why know, fault to my wish to store value in RAM in my way :/

My way is viewing RAM like one big array 1D, no more no less. Maybe programers hate this way to program :o wiht built in section of text, code, bss ect

i know why now*

i try to built application with intel compiler, but i don't know how to link sdl library, when i try to build, but it didn't work

Do you have any tutorial for start coding with intel compiler, library?

>>>-march=corei7-avx>>> It seems that compiler should generate AVX-code. Try to compile this code as 64-bit executable on 64-bit OS.

You can also post this question on ICC forum.

>>>yes why not, your library contain asm in line for calculate >>>

Yes it contains inline SSE assembly and I am working now on full implementation od dozens of functions. I will upload it very soon.

BRDF - Bidirectional Reflectance Distribution Function. This function describes reflectance of the light and it is part of rendering equation you can read more here http://en.wikipedia.org/wiki/Bidirectional_reflectance_distribution_function

You will need to implement this function when you will try to develop more advanced rendering algorithms like Radiosity rendering.

Btw implementation can be found in PBRT renderer source code.

 

Quote:

shaynox s. wrote:

it's work if i remove: -fforce-addr and -ffast-math :/

so here's my new final option:

-masm=intel

-m32

-mfpmath=sse -Ofast -flto

-march=corei7-avx

But ironically, it's fall down at 20 fps :o, originaly option run under 70 fps for 103_997 vertex.

But the asm code made by gcc is fill with

_put_pixel:
LFB68:
	.file 1 "C:/Documents and Settings/shilupyox/Mes documents/test_equations/main.c"
	.loc 1 224 0
	.cfi_startproc
LVL0:
	push	esi
	.cfi_def_cfa_offset 8
	.cfi_offset 6, -8
	push	ebx
	.cfi_def_cfa_offset 12
	.cfi_offset 3, -12
	sub	esp, 68
	.cfi_def_cfa_offset 80
	.loc 1 230 0
	movss	xmm3, DWORD PTR _rotation_x
	cvtps2pd	xmm3, xmm3
	mulsd	xmm3, QWORD PTR LC0
	movsd	QWORD PTR [esp], xmm3
	movsd	QWORD PTR [esp+32], xmm3
	call	_cos
LVL1:
	.loc 1 231 0
	movss	xmm2, DWORD PTR _rotation_y
	cvtps2pd	xmm2, xmm2
	mulsd	xmm2, QWORD PTR LC0
	.loc 1 230 0
	fstp	QWORD PTR [esp+16]
LVL2:
	movsd	xmm5, QWORD PTR [esp+16]
	unpcklpd	xmm5, xmm5
	cvtpd2ps	xmm5, xmm5
	movss	DWORD PTR [esp+16], xmm5
	.loc 1 231 0
	movsd	QWORD PTR [esp], xmm2
	movsd	QWORD PTR [esp+40], xmm2
	call	_cos
LVL3:
	.loc 1 232 0
	movss	xmm0, DWORD PTR _rotation_z
	cvtps2pd	xmm0, xmm0
	mulsd	xmm0, QWORD PTR LC0
	.loc 1 231 0
	fstp	QWORD PTR [esp+24]
	movsd	xmm1, QWORD PTR [esp+24]
	unpcklpd	xmm1, xmm1
	cvtpd2ps	xmm1, xmm1
	movss	DWORD PTR [esp+60], xmm1
LVL4:
	.loc 1 232 0
	movsd	QWORD PTR [esp], xmm0
	movsd	QWORD PTR [esp+48], xmm0
	call	_cos
LVL5:
	.loc 1 233 0
	movsd	xmm3, QWORD PTR [esp+32]
	.loc 1 232 0
	fstp	QWORD PTR [esp+24]
LVL6:
	movsd	xmm4, QWORD PTR [esp+24]
	.loc 1 233 0
	movsd	QWORD PTR [esp], xmm3
	.loc 1 232 0
	movddup	xmm5, xmm4
	cvtpd2ps	xmm5, xmm5
	movss	DWORD PTR [esp+24], xmm5
	.loc 1 233 0
	call	_sin
LVL7:
	.loc 1 234 0
	movsd	xmm2, QWORD PTR [esp+40]
	.loc 1 233 0
	fstp	QWORD PTR [esp+32]
LVL8:
	movsd	xmm1, QWORD PTR [esp+32]
	.loc 1 234 0
	movsd	QWORD PTR [esp], xmm2
	.loc 1 233 0
	movddup	xmm5, xmm1
	cvtpd2ps	xmm5, xmm5
	movss	DWORD PTR [esp+32], xmm5
	.loc 1 234 0
	call	_sin
LVL9:
	.loc 1 235 0
	movsd	xmm0, QWORD PTR [esp+48]
	.loc 1 234 0
	fstp	QWORD PTR [esp+40]
	movsd	xmm2, QWORD PTR [esp+40]
	.loc 1 235 0
	movsd	QWORD PTR [esp], xmm0
	.loc 1 234 0
	movddup	xmm7, xmm2
	cvtpd2ps	xmm7, xmm7
	movss	DWORD PTR [esp+56], xmm7
LVL10:
	.loc 1 235 0
	call	_sin
LVL11:
	.loc 1 237 0
	movss	xmm4, DWORD PTR [esp+80]
	.loc 1 235 0
	fstp	QWORD PTR [esp+40]
	.loc 1 238 0
	movss	xmm5, DWORD PTR [esp+84]
	.loc 1 237 0
	addss	xmm4, DWORD PTR _translate_x
	.loc 1 235 0
	movsd	xmm1, QWORD PTR [esp+40]
	.loc 1 239 0
	movss	xmm3, DWORD PTR [esp+88]
	.loc 1 237 0
	movaps	xmm7, xmm4
	.loc 1 235 0
	movddup	xmm2, xmm1
	cvtpd2ps	xmm2, xmm2
LVL12:
	.loc 1 240 0
	movss	xmm1, DWORD PTR [esp+60]
	movss	xmm4, DWORD PTR [esp+24]
LVL13:
	movaps	xmm0, xmm1
	.loc 1 238 0
	addss	xmm5, DWORD PTR _translate_y
LVL14:
	.loc 1 240 0
	mulss	xmm4, xmm1
	.loc 1 239 0
	addss	xmm3, DWORD PTR _translate_z
LVL15:
	.loc 1 240 0
	movss	DWORD PTR [esp+40], xmm7
	mulss	xmm0, xmm2
	.loc 1 241 0
	movss	xmm6, DWORD PTR [esp+32]
	.loc 1 251 0
	mov	ecx, DWORD PTR _screen
	.loc 1 240 0
	mulss	xmm4, xmm7
	.loc 1 241 0
	movss	xmm7, DWORD PTR [esp+56]
LVL16:
	.loc 1 240 0
	mulss	xmm0, xmm5
	.loc 1 241 0
	mulss	xmm6, xmm7
	.loc 1 251 0
	mov	ebx, DWORD PTR [ecx+20]
	movzx	ecx, WORD PTR [ecx+16]
	.loc 1 240 0
	subss	xmm4, xmm0
LVL17:
	.loc 1 241 0
	movss	DWORD PTR [esp+56], xmm6
LVL18:
	movaps	xmm0, xmm6
	movss	xmm6, DWORD PTR [esp+24]
	mulss	xmm0, xmm2
	mulss	xmm6, DWORD PTR [esp+16]
	addss	xmm0, xmm6
	movss	xmm6, DWORD PTR [esp+32]
	mulss	xmm6, xmm1
	mulss	xmm0, xmm5
	.loc 1 242 0
	mulss	xmm1, DWORD PTR [esp+16]
	.loc 1 241 0
	mulss	xmm6, xmm3
	.loc 1 242 0
	mulss	xmm1, xmm3
	.loc 1 241 0
	subss	xmm0, xmm6
LVL19:
	.loc 1 242 0
	movss	xmm6, DWORD PTR [esp+16]
	mulss	xmm6, xmm7
LVL20:
	addss	xmm1, DWORD PTR LC2
	.loc 1 240 0
	mulss	xmm7, xmm3
LVL21:
	.loc 1 242 0
	movss	xmm3, DWORD PTR [esp+24]
	mulss	xmm3, xmm6
	.loc 1 240 0
	subss	xmm4, xmm7
LVL22:
	.loc 1 242 0
	movss	xmm7, DWORD PTR [esp+32]
	mulss	xmm6, xmm2
	mulss	xmm7, xmm2
	.loc 1 241 0
	mulss	xmm2, DWORD PTR [esp+16]
LVL23:
	.loc 1 248 0
	mulss	xmm4, DWORD PTR LC1
LVL24:
	.loc 1 242 0
	addss	xmm3, xmm7
	movss	xmm7, DWORD PTR [esp+24]
	mulss	xmm3, DWORD PTR [esp+40]
	addss	xmm1, xmm3
	movss	xmm3, DWORD PTR [esp+32]
	mulss	xmm3, xmm7
	subss	xmm3, xmm6
	mulss	xmm3, xmm5
	.loc 1 248 0
	addss	xmm1, xmm3
	.loc 1 241 0
	movss	xmm3, DWORD PTR [esp+56]
	mulss	xmm3, xmm7
	.loc 1 248 0
	divss	xmm4, xmm1
LVL25:
	.loc 1 241 0
	subss	xmm2, xmm3
	mulss	xmm2, DWORD PTR [esp+40]
	addss	xmm0, xmm2
LVL26:
	.loc 1 249 0
	mulss	xmm0, DWORD PTR LC1
LVL27:
	divss	xmm0, xmm1
LVL28:
	.loc 1 251 0
	cvttss2si	edx, xmm4
	lea	edx, [958396+edx*4]
	cvttss2si	esi, xmm0
	imul	ecx, esi
	sub	edx, ecx
	add	edx, ebx
LVL29:
	.loc 1 253 0
	cmp	ebx, edx
	jae	L1
	.loc 1 253 0 is_stmt 0 discriminator 1
	add	ebx, 1920000
	cmp	edx, ebx
	jae	L1
	.loc 1 254 0 is_stmt 1
	comiss	xmm4, DWORD PTR LC3
	ja	L1
	comiss	xmm4, DWORD PTR LC4
	jb	L1
	.loc 1 254 0 is_stmt 0 discriminator 1
	comiss	xmm0, DWORD PTR LC5
	ja	L1
	comiss	xmm0, DWORD PTR LC6
	jb	L1
	.loc 1 255 0 is_stmt 1
	mov	eax, DWORD PTR [esp+92]
	mov	DWORD PTR [edx], eax
	.p2align 4,,10
L1:
	.loc 1 256 0
	add	esp, 68
	.cfi_def_cfa_offset 12
	pop	ebx
	.cfi_restore 3
	.cfi_def_cfa_offset 8
	pop	esi
	.cfi_restore 6
	.cfi_def_cfa_offset 4
	ret

i would say onyl one word, WTF

gcc use only first packed simple precision for calculate what i want, haha, i understand know why he fall down unders 20 fps, maybe only human intelligence can use smid instruction in smart way.

I ear about built in, but it's still slow, beaucause gcc still create an environnement for each function: stack.

It seems that you are using MinGW probably. Do you really need double precision for 3D rendering or even for geometric computation? Bear in mind that display coordinates/ display colour grades or display space uses integer numbers. Your code was not vectorised because of usage of non-packed machine code instructions. Did you try to enable auto-vectorization in GCC? http://gcc.gnu.org/projects/tree-ssa/vectorization.html

 

 

引文:

shaynox s. 写道:

in summary gcc see SMID register as 32 bit space oO

it's for those kind of reason i hate high level language x_x and programming in assembly

You should try ICC compiler integrated in Visual Studio. I am using such setup for my programming. ICC is very good at exploiting auto-vectorization opportunities. You should review your code and search for vectorization opportunities. Usually  3D code can be easily vectorized because pixels do not always have adjacent dependency.

>>>i try to built application with intel compiler, but i don't know how to link sdl library, when i try to build, but it didn't work>>>

I know how to do it inside VS project settings.

Regarding ICC tutorials you can read compiler manual

https://software.intel.com/sites/products/documentation/doclib/iss/2013/compiler/cpp-lin/

In order to exploit vectorization you can lay out your data as SoA (Structure Of Array).

#define WIDTH 1920

#define HEIGHT 1080

#define SCREEN_SIZE (WIDTH * HEIGHT)

#define NUM_OF_VERTICES 2048

typedef struct

{

     float coord_X [WIDTH];

    float coord_Y [HEIGHT];

    BYTE component_Red [SCREEN_SIZE];

    BYTE component_Blue [SCREEN_SIZE];

    BYTE component_Green [SCREEN_SIZE];

    BYTE component_Alpha [SCREEN_SIZE];

} PixelData, *PPixelData;

 

   More optimal version of SoA layout:

 struct Vertex

{

      float coord_X [NUM_OF_VERTICES];

      float coord_Y [NUM_OF_VERTICES];

      float coord_Z [NUM_OF_VERTICES];

     float coord_W [NUM_OF_VERTICES];

};

no don't worry i don't use double precision for drawing pixel, finnaly i working on seven 64 bit and have instaling mingw64, but surprise, the fps it's divide by 2.

the only auto vectorisation gcc do is: movaps like before: (avx instruction)

put_pixel:
.LFB46:
	.file 1 "C:/Users/hackos/Documents/codeblock/3D-engine/main.c"
	.loc 1 26 0
	.cfi_startproc
.LVL0:
	push	rbx
	.seh_pushreg	rbx
	.cfi_def_cfa_offset 16
	.cfi_offset 3, -16
	sub	rsp, 208
	.seh_stackalloc	208
	.cfi_def_cfa_offset 224
	vmovaps	XMMWORD PTR 48[rsp], xmm6
	.seh_savexmm	xmm6, 48
	vmovaps	XMMWORD PTR 64[rsp], xmm7
	.seh_savexmm	xmm7, 64
	vmovaps	XMMWORD PTR 80[rsp], xmm8
	.seh_savexmm	xmm8, 80
	vmovaps	XMMWORD PTR 96[rsp], xmm9
	.seh_savexmm	xmm9, 96
	vmovaps	XMMWORD PTR 112[rsp], xmm10
	.seh_savexmm	xmm10, 112
	vmovaps	XMMWORD PTR 128[rsp], xmm11
	.seh_savexmm	xmm11, 128
	vmovaps	XMMWORD PTR 144[rsp], xmm12
	.seh_savexmm	xmm12, 144
	vmovaps	XMMWORD PTR 160[rsp], xmm13
	.seh_savexmm	xmm13, 160
	vmovaps	XMMWORD PTR 176[rsp], xmm14
	.seh_savexmm	xmm14, 176
	vmovaps	XMMWORD PTR 192[rsp], xmm15
	.seh_savexmm	xmm15, 192
	.cfi_offset 23, -176
	.cfi_offset 24, -160
	.cfi_offset 25, -144
	.cfi_offset 26, -128
	.cfi_offset 27, -112
	.cfi_offset 28, -96
	.cfi_offset 29, -80
	.cfi_offset 30, -64
	.cfi_offset 31, -48
	.cfi_offset 32, -32
	.seh_endprologue
	.loc 1 32 0
	vxorpd	xmm15, xmm15, xmm15
	vcvtss2sd	xmm15, xmm15, DWORD PTR rotation_object[rip]
	vmovsd	xmm12, QWORD PTR .LC0[rip]
	vmulsd	xmm15, xmm15, xmm12
	.loc 1 26 0
	mov	rbx, QWORD PTR 256[rsp]
	vmovaps	xmm11, xmm0
	vmovss	DWORD PTR 44[rsp], xmm1
	.loc 1 32 0
	vmovapd	xmm0, xmm15
.LVL1:
	.loc 1 26 0
	vmovss	DWORD PTR 40[rsp], xmm2
	.loc 1 32 0
	call	cos
.LVL2:
	.loc 1 33 0
	vxorpd	xmm14, xmm14, xmm14
	vcvtss2sd	xmm14, xmm14, DWORD PTR 4+rotation_object[rip]
	.loc 1 32 0
	vxorps	xmm10, xmm10, xmm10
	.loc 1 33 0
	vmulsd	xmm14, xmm14, xmm12
	.loc 1 32 0
	vcvtsd2ss	xmm10, xmm10, xmm0
.LVL3:
	.loc 1 33 0
	vmovapd	xmm0, xmm14
	call	cos
.LVL4:
	.loc 1 34 0
	vxorpd	xmm3, xmm3, xmm3
	vcvtss2sd	xmm3, xmm3, DWORD PTR 8+rotation_object[rip]
	.loc 1 33 0
	vxorps	xmm6, xmm6, xmm6
	.loc 1 34 0
	vmulsd	xmm12, xmm3, xmm12
	.loc 1 33 0
	vcvtsd2ss	xmm6, xmm6, xmm0
.LVL5:
	.loc 1 34 0
	vmovapd	xmm0, xmm12
	call	cos
.LVL6:
	vxorps	xmm9, xmm9, xmm9
	vcvtsd2ss	xmm9, xmm9, xmm0
.LVL7:
	.loc 1 35 0
	vmovapd	xmm0, xmm15
	call	sin
.LVL8:
	vxorps	xmm8, xmm8, xmm8
	vcvtsd2ss	xmm8, xmm8, xmm0
.LVL9:
	.loc 1 36 0
	vmovapd	xmm0, xmm14
	call	sin
.LVL10:
	vxorps	xmm15, xmm15, xmm15
	vcvtsd2ss	xmm15, xmm15, xmm0
.LVL11:
	.loc 1 37 0
	vmovapd	xmm0, xmm12
	call	sin
.LVL12:
	vxorps	xmm7, xmm7, xmm7
	.loc 1 40 0
	vmovss	xmm1, DWORD PTR 44[rsp]
	.loc 1 43 0
	vmulss	xmm12, xmm15, xmm8
	.loc 1 37 0
	vcvtsd2ss	xmm7, xmm7, xmm0
.LVL13:
	.loc 1 40 0
	vaddss	xmm13, xmm1, DWORD PTR 4[rbx]
	.loc 1 43 0
	vmulss	xmm5, xmm9, xmm10
	.loc 1 41 0
	vmovss	xmm2, DWORD PTR 40[rsp]
	.loc 1 43 0
	vmulss	xmm1, xmm6, xmm8
	.loc 1 41 0
	vaddss	xmm2, xmm2, DWORD PTR 8[rbx]
	.loc 1 44 0
	vmulss	xmm14, xmm15, xmm10
	.loc 1 39 0
	vaddss	xmm11, xmm11, DWORD PTR [rbx]
.LVL14:
	.loc 1 43 0
	vmulss	xmm0, xmm12, xmm7
	.loc 1 42 0
	vmulss	xmm4, xmm6, xmm9
	.loc 1 43 0
	vmulss	xmm1, xmm1, xmm2
	.loc 1 42 0
	vmulss	xmm15, xmm15, xmm2
.LVL15:
	.loc 1 43 0
	vaddss	xmm5, xmm0, xmm5
	.loc 1 44 0
	vmulss	xmm0, xmm14, xmm9
	.loc 1 42 0
	vmulss	xmm3, xmm4, xmm11
	vmulss	xmm4, xmm6, xmm7
	.loc 1 43 0
	vmulss	xmm5, xmm5, xmm13
	.loc 1 44 0
	vmulss	xmm6, xmm10, xmm6
.LVL16:
	vmulss	xmm14, xmm14, xmm7
	.loc 1 42 0
	vmulss	xmm4, xmm4, xmm13
	vsubss	xmm5, xmm5, xmm1
	.loc 1 44 0
	vmulss	xmm1, xmm7, xmm8
	vmulss	xmm6, xmm6, xmm2
	vaddss	xmm2, xmm6, DWORD PTR .LC2[rip]
.LVL17:
	.loc 1 43 0
	vmulss	xmm7, xmm7, xmm10
.LVL18:
	.loc 1 42 0
	vsubss	xmm3, xmm3, xmm4
.LVL19:
	.loc 1 50 0
	vmovss	xmm4, DWORD PTR .LC1[rip]
	.loc 1 44 0
	vaddss	xmm1, xmm0, xmm1
	.loc 1 42 0
	vsubss	xmm3, xmm3, xmm15
	.loc 1 44 0
	vmulss	xmm1, xmm1, xmm11
	.loc 1 50 0
	vmulss	xmm3, xmm3, xmm4
	vaddss	xmm6, xmm2, xmm1
	.loc 1 44 0
	vmulss	xmm1, xmm8, xmm9
	.loc 1 53 0
	vmovss	xmm2, DWORD PTR .LC3[rip]
	.loc 1 43 0
	vmulss	xmm9, xmm12, xmm9
.LVL20:
	.loc 1 53 0
	vmovaps	xmm0, xmm2
	.loc 1 44 0
	vsubss	xmm1, xmm1, xmm14
	.loc 1 43 0
	vsubss	xmm7, xmm7, xmm9
	.loc 1 44 0
	vmulss	xmm1, xmm1, xmm13
	.loc 1 43 0
	vmulss	xmm11, xmm7, xmm11
	.loc 1 50 0
	vaddss	xmm1, xmm6, xmm1
	.loc 1 43 0
	vaddss	xmm11, xmm5, xmm11
	.loc 1 50 0
	vdivss	xmm6, xmm3, xmm1
.LVL21:
	.loc 1 51 0
	vmulss	xmm11, xmm11, xmm4
	vdivss	xmm11, xmm11, xmm1
.LVL22:
	.loc 1 53 0
	vmovaps	xmm1, xmm2
	call	glColor3f
.LVL23:
	.loc 1 54 0
	xor	ecx, ecx
	call	glBegin
.LVL24:
	.loc 1 55 0
	vmulss	xmm0, xmm6, DWORD PTR .LC5[rip]
	vmulss	xmm1, xmm11, DWORD PTR .LC4[rip]
	call	glVertex2f
	nop
.LVL25:
	.loc 1 57 0
	vmovaps	xmm6, XMMWORD PTR 48[rsp]
.LVL26:
	vmovaps	xmm7, XMMWORD PTR 64[rsp]
	vmovaps	xmm8, XMMWORD PTR 80[rsp]
.LVL27:
	vmovaps	xmm9, XMMWORD PTR 96[rsp]
	vmovaps	xmm10, XMMWORD PTR 112[rsp]
.LVL28:
	vmovaps	xmm11, XMMWORD PTR 128[rsp]
.LVL29:
	vmovaps	xmm12, XMMWORD PTR 144[rsp]
	vmovaps	xmm13, XMMWORD PTR 160[rsp]
	vmovaps	xmm14, XMMWORD PTR 176[rsp]
	vmovaps	xmm15, XMMWORD PTR 192[rsp]
	add	rsp, 208
	.cfi_restore 32
	.cfi_restore 31
	.cfi_restore 30
	.cfi_restore 29
	.cfi_restore 28
	.cfi_restore 27
	.cfi_restore 26
	.cfi_restore 25
	.cfi_restore 24
	.cfi_restore 23
	.cfi_def_cfa_offset 16
	pop	rbx
	.cfi_restore 3
	.cfi_def_cfa_offset 8
	.loc 1 56 0
	jmp	glEnd
.LVL30:
	.cfi_endproc
.LFE46:
	.seh_endproc
	.section	.text.unlikely,"x"
.LCOLDE6:
	.text
.LHOTE6:
	.section	.text.unlikely,"x"
.LCOLDB12:
	.text
.LHOTB12:
	.p2align 4,,15
	.globl	trace_circle
	.def	trace_circle;	.scl	2;	.type	32;	.endef
	.seh_proc	trace_circle

but still compute on packed single :/ but i will search how to, and i will try with icc.

I had try SoA for this code, but don't take any effect:

typedef struct a
{

    float x;

    float y;

    float z;

    float cosX;
    float cosY;
    float cosZ;
    float sinX;
    float sinY;
    float sinZ;
    float x_end;
    float y_end;
    float z_end;

}a;
struct a az;
void        put_pixel(float x, float y, float z, int color, float *repere)
{

    int     offset_pixel;

    az.cosX = cos(DEG2RAD(rotation_object[0]));
    az.cosY = cos(DEG2RAD(rotation_object[1]));
    az.cosZ = cos(DEG2RAD(rotation_object[2]));
    az.sinX = sin(DEG2RAD(rotation_object[0]));
    az.sinY = sin(DEG2RAD(rotation_object[1]));
    az.sinZ = sin(DEG2RAD(rotation_object[2]));

    az.x = x;
    az.y = y;
    az.z = z;
    az.x+=repere[0];
    az.y+=repere[1];
    az.z+=repere[2];
    az.x_end = az.x * ((az.cosY * az.cosZ))                        - az.y * ((az.cosY * az.sinZ))                         - az.z * (az.sinY);
    az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ))  - az.z * (az.sinX * az.cosY);
    az.z_end = az.x * ((az.cosX * az.sinY * az.cosZ) + (az.sinX * az.sinZ)) + az.y * ((az.sinX * az.cosZ) - (az.cosX * az.sinY * az.sinZ )) + az.z * (az.cosX * az.cosY);

    az.x = az.x_end;
    az.y = az.y_end;
    az.z = az.z_end;

    az.x = (az.x * FOCALE) / (az.z + PROFONDEUR);
    az.y = (az.y * FOCALE) / (az.z + PROFONDEUR);

    offset_pixel = REPERE - (LENGTH * (int)az.y) + (int)az.x;

    if (offset_pixel < LENGTH*WIDTH && offset_pixel >= 0)
        if ((az.x <= LENGTH/2 && az.x >= -LENGTH/2) && (az.y <= WIDTH/2 && az.y >= -WIDTH/2))
            WindowsD[offset_pixel] = color;
}

But is it possible to get my putpixel.kernel.asm with gcc or icc just with auto vectorisation ?

In summary, i fall down at 30 fps with 103_998 vertex with current workspace (gcc x64,avx instruction, packed single manipulate) than 60 fps with last workspace (gcc x86, fpu instruction, packed single manipulate)

Finnaly when i run into visual studio, i don't get it why fps is 180 fps oO, here the asm code of put pixel:

put_object PROC

; 188  : {

$LN16:

; 189  : 	while (size--)

	test	edx, edx
	je	$LN14@put_object
	mov	rax, rsp
	mov	QWORD PTR [rax+8], rbx
	mov	QWORD PTR [rax+24], rsi
	push	rdi
	sub	rsp, 192				; 000000c0H
	movaps	XMMWORD PTR [rax-24], xmm6
	movsdx	xmm6, QWORD PTR __real@3f91df469963e11d
	movaps	XMMWORD PTR [rax-40], xmm7

; 188  : {

	mov	rsi, r8
	mov	ebx, edx
	mov	rdi, rcx
	movaps	XMMWORD PTR [rax-56], xmm8
	movaps	XMMWORD PTR [rax-72], xmm9
	movaps	XMMWORD PTR [rax-88], xmm10
	movss	xmm9, DWORD PTR rotation_object
	cvtps2pd xmm9, xmm9
	movaps	XMMWORD PTR [rax-104], xmm11
	movaps	XMMWORD PTR [rax-120], xmm12
	movaps	XMMWORD PTR [rsp+64], xmm13
	mulsd	xmm9, xmm6

; 52   : 	az.cosX = cos(DEG2RAD(rotation_object[0]));

	movaps	xmm0, xmm9
	movaps	XMMWORD PTR [rsp+48], xmm14
	movaps	XMMWORD PTR [rsp+32], xmm15
	call	cos
	movss	xmm8, DWORD PTR rotation_object+4
	xorps	xmm14, xmm14
	cvtps2pd xmm8, xmm8
	cvtsd2ss xmm14, xmm0
	mulsd	xmm8, xmm6

; 53   : 	az.cosY = cos(DEG2RAD(rotation_object[1]));

	movaps	xmm0, xmm8
	call	cos
	movss	xmm7, DWORD PTR rotation_object+8
	xorps	xmm15, xmm15
	cvtps2pd xmm7, xmm7
	cvtsd2ss xmm15, xmm0
	mulsd	xmm7, xmm6

; 54   : 	az.cosZ = cos(DEG2RAD(rotation_object[2]));

	movaps	xmm0, xmm7
	call	cos
	xorps	xmm12, xmm12
	cvtsd2ss xmm12, xmm0

; 55   : 	az.sinX = sin(DEG2RAD(rotation_object[0]));

	movaps	xmm0, xmm9
	call	sin
	xorps	xmm3, xmm3
	cvtsd2ss xmm3, xmm0

; 56   : 	az.sinY = sin(DEG2RAD(rotation_object[1]));

	movaps	xmm0, xmm8
	movss	DWORD PTR az$4$[rsp], xmm3
	call	sin
	xorps	xmm6, xmm6
	cvtsd2ss xmm6, xmm0

; 57   : 	az.sinZ = sin(DEG2RAD(rotation_object[2]));

	movaps	xmm0, xmm7
	movss	DWORD PTR az$5$[rsp], xmm6
	call	sin

; 52   : 	az.cosX = cos(DEG2RAD(rotation_object[0]));

	movss	xmm11, DWORD PTR __real@44000000
	movss	xmm9, DWORD PTR __real@c4000000
	xorps	xmm13, xmm13
	lea	rcx, QWORD PTR [rdi+8]
	lea	r9, OFFSET FLAT:WindowsD
	movss	xmm10, DWORD PTR __real@43c80000
	movss	xmm8, DWORD PTR __real@c3c80000
	cvtsd2ss xmm13, xmm0
	npad	8
$LL2@put_object:

; 190  : 	{
; 191  : 		put_pixel(*(object + 0), *(object + 1), *(object + 2), *(object + 3), repere);

	movss	xmm5, DWORD PTR [rcx]
	movss	xmm3, DWORD PTR [rcx-4]
	movss	xmm4, DWORD PTR [rcx-8]

; 66   : 	az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ)) - az.z * (az.sinX * az.cosY);

	movaps	xmm1, xmm6
	movaps	xmm0, xmm13
	movaps	xmm7, xmm12

; 189  : 	while (size--)

	dec	ebx

; 58   : 
; 59   : 	az.x = x;
; 60   : 	az.y = y;
; 61   : 	az.z = z;
; 62   : 	az.x += repere[0];
; 63   : 	az.y += repere[1];
; 64   : 	az.z += repere[2];
; 65   : 	az.x_end = az.x * ((az.cosY * az.cosZ)) - az.y * ((az.cosY * az.sinZ)) - az.z * (az.sinY);

	mulss	xmm0, xmm15
	mulss	xmm7, xmm15

; 190  : 	{
; 191  : 		put_pixel(*(object + 0), *(object + 1), *(object + 2), *(object + 3), repere);

	cvttss2si r8d, DWORD PTR [rcx+4]

; 55   : 	az.sinX = sin(DEG2RAD(rotation_object[0]));

	movss	xmm2, DWORD PTR az$4$[rsp]

; 56   : 	az.sinY = sin(DEG2RAD(rotation_object[1]));

	movss	DWORD PTR az+28, xmm6

; 66   : 	az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ)) - az.z * (az.sinX * az.cosY);

	mulss	xmm1, xmm2
	movss	DWORD PTR az+24, xmm2
	movss	DWORD PTR az, xmm4
	movss	DWORD PTR az+4, xmm3
	movss	DWORD PTR az+8, xmm5
	movss	DWORD PTR az+12, xmm14
	movss	DWORD PTR az+16, xmm15
	movss	DWORD PTR az+20, xmm12
	movss	DWORD PTR az+32, xmm13
	addss	xmm4, DWORD PTR [rsi]
	mulss	xmm7, xmm4
	movss	DWORD PTR az, xmm4
	addss	xmm3, DWORD PTR [rsi+4]
	movss	DWORD PTR az+4, xmm3
	addss	xmm5, DWORD PTR [rsi+8]
	mulss	xmm0, xmm3
	subss	xmm7, xmm0
	movaps	xmm0, xmm5
	mulss	xmm0, xmm6
	movaps	xmm6, xmm13
	mulss	xmm6, xmm14
	subss	xmm7, xmm0
	movaps	xmm0, xmm1
	mulss	xmm0, xmm12
	mulss	xmm1, xmm13
	movss	DWORD PTR az+36, xmm7
	subss	xmm6, xmm0
	movaps	xmm0, xmm12

; 67   : 	az.z_end = az.x * ((az.cosX * az.sinY * az.cosZ) + (az.sinX * az.sinZ)) + az.y * ((az.sinX * az.cosZ) - (az.cosX * az.sinY * az.sinZ)) + az.z * (az.cosX * az.cosY);
; 68   : 
; 69   : 	az.x = az.x_end;
; 70   : 	az.y = az.y_end;
; 71   : 	az.z = az.z_end;
; 72   : 
; 73   : 	az.x = (az.x * FOCALE) / (az.z + PROFONDEUR);

	mulss	xmm7, DWORD PTR __real@44480000
	mulss	xmm0, xmm14
	mulss	xmm6, xmm4
	addss	xmm1, xmm0
	movaps	xmm0, xmm2
	mulss	xmm0, xmm15
	mulss	xmm1, xmm3
	mulss	xmm2, xmm12
	addss	xmm6, xmm1
	movss	xmm1, DWORD PTR az$5$[rsp]
	mulss	xmm0, xmm5
	mulss	xmm1, xmm14
	subss	xmm6, xmm0
	movaps	xmm0, xmm1
	mulss	xmm0, xmm13
	mulss	xmm1, xmm12
	movss	DWORD PTR az+40, xmm6
	subss	xmm2, xmm0
	movaps	xmm0, xmm13
	mulss	xmm0, DWORD PTR az$4$[rsp]

; 74   : 	az.y = (az.y * FOCALE) / (az.z + PROFONDEUR);

	mulss	xmm6, DWORD PTR __real@44480000
	mulss	xmm2, xmm3
	addss	xmm1, xmm0
	movaps	xmm0, xmm15
	mulss	xmm1, xmm4
	mulss	xmm0, xmm14
	addss	xmm2, xmm1
	mulss	xmm0, xmm5
	addss	xmm2, xmm0
	movss	DWORD PTR az+44, xmm2
	movss	DWORD PTR az+8, xmm2
	addss	xmm2, DWORD PTR __real@44fa0000
	divss	xmm6, xmm2
	divss	xmm7, xmm2

; 75   : 
; 76   : 	offset_pixel = REPERE - (LENGTH * (int)az.y) + (int)az.x;

	cvttss2si eax, xmm6
	cvttss2si edx, xmm7
	movss	DWORD PTR az, xmm7
	shl	eax, 10
	sub	edx, eax
	add	edx, 409087				; 00063dffH
	movss	DWORD PTR az+4, xmm6

; 77   : 
; 78   : 	if (offset_pixel < LENGTH*WIDTH && offset_pixel >= 0)

	cmp	edx, 819199				; 000c7fffH
	ja	SHORT $LN5@put_object

; 79   : 		if ((az.x <= LENGTH / 2 && az.x >= -LENGTH / 2) && (az.y <= WIDTH / 2 && az.y >= -WIDTH / 2))

	comiss	xmm11, xmm7
	jb	SHORT $LN5@put_object
	comiss	xmm7, xmm9
	jb	SHORT $LN5@put_object
	comiss	xmm10, xmm6
	jb	SHORT $LN5@put_object
	comiss	xmm6, xmm8
	jb	SHORT $LN5@put_object

; 80   : 			WindowsD[offset_pixel] = color;

	movsxd	rax, edx
	mov	DWORD PTR [r9+rax*4], r8d
$LN5@put_object:

; 189  : 	while (size--)

	movss	xmm6, DWORD PTR az$5$[rsp]

; 192  : 		object += 4;

	add	rcx, 16
	test	ebx, ebx
	jne	$LL2@put_object

; 193  : 	}
; 194  : }

	movaps	xmm15, XMMWORD PTR [rsp+32]
	movaps	xmm14, XMMWORD PTR [rsp+48]
	movaps	xmm13, XMMWORD PTR [rsp+64]
	lea	r11, QWORD PTR [rsp+192]
	mov	rbx, QWORD PTR [r11+16]
	mov	rsi, QWORD PTR [r11+32]
	movaps	xmm12, XMMWORD PTR [rsp+80]
	movaps	xmm11, XMMWORD PTR [rsp+96]
	movaps	xmm10, XMMWORD PTR [rsp+112]
	movaps	xmm9, XMMWORD PTR [rsp+128]
	movaps	xmm8, XMMWORD PTR [rsp+144]
	movaps	xmm7, XMMWORD PTR [rsp+160]
	movaps	xmm6, XMMWORD PTR [rsp+176]
	mov	rsp, r11
	pop	rdi
$LN14@put_object:
	ret	0
put_object ENDP
don't get it, really, and if i put /arch:AVX option, i fall down at 160 fps, 20 frame less :/

and it's the same thing, only work on one packed single:

put_object PROC

; 188  : {

$LN16:

; 189  : 	while (size--)

	test	edx, edx
	je	$LN14@put_object
	mov	rax, rsp
	mov	QWORD PTR [rax+8], rbx
	mov	QWORD PTR [rax+24], rsi
	push	rdi
	sub	rsp, 208				; 000000d0H
	vmovss	xmm0, DWORD PTR rotation_object
	vmovaps	XMMWORD PTR [rax-24], xmm6
	vmovsd	xmm6, QWORD PTR __real@3f91df469963e11d
	vmovaps	XMMWORD PTR [rax-40], xmm7
	vmovaps	XMMWORD PTR [rax-56], xmm8
	vmovaps	XMMWORD PTR [rax-72], xmm9

; 188  : {

	mov	rsi, r8
	mov	ebx, edx
	mov	rdi, rcx
	vcvtps2pd xmm0, xmm0
	vmovaps	XMMWORD PTR [rax-88], xmm10
	vmovaps	XMMWORD PTR [rax-104], xmm11
	vmovaps	XMMWORD PTR [rax-120], xmm12
	vmulsd	xmm8, xmm0, xmm6
	vmovaps	XMMWORD PTR [rsp+80], xmm13

; 52   : 	az.cosX = cos(DEG2RAD(rotation_object[0]));

	vmovaps	xmm0, xmm8
	vmovaps	XMMWORD PTR [rsp+64], xmm14
	vmovaps	XMMWORD PTR [rsp+48], xmm15
	call	cos
	vmovss	xmm1, DWORD PTR rotation_object+4
	vcvtps2pd xmm1, xmm1
	_vcvtsd2ss2 xmm15, xmm0
	vmulsd	xmm7, xmm1, xmm6

; 53   : 	az.cosY = cos(DEG2RAD(rotation_object[1]));

	vmovups	xmm0, xmm7
	call	cos
	_vcvtsd2ss2 xmm1, xmm0
	vmovss	DWORD PTR az$2$[rsp], xmm1
	vmovss	xmm1, DWORD PTR rotation_object+8
	vcvtps2pd xmm1, xmm1
	vmulsd	xmm6, xmm1, xmm6

; 54   : 	az.cosZ = cos(DEG2RAD(rotation_object[2]));

	vmovups	xmm0, xmm6
	call	cos
	_vcvtsd2ss2 xmm13, xmm0

; 55   : 	az.sinX = sin(DEG2RAD(rotation_object[0]));

	vmovups	xmm0, xmm8
	call	sin
	_vcvtsd2ss2 xmm8, xmm0

; 56   : 	az.sinY = sin(DEG2RAD(rotation_object[1]));

	vmovups	xmm0, xmm7
	vmovss	DWORD PTR az$4$[rsp], xmm8
	call	sin
	_vcvtsd2ss2 xmm7, xmm0

; 57   : 	az.sinZ = sin(DEG2RAD(rotation_object[2]));

	vmovups	xmm0, xmm6
	vmovss	DWORD PTR az$5$[rsp], xmm7
	call	sin

; 52   : 	az.cosX = cos(DEG2RAD(rotation_object[0]));

	vmovss	xmm12, DWORD PTR __real@44000000
	vmovss	xmm10, DWORD PTR __real@c4000000
	lea	rcx, QWORD PTR [rdi+8]
	lea	r9, OFFSET FLAT:WindowsD
	vmovss	xmm11, DWORD PTR __real@43c80000
	vmovss	xmm9, DWORD PTR __real@c3c80000
	_vcvtsd2ss2 xmm14, xmm0
	npad	13
$LL2@put_object:

; 190  : 	{
; 191  : 		put_pixel(*(object + 0), *(object + 1), *(object + 2), *(object + 3), repere);

	vmovss	xmm1, DWORD PTR [rcx-4]
	vmovss	xmm2, DWORD PTR [rcx]
	vmovss	xmm0, DWORD PTR [rcx-8]
	dec	ebx
	vcvttss2si r8d, DWORD PTR [rcx+4]

; 53   : 	az.cosY = cos(DEG2RAD(rotation_object[1]));

	vmovss	xmm4, DWORD PTR az$2$[rsp]

; 58   : 
; 59   : 	az.x = x;
; 60   : 	az.y = y;

	vmovss	DWORD PTR az+4, xmm1
	vmovss	DWORD PTR az, xmm0

; 61   : 	az.z = z;

	vmovss	DWORD PTR az+8, xmm2
	vmovss	DWORD PTR az+16, xmm4
	vmovss	DWORD PTR az+12, xmm15
	vmovss	DWORD PTR az+20, xmm13
	vmovss	DWORD PTR az+24, xmm8
	vmovss	DWORD PTR az+28, xmm7
	vmovss	DWORD PTR az+32, xmm14

; 62   : 	az.x += repere[0];

	vaddss	xmm8, xmm0, DWORD PTR [rsi]

; 63   : 	az.y += repere[1];
; 64   : 	az.z += repere[2];
; 65   : 	az.x_end = az.x * ((az.cosY * az.cosZ)) - az.y * ((az.cosY * az.sinZ)) - az.z * (az.sinY);

	vmulss	xmm0, xmm13, xmm4
	vmulss	xmm3, xmm0, xmm8
	vmovss	DWORD PTR az, xmm8
	vaddss	xmm5, xmm1, DWORD PTR [rsi+4]
	vmulss	xmm1, xmm14, xmm4
	vmovss	DWORD PTR az+4, xmm5
	vaddss	xmm7, xmm2, DWORD PTR [rsi+8]
	vmulss	xmm2, xmm1, xmm5
	vmovss	xmm1, DWORD PTR az$5$[rsp]
	vsubss	xmm3, xmm3, xmm2

; 66   : 	az.y_end = az.x * ((az.cosX * az.sinZ) - (az.sinX * az.sinY * az.cosZ)) + az.y * ((az.sinX * az.sinY * az.sinZ) + (az.cosX * az.cosZ)) - az.z * (az.sinX * az.cosY);

	vmulss	xmm2, xmm1, DWORD PTR az$4$[rsp]
	vmulss	xmm0, xmm7, xmm1
	vmulss	xmm1, xmm14, xmm15
	vsubss	xmm6, xmm3, xmm0
	vmovss	DWORD PTR az+36, xmm6
	vmulss	xmm0, xmm2, xmm13
	vmulss	xmm2, xmm2, xmm14
	vsubss	xmm1, xmm1, xmm0
	vmulss	xmm3, xmm1, xmm8
	vmulss	xmm0, xmm13, xmm15
	vaddss	xmm1, xmm2, xmm0
	vmulss	xmm0, xmm4, DWORD PTR az$4$[rsp]
	vmulss	xmm2, xmm1, xmm5
	vaddss	xmm3, xmm3, xmm2

; 67   : 	az.z_end = az.x * ((az.cosX * az.sinY * az.cosZ) + (az.sinX * az.sinZ)) + az.y * ((az.sinX * az.cosZ) - (az.cosX * az.sinY * az.sinZ)) + az.z * (az.cosX * az.cosY);

	vmulss	xmm2, xmm15, DWORD PTR az$5$[rsp]
	vmulss	xmm1, xmm0, xmm7
	vsubss	xmm4, xmm3, xmm1
	vmovss	DWORD PTR az+40, xmm4
	vmulss	xmm1, xmm13, DWORD PTR az$4$[rsp]
	vmulss	xmm0, xmm2, xmm14
	vmulss	xmm2, xmm2, xmm13
	vsubss	xmm1, xmm1, xmm0
	vmulss	xmm0, xmm14, DWORD PTR az$4$[rsp]
	vmulss	xmm3, xmm1, xmm5
	vaddss	xmm1, xmm2, xmm0
	vmulss	xmm0, xmm15, DWORD PTR az$2$[rsp]
	vmulss	xmm2, xmm1, xmm8
	vaddss	xmm3, xmm3, xmm2
	vmulss	xmm1, xmm0, xmm7

; 68   : 
; 69   : 	az.x = az.x_end;
; 70   : 	az.y = az.y_end;
; 71   : 	az.z = az.z_end;
; 72   : 
; 73   : 	az.x = (az.x * FOCALE) / (az.z + PROFONDEUR);

	vmulss	xmm0, xmm6, DWORD PTR __real@44480000
	vaddss	xmm2, xmm3, xmm1
	vaddss	xmm3, xmm2, DWORD PTR __real@44fa0000
	vmovss	DWORD PTR az+44, xmm2
	vmovss	DWORD PTR az+8, xmm2

; 74   : 	az.y = (az.y * FOCALE) / (az.z + PROFONDEUR);

	vmulss	xmm1, xmm4, DWORD PTR __real@44480000
	vdivss	xmm5, xmm0, xmm3
	vdivss	xmm0, xmm1, xmm3

; 75   : 
; 76   : 	offset_pixel = REPERE - (LENGTH * (int)az.y) + (int)az.x;

	vcvttss2si eax, xmm0
	vcvttss2si edx, xmm5
	vmovss	DWORD PTR az, xmm5
	shl	eax, 10
	sub	edx, eax
	add	edx, 409087				; 00063dffH
	vmovss	DWORD PTR az+4, xmm0

; 77   : 
; 78   : 	if (offset_pixel < LENGTH*WIDTH && offset_pixel >= 0)

	cmp	edx, 819199				; 000c7fffH
	ja	SHORT $LN5@put_object

; 79   : 		if ((az.x <= LENGTH / 2 && az.x >= -LENGTH / 2) && (az.y <= WIDTH / 2 && az.y >= -WIDTH / 2))

	vcomiss	xmm12, xmm5
	jb	SHORT $LN5@put_object
	vcomiss	xmm5, xmm10
	jb	SHORT $LN5@put_object
	vcomiss	xmm11, xmm0
	jb	SHORT $LN5@put_object
	vcomiss	xmm0, xmm9
	jb	SHORT $LN5@put_object

; 80   : 			WindowsD[offset_pixel] = color;

	movsxd	rax, edx
	mov	DWORD PTR [r9+rax*4], r8d
$LN5@put_object:

; 189  : 	while (size--)

	vmovss	xmm8, DWORD PTR az$4$[rsp]
	vmovss	xmm7, DWORD PTR az$5$[rsp]

; 192  : 		object += 4;

	add	rcx, 16
	test	ebx, ebx
	jne	$LL2@put_object

; 193  : 	}
; 194  : }

	vmovaps	xmm15, XMMWORD PTR [rsp+48]
	vmovaps	xmm14, XMMWORD PTR [rsp+64]
	vmovaps	xmm13, XMMWORD PTR [rsp+80]
	lea	r11, QWORD PTR [rsp+208]
	mov	rbx, QWORD PTR [r11+16]
	mov	rsi, QWORD PTR [r11+32]
	vmovaps	xmm12, XMMWORD PTR [rsp+96]
	vmovaps	xmm11, XMMWORD PTR [rsp+112]
	vmovaps	xmm10, XMMWORD PTR [rsp+128]
	vmovaps	xmm9, XMMWORD PTR [rsp+144]
	vmovaps	xmm8, XMMWORD PTR [rsp+160]
	vmovaps	xmm7, XMMWORD PTR [rsp+176]
	vmovaps	xmm6, XMMWORD PTR [rsp+192]
	mov	rsp, r11
	pop	rdi
$LN14@put_object:
	ret	0
put_object ENDP

But anyway, the code product by visual studio est more readable than asm's gcc :o and more fast, 180 fps vs 30.

i will integrate intel compiler later.

And for c++, i don't know what's going on if i translate into c++.

>>>I had try SoA for this code, but don't take any effect:>>>

You should either try to use Array of structures or Structure of Arrays in order to force compiler to use auto-vectorization. Of course SoA is preffered because of linear data layou of arrays. Try to align data on L1D cache line size that's mean 64-bytes in order to help hardware prefetchers to load full cache line. Moreover try to align data also on page boundary 4KB.

can't put avx2, cause i don't have required processor.

here's my project build with visual studio 2013:

Downloadapplication/zip SDL2.zip

The object model are a little modified, i don't use vertex texture and other thing, and i had add 4th coord: color

x, y, z, color, x, ect

And i use it include it directly in source code by:

float     object3D[size_object]={

                                  #include <object.obj>

                                  }

more easier to manage object3D for me.

#include    "object.obj"*

 

>>>But is it possible to get my putpixel.kernel.asm with gcc or icc just with auto vectorisation >>>

Try to rewrite  putpixel function  in order to work on SoA data. Use "restrict" keyword while declaring pointers  do not use RAW or WAR that's mean Read after Write or Write after Read. Try to eliminate branching inside the for loops.

a[i] = a[i] + x; // do not use it

a[i] = a[i+1] + x; // do not use it.

>>>can't put avx2, cause i don't have required procesor>>>

Do you have Core i7 IvyBridge CPU?

Your code should benefit from Haswell CPU AVX2 ISA mainly because  of FMA units on Port0 and Port1. If compiler could  emit FMA instruction you could speed up execution of polynomial like code  which contains additions and multiplications.So you can achieve 16 DP FP/cycle/core.

 result = (a + b) * (c + d)

I will later test your code on my Core i5 Haswell machine.

AoS : fail :/

When you talk about aligned data, do you mean the way how data are store in RAM ? i know high level language fragment data, and in assember we work always with aligned data, i triy to put static like my object: rhino, but steel mulss :/

But i'm curious what is asm code if i calculate rotation matrix without unroll matrix.

How do you align data also on page boundary 4KB ?

"restrict" pointer is like static array no ?

on intel compiler i run on 210 fps :o

>>>When you talk about aligned data, do you mean the way how data are store in RAM >>>

Yes and usually compiler will arange array linearly as opposed to allocation of objects on the heap or to allocation of linked list which of course will be allocated during the runtime.

i'm on laptop with core i7 sandy bridge..
 

I have brought (good word ?^^) 250+ fps, with testing some option intel compiler, althoug the avx instruction use XMM register, and 32 first bit for compute:

 

;;; 	static float coord[3];
;;; 	static float end_coord[3];
;;; 
;;; 	static float cosX;
;;; 	static float cosY;
;;; 	static float cosZ;
;;; 	static float sinX;
;;; 	static float sinY;
;;; 	static float sinZ;
;;; 	int     offset_pixel;
;;; 
;;; 	cosX = cos(DEG2RAD(rotation_object[0]));

        vxorpd    xmm3, xmm3, xmm3                              ;39.9
$LN3:
        vcvtss2sd xmm3, xmm3, DWORD PTR [rotation_object]       ;39.9
$LN4:
        vmovups   XMMWORD PTR [32+rsp], xmm15                   ;27.1
$LN5:
        mov       r14d, r9d                                     ;27.1
$LN6:
        vmovsd    xmm15, QWORD PTR [_2il0floatpacket.0]         ;39.13
$LN7:
        vmovups   XMMWORD PTR [112+rsp], xmm11                  ;27.1
$LN8:
        vmovaps   xmm11, xmm2                                   ;27.1
$LN9:
        vmovups   XMMWORD PTR [128+rsp], xmm10                  ;27.1
$LN10:
        vmovaps   xmm10, xmm1                                   ;27.1
$LN11:
        vmovups   XMMWORD PTR [144+rsp], xmm9                   ;27.1
$LN12:
        vmovaps   xmm9, xmm0                                    ;27.1
$LN13:
        vmulsd    xmm0, xmm3, xmm15                             ;39.9
$LN14:
        vmovups   XMMWORD PTR [48+rsp], xmm14                   ;27.1
$LN15:
        vmovups   XMMWORD PTR [64+rsp], xmm13                   ;27.1
$LN16:
        vmovups   XMMWORD PTR [96+rsp], xmm12                   ;27.1
$LN17:
        vmovups   XMMWORD PTR [80+rsp], xmm6                    ;27.1
$LN18:
        call      __libm_sse2_sincos                            ;39.9
$LN19:
                                ; LOE rbx rbp rsi rdi r12 r13 r15 r14d xmm0 xmm1 xmm7 xmm8 xmm9 xmm10 xmm11 xmm15
.B1.12::                        ; Preds .B1.1
$LN20:

;;; 	cosY = cos(DEG2RAD(rotation_object[1]));

        vxorpd    xmm2, xmm2, xmm2                              ;40.9
$LN21:
        vmovapd   xmm14, xmm0                                   ;39.9
$LN22:
        vcvtss2sd xmm2, xmm2, DWORD PTR [rotation_object+4]     ;40.9
$LN23:
        vcvtsd2ss xmm13, xmm1, xmm1                             ;39.2
$LN24:
        vmulsd    xmm0, xmm15, xmm2                             ;40.9
$LN25:
        vmovss    DWORD PTR [cosX.5146.0.1], xmm13              ;39.2
$LN26:
        call      __libm_sse2_sincos                            ;40.9
$LN27:
                                ; LOE rbx rbp rsi rdi r12 r13 r15 r14d xmm0 xmm1 xmm7 xmm8 xmm9 xmm10 xmm11 xmm13 xmm14 xmm15
.B1.11::                        ; Preds .B1.12
$LN28:

;;; 	cosZ = cos(DEG2RAD(rotation_object[2]));

        vxorpd    xmm2, xmm2, xmm2                              ;41.9
$LN29:
        vmovapd   xmm6, xmm0                                    ;40.9
$LN30:
        vcvtss2sd xmm2, xmm2, DWORD PTR [rotation_object+8]     ;41.9
$LN31:
        vcvtsd2ss xmm12, xmm1, xmm1                             ;40.2
$LN32:
        vmulsd    xmm0, xmm15, xmm2                             ;41.9
$LN33:
        vmovss    DWORD PTR [cosY.5146.0.1], xmm12              ;40.2
$LN34:
        call      __libm_sse2_sincos                            ;41.9

 

Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 1: Basic Architecture

 

14.1.2  Instruction Syntax Enhancements

Intel AVX employs an instruction encoding scheme using a new prefix (known as “VEX” prefix). Instruction
encoding using the VEX prefix can directly encode a register operand within the VEX prefix. This support two new
instruction syntax in Intel 64 architecture:

• A non-destructive operand (in a three-operand instruction syntax): The non-destructive source reduces the
number of registers, register-register copies and explicit load operations required in typical SSE loops, reduces
code size, and improves micro-fusion opportunities.

• A third source operand (in a four-operand instruction syntax) via the upper 4 bits in an 8-bit immediate field.
Support for the third source operand is defined for selected instructions (e.g. VBLENDVPD, VBLENDVPS,
PBLENDVB).

Two-operand instruction syntax previously expressed in legacy SSE instruction as

ADDPS xmm1, xmm2/m128

128-bit AVX equivalent can be expressed in three-operand syntax as

VADDPS xmm1, xmm2, xmm3/m128

In four-operand syntax, the extra register operand is encoded in the immediate byte.
Note SIMD instructions supporting three-operand syntax but processing only 128-bits of data are considered part
of the 256-bit SIMD instruction set extensions of AVX, because bits 255:128 of the destination register are zeroed
by the processor.

I think it's impossible to full auto vectorise data, except maybe with Intrinsic Functions ...

I have test with 8_000_000 vertex, it's run under 15 fps, it's good news cause the data's vectorisation is not full.

Downloadapplication/zip 3D engine-benchmark.zip

 

 

 

 

>>>call      __libm_sse2_sincos    >>>

Force compiler to inline functions calls.                       

>>>"restrict" pointer is like static array no ?>>>

Please read description here: http://stackoverflow.com/questions/2005473/rules-for-using-the-restrict-keyword-in-c

>>>I think it's impossible to full auto vectorise data,>>>

I think that it's mainly depends on compiler analysis of the code in order to exploit vectorization. At least data accesses should fit SSE or AVX  registers length. There should not be some kind of interdependency between the vectorized data or between data load/store.Moreover compiler will try to calculate theoretical speedup of the vectorization and will try to asses if vectorization will provide the same final result when compared to serialized code.

>>> vmovups   XMMWORD PTR [48+rsp], xmm14   >>>

Can you check with GDB the content of xmm14 register? You should look for 4 SP FP data.               

typedef struct a 


02 { 


03   


04     float x; 


05   


06     float y; 


07   


08     float z; 


09   


10     float cosX; 


11     float cosY; 


12     float cosZ; 


13     float sinX; 


14     float sinY; 


15     float sinZ; 


16     float x_end; 


17     float y_end; 


18     float z_end; 


19   


20 }a; 

I think that you cannot force compiler to vectorize code when your struct members are single variables. You need to operate on float array members.

For  vmovups   XMMWORD PTR [48+rsp], xmm14, icl do that for store SMID registrer only, but don't vector calculation, try to found mulps/haddps, ect, and you will don't see any Vector calculation.

I will try assembler inline.

 

And for allgned/unaligned memory, there is a big surprise, look that code:

        vmovaps   xmm6, xmm1                                    ;195.28
$LN1464:
        vmovups   XMMWORD PTR [1152+rsp], xmm7                  ;195.28

it mean intel compiler transfer aligned memory to an register, but transfer unaligned memory to an variable memory.

In summary we can't do vmovaps XMMWORD PTR [1152+rsp], xmm7 :/

but i don't understand, aligned value in memory is for vector calculation yes ?

So why it's work when i do unaligned memory transfer and vector calculation on them:

;====================================================================================================
;FONCTIONS		FONCTIONS		  FONCTIONS	    	FONCTIONS	    	FONCTIONS	
;====================================================================================================			
; make_rotations:

		;=============
		; yaw
		;=============
			Yaw:	; y
				; On applique la rotation au point	|[esi + 0] = x
				;									|[esi + 4] = y
				;									|[esi + 8] = z
				; On calcule x = x.cos(phi.y) * cos(phi.z) - y.cos(phi.y) * sin(phi.z) - z.sin(phi.y)
				;
				; On calcule  A = x.cos(phi.y), B = y.cos(phi.y) et C = z.sin(phi.y)
					movups	xmm0, [_xmm2 + 4]
					movups	xmm1, [coordonee]
					mulps	xmm0, xmm1

				; On calcule D = A * cos(phi.z), E = B * sin(phi.z) et C = C * 1
					movups	xmm1, [_xmm1 + 8]
					mulps	xmm0, xmm1

				; On calcule F = D - E, C = C - 0
					hsubps	xmm0, xmm0
				
				; On calcule xmm0 = F - C
					hsubps	xmm0, xmm0
										
				; On modifie x selon selon le rapport entre x et y pour que x soit proportionnelle à y 
					movd	xmm1, [rapport]
					divps	xmm0, xmm1
					
				; On save la new coordonée
					movd	[_x], xmm0

		;=============
		; / yaw
		;=============	
	
		;=============
		; pitch
		;=============
			Pitch:	; x
				; On applique la rotation au point	|[esi + 0] = x
				;									|[esi + 4] = y
				;									|[esi + 8] = z
				; On calcule y = x.(cos(phi.x) * sin(phi.z) - sin(phi.x) * cos(phi.z) * sin(phi.y)) + 
				;				 y.(sin(phi.x) * sin(phi.z) * sin(phi.y) + cos(phi.x) * cos(phi.z)) - 
				;				 z.(sin(phi.x) * cos(phi.y))
				;
				; On calcule A = cos(phi.x) * sin(phi.z), B = sin(phi.x) * cos(phi.z), E = cos(phi.x) * cos(phi.z) et F = sin(phi.x) * sin(phi.z)
					movddup xmm0, [_xmm0 + 8]
					movups 	xmm1, [_xmm1]
					mulps	xmm0, xmm1

				; on sauve xmm0 dans xmm7 pour le copier dans xmm0 de Roll car l'equation de y ressemblent a l'equation de z mis a part que la valeur sin(phi.y) est 
				; multiplié par d'autres equations

				; On calcule C' = A' * sin(phi.y) et G' = E' * sin(phi.y)
					movddup	xmm7, [_xmm2 + 12]
					mulps	xmm7, xmm0		
					
				; On calcule C = B * sin(phi.y) et G = F * sin(phi.y)
					movddup	xmm2, [_xmm2 + 16]
					mulps	xmm0, xmm2
					
				; Copie le contenu du haut (64..127) d'un paquet de valeurs réel de simple précision (4*32 bits) dans sa partie basse (0..31).
				; En somme on separe les deux partie x et y:	xmm0 =	A) cos(phi.x) * sin(phi.z)								xmm0 =	cos(phi.x) * sin(phi.z) 					
				;											 			C) sin(phi.x) * cos(phi.z) * sin(phi.y) 			=>			sin(phi.x) * sin(phi.y) * cos(phi.z)
				;														E) cos(phi.x) * cos(phi.z)								xmm1 =	cos(phi.x) * cos(phi.z) 
				;														G) sin(phi.x) * sin(phi.z) * sin(phi.y)							sin(phi.x) * sin(phi.y) * sin(phi.z) 
					movhlps xmm1, xmm0
					 
				; On calcule D = A - C
					hsubps xmm0, xmm0
					
				; On calcule H = E + G					
					haddps xmm1, xmm1
 
				; On calcule sin(phi.x) * cos(phi.y) et cos(phi.x) * cos(phi.y)
				;
				; On calcule I.roll = cos(phi.x) * cos(phi.y) et I.Pitch = sin(phi.x) * cos(phi.y) 
					movlps		xmm3, [_xmm0 + 8]
					movlps		xmm2, [_xmm2 + 4]
					mulps		xmm2, xmm3
					movshdup 	xmm3, xmm2
				; On calcule x.D + y.H - z.I
				;
				; On calcule J = x.D, K = y.H et L = z.I
					movups		xmm5, [coordonee]
					movsldup	xmm4, xmm1	; y.H
					movss		xmm4, xmm0	; x.D
					movlhps 	xmm4, xmm3	; z.I.Pitch
					mulps		xmm4, xmm5
					
				; On calcule M = J + K
					haddps	xmm4, xmm4
					
				; On calcule N = M - L
					hsubps	xmm4, xmm4
					
				; On save la new coordonée
					movd	[_y], xmm4
					
		;=============
		; / pitch
		;=============
		;=============
		; roll
		;=============
			Roll:	; z	
				; On applique la rotation au point	|[esi + 0] = x
				;									|[esi + 4] = y
				;									|[esi + 8] = z
				; On calcule z' = x.(cos(phi.x) * cos(phi.z) * sin(phi.y) + sin(phi.x) * sin(phi.z)) + 
				;				  y.(sin(phi.x) * cos(phi.z) - cos(phi.x) * sin(phi.z) * sin(phi.y)) +
				;				  z.(cos(phi.x) * cos(phi.y))
				;			
				; Copie le contenu du haut (64..127) d'un paquet de valeurs réel de simple précision (4*32 bits) dans sa partie basse (0..31).
				; En somme on separe les deux partie x et y:	xmm7 =	C') cos(phi.x) * sin(phi.z) * sin(phi.y)				xmm7 =	C') cos(phi.x) * sin(phi.z) * sin(phi.y))
				;											 			B') sin(phi.x) * cos(phi.z)						 =>				B') sin(phi.x) * cos(phi.z)
				;														G') cos(phi.x) * cos(phi.z) * sin(phi.y)				xmm1 =	G') cos(phi.x) * cos(phi.z) * sin(phi.y)
				;														F') sin(phi.x) * sin(phi.z)										F') sin(phi.x) * sin(phi.z
					movhlps xmm1, xmm7
					
				; On calcule D' = -B' + C'
					movd	xmm6, [conv_signe]
					orps	xmm7, xmm6
					haddps	xmm7, xmm7
					
				; On calcule H' = G' + F'
					haddps	xmm1, xmm1		
					
				; On calcule x.D' + y.H' + z.I'
				;
				; On calcule J = x.D', K = y.H' et L = z.I'
					movups		xmm3, [coordonee]
					movsldup	xmm4, xmm7	; y.D'
					movss		xmm4, xmm1	; x.H'
					movlhps 	xmm4, xmm2	; z.I'
					mulps		xmm4, xmm3
					
				; On calcule M' = J' + K'
					haddps	xmm4, xmm4
					
				; On calcule N' = M' + L'
					haddps	xmm4, xmm4
		;=============
		; / roll
		;=============
; ret			
;====================================================================================================
;END_FONCTIONS		END_FONCTIONS		  END_FONCTIONS	    	END_FONCTIONS	    	END_FONCTIONS	
;====================================================================================================				
				

 

I still don't understand Data structure alignment, i read this on wikipedia:

Data structure alignment is the way data is arranged and accessed in computer memory. It consists of two separate but related issues: data alignment and data structure padding. When a modern computer reads from or writes to a memory address, it will do this in word sized chunks (e.g. 4 byte chunks on a 32-bit system) or larger.

why this system ? [0x0000_0000] point on first byte address memory and [0x0000_0001] point on second byte

mov       eax, [0x0000_0000]     ; begin to store 4 byte after the first byte of RAM to eax

mov       eax, [0x0000_0001]     ; begin to store 4 byte after the second byte of RAM to eax

can we disable this memory management, and access to data address byte after byte ?

For clear_screen i use the same algorithme like in my kernel.asm:

void        clear_screen(void)
{
	int     loop = 0;
	while (loop < LENGTH*WIDTH)
	{
		WindowsD[loop++] = 0;
	}
}
	;=============
	 ; void clear_screen (void)
	 ; Clear screen
	 ; Entrée : None
	 ; Sotie: Screen
	 ; Destroyed: edi
	;=============	 
	clear_screen:
		mov		edi, [PhysBasePtr]					
		mov		ecx, (WIDTH*LENGTH*4)/16
		; vxorps	ymm1, ymm1			; 256 bit instruction !		
		xorps	xmm0, xmm0
		clear_s:
			; vmovdqu	[edi], ymm1		; 256 bit instruction !		
			movdqu 	[edi], xmm0		
			add		edi, 16
		loop	clear_s
	ret
	;===============
	; / clear_screen
	;===============

But the one is fastest than second :/

Here's the assembly code from c:

        lea       rcx, QWORD PTR [WindowsD]                     ;190.3
$LN709:
        xor       edx, edx                                      ;190.3
$LN710:
        mov       r8d, 8294400                                  ;190.3
$LN711:
        call      _intel_fast_memset                            ;190.3

(mov           r8d,  8294400 is for loop instruction: while(r8d--) { .. .}, 8294400 is the size of WindowsD who translate by LENGTH*WIDTH.

(xor            edx, edx is the value who will clear memory)

for call      _intel_fast_memset, i don't acces to this code, but it do probably:

while(r8d--)
{
     [rcx] = edx;     / fill pixel location at rcx by 0x0000_0000
     rcx += 4;
}

 

页面

发表评论

登录添加评论。还不是成员?立即加入