[ardour-dev] AMD64 SSE optimisation

John Rigg ardev at sound-man.co.uk
Fri Dec 16 05:00:51 PST 2005


Hi

Here's a patch to allow Ardour's SSE optimising code to be used on x86_64.
I've got it running on my AMD64 system but haven't done any performance
tests yet. If anyone notices any glaring errors or wrong assumptions in
the patch, please let me know. BTW this is just a basic patch and will
break compilation on 32-bit systems.

John
-------------- next part --------------
diff -uprN ardour-0.99/libs/ardour/globals.cc ardour-sse64/libs/ardour/globals.cc
--- ardour-0.99/libs/ardour/globals.cc	2005-09-22 04:26:06.000000000 +0100
+++ ardour-sse64/libs/ardour/globals.cc	2005-12-16 11:32:46.000000000 +0000
@@ -193,15 +193,15 @@ ARDOUR::init (AudioEngine& engine, bool 
 		unsigned int use_sse = 0;
 
 		asm volatile (
-				 "mov $1, %%eax\n"
-				 "pushl %%ebx\n"
+				 "movq $1, %%rax\n"
+				 "pushq %%rbx\n"
 				 "cpuid\n"
-				 "popl %%ebx\n"
-				 "andl $33554432, %%edx\n"
-				 "movl %%edx, %0\n"
+				 "popq %%rbx\n"
+				 "andq $33554432, %%rdx\n"
+				 "movl %%edx, %0\n" // use_sse is a 32-bit int; a 64-bit movq here would store past it
 		 	     : "=m" (use_sse)
 	   		     : 
- 	    		 : "%eax", "%ecx", "%edx", "memory");
+ 	    		 : "%rax", "%rcx", "%rdx", "memory");
 
 		if (use_sse) {
 			cerr << "Enabling SSE optimized routines" << endl;
diff -uprN ardour-0.99/libs/ardour/mix.cc ardour-sse64/libs/ardour/mix.cc
--- ardour-0.99/libs/ardour/mix.cc	2005-09-22 04:26:06.000000000 +0100
+++ ardour-sse64/libs/ardour/mix.cc	2005-12-16 11:32:28.000000000 +0000
@@ -31,7 +31,7 @@
 float
 debug_compute_peak (ARDOUR::Sample *buf, jack_nframes_t nsamples, float current) 
 {
-	if ( ((int)buf % 16) != 0) {
+	if ( ((long int)buf % 16) != 0) {
 		cerr << "compute_peak(): buffer unaligned!" << endl;
 	}
 
@@ -41,7 +41,7 @@ debug_compute_peak (ARDOUR::Sample *buf,
 void
 debug_apply_gain_to_buffer (ARDOUR::Sample *buf, jack_nframes_t nframes, float gain)
 {
-	if ( ((int)buf % 16) != 0) {
+	if ( ((long int)buf % 16) != 0) {
 		cerr << "apply_gain_to_buffer(): buffer unaligned!" << endl;
 	}
 
@@ -51,11 +51,11 @@ debug_apply_gain_to_buffer (ARDOUR::Samp
 void
 debug_mix_buffers_with_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, jack_nframes_t nframes, float gain)
 {
-	if ( ((int)dst & 15) != 0) {
+	if ( ((long int)dst & 15) != 0) {
 		cerr << "mix_buffers_with_gain(): dst unaligned!" << endl;
 	}
 
-	if ( ((int)dst & 15) != ((int)src & 15) ) {
+	if ( ((long int)dst & 15) != ((long int)src & 15) ) {
 		cerr << "mix_buffers_with_gain(): dst & src don't have the same alignment!" << endl;
 		mix_buffers_with_gain(dst, src, nframes, gain);
 	} else {
@@ -66,11 +66,11 @@ debug_mix_buffers_with_gain (ARDOUR::Sam
 void
 debug_mix_buffers_no_gain (ARDOUR::Sample *dst, ARDOUR::Sample *src, jack_nframes_t nframes)
 {
-	if ( ((int)dst & 15) != 0) {
+	if ( ((long int)dst & 15) != 0) {
 		cerr << "mix_buffers_no_gain(): dst unaligned!" << endl;
 	}
 
-	if ( ((int)dst & 15) != ((int)src & 15) ) {
+	if ( ((long int)dst & 15) != ((long int)src & 15) ) {
 		cerr << "mix_buffers_no_gain(): dst & src don't have the same alignment!" << endl;
 		mix_buffers_no_gain(dst, src, nframes);
 	} else {
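
An aside on the casts above: (long int)buf relies on long being
pointer-sized, which holds on LP64 Linux/AMD64 but not on every 64-bit
platform (Win64 is LLP64, where long stays 32 bits). If wider
portability matters, uintptr_t from <stdint.h> is the safer spelling.
A sketch, not part of the patch:

#include <stdint.h>

/* nonzero if ptr sits on a 16-byte boundary, regardless of data model */
static inline int
is_16_byte_aligned (const void *ptr)
{
	return ((uintptr_t) ptr & 15) == 0;
}
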
diff -uprN ardour-0.99/libs/ardour/sse_functions.s ardour-sse64/libs/ardour/sse_functions.s
--- ardour-0.99/libs/ardour/sse_functions.s	2005-09-08 20:41:11.000000000 +0100
+++ ardour-sse64/libs/ardour/sse_functions.s	2005-12-16 11:33:10.000000000 +0000
@@ -25,95 +25,95 @@
 	.type	x86_sse_mix_buffers_with_gain,@function
 
 x86_sse_mix_buffers_with_gain:
-#; 8(%ebp)	= float	*dst 	= %edi
-#; 12(%ebp) = float *src	= %esi
-#; 16(%ebp) = long	nframes = %ecx
-#; 20(%ebp) = float	gain    = st(0)
+#; 8(%rbp)	= float	*dst 	= %rdi
+#; 12(%rbp) = float *src	= %rsi
+#; 16(%rbp) = long	nframes = %rcx
+#; 20(%rbp) = float	gain    = %st(0)
 
-	pushl %ebp
-	movl %esp, %ebp
+	pushq %rbp
+	movq %rsp, %rbp
 
 	#; save the registers
-#;	pushl %eax
-	pushl %ebx
-#;	pushl %ecx
-	pushl %edi
-	pushl %esi
+#;	pushq %rax
+	pushq %rbx
+#;	pushq %rcx
+	pushq %rdi
+	pushq %rsi
 	
 	#; if nframes == 0, go to end
-	movl 16(%ebp), %ecx #; nframes
-	cmp	$0, %ecx
+	movq 16(%rbp), %rcx #; nframes
+	cmp	$0, %rcx
 	je	.MBWG_END
 
 	#; Check for alignment
 
-	movl 8(%ebp), %edi  #; dst 
-	movl 12(%ebp), %esi #; src
+	movq 8(%rbp), %rdi  #; dst 
+	movq 12(%rbp), %rsi #; src
 
-	movl %edi, %eax
-	andl $12, %eax #; mask alignemnt offset
+	movq %rdi, %rax
+	andq $12, %rax #; mask alignment offset
 
-	movl %esi, %ebx
-	andl $12, %ebx #; mask alignment offset
+	movq %rsi, %rbx
+	andq $12, %rbx #; mask alignment offset
 
-	cmp %eax, %ebx
+	cmp %rax, %rbx
 	jne .MBWG_NONALIGN #; if not aligned, calculate manually
 
 	#; if we are aligned
-	cmp $0, %ebx
+	cmp $0, %rbx
 	jz .MBWG_SSE
 	
 	#; Pre-loop, we need to run 1-3 frames "manually" without
 	#; SSE instructions
 
-	movss 20(%ebp), %xmm1 #; xmm1
+	movss 20(%rbp), %xmm1 #; xmm1
 
 .MBWG_PRELOOP:
 	
-	movss (%esi), %xmm0
+	movss (%rsi), %xmm0
 	mulss %xmm1, %xmm0
-	addss (%edi), %xmm0
-	movss %xmm0, (%edi)
+	addss (%rdi), %xmm0
+	movss %xmm0, (%rdi)
 
-	addl $4, %edi #; dst++
-	addl $4, %esi #; src++
-	decl %ecx 	  #; nframes--
+	addq $4, %rdi #; dst++
+	addq $4, %rsi #; src++
+	decq %rcx 	  #; nframes--
 	jz .MBWG_END
 
-#;	cmp $0, %ecx
+#;	cmp $0, %rcx
 #;	je .MBWG_END #; if we run out of frames, go to end
 	
-	addl $4, %ebx
+	addq $4, %rbx
 	
-	cmp $16, %ebx #; test if we've reached 16 byte alignment
+	cmp $16, %rbx #; test if we've reached 16 byte alignment
 	jne .MBWG_PRELOOP
 
 
 .MBWG_SSE:
 
-	cmp $4, %ecx #; we know it's not zero, but if it's not >=4, then
+	cmp $4, %rcx #; we know it's not zero, but if it's not >=4, then
 	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
 
 	#; copy gain to fill %xmm1
-	movss   20(%ebp), %xmm1
+	movss   20(%rbp), %xmm1
     shufps  $0x00, %xmm1, %xmm1
 
 
 .MBWG_SSELOOP:
 
-	movaps	(%esi), %xmm0 #; source => xmm0
+	movaps	(%rsi), %xmm0 #; source => xmm0
 	mulps	%xmm1,  %xmm0 #; apply gain to source
-	addps	(%edi), %xmm0 #; mix with destination
-	movaps  %xmm0, (%edi) #; copy result to destination
+	addps	(%rdi), %xmm0 #; mix with destination
+	movaps  %xmm0, (%rdi) #; copy result to destination
 	
-	addl $16, %edi #; dst+=4
-	addl $16, %esi #; src+=4
+	addq $16, %rdi #; dst+=4
+	addq $16, %rsi #; src+=4
 
-	subl $4, %ecx #; nframes-=4
-	cmp $4, %ecx
+	subq $4, %rcx #; nframes-=4
+	cmp $4, %rcx
 	jge .MBWG_SSELOOP
 
-	cmp $0, %ecx
+	cmp $0, %rcx
 	je .MBWG_END
 
 	#; if there are remaining frames, the nonalign code will do nicely
@@ -122,28 +122,28 @@ x86_sse_mix_buffers_with_gain:
 .MBWG_NONALIGN:
 	#; not aligned!
 
-	movss 20(%ebp), %xmm1 #; gain => xmm1
+	movss 20(%rbp), %xmm1 #; gain => xmm1
 
 .MBWG_NONALIGNLOOP:
 
-	movss (%esi), %xmm0
+	movss (%rsi), %xmm0
 	mulss %xmm1, %xmm0
-	addss (%edi), %xmm0
-	movss %xmm0, (%edi)
+	addss (%rdi), %xmm0
+	movss %xmm0, (%rdi)
 	
-	addl $4, %edi
-	addl $4, %esi
+	addq $4, %rdi
+	addq $4, %rsi
 	
-	decl %ecx
+	decq %rcx
 	jnz .MBWG_NONALIGNLOOP
 
 .MBWG_END:
 
-	popl %esi
-	popl %edi
-#;	popl %ecx
-	popl %ebx
-#;	popl %eax
+	popq %rsi
+	popq %rdi
+#;	popq %rcx
+	popq %rbx
+#;	popq %rax
 	
 	#; return
 	leave
@@ -160,42 +160,42 @@ x86_sse_mix_buffers_with_gain:
 	.type	x86_sse_mix_buffers_no_gain,@function
 
 x86_sse_mix_buffers_no_gain:
-#; 8(%ebp)	= float	*dst 	= %edi
-#; 12(%ebp) = float *src	= %esi
-#; 16(%ebp) = long	nframes = %ecx
+#; 8(%rbp)	= float	*dst 	= %rdi
+#; 12(%rbp) = float *src	= %rsi
+#; 16(%rbp) = long	nframes = %rcx
 
-	pushl %ebp
-	movl %esp, %ebp
+	pushq %rbp
+	movq %rsp, %rbp
 
 	#; save the registers
-#;	pushl %eax
-	pushl %ebx
-#;	pushl %ecx
-	pushl %edi
-	pushl %esi
+#;	pushq %rax
+	pushq %rbx
+#;	pushq %rcx
+	pushq %rdi
+	pushq %rsi
 	
 	#; the real function
 
 	#; if nframes == 0, go to end
-	movl 16(%ebp), %ecx #; nframes
-	cmp	$0, %ecx
+	movq 16(%rbp), %rcx #; nframes
+	cmp	$0, %rcx
 	je	.MBNG_END
 
 	#; Check for alignment
 
-	movl 8(%ebp), %edi  #; dst 
-	movl 12(%ebp), %esi #; src
+	movq 8(%rbp), %rdi  #; dst 
+	movq 12(%rbp), %rsi #; src
 
-	movl %edi, %eax
-	andl $12, %eax #; mask alignemnt offset
+	movq %rdi, %rax
+	andq $12, %rax #; mask alignment offset
 
-	movl %esi, %ebx
-	andl $12, %ebx #; mask alignment offset
+	movq %rsi, %rbx
+	andq $12, %rbx #; mask alignment offset
 
-	cmp %eax, %ebx
+	cmp %rax, %rbx
 	jne .MBNG_NONALIGN #; if not aligned, calculate manually
 
-	cmp $0, %ebx
+	cmp $0, %rbx
 	je .MBNG_SSE
 
 	#; Pre-loop, we need to run 1-3 frames "manually" without
@@ -203,38 +203,38 @@ x86_sse_mix_buffers_no_gain:
 
 .MBNG_PRELOOP:
 		
-	movss (%esi), %xmm0
-	addss (%edi), %xmm0
-	movss %xmm0, (%edi)
-
-	addl $4, %edi #; dst++
-	addl $4, %esi #; src++
-	decl %ecx 	  #; nframes--
+	movss (%rsi), %xmm0
+	addss (%rdi), %xmm0
+	movss %xmm0, (%rdi)
+
+	addq $4, %rdi #; dst++
+	addq $4, %rsi #; src++
+	decq %rcx 	  #; nframes--
 	jz	.MBNG_END
-	addl $4, %ebx
+	addq $4, %rbx
 	
-	cmp $16, %ebx #; test if we've reached 16 byte alignment
+	cmp $16, %rbx #; test if we've reached 16 byte alignment
 	jne .MBNG_PRELOOP
 
 .MBNG_SSE:
 
-	cmp $4, %ecx #; if there are frames left, but less than 4
+	cmp $4, %rcx #; if there are frames left, but less than 4
 	jnge .MBNG_NONALIGN #; we can't run SSE
 
 .MBNG_SSELOOP:
 
-	movaps	(%esi), %xmm0 #; source => xmm0
-	addps	(%edi), %xmm0 #; mix with destination
-	movaps  %xmm0, (%edi) #; copy result to destination
+	movaps	(%rsi), %xmm0 #; source => xmm0
+	addps	(%rdi), %xmm0 #; mix with destination
+	movaps  %xmm0, (%rdi) #; copy result to destination
 	
-	addl $16, %edi #; dst+=4
-	addl $16, %esi #; src+=4
+	addq $16, %rdi #; dst+=4
+	addq $16, %rsi #; src+=4
 
-	subl $4, %ecx #; nframes-=4
-	cmp $4, %ecx
+	subq $4, %rcx #; nframes-=4
+	cmp $4, %rcx
 	jge .MBNG_SSELOOP
 
-	cmp $0, %ecx
+	cmp $0, %rcx
 	je .MBNG_END
 
 	#; if there are remaining frames, the nonalign code will do nicely
@@ -243,23 +243,23 @@ x86_sse_mix_buffers_no_gain:
 .MBNG_NONALIGN:
 	#; not aligned!
 
-	movss (%esi), %xmm0 #; src => xmm0
-	addss (%edi), %xmm0 #; xmm0 += dst
-	movss %xmm0, (%edi) #; xmm0 => dst
+	movss (%rsi), %xmm0 #; src => xmm0
+	addss (%rdi), %xmm0 #; xmm0 += dst
+	movss %xmm0, (%rdi) #; xmm0 => dst
 	
-	addl $4, %edi
-	addl $4, %esi
+	addq $4, %rdi
+	addq $4, %rsi
 	
-	decl %ecx
+	decq %rcx
 	jnz .MBNG_NONALIGN
 
 .MBNG_END:
 
-	popl %esi
-	popl %edi
-#;	popl %ecx
-	popl %ebx
-#;	popl %eax
+	popq %rsi
+	popq %rdi
+#;	popq %rcx
+	popq %rbx
+#;	popq %rax
 	
 	#; return
 	leave
@@ -276,110 +276,110 @@ x86_sse_mix_buffers_no_gain:
 	.type	x86_sse_apply_gain_to_buffer,@function
 
 x86_sse_apply_gain_to_buffer:
-#; 8(%ebp)	= float	*buf 	= %edi
-#; 12(%ebp) = long	nframes = %ecx
-#; 16(%ebp) = float	gain    = st(0)
+#; 8(%rbp)	= float	*buf 	= %rdi
+#; 12(%rbp) = long	nframes = %rcx
+#; 16(%rbp) = float	gain    = %st(0)
 
-	pushl %ebp
-	movl %esp, %ebp
+	pushq %rbp
+	movq %rsp, %rbp
 
-	#; save %edi
-	pushl %edi
+	#; save %rdi
+	pushq %rdi
 	
 	#; the real function
 
 	#; if nframes == 0, go to end
-	movl 12(%ebp), %ecx #; nframes
-	cmp	$0, %ecx
+	movq 12(%rbp), %rcx #; nframes
+	cmp	$0, %rcx
 	je	.AG_END
 
 	#; create the gain buffer in %xmm1
-	movss	16(%ebp), %xmm1
+	movss	16(%rbp), %xmm1
 	shufps	$0x00, %xmm1, %xmm1
 	
 	#; Check for alignment
 
-	movl 8(%ebp), %edi #; buf 
-	movl %edi, %edx #; buf => %edx
-	andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+	movq 8(%rbp), %rdi #; buf 
+	movq %rdi, %rdx #; buf => %rdx
+	andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 	jz	.AG_SSE #; if buffer IS aligned
 
 	#; PRE-LOOP
 	#; we iterate 1-3 times, doing normal x87 float comparison
-	#; so we reach a 16 byte aligned "buf" (=%edi) value
+	#; so we reach a 16 byte aligned "buf" (=%rdi) value
 
 .AGLP_START:
 
 	#; Load next value from the buffer
-	movss (%edi), %xmm0
+	movss (%rdi), %xmm0
 	mulss %xmm1, %xmm0
-	movss %xmm0, (%edi)
+	movss %xmm0, (%rdi)
 
 	#; increment buffer, decrement counter
-	addl $4, %edi #; buf++;
+	addq $4, %rdi #; buf++;
 	
-	decl %ecx   #; nframes--
+	decq %rcx   #; nframes--
 	jz	.AG_END #; if we run out of frames, we go to the end
 	
-	addl $4, %edx #; one non-aligned byte less
-	cmp $16, %edx
+	addq $4, %rdx #; one non-aligned byte less
+	cmp $16, %rdx
 	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
 
 .AG_SSE:
 
-	#; We have reached the 16 byte aligned "buf" ("edi") value
+	#; We have reached the 16 byte aligned "buf" ("rdi") value
 
 	#; Figure out how many loops we should do
-	movl %ecx, %eax #; copy remaining nframes to %eax for division
-	movl $0, %edx   #; 0 the edx register
+	movq %rcx, %rax #; copy remaining nframes to %rax for division
+	movq $0, %rdx   #; 0 the rdx register
 	
 	
-	pushl %edi
-	movl $4, %edi
-	divl %edi #; %edx = remainder == 0
-	popl %edi
+	pushq %rdi
+	movq $4, %rdi
+	divq %rdi #; %rdx = remainder == 0
+	popq %rdi
 
-	#; %eax = SSE iterations
-	cmp $0, %eax
+	#; %rax = SSE iterations
+	cmp $0, %rax
 	je .AGPOST_START
 
 	
 .AGLP_SSE:
 
-	movaps (%edi), %xmm0
+	movaps (%rdi), %xmm0
 	mulps %xmm1, %xmm0
-	movaps %xmm0, (%edi)
+	movaps %xmm0, (%rdi)
 
-	addl $16, %edi
-#;	subl $4, %ecx   #; nframes-=4
+	addq $16, %rdi
+#;	subq $4, %rcx   #; nframes-=4
 
-	decl %eax
+	decq %rax
 	jnz .AGLP_SSE
 
 	#; Next we need to post-process all remaining frames
-	#; the remaining frame count is in %ecx
+	#; the remaining frame count is in %rcx
 	
 	#; if no remaining frames, jump to the end
-#;	cmp $0, %ecx
-	andl $3, %ecx #; nframes % 4
+#;	cmp $0, %rcx
+	andq $3, %rcx #; nframes % 4
 	je .AG_END
 
 .AGPOST_START:
 
-	movss (%edi), %xmm0
+	movss (%rdi), %xmm0
 	mulss %xmm1, %xmm0
-	movss %xmm0, (%edi)
+	movss %xmm0, (%rdi)
 
 	#; increment buffer, decrement counter
-	addl $4, %edi #; buf++;
+	addq $4, %rdi #; buf++;
 	
-	decl %ecx   #; nframes--
+	decq %rcx   #; nframes--
 	jnz	.AGPOST_START #; if we run out of frames, we go to the end
 	
 .AG_END:
 
 
-	popl %edi
+	popq %rdi
 	
 	#; return
 	leave
@@ -400,24 +400,24 @@ abs_mask:
 
 	
 x86_sse_compute_peak:
-#; 8(%ebp)	= float	*buf 	= %edi
-#; 12(%ebp) = long	nframes = %ecx
-#; 16(%ebp) = float	current = st(0)
+#; 8(%rbp)	= float	*buf 	= %rdi
+#; 12(%rbp) = long	nframes = %rcx
+#; 16(%rbp) = float	current = %st(0)
 
-	pushl %ebp
-	movl %esp, %ebp
+	pushq %rbp
+	movq %rsp, %rbp
 
-	#; save %edi
-	pushl %edi
+	#; save %rdi
+	pushq %rdi
 	
 	#; the real function
 
 	#; Load "current" in xmm0
-	movss 16(%ebp), %xmm0
+	movss 16(%rbp), %xmm0
 
 	#; if nframes == 0, go to end
-	movl 12(%ebp), %ecx #; nframes
-	cmp	$0, %ecx
+	movq 12(%rbp), %rcx #; nframes
+	cmp	$0, %rcx
 	je	.CP_END
 
 	#; create the "abs" mask in %xmm2
@@ -426,58 +426,58 @@ x86_sse_compute_peak:
 
 	#; Check for alignment
 
-	movl 8(%ebp), %edi #; buf 
-	movl %edi, %edx #; buf => %edx
-	andl $12, %edx #; mask bits 1 & 2, result = 0, 4, 8 or 12
+	movq 8(%rbp), %rdi #; buf 
+	movq %rdi, %rdx #; buf => %rdx
+	andq $12, %rdx #; mask bits 1 & 2, result = 0, 4, 8 or 12
 	jz	.CP_SSE #; if buffer IS aligned
 
 	#; PRE-LOOP
 	#; we iterate 1-3 times, doing normal x87 float comparison
-	#; so we reach a 16 byte aligned "buf" (=%edi) value
+	#; so we reach a 16 byte aligned "buf" (=%rdi) value
 
 .LP_START:
 
 	#; Load next value from the buffer
-	movss (%edi), %xmm1
+	movss (%rdi), %xmm1
 	andps %xmm2, %xmm1
 	maxss %xmm1, %xmm0
 
 	#; increment buffer, decrement counter
-	addl $4, %edi #; buf++;
+	addq $4, %rdi #; buf++;
 	
-	decl %ecx   #; nframes--
+	decq %rcx   #; nframes--
 	jz	.CP_END #; if we run out of frames, we go to the end
 	
-	addl $4, %edx #; one non-aligned byte less
-	cmp $16, %edx
+	addq $4, %rdx #; one non-aligned byte less
+	cmp $16, %rdx
 	jne .LP_START #; if more non-aligned frames exist, we do a do-over
 
 .CP_SSE:
 
-	#; We have reached the 16 byte aligned "buf" ("edi") value
+	#; We have reached the 16 byte aligned "buf" ("rdi") value
 
 	#; Figure out how many loops we should do
-	movl %ecx, %eax #; copy remaining nframes to %eax for division
+	movq %rcx, %rax #; copy remaining nframes to %rax for division
 
-	shr $2,%eax #; unsigned divide by 4
+	shr $2,%rax #; unsigned divide by 4
 	jz .POST_START
 
-	#; %eax = SSE iterations
+	#; %rax = SSE iterations
 
 	#; current maximum is at %xmm0, but we need to ..
 	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
 
-	#;prefetcht0 16(%edi)
+	#;prefetcht0 16(%rdi)
 
 .LP_SSE:
 
-	movaps (%edi), %xmm1
+	movaps (%rdi), %xmm1
 	andps %xmm2, %xmm1
 	maxps %xmm1, %xmm0
 
-	addl $16, %edi
+	addq $16, %rdi
 
-	decl %eax
+	decq %rax
 	jnz .LP_SSE
 
 	#; Calculate the maximum value contained in the 4 FP's in %xmm0
@@ -491,31 +491,31 @@ x86_sse_compute_peak:
 	#; now every float in %xmm0 is the same value, current maximum value
 	
 	#; Next we need to post-process all remaining frames
-	#; the remaining frame count is in %ecx
+	#; the remaining frame count is in %rcx
 	
 	#; if no remaining frames, jump to the end
 
-	andl $3, %ecx #; nframes % 4
+	andq $3, %rcx #; nframes % 4
 	jz .CP_END
 
 .POST_START:
 
-	movss (%edi), %xmm1
+	movss (%rdi), %xmm1
 	andps %xmm2, %xmm1
 	maxss %xmm1, %xmm0
 	
-	addl $4, %edi 	#; buf++;
+	addq $4, %rdi 	#; buf++;
 	
-	decl %ecx		#; nframes--;
+	decq %rcx		#; nframes--;
 	jnz .POST_START
 
 .CP_END:
 
 	#; Load the value from xmm0 to the float stack for returning
-	movss %xmm0, 16(%ebp)
-	flds 16(%ebp)
+	movss %xmm0, 16(%rbp)
+	flds 16(%rbp)
 
-	popl %edi
+	popq %rdi
 	
 	#; return
 	leave
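
Since the assembly is fairly dense, here is roughly what
x86_sse_compute_peak computes, restated with SSE intrinsics. This is a
simplified sketch, not part of the patch: it assumes a 16-byte aligned
buffer and nframes divisible by 4, while the real code above also
handles the unaligned pre- and post-loops:

#include <string.h>
#include <xmmintrin.h>	/* SSE intrinsics */

float
compute_peak_sse (const float *buf, long nframes, float current)
{
	/* 0x7fffffff clears the sign bit: andps == fabsf, four floats at a time */
	const unsigned int m = 0x7fffffffU;
	float mf;
	__m128 mask, peak;
	long i;

	memcpy (&mf, &m, sizeof mf);
	mask = _mm_set1_ps (mf);
	peak = _mm_set1_ps (current);

	for (i = 0; i < nframes; i += 4) {
		__m128 x = _mm_load_ps (buf + i); /* movaps: needs 16-byte alignment */
		x = _mm_and_ps (x, mask);         /* andps */
		peak = _mm_max_ps (peak, x);      /* maxps */
	}

	/* horizontal max across the four lanes, like the shufps/maxps
	   sequence in the .s file */
	peak = _mm_max_ps (peak, _mm_shuffle_ps (peak, peak, 0x4e)); /* swap halves */
	peak = _mm_max_ps (peak, _mm_shuffle_ps (peak, peak, 0xb1)); /* swap pairs */

	return _mm_cvtss_f32 (peak);
}
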
diff -uprN ardour-0.99/SConstruct ardour-sse64/SConstruct
--- ardour-0.99/SConstruct	2005-09-24 03:53:13.000000000 +0100
+++ ardour-sse64/SConstruct	2005-12-16 11:32:00.000000000 +0000
@@ -36,8 +36,8 @@ opts.AddOptions(
     PathOption('PREFIX', 'Set the install "prefix"', '/usr/local'),
     BoolOption('VST', 'Compile with support for VST', 0),
     BoolOption('VERSIONED', 'Add version information to ardour/gtk executable name inside the build directory', 0),
-    BoolOption('USE_SSE_EVERYWHERE', 'Ask the compiler to use x86/SSE instructions and also our hand-written x86/SSE optimizations when possible (off by default)', 0),
-    BoolOption('BUILD_SSE_OPTIMIZATIONS', 'Use our hand-written x86/SSE optimizations when possible (off by default)', 0)
+    BoolOption('USE_SSE_EVERYWHERE', 'Ask the compiler to use x86/SSE instructions and also our hand-written x86/SSE optimizations when possible (on by default)', 1),
+    BoolOption('BUILD_SSE_OPTIMIZATIONS', 'Use our hand-written x86/SSE optimizations when possible (on by default)', 1)
   )
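
A note on the SConstruct change: flipping those defaults turns the SSE
build on for everyone, including non-x86 targets. The same effect can be
had per-build from the scons command line, e.g.

	scons USE_SSE_EVERYWHERE=1 BUILD_SSE_OPTIMIZATIONS=1

so a final version of the patch could leave the defaults at 0.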
 
 

