# AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
#
# K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.


# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


ifdef(`OPERATION_add_n', `
        define(M4_inst,        adcl)
        define(M4_function_n,  mpn_add_n)
        define(M4_function_nc, mpn_add_nc)
        define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
        define(M4_inst,        sbbl)
        define(M4_function_n,  mpn_sub_n)
        define(M4_function_nc, mpn_sub_nc)
        define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


`#' mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
`#'                          mp_size_t size);
`#' mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
`#'                           mp_size_t size, mp_limb_t carry);
`#'
`#' Calculate src1,size M4_description src2,size, and store the result in
`#' dst,size.  The return value is the carry bit from the top of the result
`#' (1 or 0).
#
# The _nc version accepts 1 or 0 for an initial carry into the low limb of
# the calculation.  Note values other than 1 or 0 here will lead to garbage
# results.
#
# Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
# an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
# loop control, which with 4 limbs/loop means an extra 0.25 c/l.

define(PARAM_CARRY, `FRAME+20(%esp)')
define(PARAM_SIZE,  `FRAME+16(%esp)')
define(PARAM_SRC2,  `FRAME+12(%esp)')
define(PARAM_SRC1,  `FRAME+8(%esp)')
define(PARAM_DST,   `FRAME+4(%esp)')
deflit(`FRAME',0)

dnl  minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)

        .text
        ALIGN(32)

PROLOGUE(M4_function_nc)
        movl    PARAM_CARRY, %eax
        jmp     LF(M4_function_n,start)
EPILOGUE()


PROLOGUE(M4_function_n)
        xorl    %eax, %eax
L(start):
        movl    PARAM_SIZE, %ecx
        pushl   %ebx
FRAME_pushl()

        movl    PARAM_SRC1, %ebx
        pushl   %edi
FRAME_pushl()

        movl    PARAM_SRC2, %edx
        cmpl    $UNROLL_THRESHOLD, %ecx

        movl    PARAM_DST, %edi
        jae     L(unroll)

        shrl    %eax            # initial carry flag

        # offset 0x21 here, close enough to aligned
L(simple):
        # eax   scratch
        # ebx   src1
        # ecx   counter
        # edx   src2
        # esi
        # edi   dst
        # ebp
        #
        # The store to (%edi) could be done with a stosl; it'd be smaller
        # code, but there's no speed gain and a cld would have to be added
        # (per mpn/x86/README.family).
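        #
        # In rough C terms each iteration computes (an illustrative sketch
        # only; OP stands for the + or - selected by M4_inst, and cy is the
        # carry flag, which adcl/sbbl both consume and set):
        #
        #       t = *src1++;                    /* movl  (%ebx), %eax   */
        #       (cy, t) = t OP *src2++ OP cy;   /* M4_inst (%edx), %eax */
        #       *dst++ = t;                     /* movl  %eax, (%edi)   */
        #
        # The pointer updates use leal rather than addl because leal leaves
        # the carry flag intact from one M4_inst to the next, and loop
        # likewise doesn't touch it.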
        movl    (%ebx), %eax
        leal    4(%ebx), %ebx

        M4_inst (%edx), %eax

        movl    %eax, (%edi)
        leal    4(%edi), %edi

        leal    4(%edx), %edx

        loop    L(simple)

        movl    $0, %eax        # mov doesn't affect flags
        popl    %edi

        setc    %al             # return carry, 1 or 0

        popl    %ebx
        ret


#------------------------------------------------------------------------------
L(unroll):
        # eax   carry
        # ebx   src1
        # ecx   counter
        # edx   src2
        # esi
        # edi   dst
        # ebp

        cmpl    %edi, %ebx
        pushl   %esi

        je      L(inplace)

ifdef(`OPERATION_add_n',`
        cmpl    %edi, %edx

        je      L(inplace_reverse)
')

        movl    %ecx, %esi

        andl    $~3, %ecx       # size rounded down to a multiple of 4
        andl    $3, %esi        # size mod 4, handled after the loop

        leal    (%ebx,%ecx,4), %ebx
        leal    (%edx,%ecx,4), %edx
        leal    (%edi,%ecx,4), %edi

        negl    %ecx            # loop counts up from -(size & ~3) to 0
        shrl    %eax            # initial carry flag

        ALIGN(32)
L(normal_top):
        # eax   scratch
        # ebx   src1
        # ecx   counter, limbs, negative
        # edx   src2
        # esi
        # edi   dst
        # ebp
        #
        # leal 5(%ecx) plus the decrement in loop advances %ecx by a net
        # 4 limbs per iteration; the -20 displacements compensate for the
        # early add of 5.

        movl    (%ebx,%ecx,4), %eax
        leal    5(%ecx), %ecx

        M4_inst -20(%edx,%ecx,4), %eax
        movl    %eax, -20(%edi,%ecx,4)

        movl    4-20(%ebx,%ecx,4), %eax
        M4_inst 4-20(%edx,%ecx,4), %eax
        movl    %eax, 4-20(%edi,%ecx,4)

        movl    8-20(%ebx,%ecx,4), %eax
        M4_inst 8-20(%edx,%ecx,4), %eax
        movl    %eax, 8-20(%edi,%ecx,4)

        movl    12-20(%ebx,%ecx,4), %eax
        M4_inst 12-20(%edx,%ecx,4), %eax
        movl    %eax, 12-20(%edi,%ecx,4)

        loop    L(normal_top)


        decl    %esi            # decl doesn't disturb the carry flag
        jz      L(normal_finish_one)    # with %ecx zero from the loop
        js      L(normal_done)

        # two or three more limbs

        movl    (%ebx), %eax
        M4_inst (%edx), %eax
        movl    %eax, (%edi)

        movl    4(%ebx), %eax
        M4_inst 4(%edx), %eax
        decl    %esi
        movl    %eax, 4(%edi)

        jz      L(normal_done)
        movl    $2, %ecx        # third remaining limb is at index 2

L(normal_finish_one):
        movl    (%ebx,%ecx,4), %eax
        M4_inst (%edx,%ecx,4), %eax
        movl    %eax, (%edi,%ecx,4)

L(normal_done):
        popl    %esi
        popl    %edi

        movl    $0, %eax
        popl    %ebx

        setc    %al             # return carry, 1 or 0

        ret


#------------------------------------------------------------------------------
ifdef(`OPERATION_add_n',`
L(inplace_reverse):
        # dst==src2 (addition is commutative, so this becomes dst += src1)

        movl    %ebx, %edx
')

L(inplace):
        # eax   initial carry
        # ebx
        # ecx   size
        # edx   src
        # esi
        # edi   dst
        # ebp

        leal    -1(%ecx), %esi
        decl    %ecx

        andl    $~3, %ecx
        andl    $3, %esi

        movl    (%edx), %ebx    # src low limb

        leal    (%edx,%ecx,4), %edx
        leal    (%edi,%ecx,4), %edi

        negl    %ecx
        shrl    %eax            # initial carry flag

        ALIGN(32)
L(inplace_top):
        # eax   scratch
        # ebx   next src limb
        # ecx   counter, limbs, negative
        # edx   src
        # esi
        # edi   dst
        # ebp

        M4_inst %ebx, (%edi,%ecx,4)

        movl    4(%edx,%ecx,4), %eax
        leal    5(%ecx), %ecx

        M4_inst %eax, 4-20(%edi,%ecx,4)

        movl    8-20(%edx,%ecx,4), %eax
        movl    12-20(%edx,%ecx,4), %ebx

        M4_inst %eax, 8-20(%edi,%ecx,4)
        M4_inst %ebx, 12-20(%edi,%ecx,4)

        movl    16-20(%edx,%ecx,4), %ebx        # load ahead for next iteration

        loop    L(inplace_top)


        # now %esi is 0 to 3 representing respectively 1 to 4 limbs more

        M4_inst %ebx, (%edi)

        decl    %esi
        jz      L(inplace_finish_one)
        js      L(inplace_done)

        # two or three more limbs

        movl    4(%edx), %eax
        movl    8(%edx), %ebx
        M4_inst %eax, 4(%edi)
        M4_inst %ebx, 8(%edi)

        decl    %esi
        movl    $2, %ecx

        jz      L(normal_done)  # shares the identical epilogue above

L(inplace_finish_one):
        movl    4(%edx,%ecx,4), %eax
        M4_inst %eax, 4(%edi,%ecx,4)

L(inplace_done):
        popl    %esi
        popl    %edi

        movl    $0, %eax
        popl    %ebx

        setc    %al             # return carry, 1 or 0

        ret

EPILOGUE()
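
#------------------------------------------------------------------------------
# For reference, a C-level usage sketch of the entrypoints generated here
# (illustrative only; shown for an OPERATION_add_n build, which yields
# mpn_add_n and mpn_add_nc per the prototypes in the header comment):
#
#       mp_limb_t x[4], y[4], sum[4];
#       mp_limb_t cy;
#       /* ... fill x and y ... */
#       cy = mpn_add_n (sum, x, y, 4);          /* sum = x + y, cy is 0 or 1 */
#       cy = mpn_add_nc (sum, x, y, 4, cy);     /* same, with carry-in cy    */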