# AMD K6 mpn_lshift -- mpn left shift.
#
# K6: 1.75 cycles/limb


# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


# mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
#                       unsigned shift);
#
# Shift src,size left by shift many bits and store the result in dst,size.
# Zeros are shifted in at the right.  Return the bits shifted out at the
# left.

# Stack arguments, named in the order of the C prototype: dst, src, size,
# shift at offsets 4, 8, 12 and 16.
# NOTE(review): defframe and deflit come from config.m4; the offsets are
# presumably relative to %esp at function entry and adjusted by the current
# FRAME value (FRAME is redefined to 4 after the pushl in the function
# body) -- confirm against config.m4.
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

# VAR_RETVAL is an alias for the PARAM_SRC stack slot.  The function body
# copies src into %ebx before it writes the return value there.
dnl  used after src has been fetched
define(VAR_RETVAL,`PARAM_SRC')

# Sizes of at least UNROLL_THRESHOLD limbs take the unrolled MMX loop,
# smaller sizes (two or more limbs) take the simple loop.  The value set
# here is 12, which is above the stated minimum of 9.
dnl  minimum 9, because unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 12)

	.text
	ALIGN(32)

PROLOGUE(mpn_lshift)
	# mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
	#                       unsigned shift);
	#
	# In:     stack arguments PARAM_DST, PARAM_SRC, PARAM_SIZE, PARAM_SHIFT
	# Out:    eax = bits shifted out of the high limb,
	#               i.e. src[size-1] >> (32-shift)
	# Saved:  ebx (pushed on entry, popped on every exit path)
	# Clobb:  ecx, edx, flags, mm0, mm1, mm2, mm6, mm7, and the PARAM_SRC
	#         stack slot (reused as VAR_RETVAL)
	#
	# Three paths are used, selected on size:
	#   size == 1                        integer shldl/shll
	#   2 <= size < UNROLL_THRESHOLD     L(simple), one limb per pass
	#   size >= UNROLL_THRESHOLD         L(unroll), four limbs per pass
	#
	# All paths work from the high limb downwards.
	#
	# NOTE(review): size == 0 is not handled; decl would give -1 and the
	# two_or_more code would read src[-1].  The code also appears to
	# assume 1 <= shift <= 31 (the return value is formed with a shrl by
	# 32-shift) -- confirm against the documented mpn_lshift interface.
	#
	# The 1 limb case can be done without the push %ebx, but it's then
	# still the same speed.  The push is left as a free helping hand for
	# the two_or_more code.

	movl	PARAM_SIZE, %eax
	pushl	%ebx
deflit(`FRAME',4)

	movl	PARAM_SRC, %ebx
	decl	%eax			# eax = size-1, ZF set iff size == 1

	movl	PARAM_SHIFT, %ecx	# movl leaves the flags from decl alone
	jnz	L(two_or_more)

	# size == 1, and eax is 0 here

	movl	(%ebx), %edx		# src limb
	movl	PARAM_DST, %ebx

	shldl(	%cl, %edx, %eax)	# return value

 	shll	%cl, %edx

	movl	%edx, (%ebx)		# dst limb
	popl	%ebx

	ret


#------------------------------------------------------------------------------
	ALIGN(16)	# avoid offset 0x1f
L(two_or_more):
	# eax	size-1
	# ebx	src
	# ecx	shift
	# edx
	#
	# Form the return value from the high limb, save it in VAR_RETVAL,
	# set mm6 to the shift count, then choose between the simple and the
	# unrolled loop.

	movl	(%ebx,%eax,4), %edx	# src high limb
	negl	%ecx

	movd	PARAM_SHIFT, %mm6
	addl	$32, %ecx		# ecx = 32-shift

	shrl	%cl, %edx		# retval, bits shifted out at the left
	cmpl	$UNROLL_THRESHOLD-1, %eax

	movl	%edx, VAR_RETVAL	# src is already in ebx, slot is free
	jae	L(unroll)		# taken if size >= UNROLL_THRESHOLD


	# eax	size-1
	# ebx	src
	# ecx	32-shift
	# edx	retval
	#
	# mm6	shift

	movd	%ecx, %mm7
	movl	%eax, %ecx

	movl	PARAM_DST, %eax

L(simple):
	# eax	dst
	# ebx	src
	# ecx	counter, size-1 down to 1
	# edx	retval
	#
	# mm0	scratch
	# mm6	shift
	# mm7	32-shift
	#
	# Each pass loads the qword src[ecx-1],src[ecx], shifts it right by
	# 32-shift, and stores the low dword as dst[ecx].

	movq	-4(%ebx,%ecx,4), %mm0
 	psrlq	%mm7, %mm0

	movd	%mm0, (%eax,%ecx,4)
	loop	L(simple)


	# The low limb has no lower neighbour, so it is just shifted left.

	movd	(%ebx), %mm0		# src[0]
	popl	%ebx

 	psllq	%mm6, %mm0

	movd	%mm0, (%eax)		# dst[0]
	movl	%edx, %eax		# return value

	emms_or_femms
	ret


#------------------------------------------------------------------------------
# The strange offsets used on src and dst are due to the following,
# - needing no displacement (%ebx,%eax,4) and (%edx,%eax,4) first in the loop
# - needing the loop running %eax downwards and wanting to stop when %eax
#   goes negative
# - wanting to end up with %eax set to -1 to -4 so as to be able to test for
#   0-3 extras with test $2 and test $1

L(unroll):
	# eax	size-1
	# ebx	src
	# ecx	32-shift
	# edx	retval (but instead VAR_RETVAL is used)
	#
	# mm6	shift

	addl	$32, %ecx		# ecx = 64-shift
	movl	PARAM_DST, %edx

	movd	%ecx, %mm7
	subl	$7, %eax		# size-8

	leal	(%edx,%eax,4), %ecx	# alignment of dst
	addl	$40, %edx		# edx = dst+40, see note on offsets above

	movq	32-8(%ebx,%eax,4), %mm2		# src high qword
	testb	$4, %cl

	# If bit 2 of &dst[size-8] is set, the qword stores in the loop would
	# fall on addresses with bit 2 set.  In that case the high dst limb
	# is stored on its own with movd and eax is decremented, so that the
	# loop stores fall on addresses with bit 2 clear.

	jz	L(dst_aligned)
	psllq	%mm6, %mm2

	psrlq	$32, %mm2		# high dword of the shifted qword
	decl	%eax

	movd	%mm2, 32-40+4-4(%edx,%eax,4)	# dst high limb
	movq	32-8(%ebx,%eax,4), %mm2		# new src high qword
L(dst_aligned):

	movq	32-16(%ebx,%eax,4), %mm0	# src second highest qword
	addl	$24, %ebx		# ebx = src+24, see note on offsets above


	# This loop is the important bit, the rest is just support for it.
	# Four src limbs are held at the start, and four more will be read.
	# Four dst limbs will be written.
	#
	# The magic ingredients for speed here are the same as in rshift,
	#
	# - aligning the code to 32 bytes
	# - fitting the first 10 instructions into 32 bytes (the first fetch
	#   and store must have no displacements)
	# - the instruction scheduling shown


	# Offset 0x95 here, so use a jump to get to L(top) in one cycle.
	# This guards against executing through a bunch of nop's if a dumb
	# assembler doesn't generate multi-byte do-nothing instructions when
	# aligning.

	jmp L(top)

	ALIGN(32)
L(top):
	# eax	limb counter, stepped by -4 until the subl borrows
	# ebx	src + 24
	# ecx
	# edx	dst + 40
	#
	# mm0	src next qword
	# mm1	scratch
	# mm2	src prev qword
	# mm6	shift
	# mm7	64-shift
	#
	# Each dst qword is (prev << shift) | (next >> (64-shift)).

	psllq	%mm6, %mm2
	subl	$4, %eax		# sets the carry tested by jnc below

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	movq	(%ebx,%eax,4), %mm0

	psllq	%mm6, %mm1
	movq	%mm2, (%edx,%eax,4)

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm0, %mm1
	movq	-8(%ebx,%eax,4), %mm0

	movq	%mm1, -8(%edx,%eax,4)
	jnc	L(top)			# flags still from subl, MMX ops leave them


	# Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
	#
	# -16(%ebx,%eax,4) is the next source, and -16(%edx,%eax,4) is the
	# next destination.
	# %eax is between -4 and -1, representing respectively 0 to 3 extra
	# limbs that must be read.


	testb	$2, %al
	jz	L(finish_nottwo)

	# Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
	# new mm2 and a new mm0 is loaded.

	psllq	%mm6, %mm2
	movq	%mm0, %mm1

	psrlq	%mm7, %mm0
	subl	$2, %eax		# eax is now -4 or -3

	por	%mm0, %mm2
	movq	-8(%ebx,%eax,4), %mm0

	movq	%mm2, -8(%edx,%eax,4)
	movq	%mm1, %mm2
L(finish_nottwo):
	# eax	-4 if no limbs remain to be read, -3 if src[0] remains


	# lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0

	testb	$1, %al			# ZF is consumed by the jz below
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm0, %mm2
	psllq	%mm6, %mm1

	movq	%mm2, -16(%edx,%eax,4)
	jz	L(finish_even)


	# Size is odd, so mm1 and one extra limb to process.

	movd	-24(%ebx), %mm0		# src[0]
	popl	%ebx
deflit(`FRAME',0)

	movq	%mm0, %mm2
	psllq	$32, %mm0		# move src[0] to the high dword

	psrlq	%mm7, %mm0		# leaves src[0] >> (32-shift)

	psllq	%mm6, %mm2
	por	%mm0, %mm1

	movq	%mm1, 4-40(%edx)	# dst[1,2]
	movd	%mm2, -40(%edx)		# dst[0]

	movl	VAR_RETVAL, %eax	# return value, read after the popl (FRAME 0)

	emms_or_femms
	ret


L(finish_even):
deflit(`FRAME',4)
	# Size is even, so only mm1 left to process.

	movq	%mm1, -40(%edx)		# dst[0,1]
	movl	VAR_RETVAL, %eax	# return value, read before the popl (FRAME 4)

	popl	%ebx
	emms_or_femms
	ret

EPILOGUE()