# AMD K6 mpn_rshift -- mpn right shift.
#
# K6: 1.75 cycles/limb


# Copyright (C) 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


# mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
#                       unsigned shift);
#
# Shift src,size right by shift many bits and store the result in dst,size.
# Zeros are shifted in at the left.  Return the bits shifted out at the
# right.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)
deflit(`FRAME',0)

dnl  Minimum 9, because the unrolled loop can't handle less.
dnl  Doesn't depend on femms_available_p because there's the same
dnl  emms_or_femms in both the simple and unrolled.
deflit(UNROLL_THRESHOLD, 9)

        .text
        ALIGN(32)

PROLOGUE(mpn_rshift)

        # The 1 limb case can be done without the push %ebx, but it's then
        # still the same speed.  The push is left as a free helping hand for
        # the two_or_more code.

        movl    PARAM_SIZE, %eax
        pushl   %ebx
deflit(`FRAME',4)

        movl    PARAM_SRC, %ebx
        decl    %eax

        movl    PARAM_SHIFT, %ecx
        jnz     L(two_or_more)

        movl    (%ebx), %edx            # src limb
        movl    PARAM_DST, %ebx

        shrdl(  %cl, %edx, %eax)        # return value

        shrl    %cl, %edx

        movl    %edx, (%ebx)            # dst limb
        popl    %ebx

        ret


#------------------------------------------------------------------------------
        ALIGN(16)       # avoid offset 0x1f

L(two_or_more):
        # eax   size-1
        # ebx   src
        # ecx   shift
        # edx

        movl    (%ebx), %edx            # src low limb
        negl    %ecx

        addl    $32, %ecx
        movd    PARAM_SHIFT, %mm6

        shll    %cl, %edx
        cmpl    $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)


        # eax   size-1
        # ebx   src
        # ecx   32-shift
        # edx   retval
        #
        # mm6   shift

        movl    PARAM_DST, %ecx
        leal    (%ebx,%eax,4), %ebx

        leal    -4(%ecx,%eax,4), %ecx
        negl    %eax

        # This loop runs at about 3 cycles/limb, which is the amount of
        # decoding, and that's despite every second access being unaligned.
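
        # For reference, a rough C equivalent of this simple loop (an
        # illustrative sketch only, not part of the build; assumes 32-bit
        # limbs and 1 <= shift <= 31, as this routine does):
        #
        #       for (i = 0; i < size-1; i++)
        #         dst[i] = (src[i] >> shift) | (src[i+1] << (32-shift));
        #       dst[size-1] = src[size-1] >> shift;
        #
        # Each movq/psrlq treats two adjacent limbs as one 64-bit quantity,
        # so the OR of the two parts comes for free.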
L(simple):
        # eax   counter (negative)
        # ebx   src, pointing at last limb
        # ecx   dst, pointing at second-highest limb
        # edx   retval
        #
        # mm0   scratch
        # mm6   shift

        movq    (%ebx,%eax,4), %mm0
        incl    %eax

        psrlq   %mm6, %mm0

        movd    %mm0, (%ecx,%eax,4)
        jnz     L(simple)


        movq    %mm0, (%ecx)
        movl    %edx, %eax

        popl    %ebx

        emms_or_femms
        ret


#------------------------------------------------------------------------------
# The strange offsets used on src and dst are due to the following,
# - needing no displacement (%ebx,%eax,4) and (%ecx,%eax,4) first in the loop
# - needing the loop running %eax upwards from negative values and wanting
#   to stop when %eax goes positive
# - wanting to end up with %eax between 0 and 3 so as to be able to test for
#   0-3 extras with test $2 and test $1

L(unroll):
        # eax   size-1
        # ebx   src
        # ecx   32-shift
        # edx   retval
        #
        # mm6   shift

        addl    $32, %ecx
        subl    $7, %eax                # size-8

        movd    %ecx, %mm7
        movl    PARAM_DST, %ecx

        movq    (%ebx), %mm2            # src low qword

        leal    4(%ebx,%eax,4), %ebx    # src end - 28
        testb   $4, %cl

        leal    -12(%ecx,%eax,4), %ecx  # dst end - 44
        notl    %eax                    # -(size-7)

        jz      L(dst_aligned)

        psrlq   %mm6, %mm2
        incl    %eax

        movd    %mm2, 12(%ecx,%eax,4)
        movq    (%ebx,%eax,4), %mm2     # new src low qword

L(dst_aligned):

        movq    8(%ebx,%eax,4), %mm0    # src second lowest qword


        # This loop is the important bit, the rest is just support for it.
        # Four src limbs are held at the start, and four more will be read.
        # Four dst limbs will be written.
        #
        # The magic ingredients for speed here are
        #
        # - aligning the code to 32 bytes
        # - fitting the first 10 instructions into 32 bytes (the first fetch
        #   and store must have no displacements)
        # - the instruction scheduling shown

        # Offset 0x88 here, so use a jump to get to L(top) in one cycle.
        # This guards against executing through a bunch of nop's if a dumb
        # assembler doesn't generate multi-byte do-nothing instructions when
        # aligning.
        jmp     L(top)


        ALIGN(32)
L(top):
        # eax   limb counter, negative
        # ebx   src end - 28
        # ecx   dst end - 44
        # edx   retval
        #
        # mm0   src next qword
        # mm1   scratch
        # mm2   src prev qword
        # mm6   shift
        # mm7   64-shift

        psrlq   %mm6, %mm2
        addl    $4, %eax

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm0, %mm2
        movq    (%ebx,%eax,4), %mm0

        psrlq   %mm6, %mm1
        movq    %mm2, (%ecx,%eax,4)

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm0, %mm1
        movq    8(%ebx,%eax,4), %mm0

        movq    %mm1, 8(%ecx,%eax,4)
        ja      L(top)                  # jump if no carry and not zero


        # Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
        # to 3 representing respectively 3 to 0 further limbs.

        testb   $2, %al
        jnz     L(finish_nottwo)

        # Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
        # becomes new mm2 and a new mm0 is loaded.

        psrlq   %mm6, %mm2
        movq    %mm0, %mm1

        psllq   %mm7, %mm0
        addl    $2, %eax

        por     %mm0, %mm2
        movq    8(%ebx,%eax,4), %mm0

        movq    %mm2, 8(%ecx,%eax,4)
        movq    %mm1, %mm2

L(finish_nottwo):
        testb   $1, %al
        psrlq   %mm6, %mm2

        movq    %mm0, %mm1
        psllq   %mm7, %mm0

        por     %mm0, %mm2
        psrlq   %mm6, %mm1

        movq    %mm2, 16(%ecx,%eax,4)
        jnz     L(finish_even)


        # one further extra limb to process

        movd    28-4(%ebx), %mm0        # src[size-1], most significant limb
        popl    %ebx

        movq    %mm0, %mm2
        psllq   %mm7, %mm0

        por     %mm0, %mm1
        psrlq   %mm6, %mm2

        movq    %mm1, 44-12(%ecx)       # dst[size-3,size-2]
        movd    %mm2, 44-4(%ecx)        # dst[size-1]

        movl    %edx, %eax              # retval

        emms_or_femms
        ret


L(finish_even):
        # no further extra limbs

        movq    %mm1, 44-8(%ecx)        # dst[size-2,size-1]
        movl    %edx, %eax              # retval

        popl    %ebx

        emms_or_femms
        ret

EPILOGUE()
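
# For reference, a C sketch of the overall operation (an illustration only,
# not part of the build; assumes 32-bit limbs and 1 <= shift <= 31, as this
# routine does; "ref_rshift" is just a hypothetical name):
#
#       mp_limb_t
#       ref_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
#       {
#         mp_limb_t  retval = src[0] << (32-shift);  # bits shifted out
#         mp_size_t  i;
#         for (i = 0; i < size-1; i++)
#           dst[i] = (src[i] >> shift) | (src[i+1] << (32-shift));
#         dst[size-1] = src[size-1] >> shift;
#         return retval;
#       }
#
# The unrolled loop gets the same effect four limbs per iteration by working
# on 64-bit qwords: each output qword is
# (qword >> shift) | (next qword << (64-shift)), which is the psrlq/psllq/por
# combination, with mm6 holding shift and mm7 holding 64-shift.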