# Intel P5 mpn_rshift -- mpn right shift.
#
# P5: 1.75 cycles/limb.


# Copyright (C) 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


# mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
#                       unsigned shift);
#
# Shift src,size right by shift many bits and store the result in dst,size.
# Zeros are shifted in at the left.  Return the bits shifted out at the
# right.
#
# It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
# and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
#
# Full speed depends on source and destination being aligned.  Unaligned mmx
# loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
# setups and finish-ups are done to ensure alignment for the loop.
#
# MMX shifts work out a bit faster even for the simple loop.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)
deflit(`FRAME',0)

dnl  Minimum 5, because the unrolled loop can't handle less.
deflit(UNROLL_THRESHOLD, 5)

	.text
	ALIGN(8)

PROLOGUE(mpn_rshift)

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmp	$UNROLL_THRESHOLD, %eax
	jae	L(unroll)

	decl	%eax
	movl	(%ebx), %edi		# src low limb

	jnz	L(simple)

	shrdl(	%cl, %edi, %eax)	# eax was decremented to zero

	shrl	%cl, %edi

	movl	%edi, (%edx)		# dst low limb
	popl	%edi			# risk of data cache bank clash

	popl	%ebx

	ret


#------------------------------------------------------------------------------
	ALIGN(8)
L(simple):
	# eax	size-1
	# ebx	src
	# ecx	shift
	# edx	dst
	# esi
	# edi
	# ebp
deflit(`FRAME',8)

	movd	(%ebx), %mm5		# src[0]
	leal	(%ebx,%eax,4), %ebx	# &src[size-1]

	movd	%ecx, %mm6		# rshift
	leal	-4(%edx,%eax,4), %edx	# &dst[size-2]

	psllq	$32, %mm5
	negl	%eax


# This loop is 5 or 8 cycles, with every second load unaligned and a wasted
# cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
# cycles and would be 8 in a simple loop.  Using mmx helps the return value
# and last limb calculations too.
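#
# The same per-iteration work, modelled in C (an illustrative sketch only,
# not GMP source; 64-bit arithmetic stands in for the mmx qword, and `i'
# plays the role of the negative counter in %eax):
#
#	uint64_t q = (uint64_t) src[i] | ((uint64_t) src[i+1] << 32);
#	dst[i] = (uint32_t) (q >> shift);	/* movq / psrlq / movd */
#
# Each movq load fetches two adjacent limbs, and only the low 32 bits of
# the shifted qword are stored, so successive iterations overlap by one
# limb.
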
L(simple_top):
	# eax	counter, limbs, negative
	# ebx	&src[size-1]
	# ecx	shift
	# edx	&dst[size-2]
	#
	# mm0	scratch
	# mm5	return value
	# mm6	shift

	movq	(%ebx,%eax,4), %mm0
	incl	%eax

	psrlq	%mm6, %mm0

	movd	%mm0, (%edx,%eax,4)
	jnz	L(simple_top)


	movd	(%ebx), %mm0
	psrlq	%mm6, %mm5		# return value

	psrlq	%mm6, %mm0
	popl	%edi

	movd	%mm5, %eax
	popl	%ebx

	movd	%mm0, 4(%edx)

	emms

	ret


#------------------------------------------------------------------------------
	ALIGN(8)
L(unroll):
	# eax	size
	# ebx	src
	# ecx	shift
	# edx	dst
	# esi
	# edi
	# ebp
deflit(`FRAME',8)

	movd	(%ebx), %mm5		# src[0]
	movl	$4, %edi

	movd	%ecx, %mm6		# rshift
	testl	%edi, %ebx

	psllq	$32, %mm5
	jz	L(start_src_aligned)


	# src isn't aligned, process low limb separately (marked xxx) and
	# step src and dst by one limb, making src aligned.
	#
	# source                  ebx
	# --+-------+-------+-------+
	#           |  xxx          |
	# --+-------+-------+-------+
	#         4mod8   0mod8   4mod8
	#
	#  dest            edx
	# --+-------+-------+
	#   |       |  xxx  |
	# --+-------+-------+

	movq	(%ebx), %mm0		# unaligned load

	psrlq	%mm6, %mm0
	addl	$4, %ebx

	decl	%eax

	movd	%mm0, (%edx)
	addl	$4, %edx
L(start_src_aligned):


	movq	(%ebx), %mm1
	testl	%edi, %edx

	psrlq	%mm6, %mm5		# retval
	jz	L(start_dst_aligned)

	# dst isn't aligned, add 4 to make it so, and pretend the shift is
	# 32 bits extra.  Low limb of dst (marked xxx) handled here
	# separately.
	#
	# source          ebx
	# --+-------+-------+
	#   |      mm1      |
	# --+-------+-------+
	#         4mod8   0mod8
	#
	#  dest                  edx
	# --+-------+-------+-------+
	#           |  xxx          |
	# --+-------+-------+-------+
	#         4mod8   0mod8   4mod8

	movq	%mm1, %mm0
	addl	$32, %ecx		# new shift

	psrlq	%mm6, %mm0

	movd	%ecx, %mm6

	movd	%mm0, (%edx)
	addl	$4, %edx
L(start_dst_aligned):


	movq	8(%ebx), %mm3
	negl	%ecx

	movq	%mm3, %mm2		# mm2 src qword
	addl	$64, %ecx

	movd	%ecx, %mm7
	psrlq	%mm6, %mm1

	leal	-12(%ebx,%eax,4), %ebx
	leal	-20(%edx,%eax,4), %edx

	psllq	%mm7, %mm3
	subl	$7, %eax		# size-7

	por	%mm1, %mm3		# mm3 ready to store
	negl	%eax			# -(size-7)

	jns	L(finish)


	# This loop is the important bit, the rest is just support.  Careful
	# instruction scheduling achieves the claimed 1.75 c/l.  The
	# relevant parts of the pairing rules are:
	#
	# - mmx loads and stores execute only in the U pipe
	# - only one mmx shift in a pair
	# - wait one cycle before storing an mmx register result
	# - the usual address generation interlock
	#
	# Two qword calculations are slightly interleaved.  The instructions
	# marked "#" belong to the second qword, and the "# prev" one is for
	# the second qword from the previous iteration.
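	#
	# In C terms, each qword the loop produces is (a sketch under
	# assumed names, not GMP source: `s' is the source viewed as
	# aligned 64-bit qwords, `d' the destination qwords, `n' the shift
	# held in mm6 and 64-n the shift held in mm7):
	#
	#	d[j] = (s[j] >> n) | (s[j+1] << (64 - n));  /* psrlq/psllq/por */
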
	ALIGN(8)
L(unroll_loop):
	# eax	counter, limbs, negative
	# ebx	&src[size-12]
	# ecx
	# edx	&dst[size-12]
	# esi
	# edi
	#
	# mm0
	# mm1
	# mm2	src qword from -8(%ebx,%eax,4)
	# mm3	dst qword ready to store to -8(%edx,%eax,4)
	#
	# mm5	return value
	# mm6	rshift
	# mm7	lshift

	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)	# prev
	por	%mm2, %mm0

	movq	8(%ebx,%eax,4), %mm3	#
	psrlq	%mm6, %mm1		#

	movq	%mm0, (%edx,%eax,4)
	movq	%mm3, %mm2		#

	psllq	%mm7, %mm3		#
	addl	$4, %eax

	por	%mm1, %mm3		#
	js	L(unroll_loop)


L(finish):
	# eax	0 to 3 representing respectively 3 to 0 limbs remaining

	testb	$2, %al

	jnz	L(finish_no_two)

	movq	(%ebx,%eax,4), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, -8(%edx,%eax,4)	# prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	addl	$2, %eax
L(finish_no_two):


	# eax	2 or 3 representing respectively 1 or 0 limbs remaining
	#
	# mm2	src prev qword, from -8(%ebx,%eax,4)
	# mm3	dst qword, for -8(%edx,%eax,4)

	testb	$1, %al
	popl	%edi

	movd	%mm5, %eax		# retval

	jnz	L(finish_zero)


	# One extra limb, destination was aligned.
	#
	# source          ebx
	# +-------+---------------+--
	# |       |      mm2      |
	# +-------+---------------+--
	#
	# dest                          edx
	# +-------+---------------+---------------+--
	# |       |               |      mm3      |
	# +-------+---------------+---------------+--
	#
	# mm6 = shift
	# mm7 = ecx = 64-shift


	# One extra limb, destination was unaligned.
	#
	# source          ebx
	# +-------+---------------+--
	# |       |      mm2      |
	# +-------+---------------+--
	#
	# dest                  edx
	# +---------------+---------------+--
	# |               |      mm3      |
	# +---------------+---------------+--
	#
	# mm6 = shift+32
	# mm7 = ecx = 64-(shift+32)


	# In both cases there's one extra limb of src to fetch and combine
	# with mm2 to make a qword at 8(%edx), and in the aligned case
	# there's a further extra limb of dst to be formed.


	movd	8(%ebx), %mm0
	psrlq	%mm6, %mm2

	movq	%mm0, %mm1
	psllq	%mm7, %mm0

	movq	%mm3, (%edx)
	por	%mm2, %mm0

	psrlq	%mm6, %mm1
	andl	$32, %ecx

	popl	%ebx
	jz	L(finish_one_unaligned)

	# dst was aligned, must store one extra limb
	movd	%mm1, 16(%edx)
L(finish_one_unaligned):

	movq	%mm0, 8(%edx)

	emms

	ret


L(finish_zero):

	# No extra limbs, destination was aligned.
	#
	# source        ebx
	# +---------------+--
	# |      mm2      |
	# +---------------+--
	#
	# dest                        edx+4
	# +---------------+---------------+--
	# |               |      mm3      |
	# +---------------+---------------+--
	#
	# mm6 = shift
	# mm7 = ecx = 64-shift


	# No extra limbs, destination was unaligned.
	#
	# source        ebx
	# +---------------+--
	# |      mm2      |
	# +---------------+--
	#
	# dest                edx+4
	# +-------+---------------+--
	# |       |      mm3      |
	# +-------+---------------+--
	#
	# mm6 = shift+32
	# mm7 = 64-(shift+32)


	# The movd for the unaligned case writes the same data as the movq
	# for the aligned case; it's just a choice between whether one or
	# two limbs are written.


	movq	%mm3, 4(%edx)
	psrlq	%mm6, %mm2

	movd	%mm2, 12(%edx)
	andl	$32, %ecx

	popl	%ebx
	jz	L(finish_zero_unaligned)

	# dst was aligned, must store two limbs
	movq	%mm2, 12(%edx)
L(finish_zero_unaligned):

	emms

	ret

EPILOGUE()
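
# For reference, the overall semantics of this routine as a minimal C
# sketch (illustrative only; limbs are taken as 32 bits and 1 <= shift
# <= 31, as on x86; GMP's portable implementation of the same operation
# is mpn/generic/rshift.c):
#
#	mp_limb_t
#	mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
#	{
#	  mp_limb_t retval = src[0] << (32 - shift);	/* bits shifted out */
#	  mp_size_t i;
#
#	  for (i = 0; i < size - 1; i++)
#	    dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
#	  dst[size-1] = src[size-1] >> shift;
#
#	  return retval;
#	}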