# Intel P5 mpn_lshift -- mpn left shift.
#
# P5: 1.75 cycles/limb.


# Copyright (C) 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


# mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
#                       unsigned shift);
#
# Shift src,size left by shift many bits and store the result in dst,size.
# Zeros are shifted in at the right.  Return the bits shifted out at the
# left.
#
# The comments in mpn_rshift apply here too.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)
deflit(`FRAME',0)

dnl  minimum 5, because the unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 5)

	.text
	ALIGN(8)

PROLOGUE(mpn_lshift)

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmp	$UNROLL_THRESHOLD, %eax
	jae	L(unroll)

	movl	-4(%ebx,%eax,4), %edi	# src high limb
	decl	%eax

	jnz	L(simple)

	shldl(	%cl, %edi, %eax)	# eax was decremented to zero

	shll	%cl, %edi

	movl	%edi, (%edx)		# dst low limb
	popl	%edi			# risk of data cache bank clash

	popl	%ebx

	ret


#------------------------------------------------------------------------------
L(simple):
	# eax	size-1
	# ebx	src
	# ecx	shift
	# edx	dst
	# esi
	# edi
	# ebp
deflit(`FRAME',8)

	movd	(%ebx,%eax,4), %mm5	# src high limb
	movd	%ecx, %mm6		# lshift

	negl	%ecx
	psllq	%mm6, %mm5

	addl	$32, %ecx
	movd	%ecx, %mm7

	psrlq	$32, %mm5		# retval


L(simple_top):
	# eax	counter, limbs, counting down
	# ebx	src
	# ecx
	# edx	dst
	# esi
	# edi
	#
	# mm0	scratch
	# mm5	return value
	# mm6	shift
	# mm7	32-shift

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	#

	movd	%mm0, 4(%edx,%eax,4)
	jnz	L(simple_top)


	movd	(%ebx), %mm0
	movd	%mm5, %eax

	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)

	emms

	ret
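
# For reference, the operation L(simple) above implements, as a portable C
# sketch (illustrative only, assuming 32-bit limbs and 1 <= shift <= 31;
# ref_lshift is a hypothetical name, not a GMP entrypoint):
#
#	unsigned long
#	ref_lshift (unsigned long *dst, const unsigned long *src,
#	            long size, unsigned shift)
#	{
#	  unsigned long retval = src[size-1] >> (32 - shift);
#	  long i;
#	  for (i = size-1; i >= 1; i--)
#	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
#	  dst[0] = src[0] << shift;
#	  return retval;
#	}
#
# The loop above gets each dst[i] with a single 64-bit shift: load the
# qword holding src[i-1],src[i], do one psrlq by 32-shift (mm7), and the
# low 32 bits are exactly (src[i]<<shift) | (src[i-1]>>(32-shift)).
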
#------------------------------------------------------------------------------
	ALIGN(8)
L(unroll):
	# eax	size
	# ebx	src
	# ecx	shift
	# edx	dst
	# esi
	# edi
	# ebp
deflit(`FRAME',8)

	movd	-4(%ebx,%eax,4), %mm5	# src high limb
	leal	(%ebx,%eax,4), %edi

	movd	%ecx, %mm6		# lshift
	andl	$4, %edi

	psllq	%mm6, %mm5
	jz	L(start_src_aligned)


	# src isn't aligned, process high limb separately (marked xxx) to
	# make it so.
	#
	#  source     -8(ebx,%eax,4)
	#                  |
	#  +-------+-------+-------+--
	#  |               |
	#  +-------+-------+-------+--
	#        0mod8   4mod8   0mod8
	#
	#  dest
	#     -4(edx,%eax,4)
	#          |
	#  +-------+-------+--
	#  |  xxx  |       |
	#  +-------+-------+--

	movq	-8(%ebx,%eax,4), %mm0	# unaligned load

	psllq	%mm6, %mm0
	decl	%eax

	psrlq	$32, %mm0

	#

	movd	%mm0, (%edx,%eax,4)

L(start_src_aligned):
	movq	-8(%ebx,%eax,4), %mm1	# src high qword
	leal	(%edx,%eax,4), %edi

	andl	$4, %edi
	psrlq	$32, %mm5		# return value

	movq	-16(%ebx,%eax,4), %mm3	# src second highest qword
	jz	L(start_dst_aligned)

	# dst isn't aligned, subtract 4 to make it so, and pretend the shift
	# is 32 bits extra.  High limb of dst (marked xxx) handled here
	# separately.
	#
	#  source     -8(ebx,%eax,4)
	#                  |
	#  +-------+-------+--
	#  |      mm1      |
	#  +-------+-------+--
	#        0mod8   4mod8
	#
	#  dest
	#     -4(edx,%eax,4)
	#          |
	#  +-------+-------+-------+--
	#  |  xxx  |               |
	#  +-------+-------+-------+--
	#        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	addl	$32, %ecx		# new shift

	psllq	%mm6, %mm0
	movd	%ecx, %mm6

	psrlq	$32, %mm0

	# wasted cycle here waiting for %mm0

	movd	%mm0, -4(%edx,%eax,4)
	subl	$4, %edx

L(start_dst_aligned):
	psllq	%mm6, %mm1
	negl	%ecx			# -shift

	addl	$64, %ecx		# 64-shift
	movq	%mm3, %mm2

	movd	%ecx, %mm7
	subl	$8, %eax		# size-8

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		# mm3 ready to store
	jc	L(finish)


	# The comments in mpn_rshift apply here too.

	ALIGN(8)
L(unroll_loop):
	# eax	counter, limbs
	# ebx	src
	# ecx
	# edx	dst
	# esi
	# edi
	#
	# mm0
	# mm1
	# mm2	src qword from 48(%ebx,%eax,4)
	# mm3	dst qword ready to store to 56(%edx,%eax,4)
	#
	# mm5	return value
	# mm6	lshift
	# mm7	rshift

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	# prev
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3	#
	psllq	%mm6, %mm1		#

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2		#

	psrlq	%mm7, %mm3		#
	subl	$4, %eax

	por	%mm1, %mm3		#
	jnc	L(unroll_loop)


L(finish):
	# eax	-4 to -1 representing respectively 0 to 3 limbs remaining

	testb	$2, %al

	jz	L(finish_no_two)

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	# prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	subl	$2, %eax

L(finish_no_two):
	# eax	-4 or -3 representing respectively 0 or 1 limbs remaining
	#
	# mm2	src prev qword, from 48(%ebx,%eax,4)
	# mm3	dst qword, for 56(%edx,%eax,4)

	testb	$1, %al
	movd	%mm5, %eax	# retval

	popl	%edi
	jz	L(finish_zero)


	# One extra src limb, destination was aligned.
	#
	#                 source                  ebx
	#                 --+---------------+-------+
	#                   |      mm2      |       |
	#                 --+---------------+-------+
	#
	#  dest          edx+12           edx+4    edx
	#  --+---------------+---------------+-------+
	#    |      mm3      |               |       |
	#  --+---------------+---------------+-------+
	#
	#  mm6 = shift
	#  mm7 = ecx = 64-shift


	# One extra src limb, destination was unaligned.
	#
	#                 source                  ebx
	#                 --+---------------+-------+
	#                   |      mm2      |       |
	#                 --+---------------+-------+
	#
	#  dest          edx+12           edx+4
	#  --+---------------+---------------+
	#    |      mm3      |               |
	#  --+---------------+---------------+
	#
	#  mm6 = shift+32
	#  mm7 = ecx = 64-(shift+32)


	# In both cases there's one extra limb of src to fetch and combine
	# with mm2 to make a qword at 4(%edx), and in the aligned case
	# there's an extra limb of dst to be formed from that extra src limb
	# left shifted.

	movd	(%ebx), %mm0
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)
	psllq	$32, %mm0

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	andl	$32, %ecx
	popl	%ebx

	jz	L(finish_one_unaligned)

	movd	%mm1, (%edx)

L(finish_one_unaligned):

	emms

	ret
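
# The qword combine used by L(unroll_loop) above, as a C sketch with a
# 64-bit type (illustrative only; s stands for the limb shift, or shift+32
# when the destination was unaligned, matching mm6 and mm7 = 64-s;
# combine is a hypothetical helper name):
#
#	#include <stdint.h>
#
#	uint64_t
#	combine (uint64_t hi, uint64_t lo, unsigned s)
#	{
#	  return (hi << s) | (lo >> (64 - s));	/* psllq, psrlq, por */
#	}
#
# Each iteration stores two such qwords (at 24 and 16 off %edx), so two
# limbs per qword and two qwords per pass give the 4 limbs counted off by
# the subl $4.
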
L(finish_zero):

	# No extra src limbs, destination was aligned.
	#
	#                 source      ebx
	#                 --+---------------+
	#                   |      mm2      |
	#                 --+---------------+
	#
	#  dest           edx+8           edx
	#  --+---------------+---------------+
	#    |      mm3      |               |
	#  --+---------------+---------------+
	#
	#  mm6 = shift
	#  mm7 = ecx = 64-shift


	# No extra src limbs, destination was unaligned.
	#
	#                 source      ebx
	#                 --+---------------+
	#                   |      mm2      |
	#                 --+---------------+
	#
	#  dest           edx+8   edx+4
	#  --+---------------+-------+
	#    |      mm3      |       |
	#  --+---------------+-------+
	#
	#  mm6 = shift+32
	#  mm7 = ecx = 64-(shift+32)


	# The movd for the unaligned case writes the same data to 4(%edx)
	# that the movq does for the aligned case.

	movq	%mm3, 8(%edx)
	andl	$32, %ecx

	psllq	%mm6, %mm2
	jz	L(finish_zero_unaligned)

	movq	%mm2, (%edx)

L(finish_zero_unaligned):
	psrlq	$32, %mm2
	popl	%ebx

	movd	%mm5, %eax	# retval

	movd	%mm2, 4(%edx)

	emms

	ret

EPILOGUE()
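
# A caller sketch from C, using the documented mpn interface in gmp.h
# (illustrative only; the commented values assume 32-bit limbs):
#
#	#include <stdio.h>
#	#include <gmp.h>
#
#	int
#	main (void)
#	{
#	  mp_limb_t src[2] = { 0x80000001, 0xF0000000 };
#	  mp_limb_t dst[2];
#	  mp_limb_t out = mpn_lshift (dst, src, (mp_size_t) 2, 4);
#	  /* with 32-bit limbs: out = 0xF, dst[1] = 0x8, dst[0] = 0x10 */
#	  printf ("%lX %lX %lX\n", (unsigned long) out,
#	          (unsigned long) dst[1], (unsigned long) dst[0]);
#	  return 0;
#	}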