# AMD K7 mpn_lshift -- mpn left shift.
#
# K7: 1.21 cycles/limb (at 16 limbs/loop).


# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


dnl  K7: UNROLL_COUNT  cycles/limb
dnl            4          1.51
dnl            8          1.26
dnl           16          1.21
dnl           32          1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)


# mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
#                       unsigned shift);
#
# Shift src,size left by shift many bits and store the result in dst,size.
# Zeros are shifted in at the right.  The bits shifted out at the left are
# the return value.
#
# The comments in mpn_rshift apply here too.
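
# As a point of reference, the operation can be modelled in C roughly as
# follows.  This is an illustrative sketch only, not part of the build:
# ref_lshift is just a name chosen here, limbs are taken as 32 bits to
# match this code, and shift is assumed in the range 1..31 as the
# mpn_lshift interface requires.
#
#	mp_limb_t
#	ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
#	{
#	  mp_limb_t  retval;
#	  mp_size_t  i;
#
#	  /* bits shifted out at the top are the return value */
#	  retval = src[size-1] >> (32 - shift);
#
#	  /* high to low, each limb combines with the one below it */
#	  for (i = size-1; i > 0; i--)
#	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
#
#	  /* zeros shifted in at the right */
#	  dst[0] = src[0] << shift;
#	  return retval;
#	}
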
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,   8)
defframe(PARAM_DST,   4)

defframe(SAVE_EDI,  -4)
defframe(SAVE_ESI,  -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)

	.text
	ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %eax
	movl	PARAM_SRC, %edx
	subl	$SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

	movl	PARAM_SHIFT, %ecx
	movl	%edi, SAVE_EDI

	movl	PARAM_DST, %edi
	decl	%eax
	jnz	L(more_than_one_limb)

	movl	(%edx), %edx

	shldl(	%cl, %edx, %eax)	# eax was decremented to zero

	shll	%cl, %edx

	movl	%edx, (%edi)
	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	ret


#------------------------------------------------------------------------------
L(more_than_one_limb):
	# eax	size-1
	# ebx
	# ecx	shift
	# edx	src
	# esi
	# edi	dst
	# ebp

	movd	PARAM_SHIFT, %mm6
	movd	(%edx,%eax,4), %mm5	# src high limb
	cmp	$UNROLL_THRESHOLD-1, %eax

	jae	L(unroll)
	negl	%ecx
	movd	(%edx), %mm4		# src low limb

	addl	$32, %ecx

	movd	%ecx, %mm7


L(simple_top):
	# eax	loop counter, limbs
	# ebx
	# ecx
	# edx	src
	# esi
	# edi	dst
	# ebp
	#
	# mm0	scratch
	# mm4	src low limb
	# mm5	src high limb
	# mm6	shift
	# mm7	32-shift

	movq	-4(%edx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	movd	%mm0, 4(%edi,%eax,4)
	jnz	L(simple_top)


	psllq	%mm6, %mm5
	psllq	%mm6, %mm4

	psrlq	$32, %mm5
	movd	%mm4, (%edi)		# dst low limb

	movd	%mm5, %eax		# return value

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp

	emms
	ret


#------------------------------------------------------------------------------
	ALIGN(16)
L(unroll):
	# eax	size-1
	# ebx	(saved)
	# ecx	shift
	# edx	src
	# esi
	# edi	dst
	# ebp
	#
	# mm5	src high limb, for return value
	# mm6	lshift

	movl	%esi, SAVE_ESI
	movl	%ebx, SAVE_EBX
	leal	-4(%edx,%eax,4), %edx	# &src[size-2]

	testb	$4, %dl
	movq	(%edx), %mm1		# src high qword

	jz	L(start_src_aligned)


	# src isn't aligned, process high limb (marked xxx) separately to
	# make it so
	#
	#  source      -4(edx,%eax,4)
	#                      |
	#  +-------+-------+-------+--
	#                  |  xxx  |
	#  +-------+-------+-------+--
	#  0mod8   4mod8   0mod8
	#
	#  dest        -4(edi,%eax,4)
	#                  |
	#  +-------+-------+--
	#  |  xxx  |       |
	#  +-------+-------+--

	psllq	%mm6, %mm1
	subl	$4, %edx
	movl	%eax, PARAM_SIZE	# size-1

	psrlq	$32, %mm1
	decl	%eax			# size-2 is new size-1

	movd	%mm1, 4(%edi,%eax,4)
	movq	(%edx), %mm1		# new src high qword
L(start_src_aligned):

	leal	-4(%edi,%eax,4), %edi	# &dst[size-2]
	psllq	%mm6, %mm5

	testl	$4, %edi
	psrlq	$32, %mm5		# return value

	jz	L(start_dst_aligned)


	# dst isn't aligned, subtract 4 bytes to make it so, and pretend the
	# shift is 32 bits extra.  High limb of dst (marked xxx) handled
	# here separately.
	#
	#  source          %edx
	#  +-------+-------+--
	#  |      mm1      |
	#  +-------+-------+--
	#  0mod8   4mod8
	#
	#  dest            %edi
	#  +-------+-------+-------+--
	#  |  xxx  |
	#  +-------+-------+-------+--
	#  0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	psllq	%mm6, %mm1
	addl	$32, %ecx		# shift+32

	psrlq	$32, %mm1

	movd	%mm1, 4(%edi)
	movq	%mm0, %mm1
	subl	$4, %edi

	movd	%ecx, %mm6		# new lshift
L(start_dst_aligned):

	decl	%eax			# size-2, two last limbs handled at end
	movq	%mm1, %mm2		# copy of src high qword
	negl	%ecx

	andl	$~1, %eax		# round size down to even
	addl	$64, %ecx

	movl	%eax, %ebx
	negl	%eax

	andl	$UNROLL_MASK, %eax
	decl	%ebx

	shll	%eax

	movd	%ecx, %mm7		# rshift = 64-lshift

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%eax,%eax,4), %esi
')
	shrl	$UNROLL_LOG2, %ebx	# loop counter

	leal	ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
	movl	PARAM_SIZE, %eax	# for use at end

	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	# See README.family about old gas bugs
	leal	(%eax,%eax,4), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi

	ret
')


#------------------------------------------------------------------------------
	ALIGN(32)
L(top):
	# eax	size (for use at end)
	# ebx	loop counter
	# ecx	rshift
	# edx	src
	# esi	computed jump
	# edi	dst
	# ebp
	#
	# mm0	scratch
	# mm1	\ carry (alternating, mm2 first)
	# mm2	/
	# mm6	lshift
	# mm7	rshift
	#
	# 10 code bytes/limb
	#
	# The two chunks differ in whether mm1 or mm2 holds the carry.
	# The computed jump puts the initial carry in both mm1 and mm2.

L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 - 8))

	movq	disp0(%edx), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	movq	%mm0, disp0(%edi)


	movq	disp1(%edx), %mm0
	psllq	%mm6, %mm1

	movq	%mm0, %mm2
	psrlq	%mm7, %mm0

	por	%mm1, %mm0
	movq	%mm0, disp1(%edi)
')

	subl	$UNROLL_BYTES, %edx
	subl	$UNROLL_BYTES, %edi
	decl	%ebx

	jns	L(top)
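
# In C terms, each chunk above computes one destination qword per group
# (a rough model of the MMX data flow, with 64-bit arithmetic standing
# in for the qword registers; "carry" plays the role of mm1/mm2,
# "lshift" is the count in mm6, and the helper and pointer names are
# illustrative only):
#
#	uint64_t  qsrc, out;
#
#	qsrc = fetch_qword (src_ptr);          /* movq  disp0(%edx), %mm0 */
#	out  = (carry << lshift)               /* psllq %mm6, %mm2        */
#	     | (qsrc >> (64 - lshift));        /* psrlq %mm7, %mm0 ; por  */
#	store_qword (dst_ptr, out);            /* movq  %mm0, disp0(%edi) */
#	carry = qsrc;                          /* movq  %mm0, %mm1        */
#
# The qword fetched is one qword below the qword stored (%edx is set up
# 8 bytes behind %edi), so each store combines the left-shifted carry
# from above with the high bits of the source qword below it.
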
define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
	testb	$1, %al
	movl	SAVE_EBX, %ebx
	psllq	%mm6, %mm2	# wanted left shifted in all cases below

	movd	%mm5, %eax
	movl	SAVE_ESI, %esi

	jz	L(end_even)


L(end_odd):

	# Size odd, destination was aligned.
	#
	#  source            edx+8   edx+4
	#                 --+---------------+-------+
	#                   |      mm2      |       |
	#                 --+---------------+-------+
	#
	#  dest                                    edi
	#  --+---------------+---------------+-------+
	#    |    written    |               |       |
	#  --+---------------+---------------+-------+
	#
	# mm6 = shift
	# mm7 = ecx = 64-shift
	#
	# Size odd, destination was unaligned.
	#
	#  source            edx+8   edx+4
	#                 --+---------------+-------+
	#                   |      mm2      |       |
	#                 --+---------------+-------+
	#
	#  dest                            edi
	#  --+---------------+---------------+
	#    |    written    |               |
	#  --+---------------+---------------+
	#
	# mm6 = shift+32
	# mm7 = ecx = 64-(shift+32)
	#
	# In both cases there's one extra limb of src to fetch and combine
	# with mm2 to make a qword at (%edi), and in the aligned case
	# there's an extra limb of dst to be formed from that extra src limb
	# left shifted.

	movd	disp(4) (%edx), %mm0
	testb	$32, %cl

	movq	%mm0, %mm1
	psllq	$32, %mm0

	psrlq	%mm7, %mm0
	psllq	%mm6, %mm1

	por	%mm2, %mm0

	movq	%mm0, disp(0) (%edi)
	jz	L(end_odd_unaligned)
	movd	%mm1, disp(-4) (%edi)
L(end_odd_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret


L(end_even):

	# Size even, destination was aligned.
	#
	#  source          edx+8
	#          --+---------------+
	#            |      mm2      |
	#          --+---------------+
	#
	#  dest            edi
	#  --+---------------+---------------+
	#    |    written    |               |
	#  --+---------------+---------------+
	#
	# mm6 = shift
	# mm7 = ecx = 64-shift
	#
	# Size even, destination was unaligned.
	#
	#  source          edx+8
	#          --+---------------+
	#            |      mm2      |
	#          --+---------------+
	#
	#  dest            edi+4
	#          --+---------------+-------+
	#            |    written    |       |
	#          --+---------------+-------+
	#
	# mm6 = shift+32
	# mm7 = ecx = 64-(shift+32)
	#
	# The movq for the aligned case overwrites the movd for the
	# unaligned case.

	movq	%mm2, %mm0
	psrlq	$32, %mm2

	testb	$32, %cl
	movd	%mm2, disp(4) (%edi)

	jz	L(end_even_unaligned)
	movq	%mm0, disp(0) (%edi)
L(end_even_unaligned):

	movl	SAVE_EDI, %edi
	addl	$SAVE_SIZE, %esp
	emms

	ret

EPILOGUE()
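
# Example use from C (illustrative only; per the mpn interface the
# operands must satisfy size >= 1 and 1 <= shift < GMP_NUMB_BITS):
#
#	#include <gmp.h>
#
#	mp_limb_t  src[3] = { 0x89abcdef, 0x01234567, 0xfedcba98 };
#	mp_limb_t  dst[3];
#	mp_limb_t  out;
#
#	out = mpn_lshift (dst, src, 3, 4);
#	/* out is 0xf, the four bits shifted out of src[2]; dst holds
#	   {src,3} shifted left four bits.  */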