# Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
#
# P6: 6.35 cycles/limb (at 16 limbs/loop).


# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


dnl  P6 UNROLL_COUNT cycles/limb
dnl          8           6.7
dnl         16           6.35
dnl         32           6.3
dnl         64           6.3
dnl  Maximum possible with the current code is 64.

dnl  Number of limbs handled by one pass of the unrolled loop.
dnl  UNROLL_LOG2, UNROLL_MASK and UNROLL_BYTES used further down are
dnl  presumably derived from this value by the shared m4 definitions
dnl  (they are not defined in this file) -- confirm in config.m4.
deflit(UNROLL_COUNT, 16)


dnl  Exactly one of OPERATION_addmul_1 or OPERATION_submul_1 has to be
dnl  defined when this file is processed.  The choice selects:
dnl    M4_inst         the instruction that applies a limb to dst
dnl    M4_function_1   name of the entry point without a carry-in
dnl    M4_function_1c  name of the entry point with a carry-in
dnl    M4_description  text substituted into the prototype comment
dnl    M4_desc_retval  text substituted into the prototype comment
dnl  If neither is defined, m4_error is invoked with a diagnostic.
ifdef(`OPERATION_addmul_1', `
	define(M4_inst,        addl)
	define(M4_function_1,  mpn_addmul_1)
	define(M4_function_1c, mpn_addmul_1c)
	define(M4_description, add it to)
	define(M4_desc_retval, carry)
',`ifdef(`OPERATION_submul_1', `
	define(M4_inst,        subl)
	define(M4_function_1,  mpn_submul_1)
	define(M4_function_1c, mpn_submul_1c)
	define(M4_description, subtract it from)
	define(M4_desc_retval, borrow)
',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
')')')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)


`#' mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
`#'                            mp_limb_t mult);
`#' mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
`#'                             mp_limb_t mult, mp_limb_t carry);
`#'
`#' Calculate src,size multiplied by mult and M4_description dst,size.
`#' Return the M4_desc_retval limb from the top of the result.
#
# This code is pretty much the same as the K6 code.  The unrolled loop is
# the same, but there's just a few scheduling tweaks in the setups and the
# simple loop.
#
# A number of variations have been tried for the unrolled loop, with one or
# two carries, and with loads scheduled earlier, but nothing faster than 6
# cycles/limb has been found.

dnl  Sizes of UNROLL_THRESHOLD limbs or more take the unrolled loop, smaller
dnl  sizes take the simple loop.  Both branches currently give the same
dnl  value; presumably the ifdef is kept so that the PIC and non-PIC
dnl  thresholds can be set separately.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 5)
',`
deflit(UNROLL_THRESHOLD, 5)
')

dnl  Stack offsets of the arguments at function entry (4 is the first
dnl  argument, directly above the return address).  defframe presumably
dnl  adjusts each offset by the current FRAME value, which would be why
dnl  FRAME is redefined after every push below -- confirm in the shared
dnl  m4 definitions.
defframe(PARAM_CARRY,     20)
defframe(PARAM_MULTIPLIER,16)
defframe(PARAM_SIZE,      12)
defframe(PARAM_SRC,       8)
defframe(PARAM_DST,       4)

	.text
	ALIGN(32)

# Entry point with an initial carry limb: load it into %ebx and join the
# common code at start_nc in the function below.
PROLOGUE(M4_function_1c)
	pushl	%ebx
deflit(`FRAME',4)
	movl	PARAM_CARRY, %ebx
	jmp	LF(M4_function_1,start_nc)
EPILOGUE()

# Entry point without a carry-in: the initial carry is zero.
PROLOGUE(M4_function_1)
	pushl	%ebx
deflit(`FRAME',4)
	xorl	%ebx, %ebx	# initial carry

L(start_nc):
	# Here %ebx holds the initial carry and one register has been pushed.
	# The parameter loads are interleaved with the remaining register
	# saves.
	movl	PARAM_SIZE, %ecx
	pushl	%esi
deflit(`FRAME',8)

	movl	PARAM_SRC, %esi
	pushl	%edi
deflit(`FRAME',12)

	movl	PARAM_DST, %edi
	pushl	%ebp
deflit(`FRAME',16)
	cmpl	$UNROLL_THRESHOLD, %ecx

	# movl leaves the flags alone, so the jae still sees the cmpl result.
	# The comparison is unsigned.
	movl	PARAM_MULTIPLIER, %ebp
	jae	L(unroll)


	# simple loop
	# this is offset 0x22, so close enough to aligned
L(simple):
	# eax	scratch
	# ebx	carry
	# ecx	counter
	# edx	scratch
	# esi	src
	# edi	dst
	# ebp	multiplier
	#
	# One limb per iteration.  The loop body runs before the counter is
	# tested, so it executes at least once.

	movl	(%esi), %eax
	addl	$4, %edi	# dst advanced early, hence -4(%edi) below

	mull	%ebp		# edx:eax = src limb * multiplier

	addl	%ebx, %eax	# add carry from the previous limb
	adcl	$0, %edx

	M4_inst	%eax, -4(%edi)	# apply low limb to dst, sets carry flag
	movl	%edx, %ebx	# movl does not change the flags

	adcl	$0, %ebx	# new carry = high limb + carry/borrow from dst
	decl	%ecx

	leal	4(%esi), %esi	# leal keeps the flags from decl for the jnz
	jnz	L(simple)

	popl	%ebp
	popl	%edi

	popl	%esi
	movl	%ebx, %eax	# return the final carry limb

	popl	%ebx
	ret



#------------------------------------------------------------------------------
# VAR_JUMP holds the computed jump temporarily because there's not enough
# registers when doing the mul for the initial two carry limbs.
#
# The add/adc for the initial carry in %ebx is necessary only for the
# mpn_add/submul_1c entry points.  Duplicating the startup code to
# eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
# idea.
#
# Outline of the setup:
#
# The first src limb is multiplied here rather than in the loop.  Its low
# half plus the initial carry becomes the pending "carry lo" that the loop
# applies to dst, and its high half becomes "carry hi".  src is therefore
# kept one limb ahead of dst.
#
# With k = (-(size-1)) & UNROLL_MASK (the number of limb slots to skip in
# the first pass, assuming UNROLL_MASK is UNROLL_COUNT-1), the jump target
# is L(entry) + 15*k, formed as 16*k - k because each limb slot is 15 code
# bytes.  src and dst are moved back by k limbs so that the displacements
# in the code jumped into address the intended limbs.
#
# VAR_COUNTER is set to (size-2) >> UNROLL_LOG2.  The loop decrements it and
# repeats while the result is non-negative.
#
# The unrolled code is built from two-limb chunks in which %ecx and %ebx
# swap roles between the first and second limb.  When k is odd the entry
# lands on a second limb, so the two carries are loaded the other way
# around.
#
# When UNROLL_BYTES is 256 both pointers are biased by +128 and every loop
# displacement by -128, which keeps the displacements within -128..127.

dnl  overlapping with parameters already fetched
define(VAR_COUNTER,`PARAM_SIZE')
define(VAR_JUMP,   `PARAM_DST')

	# this is offset 0x43, so close enough to aligned
L(unroll):
	# eax
	# ebx	initial carry
	# ecx	size
	# edx
	# esi	src
	# edi	dst
	# ebp

	movl	%ecx, %edx
	decl	%ecx

	subl	$2, %edx
	negl	%ecx

	shrl	$UNROLL_LOG2, %edx
	andl	$UNROLL_MASK, %ecx	# ecx = k

	movl	%edx, VAR_COUNTER
	movl	%ecx, %edx

	# 15 code bytes per limb
	#
	# Either way this leaves edx = address of L(entry) + 15*k and
	# ecx = -k.  The PIC variant gets the address of L(here) from the
	# return address pushed by the call.
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	shll	$4, %edx
	negl	%ecx

	leal	L(entry) (%edx,%ecx,1), %edx
')
	movl	(%esi), %eax		# src low limb

	movl	%edx, VAR_JUMP		# edx is needed by the mull
	leal	ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi

	mull	%ebp

	addl	%ebx, %eax	# initial carry (from _1c)
	adcl	$0, %edx

	movl	%edx, %ebx	# high carry
	leal	ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi

	movl	VAR_JUMP, %edx
	testl	$1, %ecx	# odd k: entry is on a second limb of a chunk
	movl	%eax, %ecx	# low carry (movl keeps the testl flags)

	cmovnz_ebx_ecx		# high,low carry other way around
	cmovnz_eax_ebx

	jmp	*%edx


# PIC helper, reached by the call just before L(here).  On entry edx = ecx
# = k and the return address on the stack is the address of L(here).  On
# return edx = address of L(entry) + 15*k and ecx = -k.
ifdef(`PIC',`
L(pic_calc):
	shll	$4, %edx
	negl	%ecx
	# See README.family about old gas bugs
	leal	(%edx,%ecx,1), %edx
	addl	$L(entry)-L(here), %edx
	addl	(%esp), %edx
	ret
')


# -----------------------------------------------------------
	ALIGN(32)
L(top):
deflit(`FRAME',16)
	# eax	scratch
	# ebx	carry hi
	# ecx	carry lo
	# edx	scratch
	# esi	src
	# edi	dst
	# ebp	multiplier
	#
	# VAR_COUNTER	loop counter
	#
	# 15 code bytes per limb
	#
	# Each chunk below handles two limbs.  For the first limb %ecx is
	# applied to dst and the new product is accumulated into %ebx with
	# the carry from dst, and the high half goes to %ecx.  The second
	# limb does the same with %ebx and %ecx exchanged, so on leaving a
	# chunk %ecx is again the limb waiting to be applied to dst.
	#
	# Zdisp is used where the displacement can be zero, presumably to
	# keep the instruction length fixed so that every limb slot is the
	# same size -- confirm in the shared m4 definitions.
	#
	# dst is advanced at the top of the loop, which the initial jump to
	# L(entry) skips, and src is advanced at the bottom.

	addl	$UNROLL_BYTES, %edi

L(entry):
deflit(CHUNK_COUNT,2)
forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%esi), %eax)
	mull	%ebp
Zdisp(	M4_inst,%ecx, disp0,(%edi))
	adcl	%eax, %ebx
	movl	%edx, %ecx
	adcl	$0, %ecx

	movl	disp1(%esi), %eax
	mull	%ebp
	M4_inst	%ebx, disp1(%edi)
	adcl	%eax, %ecx
	movl	%edx, %ebx
	adcl	$0, %ebx
')

	decl	VAR_COUNTER
	leal	UNROLL_BYTES(%esi), %esi	# leal keeps the decl flags

	jns	L(top)


	# Apply the last pending low limb to dst.  %edi has not been advanced
	# for the pass just finished, hence the UNROLL_BYTES displacement.
deflit(`disp0',	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))

	M4_inst	%ecx, disp0(%edi)
	movl	%ebx, %eax

	# movl and popl leave the flags alone, so the adcl below still adds
	# the carry/borrow produced by the final M4_inst.
	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx
	adcl	$0, %eax	# return value = carry hi + carry/borrow

	ret

EPILOGUE()