# AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. # # K6: 7.65 to 8.5 cycles/limb (at 16 limbs/loop and depending on the data), # PIC adds about 6 cycles at the start. # Copyright (C) 1999, 2000 Free Software Foundation, Inc. # # This file is part of the GNU MP Library. # # The GNU MP Library is free software; you can redistribute it and/or modify # it under the terms of the GNU Library General Public License as published by # the Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public # License for more details. # # You should have received a copy of the GNU Library General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. include(`../config.m4') dnl K6: large multpliers small multpliers dnl UNROLL_COUNT cycles/limb cycles/limb dnl 4 9.5 7.78 dnl 8 9.0 7.78 dnl 16 8.4 7.65 dnl 32 8.4 8.2 dnl dnl Maximum possible unrolling with the current code is 32. dnl dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256 dnl byte block, which might explain the good speed at that unrolling. deflit(UNROLL_COUNT, 16) ifdef(`OPERATION_addmul_1', ` define(M4_inst, addl) define(M4_function_1, mpn_addmul_1) define(M4_function_1c, mpn_addmul_1c) define(M4_description, add it to) define(M4_desc_retval, carry) ',`ifdef(`OPERATION_submul_1', ` define(M4_inst, subl) define(M4_function_1, mpn_submul_1) define(M4_function_1c, mpn_submul_1c) define(M4_description, subtract it from) define(M4_desc_retval, borrow) ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1 ')')') MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c) `#' mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, `#' mp_limb_t mult); `#' mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size, `#' mp_limb_t mult, mp_limb_t carry); `#' `#' Calculate src,size multiplied by mult and M4_description dst,size. `#' Return the M4_desc_retval limb from the top of the result. # # The jadcl0()s in the unrolled loop makes the speed data dependent. Small # multipliers (most significant few bits clear) result in few carry bits and # speeds up to 7.65 cycles/limb are attained. Large multipliers (most # significant few bits set) make the carry bits 50/50 and lead to something # more like 8.4 c/l. (With adcl's both of these would be 9.3 c/l.) # # It's important that the gains for jadcl0 on small multipliers don't come # at the cost of slowing down other data. Tests on uniformly distributed # random data, designed to confound branch prediction, show about a 7% # speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all # overheads included). # # In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus # 11.0 cycles/limb), and hence isn't used. # # In the simple loop, note that running ecx from negative to zero and using # it as an index in the two movs wouldn't help. It would save one # instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired # that would be collapsed by this. # # # jadcl0 # ------ # # jadcl0() being faster than adcl $0 seems to be an artifact of two things, # firstly the instruction decoding and secondly the fact that there's a # carry bit for the jadcl0 only on average about 1/4 of the time. # # The code in the unrolled loop decodes something like the following. # # decode cycles # mull %ebp 2 # M4_inst %esi, disp(%edi) 1 # adcl %eax, %ecx 2 # movl %edx, %esi \ 1 # jnc 1f / # incl %esi \ 1 # 1: movl disp(%ebx), %eax / # --- # 7 # # In a back-to-back style test this measures 7 with the jnc not taken, or 8 # with it taken (both when correctly predicted). This is opposite to the # measurements showing small multipliers running faster than large ones. # Watch this space for more info ... # # It's not clear how much branch misprediction might be costing. The K6 # doco says it will be 1 to 4 cycles, but presumably it's near the low end # of that range to get the measured results. # # # In the code the two carries are more or less the preceding mul product and # the calculation is roughly # # x*y + u*b+v # # where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and # v are the two limbs it's added to (being the low of the next mul, and a # limb from the destination). # # To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and # there are b^2-(b^2-x*y) = x*y many such values, giving a probability of # x*y/b^2. If x, y, u and v are random and uniformly distributed between 0 # and b-1, then the total probability can be summed over x and y, # # 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1) # --- * sum sum --- = --- * ------- * ------- = 1/4 # b^2 x=0 y=1 b^2 b^4 2 2 # # Actually it's a very tiny bit less than 1/4 of course. If y is fixed, # then the probability is 1/2*y/b thus varying linearly between 0 and 1/2. ifdef(`PIC',` deflit(UNROLL_THRESHOLD, 9) ',` deflit(UNROLL_THRESHOLD, 6) ') defframe(PARAM_CARRY, 20) defframe(PARAM_MULTIPLIER,16) defframe(PARAM_SIZE, 12) defframe(PARAM_SRC, 8) defframe(PARAM_DST, 4) .text ALIGN(32) PROLOGUE(M4_function_1c) pushl %esi deflit(`FRAME',4) movl PARAM_CARRY, %esi jmp LF(M4_function_1,start_nc) EPILOGUE() PROLOGUE(M4_function_1) push %esi deflit(`FRAME',4) xorl %esi, %esi # initial carry L(start_nc): movl PARAM_SIZE, %ecx pushl %ebx deflit(`FRAME',8) movl PARAM_SRC, %ebx pushl %edi deflit(`FRAME',12) cmpl $UNROLL_THRESHOLD, %ecx movl PARAM_DST, %edi pushl %ebp deflit(`FRAME',16) jae L(unroll) # simple loop movl PARAM_MULTIPLIER, %ebp L(simple): # eax scratch # ebx src # ecx counter # edx scratch # esi carry # edi dst # ebp multiplier movl (%ebx), %eax addl $4, %ebx mull %ebp addl $4, %edi addl %esi, %eax adcl $0, %edx M4_inst %eax, -4(%edi) adcl $0, %edx movl %edx, %esi loop L(simple) popl %ebp popl %edi popl %ebx movl %esi, %eax popl %esi ret #------------------------------------------------------------------------------ # The unrolled loop uses a "two carry limbs" scheme. At the top of the loop # the carries are ecx=lo, esi=hi, then they swap for each limb processed. # For the computed jump an odd size means they start one way around, an even # size the other. # # VAR_JUMP holds the computed jump temporarily because there's not enough # registers at the point of doing the mul for the initial two carry limbs. # # The add/adc for the initial carry in %esi is necessary only for the # mpn_addmul/submul_1c entry points. Duplicating the startup code to # eliminiate this for the plain mpn_add/submul_1 doesn't seem like a good # idea. dnl overlapping with parameters already fetched define(VAR_COUNTER, `PARAM_SIZE') define(VAR_JUMP, `PARAM_DST') L(unroll): # eax # ebx src # ecx size # edx # esi initial carry # edi dst # ebp movl %ecx, %edx decl %ecx subl $2, %edx negl %ecx shrl $UNROLL_LOG2, %edx andl $UNROLL_MASK, %ecx movl %edx, VAR_COUNTER movl %ecx, %edx shll $4, %edx negl %ecx # 15 code bytes per limb ifdef(`PIC',` call L(pic_calc) L(here): ',` leal L(entry) (%edx,%ecx,1), %edx ') movl (%ebx), %eax # src low limb movl PARAM_MULTIPLIER, %ebp movl %edx, VAR_JUMP mull %ebp addl %esi, %eax # initial carry (from _1c) jadcl0( %edx) leal 4(%ebx,%ecx,4), %ebx movl %edx, %esi # high carry movl VAR_JUMP, %edx leal (%edi,%ecx,4), %edi testl $1, %ecx movl %eax, %ecx # low carry jz L(noswap) movl %esi, %ecx # high,low carry other way around movl %eax, %esi L(noswap): jmp *%edx ifdef(`PIC',` L(pic_calc): # See README.family about old gas bugs leal (%edx,%ecx,1), %edx addl $L(entry)-L(here), %edx addl (%esp), %edx ret ') # ----------------------------------------------------------- ALIGN(32) L(top): deflit(`FRAME',16) # eax scratch # ebx src # ecx carry lo # edx scratch # esi carry hi # edi dst # ebp multiplier # # 15 code bytes per limb leal UNROLL_BYTES(%edi), %edi L(entry): forloop(`i', 0, UNROLL_COUNT/2-1, ` deflit(`disp0', eval(2*i*4)) deflit(`disp1', eval(disp0 + 4)) Zdisp( movl, disp0,(%ebx), %eax) mull %ebp Zdisp( M4_inst,%ecx, disp0,(%edi)) adcl %eax, %esi movl %edx, %ecx jadcl0( %ecx) movl disp1(%ebx), %eax mull %ebp M4_inst %esi, disp1(%edi) adcl %eax, %ecx movl %edx, %esi jadcl0( %esi) ') decl VAR_COUNTER leal UNROLL_BYTES(%ebx), %ebx jns L(top) popl %ebp M4_inst %ecx, UNROLL_BYTES(%edi) popl %edi movl %esi, %eax popl %ebx jadcl0( %eax) popl %esi ret EPILOGUE()