# AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. # # K7: 1.64 cycles/limb (at 16 limb/loop). # Copyright (C) 1999, 2000 Free Software Foundation, Inc. # # This file is part of the GNU MP Library. # # The GNU MP Library is free software; you can redistribute it and/or modify # it under the terms of the GNU Library General Public License as published by # the Free Software Foundation; either version 2 of the License, or (at your # option) any later version. # # The GNU MP Library is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public # License for more details. # # You should have received a copy of the GNU Library General Public License # along with the GNU MP Library; see the file COPYING.LIB. If not, write to # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, # MA 02111-1307, USA. include(`../config.m4') dnl K7: UNROLL_COUNT cycles/limb dnl 8 1.9 dnl 16 1.64 dnl 32 1.7 dnl 64 2.0 dnl Maximum possible with the current code is 64. deflit(UNROLL_COUNT, 16) ifdef(`OPERATION_add_n', ` define(M4_inst, adcl) define(M4_function_n, mpn_add_n) define(M4_function_nc, mpn_add_nc) define(M4_description, add) ',`ifdef(`OPERATION_sub_n', ` define(M4_inst, sbbl) define(M4_function_n, mpn_sub_n) define(M4_function_nc, mpn_sub_nc) define(M4_description, subtract) ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n ')')') MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) `#' mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, `#' mp_size_t size); `#' mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, `#' mp_size_t size, mp_limb_t carry); `#' `#' Calculate src1,size M4_description src2,size, and store the result in # dst,size. The return value is the carry bit from the top of the result (1 # or 0). # # The _nc version accepts 1 or 0 for an initial carry into the low limb of # the calculation. Note values other than 1 or 0 here will lead to garbage # results. # # This code runs at 1.64 cycles/limb, which is probably the best possible # with plain integer operations. Each limb is 2 loads and 1 store, and in # one cycle the K7 can do two loads, or a load and a store, leading to 1.5 # c/l. dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1. ifdef(`PIC',` deflit(UNROLL_THRESHOLD, 8) ',` deflit(UNROLL_THRESHOLD, 8) ') defframe(PARAM_CARRY,20) defframe(PARAM_SIZE, 16) defframe(PARAM_SRC2, 12) defframe(PARAM_SRC1, 8) defframe(PARAM_DST, 4) defframe(SAVE_EBP, -4) defframe(SAVE_ESI, -8) defframe(SAVE_EBX, -12) defframe(SAVE_EDI, -16) deflit(STACK_SPACE, 16) .text ALIGN(32) deflit(`FRAME',0) PROLOGUE(M4_function_nc) movl PARAM_CARRY, %eax jmp LF(M4_function_n,start) EPILOGUE() PROLOGUE(M4_function_n) xorl %eax, %eax # carry L(start): movl PARAM_SIZE, %ecx subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE) movl %edi, SAVE_EDI movl %ebx, SAVE_EBX cmpl $UNROLL_THRESHOLD, %ecx movl PARAM_SRC2, %edx movl PARAM_SRC1, %ebx jae L(unroll) movl PARAM_DST, %edi leal (%ebx,%ecx,4), %ebx leal (%edx,%ecx,4), %edx leal (%edi,%ecx,4), %edi negl %ecx shrl %eax # This loop in in a single 16 byte code block already, so no # alignment necessary. L(simple): # eax scratch # ebx src1 # ecx counter # edx src2 # esi # edi dst # ebp movl (%ebx,%ecx,4), %eax M4_inst (%edx,%ecx,4), %eax movl %eax, (%edi,%ecx,4) incl %ecx jnz L(simple) movl $0, %eax movl SAVE_EDI, %edi movl SAVE_EBX, %ebx setc %al addl $STACK_SPACE, %esp ret # ----------------------------------------------------------------------------- # This is at 0x55, close enough to aligned. L(unroll): deflit(`FRAME',STACK_SPACE) movl %ebp, SAVE_EBP andl $~1, %ecx # size low bit masked out andl $1, PARAM_SIZE # size low bit kept movl %ecx, %edi decl %ecx movl PARAM_DST, %ebp shrl $UNROLL_LOG2, %ecx negl %edi movl %esi, SAVE_ESI andl $UNROLL_MASK, %edi ifdef(`PIC',` call L(pic_calc) L(here): ',` leal L(entry) (%edi,%edi,8), %esi # 9 bytes per ') negl %edi shrl %eax leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi jmp *%esi ifdef(`PIC',` L(pic_calc): # See README.family about old gas bugs leal (%edi,%edi,8), %esi addl $L(entry)-L(here), %esi addl (%esp), %esi ret ') # ----------------------------------------------------------------------------- ALIGN(32) L(top): # eax zero # ebx src1 # ecx counter # edx src2 # esi scratch (was computed jump) # edi dst # ebp scratch leal UNROLL_BYTES(%edx), %edx L(entry): deflit(CHUNK_COUNT, 2) forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, ` deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) deflit(`disp1', eval(disp0 + 4)) Zdisp( movl, disp0,(%ebx), %esi) movl disp1(%ebx), %ebp Zdisp( M4_inst,disp0,(%edx), %esi) Zdisp( movl, %esi, disp0,(%edi)) M4_inst disp1(%edx), %ebp movl %ebp, disp1(%edi) ') decl %ecx leal UNROLL_BYTES(%ebx), %ebx leal UNROLL_BYTES(%edi), %edi jns L(top) mov PARAM_SIZE, %esi movl SAVE_EBP, %ebp movl $0, %eax decl %esi js L(even) movl (%ebx), %ecx M4_inst UNROLL_BYTES(%edx), %ecx movl %ecx, (%edi) L(even): movl SAVE_EDI, %edi movl SAVE_EBX, %ebx setc %al movl SAVE_ESI, %esi addl $STACK_SPACE, %esp ret EPILOGUE()