# AMD K6 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
# mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
#
#    alignment dst/src1/src2, A=0mod8, N=4mod8
#       A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
#
# K6     1.2    1.5    1.5    1.2    1.2    1.5    1.5    1.2    and,andn,ior,xor
# K6     1.5    1.75   2.0    1.75   1.75   2.0    1.75   1.5    iorn,xnor
# K6     1.75   2.0    2.0    2.0    2.0    2.0    2.0    1.75   nand,nior


# Copyright (C) 1999, 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them

define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function', `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
M4_choose_op( andn_n, pandn,0,0, andl,0,1)
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)


`#' void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
`#'                   mp_size_t size);
`#'
`#' Do src1,size M4_operation src2,size, storing the result in dst,size.
#
# Unaligned movq loads and stores are a bit slower than aligned ones.  The
# test at the start of the routine checks the alignment of src1 and if
# necessary processes one limb separately at the low end to make it aligned.
#
# The raw speeds without this alignment switch are as follows.
#
#    alignment dst/src1/src2, A=0mod8, N=4mod8
#       A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
#
# K6   1.5   2.0   1.5   2.0      and,andn,ior,xor
# K6   1.75  2.2   2.0   2.28     iorn,xnor
# K6   2.0   2.25  2.35  2.28     nand,nior
#
#
# Future:
#
# K6 can do one 64-bit load per cycle so each of these routines should be
# able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
# able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
# The others are 4 instructions per 2 limbs, and so can only approach 1.0
# because there's nowhere to hide some loop control.
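
# As a rough illustration of the neg_src2/neg_dst flags selected by
# M4_choose_op above, the following C reference loops show the same base
# "or" operation with each complement applied.  This is a sketch only, not
# part of this file's generated code; the "ref_" names are hypothetical and
# the mp_ptr, mp_srcptr, mp_size_t types are assumed from gmp.h.
#
#	void
#	ref_iorn_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, mp_size_t size)
#	{
#	  mp_size_t  i;
#	  for (i = 0; i < size; i++)
#	    dst[i] = src1[i] | ~src2[i];    /* por/orl with neg_src2=1 */
#	}
#
#	void
#	ref_nior_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, mp_size_t size)
#	{
#	  mp_size_t  i;
#	  for (i = 0; i < size; i++)
#	    dst[i] = ~(src1[i] | src2[i]);  /* por/orl with neg_dst=1 */
#	}
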
defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

	.text
	ALIGN(32)

PROLOGUE(M4_function)
	movl	PARAM_SIZE, %ecx
	pushl	%ebx
FRAME_pushl()

	movl	PARAM_SRC1, %eax
	movl	PARAM_SRC2, %ebx

	cmpl	$1, %ecx
	movl	PARAM_DST, %edx
	ja	L(two_or_more)


	movl	(%ebx), %ecx
	popl	%ebx
ifelse(M4_i_neg_src2,1,`notl	%ecx')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl	%ecx')
	movl	%ecx, (%edx)

	ret


L(two_or_more):
	# eax	src1
	# ebx	src2
	# ecx	size
	# edx	dst
	# esi
	# edi
	# ebp
	#
	# carry bit is low of size

	pushl	%esi
FRAME_pushl()

	testl	$4, %eax
	jz	L(alignment_ok)

	movl	(%ebx), %esi
	addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl	%esi')
	M4_i	(%eax), %esi
	addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl	%esi')
	movl	%esi, (%edx)
	addl	$4, %edx
	decl	%ecx

L(alignment_ok):
	movl	%ecx, %esi
	shrl	%ecx
	jnz	L(still_two_or_more)

	movl	(%ebx), %ecx
	popl	%esi
ifelse(M4_i_neg_src2,1,`notl	%ecx')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl	%ecx')
	popl	%ebx
	movl	%ecx, (%edx)

	ret


L(still_two_or_more):
ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
	pcmpeqd	%mm7, %mm7	# all ones
')

	ALIGN(16)
L(top):
	# eax	src1
	# ebx	src2
	# ecx	counter
	# edx	dst
	# esi
	# edi
	# ebp
	#
	# carry bit is low of size

	movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
	M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
	movq	%mm0, -8(%edx,%ecx,8)

	loop	L(top)


	jnc	L(no_extra)

	movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl	%ebx')
	M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl	%ebx')
	movl	%ebx, -4(%edx,%esi,4)
L(no_extra):

	popl	%esi
	popl	%ebx
	emms_or_femms
	ret

EPILOGUE()
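
# The control flow of the routine above, as a rough C sketch.  This is
# illustrative only and not part of the library: the function name, the
# OP() macro (standing for whichever of the eight operations was selected)
# and the variable names are hypothetical.
#
#	#define OP(a,b)  ((a) & (b))            /* example: mpn_and_n */
#
#	void
#	sketch_logop_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
#	                mp_size_t size)
#	{
#	  mp_size_t  pairs;
#
#	  if (size == 1)                        /* single limb, integer op only */
#	    { dst[0] = OP (src1[0], src2[0]); return; }
#
#	  if (((unsigned long) src1 & 4) != 0)  /* src1 is 4mod8: peel one limb */
#	    { dst[0] = OP (src1[0], src2[0]); dst++; src1++; src2++; size--; }
#
#	  pairs = size >> 1;
#	  if (pairs == 0)                       /* one limb left after the peel */
#	    { dst[0] = OP (src1[0], src2[0]); return; }
#
#	  while (pairs != 0)                    /* movq loop, 2 limbs at a time, */
#	    {                                   /* running from the top down     */
#	      dst[2*pairs-2] = OP (src1[2*pairs-2], src2[2*pairs-2]);
#	      dst[2*pairs-1] = OP (src1[2*pairs-1], src2[2*pairs-1]);
#	      pairs--;
#	    }
#
#	  if ((size & 1) != 0)                  /* odd high limb done separately */
#	    dst[size-1] = OP (src1[size-1], src2[size-1]);
#	}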