# AMD K6 mpn_divexact_by3 -- mpn division by 3, expecting no remainder.
#
# K6: 11.0 cycles/limb


# Copyright (C) 2000 Free Software Foundation, Inc.
#
# This file is part of the GNU MP Library.
#
# The GNU MP Library is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# The GNU MP Library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
# License for more details.
#
# You should have received a copy of the GNU Library General Public License
# along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
# the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
# MA 02111-1307, USA.


include(`../config.m4')


# mp_limb_t mpn_divexact_by3 (mp_ptr dst, mp_srcptr src, mp_size_t size);
#
# Divide src,size by 3 and store the quotient in dst,size.  If src,size
# isn't exactly divisible by 3 the result in dst,size won't be very useful.
# The return value is 0 if src,size was divisible by 3, or non-zero if not.
#
# Using %esi in the (%esi,%ecx,4) or 0(%esi,%ecx,4) addressing mode doesn't
# lead to vector decoding, whereas plain (%esi) does.

# Names for the three stack parameters (size, src, dst).
# NOTE(review): defframe comes from the included m4 files, not from this
# file; the numbers look like byte offsets from %esp at entry, with the
# return address at offset 0 -- confirm against its definition.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
# FRAME starts at 0 here.  NOTE(review): presumably it tracks the number of
# bytes pushed since entry and is advanced by each defframe_pushl -- confirm.
deflit(`FRAME',0)

dnl  multiplicative inverse of 3, modulo 2^32
dnl  (3 * 0xAAAAAAAB = 0x200000001, which is 1 modulo 2^32)
deflit(INVERSE_3, 0xAAAAAAAB)

	.text
	ALIGN(32)

# mpn_divexact_by3 entry point.
#
# Method: limbs are processed in ascending address order, src[0] first,
# with a carry c that starts at 0.  For each source limb s:
#
#	t = s - c		borrow out b is 0 or 1
#	q = t * INVERSE_3	low 32 bits only
#	c = b + high32(q * 3)	0 to 3
#
# q is stored as the quotient limb.  The final c is returned in %eax.
#
# %esi, %edi and %ebx are saved on the stack and restored before returning.
# %eax, %ecx and %edx are used without being saved.
#
# NOTE(review): the body of the main loop runs before the counter is
# tested, so a size of 0 is not handled -- confirm callers always pass
# size >= 1.

PROLOGUE(mpn_divexact_by3)

	# Fetch the three parameters, interleaved with saving the registers
	# that are restored at exit.
	movl	PARAM_SIZE, %ecx
	pushl	%esi		defframe_pushl(SAVE_ESI)

	movl	PARAM_SRC, %esi
	pushl	%edi		defframe_pushl(SAVE_EDI)

	movl	PARAM_DST, %edi
	pushl	%ebx		defframe_pushl(SAVE_EBX)

	# Point %esi (and %edi below) just past the end of the array, so
	# that a negative counter rising to zero addresses every limb.
	leal	(%esi,%ecx,4), %esi
	# Carry starts at zero.  This also clears the upper three bytes of
	# %ebx, which the setc inside the loop relies on.
	xorl	%ebx, %ebx

	# Keep the constant 3 in a stack slot: one-operand mull takes a
	# register or memory operand, not an immediate.
	pushl	$3		defframe_pushl(VAR_THREE)
	leal	(%edi,%ecx,4), %edi

	# %ecx = -size
	negl	%ecx


	ALIGN(32)	# need 32 for claimed speed
L(top):
	# eax	scratch, low product
	# ebx	carry limb (0 to 3)
	# ecx	counter, limbs, negative
	# edx	scratch, high product
	# esi	&src[size]
	# edi	&dst[size]
	# ebp
	#
	# The 0(%esi,%ecx,4) pads so the finishup instructions are on a 32
	# byte boundary, saving a couple of cycles (that's a fixed couple,
	# not per loop).

	movl	0(%esi,%ecx,4), %eax
	# t = s - c, with the carry flag holding the borrow out.
	subl	%ebx, %eax

	# %ebx = borrow (0 or 1).  Only %bl is written; the upper bytes are
	# already zero because %ebx never exceeds 3.
	setc	%bl

	# q = t * INVERSE_3 modulo 2^32
	imull	$INVERSE_3, %eax

	movl	%eax, (%edi,%ecx,4)
	# The loop instruction below subtracts 1 from %ecx, so adding 2 here
	# gives a net step of +1 limb per iteration.
	addl	$2, %ecx

	# %edx:%eax = q * 3.  The high word in %edx is 0, 1 or 2.
	mull	VAR_THREE

	# next carry = borrow + high word
	addl	%edx, %ebx
	# Decrement %ecx and repeat until it reaches zero.
	loop	L(top)


	# Finish up: restore the saved registers from their frame slots and
	# return the final carry in %eax.
	movl	SAVE_ESI, %esi
	movl	%ebx, %eax

	movl	SAVE_EBX, %ebx

	movl	SAVE_EDI, %edi
	# Drop the four words pushed above.
	# NOTE(review): assumes FRAME now equals the total bytes pushed, as
	# maintained by defframe_pushl in the included m4 files -- confirm.
	addl	$FRAME, %esp

	ret

EPILOGUE()