//------------------------------------------------------------------------------
//
// File    : MPEGSUBB.a
//
// Author  : Stéphane TAVENARD
//
// (C) Copyright 1997-1998 Stéphane TAVENARD
//     All Rights Reserved
//
// #Rev| Date     | Comment
// ----|----------|--------------------------------------------------------
// 0   |10/04/1997| Initial revision                                   ST
// 1   |18/05/1997| Optimized windowing for '060                       ST
// 2   |01/11/1997| Use link instead of static vars                    ST
// 3   |08/04/1998| Use overflow optimization from ... (IRC man)       ST
// 4   |21/06/1998| Use external dewindow                              ST
//
// ------------------------------------------------------------------------
//
// MPEG SUBroutines optimized !
//
// This file is run through the C preprocessor before assembly.  Inside
// function-like macros a bare '#' would be taken as the stringify operator,
// which is why immediates are spelled through the IMM / IMMED macros below.
//
//------------------------------------------------------------------------------

// Entry points exported by this file
        .globl  MPEGSUBB_antialias
        .globl  MPEGSUBB_filter_band
        .globl  MPEGSUBB_window_band

        .text

// Fixed-point scaling constants
#define INT_FACTOR 14
#define ALIAS_BITS 15

// IMMED and IMM both expand to the immediate prefix '#'.
// _IMM expands to nothing: it only tags a macro argument as "this is an
// immediate"; the macro body supplies the '#' itself through IMM.
#define IMMED #
#define IMM #
#define _IMM

//
// DBRA counter,label
//   decrement <counter> (long) by one and branch to <label> while the
//   result is still >= 0, so a counter preset to N-1 runs the loop N times.
//
#define DBRA(x,y) \
        subq.l IMM 1, x; \
        jbge y

/* I tried using the coldfire MAC unit to improve these and it slowed things down. */
// Cycle counting indicated 10 without the MAC and 9 with, so there must be
// some fun stuff going on.

//
// ALIAS_MIX op,a,ca,b,cb,tmp,res
//   common body of ALIAS_U and ALIAS_D:
//     res = ((a * ca) op (b * cb)) >> d6       (op is sub.l or add.l)
//   a and b are read as 16-bit words, ca and cb are immediate constants,
//   tmp is a scratch data register.
//   d6 must be set to ALIAS_BITS
//
#define ALIAS_MIX(op,p1,p2,p3,p4,p5,p6) \
        move.w  p1,p6; \
        muls.w  IMM p2,p6; \
        move.w  p3,p5; \
        muls.w  IMM p4,p5; \
        op      p5,p6; \
        asr.l   d6,p6

//
// ALIAS_U bu,csi,bd,cai,tmp,res
//   performs: (bu * csi) - (bd * cai) -> res
//   tmp is used for calculation
//   d6 must be set to ALIAS_BITS
//
#define ALIAS_U(p1,p2,p3,p4,p5,p6) ALIAS_MIX(sub.l,p1,p2,p3,p4,p5,p6)

//
// ALIAS_D bd,csi,bu,cai,tmp,res
//   performs: (bd * csi) + (bu * cai) -> res
//   tmp is used for calculation
//   d6 must be set to ALIAS_BITS
//
#define ALIAS_D(p1,p2,p3,p4,p5,p6) ALIAS_MIX(add.l,p1,p2,p3,p4,p5,p6)

// Register names of the two sample pointers walked by ALIAS_B
#define bd a1
#define bu a2

//
// ALIAS_B csi,cai
//   perform 1 alias butterfly:
//     pre-decrement bu, read the word there and the word at bd,
//     store ALIAS_U of the pair back at bu and ALIAS_D of the pair at bd,
//     then post-increment bd.
//   uses d0..d3
//
#define ALIAS_B(p1,p2) \
        move.w  -(bu),d2 ; \
        move.w  (bd),d3 ; \
        ALIAS_U (d2,p1,d3,p2,d0,d1) ; \
        move.w  d1,(bu) ; \
        ALIAS_D (d3,p1,d2,p2,d0,d1) ; \
        move.w  d1,(bd)+

// Layer III antialiasing filter
//
// Arguments are read from the stack:
//   sp@(4) -> a0 = sample buffer xr (INT16)
//   sp@(8) -> d0 = sblimit (only the low word is used)
//
// For each of the sblimit iterations the index advances by 18 samples and
// eight butterflies are applied around xr[ i ].  Nothing is done when the
// low word of sblimit is 0.
// d2-d7/a2 are saved and restored; d0, d1, a0 and a1 are clobbered.
//
MPEGSUBB_antialias:
        move.l  sp@(4), a0
        move.l  sp@(8), d0
        sub.l   #28,sp                  // room for the 7 saved registers
        movem.l d2-d7/a2,(sp)

        clr.l   d7
        move.w  d0,d7                   // d7 = sblimit, zero extended
        subq.l  #1,d7                   // loop counter for DBRA
        jbmi    antialias_exit          // sblimit was 0: nothing to do
        clr.l   d5                      // d5 = i = 0
        moveq.l #ALIAS_BITS,d6          // for alias mult
antialias_band:
        add.l   #18,d5                  // i += SSLIMIT
        lea     (a0,d5.l*2),bu          // bu = xr[ i ]
        move.l  bu,bd                   // bd = bu
        ALIAS_B (_IMM 28098,_IMM -16858)
        ALIAS_B (_IMM 28892,_IMM -15457)
        ALIAS_B (_IMM 31117,_IMM -10268)
        ALIAS_B (_IMM 32221,_IMM -5960)
        ALIAS_B (_IMM 32621,_IMM -3099)
        ALIAS_B (_IMM 32740,_IMM -1342)
        ALIAS_B (_IMM 32764,_IMM -465)
        ALIAS_B (_IMM 32767,_IMM -121)
        DBRA    (d7, antialias_band)
antialias_exit:
        movem.l (sp),d2-d7/a2
        add.l   #28,sp
        rts

// Pointer registers shared by the fast filter macros and routines
#define ps1 a3
#define ps2 a4
#define pd1 a5
//#define pd2 a6 // #2

//
// Filter COS values for fast cosine transform
//
// Each constant is the value given in its trailing comment shifted right
// by SH bits.
//
#define SH 1 // In order to preserve bits in multiply
#define SHIFT(x) ((x)/2)
#define MUL_SHIFT 16-SH

#define COS1_64  0x4014  // 0x8028>>SH
#define COS3_64  0x40b3  // 0x8167>>SH
#define COS5_64  0x41fa  // 0x83f4>>SH
#define COS7_64  0x43f9  // 0x87f2>>SH
#define COS9_64  0x46cc  // 0x8d98>>SH
#define COS11_64 0x4a9d  // 0x953b>>SH
#define COS13_64 0x4fae  // 0x9f5c>>SH
#define COS15_64 0x5660  // 0xacc0>>SH
#define COS17_64 0x5f4d  // 0xbe9a>>SH
#define COS19_64 0x6b70  // 0xd6e0>>SH
#define COS21_64 0x7c7d  // 0xf8fa>>SH
#define COS23_64 0x95b0  // 0x12b60>>SH
#define COS25_64 0xbdf9  // 0x17bf2>>SH
#define COS27_64 0x10765 // 0x20ecb>>SH
#define COS29_64 0x1b42c // 0x36859>>SH
#define COS31_64 0x51852 // 0xa30a4>>SH

#define COS1_32  0x404f  // 0x809f>>SH
#define COS3_32  0x42e1  // 0x85c2>>SH
#define COS5_32  0x4891  // 0x9123>>SH
#define COS7_32  0x52cb  // 0xa596>>SH
#define COS9_32  0x64e2  // 0xc9c5>>SH
#define COS11_32 0x87c4  // 0x10f89>>SH
#define COS13_32 0xdc79  // 0x1b8f2>>SH
#define COS15_32 0x28cf2 // 0x519e5>>SH

#define COS1_16  0x4141  // 0x8282>>SH
#define COS3_16  0x4cf9  // 0x99f2>>SH
#define COS5_16  0x7332  // 0xe665>>SH
#define COS7_16  0x1480d // 0x2901b>>SH

#define COS1_8   0x4546  // 0x8a8c>>SH
#define COS3_8   0xa73d  // 0x14e7b>>SH

#define COS1_4   0x5a82  // 0xb505>>SH

//
// MUL32I value,reg
//   Multiply an immediate constant with a register appropriately scaled:
//     reg = (reg * value) >> d6
//   The constant is staged through d5, which is therefore clobbered.
//
#define MUL32I(p1,p2) \
        move.l  p1, d5; \
        muls.l  d5, p2; \
        asr.l   d6, p2

//
// MUL32AM ea,reg
//   Multiply against a register with appropriate scaling:
//     reg = (reg * ea) >> d6
//   We can do almost every addressing mode here except for immediate :-(
//
#define MUL32AM(p1,p2) \
        muls.l  p1, p2; \
        asr.l   d6, p2

//
// FFF_MUL COSx,half
//   a = (ps1)+ ; b = -(ps2)
//   a + b            -> (pd1)+
//   (a - b) * COSx   -> longword (half - 1) entries past the advanced pd1
//   d0, d1, d2 and d5 are modified
//
#define FFF_MUL(p1,p2) \
        move.l  (ps1)+,d0 ; \
        move.l  d0,d1 ; \
        move.l  -(ps2),d2 ; \
        add.l   d2,d1 ; \
        move.l  d1,(pd1)+ ; \
        sub.l   d2,d0 ; \
        MUL32I  (IMMED p1,d0) ; \
        move.l  d0,p2*4-4(pd1)

//
// FF4_MUL s1,s2,s3,s4
//   s1..s4 must be registers
//   d0, d5, s1, s2 modified
//
//   s1 + s4 -> (pd1)+
//   s2 + s3 -> (pd1)+
//   (s1 - s4) * COS1_8 -> (pd1)+
//   (s2 - s3) * COS3_8 -> (pd1)+
//
#define FF4_MUL(p1,p2,p3,p4) \
        move.l  p1,d0 ; \
        add.l   p4,d0 ; \
        move.l  d0,(pd1)+ ; \
        move.l  p2,d0 ; \
        add.l   p3,d0 ; \
        move.l  d0,(pd1)+ ; \
        sub.l   p4,p1 ; \
        MUL32I  (IMMED COS1_8,p1) ; \
        move.l  p1,(pd1)+ ; \
        sub.l   p3,p2 ; \
        MUL32I  (IMMED COS3_8,p2) ; \
        move.l  p2,(pd1)+

//
// FF2_MUL
//      s1,s2,COSx          <- operand list of the FF2_MUL macro defined below
//   s1..s2, COSx must be registers
//   d0, s1 modified
//
//   s1 + s2 -> (pd1)+
//   (s1 - s2) * COSx -> (pd1)+
//
#define FF2_MUL(p1,p2,p3) \
        move.l  p1,d0 ; \
        add.l   p2,d0 ; \
        move.l  d0,(pd1)+ ; \
        sub.l   p2,p1 ; \
        MUL32AM (p3,p1) ; \
        move.l  p1,(pd1)+

// Offsets, relative to the caller's frame pointer a6, of the two scratch
// arrays of 16 longwords used by fast_filter_sub (the static filter_p and
// filter_pp variables of the revisions before #2).
#define FFS_P   -32*4
#define FFS_PP  -16*4

// Load four consecutive longwords from (ps1)+ into d1..d4 and run one
// FF4_MUL on them.
#define FFS_FF4_STEP \
        move.l  (ps1)+, d1 ; \
        move.l  (ps1)+, d2 ; \
        move.l  (ps1)+, d3 ; \
        move.l  (ps1)+, d4 ; \
        FF4_MUL (d1,d2,d3,d4)

//
// fast_filter_sub
//   Runs the butterfly stages over the 16 longwords p(0..15), ping-ponging
//   between the p and pp arrays; the result ends up in p.
//
//   In:   a6 = frame pointer, with p at -32*4(a6) and pp at -16*4(a6)
//         d6 = scaling shift used by the MUL32 macros
//   Uses: d0-d5, ps1, ps2, pd1
//
fast_filter_sub:
        // stage 1: p -> pp, 16 points
        lea     FFS_P(a6),ps1           // ps1=@p(0)
        lea     16*4(ps1),ps2           // ps2=@p(16)
        lea     FFS_PP(a6),pd1          // pd1=@pp(0)
        FFF_MUL (COS1_32,8)             // pp(i=0..7) = p(i) + p(15-i)
        FFF_MUL (COS3_32,8)             // pp(i=8..15) = COSx*[p(i) - p(15-i)]
        FFF_MUL (COS5_32,8)
        FFF_MUL (COS7_32,8)
        FFF_MUL (COS9_32,8)
        FFF_MUL (COS11_32,8)
        FFF_MUL (COS13_32,8)
        FFF_MUL (COS15_32,8)

        // stage 2: pp -> p, two groups of 8 points
        lea     FFS_PP(a6),ps1          // ps1=@pp(0)
        lea     8*4(ps1),ps2            // ps2=@pp(8)
        lea     FFS_P(a6),pd1           // pd1=@p(0)
        FFF_MUL (COS1_16,4)             // p(i=0..3) = pp(i) + pp(7-i)
        FFF_MUL (COS3_16,4)             // p(i=4..7) = COSx*[pp(i) - pp(7-i)]
        FFF_MUL (COS5_16,4)
        FFF_MUL (COS7_16,4)
        lea     4*4(ps1),ps1            // ps1=@pp(8)
        lea     8*4(ps1),ps2            // ps2=@pp(16)
        lea     4*4(pd1),pd1            // pd1=@p(8)
        FFF_MUL (COS1_16,4)
        FFF_MUL (COS3_16,4)
        FFF_MUL (COS5_16,4)
        FFF_MUL (COS7_16,4)

        // stage 3: p -> pp, four groups of 4 points
        lea     FFS_P(a6),ps1           // ps1=@p(0)
        lea     FFS_PP(a6),pd1          // pd1=@pp(0)
        FFS_FF4_STEP                    // p0..p3
        FFS_FF4_STEP                    // p4..p7
        FFS_FF4_STEP                    // p8..p11
        FFS_FF4_STEP                    // p12..p15

        // stage 4: pp -> p, eight pairs, all scaled by COS1_4
        lea     FFS_PP(a6),ps1          // ps1=@pp(0)
        lea     FFS_P(a6),pd1           // pd1=@p(0)
        move.l  #COS1_4,d3
        move.l  #8-1,d4
fast_filter_pairs:
        move.l  (ps1)+, d1
        move.l  (ps1)+, d2
        FF2_MUL (d1,d2,d3)
        DBRA    (d4,fast_filter_pairs)
        rts

//
// GET_P
//
//   copy p[ index ] into a destination      (GET_P index,dest)
//   p is ps1
//
#define GET_P(p1,p2) move.l p1*4(ps1),p2

//
// ADD_P index,dest
//   add p[ index ] to a destination
//   p is ps1
//
#define ADD_P(p1,p2) add.l p1*4(ps1),p2

//
// SET_S0 index,source
//   copy the low word of a source into s0[ index ]
//   s0 is a1; one index step is 16 words
//   the value travels through d0, which is saved and restored on the stack
//
#define SET_S0(p1,p2) \
        move.l  d0, -(sp); \
        move.w  p2, d0; \
        move.w  d0,p1*2*16(a1); \
        move.l  (sp)+, d0

//
// SET_S1 index,source
//   copy the low word of a source into s1[ index ]
//   s1 is a2
//
#define SET_S1(p1,p2) move.w p2,p1*2*16(a2)

//
// SET_S0_P index,source
//   copy source into s0[ index ]
//   copy -source into s0[ 32-index ]
//   s0 is a1
//   source is modified (negated)
//
#define SET_S0_P(p1,p2) \
        move.w  p2,p1*2*16(a1) ; \
        neg.l   p2 ; \
        move.w  p2,-p1*2*16+1024(a1)

//
// SET_S1_P index,source
//   copy -source into s1[ index ]
//   copy -source into s1[ 32-index ]
//   s1 is a2
//   source is modified (negated)
//
#define SET_S1_P(p1,p2) \
        neg.l   p2 ; \
        move.w  p2,p1*2*16(a2) ; \
        move.w  p2,-p1*2*16+1024(a2)

// Offsets, relative to the frame pointer a6 set up by the link below, of
// the two scratch arrays of 16 longwords (p and pp).
#define FB_P    -32*4
#define FB_PP   -16*4

// Apply the FAST synthesis filter to a sub band
// Generate full frequency sample
//
// Arguments are read from the stack:
//   sp@(4)  -> a0: bandPtr (=fraction), read as 16-bit words
//   sp@(8)  -> a1: out_filter_buffer 0 (s0)
//   sp@(12) -> a2: out_filter_buffer 1 (s1)
//   sp@(16) -> d0.w: freq_div, kept in d7                          // #2
//
// freq_div selects the path: 4 = quarter rate (8 input words),
// 2 = half rate (16 input words), 1 = full rate (32 input words, two
// passes); any other value takes the single-pass full-rate path.
//
// registers allocation: d6=MUL_SHIFT, d7=freq_div, a6=frame with p / pp
// d2-d7/a2-a6 are saved and restored.
//
MPEGSUBB_filter_band:
        move.l  sp@(4), a0
        move.l  sp@(8), a1
        move.l  sp@(12), d1
        move.l  sp@(16), d0
        ext.l   d0
        sub.l   #44, sp                 // room for the 11 saved registers
        movem.l d2-d7/a2-a6,(sp)
        move.l  d1, a2
        move.l  d0,d7                   // #2

        link    a6,#-32*4               // #2 need of 2*16 longs
        moveq.l #MUL_SHIFT,d6           // For MUL32 normalize shift

        move.l  a0,ps1                  // ps1=fraction(0)
        lea     32*2(ps1),ps2           // ps2=fraction(32)
        lea     FB_P(a6),pd1            // pd1=@p(0)

        cmp.l   #4,d7                   // #2
        jbne    filter_band1

        // Filter only 1/4 freq
        move.l  #8-1,d2
filter_band_q1:                         // p(i=0..7) = f(i)
        move.w  (ps1)+,d0
        ext.l   d0
        move.l  d0,(pd1)+
        DBRA    (d2,filter_band_q1)

        lea     FB_P(a6),ps1            // ps1=@p(0)
        lea     8*4(ps1),ps2            // ps2=@p(8)
        lea     FB_PP(a6),pd1           // pd1=@pp(0)
        FFF_MUL (COS1_16,4)             // pp(i=0..3) = p(i) + p(7-i)
        FFF_MUL (COS3_16,4)             // pp(i=4..7) = COSx*[p(i) - p(7-i)]
        FFF_MUL (COS5_16,4)
        FFF_MUL (COS7_16,4)

        lea     FB_PP(a6),ps1           // ps1=@pp(0)
        lea     FB_P(a6),pd1            // pd1=@p(0)
        move.l  (ps1)+, d1              // get pp0..pp3
        move.l  (ps1)+, d2
        move.l  (ps1)+, d3
        move.l  (ps1)+, d4
        FF4_MUL (d1,d2,d3,d4)
        move.l  (ps1)+, d1              // get pp4..pp7
        move.l  (ps1)+, d2
        move.l  (ps1)+, d3
        move.l  (ps1)+, d4
        FF4_MUL (d1,d2,d3,d4)

        // last stage runs in place on p: each pair is read before it is
        // written back
        lea     FB_P(a6),ps1            // ps1=@p(0)
        lea     FB_P(a6),pd1            // pd1=@p(0)
        move.l  #COS1_4,d3
        move.l  #4-1,d4
fast_filter_q2:
        move.l  (ps1)+,d1
        move.l  (ps1)+,d2
        FF2_MUL (d1,d2,d3)
        DBRA    (d4,fast_filter_q2)

        // scatter p(0..7) into s0 / s1
        lea     FB_P(a6),ps1            // ps1=@p(0)

        GET_P   (1,d1)
        SET_S0  (0,d1)
        neg.l   d1
        SET_S1  (0,d1)

        GET_P   (5,d1)
        ADD_P   (7,d1)
        SET_S0_P (4,d1)

        GET_P   (3,d1)
        SET_S0_P (8,d1)

        GET_P   (7,d1)
        SET_S0_P (12,d1)

        SET_S0  (16,#0)

        GET_P   (6,d3)
        ADD_P   (7,d3)
        move.l  d3,d4
        ADD_P   (5,d4)
        SET_S1_P (4,d4)

        GET_P   (2,d4)
        ADD_P   (3,d4)
        SET_S1_P (8,d4)

        move.l  d3,d4
        ADD_P   (4,d4)
        SET_S1_P (12,d4)

        GET_P   (0,d4)
        neg.l   d4
        SET_S1  (16,d4)

        jbra    filter_band9

filter_band1:
        cmp.l   #2,d7                   // #2
        jbne    filter_band2

        // Filter only 1/2 freq
        move.l  #16-1,d2
filter_band_h1:                         // p(i=0..15) = f(i)
        move.w  (ps1)+,d0
        ext.l   d0
        move.l  d0,(pd1)+
        DBRA    (d2,filter_band_h1)
        jbra    filter_band3

filter_band2:
        // Filter full freq
        move.l  #16-1,d2
filter_band_f1:                         // p(i=0..15) = f(i) + f(31-i)
        move.w  (ps1)+,d0
        move.w  -(ps2),d1
        ext.l   d0
        ext.l   d1
        add.l   d0,d1
        move.l  d1,(pd1)+
        DBRA    (d2,filter_band_f1)

filter_band3:
        jbsr    fast_filter_sub

        // first pass: fill the even indices of s0 / s1 (plus index 16)
        lea     FB_P(a6),ps1            // ps1=@p(0)

        GET_P   (13,d0)
        ADD_P   (15,d0)                 // d0 = p13 + p15

        GET_P   (1,d1)
        SET_S0  (0,d1)
        neg.l   d1
        SET_S1  (0,d1)

        move.l  d0,d1
        ADD_P   (9,d1)
        SET_S0_P (2,d1)

        GET_P   (5,d1)
        ADD_P   (7,d1)
        SET_S0_P (4,d1)

        move.l  d0,d1
        ADD_P   (11,d1)
        SET_S0_P (6,d1)

        GET_P   (3,d1)
        SET_S0_P (8,d1)

        GET_P   (11,d1)
        ADD_P   (15,d1)
        SET_S0_P (10,d1)

        GET_P   (7,d1)
        SET_S0_P (12,d1)

        GET_P   (15,d1)
        SET_S0_P (14,d1)

        SET_S0  (16,#0)

        ADD_P   (14,d0)                 // d0 = p13 + p14 + p15
        GET_P   (12,d1)
        ADD_P   (14,d1)
        ADD_P   (15,d1)                 // d1 = p12 + p14 + p15
        GET_P   (10,d2)
        ADD_P   (11,d2)                 // d2 = p10 + p11
        GET_P   (6,d3)
        ADD_P   (7,d3)                  // d3 = p6 + p7

        move.l  d0,d4
        ADD_P   (9,d4)
        SET_S1_P (2,d4)

        move.l  d3,d4
        ADD_P   (5,d4)
        SET_S1_P (4,d4)

        move.l  d0,d4
        add.l   d2,d4
        SET_S1_P (6,d4)

        GET_P   (2,d4)
        ADD_P   (3,d4)
        SET_S1_P (8,d4)

        move.l  d1,d4
        add.l   d2,d4
        SET_S1_P (10,d4)

        move.l  d3,d4
        ADD_P   (4,d4)
        SET_S1_P (12,d4)

        move.l  d1,d4
        ADD_P   (8,d4)
        SET_S1_P (14,d4)

        GET_P   (0,d4)
        neg.l   d4
        SET_S1  (16,d4)

        cmp.l   #1,d7                   // #2
        jbne    filter_band9            // only full rate needs the 2nd pass

        // second pass (full rate only): fill the odd indices of s0 / s1
        move.l  a0,ps1                  // ps1=fraction(0)
        lea     32*2(ps1),ps2           // ps2=fraction(32)
        lea     FB_P(a6),pd1            // pd1=@p(0)
        lea     pc@(filter_cos64),a0    // a0 is free now: walk the cos table
        move.l  #16-1,d2
filter_band_f2:                         // p(i=0..15) = COSx*[f(i) - f(31-i)]
        move.w  (ps1)+,d0
        move.w  -(ps2),d1
        ext.l   d0
        ext.l   d1
        sub.l   d1,d0
        MUL32AM ((a0)+,d0)
        move.l  d0,(pd1)+
        DBRA    (d2,filter_band_f2)

        jbsr    fast_filter_sub

        lea     FB_P(a6),ps1            // ps1=@p(0)

        GET_P   (13,d0)
        ADD_P   (15,d0)                 // d0 = p13 + p15
        GET_P   (11,d1)
        ADD_P   (15,d1)                 // d1 = p11 + p15
        GET_P   (5,d2)
        ADD_P   (7,d2)                  // d2 = p5 + p7

        move.l  d0,d3
        ADD_P   (9,d3)
        move.l  d3,d4
        ADD_P   (1,d4)
        SET_S0_P (1,d4)

        move.l  d2,d4
        add.l   d3,d4
        SET_S0_P (3,d4)

        move.l  d0,d3
        ADD_P   (11,d3)
        move.l  d3,d4
        add.l   d2,d4
        SET_S0_P (5,d4)

        move.l  d3,d4
        ADD_P   (3,d4)
        SET_S0_P (7,d4)

        move.l  d1,d4
        ADD_P   (3,d4)
        SET_S0_P (9,d4)

        move.l  d1,d4
        ADD_P   (7,d4)
        SET_S0_P (11,d4)

        GET_P   (7,d4)
        ADD_P   (15,d4)
        SET_S0_P (13,d4)

        GET_P   (15,d4)
        SET_S0_P (15,d4)

        ADD_P   (14,d0)                 // d0 = p13 + p14 + p15
        GET_P   (12,d1)
        ADD_P   (14,d1)
        ADD_P   (15,d1)                 // d1 = p12 + p14 + p15
        GET_P   (10,d2)
        ADD_P   (11,d2)                 // d2 = p10 + p11
        GET_P   (6,d3)
        ADD_P   (7,d3)                  // d3 = p6 + p7

        GET_P   (1,d4)
        ADD_P   (9,d4)
        add.l   d0,d4
        SET_S1_P (1,d4)

        GET_P   (5,d5)
        add.l   d3,d5
        add.l   d0,d5
        GET_P   (9,d4)
        add.l   d5,d4
        SET_S1_P (3,d4)

        move.l  d5,d4
        add.l   d2,d4
        SET_S1_P (5,d4)

        GET_P   (2,d5)
        ADD_P   (3,d5)
        add.l   d2,d5
        move.l  d0,d4
        add.l   d5,d4
        SET_S1_P (7,d4)

        move.l  d1,d4
        add.l   d5,d4
        SET_S1_P (9,d4)

        GET_P   (4,d5)
        add.l   d3,d5
        add.l   d1,d5
        move.l  d2,d4
        add.l   d5,d4
        SET_S1_P (11,d4)

        GET_P   (8,d4)
        add.l   d5,d4
        SET_S1_P (13,d4)

        GET_P   (0,d4)
        ADD_P   (8,d4)
        add.l   d1,d4
        SET_S1_P (15,d4)

filter_band9:
        unlk    a6                      // #2
        movem.l (sp),d2-d7/a2-a6
        add.l   #44,sp
        rts

// Multipliers read sequentially by the second pass of the full-rate path
filter_cos64:
        dc.l    COS1_64, COS3_64, COS5_64, COS7_64
        dc.l    COS9_64, COS11_64, COS13_64, COS15_64
        dc.l    COS17_64, COS19_64, COS21_64, COS23_64
        dc.l    COS25_64, COS27_64, COS29_64, COS31_64

// #2 Begin
// pcm_loops[ freq_div ] = (32 / freq_div) - 1
// (entries 0 and 5 are 0)
pcm_loops:
        dc.w    0, 31, 15, 9, 7, 0
// #2 End

#define WINDOW_CLIP 1

//
// Window a sub band filtered sample
//
// Arguments are read from the stack (offsets are those seen after the
// 44-byte register save area has been reserved):
//   sp@(48) -> a0: out_filter_buffer
//   sp@(52) -> a1: out_sample_buffer
//   sp@(56) -> a2: dewindow (##4)
//   sp@(60) -> d0: buffer offset
//   sp@(64) -> d1.w: w_begin (#2)
//   sp@(68) -> d2.w: w_width (#2)
//   sp@(72) -> d3.w: freq_div (#2)
//   sp@(76) -> d4.l: dew_shift (##4)
//
// One 16-bit sample is stored through (a1)+ per iteration, for
// pcm_loops[ freq_div ] + 1 iterations.  Each sample is the sum of cnt1
// products taken from buf1 and cnt0 products taken from buf0, every product
// using the next dewindow word, shifted right by
// (INT_FACTOR-15 + dew_shift); with WINDOW_CLIP defined the result is
// saturated to the signed 16-bit range.
// d2-d7/a2-a6 are saved and restored.
//
MPEGSUBB_window_band:
        sub.l   #44, sp                 // room for the 11 saved registers
        movem.l d2-d7/a2-a6,(sp)
        move.l  sp@(48), a0
        move.l  sp@(52), a1
        move.l  sp@(56), a2
        move.l  sp@(60), d0
        move.l  sp@(64), d1
        move.l  sp@(68), d2
        move.l  sp@(72), d3
        move.l  sp@(76), d4
        ext.l   d1
        ext.l   d2
        ext.l   d3

        move.l  d2,-(sp)                // #2  keep w_width at (sp)
        move.l  a2,a4                   // ##4
        lea.l   pc@(pcm_loops),a2       // #2
        move.w  (a2,d3.l*2),d6          // #2  d6 = sample loop counter
        ext.l   d6
        move.l  d3,d2                   // #2  d2 = freq_div
        lea.l   (a4,d1.l*2),a2          // ##4 a2 = &dewindow[ w_begin ]
        moveq.l # INT_FACTOR-15,d7      // ##4 External now
        add.l   d4,d7                   // ##4 new scale

        add.l   d0,d1
        and.l   #15,d1                  // d1 = start
        move.l  a0,a4                   // buf0 = &buf_ptr[ 0 ]
        lea     (a0,d1.l*2),a3          // buf1 = &buf_ptr[ start ]

        move.l  (sp),d3                 // #2  w_width
        add.l   d1,d3
        ext.l   d3
        cmp.l   #16,d3
        jble    window_band1
        move.l  #16,d3                  // d3 = top
window_band1:
        sub.l   d1,d3                   // d3 = cnt1
        move.l  (sp),d4                 // #2  w_width
        ext.l   d4
        sub.l   d3,d4                   // d4 = cnt0

        lsl.l   #4,d2                   // d2 = freq_div*16
        move.l  d2,d5
        sub.l   d3,d5                   // off1 = freq_div*16 - cnt1
        ext.l   d5
        add.l   d5,d5
        move.l  d5,a5                   // a5 = off1
        move.l  d2,d5
        sub.l   d4,d5                   // off0 = freq_div*16 - cnt0
        add.l   d5,d5
        move.l  d5,a6                   // a6 = off0
        sub.l   (sp),d2                 // offd = freq_div*16 - w_width (#2)
        ext.l   d2
        add.l   d2,d2

//
// MULADDN areg,count
//   call into the multiply-accumulate ladder for <areg> so that exactly
//   <count> MULADDWIN steps run before the rts
//
#define MULADDN(p1, p2) jbsr mulladdwin_ ## p1 ## _ ## p2

#ifdef COLDFIRE_MAC

/* This code isn't really set up for using the coldfire MAC unit.
 * It would be better if the in memory data items were all in .l format
 * then the macl instruction suite could be used to decrease execution
 * time fairly impressively.
 */

// MULADDWIN {a3|a4}
// The mac instruction in this one is "mac.w d5l, d1l"
#define MULADDWIN(p1) \
        move.w  (p1)+, d1; \
        move.w  (a2)+, d5; \
        dc.w    0xa205; \
        dc.w    0x0000

// MULADD0
// The mac instruction here is "move.l #0, acc"
#define MULADD0(p1) \
        dc.w    0xa13c; \
        dc.l    0; \
        MULADDN (a3,p1)

// MULADD1
// The mac instruction here is "move.l acc, d5"
#define MULADD1(p1) \
        add.l   a5,a3; \
        MULADDN (a4,p1); \
        dc.w    0xa185

#else // ! COLDFIRE_MAC

// MULADDWIN {a3|a4}
//   d5 += (next sample word) * (next dewindow word); d1 is scratch
#define MULADDWIN(p1) \
        move.w  (p1)+,d1; \
        muls.w  (a2)+,d1; \
        add.l   d1,d5

// MULADD0 count
//   clear the accumulator d5 and add <count> products taken from buf1 (a3)
#define MULADD0(p1) \
        clr.l   d5; \
        MULADDN (a3,p1)

// MULADD1 count
//   advance a3 by off1 and add <count> products taken from buf0 (a4)
#define MULADD1(p1) \
        add.l   a5,a3; \
        MULADDN (a4,p1)

#endif // COLDFIRE_MAC

// CHECKBOUNDS loop_label
//   advance a4 by off0 and a2 by offd, scale the accumulator by d7, store
//   it as a word through (a1)+, then loop to <loop_label> while d6 counts
//   down, finally leaving through window_band9.
#ifdef WINDOW_CLIP

// Clipping variant.  The scaled value is copied as a word into a0, which
// sign-extends it; if that differs from the full value the result does not
// fit in 16 bits and is replaced at 2: by 0x7FFF (value was positive) or
// 0x8000 (value was negative).  The X flag produced by doubling the
// inverted value carries the sign into the negx.
#define CHECKBOUNDS(p1) \
        add.l   a6,a4; \
        add.l   d2,a2; \
        asr.l   d7,d5; \
        move.w  d5,a0; \
        cmp.l   a0,d5; \
        bne.b   2f; \
1: \
        move.w  d5,(a1)+; \
        DBRA    (d6,p1); \
        jbra    window_band9; \
2: \
        not.l   d5; \
        add.l   d5,d5; \
        move.l  IMMED 0xFFFF8000,d5; \
        negx.l  d5; \
        bra.s   1b

#else /* WINDOW_CLIP */

#define CHECKBOUNDS(p1) \
        add.l   a6,a4; \
        add.l   d2,a2; \
        asr.l   d7,d5; \
        move.w  d5,(a1)+; \
        DBRA    (d6,p1); \
        jbra    window_band9

#endif /* WINDOW_CLIP */

// WINDOW_TAP self,cnt1,cnt0
//   body of one dispatch case: <cnt1> products from buf1, <cnt0> products
//   from buf0, then store the sample and loop back to <self>
#define WINDOW_TAP(self,n1,n0) \
        MULADD0 (n1); \
        MULADD1 (n0); \
        CHECKBOUNDS (self)

        // pick the dispatch table from w_width (<=4, <=8, otherwise 16 taps)
        move.l  (sp),d3                 // #2  w_width
        ext.l   d3
        lea     pc@(window_q0muls),a0
        cmp.l   #4,d3
        jble    window_qmul
        lea     pc@(window_q1muls),a0
        cmp.l   #8,d3
        jble    window_qmul
        lea     pc@(window_q2muls),a0
window_qmul:
        move.l  (a0,d4.l*4),a0          // table entry number cnt0
        // The table offsets are taken relative to window_q2mul0 (+ 2), so
        // no code may be placed between this jmp and window_q2mul0.
        jmp     (pc,a0)

window_q2mul0:  WINDOW_TAP (window_q2mul0, 16, 0)
window_q2mul1:  WINDOW_TAP (window_q2mul1, 15, 1)
window_q2mul2:  WINDOW_TAP (window_q2mul2, 14, 2)
window_q2mul3:  WINDOW_TAP (window_q2mul3, 13, 3)
window_q2mul4:  WINDOW_TAP (window_q2mul4, 12, 4)
window_q2mul5:  WINDOW_TAP (window_q2mul5, 11, 5)
window_q2mul6:  WINDOW_TAP (window_q2mul6, 10, 6)
window_q2mul7:  WINDOW_TAP (window_q2mul7, 9, 7)
window_q2mul8:  WINDOW_TAP (window_q2mul8, 8, 8)
window_q2mul9:  WINDOW_TAP (window_q2mul9, 7, 9)
window_q2mul10: WINDOW_TAP (window_q2mul10, 6, 10)
window_q2mul11: WINDOW_TAP (window_q2mul11, 5, 11)
window_q2mul12: WINDOW_TAP (window_q2mul12, 4, 12)
window_q2mul13: WINDOW_TAP (window_q2mul13, 3, 13)
window_q2mul14: WINDOW_TAP (window_q2mul14, 2, 14)
window_q2mul15: WINDOW_TAP (window_q2mul15, 1, 15)

window_q1mul0:  WINDOW_TAP (window_q1mul0, 8, 0)
window_q1mul1:  WINDOW_TAP (window_q1mul1, 7, 1)
window_q1mul2:  WINDOW_TAP (window_q1mul2, 6, 2)
window_q1mul3:  WINDOW_TAP (window_q1mul3, 5, 3)
window_q1mul4:  WINDOW_TAP (window_q1mul4, 4, 4)
window_q1mul5:  WINDOW_TAP (window_q1mul5, 3, 5)
window_q1mul6:  WINDOW_TAP (window_q1mul6, 2, 6)
window_q1mul7:  WINDOW_TAP (window_q1mul7, 1, 7)

window_q0mul0:  WINDOW_TAP (window_q0mul0, 4, 0)
window_q0mul1:  WINDOW_TAP (window_q0mul1, 3, 1)
window_q0mul2:  WINDOW_TAP (window_q0mul2, 2, 2)
window_q0mul3:  WINDOW_TAP (window_q0mul3, 1, 3)

window_band9:
        move.l  (sp)+,d2                // drop the saved w_width
        movem.l (sp),d2-d7/a2-a6
        add.l   #44, sp
        rts

// Multiply-accumulate ladders.  Entering at mulladdwin_<areg>_<n> falls
// through exactly n MULADDWIN steps before reaching the rts.
mulladdwin_a3_16: MULADDWIN(a3)
mulladdwin_a3_15: MULADDWIN(a3)
mulladdwin_a3_14: MULADDWIN(a3)
mulladdwin_a3_13: MULADDWIN(a3)
mulladdwin_a3_12: MULADDWIN(a3)
mulladdwin_a3_11: MULADDWIN(a3)
mulladdwin_a3_10: MULADDWIN(a3)
mulladdwin_a3_9:  MULADDWIN(a3)
mulladdwin_a3_8:  MULADDWIN(a3)
mulladdwin_a3_7:  MULADDWIN(a3)
mulladdwin_a3_6:  MULADDWIN(a3)
mulladdwin_a3_5:  MULADDWIN(a3)
mulladdwin_a3_4:  MULADDWIN(a3)
mulladdwin_a3_3:  MULADDWIN(a3)
mulladdwin_a3_2:  MULADDWIN(a3)
mulladdwin_a3_1:  MULADDWIN(a3)
mulladdwin_a3_0:
        rts

mulladdwin_a4_16: MULADDWIN(a4)
mulladdwin_a4_15: MULADDWIN(a4)
mulladdwin_a4_14: MULADDWIN(a4)
mulladdwin_a4_13: MULADDWIN(a4)
mulladdwin_a4_12: MULADDWIN(a4)
mulladdwin_a4_11: MULADDWIN(a4)
mulladdwin_a4_10: MULADDWIN(a4)
mulladdwin_a4_9:  MULADDWIN(a4)
mulladdwin_a4_8:  MULADDWIN(a4)
mulladdwin_a4_7:  MULADDWIN(a4)
mulladdwin_a4_6:  MULADDWIN(a4)
mulladdwin_a4_5:  MULADDWIN(a4)
mulladdwin_a4_4:  MULADDWIN(a4)
mulladdwin_a4_3:  MULADDWIN(a4)
mulladdwin_a4_2:  MULADDWIN(a4)
mulladdwin_a4_1:  MULADDWIN(a4)
mulladdwin_a4_0:
        rts

/* Lookup table to allow us to dispatch into the above pile
 * of procedures easily.  These are stored PC relative from the
 * branch instruction before the window_q2mul0 label which means
 * we can pack this lot into the text segment and thus reduce
 * data size a little.
 */

// WINDOW_OFS label
//   one table entry: offset of <label> as expected by the jmp (pc,a0)
#define WINDOW_OFS(lbl) dc.l lbl - window_q2mul0 + 2

window_q2muls:
        WINDOW_OFS (window_q2mul0)
        WINDOW_OFS (window_q2mul1)
        WINDOW_OFS (window_q2mul2)
        WINDOW_OFS (window_q2mul3)
        WINDOW_OFS (window_q2mul4)
        WINDOW_OFS (window_q2mul5)
        WINDOW_OFS (window_q2mul6)
        WINDOW_OFS (window_q2mul7)
        WINDOW_OFS (window_q2mul8)
        WINDOW_OFS (window_q2mul9)
        WINDOW_OFS (window_q2mul10)
        WINDOW_OFS (window_q2mul11)
        WINDOW_OFS (window_q2mul12)
        WINDOW_OFS (window_q2mul13)
        WINDOW_OFS (window_q2mul14)
        WINDOW_OFS (window_q2mul15)

window_q1muls:
        WINDOW_OFS (window_q1mul0)
        WINDOW_OFS (window_q1mul1)
        WINDOW_OFS (window_q1mul2)
        WINDOW_OFS (window_q1mul3)
        WINDOW_OFS (window_q1mul4)
        WINDOW_OFS (window_q1mul5)
        WINDOW_OFS (window_q1mul6)
        WINDOW_OFS (window_q1mul7)

window_q0muls:
        WINDOW_OFS (window_q0mul0)
        WINDOW_OFS (window_q0mul1)
        WINDOW_OFS (window_q0mul2)
        WINDOW_OFS (window_q0mul3)