[Coco] 6309 MULD real and emulators

Tue Oct 29 09:26:22 EDT 2019

On Tuesday 29 October 2019 07:48:55 Walter Zambotti wrote:

Proving a point, younger minds will eventually find a better way. A tip 
of my hat to Walter.

Now, please get this stuff into clib in the NitrOS-9 distributions; it's
good stuff for the old girl.

> I found the other 32 bit shift routine that I was convinced was
> faster.
>
> Turns out it is the same.
>
> It uses up to 4 unsigned mul (not muld) instructions to perform the
> shift.
>
> A mul is 11 cycles and muld (indexed) is 32, so I naturally thought 4*11
> would be less than 2*32.  It is, but there is more supporting logic.
>
> For your assembler interests:
>
> /*#include <stdio.h>*/
> unsignedlongval, val1, val2, val3;
> shortshft;
> unsignedlongshfl32(unsignedlong, short);
> unsignedlongshfl32a(unsignedlong, short);
> unsignedlongshfl32b(unsignedlong, short);
> unsignedlongshfl32(unsignedlongval, shortshft)
> {
> returnval<<shft;
> }
> unsignedlongshfl32a(unsignedlongval, shortshft)
> {
> #asm
> *,x &10,s pointer to longresult
> *4,s 4byte value
> *8,s 2byte shift
> *d =shift amount
> *x =pointer to result
> ldx 10,s
> ldd 8,s
> *ifshift amount is greater than 31then
> *just returnzero
> cmpd #32
> blt _10x
> ldq #0
> stq 4,s
> bra _13x
> *ifshift amount is greater than 16than
> *move bottom word of value into top word
> *and clear bottom word
> _10x
> cmpb #16
> blt _1x
> ldu 6,s
> stu 4,s
> clr 6,s
> clr 7,s
> _1x
> *setup pointer u and offset e into mult table _2x
> leau _2x,pc
> andb #15
> *ifthere is no shift value just returnvalue
> beq _13x
> aslb *need to doubleshift to use as word table offset
> stb 8,s *save doubleshft
> tfr b,e
> *shift top word q =val.word.high*multtab[shft]
> ldd 4,s
> muld e,u
> stw ,x *result.word.high=low word of mult
> *shift bottom word q =val.word.low*multtab[shft]
> lde 8,s *reload doubleshft
> ldd 6,s
> muld e,u
> stw 2,x *result.word.low=low word of mult
> *The high word or mult needs to be corrected forsign
> *ifval is negative then muld will returnnegated results
> *and need to un negate it
> lde 8,s *reload doubleshift
> tst 4,s *test top byte of val fornegative
> bge _11x
> addd e,u *add the multtab[shft] again to top word
> _11x
> *ifmulttab[shft] is negative(shft is 15or shft<<1is 30)
> *also need to un negate result
> cmpe #30
> bne _12x
> addd 6,s *add val.word.lowto top word
> _12x
> *combine top and bottom and save bottom half of result
> ord ,x
> std ,x
> bra _14x
> *this is only reached ifthe result is in value(let result =value)
> _13x
> ldq 4,s *load value
> stq ,x *result =value
> _14x
> puls u,pc
> _2x fdb $01,$02,$04,$08,$10,$20,$40,$80,$0100,$0200,$0400,$0800
> fdb $1000,$2000,$4000,$8000
> #endasm
> }
> unsignedlongshfl32b(unsignedlongval, shortshft)
> {
> #asm
> *,x &10,s pointer to longresult
> *4,s 4byte value
> *8,s 2byte shift
> *d =shift amount
> *x =pointer to result
> ldx 10,s
> ldd 8,s
> *ifshift amount is greater than 31then
> *just returnzero
> cmpd #32
> blt _10y
> ldq #0
> stq ,x
> bra _exit
> *Copy val to result
> _10y
> ldq 4,s
> stq ,x
> ldb 9,s *reload shft and copy shft to tshft
> lda 9,s
> asra *divide tshft by 8
> asra
> asra
> sta 8,s *save tshft
> leau _swttab,pc *switch(tshft)
> lda a,u
> jmp a,u
> _swttab
> fcb _default-_swttab
> fcb _case1-_swttab
> fcb _case2-_swttab
> fcb _case3-_swttab
> _case3
> *shuffle results bytes b3 to b0 and clear the rest
> subb #24*shft -=24;
> lda 3,x
> sta ,x
> clr 1,x
> clr 2,x
> clr 3,x
> bra _endswt
> _case2
> *shuffle result bytes b2-b3 to b0-b1 and clear the rest
> subb #16*shft -=16;
> ldu 2,x
> stu 0,x
> clr 2,x
> clr 3,x
> bra _endswt
> _case1
> *shuffle result bytes b1-b3 to b0-b2 and clear the rest
> subb #8*shft -=8;
> ldu 1,x
> stu 0,x
> lda 3,x
> sta 2,x
> clr 3,x
> _default
> _endswt
> *ifshft is zero we can return
> tstb
> beq _exit
> *calc the mult from a mult table
> leau <_multtab,pc
> leau b,u
> ldb ,u
> *result byte 0=low byte of result byte 0*mult
> lda ,x
> mul
> stb ,x
> *iftshft is 3can return
> ldf 8,s
> cmpf #3
> beq _exit
> *result byte 0=result byte 0OR high byte of result byte 1*mult
> *result byte 1=low byte of result byte 1*mult
> ldb ,u
> lda 1,x
> mul
> ora ,x
> std ,x
> *iftshft is 2can return
> cmpf #2
> beq _exit
> *result byte 1=result byte 1OR high byte of result byte 2*mult
> *result byte 2=low byte of result byte 2*mult
> ldb ,u
> lda 2,x
> mul
> ora 1,x
> std 1,x
> *iftshft is 1can return
> cmpf #1
> beq _exit
> *result byte 2=result byte 2OR high byte of result byte 3*mult
> *result byte 3=low byte of result byte 3*mult
> ldb ,u
> lda 3,x
> mul
> ora 2,x
> std 2,x
> _exit
> puls u,pc
> _multtab fcb $01,$02,$04,$08,$10,$20,$40,$80
> #endasm
> }
> intmain(intargc, char*argv[])
> {
> unsignedlong(*shftstfunc)(unsignedlong, short);
> intargc;
> pflinit();
> if(argv[2] !=0)
> {
> sscanf(argv[1], "%D", &val);
> shft =(short)atoi(argv[2]);
> printf("%lx %d\n", val, shft);
> val1 =shfl32(val, shft);
> val2 =shfl32a(val, shft);
> val3 =shfl32b(val, shft);
> printf("%lx\n", val1);
> printf("%lx\n", val2);
> printf("%lx\n", val3);
> return0;
> }
> shft =(short)atoi(argv[1]);
> switch(shft)
> {
> case2:
> printf("shfl32b\n");
> shftstfunc =shfl32b;
> break;
> case1:
> printf("shfl32a\n");
> shftstfunc =shfl32a;
> break;
> case0:
> default:
> printf("shfl32\n");
> shftstfunc =shfl32;
> break;
> }
> for(val =1; val <1000000; val++)
> {
> for(shft =0; shft <32; shft++)
> {
> val1 =shftstfunc(val, shft);
> /*printf("%lx %d = %lx\n", val, shft, val1);*/
> /*printf("%lx ", shfl32(val, shft));*/
> /*printf("%lx\n", shfl32a(val, shft));*/
> /*val1 = 0 + shfl32(val, shft);*/
> /*val2 = 0 + shfl32a(val, shft);*/
> /*printf("%lx ", val1);*/
> /*printf("%lx\n", val2);*/
> /*if (val1 != val2)
> {
> printf("%lx %d = %lx %lx\n", val, shft, val1, val2);
> }*/
> }
> }
> }
>
> On 10/28/19 3:57 PM, Gene Heskett wrote:
> > On Monday 28 October 2019 02:41:33 Walter Zambotti wrote:
> >> And I managed to get muld to perform the 32-bit variable (0-31+ bits)
> >> shift.
> >>
> >> The function takes about 197 cycles regardless of number of bits.
> >> Except for the special case of 16 bits where it takes 99 cycles.
> >
> > One thing I always do when building a C function that involves bit
> > twiddling is check how far the shift goes, because the existing C
> > library does it one bit at a time in a loop.  So I stopped the compile
> > at the output of c.pass2 to inspect the generated code. After editing
> > in the differences, the compile was resumed to generate the final
> > binary.
> >
> > I always checked the how-far count and, if it was over 8, subtracted 8,
> > then did a "tfr a,b; clear a" if shifting to the right, or a
> > "tfr b,a; clear b" if shifting to the left.
> >
> > This resulted in bit shifts a lot faster while still being 100%
> > correct, and I see no reason that it couldn't be applied to regs.e &
> > f to accomplish exactly the same thing for a 32 bit operation.
> >
> > The last version of rzsz-3.36 that I built, which you may have, was
> > hand-optimized this way, gaining around 100 cps in speed.
> >
> > How that might compare to what you are doing here, I've no clue.
> > However, reading this more carefully, it looks  pretty good.
> >
> >> It conforms to the OS-9 C ABI stack and result-passing conventions.
> >>
> >> /*#include <stdio.h>*/
> >>
> >> unsigned long shfl32(val, shft)
> >> unsigned long val;
> >> short shft;
> >> {
> >>    return val<<shft;
> >> }
> >>
> >> unsigned long shfl32a(val, shft)
> >> unsigned long val;
> >> short shft;
> >> {
> >> #asm
> >> * 10,s pointer to long result
> >> * 4,s 4 byte value
> >> * 8,s 2 byte shift
> >> * d = shift amount
> >> * x = pointer to result
> >>    ldx 10,s
> >>    ldd 8,s
> >> * if shift amount is greater than 31 then
> >> * just return zero
> >>    cmpd #32
> >>    blt _10x
> >>    ldq #0
> >>    stq 4,s
> >>    bra _13x
> >> * if shift amount is greater than 16 than
> >> * move bottom word of value into top word
> >> * and clear bottom word
> >> _10x
> >>    cmpb #16
> >>    blt _1x
> >>    ldu 6,s
> >>    stu 4,s
> >>    clr 6,s
> >>    clr 7,s
> >> _1x
> >> * setup pointer u and offset e into mult table _2x
> >>    leau _2x,pc
> >>    andb #15
> >> * if there is no shift value just return value
> >>    beq _13x
> >>    aslb * need to double shift to use as word table offset
> >>    stb 8,s     * save double shft
> >>    tfr b,e
> >> * shift top word q = val.word.high * multtab[shft]
> >>    ldd 4,s
> >>    muld e,u
> >>    stw ,x * result.word.high = low word of mult
> >> * shift bottom word q = val.word.low * multtab[shft]
> >>    lde 8,s     * reload double shft
> >>    ldd 6,s
> >>    muld e,u
> >>    stw 2,x     * result.word.low = low word of mult
> >> * The high word or mult needs to be corrected for sign
> >> * if val is negative then muld will return negated results
> >> * and need to un negate it
> >>    lde 8,s     * reload double shift
> >>    tst 4,s     * test top byte of val for negative
> >>    bge _11x
> >>    addd e,u    * add the multtab[shft] again to top word
> >> _11x
> >> * if multtab[shft] is negative (shft is 15 or shft<<1 is 30)
> >> * also need to un negate result
> >>    cmpe #30
> >>    bne _12x
> >>    addd 6,s    * add val.word.low to top word
> >> _12x
> >> * combine top and bottom and save bottom half of result
> >>    ord ,x
> >>    std ,x
> >>    bra _14x
> >> * this is only reached if the result is in value (let result = value)
> >> _13x
> >>    ldq 4,s     * load value
> >>    stq ,x      * result = value
> >> _14x
> >>    puls u,pc
> >> _2x fdb $01,$02,$04,$08,$10,$20,$40,$80,$0100,$0200,$0400,$0800
> >>     fdb $1000,$2000,$4000,$8000
> >> #endasm
> >> }
> >>
> >> unsigned long val, val1, val2;
> >> short shft;
> >>
> >> int main(argc, argv)
> >> int argc;
> >> char *argv[];
> >> {
> >>    /*long val, val1, val2;
> >>    short shft;*/
> >>    unsigned long dummy = 0;
> >>    /*long (*shftstfunc)(long, short);*/
> >>
> >>    pflinit();
> >>
> >>    sscanf(argv[1], "%D", &val);
> >>    shft = (short)atoi(argv[2]);
> >>    /* val = 1; shft = 1;*/
> >>    printf("%lx %d\n", val, shft);
> >>    val1 = shfl32(val, shft);
> >>    val2 = shfl32a(val, shft);
> >>    printf("%lx\n", val1);
> >>    printf("%lx\n", val2);
> >>    return 0;
> >>    /*
> >>    shft = (short)atoi(argv[1]);
> >>
> >>    if(shft == 1)
> >>    {
> >>       printf("shfl32\n");
> >>       shftstfunc = shfl32;
> >>    }
> >>    else
> >>    {
> >>       printf("shfl32a\n");
> >>       shftstfunc = shfl32a;
> >>    }
> >>    */
> >>
> >>    for(val = 1 ; val < 1000000 ; val++)
> >>    {
> >>      for(shft = 0 ; shft < 32 ; shft++)
> >>      {
> >>        /*printf("%lx ", shfl32(val, shft));*/
> >>        /*printf("%lx\n", shfl32a(val, shft));*/
> >>        val1 = 0 + shfl32(val, shft);
> >>        val2 = 0 + shfl32a(val, shft);
> >>        /*printf("%lx ", val1);*/
> >>        /*printf("%lx\n", val2);*/
> >>        if (val1 != val2)
> >>        {
> >>          printf("%lx %d = %lx %lx\n", val, shft, val1, val2);
> >>        }
> >>      }
> >>    }
> >> }
> >>
> >>
> >> -----Original Message-----
> >> From: Coco [mailto:coco-bounces at maltedmedia.com] On Behalf Of
> >> Walter Zambotti Sent: Friday, 25 October 2019 2:04 PM
> >> To: 'CoCoList for Color Computer Enthusiasts'
> >> <coco at maltedmedia.com> Subject: Re: [Coco] 6309 MULD real and
> >> emulators
> >>
> >> Robert
> >>
> >> In OVCC it has already been corrected in version 1.1.
> >>
> >> In my recent 6309 emulator rewrite in X86 assembly I added all the
> >> missing ops and corrected some other 6309 ops that I thought were
> >> not correct.
> >>
> >> I also did this in the C version.  The C version should be
> >> backwards portable to VCC with very little effort.
> >>
> >> Walter
> >>
> >> Here is the OVCC muld C code
> >>
> >> void Muld_M(void)
> >> { //118F Phase 5 6309
> >> 	Q_REG =  (signed short)D_REG * (signed short)IMMADDRESS(PC_REG);
> >> 	cc[C] = 0;
> >> 	cc[Z] = ZTEST(Q_REG);
> >> 	cc[V] = 0;
> >> 	cc[N] = NTEST32(Q_REG);
> >> 	PC_REG+=2;
> >> 	CycleCounter+=28;
> >> }
> >>
> >> -----Original Message-----
> >> From: Coco [mailto:coco-bounces at maltedmedia.com] On Behalf Of
> >> Robert Gault Sent: Friday, 25 October 2019 10:47 AM
> >> To: CoCoList for Color Computer Enthusiasts <coco at maltedmedia.com>
> >> Subject: [Coco] 6309 MULD real and emulators
> >>
> >> There was a question posted about the 6309 opcode MULD. That is a
> >> multiplication of the content of regD with Immediate, Direct,
> >> Extended, or Indexed numbers. What makes it different from the
> >> opcode MUL is that MULD is a signed multiplication.
> >>
> >> However, be warned that while MULD is signed on a real 6309 and in
> >> the MAME/MESS emulator, it is unsigned in VCC v2.0.1. VCC should
> >> be corrected! For example:
> >>    real 6309
> >>    ldd #$8001
> >>    muld #$8001
> >>    regQ = $3FFF0001
> >>
> >>    VCC
> >>    ldd #$8001
> >>    muld #$8001
> >>    regQ = $40010001     Correct if the multiplication was unsigned.
> >>
> >> You can get the same $3FFF0001 answer with real 6309
> >>    ldd #$7FFF
> >>    muld #$7FFF
> >> regQ = $3FFF0001
> >>
> >> Now since $10000-$7FFF=$8001 the above signed math makes sense as
> >> $8001=-$7FFF.
> >>
> >> Robert
> >>
> >> --
> >> Coco mailing list
> >> Coco at maltedmedia.com
> >> https://pairlist5.pair.net/mailman/listinfo/coco
> >>
> >>
> >> --
> >> Coco mailing list
> >> Coco at maltedmedia.com
> >> https://pairlist5.pair.net/mailman/listinfo/coco
> >
> > Cheers, Gene Heskett

Cheers, Gene Heskett
-- 
"There are four boxes to be used in defense of liberty:
 soap, ballot, jury, and ammo. Please use in that order."
-Ed Howdershelt (Author)
If we desire respect for the law, we must first make the law respectable.
 - Louis D. Brandeis
Genes Web page <http://geneslinuxbox.net:6309/gene>