[Coco] 6309 MULD real and emulators

Tue Oct 29 07:48:55 EDT 2019

I found the other 32 bit shift routine that I was convinced was faster.

Turns out it is the same.

It uses up to 4 unsigned mul (not muld) instructions to perform the shift.

A mul is 11 cycles and muld (indexed) is 32, so I naturally thought 4*11
would be less than 2*32.  It is, but there is more supporting logic.

For your assembler interests:

/*#include <stdio.h>*/
/* Value under test and the result produced by each of the three routines.
   These are file-scope so that main can pass &val to sscanf and reuse
   val/shft as its loop counters. */
unsigned long val, val1, val2, val3;
/* Shift count, or the routine selector when only one argument is given. */
short shft;

/* The three shift routines: compiler-generated, MULD based, MUL based. */
unsigned long shfl32(unsigned long, short);
unsigned long shfl32a(unsigned long, short);
unsigned long shfl32b(unsigned long, short);
/*
 * shfl32 - reference left shift using the compiler's own << operator.
 *
 * main uses this as the baseline that the hand-written shfl32a and
 * shfl32b are run against.
 *
 * val   value to shift
 * shft  number of bit positions to shift left
 *
 * Returns val << shft.  No range check is made here: the result for a
 * negative shft, or one that is not smaller than the width of
 * unsigned long, is whatever the compiler's shift produces.
 */
unsigned long shfl32(unsigned long val, short shft)
{
   return val << shft;
}
unsignedlongshfl32a(unsignedlongval, shortshft)
{
#asm
*,x &10,s pointer to longresult
*4,s 4byte value
*8,s 2byte shift
*d =shift amount
*x =pointer to result
ldx 10,s
ldd 8,s
*ifshift amount is greater than 31then
*just returnzero
cmpd #32
blt _10x
ldq #0
stq 4,s
bra _13x
*ifshift amount is greater than 16than
*move bottom word of value into top word
*and clear bottom word
_10x
cmpb #16
blt _1x
ldu 6,s
stu 4,s
clr 6,s
clr 7,s
_1x
*setup pointer u and offset e into mult table _2x
leau _2x,pc
andb #15
*ifthere is no shift value just returnvalue
beq _13x
aslb *need to doubleshift to use as word table offset
stb 8,s *save doubleshft
tfr b,e
*shift top word q =val.word.high*multtab[shft]
ldd 4,s
muld e,u
stw ,x *result.word.high=low word of mult
*shift bottom word q =val.word.low*multtab[shft]
lde 8,s *reload doubleshft
ldd 6,s
muld e,u
stw 2,x *result.word.low=low word of mult
*The high word or mult needs to be corrected forsign
*ifval is negative then muld will returnnegated results
*and need to un negate it
lde 8,s *reload doubleshift
tst 4,s *test top byte of val fornegative
bge _11x
addd e,u *add the multtab[shft] again to top word
_11x
*ifmulttab[shft] is negative(shft is 15or shft<<1is 30)
*also need to un negate result
cmpe #30
bne _12x
addd 6,s *add val.word.lowto top word
_12x
*combine top and bottom and save bottom half of result
ord ,x
std ,x
bra _14x
*this is only reached ifthe result is in value(let result =value)
_13x
ldq 4,s *load value
stq ,x *result =value
_14x
puls u,pc
_2x fdb $01,$02,$04,$08,$10,$20,$40,$80,$0100,$0200,$0400,$0800
fdb $1000,$2000,$4000,$8000
#endasm
}
unsignedlongshfl32b(unsignedlongval, shortshft)
{
#asm
*,x &10,s pointer to longresult
*4,s 4byte value
*8,s 2byte shift
*d =shift amount
*x =pointer to result
ldx 10,s
ldd 8,s
*ifshift amount is greater than 31then
*just returnzero
cmpd #32
blt _10y
ldq #0
stq ,x
bra _exit
*Copy val to result
_10y
ldq 4,s
stq ,x
ldb 9,s *reload shft and copy shft to tshft
lda 9,s
asra *divide tshft by 8
asra
asra
sta 8,s *save tshft
leau _swttab,pc *switch(tshft)
lda a,u
jmp a,u
_swttab
fcb _default-_swttab
fcb _case1-_swttab
fcb _case2-_swttab
fcb _case3-_swttab
_case3
*shuffle results bytes b3 to b0 and clear the rest
subb #24*shft -=24;
lda 3,x
sta ,x
clr 1,x
clr 2,x
clr 3,x
bra _endswt
_case2
*shuffle result bytes b2-b3 to b0-b1 and clear the rest
subb #16*shft -=16;
ldu 2,x
stu 0,x
clr 2,x
clr 3,x
bra _endswt
_case1
*shuffle result bytes b1-b3 to b0-b2 and clear the rest
subb #8*shft -=8;
ldu 1,x
stu 0,x
lda 3,x
sta 2,x
clr 3,x
_default
_endswt
*ifshft is zero we can return
tstb
beq _exit
*calc the mult from a mult table
leau <_multtab,pc
leau b,u
ldb ,u
*result byte 0=low byte of result byte 0*mult
lda ,x
mul
stb ,x
*iftshft is 3can return
ldf 8,s
cmpf #3
beq _exit
*result byte 0=result byte 0OR high byte of result byte 1*mult
*result byte 1=low byte of result byte 1*mult
ldb ,u
lda 1,x
mul
ora ,x
std ,x
*iftshft is 2can return
cmpf #2
beq _exit
*result byte 1=result byte 1OR high byte of result byte 2*mult
*result byte 2=low byte of result byte 2*mult
ldb ,u
lda 2,x
mul
ora 1,x
std 1,x
*iftshft is 1can return
cmpf #1
beq _exit
*result byte 2=result byte 2OR high byte of result byte 3*mult
*result byte 3=low byte of result byte 3*mult
ldb ,u
lda 3,x
mul
ora 2,x
std 2,x
_exit
puls u,pc
_multtab fcb $01,$02,$04,$08,$10,$20,$40,$80
#endasm
}
intmain(intargc, char*argv[])
{
unsignedlong(*shftstfunc)(unsignedlong, short);
intargc;
pflinit();
if(argv[2] !=0)
{
sscanf(argv[1], "%D", &val);
shft =(short)atoi(argv[2]);
printf("%lx %d\n", val, shft);
val1 =shfl32(val, shft);
val2 =shfl32a(val, shft);
val3 =shfl32b(val, shft);
printf("%lx\n", val1);
printf("%lx\n", val2);
printf("%lx\n", val3);
return0;
}
shft =(short)atoi(argv[1]);
switch(shft)
{
case2:
printf("shfl32b\n");
shftstfunc =shfl32b;
break;
case1:
printf("shfl32a\n");
shftstfunc =shfl32a;
break;
case0:
default:
printf("shfl32\n");
shftstfunc =shfl32;
break;
}
for(val =1; val <1000000; val++)
{
for(shft =0; shft <32; shft++)
{
val1 =shftstfunc(val, shft);
/*printf("%lx %d = %lx\n", val, shft, val1);*/
/*printf("%lx ", shfl32(val, shft));*/
/*printf("%lx\n", shfl32a(val, shft));*/
/*val1 = 0 + shfl32(val, shft);*/
/*val2 = 0 + shfl32a(val, shft);*/
/*printf("%lx ", val1);*/
/*printf("%lx\n", val2);*/
/*if (val1 != val2)
{
printf("%lx %d = %lx %lx\n", val, shft, val1, val2);
}*/
}
}
}
On 10/28/19 3:57 PM, Gene Heskett wrote:
> On Monday 28 October 2019 02:41:33 Walter Zambotti wrote:
>
>> And I managed to get muld to perform the 32 variable (0-31+) bit
>> shift.
>>
>> The function takes about 197 cycles regardless of number of bits.
>> Except for the special case of 16 bits where it takes 99 cycles.
>>
> One thing I always do when building a c function that involves bit
> twiddling, is check how far because the existing c library does it one
> bit at a time in a loop.  So I stopped the compile at the output of
> c.pass2 to inspect the generated code. After editing in the differences,
> the compile was resumed to generate the final binary.
>
> I always checked the how far count, and if over 8, subtracted 8, then did
> a tfr a,b clear a, if to the right, and a tfr b,a clear b if to the
> left.
>
> This resulted in bit shifts a lot faster while still being 100% correct,
> and I see no reason that it couldn't be applied to regs.e & f to
> accomplish exactly the same thing for a 32 bit operation.
>
> The last version of rzsz-3.36 I built and you may have was so hand
> optimized, gaining around 100 cps in speed.
>
> How that might compare to what you are doing here, I've no clue. However,
> reading this more carefully, it looks  pretty good.
>
>> It conforms to the OS9 C ABI stack and result passing convention.
>>
>> /*#include <stdio.h>*/
>>
>> unsigned long shfl32(val, shft)
>> unsigned long val;
>> short shft;
>> {
>>    return val<<shft;
>> }
>>
>> unsigned long shfl32a(val, shft)
>> unsigned long val;
>> short shft;
>> {
>> #asm
>> * 10,s pointer to long result
>> * 4,s 4 byte value
>> * 8,s 2 byte shift
>> * d = shift amount
>> * x = pointer to result
>>    ldx 10,s
>>    ldd 8,s
>> * if shift amount is greater than 31 then
>> * just return zero
>>    cmpd #32
>>    blt _10x
>>    ldq #0
>>    stq 4,s
>>    bra _13x
>> * if shift amount is greater than 16 than
>> * move bottom word of value into top word
>> * and clear bottom word
>> _10x
>>    cmpb #16
>>    blt _1x
>>    ldu 6,s
>>    stu 4,s
>>    clr 6,s
>>    clr 7,s
>> _1x
>> * setup pointer u and offset e into mult table _2x
>>    leau _2x,pc
>>    andb #15
>> * if there is no shift value just return value
>>    beq _13x
>>    aslb * need to double shift to use as word table offset
>>    stb 8,s     * save double shft
>>    tfr b,e
>> * shift top word q = val.word.high * multtab[shft]
>>    ldd 4,s
>>    muld e,u
>>    stw ,x * result.word.high = low word of mult
>> * shift bottom word q = val.word.low * multtab[shft]
>>    lde 8,s     * reload double shft
>>    ldd 6,s
>>    muld e,u
>>    stw 2,x     * result.word.low = low word of mult
>> * The high word or mult needs to be corrected for sign
>> * if val is negative then muld will return negated results
>> * and need to un negate it
>>    lde 8,s     * reload double shift
>>    tst 4,s     * test top byte of val for negative
>>    bge _11x
>>    addd e,u    * add the multtab[shft] again to top word
>> _11x
>> * if multtab[shft] is negative (shft is 15 or shft<<1 is 30)
>> * also need to un negate result
>>    cmpe #30
>>    bne _12x
>>    addd 6,s    * add val.word.low to top word
>> _12x
>> * combine top and bottom and save bottom half of result
>>    ord ,x
>>    std ,x
>>    bra _14x
>> * this is only reached if the result is in value (let result = value)
>> _13x
>>    ldq 4,s     * load value
>>    stq ,x      * result = value
>> _14x
>>    puls u,pc
>> _2x fdb $01,$02,$04,$08,$10,$20,$40,$80,$0100,$0200,$0400,$0800
>>     fdb $1000,$2000,$4000,$8000
>> #endasm
>> }
>>
>> unsigned long val, val1, val2;
>> short shft;
>>
>> int main(argc, argv)
>> int argc;
>> char *argv[];
>> {
>>    /*long val, val1, val2;
>>    short shft;*/
>>    unsigned long dummy = 0;
>>    /*long (*shftstfunc)(long, short);*/
>>
>>    pflinit();
>>
>>    sscanf(argv[1], "%D", &val);
>>    shft = (short)atoi(argv[2]);
>>    /* val = 1; shft = 1;*/
>>    printf("%lx %d\n", val, shft);
>>    val1 = shfl32(val, shft);
>>    val2 = shfl32a(val, shft);
>>    printf("%lx\n", val1);
>>    printf("%lx\n", val2);
>>    return 0;
>>    /*
>>    shft = (short)atoi(argv[1]);
>>
>>    if(shft == 1)
>>    {
>>       printf("shfl32\n");
>>       shftstfunc = shfl32;
>>    }
>>    else
>>    {
>>       printf("shfl32a\n");
>>       shftstfunc = shfl32a;
>>    }
>>    */
>>
>>    for(val = 1 ; val < 1000000 ; val++)
>>    {
>>      for(shft = 0 ; shft < 32 ; shft++)
>>      {
>>        /*printf("%lx ", shfl32(val, shft));*/
>>        /*printf("%lx\n", shfl32a(val, shft));*/
>>        val1 = 0 + shfl32(val, shft);
>>        val2 = 0 + shfl32a(val, shft);
>>        /*printf("%lx ", val1);*/
>>        /*printf("%lx\n", val2);*/
>>        if (val1 != val2)
>>        {
>>          printf("%lx %d = %lx %lx\n", val, shft, val1, val2);
>>        }
>>      }
>>    }
>> }
>>
>>
>> -----Original Message-----
>> From: Coco [mailto:coco-bounces at maltedmedia.com] On Behalf Of Walter
>> Zambotti Sent: Friday, 25 October 2019 2:04 PM
>> To: 'CoCoList for Color Computer Enthusiasts' <coco at maltedmedia.com>
>> Subject: Re: [Coco] 6309 MULD real and emulators
>>
>> Robert
>>
>> On OVCC it has already been correct in version 1.1.
>>
>> In my recent 6309 emulator rewrite in X86 assembly I added all the
>> missing ops and corrected some other 6309 ops that I thought were not
>> correct.
>>
>> I also did this in the C version.  The C version should be backwards
>> portable to VCC with very little effort.
>>
>> Walter
>>
>> Here is the OVCC muld C code
>>
>> void Muld_M(void)
>> { //118F Phase 5 6309
>> 	Q_REG =  (signed short)D_REG * (signed short)IMMADDRESS(PC_REG);
>> 	cc[C] = 0;
>> 	cc[Z] = ZTEST(Q_REG);
>> 	cc[V] = 0;
>> 	cc[N] = NTEST32(Q_REG);
>> 	PC_REG+=2;
>> 	CycleCounter+=28;
>> }
>>
>> -----Original Message-----
>> From: Coco [mailto:coco-bounces at maltedmedia.com] On Behalf Of Robert
>> Gault Sent: Friday, 25 October 2019 10:47 AM
>> To: CoCoList for Color Computer Enthusiasts <coco at maltedmedia.com>
>> Subject: [Coco] 6309 MULD real and emulators
>>
>> There was a question posted about the 6309 opcode MULD. That is a
>> multiplication of the content of regD with Immediate, Direct,
>> Extended, or Indexed numbers. What makes it different from the opcode
>> MUL is that MULD is a signed multiplication.
>>
>> However, be warned that while for a real 6309, and the MAME/MESS
>> emulator MULD is signed, it is unsigned with VCC v2.0.1. VCC should be
>> corrected! ex.
>>    real 6309
>>    ldd #$8001
>>    muld #$8001
>>    regQ = $3FFF0001
>>
>>    VCC
>>    ldd #$8001
>>    muld #$8001
>>    regQ = $40010001     Correct if the multiplication was unsigned.
>>
>> You can get the same $3FFF0001 answer with real 6309
>>    ldd #$7FFF
>>    muld #$7FFF
>> regQ = $3FFF0001
>>
>> Now since $10000-$7FFF=$8001 the above signed math makes sense as
>> $8001=-$7FFF.
>>
>> Robert
>>
>> --
>> Coco mailing list
>> Coco at maltedmedia.com
>> https://pairlist5.pair.net/mailman/listinfo/coco
>>
>>
>> --
>> Coco mailing list
>> Coco at maltedmedia.com
>> https://pairlist5.pair.net/mailman/listinfo/coco
>
> Cheers, Gene Heskett