* Re: [TUHS] A Reiser tour do force
@ 2022-04-03 10:55 Paul Ruizendaal via TUHS
2022-04-03 12:26 ` Rob Pike
0 siblings, 1 reply; 3+ messages in thread
From: Paul Ruizendaal via TUHS @ 2022-04-03 10:55 UTC (permalink / raw)
To: TUHS main list
[-- Attachment #1: Type: text/plain, Size: 18998 bytes --]
> A not-very-thorough search at tuhs turned up V9/jerq/src/lib/j/bitblt.c
> It appears to be a pre-Reiser bitblt, not what was asked for.
The Reiser code is in the V8 jerq tarball that Dan Cross donated:
v8jerq.tar.bz2 <https://www.tuhs.org/Archive/Distributions/Research/Dan_Cross_v8/v8jerq.tar.bz2>
It is in file blit/src/libj/bitblt.s (attached below for convenience). It is 750 lines of 68K assembler. It does not appear to have been ported to the Bellmac 32 CPU. Maybe it did not make sense in that context.
Paul
=====
#
# bitblt(sm,r,dm,p,fc)
# Bitmap *sm,*dm;
# Rectangle r;
# Point p;
# int fc;
#
# by John F. Reiser summer 1982
#
# Depending on the case at hand, generate very good code and execute it.
#
# offsets in a Point
set x,0
set y,2
# offsets in a Rectangle
set origin,0
set corner,4
# offsets in a Bitmap
set base,0
set width,4
set rect,6
# parameter offsets from %fp
set sm,8
set r,12
set dm,20
set p,24
set fc,28
set NREG,11
global bitblt
bitblt:
movm.l &0x3f3e,-(%sp) # save C registers
movm.l NREG*4-4+sm(%sp),&0x001f
# d1=r.o.x,,r.o.y; d2=r.c.x,,r.c.y; d4=p.x,,p.y;
mov.l %d0,%a4 # sm
mov.l %d3,%a5 # dm
mov.w NREG*4-4+fc(%sp),%a6 # a6.w == fc
movm.l rect(%a4),&0x9 # d0=sm.o.x,,sm.o.y; d3=sm.c.x,,sm.c.y;
movm.l rect(%a5),&0x60 # d5=dm.o.x,,dm.o.y; d6=dm.c.x,,dm.c.y;
lea.l $L50(%pc),%a0
L5:
# clip r.y to sm.y
mov.w %d0,%d7 # sm.o.y
sub.w %d1,%d7 # - r.o.y
ble.b L10
mov.w %d0,%d1 # r.o.y = sm.o.y; /* r.o.y was above sm.rect */
add.w %d7,%d4 # p.y parallels r.o.y
L10:
cmp.w %d2,%d3 # r.c.y : sm.c.y
ble.b L20
mov.w %d3,%d2 # r.c.y = sm.c.y; /* bottom of r was below sm.rect */
L20:
# clip (r.y at p.y) to dm.y
mov.w %d5,%d7 # dm.o.y
sub.w %d4,%d7 # -p.y
ble.b L30
mov.w %d5,%d4 # p.y = dm.o.y; /* p.y was above dm.rect */
add.w %d7,%d1 # r.o.y parallels p.y
L30:
mov.w %d1,%d7 # r.o.y
add.w %d6,%d7 # + dm.c.y
sub.w %d4,%d7 # - p.y /* == max y that dm.rect allows in r */
cmp.w %d2,%d7 # r.c.y : limit
ble.b L40
mov.w %d7,%d2 # r.c.y = limit
L40:
mov.w %d2,%d7 # r.c.y
sub.w %d1,%d7 # - r.o.y
sub.w &1,%d7 # /* == h-1 in bits */
blt.b ret
jmp (%a0)
retgen:
lea.l gensiz(%sp),%sp
ret8:
add.l &8,%sp
ret:
movm.l (%sp)+,&0x7cfc
rts
L50:
# mirror in pi/4 and reuse same code to clip x
swap.w %d0; swap.w %d1; swap.w %d2; swap.w %d3
swap.w %d4; swap.w %d5; swap.w %d6; swap.w %d7
lea.l $L55(%pc),%a0
br.b L5
L55:
mov.l %d1,%a1
mov.l %d4,%d6
#
# So far
# %d7 == h-1,,w-1
# %d6 == p.y,,p.x
# %a6.w == fc
# %a5 == dm
# %a4 == sm
# %a1 == r.o.y,,r.o.x
#
# Compute masks, and width in words
#
mov.w %d6,%d0 # p.x /* left endpoint of dst */
mov.w %d7,%d1 # w-1
add.w %d6,%d1 # right endpoint
mov.l &-1,%d3
mov.l &15,%d2
and.w %d0,%d2
lsr.w %d2,%d3 # mask1
mov.l &-1,%d5
mov.l &15,%d2
and.w %d1,%d2
add.w &1,%d2
lsr.w %d2,%d5
not.w %d5 # mask2
swap.w %d5
mov.w %d3,%d5 # mask2,,mask1
asr.w &4,%d0
asr.w &4,%d1
sub.w %d0,%d1
sub.w &1,%d1 # inner-loop width in words
mov.l &0,%d4 # assume LtoR
mov.w width(%a5),%d3
add.w %d3,%d3
mov.w width(%a4),%d2
add.w %d2,%d2
#
# So far
# %d7 == h-1,,w-1 in bits
# %d6 == p.y,,p.x
# %d5 == mask2,,mask1
# %d4 == 0 (LtoR)
# %d3.w == dm width in bytes
# %d2.w == sm width in bytes
# %d1.w == inner-loop width in words
# %a6.w == fc
# %a5 == dm
# %a4 == sm
# %a1 == r.o.y,,r.o.x
#
# If necessary, compensate for overlap of source and destination
#
cmp.l %a4,%a5
bne.b L80 # overlap not possible
mov.l %d6,%d0 # p.y,,p.x
mov.w %a1,%d0 # p.y,,r.o.x
cmp.l %a1,%d0 # r.o.y : p.y
bge.b L60 # if (r.o.y < p.y)
mov.l %d7,%d0 # h-1,,w-1
clr.w %d0 # h-1,,0
add.l %d0,%a1 # r.o.y += h-1;
add.l %d0,%d6 # p.y += h-1;
neg.w %d3 # wdst = -wdst;
neg.w %d2 # wsrc = -wsrc;
L60:
cmp.w %d7,&16
blt.b L70 # l<->r swap not needed for narrow
cmp.w %d6,%a1 # p.x : r.o.x
ble.b L70 # if (r.o.x < p.x)
mov.l %a1,%d0
add.w %d7,%d0
mov.l %d0,%a1 # r.o.x += w-1;
add.w %d7,%d6 # p.x += w-1;
mov.l &-1,%d4 # RtoL
swap.w %d5 # masks in other order
L70:
L80:
#
# Locate actual starting points
#
mov.l %d6,%d0 # p.y,,p.x
swap.w %d0
mov.l %d0,-(%sp) # p
mov.l %a5,-(%sp) # dm
mov.l &15,%d0
lea.l $L82(%pc),%a0 # assume narrow
cmp.w %d7,%d0 # w-1 : 15
ble.b L81 # guessed correctly
lea.l $L85(%pc),%a0 # wide
L81:
mov.l %a0,-(%sp) # on return, go directly to wide/narrow code
add.w %a6,%a6; add.w %a6,%a6 # with 4*fc
mov.w %d1,%d7 # h-1 in bits,,inner width in words
and.l %d0,%d6 # 0,,bit offset of p.x
mov.l %a1,%d1 # r.o.y,,r.o.x
and.w %d1,%d0 # bit offset of r.o.x
sub.w %d0,%d6 # BO(p.x) - BO(r.o.x) /* amount of right rotation */
swap.w %d1 # r.o.x,,r.o.y
mov.l %d1,-(%sp) # r.o
mov.l %a4,-(%sp) # sm
lea.l addr,%a3
jsr (%a3)
mov.l %a0,%a2 # src = addr(sm,r.origin);
add.l &8,%sp
jmp (%a3) # %a0 = addr(dm,p);
L82:
mov.l &0,%d4
mov.w %d5,%d4 # 0,,mask1
swap.w %d5 # mask1,,mask2 (proper long mask; maybe 16 bits too wide)
and.w %d5,%d4 # check for overlap of mask1 and mask2
beq.b L83 # no overlap ==> %d5 already correct
mov.l %d4,%d5 # overlap ==> reduce %d5 by 16 bits
swap.w %d5 # and put it in the proper half
L83:
swap.w %d7 # ,,height-1
lea.l $nrwtab(%pc,%a6.w),%a6 # -> optab
tst.w %d6 # amount of right rotation
bge.b L84
neg.w %d6
add.l &2,%a6
L84:
add.w (%a6),%a6
jmp (%a6)
nrwtab:
short opMnwr-nrwtab- 0, opMnwl-nrwtab- 2
short opSnwr-nrwtab- 4, opSnwl-nrwtab- 6
short opCnwr-nrwtab- 8, opCnwl-nrwtab-10
short opXnwr-nrwtab-12, opXnwl-nrwtab-14
opMnwr:
mov.l (%a2),%d0
mov.l (%a0),%d1
ror.l %d6,%d0
eor.l %d1,%d0
and.l %d5,%d0
eor.l %d1,%d0
mov.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opMnwr
br ret8
opMnwl:
mov.l (%a2),%d0
mov.l (%a0),%d1
rol.l %d6,%d0
eor.l %d1,%d0
and.l %d5,%d0
eor.l %d1,%d0
mov.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opMnwl
br ret8
opSnwr:
mov.l (%a2),%d0
ror.l %d6,%d0
and.l %d5,%d0
or.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opSnwr
br ret8
opSnwl:
mov.l (%a2),%d0
rol.l %d6,%d0
and.l %d5,%d0
or.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opSnwl
br ret8
opCnwr:
mov.l (%a2),%d0
ror.l %d6,%d0
and.l %d5,%d0
not.l %d0
and.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opCnwr
br ret8
opCnwl:
mov.l (%a2),%d0
rol.l %d6,%d0
and.l %d5,%d0
not.l %d0
and.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opCnwl
br ret8
opXnwr:
mov.l (%a2),%d0
ror.l %d6,%d0
and.l %d5,%d0
eor.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opXnwr
br ret8
opXnwl:
mov.l (%a2),%d0
rol.l %d6,%d0
and.l %d5,%d0
eor.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opXnwl
br ret8
set DBR,0x51c8
set MOVLI,0x2000+074 # mov.l &...,
set MOVWI,0x3000+074 # mov.w &...,
set ADDWI,0x0640 # add.w &...,
set FDFRAG,16 # first destination is a fragment
set LDFRAG,17 # last destination is a fragment
set NSHF1,18
set FD2D,19 # first destination should store 2 words
set LD2D,20 # last destination should store 2 words
set FSTORE,21
set DST1L,24 # dst inner count is 0
set SRC1L,25 # Nsrc is 2
set gensiz,80
widtab:
mov.w %d0,(%a0)+; short 0
or.w %d0,(%a0)+; short 0
and.w %d0,(%a0)+; not.w %d0
eor.w %d0,(%a0)+; short 0
#
# So far
# %d7 == h-1 (bits),,w (words)
# %d6 == 0,,rotate count
# %d5 == mask2,,mask1
# %d4 == -RtoL
# %d3.w == wdst (bytes)
# %d2.w == wsrc (bytes)
# %a6.w == 4*fc
# %a2 -> src
# %a0 -> dst
#
L85:
lea.l $widtab(%pc,%a6.w),%a6
tst.w %d4; bpl.b L300; bset &31,%d6
L300:
mov.w %d7,%d0 # inner word count
bne.b L304; bset &DST1L,%d6
L304:
add.w &1,%d0 # Nsrc = 1+Ninner
mov.w %d0,%a1 # + ...
add.w &1,%d0 # Ndst = 1+Ninner+1
add.w %d0,%d0 # magnitude of dst addressing side effects
tst.l %d6; bpl.b L310
neg.w %d0; add.l &2,%a0 # RtoL
L310:
sub.w %d0,%d3 # compensate dst for autoincrement
mov.w %d5,%d4 # mask1
swap.w %d5 # mask2
cmp.w %d4,&-1; beq.b L320; bset &FDFRAG,%d6
L320:
cmp.w %d5,&-1; seq.b %d1; beq.b L330; bset &LDFRAG,%d6
L330:
tst.w %d6; bne.b L360 # not NOSHIFT
add.w &1,%a1 # Nsrc = 1+Ninner+1
mov.l %d6,%d0; swap.w %d0; ext.w %d0 # 0,,flag bits
asr.w &1,%d7; roxl.w &1,%d0 # account for inner words odd
mov.b $nstab(%pc,%d0.w),%d0
bpl.b L340; add.w &1,%d7
L340:
add.b %d0,%d0
bpl.b L350; sub.w &1,%d7
L350:
swap.w %d0; eor.l %d0,%d6 # the bits
btst &DST1L,%d6; bne.b L355
btst &FD2D,%d6; beq.b L410
L355:
ext.l %d4; bmi.b L410; swap.w %d4; not.w %d4 # NOSHIFT mask1 .l
br.b L410 # NOSHIFT mask2 .l
nstab:
byte 0x82,0x80,0x04,0x80 # 0x80: +1 inner; 0x40: -1 inner
byte 0x02,0x00,0x44,0x00 # 0x04: FD2D; 0x02: NSHF1 no first word
L360:
ext.w %d1; sub.w %d1,%d7 # extend inner loop
mov.l &0xf,%d0 # 0 1 7 8 9 e f
add.w &8,%d6 # 8 9 f 0 1 6 7
and.w %d0,%d6
sub.w &8,%d6 # 0 1 7 -8 -7 -2 -1 X=C= sign
mov.w %d6,%d1; bge.b L367 # X unchanged
neg.w %d1 # 8 7 2 1 X=C= 1
L367:
roxl.w &1,%d1 # 0 2 e 11 f 5 3
and.w %d0,%d1 # 0 2 e 1 f 5 3
lsl.w &8,%d1 # magic position
short ADDWI+001
ror.l &8,%d0
mov.w %d1,%a3 # the rotate instruction
mov.l &0,%d1; not.w %d1 # 0,,-1
ror.l %d6,%d1 # where the bits are after a rotate
mov.w %d1,%d0; and.w %d4,%d0; beq.b L370 # 1 src word covers dst frag
not.w %d1; and.w %d4,%d1; beq.b L370
add.w &1,%a1; br.b L390 # fragment needs another src word
L370:
sub.w &1,%d7 # .l takes an inner word
bset &FD2D,%d6
ext.l %d4; bmi.b L390
swap.w %d4; not.w %d4 # mask1 .l
L390:
swap.w %d1
mov.w %d1,%d0; and.w %d5,%d0; beq.b L400 # 1 src word covers dst frag
not.w %d1; and.w %d5,%d1; beq.b L400
add.w &1,%a1; br.b L420 # fragment needs another src word
L400:
dbr %d7,L405 # .l takes an inner word
clr.w %d7; br.b L420 # nothing there to take
L405:
L410:
bset &LD2D,%d6
ext.l %d5; bmi.b L420
swap.w %d5; not.w %d5 # mask2 .l
L420:
tst.w NREG*4-4+fc+8(%sp); bne.b L430; bset &FSTORE,%d6
L430:
mov.w %a1,%d0 # Nsrc
add.w %d0,%d0 # magnitude of src addressing side effects
tst.l %d6; bpl.b L431
neg.w %d0; add.l &2,%a2 # RtoL
L431:
sub.w %d0,%d2 # compensate src for autoincrement
lea.l -gensiz(%sp),%sp
mov.l %sp,%a5
swap.w %d3
swap.w %d2
cmp.w %a1,&2; bgt L445
short MOVWI+00000
mov.l (%a2)+,%d0
tst.l %d6; bpl.b L432; add.w &010,%d0 # RtoL
L432:
mov.w %d0,(%a5)+
mov.l &0,%d1; mov.w &-0x1000,%d2; mov.w &0100,%d3
lea.l $L438(%pc),%a1
mov.l &-1,%d0 # prepare bits to decide on "swap"
tst.w %d6; bpl.b L432d; neg.w %d6
lsl.l %d6,%d0; br.b L432e
L432d:
lsr.l %d6,%d0
L432e:
btst &DST1L,%d6; beq.b L434
bset &FD2D,%d6; bne.b L432a
ext.l %d4; bmi.b L432a; swap.w %d4; not.w %d4 # mask1 .l
L432a:
bset &LD2D,%d6; bne.b L432b
ext.l %d5; bmi.b L432b; swap.w %d5; not.w %d5 # mask2 .l
L432b:
and.l %d5,%d4; mov.l %d4,%d5 # single .l does it all
add.l &1,%d4; beq L730 # all 32 bits
sub.l &1,%d4 # need an "and"
and.l %d5,%d0
cmp.l %d5,%d0
beq.b L432c
short MOVWI+05300
swap.w %d0
L432c:
tst.w %d6; bne L690 # and a rotate
br.b L437 # NOSHIFT
L434:
mov.w %a3,(%a5)+ # the rotate instr
short MOVWI+05300
mov.l %d0,%d1 # copy after rotate
and.l %d4,%d0
cmp.l %d4,%d0
seq.b %d0; neg.b %d0; ext.w %d0
short ADDWI+000
swap.w %d0
mov.w %d0,(%a5)+
lea.l $L436(%pc),%a1
br.b L437
L436:
and.w %d4,%d0
mov.w &01001,%d1; clr.w %d2; clr.w %d3
lea.l $L438(%pc),%a1
L437:
br L700
L438:
and.w %d5,%d0
br L545
L445:
#
# During compilation
# %d7 == h-1,,w
# %d6 == flags,,rotate count
# %d5 == mask2
# %d4 == mask1
# %d3 == dst_dW,,bits for xxx.[wl]
# %d2 == src_dW,,bits for mov.[wl]
# %d1.w == parity
# %a6 -> optab
# %a5 -> next generated instruction
# %a4 -> top of inner loop
# %a3.w == rotate instruction
# %a2 -> src
# %a1 -> fragment "and" instruction
# %a0 -> dst
#
tst.w %d6; bne.b L480 # not NOSHIFT ==> always need first word
btst &NSHF1,%d6; bne.b L485 # interplay of NOSHIFT, odd, FDFRAG
L480:
mov.l &1,%d1
and.w %d7,%d1 # parity of inner word count
lsl.w &2,%d1 # even ==> frag in %d0, odd ==> frag in %d1
bsr genwid # generate for first word
and.w %d4,%d0
L485:
cmp.w %d7,&2; ble.b L490 # inner dbr always falls through
btst &FSTORE,%d6; beq.b L490 # no conflict "mov field" vs. %d6
short MOVWI+05300 # init inner count
mov.w %a4,%d6
L490:
mov.l %a5,%a4 # top of inner loop
asr.w &1,%d7 # check inner word count
blt.b L540 # single .l does it all
bcc.b L500 # even
beq.b L520 # 1
short MOVWI+05300
br.b L500 # jump into middle of inner loop
add.l &1,%a4 # remember to fixup "br.b"
add.w &1,%d7 # middle entry ==> no dbr offset
L500:
beq.b L530 # no inner words at all
mov.l &4,%d1 # use %d1 in
bsr.b genwid # even half of inner loop
short 0
L510:
mov.w %a4,%d0; neg.w %d0
bclr &0,%d0; beq.b L520
add.w %a5,%d0; mov.b %d0,(%a4)+ # fixup "br.b" into middle
L520:
mov.l &0,%d1 # use %d0 in
bsr.b genwid # odd half of inner loop
short 0
sub.w &1,%d7 # offset for inner dbr loop
ble.b L530 # dbr always falls through
mov.w &DBR+6,(%a5)+
sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
L530:
btst &LDFRAG,%d6; beq.b L540 # omit "and" for full last word
mov.l &4,%d1
bsr.b genwid
and.w %d5,%d0
L540:
tst.w %d7; ble.b L545 # no inner loop
btst &FSTORE,%d6; bne.b L545 # possible conflict "mov field" vs. %d6
short MOVWI+05300 # init inner count
mov.w %a4,%d6
L545:
swap.w %d3; tst.w %d3; beq.b L546 # wdst is full width of bitmap
mov.w %d3,%a1 # dst_dW
short MOVWI+05300
add.w %a1,%a0
L546:
swap.w %d2; tst.w %d2; beq.b L547 # wsrc is full width of bitmap
mov.w %d2,%a3 # src_dW
short MOVWI+05300
add.w %a3,%a2
L547:
mov.w &DBR+7,(%a5)+
mov.l %sp,%a4 # top of outer loop
cmp.b (%a4),&0x60; bne.b L548 # not br.b
mov.b 1(%a4),%d0; ext.w %d0; lea.l 2(%a4,%d0.w),%a4 # collapse branches
L548:
sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
short MOVWI+05300
jmp (%a5)
mov.w %d7,%a4 # init inner count
mov.w %d7,%d6 # init inner count, 2nd case
swap.w %d7 # h-1
lea.l $retgen(%pc),%a5
jmp (%sp)
genwid:
mov.l (%sp)+,%a1 # -> inline parameter
mov.l $genget(%pc,%d1.w),%d0
tst.w %d1; beq.b L550; mov.w &01001,%d1; swap.w %d1 # parity bits
L550:
clr.w %d2; clr.w %d3 # .[wl] bits default to .w
tst.l %d6; bpl.b L560; add.w &010,%d0 # RtoL
L560:
tst.w %d6; bne.b L569 # not NOSHIFT
bclr &9,%d0 # NOSHIFT always %d0
mov.w (%a1),%d1; bne.b L564 # not inner loop
btst &FSTORE,%d6; beq.b L562 # not "mov"
mov.l &070,%d1; and.w %d0,%d1
lsl.w &3,%d1; or.w %d1,%d0 # copy RtoL mode
add.w &-0x1000,%d0 # .w ==> .l
mov.w %d0,(%a5)+
L561:
jmp 2(%a1)
genget:
swap.w %d0; mov.w (%a2)+,%d0
swap.w %d1; mov.w (%a2)+,%d1
L562:
mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l
add.w %d2,%d0
L563:
mov.l &0,%d1 # NOSHIFT always %d0
br L698 # assemble the fetch, then do the op
L564:
lsr.w &1,%d1; bcs.b L562 # NOSHIFT always LD2D
btst &FD2D,%d6; bne.b L562
br.b L563 # alas, .w
L569:
mov.w (%a1),%d1; beq.b L630 # inner loop
L570:
lsr.w &1,%d1; bcs.b L580 # last word
add.w &-0x1000,%d0 # force fetch .l
mov.w %d0,(%a5)+ # the fetch .l
short MOVLI+00000
mov.l %d0,%d1
swap.w %d0
clr.w %d1; eor.l %d1,%d0 # parity for mov.l %d[01],%d[10]
tst.l %d1; sne.b %d1; sub.b %d1,%d0 # parity for swap.w %d[01]
mov.l %d0,(%a5) # ran out of registers
mov.l &0x4c80ec,%d0 # microcoded bits
tst.l %d6; bpl.b L572; ror.l &1,%d0 # RtoL
L572:
tst.w %d6; bpl.b L574; ror.l &2,%d0 # rol
L574:
btst &FD2D,%d6; beq.b L576; ror.l &4,%d0 # first op .l
mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l corrections
L576:
ror.l &1,%d0; bpl.b L578 # "swap" not needed
add.l &2,%a5
ror.l &8,%d0; bpl.b L577 # existing "swap" parity OK
eor.w &1,(%a5)
L577:
ror.l &8,%d0; bpl.b L578 # existing order OK
sub.l &2,%a5
mov.l (%a5),%d0; swap.w %d0; mov.l %d0,(%a5)
add.l &2,%a5
L578:
add.l &2,%a5
swap.w %d1 # junk,,parity
br.b L690
L580:
btst &LD2D,%d6; beq.b L630 # operator .w
mov.w &-0x1000,%d2 # mov.w +=> mov.l
mov.w &0100,%d3 # xxx.w +=> xxx.l
L630:
tst.l %d6; smi.b %d1
eor.b %d6,%d1; bpl.b L650 # rotation in same direction as scan
swap.w %d0 # interchange "swap" and "mov"
L650:
mov.l %d0,(%a5)+
swap.w %d1 # junk,,parity
mov.w (%a1),%d0; lsr.w &1,%d0; bcs.b L660 # last word
short MOVWI+000
mov.l %d0,%d1
eor.w %d1,%d0
mov.w %d0,(%a5)+
br.b L690
L660:
tst.l %d6; bmi.b L690 # RtoL
btst &LD2D,%d6; beq.b L690 # not .l
tst.w %d6; bpl.b L670 # ror
sub.l &2,%a5; br.b L690 # no "swap"
L670:
mov.w -4(%a5),(%a5)+ # extra "swap"
L690:
mov.w %a3,%d0
eor.b %d1,%d0
L698:
mov.w %d0,(%a5)+ # the rotate instruction
L700:
mov.w (%a1),%d0; beq.b L730 # inner loop
btst &0,%d0; bne.b L705 # last word
btst &FDFRAG,%d6; beq.b L730 # no "and"
L705:
add.w %d3,%d0; add.w %d1,%d0; sub.b %d1,%d0 # and.[wl] %d[45],%d[01]
btst &FSTORE,%d6; beq.b L720
# "mov" partial word
swap.w %d0 # save the "and"
short MOVWI+00000 # ,%d0
mov.w (%a0),%d6
add.w %d2,%d0 # mov.[wl]
tst.l %d6; bpl.b L710; add.w &020,%d0 # RtoL; "(%a0)" ==> "-(%a0)"
L710:
mov.w %d0,(%a5)+ # instr to fetch memory part of word
short MOVWI+00000 # ,%d0
eor.w %d6,%d0
add.w %d3,%d0; add.b %d1,%d0 # eor.[wl] %d6,%d[01]
swap.w %d0; mov.l %d0,(%a5)+; swap.w %d0; mov.w %d0,(%a5)+
mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
mov.l &-0100,%d1 # RtoL correction, if necessary
br.b L770
L720:
mov.w %d0,(%a5)+ # "and" for non-mov operators
L730:
mov.w 2(%a6),%d0; beq.b L740 # not F_CLR
add.w %d3,%d0; add.b %d1,%d0 # not.[wl] %d[01]
mov.w %d0,(%a5)+
L740:
btst &FSTORE,%d6; beq.b L790 # non-"mov"
mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
mov.l &0100,%d1 # RtoL correction, if necessary
L770:
add.w (%a6),%d0
tst.l %d6; bpl.b L780
add.w %d1,%d0 # RtoL correction
L780:
mov.w %d0,(%a5)+
jmp 2(%a1)
L790:
mov.w %d1,%d0; clr.b %d0; add.w %d3,%d0 # xxx.[wl] %d[01]
mov.l &010,%d1 # RtoL correction, if necessary
br.b L770
#
# During execution
# %d[01] == rotator
# %d2 [reserved for texture bits]
# %d3 [reserved for texture index]
# %d4 == mask1
# %d5 == mask2
# %d6.w == inner count
# %d7.w == outer count
# %a0 -> dst
# %a1 == dst_dW
# %a2 -> src
# %a3 == src_dW
# %a4.w == inner count init
# %a5 -> retgen
# %a6 [reserved for -> texture]
#
[-- Attachment #2: Type: text/html, Size: 84480 bytes --]
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [TUHS] A Reiser tour do force
2022-04-03 10:55 [TUHS] A Reiser tour do force Paul Ruizendaal via TUHS
@ 2022-04-03 12:26 ` Rob Pike
0 siblings, 0 replies; 3+ messages in thread
From: Rob Pike @ 2022-04-03 12:26 UTC (permalink / raw)
To: Paul Ruizendaal; +Cc: TUHS main list
Ah, yes, that's the one.
-rob
On Sun, Apr 3, 2022 at 9:03 PM Paul Ruizendaal via TUHS
<tuhs@minnie.tuhs.org> wrote:
>
> A not-very-thorough search at tuhs turned up V9/jerq/src/lib/j/bitblt.c
> It appears to be a pre-Reiser bitblt, not what was asked for.
>
>
> The Reiser code is in the V8 jerq tarball that Dan Cross donated:
> v8jerq.tar.bz2
>
> It is in file blit/src/libj/bitblt.s (attached below for convenience). It is 750 lines of 68K assembler. It does not appear to have been ported to the Bellmac 32 CPU. Maybe it did not make sense in that context.
>
> Paul
>
> =====
>
> #
> # bitblt(sm,r,dm,p,fc)
> # Bitmap *sm,*dm;
> # Rectangle r;
> # Point p;
> # int fc;
> #
> # by John F. Reiser summer 1982
> #
> # Depending on the case at hand, generate very good code and execute it.
> #
>
> # offsets in a Point
> set x,0
> set y,2
> # offsets in a Rectangle
> set origin,0
> set corner,4
> # offsets in a Bitmap
> set base,0
> set width,4
> set rect,6
> # parameter offsets from %fp
> set sm,8
> set r,12
> set dm,20
> set p,24
> set fc,28
>
> set NREG,11
>
> global bitblt
> bitblt:
> movm.l &0x3f3e,-(%sp) # save C registers
> movm.l NREG*4-4+sm(%sp),&0x001f
> # d1=r.o.x,,r.o.y; d2=r.c.x,,r.c.y; d4=p.x,,p.y;
> mov.l %d0,%a4 # sm
> mov.l %d3,%a5 # dm
> mov.w NREG*4-4+fc(%sp),%a6 # a6.w == fc
> movm.l rect(%a4),&0x9 # d0=sm.o.x,,sm.o.y; d3=sm.c.x,,sm.c.y;
> movm.l rect(%a5),&0x60 # d5=dm.o.x,,dm.o.y; d6=dm.c.x,,dm.c.y;
>
> lea.l $L50(%pc),%a0
> L5:
> # clip r.y to sm.y
> mov.w %d0,%d7 # sm.o.y
> sub.w %d1,%d7 # - r.o.y
> ble.b L10
> mov.w %d0,%d1 # r.o.y = sm.o.y; /* r.o.y was above sm.rect */
> add.w %d7,%d4 # p.y parallels r.o.y
> L10:
> cmp.w %d2,%d3 # r.c.y : sm.c.y
> ble.b L20
> mov.w %d3,%d2 # r.c.y = sm.c.y; /* bottom of r was below sm.rect */
> L20:
> # clip (r.y at p.y) to dm.y
> mov.w %d5,%d7 # dm.o.y
> sub.w %d4,%d7 # -p.y
> ble.b L30
> mov.w %d5,%d4 # p.y = dm.o.y; /* p.y was above dm.rect */
> add.w %d7,%d1 # r.o.y parallels p.y
> L30:
> mov.w %d1,%d7 # r.o.y
> add.w %d6,%d7 # + dm.c.y
> sub.w %d4,%d7 # - p.y /* == max y that dm.rect allows in r */
> cmp.w %d2,%d7 # r.c.y : limit
> ble.b L40
> mov.w %d7,%d2 # r.c.y = limit
> L40:
> mov.w %d2,%d7 # r.c.y
> sub.w %d1,%d7 # - r.o.y
> sub.w &1,%d7 # /* == h-1 in bits */
> blt.b ret
> jmp (%a0)
>
> retgen:
> lea.l gensiz(%sp),%sp
> ret8:
> add.l &8,%sp
> ret:
> movm.l (%sp)+,&0x7cfc
> rts
>
> L50:
> # mirror in pi/4 and reuse same code to clip x
> swap.w %d0; swap.w %d1; swap.w %d2; swap.w %d3
> swap.w %d4; swap.w %d5; swap.w %d6; swap.w %d7
> lea.l $L55(%pc),%a0
> br.b L5
>
> L55:
> mov.l %d1,%a1
> mov.l %d4,%d6
> #
> # So far
> # %d7 == h-1,,w-1
> # %d6 == p.y,,p.x
> # %a6.w == fc
> # %a5 == dm
> # %a4 == sm
> # %a1 == r.o.y,,r.o.x
> #
> # Compute masks, and width in words
> #
> mov.w %d6,%d0 # p.x /* left endpoint of dst */
> mov.w %d7,%d1 # w-1
> add.w %d6,%d1 # right endpoint
>
> mov.l &-1,%d3
> mov.l &15,%d2
> and.w %d0,%d2
> lsr.w %d2,%d3 # mask1
> mov.l &-1,%d5
> mov.l &15,%d2
> and.w %d1,%d2
> add.w &1,%d2
> lsr.w %d2,%d5
> not.w %d5 # mask2
> swap.w %d5
> mov.w %d3,%d5 # mask2,,mask1
>
> asr.w &4,%d0
> asr.w &4,%d1
> sub.w %d0,%d1
> sub.w &1,%d1 # inner-loop width in words
>
> mov.l &0,%d4 # assume LtoR
> mov.w width(%a5),%d3
> add.w %d3,%d3
> mov.w width(%a4),%d2
> add.w %d2,%d2
> #
> # So far
> # %d7 == h-1,,w-1 in bits
> # %d6 == p.y,,p.x
> # %d5 == mask2,,mask1
> # %d4 == 0 (LtoR)
> # %d3.w == dm width in bytes
> # %d2.w == sm width in bytes
> # %d1.w == inner-loop width in words
> # %a6.w == fc
> # %a5 == dm
> # %a4 == sm
> # %a1 == r.o.y,,r.o.x
> #
> # If necessary, compensate for overlap of source and destination
> #
> cmp.l %a4,%a5
> bne.b L80 # overlap not possible
> mov.l %d6,%d0 # p.y,,p.x
> mov.w %a1,%d0 # p.y,,r.o.x
> cmp.l %a1,%d0 # r.o.y : p.y
> bge.b L60 # if (r.o.y < p.y)
> mov.l %d7,%d0 # h-1,,w-1
> clr.w %d0 # h-1,,0
> add.l %d0,%a1 # r.o.y += h-1;
> add.l %d0,%d6 # p.y += h-1;
> neg.w %d3 # wdst = -wdst;
> neg.w %d2 # wsrc = -wsrc;
> L60:
> cmp.w %d7,&16
> blt.b L70 # l<->r swap not needed for narrow
> cmp.w %d6,%a1 # p.x : r.o.x
> ble.b L70 # if (r.o.x < p.x)
> mov.l %a1,%d0
> add.w %d7,%d0
> mov.l %d0,%a1 # r.o.x += w-1;
> add.w %d7,%d6 # p.x += w-1;
> mov.l &-1,%d4 # RtoL
> swap.w %d5 # masks in other order
> L70:
> L80:
> #
> # Locate actual starting points
> #
> mov.l %d6,%d0 # p.y,,p.x
> swap.w %d0
> mov.l %d0,-(%sp) # p
> mov.l %a5,-(%sp) # dm
>
> mov.l &15,%d0
> lea.l $L82(%pc),%a0 # assume narrow
> cmp.w %d7,%d0 # w-1 : 15
> ble.b L81 # guessed correctly
> lea.l $L85(%pc),%a0 # wide
> L81:
> mov.l %a0,-(%sp) # on return, go directly to wide/narrow code
> add.w %a6,%a6; add.w %a6,%a6 # with 4*fc
>
> mov.w %d1,%d7 # h-1 in bits,,inner width in words
> and.l %d0,%d6 # 0,,bit offset of p.x
> mov.l %a1,%d1 # r.o.y,,r.o.x
> and.w %d1,%d0 # bit offset of r.o.x
> sub.w %d0,%d6 # BO(p.x) - BO(r.o.x) /* amount of right rotation */
> swap.w %d1 # r.o.x,,r.o.y
> mov.l %d1,-(%sp) # r.o
> mov.l %a4,-(%sp) # sm
> lea.l addr,%a3
> jsr (%a3)
> mov.l %a0,%a2 # src = addr(sm,r.origin);
> add.l &8,%sp
> jmp (%a3) # %a0 = addr(dm,p);
> L82:
> mov.l &0,%d4
> mov.w %d5,%d4 # 0,,mask1
> swap.w %d5 # mask1,,mask2 (proper long mask; maybe 16 bits too wide)
> and.w %d5,%d4 # check for overlap of mask1 and mask2
> beq.b L83 # no overlap ==> %d5 already correct
> mov.l %d4,%d5 # overlap ==> reduce %d5 by 16 bits
> swap.w %d5 # and put it in the proper half
> L83:
> swap.w %d7 # ,,height-1
> lea.l $nrwtab(%pc,%a6.w),%a6 # -> optab
> tst.w %d6 # amount of right rotation
> bge.b L84
> neg.w %d6
> add.l &2,%a6
> L84:
> add.w (%a6),%a6
> jmp (%a6)
>
> nrwtab:
> short opMnwr-nrwtab- 0, opMnwl-nrwtab- 2
> short opSnwr-nrwtab- 4, opSnwl-nrwtab- 6
> short opCnwr-nrwtab- 8, opCnwl-nrwtab-10
> short opXnwr-nrwtab-12, opXnwl-nrwtab-14
>
> opMnwr:
> mov.l (%a2),%d0
> mov.l (%a0),%d1
> ror.l %d6,%d0
> eor.l %d1,%d0
> and.l %d5,%d0
> eor.l %d1,%d0
> mov.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opMnwr
> br ret8
>
> opMnwl:
> mov.l (%a2),%d0
> mov.l (%a0),%d1
> rol.l %d6,%d0
> eor.l %d1,%d0
> and.l %d5,%d0
> eor.l %d1,%d0
> mov.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opMnwl
> br ret8
>
> opSnwr:
> mov.l (%a2),%d0
> ror.l %d6,%d0
> and.l %d5,%d0
> or.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opSnwr
> br ret8
>
> opSnwl:
> mov.l (%a2),%d0
> rol.l %d6,%d0
> and.l %d5,%d0
> or.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opSnwl
> br ret8
>
> opCnwr:
> mov.l (%a2),%d0
> ror.l %d6,%d0
> and.l %d5,%d0
> not.l %d0
> and.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opCnwr
> br ret8
>
> opCnwl:
> mov.l (%a2),%d0
> rol.l %d6,%d0
> and.l %d5,%d0
> not.l %d0
> and.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opCnwl
> br ret8
>
> opXnwr:
> mov.l (%a2),%d0
> ror.l %d6,%d0
> and.l %d5,%d0
> eor.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opXnwr
> br ret8
>
> opXnwl:
> mov.l (%a2),%d0
> rol.l %d6,%d0
> and.l %d5,%d0
> eor.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opXnwl
> br ret8
>
> set DBR,0x51c8
> set MOVLI,0x2000+074 # mov.l &...,
> set MOVWI,0x3000+074 # mov.w &...,
> set ADDWI,0x0640 # add.w &...,
>
> set FDFRAG,16 # first destination is a fragment
> set LDFRAG,17 # last destination is a fragment
> set NSHF1,18
> set FD2D,19 # first destination should store 2 words
> set LD2D,20 # last destination should store 2 words
> set FSTORE,21
> set DST1L,24 # dst inner count is 0
> set SRC1L,25 # Nsrc is 2
>
> set gensiz,80
>
> widtab:
> mov.w %d0,(%a0)+; short 0
> or.w %d0,(%a0)+; short 0
> and.w %d0,(%a0)+; not.w %d0
> eor.w %d0,(%a0)+; short 0
>
> #
> # So far
> # %d7 == h-1 (bits),,w (words)
> # %d6 == 0,,rotate count
> # %d5 == mask2,,mask1
> # %d4 == -RtoL
> # %d3.w == wdst (bytes)
> # %d2.w == wsrc (bytes)
> # %a6.w == 4*fc
> # %a2 -> src
> # %a0 -> dst
> #
> L85:
> lea.l $widtab(%pc,%a6.w),%a6
> tst.w %d4; bpl.b L300; bset &31,%d6
> L300:
> mov.w %d7,%d0 # inner word count
> bne.b L304; bset &DST1L,%d6
> L304:
> add.w &1,%d0 # Nsrc = 1+Ninner
> mov.w %d0,%a1 # + ...
> add.w &1,%d0 # Ndst = 1+Ninner+1
> add.w %d0,%d0 # magnitude of dst addressing side effects
> tst.l %d6; bpl.b L310
> neg.w %d0; add.l &2,%a0 # RtoL
> L310:
> sub.w %d0,%d3 # compensate dst for autoincrement
>
> mov.w %d5,%d4 # mask1
> swap.w %d5 # mask2
>
> cmp.w %d4,&-1; beq.b L320; bset &FDFRAG,%d6
> L320:
>
> cmp.w %d5,&-1; seq.b %d1; beq.b L330; bset &LDFRAG,%d6
> L330:
>
> tst.w %d6; bne.b L360 # not NOSHIFT
> add.w &1,%a1 # Nsrc = 1+Ninner+1
> mov.l %d6,%d0; swap.w %d0; ext.w %d0 # 0,,flag bits
> asr.w &1,%d7; roxl.w &1,%d0 # account for inner words odd
> mov.b $nstab(%pc,%d0.w),%d0
> bpl.b L340; add.w &1,%d7
> L340:
> add.b %d0,%d0
> bpl.b L350; sub.w &1,%d7
> L350:
> swap.w %d0; eor.l %d0,%d6 # the bits
> btst &DST1L,%d6; bne.b L355
> btst &FD2D,%d6; beq.b L410
> L355:
> ext.l %d4; bmi.b L410; swap.w %d4; not.w %d4 # NOSHIFT mask1 .l
> br.b L410 # NOSHIFT mask2 .l
> nstab:
> byte 0x82,0x80,0x04,0x80 # 0x80: +1 inner; 0x40: -1 inner
> byte 0x02,0x00,0x44,0x00 # 0x04: FD2D; 0x02: NSHF1 no first word
> L360:
> ext.w %d1; sub.w %d1,%d7 # extend inner loop
>
> mov.l &0xf,%d0 # 0 1 7 8 9 e f
> add.w &8,%d6 # 8 9 f 0 1 6 7
> and.w %d0,%d6
> sub.w &8,%d6 # 0 1 7 -8 -7 -2 -1 X=C= sign
> mov.w %d6,%d1; bge.b L367 # X unchanged
> neg.w %d1 # 8 7 2 1 X=C= 1
> L367:
> roxl.w &1,%d1 # 0 2 e 11 f 5 3
> and.w %d0,%d1 # 0 2 e 1 f 5 3
> lsl.w &8,%d1 # magic position
> short ADDWI+001
> ror.l &8,%d0
> mov.w %d1,%a3 # the rotate instruction
>
> mov.l &0,%d1; not.w %d1 # 0,,-1
> ror.l %d6,%d1 # where the bits are after a rotate
>
> mov.w %d1,%d0; and.w %d4,%d0; beq.b L370 # 1 src word covers dst frag
> not.w %d1; and.w %d4,%d1; beq.b L370
> add.w &1,%a1; br.b L390 # fragment needs another src word
> L370:
> sub.w &1,%d7 # .l takes an inner word
> bset &FD2D,%d6
> ext.l %d4; bmi.b L390
> swap.w %d4; not.w %d4 # mask1 .l
> L390:
>
> swap.w %d1
>
> mov.w %d1,%d0; and.w %d5,%d0; beq.b L400 # 1 src word covers dst frag
> not.w %d1; and.w %d5,%d1; beq.b L400
> add.w &1,%a1; br.b L420 # fragment needs another src word
> L400:
> dbr %d7,L405 # .l takes an inner word
> clr.w %d7; br.b L420 # nothing there to take
> L405:
> L410:
> bset &LD2D,%d6
> ext.l %d5; bmi.b L420
> swap.w %d5; not.w %d5 # mask2 .l
> L420:
>
> tst.w NREG*4-4+fc+8(%sp); bne.b L430; bset &FSTORE,%d6
> L430:
> mov.w %a1,%d0 # Nsrc
> add.w %d0,%d0 # magnitude of src addressing side effects
> tst.l %d6; bpl.b L431
> neg.w %d0; add.l &2,%a2 # RtoL
> L431:
> sub.w %d0,%d2 # compensate src for autoincrement
>
> lea.l -gensiz(%sp),%sp
> mov.l %sp,%a5
> swap.w %d3
> swap.w %d2
>
> cmp.w %a1,&2; bgt L445
> short MOVWI+00000
> mov.l (%a2)+,%d0
> tst.l %d6; bpl.b L432; add.w &010,%d0 # RtoL
> L432:
> mov.w %d0,(%a5)+
> mov.l &0,%d1; mov.w &-0x1000,%d2; mov.w &0100,%d3
> lea.l $L438(%pc),%a1
> mov.l &-1,%d0 # prepare bits to decide on "swap"
> tst.w %d6; bpl.b L432d; neg.w %d6
> lsl.l %d6,%d0; br.b L432e
> L432d:
> lsr.l %d6,%d0
> L432e:
> btst &DST1L,%d6; beq.b L434
> bset &FD2D,%d6; bne.b L432a
> ext.l %d4; bmi.b L432a; swap.w %d4; not.w %d4 # mask1 .l
> L432a:
> bset &LD2D,%d6; bne.b L432b
> ext.l %d5; bmi.b L432b; swap.w %d5; not.w %d5 # mask2 .l
> L432b:
> and.l %d5,%d4; mov.l %d4,%d5 # single .l does it all
> add.l &1,%d4; beq L730 # all 32 bits
> sub.l &1,%d4 # need an "and"
> and.l %d5,%d0
> cmp.l %d5,%d0
> beq.b L432c
> short MOVWI+05300
> swap.w %d0
> L432c:
> tst.w %d6; bne L690 # and a rotate
> br.b L437 # NOSHIFT
> L434:
> mov.w %a3,(%a5)+ # the rotate instr
> short MOVWI+05300
> mov.l %d0,%d1 # copy after rotate
> and.l %d4,%d0
> cmp.l %d4,%d0
> seq.b %d0; neg.b %d0; ext.w %d0
> short ADDWI+000
> swap.w %d0
> mov.w %d0,(%a5)+
> lea.l $L436(%pc),%a1
> br.b L437
> L436:
> and.w %d4,%d0
> mov.w &01001,%d1; clr.w %d2; clr.w %d3
> lea.l $L438(%pc),%a1
> L437:
> br L700
> L438:
> and.w %d5,%d0
> br L545
> L445:
> #
> # During compilation
> # %d7 == h-1,,w
> # %d6 == flags,,rotate count
> # %d5 == mask2
> # %d4 == mask1
> # %d3 == dst_dW,,bits for xxx.[wl]
> # %d2 == src_dW,,bits for mov.[wl]
> # %d1.w == parity
> # %a6 -> optab
> # %a5 -> next generated instruction
> # %a4 -> top of inner loop
> # %a3.w == rotate instruction
> # %a2 -> src
> # %a1 -> fragment "and" instruction
> # %a0 -> dst
> #
> tst.w %d6; bne.b L480 # not NOSHIFT ==> always need first word
> btst &NSHF1,%d6; bne.b L485 # interplay of NOSHIFT, odd, FDFRAG
> L480:
> mov.l &1,%d1
> and.w %d7,%d1 # parity of inner word count
> lsl.w &2,%d1 # even ==> frag in %d0, odd ==> frag in %d1
> bsr genwid # generate for first word
> and.w %d4,%d0
> L485:
> cmp.w %d7,&2; ble.b L490 # inner dbr always falls through
> btst &FSTORE,%d6; beq.b L490 # no conflict "mov field" vs. %d6
> short MOVWI+05300 # init inner count
> mov.w %a4,%d6
> L490:
> mov.l %a5,%a4 # top of inner loop
> asr.w &1,%d7 # check inner word count
> blt.b L540 # single .l does it all
> bcc.b L500 # even
> beq.b L520 # 1
> short MOVWI+05300
> br.b L500 # jump into middle of inner loop
> add.l &1,%a4 # remember to fixup "br.b"
> add.w &1,%d7 # middle entry ==> no dbr offset
> L500:
> beq.b L530 # no inner words at all
> mov.l &4,%d1 # use %d1 in
> bsr.b genwid # even half of inner loop
> short 0
> L510:
> mov.w %a4,%d0; neg.w %d0
> bclr &0,%d0; beq.b L520
> add.w %a5,%d0; mov.b %d0,(%a4)+ # fixup "br.b" into middle
> L520:
> mov.l &0,%d1 # use %d0 in
> bsr.b genwid # odd half of inner loop
> short 0
> sub.w &1,%d7 # offset for inner dbr loop
> ble.b L530 # dbr always falls through
> mov.w &DBR+6,(%a5)+
> sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
> L530:
>
> btst &LDFRAG,%d6; beq.b L540 # omit "and" for full last word
> mov.l &4,%d1
> bsr.b genwid
> and.w %d5,%d0
> L540:
>
> tst.w %d7; ble.b L545 # no inner loop
> btst &FSTORE,%d6; bne.b L545 # possible conflict "mov field" vs. %d6
> short MOVWI+05300 # init inner count
> mov.w %a4,%d6
> L545:
> swap.w %d3; tst.w %d3; beq.b L546 # wdst is full width of bitmap
> mov.w %d3,%a1 # dst_dW
> short MOVWI+05300
> add.w %a1,%a0
> L546:
> swap.w %d2; tst.w %d2; beq.b L547 # wsrc is full width of bitmap
> mov.w %d2,%a3 # src_dW
> short MOVWI+05300
> add.w %a3,%a2
> L547:
> mov.w &DBR+7,(%a5)+
> mov.l %sp,%a4 # top of outer loop
> cmp.b (%a4),&0x60; bne.b L548 # not br.b
> mov.b 1(%a4),%d0; ext.w %d0; lea.l 2(%a4,%d0.w),%a4 # collapse branches
> L548:
> sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
> short MOVWI+05300
> jmp (%a5)
>
> mov.w %d7,%a4 # init inner count
> mov.w %d7,%d6 # init inner count, 2nd case
> swap.w %d7 # h-1
> lea.l $retgen(%pc),%a5
> jmp (%sp)
>
> genwid:
> mov.l (%sp)+,%a1 # -> inline parameter
> mov.l $genget(%pc,%d1.w),%d0
> tst.w %d1; beq.b L550; mov.w &01001,%d1; swap.w %d1 # parity bits
> L550:
> clr.w %d2; clr.w %d3 # .[wl] bits default to .w
> tst.l %d6; bpl.b L560; add.w &010,%d0 # RtoL
> L560:
> tst.w %d6; bne.b L569 # not NOSHIFT
> bclr &9,%d0 # NOSHIFT always %d0
> mov.w (%a1),%d1; bne.b L564 # not inner loop
> btst &FSTORE,%d6; beq.b L562 # not "mov"
> mov.l &070,%d1; and.w %d0,%d1
> lsl.w &3,%d1; or.w %d1,%d0 # copy RtoL mode
> add.w &-0x1000,%d0 # .w ==> .l
> mov.w %d0,(%a5)+
> L561:
> jmp 2(%a1)
> genget:
> swap.w %d0; mov.w (%a2)+,%d0
> swap.w %d1; mov.w (%a2)+,%d1
>
> L562:
> mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l
> add.w %d2,%d0
> L563:
> mov.l &0,%d1 # NOSHIFT always %d0
> br L698 # assemble the fetch, then do the op
> L564:
> lsr.w &1,%d1; bcs.b L562 # NOSHIFT always LD2D
> btst &FD2D,%d6; bne.b L562
> br.b L563 # alas, .w
> L569:
> mov.w (%a1),%d1; beq.b L630 # inner loop
> L570:
> lsr.w &1,%d1; bcs.b L580 # last word
> add.w &-0x1000,%d0 # force fetch .l
> mov.w %d0,(%a5)+ # the fetch .l
> short MOVLI+00000
> mov.l %d0,%d1
> swap.w %d0
> clr.w %d1; eor.l %d1,%d0 # parity for mov.l %d[01],%d[10]
> tst.l %d1; sne.b %d1; sub.b %d1,%d0 # parity for swap.w %d[01]
> mov.l %d0,(%a5) # ran out of registers
> mov.l &0x4c80ec,%d0 # microcoded bits
> tst.l %d6; bpl.b L572; ror.l &1,%d0 # RtoL
> L572:
> tst.w %d6; bpl.b L574; ror.l &2,%d0 # rol
> L574:
> btst &FD2D,%d6; beq.b L576; ror.l &4,%d0 # first op .l
> mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l corrections
> L576:
> ror.l &1,%d0; bpl.b L578 # "swap" not needed
> add.l &2,%a5
> ror.l &8,%d0; bpl.b L577 # existing "swap" parity OK
> eor.w &1,(%a5)
> L577:
> ror.l &8,%d0; bpl.b L578 # existing order OK
> sub.l &2,%a5
> mov.l (%a5),%d0; swap.w %d0; mov.l %d0,(%a5)
> add.l &2,%a5
> L578:
> add.l &2,%a5
> swap.w %d1 # junk,,parity
> br.b L690
> L580:
> btst &LD2D,%d6; beq.b L630 # operator .w
> mov.w &-0x1000,%d2 # mov.w +=> mov.l
> mov.w &0100,%d3 # xxx.w +=> xxx.l
> L630:
> tst.l %d6; smi.b %d1
> eor.b %d6,%d1; bpl.b L650 # rotation in same direction as scan
> swap.w %d0 # interchange "swap" and "mov"
> L650:
> mov.l %d0,(%a5)+
>
> swap.w %d1 # junk,,parity
> mov.w (%a1),%d0; lsr.w &1,%d0; bcs.b L660 # last word
> short MOVWI+000
> mov.l %d0,%d1
> eor.w %d1,%d0
> mov.w %d0,(%a5)+
> br.b L690
> L660:
> tst.l %d6; bmi.b L690 # RtoL
> btst &LD2D,%d6; beq.b L690 # not .l
> tst.w %d6; bpl.b L670 # ror
> sub.l &2,%a5; br.b L690 # no "swap"
> L670:
> mov.w -4(%a5),(%a5)+ # extra "swap"
> L690:
> mov.w %a3,%d0
> eor.b %d1,%d0
> L698:
> mov.w %d0,(%a5)+ # the rotate instruction
> L700:
>
> mov.w (%a1),%d0; beq.b L730 # inner loop
> btst &0,%d0; bne.b L705 # last word
> btst &FDFRAG,%d6; beq.b L730 # no "and"
> L705:
> add.w %d3,%d0; add.w %d1,%d0; sub.b %d1,%d0 # and.[wl] %d[45],%d[01]
> btst &FSTORE,%d6; beq.b L720
> # "mov" partial word
> swap.w %d0 # save the "and"
> short MOVWI+00000 # ,%d0
> mov.w (%a0),%d6
> add.w %d2,%d0 # mov.[wl]
> tst.l %d6; bpl.b L710; add.w &020,%d0 # RtoL; "(%a0)" ==> "-(%a0)"
> L710:
> mov.w %d0,(%a5)+ # instr to fetch memory part of word
> short MOVWI+00000 # ,%d0
> eor.w %d6,%d0
> add.w %d3,%d0; add.b %d1,%d0 # eor.[wl] %d6,%d[01]
> swap.w %d0; mov.l %d0,(%a5)+; swap.w %d0; mov.w %d0,(%a5)+
> mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
> mov.l &-0100,%d1 # RtoL correction, if necessary
> br.b L770
> L720:
> mov.w %d0,(%a5)+ # "and" for non-mov operators
> L730:
> mov.w 2(%a6),%d0; beq.b L740 # not F_CLR
> add.w %d3,%d0; add.b %d1,%d0 # not.[wl] %d[01]
> mov.w %d0,(%a5)+
> L740:
> btst &FSTORE,%d6; beq.b L790 # non-"mov"
> mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
> mov.l &0100,%d1 # RtoL correction, if necessary
> L770:
> add.w (%a6),%d0
> tst.l %d6; bpl.b L780
> add.w %d1,%d0 # RtoL correction
> L780:
> mov.w %d0,(%a5)+
> jmp 2(%a1)
>
> L790:
> mov.w %d1,%d0; clr.b %d0; add.w %d3,%d0 # xxx.[wl] %d[01]
> mov.l &010,%d1 # RtoL correction, if necessary
> br.b L770
>
> #
> # During execution
> # %d[01] == rotator
> # %d2 [reserved for texture bits]
> # %d3 [reserved for texture index]
> # %d4 == mask1
> # %d5 == mask2
> # %d6.w == inner count
> # %d7.w == outer count
> # %a0 -> dst
> # %a1 == dst_dW
> # %a2 -> src
> # %a3 == src_dW
> # %a4.w == inner count init
> # %a5 -> retgen
> # %a6 [reserved for -> texture]
> #
>
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [TUHS] A Reiser tour do force
@ 2022-04-03 0:35 Douglas McIlroy
0 siblings, 0 replies; 3+ messages in thread
From: Douglas McIlroy @ 2022-04-03 0:35 UTC (permalink / raw)
To: TUHS main list
> Does {Reiser's bitblt replacement] exist for the rest of us to study?
A not-very-thorough search at tuhs turned up V9/jerq/src/lib/j/bitblt.c
It appears to be a pre-Reiser bitblt, not what was asked for. But
weighing in at over 800 lines of C, it vivifies the challenge that
Reiser met in half the space.
Doug
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2022-04-03 12:31 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-03 10:55 [TUHS] A Reiser tour do force Paul Ruizendaal via TUHS
2022-04-03 12:26 ` Rob Pike
-- strict thread matches above, loose matches on Subject: below --
2022-04-03 0:35 Douglas McIlroy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).