English Amiga Board

English Amiga Board (https://eab.abime.net/index.php)
-   Coders. General (https://eab.abime.net/forumdisplay.php?f=37)
-   -   Optimizing polygonfill bitcopy (https://eab.abime.net/showthread.php?t=99764)

TCH 22 November 2019 13:57

Optimizing polygonfill bitcopy
 
After i drawn it's edges and xorfilled the polygon on a 1-bit plane, i copy the data to the actual planes. Since i wanted to support dither (like PenA and PenB in the OS API), i built a two by eight support table for masking the planes when copying:
Code:

t = 2;
cmA = 0x01;
cmB = 0x02;
while (t-- > 0)
{
        cmask = 1;
        i = D;
        while (--i >= 0)
        {
                pattern16 = 0x0000;
                if ((cmask & ColorA) != 0)
                {
                        pattern16 |= cmA;
                }
                if ((cmask & ColorB) != 0)
                {
                        pattern16 |= cmB;
                }
                if (pattern16 != 0)
                {
                        pattern16 |= pattern16 << 2;
                        pattern16 |= pattern16 << 4;
                        pattern16 |= pattern16 << 8;
                }
                cpattern32[t][i] = (pattern16 << 16) | pattern16;
                cmask <<= 1;
        }
        cmA ^= 0x03;
        cmB ^= 0x03;
}

And there is the actual copy which runs slow:
Code:

while (--h >= 0)
{
        dstptr32 = (ULONG *)DestArea;
        i = D;
        while (--i >= 0)
        {
                srcptr32 = (ULONG *)TempArea;
                pattern32 = cpattern32[h & 1][i];
                t = w;
                while (--t >= 0)
                {
                        read32 = *srcptr32++;
                        *dstptr32 &= ~read32;
                        *dstptr32++ |= (read32 & pattern32);
                }
                dstptr32 += Stepper;
        }
        TempArea += RowSize;
        DestArea += RowSize;
}

I tried to rearrange the innermost cycle, but this setup seemed the fastest. However, maybe the entire approach is wrong? How should be the three cycles structured? Lines->Planes->Columns as now or the Planes should be the outer or maybe the innermost? I've tried to make the Planes as the outer one and it became much (2x) slower. I did not tried to make them the inner, since it seems to be crazy.

Any suggestion how to make this bitcopy more effective? (The algorithm, i mean. When that is done, i will rewrite it into assembly.)

hooverphonique 22 November 2019 15:11

Do you actually need to clear the destination inside the loop (*dstptr16 &= ..) ?

The blitter is perfect for this sort of copy with logical op applied, btw ;-)

TCH 22 November 2019 22:05

Of course i do. There is no tell what the corresponding bits on that plane will be. We are doing dithering fill here. I only could spare the zeroing and then the writing, if i would copy one plane, because that necessarily will hold only one type of bits (depending if i filled the polygon on the temporary plane with zeros or ones) and thus i could just OR or AND them with the destination plane. But on multiple planes, this is dependent on the color index. Example with two planes with dithering colours number 2 and 3:
Code:

PLANE 0:
???10???
??0??1??
??0??1??
??0101??
??0??1??
??0??1??

PLANE 1:
???11???
??1??1??
??1??1??
??1111??
??1??1??
??1??1??

I know, that the Blitter can do this, but what i am trying to optimize here is a software copy, because the Blitter will always run on 7.14 MHz, thus a software polygon fill will be faster on a faster CPU with Fast RAM. Or at least, it should be.

TCH 24 November 2019 00:33

I rewrote it in ASM, but it only got slower. Any ideas?
Code:

        section code,code

; PolygonBitmapToPlanes32()
;
; args:
; -----
; a0.l = DestArea
; a1.l = TempArea
; d0.w = Height
; d1.w = Width
; d2.l = Modulo
; d3.l = RowSize
; d4.w = Depth
; a2.l = PatternLUT
;
; local:
; ------
; a3.l = SrcPtr
; a4.l = DestPtr
; a6.w = Depth
; d4.w = PlaneCounter
; d5.w = WidthCounter
; d6.l = CurrentPattern
; d7.l = Temp

                xref _PolygonBitmapToPlanes32
                xdef _PolygonBitmapToPlanes32
_PolygonBitmapToPlanes32:
                move.l        a3,-(sp)
                move.l        a4,-(sp)
                move.w        a6,-(sp)
                move.w        d5,-(sp)
                move.l        d6,-(sp)
                move.l        d7,-(sp)

                subq.w        #1,d4                ; Depth--;
                movea.w        d4,a6                ; a6.w = Depth

                asl.l        #2,d2                ; Modulo <<= 2;
c_h:                move.l        a0,a4                ; DestPtr = DestArea;
                move.w        a6,d4                ; PlaneCounter = Depth;

c_p:                movea.l        a1,a3                ; SrcPtr = TempArea;
                move.w        d0,d7                ; Temp = Height;
                andi.w        #1,d7                ; Temp &= 1;
                asl.w        #3,d7                ; Temp <<= 3;
                or.w        d4,d7                ; Temp |= PlaneCounter;
                asl.w        #2,d7                ; Temp <<= 2;
                move.l        (a2,d7),d6        ; CurrentPattern = PatternLUT[Temp];
                move.w        d1,d5                ; WidthCounter = Width;

c_w:                move.l        (a3)+,d7        ; Temp = *SrcPtr++;
                not.l        d7                ; Temp = ~Temp;
                and.l        d7,(a4)                ; *DestPtr &= Temp;
                not.l        d7                ; Temp = ~Temp;
                and.l        d6,d7                ; Temp &= CurrentPattern;
                or.l        d7,(a4)+        ; *DestPtr++ |= Temp;
                subq.w        #1,d5                ; WidthCounter--;
                bne        c_w                ; if (WidthCounter != 0) goto c_w;

                adda.l        d2,a4                ; DestPtr += Modulo;
                subq.b        #1,d4                ; PlaneCounter--;
                bpl        c_p                ; if (PlaneCounter >= 0) goto c_p;

                adda.l        d3,a0                ; DestArea += RowSize;
                adda.l        d3,a1                ; TempArea += RowSize;
                subq.w        #1,d0                ; Height--;
                bne        c_h                ; if (Height != 0) goto c_h;

                move.l        (sp)+,d7
                move.l        (sp)+,d6
                move.w        (sp)+,d5
                move.w        (sp)+,a6
                move.l        (sp)+,a4
                move.l        (sp)+,a3
                rts


a/b 24 November 2019 08:07

Going to bed, so this is only a few obvious things, will probably look at it tomorrow.
Code:

_PolygonBitmapToPlanes32:
;                move.l        a3,-(sp)
;                move.l        a4,-(sp)
;                move.w        a6,-(sp)
;                move.w        d5,-(sp)
;                move.l        d6,-(sp)
;                move.l        d7,-(sp)
 movem.l        d5-d7/a3-a6,-(a7)
 subq.w        #1,d1        ; push to stack if you have to preserve it
 move.l        d3,a5        ; row size
                subq.w        #1,d4                ; Depth--;
 asl.w        #2,d4
                movea.w        d4,a6                ; a6.w = Depth

                asl.l        #2,d2                ; Modulo <<= 2;
c_h:                move.l        a0,a4                ; DestPtr = DestArea;
                move.w        a6,d4                ; PlaneCounter = Depth;

c_p:                movea.l        a1,a3                ; SrcPtr = TempArea;
;                move.w        d0,d7                ; Temp = Height;
;                andi.w        #1,d7                ; Temp &= 1;
 moveq        #1,d7
 and.w        d0,d7
;                asl.w        #3,d7                ; Temp <<= 3;
 asl.w        #3+2,d7                ; Temp <<= 3+2;
                or.w        d4,d7                ; Temp |= PlaneCounter<<2;
;                asl.w        #2,d7                ; Temp <<= 2;
;                move.l        (a2,d7),d6        ; CurrentPattern = PatternLUT[Temp];
 move.l        (a2,d7.w),d6        ; CurrentPattern = PatternLUT[Temp];
                move.w        d1,d5                ; WidthCounter = Width-1;

c_w:                move.l        (a3)+,d7        ; Temp = *SrcPtr++;
;                not.l        d7                ; Temp = ~Temp;
;                and.l        d7,(a4)                ; *DestPtr &= Temp;
;                not.l        d7                ; Temp = ~Temp;
;                and.l        d6,d7                ; Temp &= CurrentPattern;
;                or.l        d7,(a4)+        ; *DestPtr++ |= Temp;
 move.l        d7,d3
 not.l        d7
 and.l        (a4),d7
 and.l        d6,d3
 or.l        d3,d7
 move.l        d7,(a4)+
;                subq.w        #1,d5                ; WidthCounter--;
;                bne        c_w                ; if (WidthCounter != 0) goto c_w;
 dbf        d5,c_w                ; if (WidthCounter >= 0) goto c_w;

                adda.l        d2,a4                ; DestPtr += Modulo;
;                subq.b        #1,d4                ; PlaneCounter--;
;                bpl        c_p                ; if (PlaneCounter >= 0) goto c_p;
 subq.w        #1<<2,d4                ; PlaneCounter -= 1<<2;
 bpl.b        c_p                ; if (PlaneCounter >= 0) goto c_p;

;                adda.l        d3,a0                ; DestArea += RowSize;
;                adda.l        d3,a1                ; TempArea += RowSize;
 adda.l        a5,a0                ; DestArea += RowSize;
 adda.l        a5,a1                ; TempArea += RowSize;
                subq.w        #1,d0                ; Height--;
;                bne        c_h                ; if (Height != 0) goto c_h;
 bne.s        c_h                ; if (Height != 0) goto c_h;

;                move.l        (sp)+,d7
;                move.l        (sp)+,d6
;                move.w        (sp)+,d5
;                move.w        (sp)+,a6
;                move.l        (sp)+,a4
;                move.l        (sp)+,a3
 movem.l        (a7)+,d5-d7/a3-a6
                rts


TCH 24 November 2019 10:05

Unfortunately the code crashes. The polygon has been drawn, but dots appear on the screen in a matrix and then, GURU.

Edit: And i think i know why. If i used register
a5
, then my code crashed too. That is why i skipped it and use
a6
instead. Preserving it in the stack and pulling it back at the end did not helped at all. If used, crashed.

Edit #2: Nope, i've found it: you pushed registers to the stack by
movem.l	d5-d7/a3-a6,-(a7)
, but you pulled them back by
movem.l	(a7)+,d5-d7/a3/a4/a6
, skipping
a5
. Now it works. And while it's significantly faster than my original code, it is still slower than the C code.

http://oscomp.hu/depot/PolygonBitmapToPlanes32.png

I'm gonna check your modifications, to understand this.

ross 24 November 2019 10:48

Quote:

Originally Posted by TCH (Post 1361008)
Edit: And i think i know why. If i used register
a5
, then my code crashed too. That is why i skipped it and use
a6
instead. Preserving it in the stack and pulling it back at the end did not helped at all. If used, crashed.

The content of A5 is modified by something external: an IRQ routine with bad registers preserving.

TCH 24 November 2019 10:57

It's possible, but it seems, that the problem was, that a/b pushed
a5
into the stack, but did not pull it back. I have no idea what was the problem, when i did mess around
a5
. Either i messed up something about push/pull too, or you're right and an external interrupt messed up my registers.

ross 24 November 2019 11:12

Quote:

Originally Posted by TCH (Post 1361020)
It's possible, but it seems, that the problem was, that a/b pushed
a5
into the stack, but did not pull it back. I have no idea what was the problem, when i did mess around
a5
. Either i messed up something about push/pull too, or you're right and an external interrupt messed up my registers.

EDIT: uh, sorry, I missed your EDIT2, so probably not a bad preserving in IRQ

Weird that a/b code is slower than C...

a/b 24 November 2019 15:42

Yeah, as I said I glanced over it and only did the obvious/simple optimizations. And apparently too tired to even do that properly ;P.

Don_Adan 24 November 2019 16:15

Quote:

Originally Posted by ross (Post 1361026)
EDIT: uh, sorry, I missed your EDIT2, so probably not a bad preserving in IRQ

Weird that a/b code is slower than C...

I dont see original C version in 68k assembler. Anyway using move.w ax,-(sp) and move.w (sp)+,ax is not good idea for me, because ax will be autoextended to longword size, if i remember right.

ross 24 November 2019 16:37

Quote:

Originally Posted by Don_Adan (Post 1361081)
I dont see original C version in 68k assembler. Anyway using move.w ax,-(sp) and move.w (sp)+,ax is not good idea for me, because ax will be autoextended to longword size, if i remember right.

Yes, it would be interesting to see the version generated by the compiler.

Regarding the stack, only the move.b instructions are lined up to word.

Don_Adan 24 November 2019 16:46

Quote:

Originally Posted by ross (Post 1361084)
Yes, it would be interesting to see the version generated by the compiler.

Regarding the stack, only the move.b instructions are lined up to word.

Seems you dont understand me.
F.e
A5=$00010000
Move.w a5,-(sp) ; $0000
Move.w (sp)+,a5, $00000000
A5 is not restored correctly, if i remember right.

a/b 24 November 2019 16:57

Don't understand this part:
Code:

                asl.l        #2,d2                ; Modulo <<= 2;
...
                adda.l        d2,a4                ; DestPtr += Modulo;

Why modulo*4? Convert longwords to bytes, or...?
Basically I need an extra free reg (almost all code between c_p and c_w can be pushed one level higher but need an extra reg to avoid using stack) and trying to figure out the relation between width/rowsize/modulo.
What do you send in d1/d2/d3 and how is data organized in memory?

a/b 24 November 2019 17:01

Word move to areg will auto-expand to longword and destroy whatever it was in the upper half of areg. Same with e.g. adda.w (it will expand to longword and do a longword addition to areg), etc.

ross 24 November 2019 17:07

Quote:

Originally Posted by Don_Adan (Post 1361086)
Seems you dont understand me.
F.e
A5=$00010000
Move.w a5,-(sp) ; $0000
Move.w (sp)+,a5, $00000000
A5 is not restored correctly, if i remember right.

Ah this, yes, you are right :)

So TCH had been lucky with A6 in its version (and explain crash with A5).

a/b 24 November 2019 19:30

Try this, I take no reposibility if anything explodes ;P.

A few things...
1) It seems you are accessing pattern LUT backwards. You are going through bitplanes 0->N, while LUT index goes N->0. This version below is going 0->N in both cases, which I pressume is the correct way.
2) Temp area seems to be the same width as dest (full width), but is organized differently.
dest:
- line0: AAAA00BBBB00CCCC00DDD00,
- line1: AAAA00BBBB00CCCC00DDD00, ...
temp:
- line0: AAAABBBBCCCCDDD00000000,
- line1: AAAABBBBCCCCDDD00000000, ...
A=bpl0, B=bpl1, C=bpl2, D=bpl3, 0 = area not affected by a blit
If my assumption is corrent, the explosion won't be too big.

Code:

; d3 (rowsize) is not needed, we assume that rowsize = depth*((width+modulo)<<2)
_PolygonBitmapToPlanes32:
                movem.l        d2-d7/a2-a6,-(a7)
                asl.l        #2,d2                ; Modulo <<= 2;
                move.l        d2,a6
                mulu.w        d4,d2
                move.l        d2,a3                ; a3.l = Depth*Modulo
                subq.w        #1,d0                ; Height--;
                subq.w        #1,d1                ; Width--;
                subq.w        #1,d4                ; Depth--;
                move.w        d4,a5                ; a5.w = Depth
                moveq        #0,d2

c_h:                lea        (a2,d2.w),a4
                eor.w        #8<<2,d2        ; alternate between 0 and 8<<2

                move.w        a5,d4                ; PlaneCounter = Depth;
c_p:                move.l        (a4)+,d6        ; CurrentPattern

                move.w        d1,d5                ; WidthCounter = Width-1;
c_w:                move.l        (a1)+,d7        ; Temp = *SrcPtr++;
                move.l        d7,d3
                not.l        d7
                and.l        (a0),d7
                and.l        d6,d3
                or.l        d3,d7
                move.l        d7,(a0)+        ; *DestPtr++ = (*DestPtr&~Temp)|(CurrentPattern&Temp)
                dbf        d5,c_w                ; if (--WidthCounter >= 0) goto c_w;

                adda.l        a6,a0                ; DestPtr += Modulo;
                dbf        d4,c_p                ; if (--PlaneCounter >= 0) goto c_p;

                adda.l        a3,a1                ; TempAreaPtr += Depth*Modulo;
                dbf        d0,c_h                ; if (--Height >= 0) goto c_h;

                movem.l        (a7)+,d2-d7/a2-a6
                rts


TCH 24 November 2019 23:22

@Don_Adan:
That's a good thing to know, that address registers needs to be push/pull as longwords, thanks.

@a/b:
Modulo
is multiplied by 4, because former C cycles were using it with 32-bit pointers. In this post i posted the ASM code, and the args were listed, but now i elaborate then.
a0.l
is the destination zone's upper left corner on the zeroth plane of the screen.
a1.l
is the same, but for the source bitmap, which has only one plane.
d0.w
and
d1.w
are the height (as in number of rows) and width (as in number of longwords per row), in this order. These are the dimensions of the work area, not the screen.
d2.l
is modulo value, as in how many longwords are left on this screen, until the beginning of the same line of the work area on the next plane.
d3.l
is the number of bytes per row of one plane on the screen. (As in:
RastPort->BitMap->BytesPerRow
)
d4.w
is the depth of the screen. It is passed as word, because if i'd send a byte, then pushing it to the stack would mess up word-alignment. Maybe this is not necessary, i was only cautious. And
a2.l
is the LUT for patterns, which are filled up with the code in my opening post.

The source area is a simple 1-bit bitmap, with identical dimensions to the destination screen. First, i calculate the work rectangle by iterating the vertice coordinates of the polygon, then i clear the work rectangle on it with zeros, then draw the outlines of the polygon, then fill it up with ones with xorfill.

Then i copy this workarea to each plane of the destination screen, line by line, but masked with the corresponding value of LUT. (Direct copy would result in the last colour index all times.)

As for the LUT, you can see, how it's filled up in the opening post.
Basically it's iterating through the bits of both colour indexes and puts together patterns for all planes, in which one pixel is one and next pixel is other, so a LUT entry is a mask pattern for a bitplane. And since it has to alternate per lines, the LUT has 16 entries and the upper entries has these pattern, but with swapped colours (by XOR 3, but i've just realized, that XOR is not needed, as neither the double cycle, i can just rotate the patterns by one bit and put them to pattern
[1][i]
). The result is visible on that picture i linked in previously. (Without the separate dots of course.)
However, i filled up LUT backwards, to be able to decrement the loop variable to zero, instead of incrementing it to the depth, and to spare a
D - i
operation each time.

So, RowSize is needed to be passed.

(Your code did not exploded, but did not worked also, due to this.)

a/b 25 November 2019 01:17

> d2.l is modulo value, as in how many longwords are left on this screen, until the beginning of the same line of the work area on the next plane

OK, this is important because it's misleading. It's a delta, not modulo (semantics :p ).
Modulo would be, for me, how many bytes until the next line, e.g. bitmap width is 320px and blit width is 256px, so modulo is then 64px or 8 bytes.
While in your case it's 320*bitmap_height/8 bytes, so for a 320x256 bitmap that's 10240 bytes.
That got me confused, and when I asked what do you send in d1/d2/d3 I meant the actual values so I could clearly see what's going on.

OK, I'll post a fixed version later today...

a/b 25 November 2019 03:54

Did a simple test with a 320x256x3 bitmap and 256x128 blit size, and the produced output 100% matched the original code's output, so this should work in a non-explosive way.
And yeah, pattern LUT must be in normal (non-backward) order...

Code:

_PolygonBitmapToPlanes32:
        movem.l        d2-d7/a2-a6,-(a7)
        move.l        d2,d7
        asl.l        #2,d2                ; longwords to bytes
        move.l        d2,a6                ; a6 = Modulo<<2 = BitplaneSize-Width<<2

        add.w        d1,d7
        mulu.w        d4,d7
        asl.l        #2,d7                ; longwords to bytes
        sub.l        d3,d7
        move.l        d7,-(a7)        ; (a7) = Depth*BitplaneSize-RowSize

        subq.w        #1,d0                ; Height--;
        subq.w        #1,d1                ; Width--;
        subq.w        #1,d4                ; Depth--;
        move.w        d4,a5                ; a5.w = Depth
        moveq        #0,d2

c_h:        lea        (a2,d2.w),a4
        eor.w        #8<<2,d2        ; alternate between 0 and 8<<2

        swap        d0
        move.w        a5,d0                ; PlaneCounter = Depth;
c_p:        movea.l        a1,a3                ; SrcPtr = TempArea;
        move.l        (a4)+,d6        ; CurrentPattern

        move.w        d1,d5                ; WidthCounter = Width-1;
c_w:        move.l        (a3)+,d7        ; Temp = *SrcPtr++;
        move.l        d7,d4
        not.l        d7
        and.l        (a0),d7
        and.l        d6,d4
        or.l        d4,d7
        move.l        d7,(a0)+        ; *DestArea++ = (*DestArea&~Temp)|(CurrentPattern&Temp);
        dbf        d5,c_w                ; if (--WidthCounter >= 0) goto c_w;

        adda.l        a6,a0                ; DestArea += BitplaneSize-Width<<2;
        dbf        d0,c_p                ; if (--PlaneCounter >= 0) goto c_p;

        swap        d0
        suba.l        (a7),a0                ; DestArea += RowSize-Depth*BitplaneSize;
        adda.l        d3,a1                ; TempArea += RowSize;
        dbf        d0,c_h                ; if (--Height >= 0) goto c_h;

        addq.l        #4,a7
        movem.l        (a7)+,d2-d7/a2-a6
        rts



All times are GMT +2. The time now is 20:54.

Powered by vBulletin® Version 3.8.11
Copyright ©2000 - 2024, vBulletin Solutions Inc.

Page generated in 0.05397 seconds with 11 queries