22 November 2019, 13:57 | #1 |
Newbie Amiga programmer
Join Date: Jun 2012
Location: Front of my A500+
Age: 38
Posts: 372
|
Optimizing polygonfill bitcopy
After i drawn it's edges and xorfilled the polygon on a 1-bit plane, i copy the data to the actual planes. Since i wanted to support dither (like PenA and PenB in the OS API), i built a two by eight support table for masking the planes when copying:
Code:
t = 2; cmA = 0x01; cmB = 0x02; while (t-- > 0) { cmask = 1; i = D; while (--i >= 0) { pattern16 = 0x0000; if ((cmask & ColorA) != 0) { pattern16 |= cmA; } if ((cmask & ColorB) != 0) { pattern16 |= cmB; } if (pattern16 != 0) { pattern16 |= pattern16 << 2; pattern16 |= pattern16 << 4; pattern16 |= pattern16 << 8; } cpattern32[t][i] = (pattern16 << 16) | pattern16; cmask <<= 1; } cmA ^= 0x03; cmB ^= 0x03; } Code:
while (--h >= 0) { dstptr32 = (ULONG *)DestArea; i = D; while (--i >= 0) { srcptr32 = (ULONG *)TempArea; pattern32 = cpattern32[h & 1][i]; t = w; while (--t >= 0) { read32 = *srcptr32++; *dstptr32 &= ~read32; *dstptr32++ |= (read32 & pattern32); } dstptr32 += Stepper; } TempArea += RowSize; DestArea += RowSize; } Any suggestion how to make this bitcopy more effective? (The algorithm, i mean. When that is done, i will rewrite it into assembly.) Last edited by TCH; 23 November 2019 at 13:00. Reason: a/b-s cycle tip is applied for a small amount of speed; it turned out, that 32-bit copy is faster, regardless of bus size |
22 November 2019, 15:11 | #2 |
ex. demoscener "Bigmama"
Join Date: Jun 2012
Location: Fyn / Denmark
Posts: 1,631
|
Do you actually need to clear the destination inside the loop (*dstptr16 &= ..) ?
The blitter is perfect for this sort of copy with logical op applied, btw ;-) |
22 November 2019, 22:05 | #3 |
Newbie Amiga programmer
Join Date: Jun 2012
Location: Front of my A500+
Age: 38
Posts: 372
|
Of course i do. There is no tell what the corresponding bits on that plane will be. We are doing dithering fill here. I only could spare the zeroing and then the writing, if i would copy one plane, because that necessarily will hold only one type of bits (depending if i filled the polygon on the temporary plane with zeros or ones) and thus i could just OR or AND them with the destination plane. But on multiple planes, this is dependent on the color index. Example with two planes with dithering colours number 2 and 3:
Code:
PLANE 0: ???10??? ??0??1?? ??0??1?? ??0101?? ??0??1?? ??0??1?? PLANE 1: ???11??? ??1??1?? ??1??1?? ??1111?? ??1??1?? ??1??1?? |
24 November 2019, 00:33 | #4 |
Newbie Amiga programmer
Join Date: Jun 2012
Location: Front of my A500+
Age: 38
Posts: 372
|
I rewrote it in ASM, but it only got slower. Any ideas?
Code:
section code,code ; PolygonBitmapToPlanes32() ; ; args: ; ----- ; a0.l = DestArea ; a1.l = TempArea ; d0.w = Height ; d1.w = Width ; d2.l = Modulo ; d3.l = RowSize ; d4.w = Depth ; a2.l = PatternLUT ; ; local: ; ------ ; a3.l = SrcPtr ; a4.l = DestPtr ; a6.w = Depth ; d4.w = PlaneCounter ; d5.w = WidthCounter ; d6.l = CurrentPattern ; d7.l = Temp xref _PolygonBitmapToPlanes32 xdef _PolygonBitmapToPlanes32 _PolygonBitmapToPlanes32: move.l a3,-(sp) move.l a4,-(sp) move.w a6,-(sp) move.w d5,-(sp) move.l d6,-(sp) move.l d7,-(sp) subq.w #1,d4 ; Depth--; movea.w d4,a6 ; a6.w = Depth asl.l #2,d2 ; Modulo <<= 2; c_h: move.l a0,a4 ; DestPtr = DestArea; move.w a6,d4 ; PlaneCounter = Depth; c_p: movea.l a1,a3 ; SrcPtr = TempArea; move.w d0,d7 ; Temp = Height; andi.w #1,d7 ; Temp &= 1; asl.w #3,d7 ; Temp <<= 3; or.w d4,d7 ; Temp |= PlaneCounter; asl.w #2,d7 ; Temp <<= 2; move.l (a2,d7),d6 ; CurrentPattern = PatternLUT[Temp]; move.w d1,d5 ; WidthCounter = Width; c_w: move.l (a3)+,d7 ; Temp = *SrcPtr++; not.l d7 ; Temp = ~Temp; and.l d7,(a4) ; *DestPtr &= Temp; not.l d7 ; Temp = ~Temp; and.l d6,d7 ; Temp &= CurrentPattern; or.l d7,(a4)+ ; *DestPtr++ |= Temp; subq.w #1,d5 ; WidthCounter--; bne c_w ; if (WidthCounter != 0) goto c_w; adda.l d2,a4 ; DestPtr += Modulo; subq.b #1,d4 ; PlaneCounter--; bpl c_p ; if (PlaneCounter >= 0) goto c_p; adda.l d3,a0 ; DestArea += RowSize; adda.l d3,a1 ; TempArea += RowSize; subq.w #1,d0 ; Height--; bne c_h ; if (Height != 0) goto c_h; move.l (sp)+,d7 move.l (sp)+,d6 move.w (sp)+,d5 move.w (sp)+,a6 move.l (sp)+,a4 move.l (sp)+,a3 rts |
24 November 2019, 08:07 | #5 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
Going to bed, so this is only a few obvious things, will probably look at it tomorrow.
Code:
_PolygonBitmapToPlanes32: ; move.l a3,-(sp) ; move.l a4,-(sp) ; move.w a6,-(sp) ; move.w d5,-(sp) ; move.l d6,-(sp) ; move.l d7,-(sp) movem.l d5-d7/a3-a6,-(a7) subq.w #1,d1 ; push to stack if you have to preserve it move.l d3,a5 ; row size subq.w #1,d4 ; Depth--; asl.w #2,d4 movea.w d4,a6 ; a6.w = Depth asl.l #2,d2 ; Modulo <<= 2; c_h: move.l a0,a4 ; DestPtr = DestArea; move.w a6,d4 ; PlaneCounter = Depth; c_p: movea.l a1,a3 ; SrcPtr = TempArea; ; move.w d0,d7 ; Temp = Height; ; andi.w #1,d7 ; Temp &= 1; moveq #1,d7 and.w d0,d7 ; asl.w #3,d7 ; Temp <<= 3; asl.w #3+2,d7 ; Temp <<= 3+2; or.w d4,d7 ; Temp |= PlaneCounter<<2; ; asl.w #2,d7 ; Temp <<= 2; ; move.l (a2,d7),d6 ; CurrentPattern = PatternLUT[Temp]; move.l (a2,d7.w),d6 ; CurrentPattern = PatternLUT[Temp]; move.w d1,d5 ; WidthCounter = Width-1; c_w: move.l (a3)+,d7 ; Temp = *SrcPtr++; ; not.l d7 ; Temp = ~Temp; ; and.l d7,(a4) ; *DestPtr &= Temp; ; not.l d7 ; Temp = ~Temp; ; and.l d6,d7 ; Temp &= CurrentPattern; ; or.l d7,(a4)+ ; *DestPtr++ |= Temp; move.l d7,d3 not.l d7 and.l (a4),d7 and.l d6,d3 or.l d3,d7 move.l d7,(a4)+ ; subq.w #1,d5 ; WidthCounter--; ; bne c_w ; if (WidthCounter != 0) goto c_w; dbf d5,c_w ; if (WidthCounter >= 0) goto c_w; adda.l d2,a4 ; DestPtr += Modulo; ; subq.b #1,d4 ; PlaneCounter--; ; bpl c_p ; if (PlaneCounter >= 0) goto c_p; subq.w #1<<2,d4 ; PlaneCounter -= 1<<2; bpl.b c_p ; if (PlaneCounter >= 0) goto c_p; ; adda.l d3,a0 ; DestArea += RowSize; ; adda.l d3,a1 ; TempArea += RowSize; adda.l a5,a0 ; DestArea += RowSize; adda.l a5,a1 ; TempArea += RowSize; subq.w #1,d0 ; Height--; ; bne c_h ; if (Height != 0) goto c_h; bne.s c_h ; if (Height != 0) goto c_h; ; move.l (sp)+,d7 ; move.l (sp)+,d6 ; move.w (sp)+,d5 ; move.w (sp)+,a6 ; move.l (sp)+,a4 ; move.l (sp)+,a3 movem.l (a7)+,d5-d7/a3-a6 rts Last edited by a/b; 24 November 2019 at 15:26. Reason: fixed bad movem (zzZZzZZ...) |
24 November 2019, 10:05 | #6 |
Newbie Amiga programmer
Join Date: Jun 2012
Location: Front of my A500+
Age: 38
Posts: 372
|
Unfortunately the code crashes. The polygon has been drawn, but dots appear on the screen in a matrix and then, GURU.
Edit: And i think i know why. If i used register a5, then my code crashed too. That is why i skipped it and use a6instead. Preserving it in the stack and pulling it back at the end did not helped at all. If used, crashed. Edit #2: Nope, i've found it: you pushed registers to the stack by movem.l d5-d7/a3-a6,-(a7), but you pulled them back by movem.l (a7)+,d5-d7/a3/a4/a6, skipping a5. Now it works. And while it's significantly faster than my original code, it is still slower than the C code. I'm gonna check your modifications, to understand this. Last edited by TCH; 24 November 2019 at 10:38. Reason: Possible error cause, found error |
24 November 2019, 10:48 | #7 |
Defendit numerus
Join Date: Mar 2017
Location: Crossing the Rubicon
Age: 53
Posts: 4,480
|
The content of A5 is modified by something external: an IRQ routine with bad registers preserving.
|
24 November 2019, 10:57 | #8 |
Newbie Amiga programmer
Join Date: Jun 2012
Location: Front of my A500+
Age: 38
Posts: 372
|
It's possible, but it seems, that the problem was, that a/b pushed
a5into the stack, but did not pull it back. I have no idea what was the problem, when i did mess around a5. Either i messed up something about push/pull too, or you're right and an external interrupt messed up my registers. |
24 November 2019, 11:12 | #9 | |
Defendit numerus
Join Date: Mar 2017
Location: Crossing the Rubicon
Age: 53
Posts: 4,480
|
Quote:
Weird that a/b code is slower than C... Last edited by ross; 24 November 2019 at 11:23. |
|
24 November 2019, 15:42 | #10 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
Yeah, as I said I glanced over it and only did the obvious/simple optimizations. And apparently too tired to even do that properly ;P.
|
24 November 2019, 16:15 | #11 |
Registered User
Join Date: Jan 2008
Location: Warsaw/Poland
Age: 55
Posts: 2,010
|
I dont see original C version in 68k assembler. Anyway using move.w ax,-(sp) and move.w (sp)+,ax is not good idea for me, because ax will be autoextended to longword size, if i remember right.
|
24 November 2019, 16:37 | #12 | |
Defendit numerus
Join Date: Mar 2017
Location: Crossing the Rubicon
Age: 53
Posts: 4,480
|
Quote:
Regarding the stack, only the move.b instructions are lined up to word. |
|
24 November 2019, 16:46 | #13 | |
Registered User
Join Date: Jan 2008
Location: Warsaw/Poland
Age: 55
Posts: 2,010
|
Quote:
F.e A5=$00010000 Move.w a5,-(sp) ; $0000 Move.w (sp)+,a5, $00000000 A5 is not restored correctly, if i remember right. |
|
24 November 2019, 16:57 | #14 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
Don't understand this part:
Code:
asl.l #2,d2 ; Modulo <<= 2; ... adda.l d2,a4 ; DestPtr += Modulo; Basically I need an extra free reg (almost all code between c_p and c_w can be pushed one level higher but need an extra reg to avoid using stack) and trying to figure out the relation between width/rowsize/modulo. What do you send in d1/d2/d3 and how is data organized in memory? Last edited by a/b; 24 November 2019 at 18:33. |
24 November 2019, 17:01 | #15 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
Word move to areg will auto-expand to longword and destroy whatever it was in the upper half of areg. Same with e.g. adda.w (it will expand to longword and do a longword addition to areg), etc.
|
24 November 2019, 17:07 | #16 |
Defendit numerus
Join Date: Mar 2017
Location: Crossing the Rubicon
Age: 53
Posts: 4,480
|
|
24 November 2019, 19:30 | #17 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
Try this, I take no reposibility if anything explodes ;P.
A few things... 1) It seems you are accessing pattern LUT backwards. You are going through bitplanes 0->N, while LUT index goes N->0. This version below is going 0->N in both cases, which I pressume is the correct way. 2) Temp area seems to be the same width as dest (full width), but is organized differently. dest: - line0: AAAA00BBBB00CCCC00DDD00, - line1: AAAA00BBBB00CCCC00DDD00, ... temp: - line0: AAAABBBBCCCCDDD00000000, - line1: AAAABBBBCCCCDDD00000000, ... A=bpl0, B=bpl1, C=bpl2, D=bpl3, 0 = area not affected by a blit If my assumption is corrent, the explosion won't be too big. Code:
; d3 (rowsize) is not needed, we assume that rowsize = depth*((width+modulo)<<2) _PolygonBitmapToPlanes32: movem.l d2-d7/a2-a6,-(a7) asl.l #2,d2 ; Modulo <<= 2; move.l d2,a6 mulu.w d4,d2 move.l d2,a3 ; a3.l = Depth*Modulo subq.w #1,d0 ; Height--; subq.w #1,d1 ; Width--; subq.w #1,d4 ; Depth--; move.w d4,a5 ; a5.w = Depth moveq #0,d2 c_h: lea (a2,d2.w),a4 eor.w #8<<2,d2 ; alternate between 0 and 8<<2 move.w a5,d4 ; PlaneCounter = Depth; c_p: move.l (a4)+,d6 ; CurrentPattern move.w d1,d5 ; WidthCounter = Width-1; c_w: move.l (a1)+,d7 ; Temp = *SrcPtr++; move.l d7,d3 not.l d7 and.l (a0),d7 and.l d6,d3 or.l d3,d7 move.l d7,(a0)+ ; *DestPtr++ = (*DestPtr&~Temp)|(CurrentPattern&Temp) dbf d5,c_w ; if (--WidthCounter >= 0) goto c_w; adda.l a6,a0 ; DestPtr += Modulo; dbf d4,c_p ; if (--PlaneCounter >= 0) goto c_p; adda.l a3,a1 ; TempAreaPtr += Depth*Modulo; dbf d0,c_h ; if (--Height >= 0) goto c_h; movem.l (a7)+,d2-d7/a2-a6 rts Last edited by a/b; 24 November 2019 at 19:39. |
24 November 2019, 23:22 | #18 |
Newbie Amiga programmer
Join Date: Jun 2012
Location: Front of my A500+
Age: 38
Posts: 372
|
@Don_Adan:
That's a good thing to know, that address registers needs to be push/pull as longwords, thanks. @a/b: Modulois multiplied by 4, because former C cycles were using it with 32-bit pointers. In this post i posted the ASM code, and the args were listed, but now i elaborate then. a0.lis the destination zone's upper left corner on the zeroth plane of the screen. a1.lis the same, but for the source bitmap, which has only one plane. d0.wand d1.ware the height (as in number of rows) and width (as in number of longwords per row), in this order. These are the dimensions of the work area, not the screen. d2.lis modulo value, as in how many longwords are left on this screen, until the beginning of the same line of the work area on the next plane. d3.lis the number of bytes per row of one plane on the screen. (As in: RastPort->BitMap->BytesPerRow) d4.wis the depth of the screen. It is passed as word, because if i'd send a byte, then pushing it to the stack would mess up word-alignment. Maybe this is not necessary, i was only cautious. And a2.lis the LUT for patterns, which are filled up with the code in my opening post. The source area is a simple 1-bit bitmap, with identical dimensions to the destination screen. First, i calculate the work rectangle by iterating the vertice coordinates of the polygon, then i clear the work rectangle on it with zeros, then draw the outlines of the polygon, then fill it up with ones with xorfill. Then i copy this workarea to each plane of the destination screen, line by line, but masked with the corresponding value of LUT. (Direct copy would result in the last colour index all times.) As for the LUT, you can see, how it's filled up in the opening post. Basically it's iterating through the bits of both colour indexes and puts together patterns for all planes, in which one pixel is one and next pixel is other, so a LUT entry is a mask pattern for a bitplane. And since it has to alternate per lines, the LUT has 16 entries and the upper entries has these pattern, but with swapped colours (by XOR 3, but i've just realized, that XOR is not needed, as neither the double cycle, i can just rotate the patterns by one bit and put them to pattern [1][i]). The result is visible on that picture i linked in previously. (Without the separate dots of course.) However, i filled up LUT backwards, to be able to decrement the loop variable to zero, instead of incrementing it to the depth, and to spare a D - ioperation each time. So, RowSize is needed to be passed. (Your code did not exploded, but did not worked also, due to this.) Last edited by TCH; 24 November 2019 at 23:50. Reason: got modulo and rowsize wrong |
25 November 2019, 01:17 | #19 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
> d2.l is modulo value, as in how many longwords are left on this screen, until the beginning of the same line of the work area on the next plane
OK, this is important because it's misleading. It's a delta, not modulo (semantics ). Modulo would be, for me, how many bytes until the next line, e.g. bitmap width is 320px and blit width is 256px, so modulo is then 64px or 8 bytes. While in your case it's 320*bitmap_height/8 bytes, so for a 320x256 bitmap that's 10240 bytes. That got me confused, and when I asked what do you send in d1/d2/d3 I meant the actual values so I could clearly see what's going on. OK, I'll post a fixed version later today... Last edited by a/b; 25 November 2019 at 01:50. |
25 November 2019, 03:54 | #20 |
Registered User
Join Date: Jun 2016
Location: europe
Posts: 1,043
|
Did a simple test with a 320x256x3 bitmap and 256x128 blit size, and the produced output 100% matched the original code's output, so this should work in a non-explosive way.
And yeah, pattern LUT must be in normal (non-backward) order... Code:
_PolygonBitmapToPlanes32: movem.l d2-d7/a2-a6,-(a7) move.l d2,d7 asl.l #2,d2 ; longwords to bytes move.l d2,a6 ; a6 = Modulo<<2 = BitplaneSize-Width<<2 add.w d1,d7 mulu.w d4,d7 asl.l #2,d7 ; longwords to bytes sub.l d3,d7 move.l d7,-(a7) ; (a7) = Depth*BitplaneSize-RowSize subq.w #1,d0 ; Height--; subq.w #1,d1 ; Width--; subq.w #1,d4 ; Depth--; move.w d4,a5 ; a5.w = Depth moveq #0,d2 c_h: lea (a2,d2.w),a4 eor.w #8<<2,d2 ; alternate between 0 and 8<<2 swap d0 move.w a5,d0 ; PlaneCounter = Depth; c_p: movea.l a1,a3 ; SrcPtr = TempArea; move.l (a4)+,d6 ; CurrentPattern move.w d1,d5 ; WidthCounter = Width-1; c_w: move.l (a3)+,d7 ; Temp = *SrcPtr++; move.l d7,d4 not.l d7 and.l (a0),d7 and.l d6,d4 or.l d4,d7 move.l d7,(a0)+ ; *DestArea++ = (*DestArea&~Temp)|(CurrentPattern&Temp); dbf d5,c_w ; if (--WidthCounter >= 0) goto c_w; adda.l a6,a0 ; DestArea += BitplaneSize-Width<<2; dbf d0,c_p ; if (--PlaneCounter >= 0) goto c_p; swap d0 suba.l (a7),a0 ; DestArea += RowSize-Depth*BitplaneSize; adda.l d3,a1 ; TempArea += RowSize; dbf d0,c_h ; if (--Height >= 0) goto c_h; addq.l #4,a7 movem.l (a7)+,d2-d7/a2-a6 rts |
Currently Active Users Viewing This Thread: 1 (0 members and 1 guests) | |
Thread Tools | |
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Optimizing HAM8 renderer. | Thorham | Coders. Asm / Hardware | 5 | 22 June 2017 18:29 |
NetSurf AGA optimizing | arti | Coders. Asm / Hardware | 199 | 10 November 2013 14:36 |
Layered tile engine optimizing. | Thorham | Coders. General | 0 | 30 September 2011 20:43 |
Benching and optimizing CF-IDE speed | Photon | support.Hardware | 12 | 15 July 2009 01:48 |
For people who like optimizing 680x0 code. | Thorham | Coders. General | 5 | 28 May 2008 11:48 |
|
|