Coders Challenge #2: C2P

All the C2P routines floating around are optimized for speed. But are there any that are optimized for size — for example, a C2P that fits into a bootblock or something similar?
Do you have any really, really small C2Ps? There are no CPU or memory restrictions. Self-modifying or dynamically generated code is allowed. OS functions like WritePixel() are not allowed.
*** Chunky 2 Planar	       174 Bytes***

; Tiny 8bpl converter. 4k intros, anyone?
; c2p1x1_8_c5_gen_mini

; a0	chunkybuffer
; a1	bitplanes

; 22-Aug-2oo4:	2o bytes killed! :) -> size: 216 bytes
;		several minutes and 1 crash :) later:
;		34 bytes killed! -> 182 bytes
;		size optimizing rulez! :)
;		and the next 8 bytes killed! -> 174 bytes

	; c2p_merge -- fetch 16 chunky bytes and pack one nibble of each byte.
	;
	; In:	a0 = chunky source pointer
	;	d4 = right-shift count applied before masking
	;	     (c2p sets it to 0 for one pass and 4 for the other)
	;	d5 = nibble mask (the caller loads $0f0f0f0f)
	; Out:	d0 = (masked 1st longword << 4) | masked 2nd longword
	;	d2 = (masked 3rd longword << 4) | masked 4th longword
	;	a0 = advanced by 16 bytes
	; Clobbers: d1, d3, condition codes
	;
	; FIX: the routine is called via "bsr.b c2p_merge" but had neither
	; its label nor a terminating rts, so the call target was undefined
	; and execution fell straight through into c2p.
c2p_merge
	movem.l	(a0)+,d0-d3	; 4 longwords = 16 chunky bytes
	lsr.l	d4,d0		; move the selected nibble of every byte
	lsr.l	d4,d1		; down into bits 0-3 of that byte
	lsr.l	d4,d2
	lsr.l	d4,d3
	and.l	d5,d0		; drop everything outside the mask
	and.l	d5,d1
	and.l	d5,d2
	and.l	d5,d3
	lsl.l	#4,d0		; 1st and 3rd longword go to the high nibbles
	lsl.l	#4,d2
	or.l	d1,d0		; 2nd and 4th longword fill the low nibbles
	or.l	d3,d2
	rts
; c2p -- 8-bitplane chunky-to-planar conversion (entry point).
;
; In:	a0 = chunky buffer
;	a1 = bitplanes
; Needs:	PLsize, CHUNKYX, CHUNKYY (assembly-time constants) and a routine
;	c2p_merge that reads 16 chunky bytes from (a0)+, shifts them right
;	by d4, masks them with d5 and returns the packed result in d0/d2.
; Clobbers: d0-d7, a0-a6, condition codes.
;
; The buffer is converted in two passes, with d4 = 0 and then d4 = 4 as
; the shift count handed to c2p_merge. Each pass stores four longwords
; per 32 chunky bytes, PLsize bytes apart, at -2*PLsize .. +PLsize
; relative to a1.
;
; The store of a group is delayed by one iteration: the finished
; longwords are parked in a2-a5 and written while the next group is
; being processed. Every pass therefore starts with one priming call of
; .pix32 (whose stores are overwritten afterwards) and reads 32 bytes
; past the end of the chunky buffer.
;
; NOTE(review): the final plane addresses depend on how PLsize, CHUNKYX
; and CHUNKYY relate to the "40" and "256" literals below -- presumably
; a 320-pixel-wide, 256-line screen; confirm against the caller.
c2p	moveq	#0,d4			; pass 1: shift count 0

	add.w	#PLsize*2+(256-CHUNKYY)/2*40,a1	; auto centering
	bsr.b	.conv
	sub.l	#CHUNKYX*CHUNKYY+32,a0	; undo everything pass 1 read from a0
	add.l	#PLsize*3+(256-CHUNKYY)*40,a1	; auto centering
	moveq	#4,d4			; pass 2: shift count 4, falls into .conv

; One complete pass over the chunky buffer.
.conv	lea	32(a0),a6		; priming call: exactly one group
	bsr.s	.pix32
	subq.l	#4,a1			; take back the (a1)+ of the priming call
	move.l	a0,a6
	; FIX: the end pointer used to be a0 itself. a0 grows by 32 per
	; iteration, so "cmp.l a0,a6 / bne" could never match and the loop
	; ran through memory. The rewind of CHUNKYX*CHUNKYY+32 above shows
	; that a pass is meant to consume 32 priming bytes plus
	; CHUNKYX*CHUNKYY bytes here.
	add.l	#CHUNKYX*CHUNKYY,a6

; Process 32 chunky bytes; loops until a0 reaches a6.
.pix32	move.l	#$0f0f0f0f,d5
	bsr.b	c2p_merge		; d0,d2
	move.l	d0,d6			; keep the first pair while merging again
	move.l	d2,d7
	move.l	a5,-PLsize*2(a1)	; delayed store of the previous group
	bsr.b	c2p_merge		; d1,d6
	move.l	d0,d1			; second pair -> d1,d6
	move.l	d6,d0			; first pair back -> d0,d2
	move.l	d2,d6
	move.l	d7,d2

	; Exchange the low word of d0 with the high word of d1, and the low
	; word of d2 with the high word of d6 (three-eor swap on words).
	swap	d1
	swap	d6
	eor.w	d0,d1
	eor.w	d2,d6
	eor.w	d1,d0
	eor.w	d6,d2
	eor.w	d0,d1
	eor.w	d2,d6
	swap	d1
	swap	d6

	move.l	a4,-PLsize(a1)		; delayed store of the previous group

	move.l	#$33333333,d5
	move.l	d4,-(a7)		; .swap takes its shift count in d4
	moveq	#2,d4
	exg.l	d1,d2			; this step pairs d0/d1 and d2/d6
	bsr.b	.swap
	exg.l	d1,d2

	move.l	a3,(a1)+		; delayed store; a1 moves on one longword

	move.l	#$00ff00ff,d5
	moveq	#8,d4
	bsr.b	.swap
	move.l	a2,PLsize-4(a1)		; delayed store (-4: a1 already advanced)

	move.l	#$55555555,d5
	moveq	#1,d4
	bsr.b	.swap
	move.l	(a7)+,d4		; shift count for c2p_merge again
	move.l	d0,a2			; park the finished longwords until the
	move.l	d2,a3			; next iteration stores them
	move.l	d1,a4
	move.l	d6,a5
	cmp.l	a0,a6
	bne.b	.pix32
	; Loop exit falls through into .swap on purpose: the register
	; contents are no longer needed and its rts ends .pix32 / .conv.

; Exchange the bits of d0/d1 selected by d5 with the bits of d2/d6
; selected by d5 << d4.
; In:	d4 = shift count, d5 = mask
; Clobbers: d3, d7, condition codes
.swap	move.l	d2,d3
	move.l	d6,d7
	lsr.l	d4,d3
	lsr.l	d4,d7
	eor.l	d0,d3
	eor.l	d1,d7
	and.l	d5,d3
	and.l	d5,d7
	eor.l	d3,d0
	eor.l	d7,d1
	lsl.l	d4,d3
	lsl.l	d4,d7
	eor.l	d3,d2
	eor.l	d7,d6
	; FIX: .swap is reached via bsr.b (and as the tail of .pix32) but
	; had no rts, so it ran off the end of the routine.
	rts
That's the c2p routine I made for our 4k intros; the base code is by Kalms, which I then optimized for size. It's short, but the speed is still OK. A MUCH shorter c2p is of course possible (using addx).
Wow, that's impressive. Anyone with a shorter one?
Very small and very slow.

; Dimensions of the chunky image in pixels (the routine reads one byte per pixel).
WIDTH               = 320
HEIGHT              = 200

; C2P -- tiny addx-based chunky-to-planar conversion, 8 bits per pixel.
;
; In:	a0 = chunky source, one byte per pixel, WIDTH*HEIGHT bytes
;	a1 = planar destination
; Clobbers: d0-d7, a0-a5, condition codes.
;
; Sixteen pixels are collected into eight 16-bit words, one word per
; pixel bit, and stored WIDTH/8 bytes apart with pixel bit 0 first.
; After every row a1 skips a further WIDTH/8*7 bytes, so successive
; rows start WIDTH/8*8 bytes apart.
;
; The listing has no rts: execution continues with whatever follows the
; last bne.
C2P	sub.l	a5,a5			; a5 = row counter
.row	sub.l	a4,a4			; a4 = 16-pixel group counter in this row
.group	sub.l	a3,a3			; a3 = chunky word counter in this group
.word	move.w	(a0)+,d0		; two pixels; high word of d0 is kept

	sub.l	a2,a2			; a2 = pixel counter in this word
.pixel	add.l	d0,d0			; pixel bit 7 moves into the high word of d0
	add.w	d0,d0			; pixel bit 6 -> X
	addx.w	d1,d1			;   shifted into d1
	add.w	d0,d0			; pixel bit 5
	addx.w	d2,d2
	add.w	d0,d0			; pixel bit 4
	addx.w	d3,d3
	add.w	d0,d0			; pixel bit 3
	addx.w	d4,d4
	add.w	d0,d0			; pixel bit 2
	addx.w	d5,d5
	add.w	d0,d0			; pixel bit 1
	addx.w	d6,d6
	add.w	d0,d0			; pixel bit 0
	addx.w	d7,d7

	addq.w	#1,a2
	cmp.w	#2,a2			; both pixels of the word done?
	bne	.pixel

	addq.w	#1,a3
	cmp.w	#8,a3			; 8 words = 16 pixels collected?
	bne	.word

	swap	d0			; collected bit-7 word -> low word of d0
	move.w	d7,(a1)+		; bit 0; offsets below are -2 because
	move.w	d6,WIDTH/8-2(a1)	; a1 has already advanced
	move.w	d5,WIDTH/8*2-2(a1)
	move.w	d4,WIDTH/8*3-2(a1)
	move.w	d3,WIDTH/8*4-2(a1)
	move.w	d2,WIDTH/8*5-2(a1)
	move.w	d1,WIDTH/8*6-2(a1)
	move.w	d0,WIDTH/8*7-2(a1)	; bit 7

	addq.w	#1,a4
	cmp.w	#WIDTH/16,a4		; whole row converted?
	bne	.group

	add.w	#WIDTH/8*7,a1		; step over the other seven lines of this row
	addq.w	#1,a5
	cmp.w	#HEIGHT,a5		; all rows converted?
	bne	.row

Old 04 June 2010, 18:12   #5
Join Date: Jul 2008
Location: Sweden
Posts: 2,112

; C2P -- tiny roxl/addx chunky-to-planar conversion, one pixel bit per pass.
;
; In:	a0 = chunky source, one byte per pixel, WIDTH*HEIGHT bytes
;	a1 = planar destination
; Needs:	WIDTH and HEIGHT as assembly-time constants.
; Clobbers: d0-d5, a1, a2, condition codes. a0 is left unchanged.
;
; The whole chunky buffer is scanned eight times. The pass with d5 = 7
; extracts pixel bit 0, the pass with d5 = 0 extracts pixel bit 7. Each
; pass appends WIDTH*HEIGHT/8 bytes at (a1)+, so the eight outputs lie
; back to back.
;
; The listing has no rts: execution continues with whatever follows the
; last dbf.
C2P	moveq	#8-1,d5			; d5 = pass counter, 7..0
.plane	move.l	a0,a2			; restart at the top of the chunky buffer

	move.w	#WIDTH*HEIGHT/32-1,d4	; d4 = 32-pixel groups in this pass
.group	moveq	#8-1,d3			; 8 longwords = 32 pixels
.long	move.l	(a2)+,d0		; 4 chunky pixels
	rol.l	d5,d0			; net effect: rotate right by 7-d5, so the
	ror.l	#7,d0			; wanted bit sits at bit 0 of each byte

	moveq	#4-1,d2			; 4 pixels per longword
.bit	roxl.l	#8,d0			; bit 0 of the top byte ends up in X
	addx.l	d1,d1			; shift it into the output longword
	dbf	d2,.bit
	dbf	d3,.long

	move.l	d1,(a1)+		; 32 collected bits
	dbf	d4,.group

	dbf	d5,.plane

