/[projet1]/public/oric/routines/rasterization/linebench/line8.s
Defence Force logotype

Contents of /public/oric/routines/rasterization/linebench/line8.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 275 - (show annotations)
Fri Feb 12 20:33:14 2010 UTC (9 years, 7 months ago) by thrust26
File size: 18251 byte(s)
bugfix for very horizontal lines
1 ; History of linebench timings...
2 ;649
3 ;614 (replacing the update of tmp0)
4 ;607
5 ;588
6 ;583 after alignment
7 ;579
8 ;534 redid mainly_vertical
9 ;529 removed page penalty
10 ;517 final optimization at mainly_horizontal
11 ;501 chunking, initial version
12 ;482 optimized chunking (avg: 38.91 cycles)
13 ;473 final optimization for mainly_vertical (37.89 -> 38.34 corrected)
14 ;468 a weird stunt on mainly_horizontal (38.07)
15 ;467 minor very_horizontal optimization (37.88 -> 38.56 corrected)
16 ;463 self modifying pointer in mainly_horizontal (38.35)
17 ;459 self modifying pointer in mainly_vertical (37.99)
18 ;459 a little tweak to very_horizontal (37.94)
19
20 ; TODOs:
21 ; + chunking (-35)
22 ; - two separate branches instead of patching?
23 ; + countdown minor
24 ; x mainly_horizontal (won't work)
25 ; + mainly_vertical (-9)
26 ; o optimizing for space (-2 tables and one alignment page)
27 ; - optimize horizontal
28 ; - optimize vertical
29 ; + correct branch taken percentages
30
31 .zero
; Zero-page scratch bytes used by the line routines in this file.
; NOTE(review): dx, dy, tmp0 and the _CurrentPixel*/_OtherPixel* variables are
; used by the code below but their declarations are commented out here, so they
; are presumably defined in another file - confirm.
32
33 ; *= tmp1
34
35 ;e .dsb 2 ; Error decision factor (slope) 2 bytes in zero page
36 ;i .dsb 1 ; Number of pixels to draw (iteration counter) 1 byte in zp
37 ;dx .dsb 1 ; Width
38 ;dy .dsb 1 ; Height
39 ;_CurrentPixelX .dsb 1
40 ;_CurrentPixelY .dsb 1
41 ;_OtherPixelX .dsb 1
42 ;_OtherPixelY .dsb 1
43
44 save_a .dsb 1 ; saved Bresenham sum between steps; also a scratch byte in draw_very_horizontal_8
45 curBit .dsb 1 ; current pixel bit mask (draw_mainly_vertical_8)
46 chunk .dsb 1 ; pixel mask accumulated for the current screen byte (draw_very_horizontal_8)
47 lastSum .dsb 1 ; dx/2 or dy/2, added back when drawing the last line segment
48
49
; Screen geometry: BYTE_PIXEL pixels are stored per screen byte, so one row of
; X_SIZE pixels occupies ROW_SIZE bytes (the value added to step one row down).
50 #define BYTE_PIXEL 6
51 #define X_SIZE 240
52 #define ROW_SIZE X_SIZE/BYTE_PIXEL
53
; 6502 opcode bytes. The routines below store these into their own instruction
; stream (at the _outer_patch / __auto_* labels) to select the step direction.
54 #define _NOP $ea
55 #define _INX $e8
56 #define _DEX $ca
57 #define _INY $c8
58 #define _DEY $88
59 #define _ASL $0a
60 #define _LSR $4a
61 #define _INC_ZP $e6
62 #define _DEC_ZP $c6
63 #define _INC_ABS $ee
64 #define _DEC_ABS $ce
65 #define _STA_ZP $85
66 #define _CPY_IMM $c0
67
68
69 .text
70
71 ; .dsb 256-(*&255)
72
73 ;**********************************************************
draw_totaly_vertical_8
.(
; Plots a vertical run of dy+1 pixels in column _CurrentPixelX by XOR-ing
; the pixel mask into successive screen rows.
;
; In:    tmp0/tmp0+1    = pointer used as the base for (tmp0),y
;        _CurrentPixelX = column
;        dy             = row count minus one
; Out:   tmp0+1 is advanced whenever the Y offset wraps
; Clobb: A, X, Y, flags
        clc                             ; carry must be clear for the adc in the loop
        ldx     _CurrentPixelX
        lda     _TableBit6Reverse,x     ; pixel mask for this column...
        and     #$7f                    ; ...without bit 7
        sta     _mask_patch+1           ; becomes the immediate operand below
        ldy     _TableDiv6,x            ; Y = byte offset of the column in the row
        ldx     dy
        inx                             ; X = number of pixels to plot

next_row
_mask_patch
        lda     #0                      ; 2   operand patched with the pixel mask
        eor     (tmp0),y                ; 5*
        sta     (tmp0),y                ; 6*= 13**

        ; move one screen row down
        tya                             ; 2
        adc     #ROW_SIZE               ; 2
        tay                             ; 2
        bcc     same_page               ; 2/3 84.4% taken
        inc     tmp0+1                  ; 5   offset wrapped: bump the pointer high byte
        clc                             ; 2
same_page                               ; = 9.94
        dex                             ; 2
        bne     next_row                ; 2/3=4/5
        rts
; average: 27.94
.)
105
106
;**********************************************************
;
; _DrawLine8 - entry point: normalises the end points, computes the deltas
; and jumps to the specialised drawing routine.
;
; Expects the following variables to be set when called:
;   _CurrentPixelX, _CurrentPixelY
;   _OtherPixelX,   _OtherPixelY
;
; Effects visible in this routine:
;   - the two end points are swapped in place when _CurrentPixelY > _OtherPixelY,
;     so drawing always goes from top to bottom
;   - dy = |y1-y2|, dx = |x1-x2|
;   - tmp0/tmp0+1 = _HiresAddrLow/_HiresAddrHigh entry for _CurrentPixelY
;
; Dispatch (registers on arrival: A = dx, Y = dy, X = _INX or _DEX):
;   dx == 0 -> draw_totaly_vertical_8 (taken before X/Y are set up)
;   dy == 0 -> draw_totaly_horizontal_8
;   dy <  dx -> draw_mainly_horizontal_8
;   else    -> draw_mainly_vertical_8
;
_DrawLine8
        ;
        ; Compute deltas and signs
        ;

        ; Test Y value
.(
        sec
        lda     _CurrentPixelY
        sbc     _OtherPixelY
        bcc     cur_smaller
        beq     end

cur_bigger                              ; y1>y2
        ; Swap X and Y
        ; So we always draw from top to bottom
        ldy     _CurrentPixelY
        ldx     _OtherPixelY
        sty     _OtherPixelY
        stx     _CurrentPixelY

        ldy     _CurrentPixelX
        ldx     _OtherPixelX
        sty     _OtherPixelX
        stx     _CurrentPixelX

        bcs     end                     ; always taken: carry is still set from the sbc

cur_smaller                             ; y1<y2
        ; Absolute value (carry is clear here, so eor/adc #1 negates A)
        eor     #$ff
        adc     #1
end
        sta     dy
.)

        ;
        ; Initialise screen pointer
        ;
        ldy     _CurrentPixelY
        lda     _HiresAddrLow,y         ; 4
        sta     tmp0+0                  ; 3
        lda     _HiresAddrHigh,y        ; 4
        sta     tmp0+1                  ; 3 => Total 14 cycles

        ; Test X value
.(
        sec
        lda     _CurrentPixelX
        sbc     _OtherPixelX
        beq     draw_totaly_vertical_8
        ldx     #_DEX                   ; assume x1>x2: step left
        bcs     cur_bigger

cur_smaller                             ; x1<x2
        ; Absolute value (carry is clear here)
        eor     #$ff
        adc     #1
        ; (the store to dx that used to sit here was redundant:
        ;  ldx does not change A and the store below is always executed)
        ldx     #_INX                   ; step right
cur_bigger                              ; x1>x2
        sta     dx                      ; A keeps dx for the routines dispatched below
.)

        jmp     alignIt

        .dsb 256-(*&255)                ; pad to the next page boundary

alignIt
        ; Compute slope and call the specialized code for mostly horizontal or vertical lines
        ldy     dy
        beq     draw_totaly_horizontal_8
        cpy     dx
        bcc     draw_mainly_horizontal_8        ; dy < dx
        jmp     draw_mainly_vertical_8
191
192 ;**********************************************************
draw_totaly_horizontal_8
.(
; Plots a horizontal run from _CurrentPixelX to _OtherPixelX, both end points
; included, by XOR-ing one pixel per iteration into the row addressed by tmp0.
;
; In:    X = step opcode (_INX or _DEX), Y = dy (0 on this path)
;        tmp0/tmp0+1 = pointer used as the base for (tmp0),y
; Clobb: A, X, Y, flags; patches its own code
;
; The loop used to stop as soon as X reached _OtherPixelX, so the end point
; itself was never plotted, unlike the vertical (dy+1 pixels) and mainly
; horizontal (dx+1 pixels) paths. The compare operand is now the column one
; step *past* the end point, computed with the same patched inx/dex.
; NOTE(review): this adds 4 bytes, shifting the routines that follow in this
; page - re-check their branch page-crossing timings.
        stx     _outer_patch            ; Write a (dex / inx) instruction
        stx     _end_patch              ; same step, applied once to the end point

        ldx     _OtherPixelX
_end_patch
        inx                             ; patched: X = first column past the end point
        stx     __auto_cpx+1

        ldx     _CurrentPixelX

        ;
        ; Draw loop
        ;
outer_loop
        ldy     _TableDiv6,x            ; 4
        lda     _TableBit6Reverse,x     ; 4
        and     #$7f                    ; 2
        eor     (tmp0),y                ; 5
        sta     (tmp0),y                ; 6 = 19

_outer_patch
        inx                             ; 2   patched to inx / dex

__auto_cpx
        cpx     #00                     ; 2 Stepped past the endpoint yet?
        bne     outer_loop              ; 2
        rts
.)
221
222 ;**********************************************************
223 draw_mainly_horizontal_8
224 .(
; Draws lines with dy < dx (entered via the bcc after `cpy dx` in alignIt).
; Lines with dx/2 >= dy are handed over to draw_very_horizontal_8; the rest are
; drawn here one pixel per iteration (dx+1 pixels), XOR-ed into the screen.
; The routine patches its own code: the column step (iny/dey), the wrap compare
; value, the inc/dec of the pointer high bytes, the pixel-mask table address and
; the absolute screen addresses used by the eor/sta pair.
225 ; A = DX, Y = DY, X = opcode
226 ; lda dx
227 lsr ; A = dx/2
228 cmp dy
229 bcc contMainly ; dx/2 < dy: handled here
230 jmp draw_very_horizontal_8
231
232 contMainly
233 sty __auto_dy+1 ; operand of "adc #dy" in the loop
234
235 ; all this stress to be able to use dex, beq :)
; cpx also leaves C=1 for _INX and C=0 for _DEX ($ca < $e8); the commented-out
; clc/sec below rely on that.
236 cpx #_INX
237 beq doInx
238
; stepping left: dey, wrap detected at $ff, dec of the pointer high bytes
239 lda #_DEY
240 sta __auto_stepx
241 lda #$ff
242 sta __auto_cpy+1
243 ldy #_DEC_ABS
244
245 lda #<_TableBit6Reverse-1 ; == 0
246 ; clc
247 adc _OtherPixelX
248 ldx #>_TableBit6Reverse ;
249 bne endPatch ; taken as long as the table's high byte is non-zero
250
251 doInx
; stepping right: iny, wrap detected at $00, inc of the pointer high bytes
252 lda #_INY
253 sta __auto_stepx
254 lda #$00
255 sta __auto_cpy+1
256 ldy #_INC_ABS
257
258 lda #X_SIZE-1
259 ; sec
260 sbc _OtherPixelX
261 ldx #>_TableBit6 ;
262 endPatch
; Y = inc/dec opcode, A/X = low/high byte of the mask table address to patch in
263 sty __auto_yHi0
264 sty __auto_yHi1
265 sta __auto_bit6+1
266 sta __auto_bit6_0+1
267 stx __auto_bit6+2
268 stx __auto_bit6_0+2
269
; Screen address: Y = low(tmp0) + _TableDiv6[x]; only the high bytes of the
; patched eor/sta operands are written, their low bytes stay $00.
270 ldx _CurrentPixelX
271 lda _TableDiv6,x
272 clc
273 adc tmp0
274 tay
275 lda tmp0+1
276 adc #0
277 sta __auto_ptr0+2
278 sta __auto_ptr1+2
279
280 lda dx
281 tax
282 inx ; 2 +1 since we count to 0
283 sta __auto_dx+1 ; operand of "sbc #dx" in the loop
284 lsr
285 eor #$ff ; initial sum = ~(dx/2)
286 clc
287
288 sta save_a ; 3 = 3
289 __auto_bit6_0
290 lda _TableBit6Reverse-1,x;4
291 and #$7f ; remove signal bit
292 bcc contColumn ; always taken (carry cleared above): plot the first pixel
293
294 ; a = sum, x = dX+1
295 ;----------------------------------------------------------
296 loopX
297 sta save_a ; 3 = 3
298 loopY
299 __auto_bit6
300 lda _TableBit6Reverse-1,x;4
301 bmi nextColumn ; 2/14 16.7% taken
302 contColumn
303 __auto_ptr0
304 eor $a000,y ; 4
305 __auto_ptr1
306 sta $a000,y ; 5 = 17.06
307
308 dex ; 2 Step in x
309 beq exitLoop ; 2/3 At the endpoint yet?
310 lda save_a ; 3
311 __auto_dy
312 adc #00 ; 2 +DY
313 bcc loopX ; 2/3=11/12 ~28.0% taken (not 50% due to special code for very horizontal lines)
314 ; Time to step in y
315 __auto_dx
316 sbc #00 ; 2 -DX
317 sta save_a ; 3 = 5
318
319 ; update the screen address:
320 tya ; 2
321 adc #ROW_SIZE ; 2
322 tay ; 2
323 bcc loopY ; 2/3= 8/9 ~84.4% taken
324 inc __auto_ptr0+2 ; 6
325 inc __auto_ptr1+2 ; 6
326 clc ; 2
327 bcc loopY ; 3 = 17
328 ; average: 11.83
329
330 exitLoop
331 rts
332
; Reached when the mask read from the table has bit 7 set: move to the
; neighbouring screen byte before plotting.
333 nextColumn
334 and #$7f ; 2 remove signal bit
335 __auto_stepx
336 iny ; 2
337 __auto_cpy
338 cpy #$00 ; 2
339 clc ; 2
340 bne contColumn ; 2/3=10/11 99% taken
341 __auto_yHi0
342 inc __auto_ptr0+2 ; 6
343 __auto_yHi1
344 inc __auto_ptr1+2 ; 6
345 bcc contColumn ; 3
346
347 ; Timings:
348 ; x++/y : 32.06 (28.0%) <- corrected!
349 ; x++/y++: 44.89 (72.0%) <- corrected!
350 ; average: 41.30
351 .)
352
353 ; .dsb 256-(*&255)
354 ;**********************************************************
355 draw_very_horizontal_8
356 .(
; Draws lines with dx/2 >= dy (jumped to from draw_mainly_horizontal_8).
; Pixels belonging to the same screen byte are collected in `chunk` and XOR-ed
; into the screen in one go instead of being plotted individually.
; Side effects: dy is decremented down to zero, tmp0 (low byte) is set to 0 and
; the routine patches its own code for the drawing direction.
357 ; dX > 2*dY, here we use "chunking"
358 ; here we have DY in Y, and the OPCODE (inx, dex) in X
359 sty __auto_dy0+1
360 sty __auto_dy1+1
361 sty __auto_dy2+1
362 cpx #_INX
363 php ; keep Z/C of the direction test for the plp below
364 ; setup pointer and Y:
365 ldx _CurrentPixelX
366 lda _TableDiv6,x
367 clc
368 adc tmp0
369 tay ; Y = low(tmp0) + _TableDiv6[x]
370 lda #0
371 sta tmp0 ; pointer low byte is zero from here on, Y carries the offset
372 bcc skipHi
373 inc tmp0+1
374 skipHi
375 lda _TableDiv6,x
376 asl
377 adc _TableDiv6,x
378 asl
379 sta save_a ; save_a = 6 * _TableDiv6[x]
380 lda _CurrentPixelX
381 sec
382 sbc save_a ; A = _CurrentPixelX - 6 * _TableDiv6[x] (x % 6 if the table holds x/6)
383 ; patch the code:
384 plp
385 beq doInx
386 ; negative x-direction
387 tax
388
389 lda #_DEY
390 sta __auto_stepx
391 sta __auto_stepx2
392 lda #_CPY_IMM
393 sta __auto_cpy
394 sta __auto_cpy2
395 lda #$ff
396 sta __auto_cpy+1
397 sta __auto_cpy2+1
398 lda #_DEC_ZP
399 sta __auto_yHi
400 sta __auto_yHi2
401 lda Pot2NTbl,x
402 sta chunk
403 lda #<Pot2NTbl
404 bne endPatch ; always taken: Pot2NTbl sits 6 bytes into a page-aligned table
405
406 doInx
407 ; positive x-direction
408 sta save_a
409 lda #BYTE_PIXEL-1
410 sbc save_a ; carry is set here (restored by plp from the cpx match)
411 tax
412
413 lda #_INY
414 sta __auto_stepx
415 sta __auto_stepx2
; The "cpy #" slot becomes "sta save_a": sta leaves the flags alone, so the bne
; that follows tests the Z flag produced by iny (presumably a 2-byte filler).
416 lda #_STA_ZP
417 sta __auto_cpy
418 sta __auto_cpy2
419 lda #<save_a ; 2
420 sta __auto_cpy+1
421 sta __auto_cpy2+1
422 lda #_INC_ZP
423 sta __auto_yHi
424 sta __auto_yHi2
425 lda Pot2PTbl,x
426 sta chunk
427 lda #<Pot2PTbl
428 endPatch
; only the low byte of the table address is patched into the three lookups
429 sta __auto_pot1+1
430 sta __auto_pot2+1
431 sta __auto_pot3+1
432
433 lda dx
434 sta __auto_dx+1
435 ; calculate initial bresenham sum
436 lsr
437 sta lastSum ; 3 this is used for the last line segment
438 eor #$ff ; = -dx/2
439 clc
440 bcc loopX
441 ; a = sum, x = pixels left in the current byte (counts down), y = ptr-offset
442
443 ;----------------------------------------------------------
; Current screen byte exhausted (X went negative): flush chunk, start the next
; byte with all six pixel bits pending.
444 nextColumn ;
445 tax ; 2
446 lda chunk ; 3
447 eor (tmp0),y ; 5
448 sta (tmp0),y ; 6
449 lda #%00111111 ; 2
450 sta chunk ; 3
451 txa ; 2
452 ldx #BYTE_PIXEL-1 ; 2
453 __auto_stepx
454 iny ; 2 next column
455 __auto_cpy
456 cpy #00 ; 2
457 bne contColumn ; 2/3=31/32 99% taken
458 __auto_yHi
459 inc tmp0+1 ; 5 dec/inc
460 clc ; 2
461 bcc contColumn ; 3 = 10
462 ;----------------------------------------------------------
463 loopY
464 dec dy ; 5 all but one vertical segments drawn?
465 beq exitLoop ; 2/3= 7/8 yes, exit loop
466 dex ; 2
467 bmi nextColumn ; 2/38.04 ~16.7% taken (this will continue below)
468 __auto_dy0
469 adc #00 ; 2 = 11.67 +DY, no check necessary here!
470 loopX
471 dex ; 2
472 bmi nextColumn ; 2/35.04 ~16.7% taken
473 contColumn ; = 9.51
474 __auto_dy1
475 adc #00 ; 2 +DY
476 bcc loopX ; 2/3= 4/5 ~76.4% taken
477 ; Time to step in y
478 __auto_dx
479 sbc #00 ; 2 -DX
480 sta save_a ; 3 = 5
481
482 ; plot the last bits of current segment:
483 __auto_pot1
484 lda Pot2PTbl,x ; 4
485 eor chunk ; 3
486 eor (tmp0),y ; 5
487 sta (tmp0),y ; 6
488 __auto_pot2
489 lda Pot2PTbl,x ; 4
490 sta chunk ; 3 = 25
491
492 ; update the screen address:
493 tya ; 2
494 adc #ROW_SIZE ; 2
495 tay ; 2
496 lda save_a ; 3
497 bcc loopY ; 2/3=11/12 ~84.4% taken
498 inc tmp0+1 ; 5
499 clc ; 2
500 bcc loopY ; 3 = 10
501 ; average: 13.40
502
503 ; Timings:
504 ; x++/y : 14.51 (76.4%)
505 ; x++/y++: 62.07 (23.6%)
506 ; average: 25.73
507 ;----------------------------------------------------------
508 exitLoop
509 ; draw the last horizontal line segment:
510 adc lastSum ; 3
511 loopXEnd
512 dex ; 2
513 bmi nextColumnEnd ; 2/37.03 ~16.7% taken
514 contColumnEnd ; = 9.85
515 __auto_dy2
516 adc #00 ; 2 +DY
517 bcc loopXEnd ; 2/3= 4/5 ~38.2% taken
518
519 ; plot last chunk:
520 __auto_pot3
521 lda Pot2PTbl,x ; 4
522 eor chunk ; 3
523 eor (tmp0),y ; 5
524 sta (tmp0),y ; 6 = 18
525 rts
526 ;----------------------------------------------------------
; Copy of nextColumn for the final segment (continues at contColumnEnd).
527 nextColumnEnd ;
528 tax ; 2
529 lda chunk ; 3
530 eor (tmp0),y ; 5
531 sta (tmp0),y ; 6
532 lda #%00111111 ; 2
533 sta chunk ; 3
534 txa ; 2
535 ldx #BYTE_PIXEL-1 ; 2
536 __auto_stepx2
537 iny ; 2 next column
538 __auto_cpy2
539 cpy #00 ; 2
540 bne contColumnEnd ; 2/3=31/32 99% taken
541 __auto_yHi2
542 inc tmp0+1 ; 5 dec/inc
543 clc ; 2
544 bcc contColumnEnd ; 3 = 10
545
; Page-align the tables: both then share one high byte, which is why only the
; low byte of the lookup address is patched above.
546 .dsb 256-(*&255)
547
; Pixel-bit masks for the six positions of a screen byte, indexed by X:
; Pot2PTbl is selected for the positive x-direction, Pot2NTbl for the negative.
548 Pot2PTbl
549 .byte %00000001, %00000011, %00000111, %00001111
550 .byte %00011111, %00111111
551 Pot2NTbl
552 .byte %00100000, %00110000
553 .byte %00111000, %00111100, %00111110, %00111111
554 .)
555
556 ;**********************************************************
557 ;
558 ; This code is used when the things are moving faster
559 ; vertically than horizontally
560 ;
561 ; dy>dx
562 ;
563 draw_mainly_vertical_8
564 ; here we have DY in Y, and the OPCODE in X
565 .(
; Draws lines with dy >= dx (and dx != 0): one pixel per screen row, XOR-ed
; into the screen. X counts the remaining horizontal steps; once it reaches
; zero the rest of the line is a straight vertical run (loopYEnd).
; Side effects: tmp0 (low byte) is set to 0, tmp0+1 is rewritten for the last
; segment, and the routine patches its own code for the drawing direction.
566 ; setup bresenham values:
567 sty __auto_dy+1
568
569 ; setup direction:
570 cpx #_DEX ; which direction?
571 bne doInx
572 ; dex -> moving left:
573 lda #%00100000
574 sta __auto_cpBit+1 ; last bit position before leaving the byte
575 lda #_ASL ;
576 sta __auto_shBit
577 lda #%00000001
578 sta __auto_ldBit+1 ; bit to start with in the neighbouring byte
579 lda #_DEY
580 sta __auto_yLo
581 ldx #$ff
582 lda #_DEC_ABS
583 bne endPatch ; always taken (A is non-zero)
584
585 doInx
586 ; inx -> moving right:
587 lda #%00000001
588 sta __auto_cpBit+1
589 lda #_LSR
590 sta __auto_shBit
591 lda #%00100000
592 sta __auto_ldBit+1
593 lda #_INY
594 sta __auto_yLo
595 ldx #$00
596 lda #_INC_ABS
597 endPatch
; X = Y value that signals a page wrap, A = inc/dec opcode for the high bytes
598 stx __auto_cpY+1
599 sta __auto_yHi0
600 sta __auto_yHi1
601 ; setup X
602 ldx dx ; X = dx
603 stx __auto_dx1+1
604 stx __auto_dx2+1
605 ; setup current bit:
606 ldy _CurrentPixelX
607 lda _TableBit6Reverse,y ; 4
608 and #$7f
609 sta curBit
610 ; setup pointer and Y:
611 ; TODO: self-modifying code for the ptrs?
612 lda _TableDiv6,y
613 clc
614 adc tmp0
615 tay
616 lda #0
617 sta tmp0
618 lda tmp0+1
619 adc #0
; only the high bytes of the patched eor/sta operands are written
620 sta __auto_ptr0+2
621 sta __auto_ptr1+2
622 ; bcc skipTmp0
623 ; inc tmp0+1
624 ;skipTmp0
625 ; calculate initial bresenham sum:
626 lda dy
627 lsr
628 sta lastSum
629 eor #$ff ; -DY/2
630 clc ; 2
631 bcc loopY ; 3
632 ; a = sum, y = tmp0, x = dX, tmp0 = 0
633 ;----------------------------------------------------------
634 incHiPtr ;
635 inc __auto_ptr0+2 ; 6
636 inc __auto_ptr1+2 ; 6
637 clc ; 2
638 bcc contHiPtr ; 3
639 ;----------------------------------------------------------
640 loopY
641 sta save_a ; 3
642 lda curBit ; 3 = 6
643 loopX
644 ; Draw the pixel
645 __auto_ptr0
646 eor $a000,y ; 4
647 __auto_ptr1
648 sta $a000,y ; 5 = 9
649 ; update the screen address:
650 tya ; 2
651 adc #ROW_SIZE ; 2
652 tay ; 2
653 bcs incHiPtr ; 2/20 ~15.6% taken
654 contHiPtr ; = 11.00 average
655 lda save_a ; 3
656 __auto_dx1
657 adc #00 ; 2 +DX
658 bcc loopY ; 2/3= 7/8 ~41.4% taken
659 ; Time to step in x
660 __auto_dy
661 sbc #00 ; 2 -DY
662 sta save_a ; 3 = 5
663
664 lda curBit ; 3
665 __auto_cpBit ; TODO: optimize
666 cmp #%00100000 ; 2 %00100000/%00000001
667 beq nextColumn ; 2/14.05 ~16.7% taken
668 __auto_shBit
669 asl ; 2 asl/lsr, clears carry
670 contNextColumn
671 sta curBit ; 3 =~13.68
672 ; step in x:
673 dex ; 2 At the endpoint yet?
674 bne loopX ; 2/3= 4/5
675
676 ; x ,y++: 34.00 (41.4%) <- corrected!
677 ; x++,y++: 50.68 (58.6%) <- corrected!
678 ; average: 43.77
679
680 ; draw the last vertical line segment:
; switch from the patched absolute addresses back to the (tmp0),y pointer
681 ldx __auto_ptr0+2 ; 4
682 stx tmp0+1 ; 3
683 lda save_a ; 3
684 adc lastSum ; 3
685 loopYEnd
686 tax ; 2 = 2
687 ; Draw the pixel
688 lda curBit ; 3
689 eor (tmp0),y ; 5
690 sta (tmp0),y ; 6 = 14
691 ; update the screen address:
692 tya ; 2
693 adc #ROW_SIZE ; 2
694 tay ; 2
695 bcs incHiPtrEnd ; 2/13 ~15.6% taken
696 contHiPtrEnd ; = 9.72 average
697 txa ; 2
698 __auto_dx2
699 adc #00 ; 2 +DX
700 bcc loopYEnd ; 2/3= 6/7 ~20.7% taken
701 rts
702 ;----------------------------------------------------------
; The pixel bit left the current byte: reload the start bit and move Y to the
; neighbouring screen byte, adjusting the pointer high bytes on a wrap.
703 nextColumn
704 __auto_ldBit
705 lda #%00000001 ; 2 %00000001/%00100000
706 __auto_yLo
707 dey ; 2 dey/iny
708 __auto_cpY
709 cpy #$ff ; 2 $ff/$00
710 clc ; 2 TODO: optimize
711 bne contNextColumn ; 2/3 ~99% taken
712 __auto_yHi0
713 dec __auto_ptr0+2 ; 6 dec/inc
714 __auto_yHi1
715 dec __auto_ptr1+2 ; 6 dec/inc
716 bcc contNextColumn ; 3
717
718 incHiPtrEnd ; 9
719 inc tmp0+1 ; 5
720 clc ; 2
721 bcc contHiPtrEnd ; 3
722 ;----------------------------------------------------------
723 .)
724
725 ; *** total timings: ***
726 ; draw_very_horizontal_8 (29.5%): 25.73
727 ; draw_mainly_horizontal_8 (20.5%): 41.30
728 ; draw_mainly_vertical_8 (50.0%): 43.77
729 ;----------------------------------------
730 ; total average (100.0%): 37.94

  ViewVC Help
Powered by ViewVC 1.1.26