/[projet1]/public/oric/routines/rasterization/linebench/line8.s
Defence Force logotype

Contents of /public/oric/routines/rasterization/linebench/line8.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 268 - (show annotations)
Tue Feb 9 18:44:23 2010 UTC (10 years, 1 month ago) by thrust26
File size: 16632 byte(s)
reduced space usage
1 ; History of linebench timings...
2 ;649
3 ;614 (replacing the update of tmp0)
4 ;607
5 ;588
6 ;583 after alignment
7 ;579
8 ;534 redid mainly_vertical
9 ;529 removed page penalty
10 ;517 final optimization at mainly_horizontal
11 ;501 chunking, initial version
12 ;482 optimized chunking (avg: 38.91 cycles)
13 ;473 final optimization for mainly_vertical (37.89 -> 38.34 corrected)
14 ;468 a weird stunt on mainly_horizontal (38.07)
15
16 ; TODOs:
17 ; + chunking (-35)
18 ; - two separate branches instead of patching?
19 ; + countdown minor
20 ; x mainly_horizontal (won't work)
21 ; + mainly_vertical (-9)
22
; Zero-page work variables used by the line routines in this file.
23 .zero
24
25 ; *= tmp1
26
; The variables commented out below (dx, dy, _CurrentPixelX, ...) are used by
; the code in this file but are not declared here - presumably they are
; declared in another source file.
27 ;e .dsb 2 ; Error decision factor (slope) 2 bytes in zero page
28 ;i .dsb 1 ; Number of pixels to draw (iteration counter) 1 byte in zp
29 ;dx .dsb 1 ; Width
30 ;dy .dsb 1 ; Height
31 ;_CurrentPixelX .dsb 1
32 ;_CurrentPixelY .dsb 1
33 ;_OtherPixelX .dsb 1
34 ;_OtherPixelY .dsb 1
35
36 save_a .dsb 1 ; holds the Bresenham error sum while A is busy plotting
37 curBit .dsb 1 ; current pixel bit mask (draw_mainly_vertical_8)
38 chunk .dsb 1 ; pixel bits of the current screen byte still to be plotted (draw_very_horizontal_8)
39 lastSum .dsb 1 ; half of the major delta, added to the sum before the last line segment
40
41
; BYTE_PIXEL is used as the number of pixels per screen byte and X_SIZE as the
; screen width in pixels; ROW_SIZE is the value added to the screen offset to
; move down one row. (Comments are kept off the #define lines on purpose so
; that they cannot end up in the macro replacement text.)
42 #define BYTE_PIXEL 6
43 #define X_SIZE 240
44 #define ROW_SIZE X_SIZE/BYTE_PIXEL
45
; 6502 opcode bytes. The routines below store them into their own code
; (labels __auto_* and _outer_patch) to select the drawing direction.
; _NOP is not referenced by the code visible in this file.
46 #define _NOP $ea
47 #define _INX $e8
48 #define _DEX $ca
49 #define _INY $c8
50 #define _DEY $88
51 #define _ASL $0a
52 #define _LSR $4a
53 #define _INC_ZP $e6
54 #define _DEC_ZP $c6
55
56
57 .text
58
59 ; .dsb 256-(*&255)
60
61 ;**********************************************************
; draw_totaly_vertical_8: XORs one pixel into dy+1 consecutive screen rows.
; Reached from _DrawLine8 when dx is 0.
; In:  _CurrentPixelX = column, dy = number of rows minus one,
;      tmp0 = screen pointer loaded by _DrawLine8 for _CurrentPixelY.
; Out: returns with rts; A, X, Y are clobbered and tmp0+1 is incremented
;      each time the row offset in Y wraps past 255.
62 draw_totaly_vertical_8
63 .(
64 ldx _CurrentPixelX
65 ldy _TableDiv6,x ; Y = byte offset of the column, from a table defined elsewhere
66 lda _TableBit6Reverse,x ; 4
67 and #$7f ; drop bit 7 of the table entry
68 sta _mask_patch+1 ; self-modifying: becomes the operand of the lda # in the loop
69 ldx dy
70 inx ; dy+1 pixels are drawn
71 clc ; 2 carry has to be clear for the adc below; every loop path leaves it clear
72 loop
73 _mask_patch
74 lda #0 ; 2 operand patched above with the pixel mask
75 eor (tmp0),y ; 5*
76 sta (tmp0),y ; 6*= 13**
77
78 ; update the screen address:
79 .(
80 tya ; 2
81 adc #ROW_SIZE ; 2
82 tay ; 2
83 bcc skip ; 2/3= 8/9
84 inc tmp0+1 ; 5 offset wrapped: move the pointer to the next page
85 clc ; 2 = 7
86 skip ;
87 .)
88 dex ; 2
89 bne loop ; 2/3=4/5
90 rts
91 .)
92
93
94 ;**********************************************************
95 ;
; _DrawLine8: entry point. Orders the endpoints, computes the absolute deltas,
; loads the screen pointer and dispatches to a specialised drawing routine.
;
96 ; Expects the following variables to be set when called:
97 ; _CurrentPixelX
98 ; _CurrentPixelY
99 ; _OtherPixelX
100 ; _OtherPixelY
101 ;
; Side effects: the two endpoints are swapped in place when _CurrentPixelY is
; larger than _OtherPixelY; dx, dy and tmp0/tmp0+1 are overwritten.
; State handed over to alignIt: A = _INX or _DEX (direction of the X step),
; dx/dy = absolute deltas, tmp0 = pointer taken from _HiresAddrLow/High.
102 _DrawLine8
103 ;
104 ; Compute deltas and signs
105 ;
106
107 ; Test Y value
108 .(
109 sec
110 lda _CurrentPixelY
111 sbc _OtherPixelY
112 beq end ; same row: dy = 0
113 bcc cur_smaller
114
115 cur_bigger ; y1>y2
116 ; Swap the two endpoints (both X and Y)
117 ; So we always draw from top to bottom
118 ldy _CurrentPixelY
119 ldx _OtherPixelY
120 sty _OtherPixelY
121 stx _CurrentPixelY
122
123 ldy _CurrentPixelX
124 ldx _OtherPixelX
125 sty _OtherPixelX
126 stx _CurrentPixelX
127
128 jmp end ; A still holds y1-y2: the swap only used X and Y
129
130 cur_smaller ; y1<y2
131 ; Absolute value
132 eor #$ff
133 adc #1 ; carry is clear here (bcc was taken), so this completes the negation
134 end
135 sta dy
136 .)
137
138 ;
139 ; Initialise screen pointer
140 ;
141 ldy _CurrentPixelY
142 lda _HiresAddrLow,y ; 4
143 sta tmp0+0 ; 3
144 lda _HiresAddrHigh,y ; 4
145 sta tmp0+1 ; 3 => Total 14 cycles
146
147 ; Test X value
148 .(
149 sec
150 lda _CurrentPixelX
151 sbc _OtherPixelX
152 sta dx
153 beq draw_totaly_vertical_8 ; dx = 0 (backward branch to the routine above)
154 bcc cur_smaller
155
156 cur_bigger ; x1>x2
157 lda #_DEX
158 bne end ; always taken, _DEX is not zero
159
160 cur_smaller ; x1<x2
161 ; Absolute value
162 eor #$ff
163 adc #1 ; carry is clear here (bcc was taken)
164 sta dx
165
166 lda #_INX
167 end
168 .)
169
170 jmp alignIt ; jump over the padding bytes
171
172 .dsb 256-(*&255) ; pad up to the next 256-byte boundary (a full 256 bytes if already on one)
173
; alignIt: continuation of _DrawLine8.
; In: A = _INX/_DEX opcode, dx and dy = absolute deltas. A is passed on unchanged.
174 alignIt
175 ; Compute slope and call the specialized code for mostly horizontal or vertical lines
176 ldy dy ; Y = dy for the routines below
177 beq draw_totaly_horizontal_8
178 cpy dx
179 bcc draw_mainly_horizontal_8 ; dy < dx
180 jmp draw_mainly_vertical_8 ; dy >= dx (a diagonal with dy = dx is drawn by the vertical routine)
181
182 ;**********************************************************
; draw_totaly_horizontal_8: XORs a run of pixels on a single screen row (dy = 0).
; In:  A = _INX or _DEX opcode, tmp0 = screen pointer of the row,
;      _CurrentPixelX = first column, _OtherPixelX = column at which to stop.
; Out: returns with rts; A, X, Y clobbered.
; NOTE(review): the loop leaves as soon as X equals _OtherPixelX, i.e. before
; that column is plotted, so dx pixels are drawn here while
; draw_totaly_vertical_8 draws dy+1. Confirm whether excluding the end pixel
; is intended.
183 draw_totaly_horizontal_8
184 .(
185 ; here we have DY in Y, and the OPCODE in A
186 sta _outer_patch ; Patch the step instruction below (inx or dex; _DrawLine8 never passes a nop)
187
188 ldx _OtherPixelX
189 stx __auto_cpx+1 ; self-modifying: operand of the cpx # below
190
191 ldx _CurrentPixelX
192
193 ;
194 ; Draw loop
195 ;
196 outer_loop
197 ldy _TableDiv6,x ; Y = byte offset of column X, from a table defined elsewhere
198 lda _TableBit6Reverse,x ; 4
199 and #$7f ; drop bit 7 of the table entry
200 eor (tmp0),y ; 5
201 sta (tmp0),y ; 6
202
203 _outer_patch
204 inx ; patched at entry: inx or dex
205
206 __auto_cpx
207 cpx #00 ; At the endpoint yet? (operand patched with _OtherPixelX)
208 bne outer_loop
209 rts
210 .)
211
212 ;**********************************************************
; draw_mainly_horizontal_8: lines with dy < dx. Lines with (dx/2) >= dy are
; passed on to draw_very_horizontal_8; the rest are drawn here one pixel per
; iteration, stepping X every time and Y whenever the error sum overflows.
; In:  A = _INX or _DEX opcode, Y = dy, dx, tmp0 = screen pointer of the row,
;      _CurrentPixelX, _OtherPixelX.
; Out: returns with rts after dx+1 pixels; A, X, Y, save_a, tmp0 clobbered.
; The code patches itself (labels __auto_*) according to the X direction.
213 draw_mainly_horizontal_8
214 .(
215 tax ; keep the opcode in X, A is needed for the slope test
216 lda dx
217 lsr
218 cmp dy
219 bcc contMainly ; dx/2 < dy: handled here
220 jmp draw_very_horizontal_8 ; dx/2 >= dy: use the chunking routine
221 contMainly
222
223 ; here we have DY in Y, and the OPCODE (inx, dex) in X (moved there by the tax above)
224 sty __auto_dy+1 ; operand of "adc #" at __auto_dy
225
226 ; all this stress to be able to use dex, beq :)
227 cpx #_INX ; also sets carry: set for _INX (equal), clear for _DEX (smaller)
228 beq doInx
229
; drawing right to left
230 lda #_DEY
231 sta __auto_stepx
232 lda #$ff
233 sta __auto_cpy+1
234 ldy #_DEC_ZP
235
236 lda #<_TableBit6Reverse-1 ; == 0
237 ; no clc needed: the cpx #_INX above left carry clear
238 adc _OtherPixelX ; low byte of the table address used by the two patched lda abs,x
239 ldx #>_TableBit6Reverse ;
240 bne endPatch ; taken as long as the table's high byte is not zero
241
; drawing left to right
242 doInx
243 lda #_INY
244 sta __auto_stepx
245 lda #$00
246 sta __auto_cpy+1
247 ldy #_INC_ZP
248
249 lda #X_SIZE-1
250 ; no sec needed: the cpx #_INX above compared equal and left carry set
251 sbc _OtherPixelX
252 ldx #>_TableBit6 ;
253 endPatch
254 sty __auto_yHi ; opcode of the inc/dec tmp0+1 at __auto_yHi
255 sta __auto_bit6+1 ; low address byte of both table reads
256 sta __auto_bit6_0+1
257 stx __auto_bit6+2 ; high address byte of both table reads
258 stx __auto_bit6_0+2
259
; Move the low byte of the screen address into Y so that tmp0 can be set to 0.
260 ldx _CurrentPixelX
261 lda _TableDiv6,x
262 clc
263 adc tmp0
264 tay
265 bcc skipInc
266 inc tmp0+1
267 skipInc
268 lda #0
269 sta tmp0
270
271 lda dx
272 tax
273 inx ; 2 +1 since we count to 0
274 sta __auto_dx+1 ; operand of "sbc #" at __auto_dx
275 lsr
276 eor #$ff ; initial error sum from dx/2
277 clc
278
279 sta save_a ; 3 = 3
280 __auto_bit6_0
281 lda _TableBit6Reverse-1,x;4 address patched above
282 and #$7f ; first pixel: drop bit 7 so that no column step is made for it
283 bcc contColumn ; always taken, carry was cleared above
284
285 ; a = sum, x = dX+1
286 ;----------------------------------------------------------
287 loopX
288 sta save_a ; 3 = 3
289 loopY
290 __auto_bit6
291 lda _TableBit6Reverse-1,x;4 address patched above
292 bmi nextColumn ; 2/14 16.7% taken
293 contColumn
294 eor (tmp0),y ; 5
295 sta (tmp0),y ; 6 = 19
296
297 dex ; 2 Step in x
298 beq exitLoop ; 2/3 At the endpoint yet?
299 lda save_a ; 3
300 __auto_dy
301 adc #00 ; 2 +DY
302 bcc loopX ; 2/3=11/12 ~33.3% taken (not 50% due to special code for very horizontal lines)
303 ; Time to step in y
304 __auto_dx
305 sbc #00 ; 2 -DX (carry is set here; the result is negative again, which leaves carry clear)
306 sta save_a ; 3 = 5
307
308 ; update the screen address:
309 tya ; 2
310 adc #ROW_SIZE ; 2
311 tay ; 2
312 bcc loopY ; 2/3= 8/9 ~84.4% taken
313 inc tmp0+1 ; 5
314 clc ; 2
315 bcc loopY ; 3 = 10
316 ; average: 10.40
317
318 exitLoop
319 rts
320
; Bit 7 set in the table entry: step one screen byte left/right before plotting.
321 nextColumn
322 and #$7f ; 2 remove signal bit
323 __auto_stepx
324 iny ; 2 patched: iny or dey
325 __auto_cpy
326 cpy #$00 ; 2 patched: $00 or $ff, the value Y has after wrapping
327 clc ; 2 cpy changed the carry; the main loop needs it clear
328 bne contColumn ; 2/3=10/11
329 __auto_yHi
330 inc tmp0+1 ; 5 patched: inc or dec (zero-page form)
331 bcc contColumn ; 3
332
333 ; Timings:
334 ; x++/y : 34.00 (33.3%)
335 ; x++/y++: 45.40 (66.7%)
336 ; average: 41.61
337 .)
338
339 .dsb 256-(*&255) ; pad up to the next 256-byte boundary
340 ;**********************************************************
; draw_very_horizontal_8: lines with (dx/2) >= dy, entered only by the jmp in
; draw_mainly_horizontal_8.
; "Chunking": the pixels of one row that share a screen byte are collected in
; the zero-page variable chunk and written with a single eor/sta.
; In:  X = _INX or _DEX opcode, Y = dy, dx, dy, tmp0 = screen pointer of the
;      row, _CurrentPixelX.
; Out: returns with rts; A, X, Y, save_a, chunk, lastSum, tmp0 clobbered and
;      dy counted down to 0.
; Inside the loops X is an index into Pot2PTbl/Pot2NTbl; it is reloaded with
; BYTE_PIXEL-1 on every step to the next screen byte.
341 draw_very_horizontal_8
342 .(
343 ; entered when dX/2 >= dY, here we use "chunking"
344 ; here we have DY in Y, and the OPCODE (inx, dex) in X (draw_mainly_horizontal_8 did a tax)
345 sty __auto_dy+1
346 sty __auto_dy2+1
347 cpx #_INX ; Z and carry are both set for _INX
348 php ; keep the result of the compare across the pointer setup
349 ; setup pointer and Y:
350 ldx _CurrentPixelX
351 lda _TableDiv6,x
352 clc
353 adc tmp0
354 tay
355 bcc skipHi
356 inc tmp0+1
357 skipHi
358 lda #0
359 sta tmp0
360 ; patch the code:
361 plp
362 beq doInx
363 ; negative x-direction
364 lda _TableMod6,x ; X still holds _CurrentPixelX
365 tax
366
367 lda #_DEY
368 sta __auto_stepx
369 sta __auto_stepx2
370 lda #$ff
371 sta __auto_cpy+1
372 sta __auto_cpy2+1
373 lda #_DEC_ZP
374 sta __auto_yHi
375 sta __auto_yHi2
376 lda Pot2NTbl,x
377 sta chunk
378 lda #<Pot2NTbl
379 bne endPatch ; taken as long as the low byte of Pot2NTbl is not zero
380
381 doInx
382 ; positive x-direction
383 lda #BYTE_PIXEL-1
384 ; no sec needed: plp restored the carry set by cpx #_INX
385 sbc _TableMod6,x
386 tax
387
388 lda #_INY
389 sta __auto_stepx
390 sta __auto_stepx2
391 lda #$00
392 sta __auto_cpy+1
393 sta __auto_cpy2+1
394 lda #_INC_ZP
395 sta __auto_yHi
396 sta __auto_yHi2
397 lda Pot2PTbl,x
398 sta chunk
399 lda #<Pot2PTbl
400 endPatch
; Only the low address byte of the three table reads is patched, so both
; tables have to share the high byte assembled for Pot2PTbl.
401 sta __auto_pot1+1
402 sta __auto_pot2+1
403 sta __auto_pot3+1
404 lda dx
405 sta __auto_dx+1
406 ; calculate initial bresenham sum
407 lsr
408 sta lastSum ; 3 this is used for the last line segment
409 eor #$ff ; = -dx/2
410 clc
411 bcc loopX ; always taken
412 ; a = sum, x = table index, y = ptr-offset
413
414 ;----------------------------------------------------------
; X went below 0: write the collected chunk and move to the next screen byte.
415 nextColumn ;
416 tax ; 2 park the sum in X
417 lda chunk ; 3
418 eor (tmp0),y ; 5
419 sta (tmp0),y ; 6
420 lda #%00111111 ; 2 the new byte starts with all six pixel bits
421 sta chunk ; 3
422 txa ; 2 sum back in A
423 ldx #BYTE_PIXEL-1 ; 2
424 __auto_stepx
425 iny ; 2 next column (patched: iny or dey)
426 __auto_cpy
427 cpy #00 ; 2 patched: $00 or $ff
428 clc ; 2
429 bne contColumn ; 2/3=33/34 99% taken
430 __auto_yHi
431 inc tmp0+1 ; 5 dec/inc
432 bcc contColumn ; 3 = 8
433 ;----------------------------------------------------------
434 loopY
435 dec dy ; 5 all but one vertical segments drawn? (dy itself is the counter)
436 beq exitLoop ; 2/3= 7/8 yes, exit loop
437 loopX
438 dex ; 2
439 bmi nextColumn ; 2/37.03 ~16.7% taken
440 contColumn ; = 9.85
441 __auto_dy
442 adc #00 ; 2 +DY
443 bcc loopX ; 2/3= 4/5 ~75% taken
444 ; Time to step in y
445 __auto_dx
446 sbc #00 ; 2 -DX
447 sta save_a ; 3 = 5
448
449 ; plot the last bits of current row:
450 __auto_pot1
451 lda Pot2PTbl,x ; 4 low address byte patched at endPatch
452 eor chunk ; 3 keep only the chunk bits that are not in the table entry
453 eor (tmp0),y ; 5
454 sta (tmp0),y ; 6
455 __auto_pot2
456 lda Pot2PTbl,x ; 4 low address byte patched at endPatch
457 sta chunk ; 3 = 25 the table entry becomes the chunk of the next row
458
459 ; update the screen address:
460 tya ; 2
461 adc #ROW_SIZE ; 2
462 tay ; 2
463 lda save_a ; 3
464 bcc loopY ; 2/3=11/12 ~84.4% taken
465 inc tmp0+1 ; 5
466 clc ; 2
467 bcc loopY ; 3 = 10
468 ; average: 13.40
469
470 ; Timings:
471 ; x++/y : 14.85 (75%)
472 ; x++/y++: 64.25 (25%)
473 ; average: 27.20
474 ;----------------------------------------------------------
475 exitLoop
476 ; draw the last horizontal line segment:
477 adc lastSum ; 3
478 loopXEnd
479 dex ; 2
480 bmi nextColumnEnd ; 2/37.03 ~16.7% taken
481 contColumnEnd ; = 9.85
482 __auto_dy2
483 adc #00 ; 2 +DY
484 bcc loopXEnd ; 2/3= 4/5 ~50% taken
485
486 ; plot last chunk:
487 __auto_pot3
488 lda Pot2PTbl,x ; 4 low address byte patched at endPatch
489 eor chunk ; 3
490 eor (tmp0),y ; 5
491 sta (tmp0),y ; 6 = 18
492 rts
493 ;----------------------------------------------------------
; Same as nextColumn, but returns into the loop of the last segment.
494 nextColumnEnd ;
495 tax ; 2
496 lda chunk ; 3
497 eor (tmp0),y ; 5
498 sta (tmp0),y ; 6
499 lda #%00111111 ; 2
500 sta chunk ; 3
501 txa ; 2
502 ldx #BYTE_PIXEL-1 ; 2
503 __auto_stepx2
504 iny ; 2 next column (patched: iny or dey)
505 __auto_cpy2
506 cpy #00 ; 2 patched: $00 or $ff
507 clc ; 2
508 bne contColumnEnd ; 2/3=33/34 99% taken
509 __auto_yHi2
510 inc tmp0+1 ; 5 dec/inc
511 bcc contColumnEnd ; 3 = 8
512
; Bit masks indexed by X: Pot2PTbl is used for the positive and Pot2NTbl for
; the negative x-direction. Entry n has n+1 bits set.
513 Pot2PTbl
514 .byte %00000001, %00000011, %00000111, %00001111
515 .byte %00011111, %00111111
516 Pot2NTbl
517 .byte %00100000, %00110000
518 .byte %00111000, %00111100, %00111110, %00111111
519 .)
520
521 .dsb 256-(*&255) ; pad up to the next 256-byte boundary
522 ;**********************************************************
523 ;
524 ; This code is used when the things are moving faster
525 ; vertically than horizontally
526 ;
527 ; dy>dx (alignIt also sends dy = dx here)
528 ;
; In:  A = _INX or _DEX opcode, Y = dy, dx, dy, tmp0 = screen pointer of the
;      first row, _CurrentPixelX.
; Out: returns with rts; A, X, Y, save_a, curBit, lastSum, tmp0 clobbered.
; One pixel is plotted per row. X counts the remaining steps in x; after the
; last step a final run of rows is drawn by the loop at loopYEnd.
; The code patches itself (labels __auto_*) according to the X direction.
529 draw_mainly_vertical_8
530 ; here we have DY in Y, and the OPCODE in A
531 .(
532 ; setup bresenham values:
533 sty __auto_dy+1
534
535 ; setup direction:
536 cmp #_DEX ; which direction?
537 bne doInx
538 ; dex -> moving left:
539 lda #%00100000
540 sta __auto_cpBit+1 ; bit value after which the next step leaves the byte
541 lda #_ASL ;
542 sta __auto_shBit
543 lda #%00000001
544 sta __auto_ldBit+1 ; bit value to start with in the neighbouring byte
545 lda #_DEY
546 sta __auto_yLo
547 ldx #$ff
548 lda #_DEC_ZP
549 bne endPatch ; always taken, _DEC_ZP is not zero
550
551 doInx
552 ; inx -> moving right:
553 lda #%00000001
554 sta __auto_cpBit+1
555 lda #_LSR
556 sta __auto_shBit
557 lda #%00100000
558 sta __auto_ldBit+1
559 lda #_INY
560 sta __auto_yLo
561 ldx #$00
562 lda #_INC_ZP
563 endPatch
564 stx __auto_cpY+1 ; value Y has after wrapping ($ff or $00)
565 sta __auto_yHi ; opcode of the inc/dec tmp0+1 at __auto_yHi
566 ; setup X
567 ldx dx ; X = dx
568 stx __auto_dx1+1
569 stx __auto_dx2+1
570 ; setup current bit:
571 ldy _CurrentPixelX
572 lda _TableBit6Reverse,y ; 4
573 and #$7f ; drop bit 7 of the table entry
574 sta curBit
575 ; setup pointer and Y:
576 ; TODO: self-modyfing code for the ptrs?
577 lda _TableDiv6,y
578 clc
579 adc tmp0
580 tay
581 lda #0 ; lda/sta leave the carry of the adc untouched for the bcc below
582 sta tmp0
583 bcc skipTmp0
584 inc tmp0+1
585 skipTmp0
586 ; calculate initial bresenham sum:
587 lda dy
588 lsr
589 sta lastSum ; added to the sum before the last segment
590 eor #$ff ; -DY/2
591 clc ; 2
592 bcc loopY ; 3 always taken
593 ; a = sum, y = tmp0, x = dX, tmp0 = 0
594 ;----------------------------------------------------------
595 incHiPtr ;
596 inc tmp0+1 ; 5
597 clc ; 2
598 bcc contHiPtr ; 3
599 ;----------------------------------------------------------
600 loopY
601 sta save_a ; 3
602 lda curBit ; 3 = 6
603 loopX
604 ; Draw the pixel
605 eor (tmp0),y ; 5
606 sta (tmp0),y ; 6 = 11
607 ; update the screen address:
608 tya ; 2
609 adc #ROW_SIZE ; 2
610 tay ; 2
611 bcs incHiPtr ; 2/13 ~15.6% taken
612 contHiPtr ; = 9.72 average
613 lda save_a ; 3
614 __auto_dx1
615 adc #00 ; 2 +DX
616 bcc loopY ; 2/3= 7/8 ~50% taken
617 ; Time to step in x
618 __auto_dy
619 sbc #00 ; 2 -DY
620 sta save_a ; 3 = 5
621
622 lda curBit ; 3
623 __auto_cpBit ; TODO: optimize
624 cmp #%00100000 ; 2 %00100000/%00000001
625 beq nextColumn ; 2/14.07 ~16.7% taken
626 __auto_shBit
627 asl ; 2 asl/lsr, clears carry
628 contNextColumn
629 sta curBit ; 3 =~13.68
630 ; step in x:
631 dex ; 2 At the endpoint yet?
632 bne loopX ; 2/3= 4/5
633
634 ; x ,y++: 34.72 (50%)
635 ; x++,y++: 51.40 (50%)
636 ; average: 43.06
637
638 ; draw the last vertical line segment:
639 lda save_a ; 3
640 adc lastSum ; 3
641 loopYEnd
642 tax ; 2 = 2 X is free now and keeps the sum
643 ; Draw the pixel
644 lda curBit ; 3
645 eor (tmp0),y ; 5
646 sta (tmp0),y ; 6 = 14
647 ; update the screen address:
648 tya ; 2
649 adc #ROW_SIZE ; 2
650 tay ; 2
651 bcs incHiPtrEnd ; 2/13 ~15.6% taken
652 contHiPtrEnd ; = 9.72 average
653 txa ; 2
654 __auto_dx2
655 adc #00 ; 2 +DX
656 bcc loopYEnd ; 2/3= 6/7 ~25% taken
657 rts
658 ;----------------------------------------------------------
; The pixel leaves the current screen byte: reload the bit and move Y by one.
659 nextColumn
660 __auto_ldBit
661 lda #%00000001 ; 2 %00000001/%00100000
662 __auto_yLo
663 dey ; 2 dey/iny
664 __auto_cpY
665 cpy #$ff ; 2 $ff/$00
666 clc ; 2 TODO: optimize
667 bne contNextColumn ; 2/3 ~99% taken
668 __auto_yHi
669 dec tmp0+1 ; 5 dec/inc
670 bcc contNextColumn ; 3
671
672 incHiPtrEnd ; 9
673 inc tmp0+1 ; 5
674 clc ; 2
675 bcc contHiPtrEnd ; 3
676 ;----------------------------------------------------------
677 .)
678
679 ; *** total timings: ***
680 ; draw_very_horizontal_8 (29.6%): 27.20
681 ; draw_mainly_horizontal_8 (20.4%): 41.61 <- corrected!
682 ; draw_mainly_vertical_8 (50.0%): 43.06
683 ;----------------------------------------
684 ; total average (100.0%): 38.07

  ViewVC Help
Powered by ViewVC 1.1.26