/[projet1]/public/oric/routines/rasterization/linebench/line8.s
Defence Force logotype

Contents of /public/oric/routines/rasterization/linebench/line8.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 267 - (show annotations)
Mon Feb 8 22:50:29 2010 UTC (9 years, 8 months ago) by thrust26
File size: 16933 byte(s)
some weird idea in mainly_horizontal, 468
1 ; History of linebench timings...
2 ;649
3 ;614 (replacing the update of tmp0)
4 ;607
5 ;588
6 ;583 after alignment
7 ;579
8 ;534 redid mainly_vertical
9 ;529 removed page penalty
10 ;517 final optimization at mainly_horizontal
11 ;501 chunking, initial version
12 ;482 optimized chunking (avg: 38.91 cycles)
13 ;473 final optimization for mainly_vertical (37.89 -> 38.34 corrected)
14 ;468 a weird stunt on mainly_horizontal (38.07)
15
16 ; TODOs:
17 ; + chunking (-35)
18 ; - two separate branches instead of patching?
19 ; + countdown minor
20 ; x mainly_horizontal (won't work)
21 ; + mainly_vertical (-9)
22
23 .zero
24
25 ; *= tmp1
26
27 ;e .dsb 2 ; Error decision factor (slope) 2 bytes in zero page
28 ;i .dsb 1 ; Number of pixels to draw (iteration counter) 1 byte in zp
29 ;dx .dsb 1 ; Width
30 ;dy .dsb 1 ; Height
31 ;_CurrentPixelX .dsb 1
32 ;_CurrentPixelY .dsb 1
33 ;_OtherPixelX .dsb 1
34 ;_OtherPixelY .dsb 1
35 ; The declarations above are commented out, so dx, dy and the pixel coordinates are presumably defined elsewhere (TODO confirm)
36 save_a .dsb 1 ; Bresenham error sum, parked here while A is busy with pixel work
37 curBit .dsb 1 ; pixel bit mask of the current position (draw_mainly_vertical_8)
38 chunk .dsb 1 ; pixel bits pending for the current screen byte (draw_very_horizontal_8)
39 lastSum .dsb 1 ; half of the major delta, added to the error sum before the final segment
40 ; Screen geometry used by the routines below
41 ; BYTE_PIXEL = pixels per screen byte, X_SIZE = pixels per row, ROW_SIZE = bytes per row
42 #define BYTE_PIXEL 6
43 #define X_SIZE 240
44 #define ROW_SIZE X_SIZE/BYTE_PIXEL
45 ; 6502 opcode bytes, written into the code at run time to patch single instructions
46 #define _NOP $ea
47 #define _INX $e8
48 #define _DEX $ca
49 #define _INY $c8
50 #define _DEY $88
51 #define _ASL $0a
52 #define _LSR $4a
53 #define _INC_ZP $e6
54 #define _DEC_ZP $c6
55
56
57 .text
58
59 .dsb 256-(*&255) ; pad up to the next 256-byte page boundary
60 ; Entered from _DrawLine8 when dx == 0, with tmp0 already holding the row address of _CurrentPixelY
61 ;**********************************************************
62 draw_totaly_vertical_8 ; XOR-plots dy+1 pixels in one column, one per screen row
63 .(
64 ldx _CurrentPixelX
65 ldy _TableDiv6,x ; Y = table entry for this X (presumably the byte offset x/6 - table not visible here)
66 lda _TableBit6Reverse,x ; 4
67 and #$7f ; clear bit 7 of the table entry, keep the pixel bits
68 sta _mask_patch+1 ; self-modification, the mask becomes the operand of the lda # below
69 ldx dy ; X = row counter
70 inx ; dy+1 pixels are plotted
71 clc ; 2
72 loop
73 _mask_patch
74 lda #0 ; 2 (operand patched above)
75 eor (tmp0),y ; 5*
76 sta (tmp0),y ; 6*= 13**
77
78 ; update the screen address:
79 .(
80 tya ; 2
81 adc #ROW_SIZE ; 2
82 tay ; 2
83 bcc skip ; 2/3= 8/9
84 inc tmp0+1 ; 5
85 clc ; 2 = 7
86 skip ;
87 .)
88 dex ; 2
89 bne loop ; 2/3=4/5
90 rts
91 .)
92
93
94 ;**********************************************************
95 ; _DrawLine8 - XOR-draws a line between (_CurrentPixelX,_CurrentPixelY) and (_OtherPixelX,_OtherPixelY)
96 ; Expects the following variables to be set when called:
97 ; _CurrentPixelX
98 ; _CurrentPixelY
99 ; _OtherPixelX
100 ; _OtherPixelY
101 ; Overwrites dx, dy and tmp0, and may swap the two endpoints in place
102 _DrawLine8
103 ;
104 ; Compute deltas and signs
105 ;
106
107 ; Test Y value
108 .(
109 sec
110 lda _CurrentPixelY
111 sbc _OtherPixelY
112 beq end ; same row, A = 0 is stored as dy
113 bcc cur_smaller
114
115 cur_bigger ; y1>y2
116 ; Swap the two endpoints (both their X and Y)
117 ; So we always draw from top to bottom
118 ldy _CurrentPixelY
119 ldx _OtherPixelY
120 sty _OtherPixelY
121 stx _CurrentPixelY
122
123 ldy _CurrentPixelX
124 ldx _OtherPixelX
125 sty _OtherPixelX
126 stx _CurrentPixelX
127
128 jmp end ; A still holds y1-y2, which is positive on this path
129
130 cur_smaller ; y1<y2
131 ; Absolute value
132 eor #$ff
133 adc #1 ; carry is clear on this path, so this completes the negation
134 end
135 sta dy ; dy = |y1-y2|
136 .)
137
138 ;
139 ; Initialise screen pointer (tmp0) from the per-row address tables
140 ;
141 ldy _CurrentPixelY
142 lda _HiresAddrLow,y ; 4
143 sta tmp0+0 ; 3
144 lda _HiresAddrHigh,y ; 4
145 sta tmp0+1 ; 3 => Total 14 cycles
146
147 ; Test X value
148 .(
149 sec
150 lda _CurrentPixelX
151 sbc _OtherPixelX
152 sta dx ; raw x1-x2, replaced below by its absolute value when negative
153 beq draw_totaly_vertical_8 ; dx == 0
154 bcc cur_smaller
155
156 cur_bigger ; x1>x2
157 lda #_DEX ; A = opcode telling the specialised routines to step left
158 bne end ; always taken, the opcode byte is non-zero
159
160 cur_smaller ; x1<x2
161 ; Absolute value
162 eor #$ff
163 adc #1
164 sta dx
165
166 lda #_INX ; A = opcode telling the specialised routines to step right
167 end
168 .)
169
170 jmp alignIt ; skip the alignment padding
171
172 .dsb 256-(*&255) ; pad up to the next 256-byte page boundary
173
174 alignIt
175 ; Compute slope and call the specialized code for mostly horizontal or vertical lines
176 ldy dy
177 beq draw_totaly_horizontal_8 ; dy == 0
178 cpy dx
179 bcc draw_mainly_horizontal_8 ; dy < dx
180 jmp draw_mainly_vertical_8 ; dy >= dx
181
182 ;**********************************************************
183 draw_totaly_horizontal_8 ; XOR-plots a line within one screen row, stepping X from _CurrentPixelX towards _OtherPixelX
184 .(
185 ; here we have DY in Y (zero on this path), and the OPCODE in A
186 sta _outer_patch ; Write a (dex / nop / inx) instruction
187
188 ldx _OtherPixelX
189 stx __auto_cpx+1 ; patch the end coordinate into the cpx # below
190
191 ldx _CurrentPixelX
192
193 ;
194 ; Draw loop
195 ;
196 outer_loop
197 ldy _TableDiv6,x
198 lda _TableBit6Reverse,x ; 4
199 and #$7f ; clear bit 7 of the table entry, keep the pixel bits
200 eor (tmp0),y ; 5
201 sta (tmp0),y ; 6
202 ; NOTE(review): the loop exits as soon as X reaches _OtherPixelX, so that last pixel is not plotted - confirm this is intended
203 _outer_patch
204 inx
205
206 __auto_cpx
207 cpx #00 ; At the endpoint yet? (operand patched above)
208 bne outer_loop
209 rts
210 .)
211
212 ;**********************************************************
213 draw_mainly_horizontal_8 ; XOR-plots lines with dy < dx, one pixel per X step; entered with Y = dy, A = step opcode
214 .(
215 tax ; keep the step opcode (_INX/_DEX) in X
216 lda dx
217 lsr
218 cmp dy
219 bcc contMainly ; dx/2 < dy, handled here
220 jmp draw_very_horizontal_8 ; dx/2 >= dy, use the chunking routine
221 contMainly
222
223 ; here we have DY in Y, and the OPCODE (inx, dex) in X (moved there by the tax above)
224 sty __auto_dy+1
225
226 ; all this stress to be able to use dex, beq :)
227 cpx #_INX ; besides Z this leaves carry clear for _DEX and set for _INX, which the adc/sbc below rely on
228 beq doInx
229 ; moving left, patch the column step to dey with a wrap check against $ff
230 lda #_DEY
231 sta __auto_stepx
232 lda #$ff
233 sta __auto_cpy+1
234 lda #_DEC_ZP
235 sta __auto_yHi
236
237 lda #<_TableDiv6 ; == 1
238 ; clc ; _DEX < _INX
239 adc _OtherPixelX ; the table base is offset by _OtherPixelX so the loop counter in X can index it directly
240 sta __auto_div6+1
241 lda #<_TableBit6Reverse-1 ; == 0
242 ; clc
243 adc _OtherPixelX
244
245 ldy #>_TableDiv6
246 ldx #>_TableBit6Reverse ;
247 bne endPatch ; taken whenever the high byte just loaded into X is non-zero
248
249 doInx
250 lda #_INY
251 sta __auto_stepx
252 lda #$00
253 sta __auto_cpy+1
254 lda #_INC_ZP
255 sta __auto_yHi
256
257 lda #X_SIZE
258 ; sec
259 sbc _OtherPixelX
260 sta __auto_div6+1
261 lda #X_SIZE-1
262 ; sec
263 sbc _OtherPixelX
264
265 ldy #>_TableDiv6Rev
266 ldx #>_TableBit6 ;
267 endPatch
268 sta __auto_bit6+1 ; A = low address byte for both pixel-mask table reads
269 sta __auto_bit6_0+1
270 stx __auto_bit6+2 ; X = their high address byte
271 stx __auto_bit6_0+2
272 sty __auto_div6+2 ; Y = high address byte of the column table read
273
274 ldx dx
275 __auto_div6
276 lda _TableDiv6,x ; operand patched above
277 clc
278 adc tmp0
279 tay ; Y = row address low byte + column table entry
280 bcc skipInc
281 inc tmp0+1
282 skipInc
283 lda #0
284 sta tmp0 ; pointer low byte is zero from here on, Y carries the whole low byte
285
286 lda dx
287 ; tax
288 inx ; 2 +1 since we count to 0
289 sta __auto_dx+1
290 lsr
291 eor #$ff ; initial error sum, the one's complement of dx/2
292 clc
293
294 sta save_a ; 3 = 3
295 __auto_bit6_0
296 lda _TableBit6Reverse-1,x;4
297 and #$7f
298 bcc contColumn ; always taken, carry was cleared above
299
300 ; a = sum, x = dX+1
301 ;----------------------------------------------------------
302 loopX
303 sta save_a ; 3 = 3
304 loopY
305 __auto_bit6
306 lda _TableBit6Reverse-1,x;4
307 bmi nextColumn ; 2/14 16.7% taken (bit 7 of the table entry flags a step to the neighbouring screen byte)
308 contColumn
309 eor (tmp0),y ; 5
310 sta (tmp0),y ; 6 = 19
311
312 dex ; 2 Step in x
313 beq exitLoop ; 2/3 At the endpoint yet?
314 lda save_a ; 3
315 __auto_dy
316 adc #00 ; 2 +DY
317 bcc loopX ; 2/3=11/12 ~33.3% taken (not 50% due to special code for very horizontal lines)
318 ; Time to step in y
319 __auto_dx
320 sbc #00 ; 2 -DX
321 sta save_a ; 3 = 5
322
323 ; update the screen address:
324 tya ; 2
325 adc #ROW_SIZE ; 2
326 tay ; 2
327 bcc loopY ; 2/3= 8/9 ~84.4% taken
328 inc tmp0+1 ; 5
329 clc ; 2
330 bcc loopY ; 3 = 10
331 ; average: 10.40
332
333 exitLoop
334 rts
335
336 nextColumn
337 and #$7f ; 2 remove signal bit
338 __auto_stepx
339 iny ; 2 (patched to iny or dey)
340 __auto_cpy
341 cpy #$00 ; 2 (operand patched to $00 or $ff)
342 clc ; 2
343 bne contColumn ; 2/3=10/11
344 __auto_yHi
345 inc tmp0+1 ; 5 (patched to inc or dec)
346 bcc contColumn ; 3
347
348 ; Timings:
349 ; x++/y : 34.00 (33.3%)
350 ; x++/y++: 45.40 (66.7%)
351 ; average: 41.61
352 .)
353
354 .dsb 256-(*&255) ; pad up to the next 256-byte page boundary
355 ;**********************************************************
356 draw_very_horizontal_8 ; XOR-plots lines with dx/2 >= dy, writing each horizontal run a screen byte at a time
357 .(
358 ; dX >= 2*dY, here we use "chunking"
359 ; here we have DY in Y, and the OPCODE (inx, dex) in X (draw_mainly_horizontal_8 moved it there)
360 sty __auto_dy+1
361 sty __auto_dy2+1
362 cpx #_INX
363 php ; keep the compare result (Z and C) across the pointer setup
364 ; setup pointer and Y:
365 ldx _CurrentPixelX
366 lda _TableDiv6,x
367 clc
368 adc tmp0
369 tay
370 bcc skipHi
371 inc tmp0+1
372 skipHi
373 lda #0
374 sta tmp0 ; pointer low byte is zero from here on, Y carries the whole low byte
375 ; patch the code:
376 plp
377 beq doInx
378 ; negative x-direction
379 lda _TableMod6,x
380 tax ; X = index into Pot2NTbl
381
382 lda #_DEY
383 sta __auto_stepx
384 sta __auto_stepx2
385 lda #$ff
386 sta __auto_cpy+1
387 sta __auto_cpy2+1
388 lda #_DEC_ZP
389 sta __auto_yHi
390 sta __auto_yHi2
391 lda Pot2NTbl,x
392 sta chunk ; initial pending pixel mask for the first screen byte
393 lda #<Pot2NTbl
394 bne endPatch ; taken as long as the low byte of Pot2NTbl is non-zero
395
396 doInx
397 ; positive x-direction
398 lda #BYTE_PIXEL-1
399 ; sec (already set, plp restored the carry of the equal compare)
400 sbc _TableMod6,x
401 tax ; X = index into Pot2PTbl
402
403 lda #_INY
404 sta __auto_stepx
405 sta __auto_stepx2
406 lda #$00
407 sta __auto_cpy+1
408 sta __auto_cpy2+1
409 lda #_INC_ZP
410 sta __auto_yHi
411 sta __auto_yHi2
412 lda Pot2PTbl,x
413 sta chunk ; initial pending pixel mask for the first screen byte
414 lda #<Pot2PTbl
415 endPatch
416 sta __auto_pot1+1 ; the low address byte selects Pot2PTbl or Pot2NTbl in the three table reads
417 sta __auto_pot2+1
418 sta __auto_pot3+1
419 lda dx
420 sta __auto_dx+1
421 ; calculate initial bresenham sum
422 lsr
423 sta lastSum ; 3 this is used for the last line segment
424 eor #$ff ; = -dx/2
425 clc
426 bcc loopX
427 ; a = sum, x = index into the Pot2 table, y = ptr-offset
428
429 ;----------------------------------------------------------
430 nextColumn ; X ran below zero, flush the pending chunk and move to the neighbouring screen byte
431 tax ; 2 park the sum in X
432 lda chunk ; 3
433 eor (tmp0),y ; 5
434 sta (tmp0),y ; 6
435 lda #%00111111 ; 2 all six pixel bits pending in the new byte
436 sta chunk ; 3
437 txa ; 2 sum back into A
438 ldx #BYTE_PIXEL-1 ; 2
439 __auto_stepx
440 iny ; 2 next column
441 __auto_cpy
442 cpy #00 ; 2
443 clc ; 2
444 bne contColumn ; 2/3=33/34 99% taken
445 __auto_yHi
446 inc tmp0+1 ; 5 dec/inc
447 bcc contColumn ; 3 = 8
448 ;----------------------------------------------------------
449 loopY
450 dec dy ; 5 all but one vertical segments drawn? (dy itself is the counter and is consumed)
451 beq exitLoop ; 2/3= 7/8 yes, exit loop
452 loopX
453 dex ; 2
454 bmi nextColumn ; 2/37.03 ~16.7% taken
455 contColumn ; = 9.85
456 __auto_dy
457 adc #00 ; 2 +DY
458 bcc loopX ; 2/3= 4/5 ~75% taken
459 ; Time to step in y
460 __auto_dx
461 sbc #00 ; 2 -DX
462 sta save_a ; 3 = 5
463
464 ; plot the last bits of current row:
465 __auto_pot1
466 lda Pot2PTbl,x ; 4
467 eor chunk ; 3
468 eor (tmp0),y ; 5
469 sta (tmp0),y ; 6
470 __auto_pot2
471 lda Pot2PTbl,x ; 4
472 sta chunk ; 3 = 25
473
474 ; update the screen address:
475 tya ; 2
476 adc #ROW_SIZE ; 2
477 tay ; 2
478 lda save_a ; 3
479 bcc loopY ; 2/3=11/12 ~84.4% taken
480 inc tmp0+1 ; 5
481 clc ; 2
482 bcc loopY ; 3 = 10
483 ; average: 13.40
484
485 ; Timings:
486 ; x++/y : 14.85 (75%)
487 ; x++/y++: 64.25 (25%)
488 ; average: 27.20
489 ;----------------------------------------------------------
490 exitLoop
491 ; draw the last horizontal line segment:
492 adc lastSum ; 3
493 loopXEnd
494 dex ; 2
495 bmi nextColumnEnd ; 2/37.03 ~16.7% taken
496 contColumnEnd ; = 9.85
497 __auto_dy2
498 adc #00 ; 2 +DY
499 bcc loopXEnd ; 2/3= 4/5 ~50% taken
500
501 ; plot last chunk:
502 __auto_pot3
503 lda Pot2PTbl,x ; 4
504 eor chunk ; 3
505 eor (tmp0),y ; 5
506 sta (tmp0),y ; 6 = 18
507 rts
508 ;----------------------------------------------------------
509 nextColumnEnd ; same as nextColumn, but returns into the final-segment loop
510 tax ; 2
511 lda chunk ; 3
512 eor (tmp0),y ; 5
513 sta (tmp0),y ; 6
514 lda #%00111111 ; 2
515 sta chunk ; 3
516 txa ; 2
517 ldx #BYTE_PIXEL-1 ; 2
518 __auto_stepx2
519 iny ; 2 next column
520 __auto_cpy2
521 cpy #00 ; 2
522 clc ; 2
523 bne contColumnEnd ; 2/3=33/34 99% taken
524 __auto_yHi2
525 inc tmp0+1 ; 5 dec/inc
526 bcc contColumnEnd ; 3 = 8
527
528 Pot2PTbl ; entry k has the k+1 lowest pixel bits set
529 .byte %00000001, %00000011, %00000111, %00001111
530 .byte %00011111, %00111111
531 Pot2NTbl ; entry k has the k+1 highest of the six pixel bits set
532 .byte %00100000, %00110000
533 .byte %00111000, %00111100, %00111110, %00111111
534 .)
535
536 .dsb 256-(*&255) ; pad up to the next 256-byte page boundary
537 ;**********************************************************
538 ;
539 ; This code is used when the things are moving faster
540 ; vertically than horizontally
541 ;
542 ; dy>=dx
543 ;
544 draw_mainly_vertical_8 ; XOR-plots one pixel per screen row, stepping sideways when the error sum overflows
545 ; here we have DY in Y, and the OPCODE in A
546 .(
547 ; setup bresenham values:
548 sty __auto_dy+1
549
550 ; setup direction:
551 cmp #_DEX ; which direction?
552 bne doInx
553 ; dex -> moving left:
554 lda #%00100000
555 sta __auto_cpBit+1 ; last bit position before the step leaves the current screen byte
556 lda #_ASL ;
557 sta __auto_shBit
558 lda #%00000001
559 sta __auto_ldBit+1 ; bit position to start with in the neighbouring screen byte
560 lda #_DEY
561 sta __auto_yLo
562 ldx #$ff
563 lda #_DEC_ZP
564 bne endPatch ; always taken, the opcode byte is non-zero
565
566 doInx
567 ; inx -> moving right:
568 lda #%00000001
569 sta __auto_cpBit+1
570 lda #_LSR
571 sta __auto_shBit
572 lda #%00100000
573 sta __auto_ldBit+1
574 lda #_INY
575 sta __auto_yLo
576 ldx #$00
577 lda #_INC_ZP
578 endPatch
579 stx __auto_cpY+1 ; Y value that signals a wrap of the low address byte ($ff or $00)
580 sta __auto_yHi ; inc or dec of the pointer high byte on that wrap
581 ; setup X
582 ldx dx ; X = dx
583 stx __auto_dx1+1
584 stx __auto_dx2+1
585 ; setup current bit:
586 ldy _CurrentPixelX
587 lda _TableBit6Reverse,y ; 4
588 and #$7f
589 sta curBit
590 ; setup pointer and Y:
591 ; TODO: self-modifying code for the ptrs?
592 lda _TableDiv6,y
593 clc
594 adc tmp0
595 tay
596 lda #0
597 sta tmp0 ; pointer low byte is zero from here on, Y carries the whole low byte
598 bcc skipTmp0
599 inc tmp0+1
600 skipTmp0
601 ; calculate initial bresenham sum:
602 lda dy
603 lsr
604 sta lastSum
605 eor #$ff ; -DY/2
606 clc ; 2
607 bcc loopY ; 3
608 ; a = sum, y = tmp0, x = dX, tmp0 = 0
609 ;----------------------------------------------------------
610 incHiPtr ;
611 inc tmp0+1 ; 5
612 clc ; 2
613 bcc contHiPtr ; 3
614 ;----------------------------------------------------------
615 loopY
616 sta save_a ; 3
617 lda curBit ; 3 = 6
618 loopX
619 ; Draw the pixel
620 eor (tmp0),y ; 5
621 sta (tmp0),y ; 6 = 11
622 ; update the screen address:
623 tya ; 2
624 adc #ROW_SIZE ; 2
625 tay ; 2
626 bcs incHiPtr ; 2/13 ~15.6% taken
627 contHiPtr ; = 9.72 average
628 lda save_a ; 3
629 __auto_dx1
630 adc #00 ; 2 +DX
631 bcc loopY ; 2/3= 7/8 ~50% taken
632 ; Time to step in x
633 __auto_dy
634 sbc #00 ; 2 -DY
635 sta save_a ; 3 = 5
636
637 lda curBit ; 3
638 __auto_cpBit ; TODO: optimize
639 cmp #%00100000 ; 2 %00100000/%00000001
640 beq nextColumn ; 2/14.07 ~16.7% taken
641 __auto_shBit
642 asl ; 2 asl/lsr, clears carry
643 contNextColumn
644 sta curBit ; 3 =~13.68
645 ; step in x:
646 dex ; 2 At the endpoint yet?
647 bne loopX ; 2/3= 4/5 (A still holds curBit here)
648
649 ; x ,y++: 34.72 (50%)
650 ; x++,y++: 51.40 (50%)
651 ; average: 43.06
652
653 ; draw the last vertical line segment:
654 lda save_a ; 3
655 adc lastSum ; 3
656 loopYEnd
657 tax ; 2 = 2 (X is free now and keeps the sum)
658 ; Draw the pixel
659 lda curBit ; 3
660 eor (tmp0),y ; 5
661 sta (tmp0),y ; 6 = 14
662 ; update the screen address:
663 tya ; 2
664 adc #ROW_SIZE ; 2
665 tay ; 2
666 bcs incHiPtrEnd ; 2/13 ~15.6% taken
667 contHiPtrEnd ; = 9.72 average
668 txa ; 2
669 __auto_dx2
670 adc #00 ; 2 +DX
671 bcc loopYEnd ; 2/3= 6/7 ~25% taken
672 rts
673 ;----------------------------------------------------------
674 nextColumn
675 __auto_ldBit
676 lda #%00000001 ; 2 %00000001/%00100000
677 __auto_yLo
678 dey ; 2 dey/iny
679 __auto_cpY
680 cpy #$ff ; 2 $ff/$00
681 clc ; 2 TODO: optimize
682 bne contNextColumn ; 2/3 ~99% taken
683 __auto_yHi
684 dec tmp0+1 ; 5 dec/inc
685 bcc contNextColumn ; 3
686
687 incHiPtrEnd ; 9
688 inc tmp0+1 ; 5
689 clc ; 2
690 bcc contHiPtrEnd ; 3
691 ;----------------------------------------------------------
692 .)
693
694 ; *** total timings: ***
695 ; draw_very_horizontal_8 (29.6%): 27.20
696 ; draw_mainly_horizontal_8 (20.4%): 41.61 <- corrected!
697 ; draw_mainly_vertical_8 (50.0%): 43.06
698 ;----------------------------------------
699 ; total average (100.0%): 38.07

  ViewVC Help
Powered by ViewVC 1.1.26