/[projet1]/public/oric/routines/rasterization/linebench/line8.s
Defence Force logotype

Contents of /public/oric/routines/rasterization/linebench/line8.s

Parent Directory | Revision Log


Revision 278 - (show annotations)
Sat Feb 13 16:30:31 2010 UTC (9 years, 9 months ago) by thrust26
File size: 17402 byte(s)
optimized totally_horizontal
1 ; History of linebench timings...
2 ;649
3 ;614 (replacing the update of tmp0)
4 ;607
5 ;588
6 ;583 after alignment
7 ;579
8 ;534 redid mainly_vertical
9 ;529 removed page penalty
10 ;517 final optimization at mainly_horizontal
11 ;501 chunking, initial version
12 ;482 optimized chunking (avg: 38.91 cycles)
13 ;473 final optimization for mainly_vertical (37.89 -> 38.34 corrected)
14 ;468 a weird stunt on mainly_horizontal (38.07)
15 ;467 minor very_horizontal optimization (37.88 -> 38.56 corrected)
16 ;463 self modifying pointer in mainly_horizontal (38.35)
17 ;459 self modifying pointer in mainly_vertical (37.99)
18 ;459 a little tweak to very_horizontal (37.94)
19 ;451 refactored to make x-direction always positive (37.07)
20
21 ; TODOs:
22 ; + chunking (-35)
23 ; - two separate branches instead of patching?
24 ; + countdown minor
25 ; x mainly_horizontal (won't work)
26 ; + mainly_vertical (-9)
27 ; o optimizing for space (-2 tables and one alignment page)
28 ; + optimize horizontal (merge with very_horizontal)
29 ; o optimize vertical
30 ; + correct branch taken percentages
31 ; + always draw left to right and patch y-direction (-8)
32 ; + switch between XOR and OR
33
34 .zero
35
36 ; *= tmp1
37
38 ;e .dsb 2 ; Error decision factor (slope) 2 bytes in zero page
39 ;i .dsb 1 ; Number of pixels to draw (iteration counter) 1 byte in zp
40 ;dx .dsb 1 ; Width
41 ;dy .dsb 1 ; Height
42 ;_CurrentPixelX .dsb 1
43 ;_CurrentPixelY .dsb 1
44 ;_OtherPixelX .dsb 1
45 ;_OtherPixelY .dsb 1
; NOTE(review) the declarations above are commented out, yet dx, dy and the
; four pixel coordinates are used below - presumably declared in another file.
46
; Zero page scratch bytes used by the line routines in this file
47 save_a .dsb 1 ; holds the Bresenham sum while A is needed for pixel data
48 curBit .dsb 1 ; pixel mask of the current x position (draw_mainly_vertical_8)
49 chunk .dsb 1 ; pixel bits still to be written to the current screen byte (draw_very_horizontal_8)
50 lastSum .dsb 1 ; dx/2 or dy/2, added back to the sum before the last line segment
51
52
; OPP is the instruction that merges the pixel mask with screen memory.
; EOR is active, the ORA variant is kept as a commented alternative.
53 ;#define OPP ORA
54 #define OPP EOR
55
; Screen geometry. ROW_SIZE expands to X_SIZE/BYTE_PIXEL (240/6), the number
; of bytes added to the screen address to move by one row.
56 #define BYTE_PIXEL 6
57 #define X_SIZE 240
58 #define ROW_SIZE X_SIZE/BYTE_PIXEL
59
; 6502 opcode bytes. They are stored into the instruction stream by the
; routines below to select the y direction (self modifying code).
60 #define _INY $c8
61 #define _DEY $88
62 #define _INC_ZP $e6
63 #define _DEC_ZP $c6
64 #define _INC_ABS $ee
65 #define _DEC_ABS $ce
66 #define _ADC_IMM $69
67 #define _SBC_IMM $e9
68 #define _BCC $90
69 #define _BCS $b0
70 #define _CLC $18
71 #define _SEC $38
72
73
74 .text
75
76 ; .dsb 256-(*&255)
77
78 ;**********************************************************
79 ; _DrawLine8 - entry point, computes the deltas and jumps to a specialised routine
80 ; Expects the following variables to be set when called:
81 ; _CurrentPixelX
82 ; _CurrentPixelY
83 ; _OtherPixelX
84 ; _OtherPixelY
85 ; Side effects (all visible below)
;   - the two end points are swapped in place when _CurrentPixelX > _OtherPixelX
;   - dx, dy and tmp0 are overwritten, A, X and Y are clobbered
; State handed to the specialised routines
;   A = dy (draw_mainly_horizontal_8) or dx (the vertical routines), Y = dy,
;   X = #_INY when _CurrentPixelY < _OtherPixelY, otherwise #_DEY,
;   tmp0 = screen address of row _CurrentPixelY
86 _DrawLine8
87 ;
88 ; compute deltas and signs
89 ;
90 .(
91 ; test X value
92 sec
93 lda _CurrentPixelX
94 sbc _OtherPixelX ; A = x1-x2, C = 1 when x1 >= x2
95 bcc cur_smaller ; x1 < x2, already left to right
96 beq end ; x1 == x2, dx = 0 and no swap
97
; x1 > x2, exchange the end points so that x always increases.
; ldx/ldy/stx/sty leave A and C untouched.
98 ldy _CurrentPixelX
99 ldx _OtherPixelX
100 sty _OtherPixelX
101 stx _CurrentPixelX
102
103 ldy _CurrentPixelY
104 ldx _OtherPixelY
105 sty _OtherPixelY
106 stx _CurrentPixelY
107
108 bcs end ; C still set from the sbc, always taken
109
110 cur_smaller ; x1<x2
111 ; absolute value (C = 0 here, so eor #$ff / adc #1 negates A)
112 eor #$ff
113 adc #1
114 end
115 sta dx
116 .)
117 ;
118 ; initialise screen pointer
119 ;
120 ldy _CurrentPixelY
121 lda _HiresAddrLow,y ; 4
122 sta tmp0+0 ; 3
123 lda _HiresAddrHigh,y ; 4
124 sta tmp0+1 ; 3 => Total 14 cycles
125 .(
126 ; test Y value
127 sec
128 lda _CurrentPixelY
129 sbc _OtherPixelY ; A = y1-y2, C = 1 when y1 >= y2
130 ; beq horizontal
131 ldx #_DEY ; assume y1 >= y2 (ldx does not change C)
132 bcs cur_bigger
133
134 cur_smaller ; y1<y2
135 ; absolute value (C = 0 here)
136 eor #$ff
137 adc #1
138
139 ldx #_INY
140 cur_bigger ; y1>=y2, or fall through with A = y2-y1
141 sta dy
142 .)
143 tay ; Y = dy, A keeps dy for the compare at alignIt
144 jmp alignIt ; jump over the padding bytes
145
146 ;horizontal
147 ; jmp draw_totally_horizontal_8
148
149 .dsb 256-(*&255)
150
151 alignIt
152 ; Compute slope and call the specialized code for mostly horizontal or vertical lines
153 cmp dx ; A = dy
154 bcc draw_mainly_horizontal_8 ; dy < dx
155 lda dx
156 beq draw_totaly_vertical_8 ; dx == 0
157 jmp draw_mainly_vertical_8 ; dy >= dx and dx != 0, A = dx
158
159 ;**********************************************************
; draw_totaly_vertical_8 - plot a vertical line (dx = 0) of dy+1 pixels.
; In   X = #_INY when _CurrentPixelY < _OtherPixelY, otherwise #_DEY
;      tmp0 = screen address of row _CurrentPixelY, dy = |y1-y2|
; Out  A, X, Y and tmp0 are clobbered
; The loop only ever adds ROW_SIZE to the screen address, so it has to start
; on the end point with the smaller y.
160 draw_totaly_vertical_8
161 .(
162 cpx #_INY ; Z = 1 for #_INY, C = 0 for #_DEY because _DEY < _INY
163 beq doIny
164
165 ; dey -> _CurrentPixelY >= _OtherPixelY, the line starts at _OtherPixelY
; Fix - no end point swap happens when dx = 0, so tmp0 still addressed row
; _CurrentPixelY here and the loop drew dy+1 rows from the wrong end point.
; Re-aim tmp0 at row _OtherPixelY. ldy/lda/sta do not touch C, so the bcc
; below is still always taken.
    ldy _OtherPixelY
    lda _HiresAddrLow,y
    sta tmp0+0
    lda _HiresAddrHigh,y
    sta tmp0+1
166 ldx _OtherPixelX
167 bcc endPatch
168
169 doIny
170 ; iny -> _CurrentPixelY < _OtherPixelY, tmp0 already addresses the start row
171 clc ; the cpx left C = 1, the adc in the loop needs C = 0
172 ldx _CurrentPixelX
173 endPatch
174 ldy _TableDiv6,x ; Y = byte offset of column x inside the row
175 lda _TableBit6Reverse,x ; 4
176 sta _mask_patch+1 ; store the pixel mask as immediate operand of the lda below
177 ldx dy
178 inx ; dy+1 pixels, both end points included
179
180 loop
181 _mask_patch
182 lda #0 ; 2
183 OPP (tmp0),y ; 5*
184 sta (tmp0),y ; 6*= 13**
185
186 ; update the screen address:
187 tya ; 2
188 adc #ROW_SIZE ; 2
189 tay ; 2
190 bcc skip ; 2/3 84.4% taken
191 inc tmp0+1 ; 5
192 clc ; 2
193 skip ; = 9.94
194 dex ; 2
195 bne loop ; 2/3=4/5
196 rts
197 ; average: 27.94
198 .)
199
200 ;**********************************************************
; draw_mainly_horizontal_8 - lines with dy < dx.
; Lines with (dx >> 1) >= dy are handed over to draw_very_horizontal_8, the
; rest is drawn here one pixel per x step. The y direction is selected by
; patching opcodes and operands of the loop (self modifying code).
201 draw_mainly_horizontal_8
202 .(
203 ; A = DY (reloaded with DX right away), Y = DY, X = opcode (#_INY or #_DEY)
204 lda dx
205 lsr
206 cmp dy
207 bcc contMainly ; dx/2 < dy, handle it here
208 jmp draw_very_horizontal_8
209
210 contMainly
211
212 ; all this stress to be able to use dex, beq :)
213 cpx #_INY
214 beq doIny
215
216 ; dey -> y decreasing, the loop runs with C set
217 dey
218 sty _patch_dy+1 ; operand = dy-1, adc with C = 1 then adds dy
219
220 lda #<(loopX-_patch_loop-2) ; branch offset that lands on loopX itself (the sec)
221 sta _patch_loop+1
222
223 lda #_SBC_IMM
224 sta _patch_adc
225 lda #ROW_SIZE-1 ; sbc with C = 0 then subtracts ROW_SIZE
226 sta _patch_adc+1
227 lda #_DEC_ABS
228 ldx #_BCS
229 ldy #_SEC
230 bne endPatch ; Z = 0 after loading the non zero _SEC, always taken
231
232 doIny
; iny -> y increasing, the loop runs with C clear
233 sty _patch_dy+1
234
235 lda #<(loopX-_patch_loop-1) ; branch offset that lands one byte behind loopX, skipping the sec
236 sta _patch_loop+1
237
238 lda #_ADC_IMM
239 sta _patch_adc
240 lda #ROW_SIZE
241 sta _patch_adc+1
242 lda #_INC_ABS
243 ldx #_BCC
244 ldy #_CLC
245 endPatch
; A = inc/dec opcode, X = branch opcode, Y = clc/sec opcode
246 sta _patch_inc1
247 sta _patch_inc2
248 stx _patch_bcc
249 sty _patch_clc1
250 sty _patch_clc2
251
; Overwrite the low address byte of both table reads with X_SIZE-1-_OtherPixelX.
; NOTE(review) this presumably relies on where _TableBit6 is placed in memory
; and on its layout, neither is visible in this file - confirm.
252 lda #X_SIZE-1
253 sec
254 sbc _OtherPixelX
255 sta _patch_bit1+1
256 sta _patch_bit2+1
257
; Y = low byte of (tmp0 + _TableDiv6[x]), the high byte goes into the operands
; of the two absolute,y accesses (their low byte stays $00)
258 ldx _CurrentPixelX
259 lda _TableDiv6,x
260 clc
261 adc tmp0
262 tay
263 lda tmp0+1
264 adc #0
265 sta _patch_ptr0+2
266 sta _patch_ptr1+2
267
268 lda dx
269 tax
270 inx ; 2 +1 since we count to 0
271 sta _patch_dx+1
272 lsr
273 eor #$ff ; initial sum = $ff - dx/2
274 _patch_clc1
275 clc
276
277 sta save_a ; 3 = 3
278 _patch_bit1
279 lda _TableBit6-1,x;4
280 and #$7f ; remove signal bit
281 bne contColumn ; presumably always taken (mask assumed non zero)
282
283 ; a = sum, x = dX+1
284 ;----------------------------------------------------------
285 loopX
286 sec ; 1 50% executed (y--)
287 sta save_a ; 3 = 4
288 loopY
289 _patch_bit2
290 lda _TableBit6-1,x ; 4
291 bmi nextColumn ; 2/10.05 16.7% taken
292 contColumn
293 _patch_ptr0
294 OPP $a000,y ; 4
295 _patch_ptr1
296 sta $a000,y ; 5 = 16.34
297
298 dex ; 2 Step in x
299 beq exitLoop ; 2/3 At the endpoint yet?
300 lda save_a ; 3
301 _patch_dy
302 adc #00 ; 2 +DY
303 _patch_loop
304 bcc loopX ; 2/3=11/12 ~28.0% taken (not 50% due to special code for very horizontal lines)
305 ; Time to step in y
306 _patch_dx
307 sbc #00 ; 2 -DX
308 sta save_a ; 3 = 5
309
310 ; update the screen address:
311 tya ; 2
312 _patch_adc
313 adc #ROW_SIZE ; 2
314 tay ; 2
315 _patch_bcc
316 bcc loopY ; 2/3= 8/9 ~84.4% taken
317 _patch_inc1
318 inc _patch_ptr0+2 ; 6
319 _patch_inc2
320 inc _patch_ptr1+2 ; 6
321 _patch_clc2
322 clc ; 2
323 bne loopY ; 3 = 17
324 ; average: 11.50
325
326 exitLoop
327 rts
328
329 nextColumn
330 and #$7f ; 2 remove signal bit
331 iny ; 2
332 bne contColumn ; 2/3= 6/7 99% taken
333 inc _patch_ptr0+2 ; 6
334 inc _patch_ptr1+2 ; 6
335 bne contColumn ; 3 = 15
336
337 ; Timings:
338 ; x++/y : 32.34 (28.0%)
339 ; x++/y++: 43.84 (72.0%)
340 ; average: 40.62
341 .)
342
343 ; .dsb 256-(*&255)
344 ;**********************************************************
; draw_very_horizontal_8 - lines with (dx >> 1) >= dy.
; Pixels of one row that share a screen byte are collected in chunk and
; written with a single read-modify-write. The y direction is selected by
; patching the loop (self modifying code). Also contains the entry
; draw_totally_horizontal_8 for dy = 0 and the mask table Pot2PTbl.
345 draw_very_horizontal_8
346 .(
347 ; dX/2 >= dY, here we use "chunking"
348 ; here we have DY in Y, and the OPCODE (iny, dey) in X
349 sty _patch_dy0+1
350 sty _patch_dy1+1
351 sty _patch_dy2+1
352 cpx #_INY
353 php ; keep Z (direction) until the plp below
354 ; setup pointer and Y:
355 ldx _CurrentPixelX
356 lda _TableDiv6,x
357 clc
358 adc tmp0
359 tay ; Y = low byte of the pixel address
360 lda #0
361 sta tmp0 ; low byte of tmp0 is zeroed, Y carries it from now on
362 bcc skipHi ; C still comes from the adc above
363 inc tmp0+1
364 skipHi
; A = 6*_TableDiv6[x] + BYTE_PIXEL - 1 - _CurrentPixelX, used as index into Pot2PTbl.
; NOTE(review) assumes the table entries are small enough that both asl leave
; C = 0 (the adc/sbc below have no clc/sec); if the table holds x/6 the index
; is 5-(x mod 6) - confirm against the table definition.
365 lda _TableDiv6,x
366 asl
367 adc _TableDiv6,x
368 asl
369 ; clc
370 adc #BYTE_PIXEL;-1
371 ; sec
372 sbc _CurrentPixelX
373 tax
374 lda Pot2PTbl,x
375 sta chunk
376
377 ; patch the code:
378 plp
379 beq doIny
380 ; no y-direction?
381 lda dy
382 beq draw_totally_horizontal_8
383 ; negative y-direction
384 dec _patch_dy0+1 ; operand dy-1, presumably because C is set on that path in this mode
385
386 lda #_SBC_IMM
387 sta _patch_adc
388 lda #ROW_SIZE-1
389 sta _patch_adc+1
390 lda #_BCS
391 sta _patch_bcc
392 lda #_DEC_ZP
393 sta _patch_inc
394 lda #_SEC
395 bne endPatch ; _SEC is non zero, always taken
396
397 doIny
398 ; positive y-direction
399 lda #_ADC_IMM
400 sta _patch_adc
401 lda #ROW_SIZE
402 sta _patch_adc+1
403 lda #_BCC
404 sta _patch_bcc
405 lda #_INC_ZP
406 sta _patch_inc
407 lda #_CLC
408 endPatch
409 sta _patch_clc
410
411 lda dx
412 sta _patch_dx+1
413 ; calculate initial bresenham sum
414 lsr
415 sta lastSum ; 3 this is used for the last line segment
416 eor #$ff ; = -dx/2
417 clc
418 bcc loopX ; always taken
419 ; a = sum, x = index computed above, y = ptr-offset
420
421 ;----------------------------------------------------------
; The current screen byte is complete. Write chunk, then continue in the
; next byte with all six pixel bits pending.
422 nextColumnC ;
423 clc ; 2 = 2
424 nextColumn ;
425 tax ; 2
426 lda chunk ; 3
427 OPP (tmp0),y ; 5
428 sta (tmp0),y ; 6
429 lda #%00111111 ; 2
430 sta chunk ; 3
431 txa ; 2
432 ldx #BYTE_PIXEL-1 ; 2
433 iny ; 2 next column
434 bne contColumn ; 2/3=29/30 99% taken
435 inc tmp0+1 ; 5 dec/inc
436 bne contColumn ; 3 = 8
437 ; average: 30.03
438 ;----------------------------------------------------------
; Entry for dy = 0. The step operand becomes 1 and the sum starts at $ff-dx,
; so the adc at _patch_dy2 carries out after dx+1 steps.
439 draw_totally_horizontal_8
440 lda #1
441 sta _patch_dy2+1
442 lda dx
443 eor #$ff ; = -dx
444 clc
445 bcc loopXEnd ; always taken
446 ;----------------------------------------------------------
447 loopY
448 lda save_a ; 3
449 dec dy ; 5 all but one vertical segments drawn?
450 beq exitLoop ; 2/3=10/11 yes, exit loop
451 dex ; 2
452 bmi nextColumnC ; 2/38.03 ~16.7% taken (this will continue below)
453 _patch_dy0
454 adc #00 ; 2 = 12.01 +DY, no check necessary here!
455 loopX
456 dex ; 2
457 bmi nextColumn ; 2/33.03 ~16.7% taken
458 contColumn ; = 9.17
459 _patch_dy1
460 adc #00 ; 2 +DY
461 bcc loopX ; 2/3= 4/5 ~76.4% taken
462 ; Time to step in y
463 _patch_dx
464 sbc #00 ; 2 -DX
465 sta save_a ; 3 = 5
466
467 ; plot the last bits of current segment:
468 lda Pot2PTbl,x ; 4
469 eor chunk ; 3
470 OPP (tmp0),y ; 5
471 sta (tmp0),y ; 6
472 lda Pot2PTbl,x ; 4
473 sta chunk ; 3 = 25
474
475 ; update the screen address:
476 tya ; 2
477 _patch_adc
478 adc #ROW_SIZE ; 2
479 tay ; 2
480 _patch_bcc
481 bcc loopY ; 2/3= 8/9 ~84.4% taken
482 _patch_inc
483 inc tmp0+1 ; 5
484 _patch_clc
485 clc ; 2
486 bne loopY ; 3 = 10
487 ; average: 10.40
488
489 ; Timings:
490 ; x++/y : 14.17 (76.4%)
491 ; x++/y++: 62.41 (23.6%)
492 ; average: 25.55
493 ;----------------------------------------------------------
494 exitLoop
495 ; draw the last horizontal line segment:
496 clc
497 adc lastSum ; 3
498 loopXEnd
499 dex ; 2
500 bmi nextColumnEnd ; 2/37.03 ~16.7% taken
501 contColumnEnd ; = 9.85
502 _patch_dy2
503 adc #00 ; 2 +DY
504 bcc loopXEnd ; 2/3= 4/5 ~38.2% taken
505
506 ; plot last chunk:
507 lda Pot2PTbl,x ; 4
508 eor chunk ; 3
509 OPP (tmp0),y ; 5
510 sta (tmp0),y ; 6 = 18
511 rts
512 ;----------------------------------------------------------
; Same as nextColumn, but returns into the final segment loop
513 nextColumnEnd ;
514 tax ; 2
515 lda chunk ; 3
516 OPP (tmp0),y ; 5
517 sta (tmp0),y ; 6
518 lda #%00111111 ; 2
519 sta chunk ; 3
520 txa ; 2
521 ldx #BYTE_PIXEL-1 ; 2
522 iny ; 2 next column
523 bne contColumnEnd ; 2/3=29/30 99% taken
524 inc tmp0+1 ; 5 dec/inc
525 bne contColumnEnd ; 3 = 8
526
; Entry n has the low n+1 bits set
527 Pot2PTbl
528 .byte %00000001, %00000011, %00000111, %00001111
529 .byte %00011111, %00111111
530 .)
531
532 .dsb 256-(*&255)
533
534 ;**********************************************************
535 ;
536 ; This code is used when the things are moving faster
537 ; vertically than horizontally
538 ;
539 ; dy>=dx and dx>0 (see the dispatch at alignIt)
540 ;
; The y direction is selected by patching opcodes and operands of the two
; loops below (self modifying code). A, X, Y, tmp0, save_a, curBit and
; lastSum are overwritten.
541 draw_mainly_vertical_8
542 ; A = DX, Y = DY, X = opcode
543 .(
544 ; setup bresenham values:
545 sty _patch_dy+1
546
547 ; setup direction:
548 cpx #_DEY ; which direction?
549 bne doIny
550 ; dey -> y decreasing
551 lda #_SBC_IMM
552 sta _patch_adc1
553 sta _patch_adc2
554 lda #ROW_SIZE-1 ; sbc with C = 0 subtracts ROW_SIZE
555 sta _patch_adc1+1
556 sta _patch_adc2+1
557 lda #_BCC
558 sta _patch_bcs1
559 sta _patch_bcs2
560 lda #_DEC_ZP
561 sta _patch_inc2
562 ldy #_SEC
563 ldx dx
564 dex ; operand dx-1, presumably because C is set at the patched adc in this mode
565 lda #_DEC_ABS
566 bne endPatch ; _DEC_ABS is non zero, always taken
567
568 doIny
569 ; iny -> y increasing
570 lda #_ADC_IMM
571 sta _patch_adc1
572 sta _patch_adc2
573 lda #ROW_SIZE
574 sta _patch_adc1+1
575 sta _patch_adc2+1
576 lda #_BCS
577 sta _patch_bcs1
578 sta _patch_bcs2
579 lda #_INC_ZP
580 sta _patch_inc2
581 ldy #_CLC
582 ldx dx
583 lda #_INC_ABS
584 endPatch
; A = inc/dec abs opcode, X = operand for the +DX additions, Y = clc/sec opcode
585 sta _patch_inc0
586 sta _patch_inc1
587 stx _patch_dx1+1
588 stx _patch_dx2+1
589 sty _patch_clc1
590 sty _patch_clc2
591
592 ; setup X
593 ldx dx ; X = dx
594 ; setup current bit:
595 ldy _CurrentPixelX
596 lda _TableBit6Reverse,y ; 4
597 sta curBit
598 ; setup pointer and Y:
599 lda _TableDiv6,y
600 clc
601 adc tmp0
602 tay ; Y = low byte of the pixel address
603 lda #0
604 sta tmp0 ; low byte of tmp0 is zeroed, C from the adc is untouched
605 lda tmp0+1
606 adc #0
607 sta _patch_ptr0+2 ; high byte into both absolute,y operands
608 sta _patch_ptr1+2
609 ; calculate initial bresenham sum:
610 lda dy
611 lsr
612 sta lastSum
613 eor #$ff ; -DY/2
614 clc ; 2
615 bcc loopY ; 3
616 ; a = sum, y = tmp0, x = dX, tmp0 = 0
617 ;----------------------------------------------------------
618 incHiPtr ;
619 _patch_inc0
620 inc _patch_ptr0+2 ; 6
621 _patch_inc1
622 inc _patch_ptr1+2 ; 6
623 _patch_clc1
624 clc ; 2
625 bne contHiPtr ; 3 = 17
626 ;----------------------------------------------------------
627 loopY
628 sta save_a ; 3
629 lda curBit ; 3 = 6
630 loopX
631 ; Draw the pixel
632 _patch_ptr0
633 OPP $a000,y ; 4
634 _patch_ptr1
635 sta $a000,y ; 5 = 9
636 ; update the screen address:
637 tya ; 2
638 _patch_adc1
639 adc #ROW_SIZE ; 2
640 tay ; 2
641 _patch_bcs1
642 bcs incHiPtr ; 2/20 ~15.6% taken
643 contHiPtr ; = 10.81 average
644 lda save_a ; 3
645 _patch_dx1
646 adc #00 ; 2 +DX
647 bcc loopY ; 2/3= 7/8 ~41.4% taken
648 ; Time to step in x
649 _patch_dy
650 sbc #00 ; 2 -DY
651 sta save_a ; 3 = 5
652
653 lda curBit ; 3
654 lsr ; 2
655 beq nextColumn ; 2/12.05 ~16.7% taken
656 contNextColumn
657 sta curBit ; 3 =~11.68
658 ; step in x:
659 dex ; 2 At the endpoint yet?
660 bne loopX ; 2/3= 4/5
661
662 ; x ,y++: 33.81 (41.4%)
663 ; x++,y++: 48.49 (58.6%)
664 ; average: 42.41
665
666 ; draw the last vertical line segment:
667 ldx _patch_ptr0+2 ; 4
668 stx tmp0+1 ; 3
669 lda save_a ; 3
670 adc lastSum ; 3
671 loopYEnd
672 tax ; 2 = 2
673 ; Draw the pixel
674 lda curBit ; 3
675 OPP (tmp0),y ; 5
676 sta (tmp0),y ; 6 = 14
677 ; update the screen address:
678 tya ; 2
679 _patch_adc2
680 adc #ROW_SIZE ; 2
681 tay ; 2
682 _patch_bcs2
683 bcs incHiPtrEnd ; 2/13 ~15.6% taken
684 contHiPtrEnd ; = 9.72 average
685 txa ; 2
686 _patch_dx2
687 adc #00 ; 2 +DX
688 bcc loopYEnd ; 2/3= 6/7 ~20.7% taken
689 rts ; 6
690 ;----------------------------------------------------------
; The mask was shifted out of the byte. Restart with %00100000 in the next
; screen byte (the lsr moved a 1 into C, hence the clc).
691 nextColumn
692 clc ; 2
693 lda #%00100000 ; 2
694 iny ; 2
695 bne contNextColumn ; 2/3= 8/9 ~99% taken
696 inc _patch_ptr0+2 ; 6
697 inc _patch_ptr1+2 ; 6
698 bcc contNextColumn ; 3 = 15
699
700 incHiPtrEnd ; 9
701 _patch_inc2
702 inc tmp0+1 ; 5
703 _patch_clc2
704 clc ; 2
705 bne contHiPtrEnd ; 3
706 ;----------------------------------------------------------
707 .)
708
709 ; *** total timings: ***
710 ; draw_very_horizontal_8 (29.5%): 25.55 (was 25.73)
711 ; draw_mainly_horizontal_8 (20.5%): 40.62 (was 41.30)
712 ; draw_mainly_vertical_8 (50.0%): 42.41 (was 43.77)
713 ;----------------------------------------
714 ; total average (100.0%): 37.07 (was 37.94)

  ViewVC Help
Powered by ViewVC 1.1.26