/[projet1]/public/oric/routines/rasterization/linebench/line8.s
Defence Force logotype

Contents of /public/oric/routines/rasterization/linebench/line8.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 265 - (show annotations)
Mon Feb 8 17:33:45 2010 UTC (9 years, 7 months ago) by thrust26
File size: 15960 byte(s)
some comments and cycle calculation updates
1 ; History of linebench timings...
2 ;649
3 ;614 (replacing the update of tmp0)
4 ;607
5 ;588
6 ;583 after alignment
7 ;579
8 ;534 redid mainly_vertical
9 ;529 removed page penalty
10 ;517 final optimization at mainly_horizontal
11 ;501 chunking, initial version
12 ;482 optimized chunking (avg: 38.91 cycles)
13 ;473 final optimization for mainly_vertical (37.89 -> 38.34 corrected)
14
15 ; TODOs:
16 ; + chunking (-35)
17 ; - two separate branches instead of patching?
18 ; + countdown minor
19 ; x mainly_horizontal (won't work)
20 ; + mainly_vertical (-9)
21
22 .zero
23
24 ; *= tmp1
25
; NOTE(review) - dx, dy and the four pixel coordinates are used by the code in
; this file but only appear commented out here, so they are presumably declared
; elsewhere (confirm).
26 ;e .dsb 2 ; Error decision factor (slope) 2 bytes in zero page
27 ;i .dsb 1 ; Number of pixels to draw (iteration counter) 1 byte in zp
28 ;dx .dsb 1 ; Width
29 ;dy .dsb 1 ; Height
30 ;_CurrentPixelX .dsb 1
31 ;_CurrentPixelY .dsb 1
32 ;_OtherPixelX .dsb 1
33 ;_OtherPixelY .dsb 1
34
; Scratch variables of the line routines
35 save_a .dsb 1 ; Bresenham sum, parked here while A is needed for plotting
36 curBit .dsb 1 ; Bit mask of the current pixel (draw_mainly_vertical_8)
37 chunk .dsb 1 ; Pixel bits collected for the current screen byte (draw_very_horizontal_8)
38 lastSum .dsb 1 ; dx/2 resp. dy/2, added to the sum before the last line segment
39
40
; Screen geometry. ROW_SIZE is the value the routines add to the screen address
; to step one row down. BYTE_PIXEL is used as the pixel count per screen byte,
; X_SIZE is presumably the screen width in pixels (confirm).
41 #define BYTE_PIXEL 6
42 #define X_SIZE 240
43 #define ROW_SIZE X_SIZE/BYTE_PIXEL
44
; 6502 opcode bytes. The routines store them into their own code
; (self-modifying code) to select the step direction at run time.
45 #define _NOP $ea
46 #define _INX $e8
47 #define _DEX $ca
48 #define _INY $c8
49 #define _DEY $88
50 #define _ASL $0a
51 #define _LSR $4a
52 #define _INC_ZP $e6
53 #define _DEC_ZP $c6
54
55
56 .text
57
58 .dsb 256-(*&255) ; pad with 256-(pc&255) bytes so that the code below starts on a page boundary
59
60 ;**********************************************************
; Draws a vertical line (dx = 0) by EOR-ing one pixel per row into the screen,
; dy+1 pixels in total.
; Reached by a branch from _DrawLine8, which has already stored the height in dy
; and set tmp0 to the address of the row _CurrentPixelY.
; The bit mask of the column is patched into the operand of the LDA at _mask_patch.
; Uses A, X and Y and changes tmp0+1 when the row offset in Y wraps.
61 draw_totaly_vertical_8
62 .(
63 ldx _CurrentPixelX
64 ldy _TableDiv6,x ; Y = byte offset within the row (presumably x/6, table not visible here)
65 lda _TableBit6Reverse,x ; 4
66 sta _mask_patch+1 ; patch the pixel mask into the lda # below
67 ldx dy
68 inx ; X = dy+1 = number of pixels to plot
69 clc ; 2 carry has to be clear for the adc in the loop
70 loop
71 _mask_patch
72 lda #0 ; 2 operand is replaced by the pixel mask
73 eor (tmp0),y ; 5*
74 sta (tmp0),y ; 6*= 13**
75
76 ; update the screen address:
77 .(
78 tya ; 2
79 adc #ROW_SIZE ; 2 next row, the offset is kept in Y
80 tay ; 2
81 bcc skip ; 2/3= 8/9
82 inc tmp0+1 ; 5 offset wrapped, move the pointer to the next page
83 clc ; 2 = 7 keep carry clear for the next adc
84 skip ;
85 .)
86 dex ; 2
87 bne loop ; 2/3=4/5
88 rts
89 .)
90
91
92 ;**********************************************************
93 ;
; Entry point of the line drawing. Computes dx and dy, sets up the screen
; pointer tmp0 and jumps to one of the specialized routines.
;
94 ; Expects the following variables to be set when called:
95 ; _CurrentPixelX
96 ; _CurrentPixelY
97 ; _OtherPixelX
98 ; _OtherPixelY
99 ;
; Side effects visible in this code
; - the two end points are swapped in place when _CurrentPixelY > _OtherPixelY
; - dx, dy and tmp0 are overwritten
; - the specialized routines are entered by branch/jmp, so their rts returns
;   directly to the caller of _DrawLine8
;
100 _DrawLine8
101 ;
102 ; Compute deltas and signs
103 ;
104
105 ; Test Y value
106 .(
107 sec
108 lda _CurrentPixelY
109 sbc _OtherPixelY
110 beq end ; same row, dy = 0 and no swap
111 bcc cur_smaller
112
113 cur_bigger ; y1>y2
114 ; Swap both end points (X and Y)
115 ; So we always draw from top to bottom
; A keeps y1-y2 during the swap, only X and Y are used for it
116 ldy _CurrentPixelY
117 ldx _OtherPixelY
118 sty _OtherPixelY
119 stx _CurrentPixelY
120
121 ldy _CurrentPixelX
122 ldx _OtherPixelX
123 sty _OtherPixelX
124 stx _CurrentPixelX
125
126 jmp end
127
128 cur_smaller ; y1<y2
129 ; Absolute value (carry is clear here, so eor #$ff / adc #1 negates A)
130 eor #$ff
131 adc #1
132 end
133 sta dy ; dy = |y1-y2|
134 .)
135
136 ;
137 ; Initialise screen pointer
138 ;
139 ldy _CurrentPixelY
140 lda _HiresAddrLow,y ; 4
141 sta tmp0+0 ; 3
142 lda _HiresAddrHigh,y ; 4
143 sta tmp0+1 ; 3 => Total 14 cycles
144
145 ; Test X value
146 .(
147 sec
148 lda _CurrentPixelX
149 sbc _OtherPixelX
150 sta dx
151 beq draw_totaly_vertical_8 ; dx = 0 (this also covers a single point)
152 bcc cur_smaller
153
154 cur_bigger ; x1>x2
155 lda #_DEX ; step to the left, dx already holds x1-x2
156 bne end ; always taken, _DEX is not zero
157
158 cur_smaller ; x1<x2
159 ; Absolute value (carry is clear here)
160 eor #$ff
161 adc #1
162 sta dx
163
164 lda #_INX ; step to the right
165 end
166 .)
167
168 ; jmp alignIt
169 ;
170 ; .dsb 256-(*&255)
171 ;
172 ;alignIt
173 ; Compute slope and jump to the specialized code for mostly horizontal or vertical lines
; At this point A = step opcode (_INX or _DEX), Y is loaded with dy
174 ldy dy
175 beq draw_totaly_horizontal_8
176 cpy dx
177 bcc draw_mainly_horizontal_8 ; dy < dx
178 jmp draw_mainly_vertical_8 ; dy >= dx
179
180 ;**********************************************************
; Draws a horizontal line (dy = 0) pixel by pixel, starting at _CurrentPixelX
; and stepping towards _OtherPixelX. The loop ends as soon as X equals
; _OtherPixelX, so the pixel at _OtherPixelX itself is not plotted.
; tmp0 has to point to the screen row (set up by _DrawLine8).
; Self-modifying, the step opcode and the end value are patched into the loop.
181 draw_totaly_horizontal_8
182 .(
183 ; here we have DY in Y, and the OPCODE in A
184 sta _outer_patch ; Patch the step instruction (dex or inx, as passed in A)
185
186 ldx _OtherPixelX
187 stx __auto_cpx+1 ; patch the end coordinate into the cpx # below
188
189 ldx _CurrentPixelX
190
191 ;
192 ; Draw loop
193 ;
194 outer_loop
195 ldy _TableDiv6,x ; byte offset of pixel X within the row
196 lda _TableBit6Reverse,x ; 4 bit mask of pixel X
197 eor (tmp0),y ; 5
198 sta (tmp0),y ; 6
199
200 _outer_patch
201 inx ; replaced by the opcode passed in A
202
203 __auto_cpx
204 cpx #00 ; At the endpoint yet? (operand patched above)
205 bne outer_loop
206 rts
207 .)
208
209 ;**********************************************************
; Bresenham line for dy < dx. Lines with dx/2 >= dy are handed over to
; draw_very_horizontal_8, the rest is drawn here, one pixel per iteration,
; dx+1 pixels in total.
; In - A = step opcode (_INX or _DEX), Y = dy, tmp0 = address of the first row.
; X counts down from dx+1 to 0. The table addresses of the two lookups in the
; loop are patched so that X can be used as the index in both directions.
210 draw_mainly_horizontal_8
211 .(
212 tax ; keep the step opcode in X
213 lda dx
214 lsr
215 cmp dy
216 bcs draw_very_horizontal_8 ; dx/2 >= dy
217
218 ; here we have DY in Y, and the OPCODE (inx, dex) in X
219 sty __auto_dy+1
220
221 ; all this stress to be able to use dex, beq :)
222 cpx #_INX
223 beq doInx
224
; moving left, carry is clear after the cpx since _DEX < _INX
; NOTE(review) - the "== 0" remarks suggest that the tables are placed so that
; these low bytes are zero. The second adc relies on the first one leaving
; carry clear. Confirm against the table definitions.
225 lda #<_TableDiv6-1 ; == 0
226 ; clc ; _DEX < _INX
227 adc _OtherPixelX
228 sta __auto_div6+1
229 lda #<_TableBit6Reverse-1 ; == 0
230 ; clc
231 adc _OtherPixelX
232
233 ldx #>_TableDiv6
234 ldy #>_TableBit6Reverse ;
235 bne endPatch ; taken as long as the table is not in page 0
236
237 doInx
; moving right, carry is set after the cpx (equal), the reversed tables are used
238 lda #X_SIZE-1
239 ; sec
240 sbc _OtherPixelX
241 sta __auto_div6+1
242 lda #X_SIZE-1
243 ; sec
244 sbc _OtherPixelX
245
246 ldx #>_TableDiv6Rev
247 ldy #>_TableBit6 ;
248 endPatch
; A = low byte for the bit table, X/Y = high bytes of the two tables
249 sta __auto_bit6+1
250 stx __auto_div6+2
251 sty __auto_bit6+2
252
253 lda dx
254 tax
255 inx ; 2 +1 since we count to 0
256 sta __auto_dx+1
257 lsr
258 eor #$ff ; initial sum = $ff - dx/2
259 clc
260 ; a = sum, x = dX+1
261 ;----------------------------------------------------------
262 loopX
263 sta save_a ; 3 = 3
264 loopY
265 ; Draw the pixel
266 __auto_div6
267 ldy _TableDiv6-1,x ; 4
268 __auto_bit6
269 lda _TableBit6Reverse-1,x;4
270 eor (tmp0),y ; 5*
271 sta (tmp0),y ; 6*= 19
272
273 dex ; 2 Step in x
274 beq exitLoop ; 2/3 At the endpoint yet?
275 lda save_a ; 3
276 __auto_dy
277 adc #00 ; 2 +DY
278 bcc loopX ; 2/3=11/12 ~33.3% taken (not 50% due to special code for very horizontal lines)
279 ; Time to step in y
280 __auto_dx
281 sbc #00 ; 2 -DX
282 sta save_a ; 3 = 5
283
284 ; update the screen address:
285 lda tmp0+0 ; 3
286 adc #ROW_SIZE ; 2
287 sta tmp0+0 ; 3
288 bcc loopY ; 2/3=10/11 ~84.4% taken
289 inc tmp0+1 ; 5
290 clc ; 2
291 bcc loopY ; 3 = 10
292 ; average: 12.40
293
294 exitLoop
295 rts
296 ; Timings:
297 ; x++/y : 34 (33.3%)
298 ; x++/y++: 47.40 (66.7%)
299 ; average: 42.94
300 .)
301
302 .dsb 256-(*&255) ; pad to the next page boundary
303 ;**********************************************************
; Line routine for dx/2 >= dy, entered from draw_mainly_horizontal_8.
; Instead of writing every pixel to the screen, the pixels of one screen byte
; are collected in chunk and written with a single EOR ("chunking").
; In - X = step opcode (_INX or _DEX), Y = dy, tmp0 = address of the first row.
; During the loops A = Bresenham sum, X = pixel counter within the current
; byte, Y = low byte of the screen address (tmp0+0 is set to 0).
; dy is used as the row counter and is decremented.
; Self-modifying, the direction dependent instructions and operands of both
; loops are patched before drawing.
304 draw_very_horizontal_8
305 .(
306 ; dX/2 >= dY, here we use "chunking"
307 ; here we have DY in Y, and the OPCODE (inx, dex) in X
308 sty __auto_dy+1
309 sty __auto_dy2+1
310 cpx #_INX
311 php ; keep the result of the compare for the beq below
312 ; setup pointer and Y:
313 ldx _CurrentPixelX
314 lda _TableDiv6,x
315 clc
316 adc tmp0
317 tay ; Y = low byte of the address of the first screen byte
318 bcc skipHi
319 inc tmp0+1
320 skipHi
321 lda #0
322 sta tmp0
323 ; patch the code:
324 plp
325 beq doInx
326 ; negative x-direction
327 lda _TableMod6,x
328 tax ; X = pixel counter for the first byte
329
330 lda #_DEY
331 sta __auto_stepx
332 sta __auto_stepx2
333 lda #$ff ; Y wraps to $ff when stepping down from 0
334 sta __auto_cpy+1
335 sta __auto_cpy2+1
336 lda #_DEC_ZP
337 sta __auto_yHi
338 sta __auto_yHi2
339 lda Pot2NTbl,x
340 sta chunk ; initial chunk, mask taken from Pot2NTbl
341 lda #<Pot2NTbl
342 bne endPatch ; NOTE(review) - relies on the low byte of Pot2NTbl not being 0
343
344 doInx
345 ; positive x-direction
346 lda #BYTE_PIXEL-1
347 ; sec (carry is set, plp restored the flags of the equal compare)
348 sbc _TableMod6,x
349 tax ; X = pixel counter for the first byte
350
351 lda #_INY
352 sta __auto_stepx
353 sta __auto_stepx2
354 lda #$00 ; Y wraps to 0 when stepping up from $ff
355 sta __auto_cpy+1
356 sta __auto_cpy2+1
357 lda #_INC_ZP
358 sta __auto_yHi
359 sta __auto_yHi2
360 lda Pot2PTbl,x
361 sta chunk ; initial chunk, mask taken from Pot2PTbl
362 lda #<Pot2PTbl
363 endPatch
; only the low byte of the mask table address is patched, so Pot2PTbl and
; Pot2NTbl presumably have to share one page (confirm)
364 sta __auto_pot1+1
365 sta __auto_pot2+1
366 sta __auto_pot3+1
367 lda dx
368 sta __auto_dx+1
369 ; calculate initial bresenham sum
370 lsr
371 sta lastSum ; 3 this is used for the last line segment
372 eor #$ff ; = -dx/2
373 clc
374 jmp loopX
375 ; a = sum, x = pixel counter within the current byte, y = ptr-offset
376
377 ;----------------------------------------------------------
; The current screen byte is complete. Write the chunk, start a new full chunk
; and step to the neighbouring byte. The sum is kept in X while A is used.
378 nextColumn ;
379 tax ; 2
380 lda chunk ; 3
381 eor (tmp0),y ; 5
382 sta (tmp0),y ; 6
383 lda #%00111111 ; 2
384 sta chunk ; 3
385 txa ; 2
386 ldx #BYTE_PIXEL-1 ; 2
387 __auto_stepx
388 iny ; 2 next column (patched to iny/dey)
389 __auto_cpy
390 cpy #00 ; 2 (operand patched to $00/$ff)
391 clc ; 2
392 bne contColumn ; 2/3=33/34 99% taken
393 __auto_yHi
394 inc tmp0+1 ; 5 dec/inc
395 bcc contColumn ; 3 = 8
396 ;----------------------------------------------------------
397 loopY
398 dec dy ; 5 all but one vertical segments drawn?
399 beq exitLoop ; 2/3= 7/8 yes, exit loop
400 loopX
401 dex ; 2
402 bmi nextColumn ; 2/37.03 ~16.7% taken
403 contColumn ; = 9.85
404 __auto_dy
405 adc #00 ; 2 +DY
406 bcc loopX ; 2/3= 4/5 ~75% taken
407 ; Time to step in y
408 __auto_dx
409 sbc #00 ; 2 -DX
410 sta save_a ; 3 = 5
411
412 ; plot the last bits of current row:
413 __auto_pot1
414 lda Pot2PTbl,x ; 4
415 eor chunk ; 3
416 eor (tmp0),y ; 5
417 sta (tmp0),y ; 6
418 __auto_pot2
419 lda Pot2PTbl,x ; 4
420 sta chunk ; 3 = 25
421
422 ; update the screen address:
423 tya ; 2
424 adc #ROW_SIZE ; 2
425 tay ; 2
426 lda save_a ; 3
427 bcc loopY ; 2/3=11/12 ~84.4% taken
428 inc tmp0+1 ; 5
429 clc ; 2
430 bcc loopY ; 3 = 10
431 ; average: 13.40
432
433 ; Timings:
434 ; x++/y : 14.85 (75%)
435 ; x++/y++: 64.25 (25%)
436 ; average: 27.20
437 ;----------------------------------------------------------
438 exitLoop
439 ; draw the last horizontal line segment:
440 adc lastSum ; 3
441 loopXEnd
442 dex ; 2
443 bmi nextColumnEnd ; 2/37.03 ~16.7% taken
444 contColumnEnd ; = 9.85
445 __auto_dy2
446 adc #00 ; 2 +DY
447 bcc loopXEnd ; 2/3= 4/5 ~50% taken
448
449 ; plot last chunk:
450 __auto_pot3
451 lda Pot2PTbl,x ; 4
452 eor chunk ; 3
453 eor (tmp0),y ; 5
454 sta (tmp0),y ; 6 = 18
455 rts
456 ;----------------------------------------------------------
; Same as nextColumn, but continues in the loop of the last segment
457 nextColumnEnd ;
458 tax ; 2
459 lda chunk ; 3
460 eor (tmp0),y ; 5
461 sta (tmp0),y ; 6
462 lda #%00111111 ; 2
463 sta chunk ; 3
464 txa ; 2
465 ldx #BYTE_PIXEL-1 ; 2
466 __auto_stepx2
467 iny ; 2 next column (patched to iny/dey)
468 __auto_cpy2
469 cpy #00 ; 2 (operand patched to $00/$ff)
470 clc ; 2
471 bne contColumnEnd ; 2/3=33/34 99% taken
472 __auto_yHi2
473 inc tmp0+1 ; 5 dec/inc
474 bcc contColumnEnd ; 3 = 8
475
; Bit masks indexed by the pixel counter in X. Pot2PTbl is selected for the
; positive, Pot2NTbl for the negative x-direction.
476 Pot2PTbl
477 .byte %00000001, %00000011, %00000111, %00001111
478 .byte %00011111, %00111111
479 Pot2NTbl
480 .byte %00100000, %00110000
481 .byte %00111000, %00111100, %00111110, %00111111
482 .)
483
484 .dsb 256-(*&255) ; pad to the next page boundary
485 ;**********************************************************
486 ;
487 ; This code is used when the things are moving faster
488 ; vertically than horizontally
489 ;
490 ; dy>=dx (_DrawLine8 jumps here whenever dy is not smaller than dx)
491 ;
; In - A = step opcode (_INX or _DEX), Y = dy, tmp0 = address of the first row.
; During the loops X = remaining steps in x, Y = low byte of the screen
; address (tmp0+0 is set to 0), curBit = bit mask of the current pixel and
; save_a = Bresenham sum.
; Self-modifying, the direction dependent instructions and operands are
; patched before drawing.
492 draw_mainly_vertical_8
493 ; here we have DY in Y, and the OPCODE in A
494 .(
495 ; setup bresenham values:
496 sty __auto_dy+1
497
498 ; setup direction:
499 cmp #_DEX ; which direction?
500 bne doInx
501 ; dex -> moving left:
502 lda #%00100000 ; last bit of a byte when shifting left
503 sta __auto_cpBit+1
504 lda #_ASL ;
505 sta __auto_shBit
506 lda #%00000001 ; first bit of the next byte
507 sta __auto_ldBit+1
508 lda #_DEY
509 sta __auto_yLo
510 ldx #$ff ; Y wraps to $ff when stepping down from 0
511 lda #_DEC_ZP
512 bne endPatch ; always taken, _DEC_ZP is not zero
513
514 doInx
515 ; inx -> moving right:
516 lda #%00000001 ; last bit of a byte when shifting right
517 sta __auto_cpBit+1
518 lda #_LSR
519 sta __auto_shBit
520 lda #%00100000 ; first bit of the next byte
521 sta __auto_ldBit+1
522 lda #_INY
523 sta __auto_yLo
524 ldx #$00 ; Y wraps to 0 when stepping up from $ff
525 lda #_INC_ZP
526 endPatch
527 stx __auto_cpY+1
528 sta __auto_yHi
529 ; setup X
530 ldx dx ; X = dx
531 stx __auto_dx1+1
532 stx __auto_dx2+1
533 ; setup current bit:
534 ldy _CurrentPixelX
535 lda _TableBit6Reverse,y ; 4
536 sta curBit
537 ; setup pointer and Y:
538 ; TODO: self-modifying code for the ptrs?
539 lda _TableDiv6,y
540 clc
541 adc tmp0
542 tay
543 lda #0
544 sta tmp0
545 bcc skipTmp0
546 inc tmp0+1
547 skipTmp0
548 ; calculate initial bresenham sum:
549 lda dy
550 lsr
551 sta lastSum ; used for the last line segment
552 eor #$ff ; -DY/2
553 clc ; 2
554 bcc loopY ; 3
555 ; a = sum, y = tmp0, x = dX, tmp0 = 0
556 ;----------------------------------------------------------
; Y wrapped while stepping one row down, move the pointer to the next page
557 incHiPtr ;
558 inc tmp0+1 ; 5
559 clc ; 2
560 bcc contHiPtr ; 3
561 ;----------------------------------------------------------
562 loopY
563 sta save_a ; 3
564 lda curBit ; 3 = 6
565 loopX
566 ; Draw the pixel
567 eor (tmp0),y ; 5
568 sta (tmp0),y ; 6 = 11
569 ; update the screen address:
570 tya ; 2
571 adc #ROW_SIZE ; 2
572 tay ; 2
573 bcs incHiPtr ; 2/13 ~15.6% taken
574 contHiPtr ; = 9.72 average
575 lda save_a ; 3
576 __auto_dx1
577 adc #00 ; 2 +DX
578 bcc loopY ; 2/3= 7/8 ~50% taken
579 ; Time to step in x
580 __auto_dy
581 sbc #00 ; 2 -DY
582 sta save_a ; 3 = 5
583
584 lda curBit ; 3
585 __auto_cpBit ; TODO: optimize
586 cmp #%00100000 ; 2 %00100000/%00000001
587 beq nextColumn ; 2/14.07 ~16.7% taken
588 __auto_shBit
589 asl ; 2 asl/lsr, clears carry
590 contNextColumn
591 sta curBit ; 3 =~13.68
592 ; step in x:
593 dex ; 2 At the endpoint yet?
594 bne loopX ; 2/3= 4/5
595
596 ; x ,y++: 34.72 (50%)
597 ; x++,y++: 51.40 (50%)
598 ; average: 43.06
599
600 ; draw the last vertical line segment:
601 lda save_a ; 3
602 adc lastSum ; 3
603 loopYEnd
604 tax ; 2 = 2 the sum is kept in X in this loop
605 ; Draw the pixel
606 lda curBit ; 3
607 eor (tmp0),y ; 5
608 sta (tmp0),y ; 6 = 14
609 ; update the screen address:
610 tya ; 2
611 adc #ROW_SIZE ; 2
612 tay ; 2
613 bcs incHiPtrEnd ; 2/13 ~15.6% taken
614 contHiPtrEnd ; = 9.72 average
615 txa ; 2
616 __auto_dx2
617 adc #00 ; 2 +DX
618 bcc loopYEnd ; 2/3= 6/7 ~25% taken
619 rts
620 ;----------------------------------------------------------
; The bit left the current byte. Load the first bit of the neighbouring byte
; and step Y (and tmp0+1 when Y wraps).
621 nextColumn
622 __auto_ldBit
623 lda #%00000001 ; 2 %00000001/%00100000
624 __auto_yLo
625 dey ; 2 dey/iny
626 __auto_cpY
627 cpy #$ff ; 2 $ff/$00
628 clc ; 2 TODO: optimize
629 bne contNextColumn ; 2/3 ~99% taken
630 __auto_yHi
631 dec tmp0+1 ; 5 dec/inc
632 bcc contNextColumn ; 3
633
; Same as incHiPtr, but continues in the loop of the last segment
634 incHiPtrEnd ; 9
635 inc tmp0+1 ; 5
636 clc ; 2
637 bcc contHiPtrEnd ; 3
638 ;----------------------------------------------------------
639 .)
640
641 ; *** total timings: ***
642 ; draw_very_horizontal_8 (29.6%): 27.20
643 ; draw_mainly_horizontal_8 (20.4%): 42.94 <- corrected!
644 ; draw_mainly_vertical_8 (50.0%): 43.06
645 ;----------------------------------------
646 ; total average (100.0%): 38.34
647
648
649

  ViewVC Help
Powered by ViewVC 1.1.26