/[projet1]/public/oric/routines/rasterization/linebench/line8.s
Defence Force logotype

Contents of /public/oric/routines/rasterization/linebench/line8.s

Parent Directory Parent Directory | Revision Log Revision Log


Revision 261 - (show annotations)
Mon Feb 8 14:01:57 2010 UTC (9 years, 7 months ago) by thrust26
File size: 14347 byte(s)
optimized chunking, 482
1 ; History of linebench timings...
2 ;649
3 ;614 (replacing the update of tmp0)
4 ;607
5 ;588
6 ;583 after alignment
7 ;579
8 ;534 redid mainly_vertical
9 ;529 removed page penalty
10 ;517 final optimization at mainly_horizontal
11 ;501 chunking, initial version
12 ;482 optimized chunking (avg: 38.91 cycles)
13
14
15 ; TODOs:
16 ; + chunking
17 ; - two separate branches instead of patching?
18 ; - countdown minor
19 ; - mainly horizontal
20 ; - mainly vertical
21
22 .zero
23 ; Zero page work variables used by the line drawing routines in this file.
24 ; *= tmp1
25 ; The declarations below are disabled here; dx, dy and the pixel coordinates are still used by the code, so they are presumably declared elsewhere (TODO confirm).
26 ;e .dsb 2 ; Error decision factor (slope) 2 bytes in zero page
27 ;i .dsb 1 ; Number of pixels to draw (iteration counter) 1 byte in zp
28 ;dx .dsb 1 ; Width
29 ;dy .dsb 1 ; Height
30 ;_CurrentPixelX .dsb 1
31 ;_CurrentPixelY .dsb 1
32 ;_OtherPixelX .dsb 1
33 ;_OtherPixelY .dsb 1
34 ; Variables declared by this file:
35 save_a .dsb 1 ; holds the running Bresenham sum while A is busy with pixel or address work
36 curBit .dsb 1 ; bit mask of the current pixel (draw_mainly_vertical_8)
37 chunk .dsb 1 ; mask of pixels collected for the current screen byte (draw_very_horizontal_8)
38 lastSum .dsb 1 ; dx/2, added to the sum before the last segment of draw_very_horizontal_8
39 ; Screen geometry: BYTE_PIXEL pixels per screen byte, X_SIZE pixels per row, ROW_SIZE bytes per row (240/6 = 40).
40 ; ROW_SIZE expands textually to X_SIZE/BYTE_PIXEL (no parentheses); it is only used as a plain immediate operand in this file.
41 #define BYTE_PIXEL 6
42 #define X_SIZE 240
43 #define ROW_SIZE X_SIZE/BYTE_PIXEL
44 ; 6502 opcode bytes; the routines below store these into their own instruction stream to select the drawing direction (ASL/LSR are the accumulator forms, INC/DEC the zero page forms).
45 #define _NOP $ea
46 #define _INX $e8
47 #define _DEX $ca
48 #define _INY $c8
49 #define _DEY $88
50 #define _ASL $0a
51 #define _LSR $4a
52 #define _INC_ZP $e6
53 #define _DEC_ZP $c6
54
55
56 .text
57 ; draw_totaly_vertical_8: XORs dy+1 pixels of column _CurrentPixelX, one per screen row, starting at the row tmp0 points to.
58 .dsb 256-(*&255) ; pad up to the next 256-byte boundary (a full 256 bytes if already on one)
59 ; In: tmp0 = address of the first row, _CurrentPixelX, dy. Reached by a branch from _DrawLine8 when dx = 0. Modifies A, X, Y, tmp0 and its own code.
60 draw_totaly_vertical_8
61 .(
62 ldx _CurrentPixelX
63 ldy _TableDiv6,x ; Y = table entry used as byte offset within the row (presumably x/6)
64 lda _TableBit6Reverse,x ; 4 A = table entry used as pixel mask within that byte
65 sta _mask_patch+1 ; self-modifying: store the mask as the immediate operand of the lda below
66
67 ldx dy
68 inx ; X = dy+1 = number of pixels to draw
69
70 clc ; 2 the adc in the loop expects carry clear
71 loop
72 _mask_patch
73 lda #0 ; 2 operand patched above with the pixel mask
74 eor (tmp0),y ; 5 toggle the pixel (XOR drawing)
75 sta (tmp0),y ; 6 => total = 13 cycles
76
77 ; update the screen address:
78 .(
79 lda tmp0+0 ; 3
80 adc #ROW_SIZE ; 2 next row
81 sta tmp0+0 ; 3
82 bcc skip ; 2 (+1 if taken)
83 inc tmp0+1 ; 5 low byte wrapped: bump the high byte
84 clc ; 2 restore carry clear for the next adc
85 skip
86 .)
87 ; ------------------Min=13 Max=17
88
89 dex
90 bne loop
91 rts
92 .)
93
94
95
96 ; _DrawLine8: XOR-draws a line between (_CurrentPixelX,_CurrentPixelY) and (_OtherPixelX,_OtherPixelY) by dispatching to a specialized routine.
97 ; Expects the following variables to be set when called:
98 ; _CurrentPixelX
99 ; _CurrentPixelY
100 ; _OtherPixelX
101 ; _OtherPixelY
102 ; Side effects: the two endpoints may be swapped in place; dx, dy and tmp0 are overwritten; A, X, Y are modified.
103 _DrawLine8
104 ;
105 ; Compute deltas and signs
106 ;
107
108 ; Test Y value
109 .(
110 sec
111 lda _CurrentPixelY
112 sbc _OtherPixelY ; A = y1 - y2, carry clear if y1 < y2
113 beq end ; same row: dy = 0
114 bcc cur_smaller
115
116 cur_bigger ; y1>y2
117 ; Swap X and Y
118 ; So we always draw from top to bottom
119 ldy _CurrentPixelY ; the swap only uses X and Y, so A keeps the (positive) difference
120 ldx _OtherPixelY
121 sty _OtherPixelY
122 stx _CurrentPixelY
123
124 ldy _CurrentPixelX
125 ldx _OtherPixelX
126 sty _OtherPixelX
127 stx _CurrentPixelX
128
129 jmp end
130
131 cur_smaller ; y1<y2
132 ; Absolute value
133 eor #$ff
134 adc #1 ; carry is clear here (bcc was taken), so this completes the negation
135 end
136 sta dy ; dy = |y1 - y2|
137 .)
138
139 ;
140 ; Initialise screen pointer
141 ;
142 ldy _CurrentPixelY
143 lda _HiresAddrLow,y ; 4
144 sta tmp0+0 ; 3
145 lda _HiresAddrHigh,y ; 4
146 sta tmp0+1 ; 3 => Total 14 cycles
147
148 ; Test X value
149 .(
150 sec
151 lda _CurrentPixelX
152 sbc _OtherPixelX
153 sta dx ; sta leaves the flags of the sbc untouched
154 beq draw_totaly_vertical_8 ; dx = 0: vertical line (also taken when both points are equal)
155 bcc cur_smaller
156
157 cur_bigger ; x1>x2
158 lda #_DEX ; A = opcode selecting a step to the left
159 bne end ; always taken, _DEX is non-zero
160
161 cur_smaller ; x1<x2
162 ; Absolute value
163 eor #$ff
164 adc #1 ; carry is clear here (bcc was taken)
165 sta dx
166
167 lda #_INX ; A = opcode selecting a step to the right
168 end
169 .)
170
171 jmp alignIt ; skip over the alignment padding below
172
173 .dsb 256-(*&255)
174
175 alignIt
176 ; Compute slope and call the specialized code for mostly horizontal or vertical lines
177 ldy dy ; the routines below are entered with Y = dy and A = direction opcode
178 beq draw_totaly_horizontal_8 ; dy = 0
179 cpy dx
180 bcc draw_mainly_horizontal_8 ; dy < dx
181 jmp draw_mainly_vertical_8 ; dy >= dx
182
183 draw_totaly_horizontal_8 ; XORs the pixels of a horizontal line on the row tmp0 points to, from _CurrentPixelX towards _OtherPixelX
184 .(
185 ; here we have DY in Y, and the OPCODE in A (entered from alignIt with dy = 0 and A = _INX or _DEX)
186 sta _outer_patch ; Write a (dex / nop / inx) instruction
187 ; The end coordinate is loaded into X, so it must be stored with stx; storing A here would patch the opcode byte into the cpx operand.
188 ldx _OtherPixelX
189 stx __auto_cpx+1 ; patch the end X coordinate into the cpx below
190
191 ldx _CurrentPixelX ; X = current pixel column
192
193 ;
194 ; Draw loop (the pixel at _OtherPixelX itself is not plotted: the loop exits as soon as X reaches it)
195 ;
196 outer_loop
197 ldy _TableDiv6,x ; Y = table entry used as byte offset within the row
198 lda _TableBit6Reverse,x ; 4 A = table entry used as pixel mask
199 eor (tmp0),y ; 5 toggle the pixel
200 sta (tmp0),y ; 6
201
202 _outer_patch
203 inx ; patched above: inx or dex depending on the direction
204
205 __auto_cpx
206 cpx #00 ; At the endpoint yet? (operand patched with _OtherPixelX)
207 bne outer_loop
208 rts
209 .)
210
211 draw_mainly_horizontal_8 ; dy < dx: Bresenham loop that plots one pixel per x step. In: A = direction opcode, Y = dy, tmp0 = row address
212 .(
213 tax ; keep the direction opcode in X
214 lda dx
215 lsr
216 cmp dy
217 bcs draw_very_horizontal_8 ; dx/2 >= dy: use the chunking variant instead
218
219 ; here we have DY in Y, and the OPCODE (inx, dex) in X
220 sty __auto_dy+1 ; patch dy into the adc below
221
222 ; all this stress to be able to use dex, beq :)
223 cpx #_INX ; also sets the carry: clear for _DEX, set for _INX
224 beq doInx
225
226 lda #<_TableDiv6-1 ; == 0
227 ; clc ; _DEX < _INX
228 adc _OtherPixelX ; table low byte + end coordinate, so index x = 1 addresses the entry of _OtherPixelX
229 sta __auto_div6+1
230 lda #<_TableBit6Reverse-1 ; == 0
231 ; clc (relies on the previous adc not having carried)
232 adc _OtherPixelX
233
234 ldx #>_TableDiv6
235 ldy #>_TableBit6Reverse ; the bne below is taken as long as this high byte is non-zero
236 bne endPatch
237
238 doInx
239 lda #X_SIZE-1
240 ; sec (carry is set by the cpx above when X = _INX)
241 sbc _OtherPixelX
242 sta __auto_div6+1
243 lda #X_SIZE-1
244 ; sec (the previous sbc leaves carry set when _OtherPixelX <= X_SIZE-1)
245 sbc _OtherPixelX
246
247 ldx #>_TableDiv6Rev ; this direction uses the _TableDiv6Rev/_TableBit6 tables (presumably mirrored versions, TODO confirm)
248 ldy #>_TableBit6 ;
249 endPatch
250 sta __auto_bit6+1 ; patch the table addresses into the two loads of the draw loop
251 stx __auto_div6+2
252 sty __auto_bit6+2
253
254 lda dx
255 tax
256 inx ; 2 +1 since we count to 0
257 sta __auto_dx+1 ; patch dx into the sbc below
258 lsr
259 eor #$ff ; initial sum = one's complement of dx/2
260 clc
261 ; a = sum, x = dX+1
262
263 loopX
264 sta save_a ; 3 = 3
265 loopY
266 ; Draw the pixel
267 __auto_div6
268 ldy _TableDiv6-1,x ; 4 (address patched above)
269 __auto_bit6
270 lda _TableBit6Reverse-1,x;4
271 eor (tmp0),y ; 5*
272 sta (tmp0),y ; 6*= 19
273
274 dex ; 2 Step in x
275 beq exitLoop ; 2/3 At the endpoint yet?
276 lda save_a ; 3
277 __auto_dy
278 adc #00 ; 2 +DY (operand patched; carry is clear on every path reaching this point)
279 bcc loopX ; 2/3=11/12 ~50% taken
280 ; Time to step in y
281 __auto_dx
282 sbc #00 ; 2 -DX (carry is set here; the wrapped sum is below dy < dx, so this borrows and leaves carry clear)
283 sta save_a ; 3 = 5
284
285 ; update the screen address:
286 lda tmp0+0 ; 3
287 adc #ROW_SIZE ; 2
288 sta tmp0+0 ; 3
289 bcc loopY ; 2/3=10/11 ~84% taken
290 inc tmp0+1 ; 5
291 clc ; 2
292 bcc loopY ; 3 = 10 (always taken)
293 ; average: 12.44
294
295 exitLoop
296 rts
297 ; Timings:
298 ; x++/y : 34
299 ; x++/y++: 47.44
300 ; average: 40.72
301 .)
302
303 draw_very_horizontal_8 ; entered from draw_mainly_horizontal_8 when dx/2 >= dy
304 .(
305 ; dX/2 >= dY, here we use "chunking": pixels that fall into the same screen byte are collected in 'chunk' and written with a single eor/sta
306 ; here we have DY in Y, and the OPCODE (inx, dex) in X (moved there by the tax in draw_mainly_horizontal_8)
307 sty __auto_dy+1 ; patch dy into both adc instructions
308 sty __auto_dy2+1
309 cpx #_INX
310 php ; keep the result of the compare across the pointer setup
311
312 ldx _CurrentPixelX
313 lda _TableDiv6,x
314 clc
315 adc tmp0 ; Y = low byte of (row address + byte offset); tmp0 low byte becomes 0
316 tay
317 bcc skipHi
318 inc tmp0+1
319 skipHi
320 lda #0
321 sta tmp0
322
323 plp
324 beq doInx
325 ; negative x-direction
326 lda _TableMod6,x
327 tax ; X = index into Pot2NTbl, counted down per pixel
328
329 lda #_DEY ; patch the column step, its wrap compare value and the high byte adjust for moving left
330 sta __auto_stepx
331 sta __auto_stepx2
332 lda #$ff
333 sta __auto_cpy+1
334 sta __auto_cpy2+1
335 lda #_DEC_ZP
336 sta __auto_yHi
337 sta __auto_yHi2
338 lda Pot2NTbl,x
339 sta chunk ; initial chunk mask
340 lda #<Pot2NTbl ; NOTE(review): the bne below falls through into doInx if this low byte is ever 0
341 bne endPatch
342
343 doInx
344 ; positive x-direction
345 lda #BYTE_PIXEL-1
346 ; sec (carry restored by plp: set by the cpx when X = _INX)
347 sbc _TableMod6,x
348 tax ; X = index into Pot2PTbl, counted down per pixel
349
350 lda #_INY ; patch the column step, its wrap compare value and the high byte adjust for moving right
351 sta __auto_stepx
352 sta __auto_stepx2
353 lda #$00
354 sta __auto_cpy+1
355 sta __auto_cpy2+1
356 lda #_INC_ZP
357 sta __auto_yHi
358 sta __auto_yHi2
359 lda Pot2PTbl,x
360 sta chunk ; initial chunk mask
361 lda #<Pot2PTbl
362 endPatch
363 sta __auto_pot1+1 ; only the low address byte of the three table loads is patched, so both tables must share the high byte of Pot2PTbl
364 sta __auto_pot2+1
365 sta __auto_pot3+1
366
367 lda dx
368 sta __auto_dx+1 ; patch dx into the sbc below
369 ; calculate initial bresenham sum
370 lsr
371 sta lastSum ; 3 this is used for the last line segment
372 eor #$ff ; = -dx/2
373 clc
374 jmp loopX
375 ; a = sum, x = index into the Pot2 table (counts down per pixel), y = ptr-offset
376
377 .dsb 256-(*&255)
378
379 nextColumn ; reached when X went negative: the current screen byte is finished
380 tax ; 2 save the sum in X
381 lda chunk ; 3
382 eor (tmp0),y ; 5 write the collected pixels
383 sta (tmp0),y ; 6
384 lda #%00111111 ; 2 the next byte starts with all six pixels available
385 sta chunk ; 3
386 txa ; 2 restore the sum
387 ldx #BYTE_PIXEL-1 ; 2
388 __auto_stepx
389 iny ; 2 next column (patched: iny/dey)
390 __auto_cpy
391 cpy #00 ; 2 (patched: $00/$ff) did Y wrap around?
392 clc ; 2
393 bne contColumn ; 2/3=33/34 99% taken
394 __auto_yHi
395 inc tmp0+1 ; 5 dec/inc
396 bcc contColumn ; 3 = 8 (always taken)
397
398 loopY
399 dec dy ; 5 all but one vertical segments drawn? (dy is used as a countdown and is modified)
400 beq exitLoop ; 2/3= 7/8 yes, exit loop
401 loopX
402 dex ; 2
403 bmi nextColumn ; 2/37.03 ~16.7% taken
404 contColumn ; = 9.85
405 __auto_dy
406 adc #00 ; 2 +DY
407 bcc loopX ; 2/3= 4/5 ~50% taken
408 ; Time to step in y
409 __auto_dx
410 sbc #00 ; 2 -DX
411 sta save_a ; 3 = 5
412
413 ; plot the last bits of current row:
414 __auto_pot1
415 lda Pot2PTbl,x ; 4 (table patched: Pot2PTbl/Pot2NTbl)
416 eor chunk ; 3 table entry eor chunk = mask written to this row
417 eor (tmp0),y ; 5
418 sta (tmp0),y ; 6
419 __auto_pot2
420 lda Pot2PTbl,x ; 4
421 sta chunk ; 3 = 25 the table entry becomes the chunk for the next row
422
423 ; update the screen address:
424 tya ; 2
425 adc #ROW_SIZE ; 2
426 tay ; 2
427 lda save_a ; 3
428 bcc loopY ; 2/3=11/12 ~84.4% taken
429 inc tmp0+1 ; 5
430 clc ; 2
431 bcc loopY ; 3 = 10 (always taken)
432 ; average: 13.40
433
434 ; Timings:
435 ; x++/y : 14.85 (75%)
436 ; x++/y++: 64.25 (25%)
437 ; average: 27.20
438
439 exitLoop
440 ; draw the last horizontal line segment:
441 adc lastSum ; 3 (carry is clear here: loopY is only reached through bcc)
442 loopXEnd
443 dex ; 2
444 bmi nextColumnEnd ; 2/37.03 ~16.7% taken
445 contColumnEnd ; = 9.85
446 __auto_dy2
447 adc #00 ; 2 +DY
448 bcc loopXEnd ; 2/3=11/12 ~50% taken
449
450 ; plot last chunk:
451 __auto_pot3
452 lda Pot2PTbl,x ; 4
453 eor chunk ; 3
454 eor (tmp0),y ; 5
455 sta (tmp0),y ; 6 = 18
456 rts
457
458 nextColumnEnd ; same as nextColumn, but continues in the final segment loop
459 tax ; 2
460 lda chunk ; 3
461 eor (tmp0),y ; 5
462 sta (tmp0),y ; 6
463 lda #%00111111 ; 2
464 sta chunk ; 3
465 txa ; 2
466 ldx #BYTE_PIXEL-1 ; 2
467 __auto_stepx2
468 iny ; 2 next column
469 __auto_cpy2
470 cpy #00 ; 2
471 clc ; 2
472 bne contColumnEnd ; 2/3=33/34 99% taken
473 __auto_yHi2
474 inc tmp0+1 ; 5 dec/inc
475 bcc contColumnEnd ; 3 = 8
476
477 Pot2PTbl ; masks with the lowest 1..6 bits set (positive x-direction)
478 .byte %00000001, %00000011, %00000111, %00001111
479 .byte %00011111, %00111111
480 Pot2NTbl ; masks with the highest 1..6 of the six pixel bits set (negative x-direction)
481 .byte %00100000, %00110000
482 .byte %00111000, %00111100, %00111110, %00111111
483 .)
484
485
486 .dsb 256-(*&255)
487 ;
488 ; This code is used when the things are moving faster
489 ; vertically than horizontally
490 ;
491 ; dy>dx (the dispatcher at alignIt also sends dy = dx here)
492 ;
493 draw_mainly_vertical_8
494 ; here we have DY in Y, and the OPCODE in A
495 .(
496 ; setup bresenham values:
497 sty __auto_dy+1
498 ldx dx
499 stx __auto_dx+1
500
501 ; setup direction:
502 cmp #_DEX ; which direction?
503 bne doInx
504 ; dex -> moving left:
505 lda #%00100000 ; bit value at which the next step leaves the current byte
506 sta __auto_cpBit+1
507 lda #_ASL ;
508 sta __auto_shBit
509 lda #%00000001 ; bit value to start with in the next byte
510 sta __auto_ldBit+1
511 lda #_DEY
512 sta __auto_yLo
513 ldx #$ff ; Y value that signals a wrap after dey
514 lda #_DEC_ZP
515 bne endPatch ; always taken, _DEC_ZP is non-zero
516
517 doInx
518 ; inx -> moving right:
519 lda #%00000001
520 sta __auto_cpBit+1
521 lda #_LSR
522 sta __auto_shBit
523 lda #%00100000
524 sta __auto_ldBit+1
525 lda #_INY
526 sta __auto_yLo
527 ldx #$00 ; Y value that signals a wrap after iny
528 lda #_INC_ZP
529 endPatch
530 stx __auto_cpY+1
531 sta __auto_yHi
532 ; setup X
533 tya ; y = dY
534 tax
535 inx ; x = dY+1
536 ; setup current bit:
537 ldy _CurrentPixelX
538 lda _TableBit6Reverse,y ; 4
539 sta curBit
540 ; setup pointer and Y:
541 ; TODO: self-modifying code?
542 lda _TableDiv6,y
543 clc
544 adc tmp0 ; Y = low byte of (row address + byte offset); tmp0 low byte becomes 0
545 tay
546 lda #0 ; lda/sta leave the carry of the adc intact for the bcc below
547 sta tmp0
548 bcc skipTmp0
549 inc tmp0+1
550 skipTmp0
551 ; calculate initial bresenham sum:
552 lda dy
553 lsr
554 eor #$ff ; -DY/2
555 clc ; 2
556 bcc loopY ; 3 (always taken)
557 ; a = sum, y = tmp0, x = dY+1, tmp0 = 0
558
559 incHiPtr ; 9
560 inc tmp0+1 ; 5
561 clc ; 2
562 bcc contHiPtr ; 3
563 ;----------------------------------------------------------
564 loopY
565 sta save_a ; 3 = 3
566 ; Draw the pixel
567 lda curBit ; 3
568 eor (tmp0),y ; 5
569 sta (tmp0),y ; 6 = 14
570
571 dex ; 2 At the endpoint yet?
572 beq exitLoop ; 2/3= 4/5
573 loopX
574 ; update the screen address:
575 tya ; 2
576 adc #ROW_SIZE ; 2 next row (carry is clear on every path reaching loopX)
577 tay ; 2
578 bcs incHiPtr ; 2/13 ~16% taken
579 contHiPtr ; = 9.76 average
580
581 lda save_a ; 3
582 __auto_dx
583 adc #00 ; 2 +DX
584 bcc loopY ; 2/3= 7/8 ~50% taken
585
586 ; Time to step in x
587 __auto_dy
588 sbc #00 ; 2 -DY
589 sta save_a ; 3 = 5
590
591 lda curBit ; 3
592 __auto_cpBit ; TODO: optimize
593 cmp #%00100000 ; 2 %00100000/%00000001
594 beq nextColumn ; 2/14.07 ~17% taken
595 __auto_shBit
596 asl ; 2 asl/lsr, clears carry
597 contNextColumn
598 sta curBit ; 3 =~13.71
599
600 ; Draw the pixel
601 eor (tmp0),y ; 5
602 sta (tmp0),y ; 6 = 11
603 dex ; 2 At the endpoint yet?
604 bne loopX ; 2/3= 4/5
605 exitLoop
606 rts
607 ;----------------------------------------------------------
608 nextColumn
609 __auto_ldBit
610 lda #%00000001 ; 2 %00000001/%00100000
611 __auto_yLo
612 dey ; 2 dey/iny
613 __auto_cpY
614 cpy #$ff ; 2 $ff/$00
615 clc ; 2 TODO: optimize
616 bne contNextColumn ; 2/3 ~99% taken
617 __auto_yHi
618 dec tmp0+1 ; 5 dec/inc
619 bcc contNextColumn ; 3 (always taken)
620
621 ; x ,y++: 38.76 (50%)
622 ; x++,y++: 51.47 (50%)
623 ; average: 45.11
624 .)
625
626 ; *** total timings: ***
627 ; draw_very_horizontal_8 (29.6%): 27.20
628 ; draw_mainly_horizontal_8 (20.4%): 40.72
629 ; draw_mainly_vertical_8 (50.0%): 45.11
630 ;----------------------------------------
631 ; total average (100.0%): 38.91
632
633
634

  ViewVC Help
Powered by ViewVC 1.1.26