| ;; Copyright (C) 2019-2025 Free Software Foundation, Inc. |
| ;; |
| ;; This file is part of LIBF7, which is part of GCC. |
| ;; |
| ;; GCC is free software; you can redistribute it and/or modify it under |
| ;; the terms of the GNU General Public License as published by the Free |
| ;; Software Foundation; either version 3, or (at your option) any later |
| ;; version. |
| ;; |
| ;; GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| ;; WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| ;; for more details. |
| ;; |
| ;; Under Section 7 of GPL version 3, you are granted additional |
| ;; permissions described in the GCC Runtime Library Exception, version |
| ;; 3.1, as published by the Free Software Foundation. |
| ;; |
| ;; You should have received a copy of the GNU General Public License and |
| ;; a copy of the GCC Runtime Library Exception along with this program; |
| ;; see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| ;; <http://www.gnu.org/licenses/>. |
| |
| #ifndef __AVR_TINY__ |
| |
| #define ASM_DEFS_HAVE_DEFUN |
| |
| #include "asm-defs.h" |
| #include "libf7.h" |
| |
| #define ZERO __zero_reg__ |
| #define TMP __tmp_reg__ |
| |
| #define F7(name) F7_(name##_asm) |
| |
| .macro F7call name |
| .global F7(\name\()) |
| XCALL F7(\name\()) |
| .endm |
| |
| .macro F7jmp name |
| .global F7(\name\()) |
| XJMP F7(\name\()) |
| .endm |
| |
| ;; Just for visibility in disassembly. |
| .macro LLL name |
| .global LLL.\name |
| LLL.\name: |
| nop |
| .endm |
| |
| .macro DEFUN name |
| .section .text.libf7.asm.\name, "ax", @progbits |
| .global F7(\name\()) |
| .func F7(\name\()) |
| F7(\name\()) : |
| .endm |
| |
| .macro ENDF name |
| .size F7(\name\()), . - F7(\name\()) |
| .endfunc |
| .endm |
| |
| .macro LABEL name |
| .global F7(\name\()) |
| F7(\name\()) : |
| .endm |
| |
| .macro _DEFUN name |
| .section .text.libf7.asm.\name, "ax", @progbits |
| .weak \name |
| .type \name, @function |
| \name : |
| .endm |
| |
| .macro _ENDF name |
| .size \name, . - \name |
| .endm |
| |
| .macro _LABEL name |
| .weak \name |
| .type \name, @function |
| \name : |
| .endm |
| |
| #define F7_NAME(X) F7_(X) |
| |
| ;; Make a weak alias. |
| .macro ALIAS sym |
| .weak \sym |
| .type \sym, @function |
| \sym: |
| .endm |
| |
| ;; Make a weak alias if double is 64 bits wide. |
| .macro DALIAS sym |
| #if defined (WITH_LIBF7_MATH_SYMBOLS) && __SIZEOF_DOUBLE__ == 8 |
| ALIAS \sym |
| #endif |
| .endm |
| |
| ;; Make a weak alias if long double is 64 bits wide. |
| .macro LALIAS sym |
| #if defined (WITH_LIBF7_MATH_SYMBOLS) && __SIZEOF_LONG_DOUBLE__ == 8 |
| ALIAS \sym |
| #endif |
| .endm |
| |
| #define Off 1 |
| #define Expo (Off + F7_MANT_BYTES) |
| |
| #ifdef F7MOD_classify_ |
| ;; r24 = classify (*Z) |
| ;; NaN -> F7_FLAG_nan |
| ;; INF -> F7_FLAG_inf [ | F7_FLAG_sign ] |
| ;; ==0 -> F7_FLAG_zero |
| ;; ... -> 0 [ | F7_FLAG_sign ] |
| |
| ;; Clobbers: None (no TMP, no T). |
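| ;; |
| ;; Rough C equivalent for reference (an illustrative sketch only, using the |
| ;; .flags / .mant field names that appear in comments throughout this file; |
| ;; it is not the actual C source): |
| ;; |
| ;;     uint8_t classify (const f7_t *aa) |
| ;;     { |
| ;;         uint8_t flags = aa->flags; |
| ;;         if (flags >> 1)                    // NaN or Inf bit set. |
| ;;             return flags; |
| ;;         if (aa->mant[6] & 0x80)            // MSBit set => non-zero. |
| ;;             return flags & F7_FLAG_sign; |
| ;;         return F7_FLAG_zero; |
| ;;     } |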
| DEFUN classify |
| |
| ld r24, Z |
| lsr r24 |
| brne .Lnan_or_inf |
| |
| ldd r24, Z+6+Off |
| tst r24 |
| brpl 0f |
| sbc r24, r24 |
| andi r24, F7_FLAG_sign |
| ret |
| |
| 0: ldi r24, F7_FLAG_zero |
| ret |
| |
| .Lnan_or_inf: |
| rol r24 |
| ret |
| |
| ENDF classify |
| #endif /* F7MOD_classify_ */ |
| |
| #ifdef F7MOD_clr_ |
| DEFUN clr |
| std Z+0, ZERO |
| std Z+0+Off, ZERO |
| std Z+1+Off, ZERO |
| std Z+2+Off, ZERO |
| std Z+3+Off, ZERO |
| std Z+4+Off, ZERO |
| std Z+5+Off, ZERO |
| std Z+6+Off, ZERO |
| std Z+0+Expo, ZERO |
| std Z+1+Expo, ZERO |
| ret |
| ENDF clr |
| |
| #endif /* F7MOD_clr_ */ |
| |
| #ifdef F7MOD_clz_ |
| ;; The libgcc CLZ implementations like __clzsi2 a.k.a. __builtin_clzl are |
| ;; not very well suited for our purpose, so implement our own. |
| |
| #define ZBITS r26 |
| .macro .test.byte reg |
| or ZERO, \reg |
| brne .Loop_bit |
| subi ZBITS, -8 |
| .endm |
| |
| ;; R26 = CLZ (uint64_t R18); CLZ (0) = 64. |
| ;; Unchanged: T |
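| ;; |
| ;; Simplified C sketch of the bit counting (illustrative only; the code |
| ;; below first skips whole zero bytes, then counts the remaining bits): |
| ;; |
| ;;     uint8_t n = 0; |
| ;;     while (n < 64 && !(x & (1ull << 63))) |
| ;;         x <<= 1, ++n; |
| ;;     return n;                              // 64 for x == 0. |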
| DEFUN clzdi2 |
| clr ZBITS |
| ;; Catch the common case of normalized .mant for speed-up. |
| tst r25 |
| brmi 9f |
| .test.byte r25 |
| .test.byte r24 |
| .test.byte r23 |
| .test.byte r22 |
| .test.byte r21 |
| .test.byte r20 |
| .test.byte r19 |
| .test.byte r18 |
| .Ldone: |
| clr ZERO |
| 9: ret |
| |
| .Loop_bit: |
| lsl ZERO |
| brcs .Ldone |
| inc ZBITS |
| rjmp .Loop_bit |
| |
| ENDF clzdi2 |
| #undef ZBITS |
| #endif /* F7MOD_clz_ */ |
| |
| #ifdef F7MOD_cmp_mant_ |
| DEFUN cmp_mant |
| |
| adiw X, 6 + Off |
| ld r24, X $ ldd TMP, Z+6+Off $ SUB r24, TMP |
| brne .Lunequal |
| |
| sbiw X, 6 |
| ld r24, X+ $ ldd TMP, Z+0+Off $ SUB r24, TMP |
| ld r24, X+ $ ldd TMP, Z+1+Off $ sbc r24, TMP |
| ld r24, X+ $ ldd TMP, Z+2+Off $ sbc r24, TMP |
| ld r24, X+ $ ldd TMP, Z+3+Off $ sbc r24, TMP |
| ld r24, X+ $ ldd TMP, Z+4+Off $ sbc r24, TMP |
| ld r24, X+ $ ldd TMP, Z+5+Off $ sbc r24, TMP |
| ;; MSBs are already known to be equal |
| breq 9f |
| .Lunequal: |
| sbc r24, r24 |
| sbci r24, -1 |
| 9: sbiw X, 6 + Off |
| ret |
| ENDF cmp_mant |
| #endif /* F7MOD_cmp_mant_ */ |
| |
| #define CA 18 |
| #define C0 CA+1 |
| #define C1 C0+1 |
| #define C2 C0+2 |
| #define C3 C0+3 |
| #define C4 C0+4 |
| #define C5 C0+5 |
| #define C6 C0+6 |
| #define Carry r16 |
| #define Flags 18 |
| |
| #ifdef F7MOD_store_ |
| ;; Z->flags = CA. |
| ;; Z->mant = C[7]. |
| DEFUN store_mant.with_flags |
| st Z, CA |
| |
| ;; Z->mant = C[7]. |
| LABEL store_mant |
| std Z+0+Off, C0 |
| std Z+1+Off, C1 |
| std Z+2+Off, C2 |
| std Z+3+Off, C3 |
| std Z+4+Off, C4 |
| std Z+5+Off, C5 |
| std Z+6+Off, C6 |
| ret |
| ENDF store_mant.with_flags |
| #endif /* F7MOD_store_ */ |
| |
| #ifdef F7MOD_load_ |
| ;; CA = Z->flags |
| ;; C[7] = Z->mant |
| DEFUN load_mant.with_flags |
| ld CA, Z |
| skipnext |
| |
| ;; CA = 0 |
| ;; C[7] = Z->mant |
| LABEL load_mant.clr_CA |
| LABEL load_mant.clr_flags |
| clr CA ; May be skipped |
| |
| ;; C[7] = Z->mant |
| LABEL load_mant |
| ldd C0, Z+0+Off |
| ldd C1, Z+1+Off |
| ldd C2, Z+2+Off |
| ldd C3, Z+3+Off |
| ldd C4, Z+4+Off |
| ldd C5, Z+5+Off |
| ldd C6, Z+6+Off |
| ret |
| ENDF load_mant.with_flags |
| #endif /* F7MOD_load_ */ |
| |
| #ifdef F7MOD_copy_ |
| DEFUN copy |
| cp XL, ZL |
| cpc XH, ZH |
| breq 9f |
| adiw XL, F7_SIZEOF |
| adiw ZL, F7_SIZEOF |
| set |
| bld ZERO, 1 |
| bld ZERO, 3 ; ZERO = 0b1010 = 10. |
| .Loop: |
| ld TMP, -X |
| st -Z, TMP |
| dec ZERO |
| brne .Loop |
| 9: ret |
| ENDF copy |
| #endif /* F7MOD_copy_ */ |
| |
| #ifdef F7MOD_copy_P_ |
| DEFUN copy_P |
| set |
| bld ZERO, 1 |
| bld ZERO, 3 ; ZERO = 0b1010 = 10. |
| .Loop: |
| #ifdef __AVR_HAVE_LPMX__ |
| lpm TMP, Z+ |
| #else |
| lpm |
| adiw Z, 1 |
| #endif /* Have LPMx */ |
| st X+, TMP |
| dec ZERO |
| brne .Loop |
| sbiw X, F7_SIZEOF |
| sbiw Z, F7_SIZEOF |
| ret |
| ENDF copy_P |
| #endif /* F7MOD_copy_P_ */ |
| |
| #ifdef F7MOD_copy_mant_ |
| DEFUN copy_mant |
| cp XL, ZL |
| cpc XH, ZH |
| breq 9f |
| adiw XL, 1 |
| adiw ZL, 1 |
| set |
| bld ZERO, 3 |
| dec ZERO ; ZERO = 7 |
| .Loop: |
| ld TMP, X+ |
| st Z+, TMP |
| dec ZERO |
| brne .Loop |
| sbiw XL, 8 |
| sbiw ZL, 8 |
| 9: ret |
| ENDF copy_mant |
| #endif /* F7MOD_copy_mant_ */ |
| |
| |
| #ifdef F7MOD_clr_mant_lsbs_ |
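| ;; Clear the low R20 bits of R22->mant and store the result to R24->mant. |
| ;; This is done by shifting the mantissa right by R20 bits and then left |
| ;; again by the same amount (descriptive comment; cf. the register usage |
| ;; below). |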
| DEFUN clr_mant_lsbs |
| push r16 |
| mov r16, r20 |
| wmov XL, r24 |
| |
| wmov ZL, r22 |
| F7call load_mant |
| |
| F7call lshrdi3 |
| |
| clr CA |
| |
| F7call ashldi3 |
| |
| pop r16 |
| |
| wmov ZL, XL |
| F7jmp store_mant |
| |
| ENDF clr_mant_lsbs |
| #endif /* F7MOD_clr_mant_lsbs_ */ |
| |
| |
| #ifdef F7MOD_normalize_with_carry_ |
| ;; Z = &f7_t |
| ;; C[] = .mant, which may not be normalized. |
| ;; Carry === r16 = Addend to Z->expo in [-64, 128). |
| ;; Normalize C[], set Flags, and adjust Z->expo. |
| ;; Return CA (after normalization) in TMP. |
| ;; Unchanged: T |
| #define Addend r17 |
| #define Zbits r26 |
| #define expL r26 |
| #define expH r27 |
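| ;; |
| ;; Illustrative C-level sketch of what follows (not the C source; clz64 is |
| ;; a hypothetical helper standing for the clzdi2 call above): |
| ;; |
| ;;     uint8_t z = clz64 (C);          // C = 64-bit working mantissa. |
| ;;     if (z == 64) goto clear;        // Mantissa is all zeros. |
| ;;     C <<= z;                        // Normalize: MSBit becomes 1. |
| ;;     expo += (int8_t) (Carry - z);   // 16-bit add with overflow check: |
| ;;                                     // overflow -> Inf/NaN, underflow -> 0. |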
| DEFUN normalize_with_carry |
| mov Addend, Carry |
| tst C6 |
| brmi .Lshift.0 |
| ;; r26 = CLZ (uint64_t R18) |
| F7call clzdi2 |
| cpi Zbits, 64 |
| breq .Lclr |
| sub Addend, Zbits |
| mov r16, Zbits |
| |
| F7call ashldi3 |
| ;; Assert (R25.7 == 1) |
| .Lshift.0: |
| mov TMP, CA |
| ld Flags, Z |
| |
| ;; .expo += Addend |
| ldd expL, Z+0+Expo |
| ldd expH, Z+1+Expo |
| ;; Sign-extend Addend |
| clr r16 |
| sbrc Addend, 7 |
| com r16 |
| |
| ;; exp += (int8_t) Addend, i.e. sign-extend Addend. |
| add expL, Addend |
| adc expH, r16 |
| brvc .Lnormal |
| tst r16 |
| brmi .Lclr |
| ;; Overflow |
| #if F7_HAVE_Inf == 1 |
| ori Flags, F7_FLAG_inf |
| #else |
| ldi Flags, F7_FLAG_nan |
| #endif /* Have Inf */ |
| ret |
| |
| .Lnormal: |
| std Z+0+Expo, expL |
| std Z+1+Expo, expH |
| ret |
| |
| .Lclr: |
| ;; Underflow or Zero. |
| clr TMP |
| .global __clr_8 |
| XJMP __clr_8 |
| |
| LABEL normalize.store_with_flags |
| ;; no rounding |
| set |
| skipnext |
| LABEL normalize.round.store_with_flags |
| ;; with rounding |
| clt ; skipped ? |
| LABEL normalize.maybe_round.store_with_flags |
| F7call normalize_with_carry |
| ;; We have: |
| ;; Z = &f7_t |
| ;; X = .expo |
| ;; C[] = .mant |
| ;; R18 = .flags |
| ;; TMP = byte below .mant after normalization |
| ;; T = 1 => no rounding. |
| brts .Lstore |
| lsl TMP |
| adc C0, ZERO |
| brcc .Lstore |
| adc C1, ZERO |
| adc C2, ZERO |
| adc C3, ZERO |
| adc C4, ZERO |
| adc C5, ZERO |
| adc C6, ZERO |
| brcc .Lstore |
| ;; We only come here if C6 overflowed, i.e. C[] is 0 now. |
| ;; .mant = 1.0 by restoring the MSbit. |
| ror C6 |
| ;; .expo += 1 and override the .expo stored during normalize. |
| adiw expL, 1 |
| std Z+0+Expo, expL |
| std Z+1+Expo, expH |
| |
| .Lstore: |
| F7call store_mant.with_flags |
| |
| ;; Return the byte below .mant after normalization. |
| ;; This is only useful without rounding; the caller will know. |
| mov R24, TMP |
| ret |
| ENDF normalize_with_carry |
| #endif /* F7MOD_normalize_with_carry_ */ |
| |
| |
| #ifdef F7MOD_normalize_ |
| ;; Make the functionality above available to C. |
| ;; f7_t* normalize (f7_t *cc) |
| ;; Adjusts cc->expo |
| ;; Clears cc->flags |
| DEFUN normalize |
| push r17 |
| push r16 |
| wmov ZL, r24 |
| F7call load_mant.clr_CA |
| clr Carry |
| st Z, ZERO |
| F7call normalize.store_with_flags |
| wmov r24, Z |
| pop r16 |
| pop r17 |
| ret |
| ENDF normalize |
| #endif /* F7MOD_normalize_ */ |
| |
| |
| #ifdef F7MOD_store_expo_ |
| #define Done r24 |
| #define expLO r24 |
| #define expHI r25 |
| ;; expo == INT16_MAX => *Z = Inf, return Done = true. |
| ;; expo == INT16_MIN => *Z = 0x0, return Done = true. |
| ;; else => Z->expo = expo, return Done = false. |
| DEFUN store_expo |
| cpi expHI, 0x80 |
| cpc expLO, ZERO |
| breq .Ltiny |
| adiw expLO, 1 |
| brvs .Lhuge |
| sbiw expLO, 1 |
| std Z+0+Expo, expLO |
| std Z+1+Expo, expHI |
| ldi Done, 0 |
| ret |
| |
| .Lhuge: |
| #if F7_HAVE_Inf == 1 |
| ld Done, Z |
| andi Done, F7_FLAG_sign |
| ori Done, F7_FLAG_inf |
| #else |
| ldi Done, F7_FLAG_nan |
| #endif /* Have Inf */ |
| st Z, Done |
| ldi Done, 1 |
| ret |
| |
| .Ltiny: |
| ldi Done, 1 |
| F7jmp clr |
| ENDF store_expo |
| #endif /* F7MOD_store_expo_ */ |
| |
| |
| #ifdef F7MOD_set_u64_ |
| DEFUN set_s64 |
| set |
| skipnext |
| ;; ... |
| LABEL set_u64 |
| clt ; Skipped? |
| wmov ZL, r16 |
| ;; TMP holds .flags. |
| clr TMP |
| brtc .Lnot.negative |
| |
| bst C6, 7 |
| brtc .Lnot.negative |
| bld TMP, F7_FLAGNO_sign |
| .global __negdi2 |
| XCALL __negdi2 |
| |
| .Lnot.negative: |
| st Z, TMP |
| std Z+0+Expo, ZERO |
| std Z+1+Expo, ZERO |
| ldi Carry, 63 |
| F7call normalize.round.store_with_flags |
| wmov r24, Z |
| wmov r16, Z ; Restore call-saved R16/R17 (R16 was used as Carry). |
| ret |
| ENDF set_s64 |
| #endif /* F7MOD_set_u64_ */ |
| |
| |
| #ifdef F7MOD_to_integer_ |
| #define Mask r26 |
| DEFUN to_integer |
| wmov ZL, r24 |
| mov Mask, r22 |
| |
| F7call load_mant.with_flags |
| |
| sbrc Flags, F7_FLAGNO_nan |
| rjmp .Lset_0x8000 |
| |
| sbrc Flags, F7_FLAGNO_inf |
| rjmp .Lsaturate |
| |
| sbrs C6, 7 |
| rjmp .Lset_0x0000 |
| |
| bst Flags, F7_FLAGNO_sign |
| ldd r27, Z+0+Expo |
| ;; Does .expo have bits outside Mask? ... |
| mov TMP, Mask |
| com TMP |
| and TMP, r27 |
| ldd r27, Z+1+Expo |
| tst r27 |
| brmi .Lset_0x0000 ; ...yes: .expo is < 0 => return 0 |
| or TMP, r27 |
| brne .Lsaturate.T ; ...yes: .expo > Mask => saturate |
| |
| ;; ...no: Shift right to meet .expo = 0. |
| PUSH r16 |
| ldd r16, Z+0+Expo |
| eor r16, Mask |
| and r16, Mask |
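| ;; Note: for 0 <= .expo <= Mask, (expo ^ Mask) & Mask == Mask - expo, |
| ;; which is the required right-shift count. |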
| clr CA |
| F7call lshrdi3 |
| POP r16 |
| tst C6 |
| brmi .Lsaturate.T ; > INTxx_MAX => saturate |
| |
| brtc 9f ; >= 0 => return |
| sbrc Mask, 5 |
| .global __negdi2 |
| XJMP __negdi2 |
| sbrc Mask, 4 |
| .global __negsi2 |
| XJMP __negsi2 |
| neg C6 |
| neg C5 |
| sbci C6, 0 |
| 9: ret |
| |
| .Lsaturate: |
| bst Flags, F7_FLAGNO_sign |
| .Lsaturate.T: |
| |
| #if F7_HAVE_Inf |
| brtc .Lset_0x7fff |
| ;; -Inf => return 1 + INTxx_MIN |
| mov ZL, Flags |
| .global __clr_8 |
| XCALL __clr_8 |
| ldi C6, 0x80 |
| |
| ldi CA+0, 0x01 |
| |
| sbrs Mask, 5 |
| ldi CA+4, 0x01 |
| |
| sbrs Mask, 4 |
| ldi CA+6, 0x01 |
| ret |
| |
| .Lset_0x7fff: |
| ;; +Inf => return INTxx_MAX |
| sec |
| .global __sbc_8 |
| XCALL __sbc_8 |
| ldi C6, 0x7f |
| ret |
| #endif /* F7_HAVE_Inf */ |
| |
| .Lset_0x8000: |
| ;; NaN => return INTxx_MIN |
| .global __clr_8 |
| XCALL __clr_8 |
| ldi C6, 0x80 |
| ret |
| |
| .Lset_0x0000: |
| ;; Small value => return 0x0 |
| .global __clr_8 |
| XJMP __clr_8 |
| |
| ENDF to_integer |
| #endif /* F7MOD_to_integer_ */ |
| |
| |
| #ifdef F7MOD_to_unsigned_ |
| #define Mask r26 |
| DEFUN to_unsigned |
| wmov ZL, r24 |
| mov Mask, r22 |
| |
| F7call load_mant.with_flags |
| |
| sbrc Flags, F7_FLAGNO_nan |
| rjmp .Lset_0xffff |
| |
| sbrc Flags, F7_FLAGNO_sign |
| rjmp .Lset_0x0000 |
| |
| sbrc Flags, F7_FLAGNO_inf |
| rjmp .Lset_0xffff |
| |
| sbrs C6, 7 |
| rjmp .Lset_0x0000 |
| |
| ldd r27, Z+0+Expo |
| ;; Does .expo have bits outside Mask? ... |
| mov TMP, Mask |
| com TMP |
| and TMP, r27 |
| ldd r27, Z+1+Expo |
| tst r27 |
| brmi .Lset_0x0000 ; ...yes: .expo is < 0 => return 0 |
| or TMP, r27 |
| brne .Lset_0xffff ; ...yes: .expo > Mask => saturate |
| |
| ;; ...no: Shift right to meet .expo = 0. |
| PUSH r16 |
| ldd r16, Z+0+Expo |
| eor r16, Mask |
| and r16, Mask |
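| ;; Note: for 0 <= .expo <= Mask, (expo ^ Mask) & Mask == Mask - expo, |
| ;; which is the required right-shift count. |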
| clr CA |
| F7call lshrdi3 |
| POP r16 |
| ret |
| |
| .Lset_0xffff: |
| ;; return UINTxx_MAX |
| sec |
| .global __sbc_8 |
| XJMP __sbc_8 |
| |
| .Lset_0x0000: |
| ;; Small value => return 0x0 |
| .global __clr_8 |
| XJMP __clr_8 |
| |
| ENDF to_unsigned |
| #endif /* F7MOD_to_unsigned_ */ |
| |
| |
| #ifdef F7MOD_addsub_mant_scaled_ |
| ;; int8_t f7_addsub_mant_scaled_asm (f7_t *r24, const f7_t *r22, const f7_t *r20, |
| ;; uint8_t r18); |
| ;; R18.0 = 1 : ADD |
| ;; R18.0 = 0 : SUB |
| ;; R18[7..1] : Scale |
| ;; Compute *R24 = *R22 +/- (*R20 >> R18[7..1]). |
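| ;; |
| ;; Roughly (an illustrative sketch, not the C source; mant64() is a |
| ;; hypothetical helper that reads the 7-byte mantissa into the upper |
| ;; bytes of a 64-bit accumulator): |
| ;; |
| ;;     uint64_t bb = mant64 (r20) >> (r18 >> 1); |
| ;;     uint64_t cc = (r18 & 1) ? mant64 (r22) + bb |
| ;;                             : mant64 (r22) - bb; |
| ;;     // ...then normalize, round and store cc to *r24. |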
| |
| #define BA 10 |
| #define B0 BA+1 |
| #define B1 B0+1 |
| #define B2 B0+2 |
| #define B3 B0+3 |
| #define B4 B0+4 |
| #define B5 B0+5 |
| #define B6 B0+6 |
| |
| DEFUN addsub_mant_scaled |
| do_prologue_saves 10 |
| |
| bst r18, 0 ;; ADD ? |
| lsr r18 |
| mov r16, r18 |
| |
| wmov ZL, r20 |
| wmov YL, r22 |
| ;; C[] = bb >> shift |
| wmov XL, r24 |
| |
| F7call load_mant.clr_CA |
| F7call lshrdi3 |
| |
| wmov BA, CA |
| wmov B1, C1 |
| wmov B3, C3 |
| wmov B5, C5 |
| wmov ZL, YL |
| F7call load_mant.clr_CA |
| |
| wmov ZL, XL |
| |
| brts .Ladd |
| |
| .global __subdi3 |
| XCALL __subdi3 |
| |
| breq .Lzero |
| brcc .Lround |
| ;; C = 1: Can underflow happen at all? |
| .Lzero: |
| F7call clr |
| rjmp .Lepilogue |
| |
| .Ladd: |
| .global __adddi3 |
| XCALL __adddi3 |
| brcc .Lround |
| ldi Carry, 1 |
| .global __lshrdi3 |
| XCALL __lshrdi3 |
| ori C6, 1 << 7 |
| skipnext |
| .Lround: |
| clr Carry ; skipped? |
| F7call normalize.round.store_with_flags |
| |
| .Lepilogue: |
| do_epilogue_restores 10 |
| |
| ENDF addsub_mant_scaled |
| |
| #if !defined (__AVR_HAVE_MOVW__) || !defined (__AVR_HAVE_JMP_CALL__) |
| DEFUN lshrdi3 |
| .global __lshrdi3 |
| XJMP __lshrdi3 |
| ENDF lshrdi3 |
| DEFUN ashldi3 |
| .global __ashldi3 |
| XJMP __ashldi3 |
| ENDF ashldi3 |
| #else |
| |
| ;; Basically just a wrapper around libgcc's __lshrdi3: shifts by 32 and 16 |
| ;; are handled with register-pair moves below; only the residual shift by |
| ;; 0...15 bits is delegated to __lshrdi3. |
| DEFUN lshrdi3 |
| ;; Handle bit 5 of shift offset. |
| sbrs r16, 5 |
| rjmp 4f |
| wmov CA, C3 |
| wmov C1, C5 |
| clr C6 $ clr C5 $ wmov C3, C5 |
| 4: |
| ;; Handle bit 4 of shift offset. |
| sbrs r16, 4 |
| rjmp 3f |
| wmov CA, C1 |
| wmov C1, C3 |
| wmov C3, C5 |
| clr C6 $ clr C5 |
| 3: |
| ;; Handle bits 3...0 of shift offset. |
| push r16 |
| andi r16, 0xf |
| breq 0f |
| |
| .global __lshrdi3 |
| XCALL __lshrdi3 |
| 0: |
| pop r16 |
| ret |
| ENDF lshrdi3 |
| |
| ;; Basically just a wrapper around libgcc's __ashldi3: shifts by 32 and 16 |
| ;; are handled with register-pair moves below; only the residual shift by |
| ;; 0...15 bits is delegated to __ashldi3. |
| DEFUN ashldi3 |
| ;; Handle bit 5 of shift offset. |
| sbrs r16, 5 |
| rjmp 4f |
| wmov C5, C1 |
| wmov C3, CA |
| clr C2 $ clr C1 $ wmov CA, C1 |
| 4: |
| ;; Handle bit 4 of shift offset. |
| sbrs r16, 4 |
| rjmp 3f |
| wmov C5, C3 |
| wmov C3, C1 |
| wmov C1, CA |
| clr CA $ clr C0 |
| 3: |
| ;; Handle bits 3...0 of shift offset. |
| push r16 |
| andi r16, 0xf |
| breq 0f |
| |
| .global __ashldi3 |
| XCALL __ashldi3 |
| 0: |
| pop r16 |
| ret |
| ENDF ashldi3 |
| #endif /* Small device */ |
| |
| #endif /* F7MOD_addsub_mant_scaled_ */ |
| |
| #if defined F7MOD_mul_mant_ && defined (__AVR_HAVE_MUL__) |
| #define A0 11 |
| #define A1 A0+1 |
| #define A2 A0+2 |
| #define A3 A0+3 |
| #define A4 A0+4 |
| #define A5 A0+5 |
| #define A6 A0+6 |
| |
| #define TT0 26 |
| #define TT1 TT0+1 |
| #define TT2 28 |
| #define TT3 TT2+1 |
| |
| #define BB 10 |
| |
| ;; R18.0 = 1: No rounding. |
| |
| DEFUN mul_mant |
| ;; 10 = Y, R17...R10 |
| do_prologue_saves 10 |
| ;; T = R18.0: Skip rounding? |
| bst r18, 0 |
| ;; Save result address for later. |
| push r25 |
| push r24 |
| ;; Load A's mantissa. |
| movw ZL, r22 |
| LDD A0, Z+0+Off |
| LDD A1, Z+1+Off |
| LDD A2, Z+2+Off |
| LDD A3, Z+3+Off |
| LDD A4, Z+4+Off |
| LDD A5, Z+5+Off |
| LDD A6, Z+6+Off |
| movw ZL, r20 |
| |
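| ;; Notation of the partial-product schedules below: "i * j -> k:l" means |
| ;; that the 16-bit product of A[i] and mantissa byte j of the 2nd operand |
| ;; is accumulated into result bytes C[k]:C[l] (high:low); "a" stands for |
| ;; CA, the byte below C0. |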
| ;; 6 * 6 -> 6:5 |
| ;; 4 * 6 -> 4:3 |
| ;; 2 * 6 -> 2:1 |
| ;; 0 * 6 -> 0:a |
| ldd BB, Z+6+Off |
| mul A6, BB $ movw C5, r0 |
| mul A4, BB $ movw C3, r0 |
| mul A2, BB $ movw C1, r0 |
| mul A0, BB $ movw CA, r0 |
| |
| ;; 5 * 6 -> 5:4 |
| ;; 3 * 6 -> 3:2 |
| ;; 1 * 6 -> 1:0 |
| mul A5, BB $ movw TT2, r0 |
| mul A3, BB $ movw TT0, r0 |
| mul A1, BB |
| ADD C0, r0 $ adc C1, r1 |
| adc C2, TT0 $ adc C3, TT1 |
| adc C4, TT2 $ adc C5, TT3 $ clr ZERO |
| adc C6, ZERO |
| ;; Done B6 |
| |
| ;; 6 * 5 -> 5:4 |
| ;; 4 * 5 -> 3:2 |
| ;; 2 * 5 -> 1:0 |
| ;; 0 * 5 -> a:- |
| ldd BB, Z+5+Off |
| mul A0, BB |
| ;; Done A0 |
| #define Atmp A0 |
| #define Null A0 |
| |
| mov Atmp, r1 |
| mul A6, BB $ movw TT2, r0 |
| mul A4, BB $ movw TT0, r0 |
| mul A2, BB |
| |
| ADD CA, Atmp |
| adc C0, r0 $ adc C1, r1 |
| adc C2, TT0 $ adc C3, TT1 |
| adc C4, TT2 $ adc C5, TT3 $ clr Null |
| adc C6, Null |
| |
| ;; 1 * 5 -> 0:a |
| ;; 3 * 5 -> 2:1 |
| ;; 5 * 5 -> 4:3 |
| mul A1, BB $ movw TT0, r0 |
| mul A3, BB $ movw TT2, r0 |
| mul A5, BB |
| |
| ADD CA, TT0 $ adc C0, TT1 |
| adc C1, TT2 $ adc C2, TT3 |
| adc C3, r0 $ adc C4, r1 |
| adc C5, Null $ adc C6, Null |
| ;; Done B5 |
| |
| ;; 2 * 4 -> 0:a |
| ;; 4 * 4 -> 2:1 |
| ;; 6 * 4 -> 4:3 |
| ldd BB, Z+4+Off |
| mul A2, BB $ movw TT0, r0 |
| mul A4, BB $ movw TT2, r0 |
| mul A6, BB |
| |
| ADD CA, TT0 $ adc C0, TT1 |
| adc C1, TT2 $ adc C2, TT3 |
| adc C3, r0 $ adc C4, r1 |
| adc C5, Null $ adc C6, Null |
| |
| ;; 1 * 4 -> a:- |
| ;; 3 * 4 -> 1:0 |
| ;; 5 * 4 -> 3:2 |
| mul A1, BB $ mov TT1, r1 |
| mul A3, BB $ movw TT2, r0 |
| mul A5, BB |
| ;; Done A1 |
| ;; Done B4 |
| ADD CA, TT1 |
| adc C0, TT2 $ adc C1, TT3 |
| adc C2, r0 $ adc C3, r1 |
| ;; Accumulate carry for C3 in TT1. |
| ;; Accumulate carry for C4 in A1. |
| #define Cry3 TT1 |
| #define Cry4 A1 |
| clr Cry3 |
| clr Cry4 |
| rol Cry4 |
| |
| ;; 6 * 2 -> 2:1 |
| ;; 6 * 3 -> 3:2 |
| ;; 5 * 3 -> 2:1 |
| ldd BB, Z+2+Off |
| mul A6, BB |
| add C1, r0 |
| adc C2, r1 |
| adc Cry3, Null |
| |
| ldd BB, Z+3+Off |
| mul A6, BB |
| add C2, r0 |
| adc C3, r1 |
| adc Cry4, Null |
| |
| mul A5, BB |
| add C1, r0 |
| adc C2, r1 |
| adc Cry3, Null |
| |
| ;; Perform the remaining 11 multiplications in 4 loopings: |
| ;; 4 * 3 -> 1:0 |
| ;; 3 * 3 -> 0:a |
| ;; 2 * 3 -> a:- |
| ;; |
| ;; 5 * 2 -> 1:0 |
| ;; 4 * 2 -> 0:a |
| ;; 3 * 2 -> a:- |
| ;; |
| ;; 6 * 1 -> 1:0 |
| ;; 5 * 1 -> 0:a |
| ;; 4 * 1 -> a:- |
| ;; |
| ;; . * 0 -> 1:0 (=0) |
| ;; 6 * 0 -> 0:a |
| ;; 5 * 0 -> a:- |
| |
| ;; BB already contains B3, hence let Z point one past B2 so that |
| ;; the LD *, -Z below will pick up B2, B1, B0. |
| adiw r30, 1 + Off+2 |
| |
| ;; Accumulate carry for C2 in TT2. |
| #define Cry2 TT2 |
| clr Cry2 |
| |
| ;; TT3 is the loop counter, iterate over B3...B0. |
| ldi TT3, 4 |
| rjmp .Loop_start |
| |
| .Loop: |
| ;; We use A2...A4 below; so shift bytes of A into place. |
| mov A2, A3 |
| mov A3, A4 |
| mov A4, A5 |
| mov A5, A6 |
| clr A6 |
| ld BB, -Z |
| .Loop_start: |
| mul A3, BB |
| ADD CA, r0 $ adc C0, r1 $ adc C1, Null $ adc Cry2, Null |
| MUL A2, BB |
| mov TT0, r1 |
| MUL A4, BB |
| ADD CA, TT0 $ adc C0, r0 $ adc C1, r1 $ adc Cry2, Null |
| dec TT3 |
| brne .Loop |
| |
| clr ZERO |
| ADD C2, Cry2 |
| adc C3, Cry3 |
| adc C4, Cry4 |
| adc C5, ZERO |
| adc C6, ZERO |
| |
| ;; Finally... |
| |
| pop ZL |
| pop ZH |
| ;; The high byte is at least 0x40 and at most 0xfe. |
| ;; The result has to be left-shifted by one in order to scale it |
| ;; correctly. |
| |
| ldi Carry, 1 |
| F7call normalize.maybe_round.store_with_flags |
| |
| do_epilogue_restores 10 |
| |
| ENDF mul_mant |
| #endif /* F7MOD_mul_mant_ && MUL */ |
| |
| #if defined F7MOD_mul_mant_ && ! defined (__AVR_HAVE_MUL__) |
| #define AA TMP |
| #define A0 13 |
| #define A1 A0+1 |
| #define A2 A0+2 |
| #define A3 A0+3 |
| #define A4 A0+4 |
| #define A5 r26 |
| #define A6 r27 |
| #define BB ZERO |
| #define Bits r29 |
| #define Bytes r28 |
| |
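| ;; Illustrative sketch of the shift-and-add scheme below (not the C source), |
| ;; with A = 1st operand's mantissa and C = result accumulator: |
| ;; |
| ;;     A >>= 1;                  // once, so that C cannot overflow |
| ;;     for each bit b of the 2nd operand's mantissa, MSB first: |
| ;;         if (b) C += A; |
| ;;         A >>= 1; |
| ;; |
| ;; The initial division by 2 is undone by "ldi Carry, 1" before the final |
| ;; normalization. |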
| DEFUN mul_mant |
| do_prologue_saves 7 |
| bst r18, 0 ; T = 1: Don't round. |
| ;; Save result address for later. |
| push r25 |
| push r24 |
| ;; Load 1st operand mantissa. |
| wmov r30, r22 |
| clr AA |
| LDD A0, Z+0+Off |
| LDD A1, Z+1+Off |
| LDD A2, Z+2+Off |
| LDD A3, Z+3+Off |
| LDD A4, Z+4+Off |
| LDD A5, Z+5+Off |
| LDD A6, Z+6+Off |
| ;; Let Z point one past .mant of the 2nd input operand. |
| wmov r30, r20 |
| adiw r30, Expo |
| |
| ;; Clear the result mantissa. |
| .global __clr_8 |
| XCALL __clr_8 |
| |
| ;; Loop over the bytes of B's mantissa from highest to lowest. |
| ;; "+1" because we jump into the loop. |
| ldi Bytes, 1 + F7_MANT_BYTES |
| |
| ;; Divide one operand by 2 so that the result mantissa won't overflow. |
| ;; This is accounted for by "Carry = 1" below. |
| ldi Bits, 1 |
| rjmp .Loop_entry |
| |
| .Loop_bytes: |
| ld BB, -Z |
| ;; Loop over the bits of B's mantissa from highest to lowest. |
| ldi Bits, 8 |
| .Loop_bits: |
| lsl BB |
| brcc .Lnext_bit |
| |
| ADD CA, AA |
| adc C0, A0 |
| adc C1, A1 |
| adc C2, A2 |
| adc C3, A3 |
| adc C4, A4 |
| adc C5, A5 |
| adc C6, A6 |
| |
| .Lnext_bit: |
| .Loop_entry: |
| LSR A6 |
| ror A5 |
| ror A4 |
| ror A3 |
| ror A2 |
| ror A1 |
| ror A0 |
| ror AA |
| |
| dec Bits |
| brne .Loop_bits |
| |
| dec Bytes |
| brne .Loop_bytes |
| |
| ;; Finally... |
| |
| pop ZL |
| pop ZH |
| |
| ;; The result has to be left-shifted by one (multiplied by 2) in order |
| ;; to undo the division by 2 of the 1st operand. |
| ldi Carry, 1 |
| F7call normalize.maybe_round.store_with_flags |
| |
| do_epilogue_restores 7 |
| ENDF mul_mant |
| #endif /* F7MOD_mul_mant_ && ! MUL */ |
| |
| |
| #if defined (F7MOD_div_) |
| |
| ;; Dividend is C[] |
| |
| ;; Divisor |
| #define A0 9 |
| #define A1 10 |
| #define A2 11 |
| #define A3 12 |
| #define A4 13 |
| #define A5 14 |
| #define A6 15 |
| |
| ;; Quotient |
| #define Q0 0 /* === TMP */ |
| #define Q1 Q0+1 /* === ZERO */ |
| #define Q2 26 |
| #define Q3 Q2+1 |
| #define Q4 28 |
| #define Q5 Q4+1 |
| #define Q6 16 |
| #define Q7 Q6+1 |
| |
| #define Cnt CA |
| #define QBits r8 |
| |
| DEFUN div |
| do_prologue_saves 12 |
| |
| ;; Number of bits requested for the quotient. |
| ;; This is usually 2 + F7_MANT_BITS. |
| mov QBits, r20 |
| wmov ZL, r22 |
| LDD A0, Z+0+Off |
| LDD A1, Z+1+Off |
| LDD A2, Z+2+Off |
| LDD A3, Z+3+Off |
| LDD A4, Z+4+Off |
| LDD A5, Z+5+Off |
| LDD A6, Z+6+Off |
| wmov ZL, r24 |
| F7call load_mant |
| |
| ;; Clear quotient Q[]. |
| clr Q0 ; === TMP |
| ;clr Q1 ; === ZERO |
| wmov Q2, Q0 |
| wmov Q4, Q0 |
| wmov Q6, Q0 |
| |
| ;; C[] and A[] are valid mantissae, i.e. their MSBit is set. Therefore, |
| ;; quotient Q[] will be in [0x0.ff..., 0x0.40...] and to adjust Q[] we |
| ;; need at most 1 left-shift. Compute F7_MANT_BITS + 2 bits of the |
| ;; quotient: One bit is used for rounding, and one bit might be consumed |
| ;; by the mentioned left-shift. |
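| ;; |
| ;; One quotient bit per iteration, restoring-division style (an |
| ;; illustrative sketch, not the C source): |
| ;; |
| ;;     carry = shift C[] left by 1;            // dividend |
| ;;     if (carry || C >= A)                    // divisor fits? |
| ;;         { C -= A;  Q = (Q << 1) | 1; } |
| ;;     else |
| ;;         Q = Q << 1; |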
| mov Cnt, QBits |
| rjmp .Loop_start |
| |
| .Loop: |
| ;; Shift dividend. |
| LSL C0 |
| rol C1 |
| rol C2 |
| rol C3 |
| rol C4 |
| rol C5 |
| rol C6 |
| brcs .Lfits |
| ;; Compare dividend against divisor. |
| .Loop_start: |
| CP C0, A0 |
| cpc C1, A1 |
| cpc C2, A2 |
| cpc C3, A3 |
| cpc C4, A4 |
| cpc C5, A5 |
| cpc C6, A6 |
| ;; Shift 0 into quotient. |
| brlo 1f |
| .Lfits: |
| ;; Divisor fits into dividend. |
| SUB C0, A0 |
| sbc C1, A1 |
| sbc C2, A2 |
| sbc C3, A3 |
| sbc C4, A4 |
| sbc C5, A5 |
| sbc C6, A6 |
| ;; Shift 1 into quotient. |
| sec |
| rol Q0 |
| skipnext |
| 1: lsl Q0 |
| rol Q1 |
| rol Q2 |
| rol Q3 |
| rol Q4 |
| rol Q5 |
| rol Q6 |
| rol Q7 |
| dec Cnt |
| brne .Loop |
| |
| wmov CA, Q0 |
| wmov C1, Q2 |
| wmov C3, Q4 |
| wmov C5, Q6 |
| clr ZERO |
| |
| ldi Carry, 64 |
| sub Carry, QBits |
| F7call normalize.round.store_with_flags |
| |
| do_epilogue_restores 12 |
| ENDF div |
| |
| #endif /* F7MOD_div_ */ |
| |
| |
| #ifdef F7MOD_sqrt_approx_ |
| ;; ReMainder |
| #define MX 16 |
| #define M0 17 |
| #define M1 26 |
| #define M2 27 |
| #define M3 14 |
| #define M4 15 |
| #define M5 TMP |
| #define M6 r29 |
| |
| #define AA ZERO |
| #define One r13 |
| #define Bits r28 |
| #define Bytes M6 |
| |
| ;; Extend C[] by 0b01 at the low end. |
| #define CX (0b01 << 6) |
| |
| ;;; Compute square-root of const f7_t *R22 for a positive number. |
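| ;; |
| ;; The mantissa part uses the classic binary digit-by-digit method; as an |
| ;; illustrative sketch (not the C source), with remainder R (M[] below) |
| ;; and result Q (C[] below): |
| ;; |
| ;;     for each pair of mantissa bits, MSB first: |
| ;;         R = (R << 2) | next_two_bits; |
| ;;         if (R >= ((Q << 2) | 1))  { R -= (Q << 2) | 1;  Q = (Q << 1) | 1; } |
| ;;         else                      Q = Q << 1; |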
| DEFUN sqrt_approx |
| ;; 7 = Y, R17...R13 |
| do_prologue_saves 7 |
| |
| wmov ZL, r22 ; Input const f7_t* |
| wmov YL, r24 ; Output f7_t* |
| F7call load_mant |
| ldi CA, 0xff |
| |
| ;; The paper-pencil method for the mantissa consumes bits in pairs and |
| ;; expects the input as Q-format 2.*, but mant is in 1.*. This means |
| ;; we have to shift one to the right. If expo is odd, then we shift |
| ;; one to the left and subtract one from expo in order to compensate |
| ;; and to get an even exponent. |
| |
| ;; Divide expo by 2 because we are doing sqrt. |
| ldd XH, Z+Expo+1 |
| ldd XL, Z+Expo+0 |
| asr XH |
| ror XL |
| ;; Store expo to result. |
| wmov ZL, YL |
| std Z+Expo+0, XL |
| std Z+Expo+1, XH |
| |
| brcs 1f |
| ;; Expo was even. Do >>=1 in order to get Q2.* as explained above. |
| LSR C6 $ ror C5 $ ror C4 $ ror C3 |
| ror C2 $ ror C1 $ ror C0 $ ror CA |
| 1: |
| ;; For odd expo, >>=1 to get Q2.* and <<=1 to get an even expo cancel out. |
| ;; And the right-shift of the exponent implicitly subtracted 1 from it |
| ;; as needed. |
| F7call store_mant.with_flags |
| |
| ;; Let Z point one past the mantissa's MSB. |
| adiw ZL, Off + F7_MANT_BYTES |
| |
| ;; Clear the result mantissa. |
| .global __clr_8 |
| XCALL __clr_8 |
| ;; Clear ReMainder. M6 === Bytes will be zero when Bytes is down to 0. |
| clr M5 |
| wmov M3, C3 |
| wmov M1, C1 |
| wmov MX, CA |
| |
| clr One |
| inc One |
| |
| ;; "+1" because .flags extends the mantissa at the low end. |
| ldi Bytes, 1 + F7_MANT_BYTES |
| .Loop_bytes: |
| ld AA, -Z |
| ldi Bits, 8 |
| .Loop_bits: |
| ;; Shift top 2 bits of MX into M[]. |
| LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3 |
| LSL MX $ rol M0 $ rol M1 $ rol M2 $ rol M3 |
| |
| ;; "Take down" 2 bits from A[] to MX.7 and MX.6 |
| mov MX, AA |
| andi MX, 0xc0 |
| lsl AA |
| lsl AA |
| |
| ;; Compare remainder against current result extended by 0b01. |
| CPI MX, CX |
| cpc M0, C0 |
| cpc M1, C1 |
| cpc M2, C2 |
| cpc M3, C3 |
| brcs 1f |
| ;; If the extended result fits, subtract it from M and set the |
| ;; next result bit to 1. |
| SUBI MX, CX |
| sbc M0, C0 |
| sbc M1, C1 |
| sbc M2, C2 |
| sbc M3, C3 |
| 1: |
| ;; If it doesn't fit, set the next result bit to 0 (and don't subtract). |
| rol C0 |
| eor C0, One |
| rol C1 |
| rol C2 |
| rol C3 |
| |
| subi Bits, 2 |
| brne .Loop_bits |
| ;; AA (=== ZERO) is zero again. |
| |
| dec Bytes |
| brne .Loop_bytes |
| ;; M6 (=== Bytes) is zero now. |
| |
| ;; Now we have consumed all 64 bits of the extended mantissa, but we have |
| ;; only produced 64 / 2 = 32 bits of the result, which are currently held |
| ;; in C3 ... C0. Proceed as above, but on all result bytes, shifting in |
| ;; 00's because the input mantissa is exhausted. |
| |
| ;; "-1" because flags is part of the mantissa, which is already consumed. |
| ldi Bits, 8 * (F7_MANT_BYTES - 1) |
| .Loop2_bits: |
| ;; Shift top 2 bits of MX into M[]. |
| .Ltwice: |
| LSL MX |
| rol M0 |
| rol M1 |
| rol M2 |
| rol M3 |
| rol M4 |
| rol M5 |
| rol M6 |
| subi Bits, 0x80 |
| brmi .Ltwice |
| |
| ;; "Take down" two 0's to MX.7 and MX.6 |
| ; clr MX ;; MX is already zero. |
| |
| ;; Compare remainder against current result extended by 0b01. |
| CPI MX, CX |
| cpc M0, C0 |
| cpc M1, C1 |
| cpc M2, C2 |
| cpc M3, C3 |
| cpc M4, C4 |
| cpc M5, C5 |
| cpc M6, C6 |
| brcs 1f |
| ;; If the extended result fits, subtract it from M and set the |
| ;; next result bit to 1. |
| SUBI MX, CX |
| sbc M0, C0 |
| sbc M1, C1 |
| sbc M2, C2 |
| sbc M3, C3 |
| sbc M4, C4 |
| sbc M5, C5 |
| sbc M6, C6 |
| 1: |
| ;; If it doesn't fit, set the next result bit to 0 (and don't subtract). |
| rol C0 |
| eor C0, One |
| rol C1 |
| rol C2 |
| rol C3 |
| rol C4 |
| rol C5 |
| rol C6 |
| |
| subi Bits, 2 |
| brne .Loop2_bits |
| |
| ;; Set flags. |
| st Z, ZERO |
| F7call store_mant |
| |
| do_epilogue_restores 7 |
| ENDF sqrt_approx |
| #endif /* F7MOD_sqrt_approx_ */ |
| |
| |
| #if defined (F7MOD_sqrt16_) && defined (__AVR_HAVE_MUL__) |
| |
| #define Mask C6 |
| #define Q0 C3 /* = R22 */ |
| #define Q1 C4 /* = R23 */ |
| |
| ;; uint16_t R24 = sqrt16_XXX (uint16_t R24); |
| ;; Clobbers: R22, R23, TMP. |
| ;; |
| ;; XXX = floor: Return integral part of square-root of R25:R24 with R25 = 0. |
| ;; Error is in [0, -1 LSB). |
| ;; XXX = round: Return square-root of R25:R24 rounded to nearest integer. |
| ;; R25 = (Q[] >= 65281) = (Q > 0xff00), i.e. if Q[] is not |
| ;; bigger than 0xff00, then the result fits in 8 bits. |
| ;; Return C = 0 if the result is the same as for XXX = floor, |
| ;; error in [0, -1/2 LSB) |
| ;; Return C = 1 if the result is one higher than for XXX = floor, |
| ;; error in [1/2 LSB, 0). |
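| ;; |
| ;; Illustrative sketch of the "floor" part (not the C source): |
| ;; |
| ;;     uint8_t r = 0; |
| ;;     for (uint8_t m = 0x80; m; m >>= 1) |
| ;;         if ((uint16_t) (r + m) * (r + m) <= q)    // q = input R25:R24 |
| ;;             r += m; |
| ;;     return r; |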
| DEFUN sqrt16_round |
| set |
| skipnext |
| ;; ... |
| LABEL sqrt16_floor |
| clt ; Skipped? |
| movw Q0, r24 |
| clr C5 |
| ldi Mask, 1 << 7 |
| |
| .Loop_mask: |
| add C5, Mask |
| mul C5, C5 |
| cp Q0, R0 |
| cpc Q1, R1 |
| brsh 1f |
| sub C5, Mask |
| 1: lsr Mask |
| brne .Loop_mask |
| |
| brtc .Ldone ; No rounding => C6 will be 0. |
| |
| ;; Rounding: (X + 1/2)^2 = X^2 + X + 1/4, thus probing |
| ;; for bit -1 is testing Q[] against C5^2 + C5. |
| mul C5, C5 |
| add R0, C5 |
| adc R1, C6 ; Exploit C6 === Mask = 0. |
| cp R0, Q0 |
| cpc R1, Q1 |
| brcc .Ldone |
| ;; If C5^2 + C5 + 1/4 fits into Q[], then round up and C = 1. |
| adiw C5, 1 ; Exploit C6 === Mask = 0. |
| sec |
| |
| .Ldone: |
| clr __zero_reg__ |
| ret |
| ENDF sqrt16_round |
| #undef Mask |
| #undef Q0 |
| #undef Q1 |
| #endif /* F7MOD_sqrt16_ && MUL */ |
| |
| |
| #undef CA |
| #undef C0 |
| #undef C1 |
| #undef C2 |
| #undef C3 |
| #undef C4 |
| #undef C5 |
| #undef C6 |
| #undef Carry |
| |
| |
| #ifdef F7MOD_D_fma_ |
| _DEFUN __fma |
| DALIAS fma |
| LALIAS fmal |
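| ;; Plan: convert arg1 and arg2 to f7_t in the frame, multiply in place |
| ;; (arg1 *= arg2), convert the stacked arg3 into arg2's slot, add in place |
| ;; (arg1 += arg2), then convert the f7_t result back to a double in R18[]. |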
| |
| #define n_pushed 4 |
| #define n_frame (2 * F7_SIZEOF) |
| |
| do_prologue_saves n_pushed, n_frame |
| ;; Y = FramePointer + 1 |
| adiw Y, 1 |
| |
| ;; FP + 1 = (f7_t) arg1 |
| wmov r16, Y |
| ;; The double argument arg1 is already in R18[]. |
| XCALL F7_NAME (set_double_impl) |
| |
| ;; The double argument arg2 is in R10[]. Move it to R18[]. |
| wmov r18, r10 |
| wmov r20, r12 |
| wmov r22, r14 |
| ;; The high word of arg2 was passed in R16/R17, which we clobbered above; |
| ;; fetch it from where prologue_saves put it. |
| ldd r24, Y + n_frame + 3 ; Saved R16 |
| ldd r25, Y + n_frame + 2 ; Saved R17 |
| ;; FP + 1 + 10 = (f7_t) arg2 |
| subi r16, lo8 (-F7_SIZEOF) |
| sbci r17, hi8 (-F7_SIZEOF) |
| XCALL F7_NAME (set_double_impl) |
| |
| wmov r24, Y ; &arg1 |
| wmov r22, r16 ; &arg2 |
| XCALL F7_NAME (Imul) ; arg1 *= arg2 |
| |
| ;; The 3rd double argument arg3 was passed on the stack. Move it to R18[]. |
| ;; Don't use f7_set_pdouble() because that function is unused (for now). |
| .irp n, 0, 1, 2, 3, 4, 5, 6, 7 |
| ldd 18+\n, Y + n_frame + n_pushed + PC_SIZE + \n |
| .endr |
| XCALL F7_NAME (set_double_impl) |
| |
| wmov r24, Y ; &arg1 |
| wmov r22, r16 ; &arg2 |
| XCALL F7_NAME (Iadd) ; arg1 += arg2 |
| |
| wmov r24, Y ; &arg1 |
| XCALL F7_NAME (get_double) |
| |
| do_epilogue_restores n_pushed, n_frame |
| _ENDF __fma |
| #endif /* F7MOD_D_fma_ */ |
| |
| |
| #ifdef F7MOD_D_fabs_ |
| _DEFUN __fabs |
| DALIAS fabs |
| LALIAS fabsl |
| andi R25, 0b01111111 |
| ret |
| _ENDF __fabs |
| #endif /* F7MOD_D_fabs_ */ |
| |
| |
| #ifdef F7MOD_D_neg_ |
| _DEFUN __neg |
| _LABEL __negdf2 |
| subi R25, 0b10000000 |
| ret |
| _ENDF __neg |
| #endif /* F7MOD_D_neg_ */ |
| |
| |
| #ifdef F7MOD_D_signbit_ |
| _DEFUN __signbit |
| DALIAS signbit |
| LALIAS signbitl |
| bst R25, 7 |
| clr R25 |
| clr R24 |
| bld R24, 0 |
| ret |
| _ENDF __signbit |
| #endif /* F7MOD_D_signbit_ */ |
| |
| |
| #ifdef F7MOD_D_copysign_ |
| _DEFUN __copysign |
| DALIAS copysign |
| LALIAS copysignl |
| bst R17, 7 |
| bld R25, 7 |
| ret |
| _ENDF __copysign |
| #endif /* F7MOD_D_copysign_ */ |
| |
| |
| #ifdef F7MOD_D_isinf_ |
| ;;; +Inf -> +1 |
| ;;; -Inf -> -1 |
| _DEFUN __isinf |
| DALIAS isinf |
| LALIAS isinfl |
| ;; Save sign for later |
| push R25 |
| F7call class_D |
| pop TMP |
| ldi R24, 0 |
| ldi R25, 0 |
| ;; Inf: T = Z = 1. |
| brtc 0f ; ordinary number |
| brne 0f ; NaN |
| ldi R24, 1 |
| sbrc TMP, 7 |
| sbiw R24, 2 |
| 0: ret |
| _ENDF __isinf |
| #endif /* F7MOD_D_isinf_ */ |
| |
| |
| #ifdef F7MOD_D_isnan_ |
| _DEFUN __isnan |
| DALIAS isnan |
| LALIAS isnanl |
| F7call class_D |
| ;; NaN: T = 1, Z = 0. |
| brtc 0f |
| ldi R24, 1 |
| brne 1f |
| 0: |
| clr R24 |
| 1: |
| clr R25 |
| ret |
| _ENDF __isnan |
| #endif /* F7MOD_D_isnan_ */ |
| |
| |
| #ifdef F7MOD_D_isfinite_ |
| _DEFUN __isfinite |
| DALIAS isfinite |
| LALIAS isfinitel |
| F7call class_D |
| ;; Number <=> T = 0. |
| bld R24, 0 |
| com R24 |
| andi R24, 1 |
| clr R25 |
| ret |
| _ENDF __isfinite |
| #endif /* F7MOD_D_isfinite_ */ |
| |
| |
| #ifdef F7MOD_D_class_ |
| ;; The encoded exponent has 11 Bits. |
| #define MAX_BIASED_EXPO 0b0111111111110000 |
| |
| ;; Classify a double in R18[] |
| ;; Number: T-Flag = 0. |
| ;; +-Inf : T-Flag = 1, Z-Flag = 1. |
| ;; NaN : T-Flag = 1, Z-Flag = 0. |
| DEFUN class_D |
| wmov R26, R24 |
| andi R26, lo8 (MAX_BIASED_EXPO) |
| andi R27, hi8 (MAX_BIASED_EXPO) |
| subi R26, lo8 (MAX_BIASED_EXPO) |
| sbci R27, hi8 (MAX_BIASED_EXPO) |
| clt |
| brne .L.number |
| set |
| ;; Set sign and expo to 0. |
| clr R25 |
| andi R24, lo8 (~MAX_BIASED_EXPO) |
| ;; What remains is the mantissa. |
| ;; Mantissa == 0 => +/-Inf. |
| ;; Mantissa != 0 => NaN. |
| ;; Compare R18[] against sign_extend(R26) with R26 = 0. |
| .global __cmpdi2_s8 |
| XJMP __cmpdi2_s8 |
| .L.number: |
| ret |
| |
| ENDF class_D |
| #endif /* F7MOD_D_class_ */ |
| |
| |
| #ifdef F7MOD_D_cmp_ |
| |
| #define A0 18 |
| #define A1 A0 + 1 |
| #define A2 A0 + 2 |
| #define A3 A0 + 3 |
| #define A4 A0 + 4 |
| #define A5 A0 + 5 |
| #define A6 A0 + 6 |
| #define A7 A0 + 7 |
| |
| #define B0 10 |
| #define B1 B0 + 1 |
| #define B2 B0 + 2 |
| #define B3 B0 + 3 |
| #define B4 B0 + 4 |
| #define B5 B0 + 5 |
| #define B6 B0 + 6 |
| #define B7 B0 + 7 |
| |
| #define AA5 XH |
| #define AA6 ZL |
| #define AA7 ZH |
| |
| #define BB0 A0 |
| #define BB1 A1 |
| #define BB2 A2 |
| #define BB3 A3 |
| #define BB4 A4 |
| #define BB5 A5 |
| #define BB6 A6 |
| #define BB7 A7 |
| |
| ;;; Helper for __<cmp>df2 and __unorddf2. |
| ;;; T = 1: Comparison is unordered. |
| ;;; T = 0: Comparison is ordered, and Z, N, C, S flags are set according |
| ;;; to compare (double A, double B) as if set by a signed int comparison. |
| ;;; Note that f(+0) = f(-0) = 0. |
| ;;; In any case: |
| ;;; - return R24 = 1. |
| ;;; - return R25.0 = isNaN (A) |
| ;;; - return R25.1 = isNaN (B) |
| DEFUN D_cmp |
| rcall D_cmp.map_i64 |
| bld __tmp_reg__, 0 |
| push __tmp_reg__ |
| ;; Save A somewhere else... |
| wmov AA6, A6 |
| mov AA5, A5 |
| push A4 |
| push A3 |
| push A2 |
| push A1 |
| mov r0, A0 |
| ;; ... so that we can use D_cmp.map_i64 on B. |
| wmov BB6, B6 |
| wmov BB4, B4 |
| wmov BB2, B2 |
| wmov BB0, B0 |
| rcall D_cmp.map_i64 |
| ;; Run the following code even when B is NaN (T=1) so as to pop the regs. |
| ;; In the non-NaN case, AA and BB can be compared like int64_t for the |
| ;; sake of comparing A and B as double. |
| CP r0, BB0 $ pop r0 |
| cpc r0, BB1 $ pop r0 |
| cpc r0, BB2 $ pop r0 |
| cpc r0, BB3 $ pop r0 |
| cpc r0, BB4 |
| cpc AA5, BB5 |
| cpc AA6, BB6 |
| cpc AA7, BB7 |
| pop r25 |
| ;; R25.0 <=> A is NaN |
| ;; R25.1 <=> B is NaN |
| ;; T <=> comparison is unordered |
| bld r25, 1 |
| sbrc r25, 0 |
| set |
| ldi r24, 1 |
| ret |
| |
| ;;; A is NaN: Set T=1. |
| ;;; A is not a NaN: Set T=0, and map double A to int64_t such that |
| ;;; f(A) <cmp> f(B) iff A <cmp> B, i.e. we can treat the result |
| ;;; as int64_t for the matter of double comparison. |
| ;;; Clobbers: XL. |
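| ;;; The mapping: for A >= 0, f(A) is A's IEEE-754 bit pattern, which for |
| ;;; non-negative doubles already orders like an unsigned integer; for A < 0, |
| ;;; f(A) is the negated magnitude, so all mapped values order correctly |
| ;;; when compared as int64_t. |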
| D_cmp.map_i64: |
| bst A7, 7 |
| cbr A7, 0x80 |
| ;; If Inf < |A|, then we have a NaN. |
| CP __zero_reg__, A0 |
| cpc __zero_reg__, A1 |
| cpc __zero_reg__, A2 |
| cpc __zero_reg__, A3 |
| cpc __zero_reg__, A4 |
| cpc __zero_reg__, A5 |
| ldi XL, lo8(0x7ff0) $ cpc XL, A6 |
| ldi XL, hi8(0x7ff0) $ cpc XL, A7 |
| brlo .Lunord |
| brtc 9f |
| clt |
| .global __negdi2 |
| XJMP __negdi2 |
| .Lunord: |
| set |
| 9: ret |
| |
| ENDF D_cmp |
| #endif /* F7MOD_D_cmp_ */ |
| |
| |
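| ;; The comparison wrappers below all go through D_cmp, which returns R24 = 1, |
| ;; sets T for an unordered comparison, and otherwise leaves the condition |
| ;; flags describing A <=> B; each wrapper therefore only has to zero R24 in |
| ;; its "false" cases. |
| |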
| ;; bool __ledf2 (double, double); |
| #ifdef F7MOD_D_le_ |
| _DEFUN __ledf2 |
| F7call D_cmp |
| brts 0f |
| breq 1f |
| brlt 1f |
| 0: ldi r24, 0 |
| 1: ret |
| _ENDF __ledf2 |
| #endif /* F7MOD_D_le_ */ |
| |
| ;; bool __ltdf2 (double, double); |
| #ifdef F7MOD_D_lt_ |
| _DEFUN __ltdf2 |
| F7call D_cmp |
| brts 0f |
| brlt 1f |
| 0: ldi r24, 0 |
| 1: ret |
| _ENDF __ltdf2 |
| #endif /* F7MOD_D_lt_ */ |
| |
| ;; bool __gedf2 (double, double); |
| #ifdef F7MOD_D_ge_ |
| _DEFUN __gedf2 |
| F7call D_cmp |
| brts 0f |
| brge 1f |
| 0: ldi r24, 0 |
| 1: ret |
| _ENDF __gedf2 |
| #endif /* F7MOD_D_ge_ */ |
| |
| ;; bool __gtdf2 (double, double); |
| #ifdef F7MOD_D_gt_ |
| _DEFUN __gtdf2 |
| F7call D_cmp |
| brts 0f |
| breq 0f |
| brge 1f |
| 0: ldi r24, 0 |
| 1: ret |
| _ENDF __gtdf2 |
| #endif /* F7MOD_D_gt_ */ |
| |
| ;; bool __nedf2 (double, double); |
| #ifdef F7MOD_D_ne_ |
| _DEFUN __nedf2 |
| F7call D_cmp |
| brts 0f |
| brne 1f |
| 0: ldi r24, 0 |
| 1: ret |
| _ENDF __nedf2 |
| #endif /* F7MOD_D_ne_ */ |
| |
| ;; bool __eqdf2 (double, double); |
| #ifdef F7MOD_D_eq_ |
| _DEFUN __eqdf2 |
| F7call D_cmp |
| brts 0f |
| breq 1f |
| 0: ldi r24, 0 |
| 1: ret |
| _ENDF __eqdf2 |
| #endif /* F7MOD_D_eq_ */ |
| |
| ;; bool __unorddf2 (double, double); |
| #ifdef F7MOD_D_unord_ |
| _DEFUN __unorddf2 |
| F7call D_cmp |
| bld r24, 0 |
| ret |
| _ENDF __unorddf2 |
| #endif /* F7MOD_D_unord_ */ |
| |
| #ifdef F7MOD_D_fminfmax_ |
| _DEFUN __fmin |
| DALIAS fmin |
| LALIAS fminl |
| inc __zero_reg__ |
| |
| _LABEL __fmax |
| DALIAS fmax |
| LALIAS fmaxl |
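| ;; __zero_reg__ serves as an fmin/fmax marker here: 1 for fmin (set by the |
| ;; "inc" above), 0 for fmax. It is pushed across __gedf2 together with the |
| ;; arguments and later XORed with the comparison result to decide which |
| ;; argument to return. |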
| ;; Push A[]. |
| push r25 |
| push r24 |
| push r23 |
| push r22 |
| push r21 |
| push r20 |
| push r19 |
| push r18 |
| ;; fmin or fmax |
| push __zero_reg__ |
| clr __zero_reg__ |
| |
| XCALL __gedf2 |
| |
| pop __tmp_reg__ |
| andi r25, 0x3 ; NaNs? |
| brne .Lnan |
| ;; No NaNs involved. |
| eor __tmp_reg__, r24 ; (f == fmin) ^ (A >= B) |
| brne 1f |
| 2: |
| ;; Return B since the cases are: |
| ;; fmax && A < B |
| ;; fmin && A >= B |
| #ifdef __AVR_XMEGA__ |
| in XL, __SP_L__ |
| in XH, __SP_H__ |
| adiw XL, 8 |
| out __SP_L__, XL |
| out __SP_H__, XH |
| #else |
| pop r0 $ pop r0 $ pop r0 $ pop r0 |
| pop r0 $ pop r0 $ pop r0 $ pop r0 |
| #endif |
| wmov r24, r16 |
| wmov r22, r14 |
| wmov r20, r12 |
| wmov r18, r10 |
| ret |
| 1: |
| ;; Return A since the cases are: |
| ;; fmax && A >= B |
| ;; fmin && A < B |
| pop r18 |
| pop r19 |
| pop r20 |
| pop r21 |
| pop r22 |
| pop r23 |
| pop r24 |
| pop r25 |
| ret |
| |
| .Lnan: |
| ;; There are NaNs. |
| ;; When only the 1st argument is a NaN, then return the 2nd argument |
| cpi r25, 0x1 |
| breq 2b |
| ;; When the 2nd argument is a NaN, then return the 1st argument. |
| ;; When both arguments are NaNs, then return NaN (e.g. the 1st argument). |
| rjmp 1b |
| _ENDF __fmax |
| #endif /* F7MOD_D_fminfmax_ */ |
| |
| |
| #ifdef F7MOD_D_sincos_ |
| ;;; void sincos (double R18, double *R16, double *R14); |
| _DEFUN __sincos |
| DALIAS sincos |
| LALIAS sincosl |
| |
| #define n_pushed 4 |
| #define n_frame (2 * F7_SIZEOF) |
| do_prologue_saves n_pushed, n_frame |
| ;; Y = FramePointer + 1 |
| adiw Y, 1 |
| ;; R16 = frame-arg 1 |
| wmov r16, Y |
| ;; The double argument is in R18[]. |
| XCALL F7_NAME (set_double_impl) |
| ;; void f7_sincos (f7_t *ss, f7_t *cc, const f7_t *aa) |
| ;; Note that aa may equal ss or cc. |
| wmov r20, r16 ; aa |
| wmov r24, r16 ; ss = FP + 1 |
| subi r16, lo8(-F7_SIZEOF) |
| sbci r17, hi8(-F7_SIZEOF) |
| wmov r22, r16 ; cc = FP + 1 + F7_SIZEOF |
| XCALL F7_NAME (sincos) |
| |
| ;; double R18 = get_double (cc) |
| wmov r24, r16 |
| XCALL F7_NAME (get_double) |
| wmov XL, r14 ; double *pcos |
| rcall store.r18.X ; *pcos = R18 |
| |
| ;; double R18 = get_double (ss) |
| wmov r24, Y |
| XCALL F7_NAME (get_double) |
| ldd XL, Y + n_frame + 3 ; Saved R16 |
| ldd XH, Y + n_frame + 2 ; Saved R17 |
| rcall store.r18.X ; *psin = R18 |
| |
| do_epilogue_restores n_pushed, n_frame |
| |
| store.r18.X: |
| st X+, r18 |
| st X+, r19 |
| st X+, r20 |
| st X+, r21 |
| st X+, r22 |
| st X+, r23 |
| st X+, r24 |
| st X+, r25 |
| ret |
| _ENDF __sincos |
| #endif /* F7MOD_D_sincos_ */ |
| |
| #ifdef F7MOD_call_dd_ |
| |
| ;; Provide double wrappers for functions that operate on f7_t and get f7_t*. |
| ;; |
| ;; We set up a frame of sizeof(f7_t), convert the input double in R18[] to |
| ;; f7_t in that frame location, then call *Z and finally convert the result f7_t |
| ;; to double R18[] if that's requested. |
| ;; |
| ;; call_dd: double func (double A) |
| ;; void (*Z) (f7_t *aa, const f7_t *aa) |
| ;; |
| ;; call_dx: double func (type_t A) , sizeof(type_t) <= 4 |
| ;; void (*Z) (f7_t *aa, type_t) |
| ;; |
| ;; call_xd: type_t func (double A) |
| ;; type_t (*Z) (const f7_t *aa) |
| ;; |
| ;; call_ddx: double func (double A, word_t) , sizeof (word_t) <= 2 |
| ;; void (*Z) (f7_t *aa, const f7_t *aa, word_t) |
| |
| #define WHAT R13 |
| |
| DEFUN call_dd ; WHAT = R13 = 3 |
| inc ZERO |
| LABEL call_xd ; WHAT = R13 = 2 |
| inc ZERO |
| LABEL call_ddx ; WHAT = R13 = 1 |
| inc ZERO |
| LABEL call_dx ; WHAT = R13 = 0 |
| push WHAT |
| mov WHAT, ZERO |
| clr ZERO |
| ;; R14/R15 hold Z, the address of the f7_worker function, until we need it. |
| push r14 |
| push r15 |
| wmov r14, Z |
| |
| #define n_pushed 4 |
| #define n_frame F7_SIZEOF |
| |
| do_prologue_saves n_pushed, n_frame |
| ;; Y = FramePointer + 1 |
| adiw Y, 1 |
| dec WHAT |
| brmi .Ldx ; WHAT was initially 0. |
| ;; FP + 1 = (f7_t) arg1 |
| wmov r16, Y |
| ;; The double argument is in R18[]. |
| XCALL F7_NAME (set_double_impl) |
| tst WHAT |
| brne .Lno.ddx ; WHAT was initially != 1. |
| ;; call_ddx: Set R20/21 to the 2-byte scalar / pointer argument. |
| ;; Fetch it from where prologue_saves put it. |
| ldd r20, Y + n_frame + 3 ; Saved R16 |
| ldd r21, Y + n_frame + 2 ; Saved R17 |
| .Lno.ddx: |
| wmov r22, Y ; &arg1 (input) |
| .Ldo.dx: |
| wmov r24, Y ; &arg1 (output) |
| wmov Z, r14 |
| XICALL |
| dec WHAT |
| breq .Lepilogue ; WHAT was initially 2: Return non-double. |
| wmov r24, Y ; &arg1 |
| XCALL F7_NAME (get_double) |
| .Lepilogue: |
| ;; + 3 to account for R13...R15 pushed prior to do_prologue_saves. |
| do_epilogue_restores n_pushed + 3, n_frame |
| |
| .Ldx: |
| ;; call_dx: Copy the 4-byte input scalar from R22[4] to R20[4]. |
| wmov r20, r22 |
| wmov r22, r24 |
| rjmp .Ldo.dx |
| |
| ENDF call_dd |
| #endif /* F7MOD_call_dd_ */ |
| |
| |
| #ifdef F7MOD_call_ddd_ |
| |
| ;; Provide double wrappers for functions that operate on f7_t and get f7_t*. |
| ;; |
| ;; We set up a frame of 2 * sizeof(f7_t), convert the input doubles in R18[] |
| ;; and R10[] to f7_t in these frame locations, then call *Z and finally |
| ;; convert the result f7_t to double R18[] if that's requested. |
| ;; |
| ;; call_ddd: double func (double A, double B) |
| ;; void (*Z) (f7_t *aa, const f7_t *aa, const f7_t *bb) |
| ;; |
| ;; call_xdd: type_t func (double A, double B) |
| ;; type_t (*Z) (const f7_t *aa, const f7_t *bb) |
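| ;; |
| ;; The ZERO push below (1 for call_ddd, 0 for call_xdd) is read back inside |
| ;; call.2 via the frame pointer and decides whether the f7_t result has to |
| ;; be converted back to double. |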
| |
| DEFUN call_ddd |
| inc ZERO |
| LABEL call_xdd |
| ;; R8/R9 hold Z, the address of the f7_worker function, until we need it. |
| push r9 |
| push r8 |
| wmov r8, Z |
| ;; This is an argument to call.2 and will be accessed via the frame pointer. |
| push ZERO |
| clr ZERO |
| rcall call.2 |
| pop TMP |
| pop r8 |
| pop r9 |
| ret |
| |
| #define n_pushed 4 |
| #define n_frame (2 * F7_SIZEOF) |
| |
| call.2: |
| do_prologue_saves n_pushed, n_frame |
| ;; Y = FramePointer + 1 |
| adiw Y, 1 |
| ;; FP + 1 = (f7_t) arg1 |
| wmov r16, Y |
| ;; First double argument is already in R18[]. |
| XCALL F7_NAME (set_double_impl) |
| ;; FP + 11 = (f7_t) arg2 |
| subi r16, lo8 (-F7_SIZEOF) |
| sbci r17, hi8 (-F7_SIZEOF) |
| ;; Move second double argument to R18[]. |
| wmov r18, r10 |
| wmov r20, r12 |
| wmov r22, r14 |
| ;; Get high word of arg2 from where prologue_saves put it. |
| ldd r24, Y + n_frame + 3 ; Saved R16 |
| ldd r25, Y + n_frame + 2 ; Saved R17 |
| XCALL F7_NAME (set_double_impl) |
| ;; Z (f7_t *arg1, const f7_t *arg1, const f7_t *arg2) |
| wmov Z, r8 |
| wmov r24, Y ; &arg1 |
| ;; WHAT == 0 => call_xdd |
| ;; WHAT != 0 => call_ddd |
| ldd TMP, Y + n_frame + n_pushed + PC_SIZE |
| tst TMP |
| breq .Lxdd |
| wmov r22, Y ; &arg1 |
| wmov r20, r16 ; &arg2 |
| XICALL |
| wmov r24, Y ; &arg1 |
| XCALL F7_NAME (get_double) |
| .Lepilogue: |
| do_epilogue_restores n_pushed, n_frame |
| .Lxdd: |
| wmov r22, r16 ; &arg2 |
| XICALL |
| rjmp .Lepilogue |
| ENDF call_ddd |
| #endif /* F7MOD_call_ddd_ */ |
| |
| #include "f7-wraps.h" |
| |
| ;;; Some additional, singular wraps that don't match any pattern. |
| |
| ;; double __powidf2 (double, int) ; __builtin_powi |
| #ifdef F7MOD_D_powi_ |
| _DEFUN __powidf2 |
| .global F7_NAME(powi) |
| ldi ZH, hi8(gs(F7_NAME(powi))) |
| ldi ZL, lo8(gs(F7_NAME(powi))) |
| F7jmp call_ddx |
| _ENDF __powidf2 |
| #endif /* F7MOD_D_powi_ */ |
| |
| #endif /* !AVR_TINY */ |