Greg Kroah-Hartman | b244131 | 2017-11-01 15:07:57 +0100 | [diff] [blame] | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
Eric Dumazet | 6a2d7a9 | 2006-12-13 00:34:27 -0800 | [diff] [blame] | 2 | #ifndef _LINUX_RECIPROCAL_DIV_H |
| 3 | #define _LINUX_RECIPROCAL_DIV_H |
| 4 | |
| 5 | #include <linux/types.h> |
| 6 | |
/*
 * This algorithm is based on the paper "Division by Invariant
 * Integers Using Multiplication" by Torbjörn Granlund and Peter
 * L. Montgomery.
 *
 * The assembler implementation from Agner Fog, which this code is
 * based on, can be found here:
 * http://www.agner.org/optimize/asmlib.zip
 *
 * This optimization for A/B is helpful if the divisor B is mostly
 * runtime invariant. The reciprocal of B is calculated in the
 * slow-path with reciprocal_value(). The fast-path can then just use
 * a much faster multiplication operation with a variable dividend A
 * to calculate the division A/B.
 */
| 22 | |
Hannes Frederic Sowa | 809fa97 | 2014-01-22 02:29:41 +0100 | [diff] [blame] | 23 | struct reciprocal_value { |
| 24 | u32 m; |
| 25 | u8 sh1, sh2; |
| 26 | }; |
Eric Dumazet | 6a2d7a9 | 2006-12-13 00:34:27 -0800 | [diff] [blame] | 27 | |
/* "reciprocal_value" and "reciprocal_divide" together implement the basic
 * version of the algorithm described in Figure 4.1 of the paper.
 *
 * "reciprocal_value" is the slow path: it calculates the reciprocal of the
 * divisor "d" once. The returned value is then handed to "reciprocal_divide"
 * for every dividend that has to be divided by "d".
 */
struct reciprocal_value reciprocal_value(u32 d);
Eric Dumazet | 6a2d7a9 | 2006-12-13 00:34:27 -0800 | [diff] [blame] | 32 | |
Hannes Frederic Sowa | 809fa97 | 2014-01-22 02:29:41 +0100 | [diff] [blame] | 33 | static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R) |
Eric Dumazet | 6a2d7a9 | 2006-12-13 00:34:27 -0800 | [diff] [blame] | 34 | { |
Hannes Frederic Sowa | 809fa97 | 2014-01-22 02:29:41 +0100 | [diff] [blame] | 35 | u32 t = (u32)(((u64)a * R.m) >> 32); |
| 36 | return (t + ((a - t) >> R.sh1)) >> R.sh2; |
Eric Dumazet | 6a2d7a9 | 2006-12-13 00:34:27 -0800 | [diff] [blame] | 37 | } |
Hannes Frederic Sowa | 809fa97 | 2014-01-22 02:29:41 +0100 | [diff] [blame] | 38 | |
Jiong Wang | 06ae482 | 2018-07-06 15:13:18 -0700 | [diff] [blame] | 39 | struct reciprocal_value_adv { |
| 40 | u32 m; |
| 41 | u8 sh, exp; |
| 42 | bool is_wide_m; |
| 43 | }; |
| 44 | |
/* "reciprocal_value_adv" implements the advanced version of the algorithm
 * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
 * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
 * exception case could be easily handled before calling "reciprocal_value_adv".
 *
 * The advanced version requires more complex calculation to get the reciprocal
 * multiplier and other control variables, but then could reduce the required
 * emulation operations.
 *
 * It makes no sense to use this advanced version for host divide emulation:
 * the extra complexity of calculating the multiplier etc. could completely
 * cancel out the saving on emulation operations.
 *
 * However, it makes sense to use it for JIT divide code generation, where we
 * are willing to spend more time on the host in exchange for faster JITed
 * code. As shown by the following pseudo code, the required emulation
 * operations could go down from 6 (the basic version) to 3 or 4.
 *
 * "d" is the divisor and "prec" is the precision in bits; the pseudo code
 * below passes 32, or "32 - pre_shift" once the dividend has been pre-shifted.
 *
 * To use the result of "reciprocal_value_adv", suppose we want to calculate
 * n/d, the pseudo C code will be:
 *
 *	struct reciprocal_value_adv rvalue;
 *	u8 pre_shift, exp;
 *
 *	// handle exception case: the quotient can only be 0 or 1.
 *	if (d >= (1U << 31)) {
 *		result = n >= d;
 *		return;
 *	}
 *
 *	rvalue = reciprocal_value_adv(d, 32);
 *	exp = rvalue.exp;
 *	if (rvalue.is_wide_m && !(d & 1)) {
 *		// floor(log2(d & (2^32 - d)))
 *		pre_shift = fls(d & -d) - 1;
 *		rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
 *	} else {
 *		pre_shift = 0;
 *	}
 *
 *	// code generation starts.
 *	if (d == 1U << exp) {
 *		result = n >> exp;
 *	} else if (rvalue.is_wide_m) {
 *		// pre_shift must be zero when reached here.
 *		t = ((u64)n * rvalue.m) >> 32;
 *		result = n - t;
 *		result >>= 1;
 *		result += t;
 *		result >>= rvalue.sh - 1;
 *	} else {
 *		// a pre_shift of zero leaves n unchanged.
 *		result = n >> pre_shift;
 *		result = ((u64)result * rvalue.m) >> 32;
 *		result >>= rvalue.sh;
 *	}
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
| 103 | |
Hannes Frederic Sowa | 809fa97 | 2014-01-22 02:29:41 +0100 | [diff] [blame] | 104 | #endif /* _LINUX_RECIPROCAL_DIV_H */ |