|
| 1 | +/* Copyright (c) 2019, MariaDB Corporation. |
| 2 | +
|
| 3 | + This program is free software; you can redistribute it and/or modify |
| 4 | + it under the terms of the GNU General Public License as published by |
| 5 | + the Free Software Foundation; version 2 of the License. |
| 6 | +
|
| 7 | + This program is distributed in the hope that it will be useful, |
| 8 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 9 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 10 | + GNU General Public License for more details. |
| 11 | +
|
| 12 | + You should have received a copy of the GNU General Public License |
| 13 | + along with this program; if not, write to the Free Software |
| 14 | + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
| 15 | + |
| 16 | +#include <my_global.h> |
| 17 | +#include <my_cpu.h> |
| 18 | + |
| 19 | +#ifdef HAVE_PAUSE_INSTRUCTION |
/**
  Number of times the PAUSE instruction is to be invoked in a loop.
  Starts out as 200; my_cpu_init() may reduce it to 20.
*/
unsigned int my_cpu_relax_multiplier= 200;
| 22 | + |
| 23 | +# include <stdint.h> |
| 24 | + |
| 25 | +# ifdef _MSC_VER |
| 26 | +# include <intrin.h> |
| 27 | +# else |
| 28 | +# include <x86intrin.h> |
| 29 | +# endif |
| 30 | + |
/*
  Unrolled sequences of 4 and 16 MY_RELAX_CPU() invocations.

  Each macro expands to a single do { ... } while (0) statement, so that
  it stays one unit when used as the body of an unbraced if/else or loop.
  Invoke with a trailing semicolon: PAUSE16;
*/
#define PAUSE4 \
  do { \
    MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU(); MY_RELAX_CPU(); \
  } while (0)
#define PAUSE16 \
  do { \
    PAUSE4; PAUSE4; PAUSE4; PAUSE4; \
  } while (0)
| 33 | + |
| 34 | +/** |
| 35 | + Initialize my_cpu_relax_multiplier. |
| 36 | +
|
| 37 | + Determine the duration of a PAUSE instruction by running an |
| 38 | + unrolled loop of 16 PAUSE instructions twice, and taking the |
| 39 | + faster of the two runs. In this way, even if the execution is |
| 40 | + interrupted by the operating system, it should be extremely |
| 41 | + unlikely that both loops get interrupted. |
| 42 | +
|
| 43 | + On the Intel Skylake microarchitecture, the PAUSE instruction takes |
| 44 | + around 140 clock cycles, while on earlier microarchitectures it could |
| 45 | + be 10 clock cycles or less. Scale the PAUSE loop counter accordingly. |
| 46 | +
|
| 47 | + On a pre-Skylake Intel Xeon CPU E5-2630 v4 @ 2.20GHz running an AMD64 |
| 48 | + executable, the numbers would be between 172 and 220 when all the code |
| 49 | + is inlined as follows: |
| 50 | +
|
| 51 | + rdtsc,mov,shl,or, 16*pause, |
| 52 | + rdtsc,mov,shl,or, 16*pause, |
| 53 | + rdtsc. |
| 54 | +
|
| 55 | + That would yield 11 to 14 cycles per PAUSE instruction even if we |
| 56 | + (wrongly) ignore the overhead of the other instructions. |
| 57 | +
|
| 58 | + On a Skylake mobile processor Intel Core i7-6500U CPU @ 2.50GHz, the |
| 59 | + numbers would range from 1896 to 2410 (or 1976 if taking the minimum |
| 60 | + of two runs), yielding 118 to 151 (or 123) cycles per PAUSE instruction. |
| 61 | +
|
| 62 | + Let us define a threshold at roughly 30 cycles per PAUSE instruction, |
| 63 | + and use a shorter delay if the PAUSE instruction takes longer than |
| 64 | + that. In some AMD processors, the PAUSE instruction could take 40 or |
| 65 | + 50 cycles. Let us use a shorter delay multiplier for them as well. |
| 66 | +
|
| 67 | + The 1/10 scaling factor (200/20) was derived experimentally by |
| 68 | + Mikhail Sinyavin from Intel. |
| 69 | +*/ |
| 70 | +void my_cpu_init(void) |
| 71 | +{ |
| 72 | + uint64_t t0, t1, t2; |
| 73 | + t0= __rdtsc(); |
| 74 | + PAUSE16; |
| 75 | + t1= __rdtsc(); |
| 76 | + PAUSE16; |
| 77 | + t2= __rdtsc(); |
| 78 | + if (t2 - t1 > 30 * 16 && t1 - t0 > 30 * 16) |
| 79 | + my_cpu_relax_multiplier= 20; |
| 80 | +} |
| 81 | +#endif |
0 commit comments