/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 sw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ This program is free software; you can redistribute it and/or modify         │
│ it under the terms of the GNU General Public License as published by         │
│ the Free Software Foundation; version 2 of the License.                      │
│                                                                              │
│ This program is distributed in the hope that it will be useful, but          │
│ WITHOUT ANY WARRANTY; without even the implied warranty of                   │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU             │
│ General Public License for more details.                                     │
│                                                                              │
│ You should have received a copy of the GNU General Public License            │
│ along with this program; if not, write to the Free Software                  │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA                │
│ 02110-1301 USA                                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/nexgen32e/x86feature.h"
#include "libc/nexgen32e/macros.h"
#include "libc/macros.h"
.source	__FILE__

/	Returns pointer to first instance of character.
/
/	This is an entry stub. It sets r9 to all ones and then joins
/	strchr at its local label 0, which lies past the prologue and
/	past the instruction that clears r9. With r9 all ones, the mask
/	strsak applies when the scan stops on the NUL terminator leaves
/	the pointer unchanged, so the NUL position itself is returned.
/
/	@param	rdi is a non-null NUL-terminated string pointer
/	@param	esi is the search byte
/	@return	rax points to character, or to NUL byte if not found
/	@note	this won't return NULL if search character is NUL
strchrnul:
/	NOTE(review): .leafprologue, .profilable and .endfn are project
/	macros defined outside this file; presumably frame setup, a
/	profiling hook and symbol bookkeeping. Confirm in libc/macros.h.
	.leafprologue
	.profilable
/	r9 = -1, i.e. keep the pointer when the second stop byte is hit
	or	$-1,%r9
/	forward reference to the 0 label inside strchr, which must stay
/	the next definition of that label after this point
	jmp	0f
	.endfn	strchrnul,globl

/	Returns pointer to first instance of character, the BSD way.
/
/	There is no jump or return after the nop, so execution falls
/	straight through into strchr. strchr must therefore remain the
/	definition that immediately follows this one.
/
/	@param	rdi is a non-null NUL-terminated string pointer
/	@param	esi is the search byte
/	@return	rax points to first result, or NULL if not found
/	@note	this won't return NULL if search character is NUL
/	NOTE(review): the nop presumably exists so that index gets an
/	address of its own, distinct from strchr. TODO confirm.
index:	nop
/	𝑠𝑙𝑖𝑑𝑒
	.endfn	index,globl

/	Returns pointer to first instance of character.
/
/	Register setup for strsak: stop on the search byte or on the
/	NUL terminator, no byte limit, pointer result, and a result of
/	NULL when the terminator is reached first. r10 is left unset
/	because strsak only reads it once the byte budget has run out.
/
/	@param	rdi is a non-null NUL-terminated string pointer
/	@param	esi is the search byte
/	@return	rax points to first result, or NULL if not found
/	@note	this won't return NULL if search character is NUL
/	@asyncsignalsafe
strchr:	.leafprologue
	.profilable
/	r9 = 0, so strsak masks the result to NULL if NUL is hit first
	xor	%r9d,%r9d
/	strchrnul joins at label 0 with r9 already set to all ones.
/	The zero extension puts the search byte in dl and leaves dh = 0,
/	which makes NUL the second stop character.
0:	movzbl	%sil,%edx
/	rsi = all ones, so the byte budget is effectively unlimited
	or	$-1,%rsi
/	r8 = 0, so nothing is subtracted and rax stays a pointer
	xor	%r8,%r8
	jmp	strsak
	.endfn	strchr,globl

/	Returns pointer to first instance of character, with no limit.
/
/	Loads an all ones length into rdx and falls through into memchr,
/	which must remain the definition that immediately follows.
/
/	@param	rdi is a non-null pointer to memory
/	@param	esi is the search byte
/	@return	rax points to byte if found, or else undefined behavior
rawmemchr:
/	rdx = all ones, standing in for the length argument of memchr
	or	$-1,%rdx
/	𝑠𝑙𝑖𝑑𝑒
	.endfn	rawmemchr,globl

/	Returns pointer to first instance of character in range.
/
/	Register setup for strsak: both stop characters are the search
/	byte, the byte budget is the caller supplied length, and the
/	result becomes NULL when the budget runs out. r9 is left unset
/	because with dl equal to dh a match always takes the unmasked
/	exit in strsak, and the exhausted exit overwrites r9 with r10.
/
/	@param	rdi is a non-null pointer to memory
/	@param	esi is the search byte
/	@param	rdx is length of memory in bytes
/	@return	rax points to byte if found or NULL
/	@asyncsignalsafe
memchr:	.leafprologue
	.profilable
/	after the swap rsi = length and dl = search byte
	xchg	%rsi,%rdx
/	second stop character is the same byte as the first
	mov	%dl,%dh
/	r8 = 0, so rax stays a pointer
	xor	%r8,%r8
/	r10 = 0, so running out of bytes yields NULL
	xor	%r10,%r10
	jmp	strsak
	.endfn	memchr,globl

/	Returns length of NUL-terminated string w/ security blankets.
/
/	This is like strnlen() except it'll return 0 if (1) RDI is NULL
/	or (2) a NUL-terminator wasn't found in the first RSI bytes.
/
/	A non-null pointer joins strnlen at its local label 0 with r10
/	already zero, which is what turns the not found case into 0.
/
/	@param	rdi is a nullable NUL-terminated string pointer
/	@param	rsi is the maximum number of bytes to consider
/	@return	rax is the number of bytes, excluding the NUL
strnlen_s:
	.leafprologue
	.profilable
/	rax = 0 is the answer handed back for a null pointer
	xor	%eax,%eax
/	r10 = 0, so strsak masks the length to 0 when no NUL is found
	xor	%r10d,%r10d
	test	%rdi,%rdi
/	forward reference to the 0 label inside strnlen
	jnz	0f
/	NOTE(review): .leafepilogue is a project macro; presumably it
/	undoes .leafprologue and returns. Confirm in libc/macros.h.
	.leafepilogue
	.endfn	strnlen_s,globl

/	Returns length of NUL-terminated memory, with limit.
/
/	Register setup for strsak: both stop characters are NUL, the
/	start pointer is subtracted from the end pointer to give a
/	length, and that length is kept when the byte budget runs out.
/	There is no jump at the end; execution falls through into
/	strsak, which must remain the next definition.
/
/	@param	rdi is non-null memory
/	@param	rsi is the maximum number of bytes to consider
/	@return	rax is the number of bytes, excluding the NUL
/	@asyncsignalsafe
strnlen:.leafprologue
	.profilable
/	r10 = -1, so an exhausted budget still reports the byte count
	or	$-1,%r10
/	strnlen_s joins at label 0 with r10 = 0 instead.
/	edx = 0 makes dl = dh = 0, i.e. both stop characters are NUL.
0:	xor	%edx,%edx
/	r8 = start pointer, subtracted by strsak to produce a length
	mov	%rdi,%r8
/	𝑠𝑙𝑖𝑑𝑒
	.endfn	strnlen,globl

/	Swiss army knife of string character scanning.
/	Fourteen fast functions in one.
/
/	The scan walks one byte at a time until the cursor is 32-byte
/	aligned. From there it compares 32 bytes per iteration, using
/	AVX2 or else two 16-byte SSE2 halves, for as long as more than
/	32 bytes of budget remain. A vector hit, or a budget of 32 bytes
/	or fewer, hands control back to the byte loop, which is the only
/	place that decides which of the two characters ended the scan.
/
/	Every exit from the AVX2 loop executes vzeroupper, so the upper
/	ymm halves are clean again before control reaches the caller.
/	The SSE2 loop has its own exit and never runs a VEX instruction.
/
/	@param	rdi is non-null string memory
/	@param	rsi is max number of bytes to consider
/	@param	dl is search character #1
/	@param	dh is search character #2
/	@param	r8 is subtracted from result (for length vs. pointer)
/	@param	r9 masks result if DH is found (for NUL vs. NULL)
/	@param	r10 masks result on bytes exhausted (for length v. NULL)
/	@return	rax end pointer after r8/r9/r10 modifications
/	@note	clobbers rcx, rsi, r9, r11, xmm0 thru xmm4, and flags
/	rax starts one byte early because the loop increments first
strsak:	lea	-1(%rdi),%rax
/	Byte loop: advance the cursor and spend one byte of budget.
/	A borrow means the budget was already zero.
1:	add	$1,%rax
	sub	$1,%rsi
	jb	.Lend
/	a 32-byte aligned cursor may switch to the vector loop
	test	$31,%al
	jz	.Lfast
.Lbyte:	mov	(%rax),%cl
/	character #1 wins when both match, and its result is unmasked
	cmp	%cl,%dl
	je	.Ldone
/	character #2 ends the scan with the r9 mask applied
	cmp	%cl,%dh
	je	.Lnul
	jmp	1b
.Ldone:	sub	%r8,%rax
	jmp	.Lret
/	budget exhausted: r10 takes the place of r9 as the mask
.Lend:	mov	%r10,%r9
.Lnul:	sub	%r8,%rax
	and	%r9,%rax
/	NOTE(review): .leafepilogue is a project macro; presumably it
/	undoes the .leafprologue of the entry stub and returns.
.Lret:	.leafepilogue
/	AVX2 loop ran out of budget. Clear the upper ymm halves before
/	finishing in scalar code; vzeroupper leaves the flags alone and
/	is only ever reached from the AVX2 loop.
.Lvzslow:
	vzeroupper
/	Fewer than 32 bytes were left after the charge for the current
/	byte, so refund the 32 just subtracted and finish bytewise.
.Lslow:	add	$32,%rsi
	jmp	.Lbyte
/	Vector setup: low byte of xmm0 = character #1, xmm1 = #2.
.Lfast:	movzbl	%dl,%ecx
	movd	%ecx,%xmm0
	movzbl	%dh,%ecx
	movd	%ecx,%xmm1
/	bias the cursor so that both loop heads can add 32 first
	sub	$32,%rax
#if !X86_NEED(AVX2)
/	take the SSE2 loop when the AVX2 feature bit in kCpuids is clear
	testb	X86_HAVE(AVX2)+kCpuids(%rip)
	jz	.Lsse2
#endif
	vpbroadcastb %xmm0,%ymm0
	vpbroadcastb %xmm1,%ymm1
/	AVX2 loop: the aligned load is only done once the 32-byte charge
/	succeeds, i.e. all 32 bytes lie inside the budget.
1:	add	$32,%rax
	sub	$32,%rsi
	jb	.Lvzslow
	vmovdqa	(%rax),%ymm2
	vpcmpeqb %ymm0,%ymm2,%ymm3
	vpcmpeqb %ymm1,%ymm2,%ymm2
	vpor	%ymm3,%ymm2,%ymm2
	vpmovmskb %ymm2,%ecx
/	bsf sets ZF for an empty mask, else ecx = offset of first hit
	bsf	%ecx,%ecx
	je	1b
	vzeroupper
/	Move onto the hit and let the byte loop classify it. The budget
/	in rsi is stale here, but the hit byte always ends the scan in
/	the byte loop before rsi is looked at again.
2:	add	%rcx,%rax
	jmp	.Lbyte
#if !X86_NEED(AVX2)
/	NOTE(review): pbroadcastb is a project macro; presumably it
/	copies the low byte of the register into all sixteen lanes.
.Lsse2:	pbroadcastb %xmm0
	pbroadcastb %xmm1
/	SSE2 loop: same budget rule as above, two 16-byte halves per turn
1:	add	$32,%rax
	sub	$32,%rsi
	jb	.Lslow
	movdqa	(%rax),%xmm2
	movdqa	16(%rax),%xmm3
/	high half: hits for either character go to mask bits 16 thru 31
	movdqa	%xmm3,%xmm4
	pcmpeqb	%xmm0,%xmm3
	pcmpeqb	%xmm1,%xmm4
	por	%xmm4,%xmm3
	pmovmskb %xmm3,%ecx
	shl	$16,%ecx
/	low half: hits for either character go to mask bits 0 thru 15
	movdqa	%xmm2,%xmm4
	pcmpeqb	%xmm0,%xmm2
	pcmpeqb	%xmm1,%xmm4
	por	%xmm4,%xmm2
	pmovmskb %xmm2,%r11d
	or	%r11d,%ecx
	bsf	%ecx,%ecx
	je	1b
	jmp	2b
#endif
	.endfn	strsak,globl,hidden

/*	benchmarked on intel core i7-6700 @ 3.40GHz (skylake)
	includes function call overhead (unless marked otherwise)

	your strlen, &c (strsak+avx2) for #c per n where c ≈ 0.293ns
	N                                 x1         x8        x64   mBps
	------------------------------------------------------------
	1                             47.000     36.375     35.141     99
	1                             35.000     34.625     36.234     96
	2                             31.500     18.812     18.992    184
	3                             19.667     13.042     13.182    265
	4                             30.750     10.281     10.285    339
	7                             15.857      8.946      7.551    462
	8                             12.125      9.203      7.119    490
	15                            10.467      5.475
4.601 758 16 6.812 5.523 4.798 727 31 5.387 4.327 3.517 992 32 4.719 1.645 1.532 2278 63 5.000 2.403 2.034 1715 64 2.047 0.779 0.788 4427 127 2.134 1.194 1.027 3399 128 1.742 0.444 0.419 8327 255 0.945 0.594 0.554 6295 256 0.574 0.271 0.264 13226 511 0.785 0.362 0.307 11384 512 0.326 0.178 0.151 23134 1023 0.288 0.242 0.185 18862 1024 0.208 0.114 0.107 32565 2047 0.235 0.127 0.123 28430 2048 0.127 0.090 0.084 41413 4095 0.119 0.106 0.099 35116 4096 0.100 0.081 0.079 44372 8191 0.092 0.082 0.081 43176 8192 0.081 0.072 0.071 49419 16383 0.076 0.072 0.071 48847 16384 0.071 0.068 0.067 52381 32767 0.072 0.069 0.068 51154 32768 0.068 0.066 0.065 53409 your tinystrlen() N x1 x8 x64 mBps ------------------------------------------------------------ 1 53.000 33.625 33.672 97 1 33.000 32.125 32.234 101 2 24.500 19.438 17.711 184 3 23.667 12.875 11.911 273 4 13.750 9.281 9.238 352 7 11.000 6.125 5.801 560 8 7.625 5.609 5.232 621 15 11.800 3.825 3.364 966 16 4.562 3.648 3.173 1024 « optimal 31 3.710 2.851 2.298 1414 32 3.031 2.254 2.159 1506 « dropoff 63 2.683 1.827 1.691 1922 64 2.078 1.932 1.689 1924 127 1.630 1.647 1.622 2004 128 1.727 1.671 1.652 1968 255 1.392 1.450 1.435 2265 256 1.473 1.427 1.437 2262 511 1.325 1.353 1.337 2431 512 1.408 1.343 1.337 2431 1023 1.289 1.281 1.287 2525 1024 1.269 1.295 1.297 2506 2047 1.269 1.274 1.269 2561 2048 1.280 1.263 1.281 2538 4095 1.262 1.270 1.266 2568 4096 1.270 1.264 1.265 2570 8191 1.253 1.254 1.254 2592 8192 1.219 1.224 1.225 2653 16383 1.225 1.222 1.220 2663 16384 1.226 1.221 1.222 2659 32767 1.227 1.224 1.223 2658 32768 1.220 1.221 1.222 2659 glibc strlen for #c per n where c ≈ 0.273ns N x1 x8 x64 mBps ------------------------------------------------------------ 1 3497.000 53.125 42.641 82 1 69.000 44.875 42.547 82 2 45.500 24.188 21.852 160 3 23.000 15.625 14.557 240 4 22.250 11.406 10.637 328 7 10.143 6.768 6.230 560 8 11.125 5.797 5.486 636 15 5.800 3.142 2.859 1220 16 7.062 3.070 2.737 1275 31 2.806 1.585 1.407 2481 32 
3.156 1.574 1.349 2587 63 2.016 0.895 0.691 5049 64 1.328 0.744 0.670 5207 127 1.441 0.521 0.407 8577 128 0.648 0.454 0.405 8619 255 0.553 0.286 0.214 16277 256 0.387 0.235 0.218 15984 511 0.456 0.151 0.129 27077 512 0.182 0.134 0.129 27117 1023 0.171 0.106 0.082 42795 1024 0.112 0.088 0.082 42741 2047 0.099 0.069 0.059 59537 2048 0.072 0.060 0.058 59925 4095 0.065 0.053 0.047 74122 4096 0.061 0.048 0.047 74478 8191 0.048 0.045 0.044 79117 8192 0.051 0.045 0.044 79181 16383 0.042 0.040 0.061 57018 16384 0.069 0.063 0.061 57245 32767 0.081 0.073 0.068 51426 32768 0.084 0.072 0.068 51285 GCC strlen (-Os REPNZ SCASB) for #c per n where c ≈ 0.293ns N x1 x8 x64 mBps ------------------------------------------------------------ 1 103.000 84.125 88.766 37 1 81.000 85.125 87.328 37 2 43.500 44.562 45.508 71 3 33.000 30.208 30.995 105 4 24.750 23.156 23.113 141 7 17.000 13.054 15.355 212 8 13.375 14.047 13.982 232 15 9.533 9.258 55.111 59 16 6.312 6.352 6.364 511 31 4.032 4.141 4.141 785 32 3.969 4.059 4.048 803 63 2.937 2.970 2.995 1086 64 2.922 2.939 2.956 1100 127 2.386 2.408 2.403 1353 128 2.383 2.403 2.401 1354 255 2.129 2.118 2.124 1530 256 2.137 2.133 2.130 1526 511 1.982 1.986 3.351 970 512 1.982 1.990 1.986 1637 1023 1.915 1.916 2.587 1257 1024 1.868 1.867 1.866 1742 2047 1.835 1.833 1.832 1775 2048 1.830 1.831 1.832 1775 4095 1.814 1.814 1.815 1791 4096 1.810 1.815 1.815 1791 8191 1.805 1.807 1.806 1800 8192 1.805 1.806 1.806 1800 16383 1.803 1.756 1.756 1851 16384 1.758 1.756 1.756 1851 32767 1.756 1.754 1.754 1853 32768 1.756 1.754 1.754 1853 Intel Optimz. 
Manual (SSE4.2) for #c per n where c ≈ 0.273ns N x1 x8 x64 mBps ------------------------------------------------------------ 1 37.000 43.125 34.078 102 1 33.000 33.875 34.016 103 2 39.500 17.188 17.555 199 3 18.333 12.208 12.036 290 4 30.250 9.344 9.137 382 7 14.429 5.732 5.766 605 8 7.875 6.797 5.354 652 15 10.733 5.825 3.516 993 16 3.812 2.383 2.325 1501 31 4.097 2.609 2.079 1678 32 3.031 1.395 1.349 2587 63 2.937 1.558 1.079 3235 64 2.016 0.893 0.690 5056 127 1.929 0.721 0.607 5745 128 0.617 0.483 0.428 8147 255 1.275 0.404 0.411 8486 256 0.480 0.319 0.299 11681 511 0.479 0.307 0.288 12127 512 0.322 0.244 0.232 15013 1023 0.324 0.224 0.225 15512 1024 0.245 0.240 0.223 15651 2047 0.222 0.213 0.206 16938 2048 0.204 0.194 0.192 18140 4095 0.204 0.188 0.185 18888 4096 0.183 0.179 0.179 19446 8191 0.179 0.176 0.174 20000 8192 0.174 0.172 0.171 20383 16383 0.171 0.170 0.169 20604 16384 0.169 0.169 0.168 20808 32767 0.213 0.225 0.267 13064 32768 0.231 0.215 0.220 15852 musl libc strlen for #c per n where c ≈ 0.273ns N x1 x8 x64 mBps ------------------------------------------------------------ 1 65.000 36.125 37.984 92 1 39.000 37.625 37.422 93 2 41.500 21.938 20.695 169 3 22.333 17.625 15.859 220 4 21.250 13.656 12.105 288 7 22.143 9.018 7.609 459 8 31.125 7.234 7.346 475 15 11.267 5.025 4.709 741 16 9.438 4.039 3.849 907 31 4.871 3.133 2.488 1402 32 5.219 2.246 2.039 1712 63 4.302 1.462 1.407 2479 64 2.109 1.428 1.155 3023 127 1.551 1.078 0.879 3971 128 1.742 0.903 0.760 4591 255 0.922 0.558 0.605 5764 256 0.934 0.575 0.537 6495 511 0.550 0.493 0.455 7674 512 0.646 0.490 0.426 8183 1023 0.550 0.439 0.425 8203 1024 0.472 0.421 0.408 8549 2047 0.507 0.334 0.373 9360 2048 0.403 0.426 0.409 8540 4095 0.391 0.240 0.236 14799 4096 0.238 0.222 0.221 15766 8191 0.225 0.223 0.221 15779 8192 0.225 0.214 0.215 16250 16383 0.212 0.212 0.210 16595 16384 0.209 0.210 0.211 16535 32767 0.214 0.208 0.205 17001 32768 0.207 0.207 0.291 12002 newlib strlen for #c per n where c ≈ 0.273ns 
N x1 x8 x64 mBps ------------------------------------------------------------ 1 33.000 34.625 34.141 102 1 33.000 34.125 33.984 103 2 58.500 18.562 17.508 199 3 16.333 12.792 12.016 290 4 19.250 9.219 9.215 379 7 17.571 6.089 5.685 614 8 16.625 5.078 5.432 642 15 8.467 4.042 3.207 1088 16 3.938 2.773 2.733 1277 31 3.645 1.673 1.598 2183 32 3.281 1.527 1.493 2338 63 2.619 1.042 0.895 3901 64 1.422 0.928 0.813 4294 127 0.984 0.718 0.561 6222 128 1.195 0.591 0.532 6558 255 0.600 0.404 0.397 8785 256 0.621 0.429 0.376 9280 511 0.346 0.311 0.306 11421 512 0.420 0.308 0.296 11776 1023 0.284 0.285 0.285 12237 1024 0.321 0.282 0.280 12456 2047 0.253 0.252 0.252 13864 2048 0.260 0.249 0.249 14012 4095 0.236 0.236 0.236 14811 4096 0.239 0.235 0.234 14906 8191 0.233 0.228 0.227 15371 8192 0.230 0.227 0.227 15397 16383 0.223 0.224 0.223 15638 16384 0.223 0.224 0.223 15663 32767 0.224 0.387 0.225 15527 32768 0.223 0.222 0.222 15724 Agner Fog's strlen (SSE2) for #c per n where c ≈ 0.273ns N x1 x8 x64 mBps ------------------------------------------------------------ 1 59.000 38.375 38.453 91 1 37.000 38.625 38.234 91 2 18.500 19.062 19.273 181 3 13.000 12.792 12.859 271 4 9.250 9.594 9.660 361 7 5.286 5.554 5.502 634 8 4.625 4.703 4.791 728 15 2.600 2.858 2.622 1331 16 2.438 2.414 2.421 1442 31 2.161 1.399 1.290 2706 32 1.219 1.262 1.250 2793 63 1.508 0.875 0.693 5038 64 0.641 0.654 0.655 5328 127 1.205 0.406 0.379 9200 128 0.367 0.372 0.369 9463 255 0.467 0.310 0.235 14835 256 0.230 0.232 0.232 15034 511 0.272 0.181 0.159 21918 512 0.174 0.161 0.158 22148 1023 0.175 0.134 0.120 29043 1024 0.140 0.122 0.120 29005 2047 0.128 0.114 0.112 31205 2048 0.130 0.113 0.112 31242 4095 0.105 0.098 0.097 35984 4096 0.105 0.098 0.097 35973 8191 0.093 0.090 0.090 38953 8192 0.094 0.090 0.090 38986 16383 0.088 0.086 0.086 40648 16384 0.088 0.086 0.086 40652 32767 0.088 0.086 0.085 40956 32768 0.087 0.085 0.085 41114 */