cosmopolitan/libc/nexgen32e/memcmp.S

207 lines
12 KiB
ArmAsm

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
This program is free software; you can redistribute it and/or modify │
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License. │
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of │
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software │
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
/ Compares memory.
/
/ This entry point holds no comparison logic of its own. It is a single
/ indirect jump through the code pointer stored at the symbol __memcmp,
/ addressed relative to %rip. Because it is a jump and not a call, no
/ new return address is pushed, so the routine reached through the
/ pointer returns straight to whoever called memcmp.
/
/ NOTE(review): __memcmp is not defined in this view. Presumably it is
/ initialized elsewhere to one of the implementations benchmarked in
/ the comment that follows (avx2, or sse2 for older cpus) - confirm.
/
/ NOTE(review): the parameters below are named with 32-bit registers.
/ Under the System V AMD64 convention pointers and sizes travel in the
/ full 64-bit rdi, rsi and rdx - confirm against the implementation
/ before relying on the narrower names.
/
/ @param edi first string
/ @param esi second string
/ @param edx byte size
/ @return unsigned char subtraction at stop index
/ @asyncsignalsafe
memcmp: jmp *__memcmp(%rip)
/ NOTE(review): .endfn and .source are not defined in this view; they
/ presumably come from libc/macros.h. By their arguments, .endfn looks
/ like it closes the memcmp symbol and exports it (globl), and .source
/ looks like it records the originating file name - verify.
.endfn memcmp,globl
.source __FILE__
/* cosmo memcmp() avx2 for #c per n where c 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 61.000 39.375 36.984 95
1 37.000 37.625 37.391 94
2 28.500 19.688 19.930 175
3 20.333 13.625 14.411 243
4 30.250 10.656 10.426 335
7 15.000 7.304 6.136 570
8 10.125 6.234 5.525 633
15 9.133 3.542 3.570 980
16 6.062 4.398 3.577 977
31 4.548 2.931 2.340 1494
32 2.594 1.520 1.492 2344
63 3.444 1.240 1.221 2864
64 1.328 0.736 0.742 4713
127 1.661 0.710 0.605 5778
128 0.820 0.452 0.396 8822
255 0.639 0.360 0.347 10080
256 0.434 0.250 0.220 15874
511 0.413 0.218 0.199 17612
512 0.201 0.176 0.138 25377
1023 0.216 0.142 0.125 28031
1024 0.132 0.097 0.096 36276
2047 0.125 0.091 0.091 38466
2048 0.093 0.079 0.075 46365
4095 0.084 0.081 0.078 44705
4096 0.069 0.069 0.069 50819
8191 0.070 0.068 0.067 51841
8192 0.063 0.062 0.062 56633
16383 0.066 0.063 0.061 56994
16384 0.059 0.058 0.058 60021
32767 0.131 0.104 0.100 34909
32768 0.120 0.084 0.079 44282
cosmo memcmp() sse2 (old cpu) for #c per n where c 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 59.000 37.125 37.328 94
1 35.000 37.375 36.359 96
2 28.500 18.938 20.461 171
3 19.000 12.875 13.234 264
4 29.250 10.906 10.348 338
7 11.571 6.304 6.404 546
8 8.125 5.672 5.713 612
15 11.533 4.492 3.759 930
16 5.812 3.227 2.876 1216
31 5.516 2.367 1.797 1946
32 2.969 1.816 1.481 2361
63 3.413 0.990 0.929 3763
64 1.703 0.850 0.763 4580
127 1.614 0.531 0.533 6556
128 0.961 0.438 0.426 8205
255 0.922 0.378 0.325 10745
256 0.457 0.322 0.268 13035
511 0.331 0.253 0.216 16223
512 0.287 0.212 0.189 18460
1023 0.220 0.172 0.164 21378
1024 0.198 0.159 0.150 23357
2047 0.161 0.152 0.150 23271
2048 0.147 0.139 0.136 25732
4095 0.135 0.130 0.129 27157
4096 0.129 0.123 0.123 28499
8191 0.122 0.116 0.116 30110
8192 0.116 0.113 0.113 30863
16383 0.117 0.112 0.112 31311
16384 0.111 0.110 0.110 31802
32767 0.157 0.138 0.136 25653
32768 0.144 0.121 0.118 29590
glibc memcmp() for #c per n where c 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 6875.000 39.125 35.141 100
1 33.000 35.375 35.078 100
2 138.500 20.312 18.570 188
3 26.333 13.958 12.536 279
4 53.250 12.094 9.512 368
7 13.571 5.554 5.708 613
8 19.625 5.328 5.057 691
15 6.867 3.075 2.801 1248
16 9.062 2.555 2.526 1384
31 4.484 1.319 1.313 2663
32 3.906 1.285 1.299 2691
63 2.143 0.863 0.719 4867
64 1.234 0.814 0.718 4873
127 2.071 0.493 0.428 8174
128 0.523 0.427 0.421 8310
255 0.882 0.302 0.250 13983
256 0.465 0.258 0.266 13143
511 0.417 0.189 0.164 21339
512 0.209 0.170 0.160 21862
1023 0.320 0.120 0.111 31391
1024 0.128 0.115 0.112 31106
2047 0.110 0.092 0.088 39803
2048 0.098 0.088 0.086 40837
4095 0.093 0.078 0.076 46281
4096 0.081 0.076 0.075 46400
8191 0.080 0.071 0.069 50984
8192 0.075 0.069 0.069 50970
16383 0.083 0.071 0.068 51591
16384 0.072 0.071 0.068 51736
32767 0.145 0.136 0.121 28805
32768 0.145 0.139 0.137 25469
musl memcmp() for #c per n where c 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 55.000 37.625 34.484 101
1 35.000 33.625 34.203 102
2 37.500 24.562 18.648 188
3 20.333 13.625 12.766 274
4 32.750 11.531 9.527 367
7 12.714 8.482 5.828 600
8 13.125 6.234 5.330 656
15 9.000 4.892 3.391 1031
16 5.188 4.102 3.335 1048
31 4.806 2.899 2.295 1524
32 4.406 2.801 2.208 1584
63 3.794 1.808 1.689 2070
64 2.672 1.994 1.675 2088
127 1.961 1.739 1.648 2122
128 2.055 1.610 1.614 2167
255 1.463 1.381 1.401 2496
256 1.457 1.362 1.385 2525
511 1.286 1.351 1.226 2853
512 1.256 1.255 1.253 2791
1023 1.207 1.184 1.180 2964
1024 1.204 1.146 1.174 2978
2047 1.134 1.126 1.152 3036
2048 1.134 1.123 1.149 3044
4095 1.124 1.108 1.138 3074
4096 1.117 1.107 1.136 3077
8191 1.106 1.103 1.102 3174
8192 1.105 1.102 1.267 2760
16383 1.110 1.103 1.099 3182
16384 1.108 1.100 1.098 3184
32767 1.101 1.097 1.126 3105
32768 1.128 1.130 1.126 3105
newlib memcmp() for #c per n where c 0.273ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 73.000 39.625 36.297 96
1 35.000 35.375 35.328 99
2 41.500 19.438 18.508 189
3 29.667 13.542 13.005 269
4 22.750 10.656 10.332 338
7 14.714 6.875 6.248 560
8 18.125 6.453 5.846 598
15 11.533 3.575 3.547 986
16 8.062 3.461 2.880 1214
31 3.839 2.931 2.689 1300
32 5.594 1.848 1.589 2200
63 3.667 2.387 2.242 1560
64 2.078 1.170 0.842 4153
127 2.228 2.111 2.126 1644
128 1.617 0.669 0.510 6858
255 2.059 1.960 1.964 1781
256 0.590 0.398 0.335 10452
511 1.841 1.814 1.811 1931
512 0.373 0.275 0.252 13860
1023 1.788 1.748 2.426 1441
1024 0.261 0.230 0.226 15474
2047 1.745 1.731 1.774 1971
2048 0.218 0.199 0.197 17741
4095 1.771 1.764 1.763 1983
4096 0.187 0.177 0.181 19353
8191 1.722 1.714 1.714 2040
8192 0.173 0.174 0.173 20252
16383 1.754 1.754 1.845 1895
16384 0.175 0.171 0.169 20692
32767 1.753 1.753 1.753 1995
32768 0.186 0.173 0.170 20510 */