207 lines
12 KiB
ArmAsm
207 lines
12 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
|
│vi: set et ft=asm ts=8 sw=8 fenc=utf-8 :vi│
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
|
│ │
|
|
│ This program is free software; you can redistribute it and/or modify │
|
|
│ it under the terms of the GNU General Public License as published by │
|
|
│ the Free Software Foundation; version 2 of the License. │
|
|
│ │
|
|
│ This program is distributed in the hope that it will be useful, but │
|
|
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
|
|
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
|
|
│ General Public License for more details. │
|
|
│ │
|
|
│ You should have received a copy of the GNU General Public License │
|
|
│ along with this program; if not, write to the Free Software │
|
|
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
|
|
│ 02110-1301 USA │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/macros.h"
|
|
|
|
/ Compares memory.
|
|
/
|
|
/ @param rdi pointer to first memory block
|
|
/ @param rsi pointer to second memory block
|
|
/ @param rdx byte size
|
|
/ @return unsigned char subtraction at stop index
|
|
/ @asyncsignalsafe
|
|
/* Public entry point: an indirect jump through the pointer stored at
   __memcmp (loaded %rip-relative). Because this is a jmp and not a call,
   no return address is pushed and no register is modified here, so the
   target receives the caller's arguments untouched and its ret goes
   straight back to memcmp's caller.
   NOTE(review): __memcmp is not defined in this file; presumably it is
   set elsewhere to one of the implementations benchmarked below (avx2
   or sse2) -- confirm against its definition. */
memcmp: jmp *__memcmp(%rip)
|
|
/* NOTE(review): .endfn is a macro defined outside this file (presumably
   via libc/macros.h); it appears to close the memcmp symbol and give it
   global visibility -- confirm against the macro definition. */
.endfn memcmp,globl
|
|
/* NOTE(review): .source is likewise defined elsewhere; presumably it
   records the originating file name (__FILE__) -- confirm. */
.source __FILE__
|
|
|
|
/* cosmo memcmp() avx2 for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 61.000 39.375 36.984 95
|
|
1 37.000 37.625 37.391 94
|
|
2 28.500 19.688 19.930 175
|
|
3 20.333 13.625 14.411 243
|
|
4 30.250 10.656 10.426 335
|
|
7 15.000 7.304 6.136 570
|
|
8 10.125 6.234 5.525 633
|
|
15 9.133 3.542 3.570 980
|
|
16 6.062 4.398 3.577 977
|
|
31 4.548 2.931 2.340 1494
|
|
32 2.594 1.520 1.492 2344
|
|
63 3.444 1.240 1.221 2864
|
|
64 1.328 0.736 0.742 4713
|
|
127 1.661 0.710 0.605 5778
|
|
128 0.820 0.452 0.396 8822
|
|
255 0.639 0.360 0.347 10080
|
|
256 0.434 0.250 0.220 15874
|
|
511 0.413 0.218 0.199 17612
|
|
512 0.201 0.176 0.138 25377
|
|
1023 0.216 0.142 0.125 28031
|
|
1024 0.132 0.097 0.096 36276
|
|
2047 0.125 0.091 0.091 38466
|
|
2048 0.093 0.079 0.075 46365
|
|
4095 0.084 0.081 0.078 44705
|
|
4096 0.069 0.069 0.069 50819
|
|
8191 0.070 0.068 0.067 51841
|
|
8192 0.063 0.062 0.062 56633
|
|
16383 0.066 0.063 0.061 56994
|
|
16384 0.059 0.058 0.058 60021
|
|
32767 0.131 0.104 0.100 34909
|
|
32768 0.120 0.084 0.079 44282
|
|
|
|
cosmo memcmp() sse2 (old cpu) for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 59.000 37.125 37.328 94
|
|
1 35.000 37.375 36.359 96
|
|
2 28.500 18.938 20.461 171
|
|
3 19.000 12.875 13.234 264
|
|
4 29.250 10.906 10.348 338
|
|
7 11.571 6.304 6.404 546
|
|
8 8.125 5.672 5.713 612
|
|
15 11.533 4.492 3.759 930
|
|
16 5.812 3.227 2.876 1216
|
|
31 5.516 2.367 1.797 1946
|
|
32 2.969 1.816 1.481 2361
|
|
63 3.413 0.990 0.929 3763
|
|
64 1.703 0.850 0.763 4580
|
|
127 1.614 0.531 0.533 6556
|
|
128 0.961 0.438 0.426 8205
|
|
255 0.922 0.378 0.325 10745
|
|
256 0.457 0.322 0.268 13035
|
|
511 0.331 0.253 0.216 16223
|
|
512 0.287 0.212 0.189 18460
|
|
1023 0.220 0.172 0.164 21378
|
|
1024 0.198 0.159 0.150 23357
|
|
2047 0.161 0.152 0.150 23271
|
|
2048 0.147 0.139 0.136 25732
|
|
4095 0.135 0.130 0.129 27157
|
|
4096 0.129 0.123 0.123 28499
|
|
8191 0.122 0.116 0.116 30110
|
|
8192 0.116 0.113 0.113 30863
|
|
16383 0.117 0.112 0.112 31311
|
|
16384 0.111 0.110 0.110 31802
|
|
32767 0.157 0.138 0.136 25653
|
|
32768 0.144 0.121 0.118 29590
|
|
|
|
glibc memcmp() for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 6875.000 39.125 35.141 100
|
|
1 33.000 35.375 35.078 100
|
|
2 138.500 20.312 18.570 188
|
|
3 26.333 13.958 12.536 279
|
|
4 53.250 12.094 9.512 368
|
|
7 13.571 5.554 5.708 613
|
|
8 19.625 5.328 5.057 691
|
|
15 6.867 3.075 2.801 1248
|
|
16 9.062 2.555 2.526 1384
|
|
31 4.484 1.319 1.313 2663
|
|
32 3.906 1.285 1.299 2691
|
|
63 2.143 0.863 0.719 4867
|
|
64 1.234 0.814 0.718 4873
|
|
127 2.071 0.493 0.428 8174
|
|
128 0.523 0.427 0.421 8310
|
|
255 0.882 0.302 0.250 13983
|
|
256 0.465 0.258 0.266 13143
|
|
511 0.417 0.189 0.164 21339
|
|
512 0.209 0.170 0.160 21862
|
|
1023 0.320 0.120 0.111 31391
|
|
1024 0.128 0.115 0.112 31106
|
|
2047 0.110 0.092 0.088 39803
|
|
2048 0.098 0.088 0.086 40837
|
|
4095 0.093 0.078 0.076 46281
|
|
4096 0.081 0.076 0.075 46400
|
|
8191 0.080 0.071 0.069 50984
|
|
8192 0.075 0.069 0.069 50970
|
|
16383 0.083 0.071 0.068 51591
|
|
16384 0.072 0.071 0.068 51736
|
|
32767 0.145 0.136 0.121 28805
|
|
32768 0.145 0.139 0.137 25469
|
|
|
|
musl memcmp() for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 55.000 37.625 34.484 101
|
|
1 35.000 33.625 34.203 102
|
|
2 37.500 24.562 18.648 188
|
|
3 20.333 13.625 12.766 274
|
|
4 32.750 11.531 9.527 367
|
|
7 12.714 8.482 5.828 600
|
|
8 13.125 6.234 5.330 656
|
|
15 9.000 4.892 3.391 1031
|
|
16 5.188 4.102 3.335 1048
|
|
31 4.806 2.899 2.295 1524
|
|
32 4.406 2.801 2.208 1584
|
|
63 3.794 1.808 1.689 2070
|
|
64 2.672 1.994 1.675 2088
|
|
127 1.961 1.739 1.648 2122
|
|
128 2.055 1.610 1.614 2167
|
|
255 1.463 1.381 1.401 2496
|
|
256 1.457 1.362 1.385 2525
|
|
511 1.286 1.351 1.226 2853
|
|
512 1.256 1.255 1.253 2791
|
|
1023 1.207 1.184 1.180 2964
|
|
1024 1.204 1.146 1.174 2978
|
|
2047 1.134 1.126 1.152 3036
|
|
2048 1.134 1.123 1.149 3044
|
|
4095 1.124 1.108 1.138 3074
|
|
4096 1.117 1.107 1.136 3077
|
|
8191 1.106 1.103 1.102 3174
|
|
8192 1.105 1.102 1.267 2760
|
|
16383 1.110 1.103 1.099 3182
|
|
16384 1.108 1.100 1.098 3184
|
|
32767 1.101 1.097 1.126 3105
|
|
32768 1.128 1.130 1.126 3105
|
|
|
|
newlib memcmp() for #c per n where c ≈ 0.273ns
|
|
N x1 x8 x64 mBps
|
|
------------------------------------------------------------
|
|
1 73.000 39.625 36.297 96
|
|
1 35.000 35.375 35.328 99
|
|
2 41.500 19.438 18.508 189
|
|
3 29.667 13.542 13.005 269
|
|
4 22.750 10.656 10.332 338
|
|
7 14.714 6.875 6.248 560
|
|
8 18.125 6.453 5.846 598
|
|
15 11.533 3.575 3.547 986
|
|
16 8.062 3.461 2.880 1214
|
|
31 3.839 2.931 2.689 1300
|
|
32 5.594 1.848 1.589 2200
|
|
63 3.667 2.387 2.242 1560
|
|
64 2.078 1.170 0.842 4153
|
|
127 2.228 2.111 2.126 1644
|
|
128 1.617 0.669 0.510 6858
|
|
255 2.059 1.960 1.964 1781
|
|
256 0.590 0.398 0.335 10452
|
|
511 1.841 1.814 1.811 1931
|
|
512 0.373 0.275 0.252 13860
|
|
1023 1.788 1.748 2.426 1441
|
|
1024 0.261 0.230 0.226 15474
|
|
2047 1.745 1.731 1.774 1971
|
|
2048 0.218 0.199 0.197 17741
|
|
4095 1.771 1.764 1.763 1983
|
|
4096 0.187 0.177 0.181 19353
|
|
8191 1.722 1.714 1.714 2040
|
|
8192 0.173 0.174 0.173 20252
|
|
16383 1.754 1.754 1.845 1895
|
|
16384 0.175 0.171 0.169 20692
|
|
32767 1.753 1.753 1.753 1995
|
|
32768 0.186 0.173 0.170 20510 */
|