cosmopolitan/libc/nexgen32e/crc32c-sse42.c

97 lines
5.7 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ This program is free software; you can redistribute it and/or modify │
│ it under the terms of the GNU General Public License as published by │
│ the Free Software Foundation; version 2 of the License. │
│ │
│ This program is distributed in the hope that it will be useful, but │
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
│ General Public License for more details. │
│ │
│ You should have received a copy of the GNU General Public License │
│ along with this program; if not, write to the Free Software │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
│ 02110-1301 USA │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/str/internal.h"
/**
 * Computes CRC-32C of a buffer using the SSE4.2 crc32 instruction.
 *
 * The running value is inverted on entry and again on exit, so the
 * result of one call can be passed as `init` to the next in order to
 * checksum data that arrives in pieces. Pass 0 to start a new checksum.
 *
 * Throughput is about 10GBps on large buffers; see table below.
 *
 * @param init is 0, or the value returned by a previous call
 * @param data points to n readable bytes
 * @param n is number of bytes to hash
 * @return updated checksum
 * @note needs Nehalem+ c. 2008 or Bulldozer+ c. 2011
 * @note NOTE(review): nothing here checks for sse4.2 support;
 *     presumably callers dispatch on cpuid first -- confirm
 */
optimizespeed uint32_t crc32c$sse42(uint32_t init, const void *data, size_t n) {
  /* may_alias lets us read the caller's bytes eight at a time without
     breaking strict aliasing, while still giving the asm a plain
     memory operand the compiler can fold into the instruction */
  typedef uint64_t aliasing_u64 __attribute__((__may_alias__));
  const unsigned char *p = (const unsigned char *)data;
  const unsigned char *pe = p + n;
  uint32_t h = init ^ 0xffffffff;
  /* 16 + 8: the alignment loop eats at most 7 bytes, so at this size
     at least one full 16-byte iteration is guaranteed afterwards;
     anything shorter just goes through the byte loop at the bottom */
  if (n >= 16 + 8) {
    uint64_t hl;
    /* advance to an 8-byte boundary so quadword loads are aligned */
    while ((uintptr_t)p & 7) {
      asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p));
      ++p;
    }
    /* crc32q takes a 64-bit destination register */
    hl = h;
    /* two quadwords per iteration; the bound is inclusive so that a
       remainder of exactly 16 bytes is still handled here rather than
       one byte at a time */
    while ((size_t)(pe - p) >= 16) {
      asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const aliasing_u64 *)p));
      asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const aliasing_u64 *)(p + 8)));
      p += 16;
    }
    /* at most 15 bytes remain; take one more quadword if it fits */
    if ((size_t)(pe - p) >= 8) {
      asm("crc32q\t%1,%0" : "+r"(hl) : "rm"(*(const aliasing_u64 *)p));
      p += 8;
    }
    h = (uint32_t)hl;
  }
  /* short inputs and the final 0..7 trailing bytes */
  while (p < pe) {
    asm("crc32b\t%1,%0" : "+r"(h) : "rm"(*p));
    ++p;
  }
  return h ^ 0xffffffff;
}
/*
bench_crc32c$sse42 for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 877.000 43.375 40.359 81
1 45.000 39.625 40.484 80
2 34.500 27.562 20.461 159
3 23.000 16.708 14.245 228
4 18.250 13.094 11.449 284
7 10.429 8.339 8.185 397
8 42.125 8.734 6.850 475
15 9.400 5.375 4.884 665
16 7.312 5.070 4.882 666
31 5.258 2.923 2.680 1213
32 3.969 2.676 2.562 1269
63 3.095 1.581 1.428 2276
64 2.234 1.623 1.478 2199
127 1.205 0.901 0.900 3610
128 1.164 0.960 0.915 3552
255 0.922 0.651 0.618 5260
256 0.715 0.650 0.609 5341
511 0.558 0.482 0.477 6819
512 0.529 0.475 0.469 6932
1023 0.425 0.400 0.396 8204
1024 0.417 0.392 0.388 8383
2047 0.367 0.355 0.353 9199
2048 0.374 0.366 0.364 8929
4095 0.351 0.338 0.337 9644
4096 0.353 0.338 0.338 9624
8191 0.335 0.338 0.337 9641
8192 0.335 0.329 0.329 9870
16383 0.336 0.325 0.325 10011
16384 0.336 0.326 0.375 8666
32767 0.329 0.323 0.323 10070
32768 0.327 0.324 0.323 10062
65535 0.322 0.322 0.322 10103
65536 0.321 0.322 0.322 10102
131071 0.322 0.321 0.321 10125
131072 0.321 0.321 0.321 10124
262143 0.322 0.321 0.335 9699
262144 0.321 0.321 0.321 10134
524287 0.321 0.321 0.499 6516
524288 0.321 0.321 0.339 9575
1048575 0.322 0.321 0.322 10095
1048576 0.320 1.001 0.323 10048
2097151 0.325 0.321 0.322 10086
2097152 0.330 0.320 0.323 10076
4194303 0.331 0.322 0.321 10128
4194304 0.332 0.321 0.325 10004
8388607 0.334 0.332 0.331 9829
8388608 0.334 0.329 0.327 9934
*/