cosmopolitan/libc/nexgen32e/crc32-pclmul.S

264 lines
11 KiB
ArmAsm

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
This program is free software; you can redistribute it and/or modify │
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License. │
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of │
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software │
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
/ Computes Phil Katz CRC-32 w/ carryless multiply isa.
/
/ This is support code that's abstracted by crc32_z().
/
/ @param edi is initial value
/ @param rsi points to buffer
/ @param rdx is byte length of buffer, which must be >=64 and %16==0
/ @return eax is crc32
/ @note needs Westmere (c.2010) or Bulldozer (c.2011)
/ @see “Fast CRC Computation for Generic Polynomials Using
/ PCLMULQDQ Instruction” V. Gopal, E. Ozturk, et al.,
/ 2009, intel.ly/2ySEwL0
/* Folds the buffer 64 bytes at a time with carry-less multiplies, then
   collapses the four 16-byte accumulators into one, consumes any
   remaining whole 16-byte blocks, and finally reduces the 128-bit
   remainder to the 32-bit result returned in %eax.

   Register roles, as used by the code below:
     %edi    initial crc on entry; %rdi is then reused for the
             remaining-byte arithmetic
     %rsi    buffer pointer; later the stop address of the 64-byte loop
     %rdx    byte count; later a 64-byte block counter, then the stop
             address of the 16-byte loop
     %rcx    address of the first byte that has not been loaded yet
     %rax    read cursor in both loops; %eax holds the return value
     %xmm1, %xmm9, %xmm4, %xmm0
             four 16-byte accumulators, in buffer order
     %xmm8   .Lk1k2 (multipliers for the 64-byte loop)
     %xmm3   scratch in the 64-byte loop, then .Lk3k4, then .Lpoly
     %xmm2   scratch, then the .Lboop mask during the final reduction
     %xmm5, %xmm6, %xmm7
             scratch

   Mnemonic note: pclmullqlqdq multiplies the low quadwords of both
   operands and pclmulhqhqdq the high quadwords (pclmulqdq with
   immediate 0x00 and 0x11); pclmullqhqdq (immediate 0x10) multiplies
   the low quadword of the destination by the high quadword of the
   source.

   Writes %rax, %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm9 and the flags.
   Only whole 16-byte blocks are read, so a length that is not a
   multiple of 16 would leave its tail bytes out of the result. */
crc32$pclmul:
/* .leafprologue and .profilable are macros from libc/macros.h; their
   expansion is not visible in this file. */
.leafprologue
.profilable
/* Load the first 64 bytes into the four accumulators and xor the
   initial crc into the low 32 bits of the first one. */
movdqu (%rsi),%xmm7
movd %edi,%xmm1
movdqu 16(%rsi),%xmm9
movdqu 32(%rsi),%xmm4
movdqu 48(%rsi),%xmm0
/* rdi = bytes left after the first 64; rcx = address of those bytes */
lea -64(%rdx),%rdi
lea 64(%rsi),%rcx
pxor %xmm7,%xmm1
movdqa .Lk1k2(%rip),%xmm8
/* Fewer than 64 bytes left (unsigned compare): skip the 64-byte loop. */
cmp $63,%rdi
jbe 2f
/* Loop setup: rdi = len-128, rdx = rdi/64, so the loop below runs
   rdx+1 times. rsi = buf+(rdx+2)*64 is the cursor value at which the
   loop stops; the cursor rax starts at buf+64. */
lea -128(%rdx),%rdi
mov %rdi,%rdx
shr $6,%rdx
lea 2(%rdx),%rax
sal $6,%rax
add %rax,%rsi
mov %rcx,%rax
/* 64-byte loop. Each accumulator A becomes
     clmul(lo(A),lo(k1k2)) ^ clmul(hi(A),hi(k1k2)) ^ next 16 input bytes
   The cursor is advanced first, so the block being consumed sits at
   -64(%rax) .. -1(%rax). */
3: add $64,%rax
/* Copies for the low-quadword products; xmm9 is copied twice because
   it is overwritten with fresh input right away. */
movdqa %xmm1,%xmm7
movdqa %xmm4,%xmm5
movdqa %xmm0,%xmm3
movdqa %xmm9,%xmm6
movdqa %xmm9,%xmm2
movdqu -48(%rax),%xmm9
pclmullqlqdq %xmm8,%xmm7
pclmullqlqdq %xmm8,%xmm6
pclmullqlqdq %xmm8,%xmm5
pclmulhqhqdq %xmm8,%xmm1
pclmulhqhqdq %xmm8,%xmm2
pclmulhqhqdq %xmm8,%xmm4
pxor %xmm7,%xmm1
movdqu -64(%rax),%xmm7
pxor %xmm6,%xmm2
pxor %xmm5,%xmm4
movdqu -32(%rax),%xmm6
movdqu -16(%rax),%xmm5
pclmullqlqdq %xmm8,%xmm3
pclmulhqhqdq %xmm8,%xmm0
/* Mix the freshly loaded input into each accumulator. */
pxor %xmm7,%xmm1
pxor %xmm3,%xmm0
pxor %xmm2,%xmm9
pxor %xmm6,%xmm4
pxor %xmm5,%xmm0
cmp %rsi,%rax
jne 3b
/* After the loop: rdi = (len-128) % 64 is the byte count still to be
   processed, and rcx advances past the (rdx+1)*64 bytes just
   consumed. */
lea 1(%rdx),%rax
sal $6,%rdx
sal $6,%rax
sub %rdx,%rdi
add %rax,%rcx
/* Collapse four accumulators into one using .Lk3k4:
     xmm2 = fold(xmm1) ^ xmm9
     xmm1 = fold(xmm2) ^ xmm4
     xmm0 = fold(xmm1) ^ xmm0
   where fold(A) = clmul(lo(A),lo(k3k4)) ^ clmul(hi(A),hi(k3k4)). */
2: movdqa .Lk3k4(%rip),%xmm3
movdqa %xmm1,%xmm2
movdqa %xmm1,%xmm5
pclmulhqhqdq %xmm3,%xmm2
pclmullqlqdq %xmm3,%xmm5
pxor %xmm9,%xmm2
pxor %xmm5,%xmm2
movdqa %xmm2,%xmm5
pclmulhqhqdq %xmm3,%xmm2
movdqa %xmm2,%xmm1
pclmullqlqdq %xmm3,%xmm5
pxor %xmm4,%xmm1
pxor %xmm5,%xmm1
movdqa %xmm1,%xmm2
pclmulhqhqdq %xmm3,%xmm1
pclmullqlqdq %xmm3,%xmm2
pxor %xmm1,%xmm0
pxor %xmm2,%xmm0
/* Fewer than 16 bytes left (unsigned compare): skip the 16-byte loop. */
cmp $15,%rdi
jbe 4f
/* rdx = rcx + (rdi rounded down to a multiple of 16): stop address of
   the 16-byte loop. rax is the read cursor. */
sub $16,%rdi
mov %rcx,%rax
and $-16,%rdi
lea 16(%rcx,%rdi),%rdx
/* 16-byte loop: xmm0 = fold(xmm0) ^ next 16 input bytes, still using
   .Lk3k4 in xmm3. */
5: movdqa %xmm0,%xmm1
movdqu (%rax),%xmm6
pclmulhqhqdq %xmm3,%xmm0
add $16,%rax
pclmullqlqdq %xmm3,%xmm1
pxor %xmm1,%xmm0
pxor %xmm6,%xmm0
cmp %rax,%rdx
jne 5b
/* Final reduction of the 128-bit value in xmm0.
   Step 1: xmm0 = (xmm0 >> 64 bits) ^ clmul(lo(xmm0), hi(k3k4)).
   xmm3 still holds .Lk3k4 for the multiply and is reloaded with
   .Lpoly right after it. */
4: movdqa %xmm0,%xmm1
movdqa .Lboop(%rip),%xmm2
psrldq $8,%xmm0
pclmullqhqdq %xmm3,%xmm1
movdqa .Lpoly(%rip),%xmm3
pxor %xmm1,%xmm0
/* Step 2: xmm1 = (xmm0 >> 32 bits) ^ clmul(xmm0 & mask, lo(k5k0));
   the .Lboop mask keeps the low 32 bits of each quadword. */
movdqa %xmm0,%xmm1
pand %xmm2,%xmm0
pclmullqlqdq .Lk5k0(%rip),%xmm0
psrldq $4,%xmm1
pxor %xmm0,%xmm1
/* Step 3: two masked multiplies by the .Lpoly pair (described in this
   file as the CRC32+Barrett polynomials), high quadword first and then
   low quadword, xored back into xmm1. */
movdqa %xmm1,%xmm0
pand %xmm2,%xmm0
pclmullqhqdq %xmm3,%xmm0
pand %xmm2,%xmm0
pclmullqlqdq %xmm3,%xmm0
pxor %xmm1,%xmm0
/* The result is dword 1 (bits 32..63) of xmm0. */
pextrd $1,%xmm0,%eax
.leafepilogue
.endfn crc32$pclmul,globl,hidden
/ Definitions of the bit-reflected domain constants k1,k2,k3, etc.
/ and the CRC32+Barrett polynomials given at the end of the paper.
.rodata.cst16
/* Each constant below is one 16-byte value: the first operand of .quad
   is the low quadword and the second is the high quadword. All of them
   are read with aligned 16-byte accesses (movdqa or a pclmul memory
   operand) by crc32$pclmul. */

/* Multiplier pair loaded into %xmm8 for the 64-byte folding loop. */
.Lk1k2:
.quad 0x0000000154442bd4,0x00000001c6e41596
.endobj .Lk1k2

/* Multiplier pair used for the 4-to-1 fold and the 16-byte loop; its
   high quadword is also used by the first final-reduction step. */
.Lk3k4:
.quad 0x00000001751997d0,0x00000000ccaa009e
.endobj .Lk3k4

/* Multiplier for the second final-reduction step; only the low
   quadword is multiplied, the high quadword is zero. */
.Lk5k0:
.quad 0x0000000163cd6124,0x0000000000000000
.endobj .Lk5k0

/* Mask that keeps the low 32 bits of each quadword. */
.Lboop:
.quad 0x00000000ffffffff,0x00000000ffffffff
.endobj .Lboop

/* Polynomial pair used by the last two multiplies of the reduction. */
.Lpoly:
.quad 0x00000001db710641,0x00000001f7011641
.endobj .Lpoly
.previous
/* crc32() w/ pclmul for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4437.000 42.375 38.141 85
1 45.000 39.375 38.234 85
2 31.500 25.312 23.102 141
3 25.667 19.792 17.911 181
4 21.250 16.219 15.035 216
7 18.429 12.946 11.712 277
8 16.125 12.578 10.998 296
15 12.867 9.925 9.161 355
16 12.438 9.836 9.114 357
31 11.194 8.528 8.149 399
32 10.781 8.418 8.098 401
63 9.063 7.780 7.647 425
64 3.109 1.604 1.414 2299
127 2.260 1.824 1.729 1880
128 1.305 0.860 0.806 4033
255 1.290 1.001 0.948 3428
256 0.574 0.491 0.476 6822
511 0.773 0.571 0.546 5956
512 0.354 0.320 0.306 10613
1023 0.425 0.365 0.347 9375
1024 0.237 0.229 0.231 14097
2047 0.278 0.251 0.246 13236
2048 0.187 0.187 0.188 17306
4095 0.229 0.200 0.194 16761
4096 0.162 0.170 0.167 19438
8191 0.182 0.173 0.178 18266
8192 0.162 0.155 0.158 20560
16383 0.156 0.162 0.154 21136
16384 0.156 0.156 0.148 22005
32767 0.163 0.149 0.149 21768
32768 0.150 0.146 0.145 22491
65535 0.158 0.141 0.141 23102
65536 0.149 0.140 0.138 23478
131071 0.150 0.145 0.141 23011
131072 0.148 0.141 0.148 21892
262143 0.151 0.148 0.147 22136
262144 0.149 0.146 0.146 22298
524287 0.150 0.149 0.149 21832
524288 0.148 0.148 0.147 22043
1048575 0.148 0.158 0.163 19913
1048576 0.156 0.179 0.153 21186
2097151 0.153 0.149 0.148 21979
2097152 0.147 0.148 0.147 22040
4194303 0.148 0.148 0.151 21482
4194304 0.148 0.148 0.147 22061
8388607 0.185 0.183 0.185 17536
8388608 0.193 0.180 0.183 17769
crc32() w/ 10+ year old cpus for #c per n where c ≈ 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4447.000 43.625 37.641 86
1 41.000 37.125 37.609 86
2 31.500 26.562 22.477 145
3 25.000 20.125 17.422 187
4 21.250 16.594 15.230 213
7 16.714 13.089 11.717 277
8 16.875 12.609 11.174 291
15 12.733 9.958 9.339 348
16 12.438 9.852 9.208 353
31 10.935 8.617 8.164 398
32 10.906 8.496 8.155 399
63 9.095 7.819 7.692 423
64 9.172 7.807 7.692 423
127 8.165 7.531 7.438 437
128 8.133 7.503 7.437 437
255 7.714 7.329 7.293 446
256 7.723 7.348 7.293 446
511 7.434 7.253 7.223 450
512 7.412 7.237 7.218 450
1023 7.274 7.214 7.201 451
1024 7.292 7.203 7.189 452
2047 7.232 7.185 7.178 453
2048 7.239 7.189 7.186 452
4095 7.189 7.175 7.172 453
4096 7.192 7.173 7.172 453
8191 7.187 7.173 7.172 453
8192 7.183 7.174 7.181 453
16383 7.175 7.170 7.169 453
16384 7.176 7.169 7.169 453
32767 7.169 7.182 7.170 453
32768 7.173 7.172 7.172 453
65535 7.170 7.170 7.171 453
65536 7.172 7.171 7.204 451
131071 7.170 7.354 7.260 448
131072 7.172 7.172 7.182 453
262143 7.037 7.178 7.182 453
262144 7.169 7.343 7.205 451
524287 7.438 7.170 7.206 451
524288 7.169 7.164 7.209 451
1048575 6.995 7.119 7.158 454
1048576 7.168 7.110 7.157 454
2097151 7.057 7.058 7.065 460
2097152 6.977 7.047 7.089 458
4194303 7.017 7.504 7.030 462
4194304 7.025 7.059 7.030 462
8388607 7.082 6.980 6.997 464
8388608 7.051 6.985 6.999 464 */
.source __FILE__