/*-*- mode:asm; indent-tabs-mode:t; tab-width:8; coding:utf-8               -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ This program is free software; you can redistribute it and/or modify         │
│ it under the terms of the GNU General Public License as published by         │
│ the Free Software Foundation; version 2 of the License.                      │
│                                                                              │
│ This program is distributed in the hope that it will be useful, but          │
│ WITHOUT ANY WARRANTY; without even the implied warranty of                   │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU             │
│ General Public License for more details.                                     │
│                                                                              │
│ You should have received a copy of the GNU General Public License            │
│ along with this program; if not, write to the Free Software                  │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA                │
│ 02110-1301 USA                                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.h"

/	Computes Phil Katz CRC-32 w/ carryless multiply isa.
/
/	This is support code that's abstracted by crc32_z().
/
/	The initial value is XORed into the first 16 input bytes exactly
/	as passed in, and the reduced value is returned exactly as
/	computed; no bit inversion happens in this routine, so any pre-
/	or post-conditioning is presumably the caller's job (TODO
/	confirm against crc32_z).
/
/	Whole 64-byte groups are folded four lanes at a time, then any
/	remaining whole 16-byte blocks one at a time. Trailing bytes
/	past the last multiple of 16 are never loaded, hence the %16
/	requirement on rdx below.
/
/	@param	edi is initial value
/	@param	rsi points to buffer
/	@param	rdx is bytes in buffer that's >=64 and %16==0
/	@return	eax is crc32
/	@note	needs Westmere (c.2010) or Bulldozer (c.2011)
/	@note	writes rax, rcx, rdx, rsi, rdi, xmm0-xmm9 and flags, in
/		addition to whatever the prologue/epilogue macros from
/		libc/macros.h touch (not visible here)
/	@see	“Fast CRC Computation for Generic Polynomials Using
/		PCLMULQDQ Instruction” V. Gopal, E. Ozturk, et al.,
/		2009, intel.ly/2ySEwL0
crc32$pclmul:
	.leafprologue
	.profilable

/	Seed four 16-byte accumulators (xmm1, xmm9, xmm4, xmm0) with
/	the first 64 bytes of input and XOR the initial value into the
/	low 32 bits of the first one. Buffer loads use movdqu, so the
/	buffer itself needs no particular alignment. Afterwards:
/	  rdi  = bytes not yet loaded (len - 64)
/	  rcx  = address of the next unread byte
/	  xmm8 = .Lk1k2, the multipliers for the four-lane loop
	movdqu	(%rsi),%xmm7
	movd	%edi,%xmm1
	movdqu	16(%rsi),%xmm9
	movdqu	32(%rsi),%xmm4
	movdqu	48(%rsi),%xmm0
	lea	-64(%rdx),%rdi
	lea	64(%rsi),%rcx
	pxor	%xmm7,%xmm1
	movdqa	.Lk1k2(%rip),%xmm8
/	Fewer than 64 bytes left means the four-lane loop is skipped.
	cmp	$63,%rdi
	jbe	2f

/	Four-lane loop setup. rdx becomes the number of whole 64-byte
/	groups in (len - 128), so the loop below runs rdx + 1 times.
/	rsi is turned into the end pointer that rax, the read cursor
/	(already advanced past the data it loads), is compared with.
	lea	-128(%rdx),%rdi
	mov	%rdi,%rdx
	shr	$6,%rdx
	lea	2(%rdx),%rax
	sal	$6,%rax
	add	%rax,%rsi
	mov	%rcx,%rax

/	Each pass consumes 64 bytes. Every accumulator A is replaced by
/	  clmul(A.lo, xmm8.lo) ^ clmul(A.hi, xmm8.hi) ^ next 16 bytes
/	The work for the four lanes is interleaved, so the statement
/	order below is deliberate; xmm2/3/5/6/7 are scratch.
3:	add	$64,%rax
	movdqa	%xmm1,%xmm7
	movdqa	%xmm4,%xmm5
	movdqa	%xmm0,%xmm3
	movdqa	%xmm9,%xmm6
	movdqa	%xmm9,%xmm2
	movdqu	-48(%rax),%xmm9
	pclmullqlqdq %xmm8,%xmm7
	pclmullqlqdq %xmm8,%xmm6
	pclmullqlqdq %xmm8,%xmm5
	pclmulhqhqdq %xmm8,%xmm1
	pclmulhqhqdq %xmm8,%xmm2
	pclmulhqhqdq %xmm8,%xmm4
	pxor	%xmm7,%xmm1
	movdqu	-64(%rax),%xmm7
	pxor	%xmm6,%xmm2
	pxor	%xmm5,%xmm4
	movdqu	-32(%rax),%xmm6
	movdqu	-16(%rax),%xmm5
	pclmullqlqdq %xmm8,%xmm3
	pclmulhqhqdq %xmm8,%xmm0
	pxor	%xmm7,%xmm1
	pxor	%xmm3,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm6,%xmm4
	pxor	%xmm5,%xmm0
	cmp	%rsi,%rax
	jne	3b

/	Re-derive the bookkeeping for the tail:
/	  rdi = bytes still unread, i.e. (len - 128) - 64 * rdx
/	  rcx = address of the next unread byte
	lea	1(%rdx),%rax
	sal	$6,%rdx
	sal	$6,%rax
	sub	%rdx,%rdi
	add	%rax,%rcx

/	Collapse the four accumulators into xmm0 by folding each one
/	into the next (xmm1 -> xmm9 -> xmm4 -> xmm0) with the same
/	lo*lo ^ hi*hi pattern, this time using .Lk3k4 held in xmm3.
2:	movdqa	.Lk3k4(%rip),%xmm3
	movdqa	%xmm1,%xmm2
	movdqa	%xmm1,%xmm5
	pclmulhqhqdq %xmm3,%xmm2
	pclmullqlqdq %xmm3,%xmm5
	pxor	%xmm9,%xmm2
	pxor	%xmm5,%xmm2
	movdqa	%xmm2,%xmm5
	pclmulhqhqdq %xmm3,%xmm2
	movdqa	%xmm2,%xmm1
	pclmullqlqdq %xmm3,%xmm5
	pxor	%xmm4,%xmm1
	pxor	%xmm5,%xmm1
	movdqa	%xmm1,%xmm2
	pclmulhqhqdq %xmm3,%xmm1
	pclmullqlqdq %xmm3,%xmm2
	pxor	%xmm1,%xmm0
	pxor	%xmm2,%xmm0

/	Fold the remaining whole 16-byte blocks into xmm0 one at a
/	time. rdi is rounded down to a multiple of 16 and rdx becomes
/	the end pointer for the loop, with rax as the read cursor.
	cmp	$15,%rdi
	jbe	4f
	sub	$16,%rdi
	mov	%rcx,%rax
	and	$-16,%rdi
	lea	16(%rcx,%rdi),%rdx
5:	movdqa	%xmm0,%xmm1
	movdqu	(%rax),%xmm6
	pclmulhqhqdq %xmm3,%xmm0
	add	$16,%rax
	pclmullqlqdq %xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pxor	%xmm6,%xmm0
	cmp	%rax,%rdx
	jne	5b

/	Final reduction of the single 128-bit accumulator in xmm0 to
/	the 32-bit result. Uses .Lk3k4 (still in xmm3 for the first
/	multiply), .Lk5k0, the .Lboop mask (keeps the low 32 bits of
/	each quadword) and .Lpoly (loaded over xmm3).
/	NOTE(review): this appears to be the paper's 128 -> 64 -> 32
/	bit fold followed by Barrett reduction; confirm against the
/	reference before touching any operand or shift count.
4:	movdqa	%xmm0,%xmm1
	movdqa	.Lboop(%rip),%xmm2
	psrldq	$8,%xmm0
	pclmullqhqdq %xmm3,%xmm1
	movdqa	.Lpoly(%rip),%xmm3
	pxor	%xmm1,%xmm0
	movdqa	%xmm0,%xmm1
	pand	%xmm2,%xmm0
	pclmullqlqdq .Lk5k0(%rip),%xmm0
	psrldq	$4,%xmm1
	pxor	%xmm0,%xmm1
	movdqa	%xmm1,%xmm0
	pand	%xmm2,%xmm0
	pclmullqhqdq %xmm3,%xmm0
	pand	%xmm2,%xmm0
	pclmullqlqdq %xmm3,%xmm0
	pxor	%xmm1,%xmm0
/	The result is doubleword 1 (bits 32..63) of xmm0.
	pextrd	$1,%xmm0,%eax
	.leafepilogue
	.endfn	crc32$pclmul,globl,hidden

/	Definitions of the bit-reflected domain constants k1,k2,k3, etc.
/	and the CRC32+Barrett polynomials given at the end of the paper.
/
/	Each table is 16 bytes and is read either with movdqa or as a
/	pclmul memory operand, both of which need 16-byte alignment.
/	That alignment is presumably provided by the .rodata.cst16
/	macro (defined outside this file; TODO confirm).
	.rodata.cst16
.Lk1k2:	.quad	0x0000000154442bd4
	.quad	0x00000001c6e41596
	.endobj	.Lk1k2
.Lk3k4:	.quad	0x00000001751997d0
	.quad	0x00000000ccaa009e
	.endobj	.Lk3k4
.Lk5k0:	.quad	0x0000000163cd6124
	.quad	0x0000000000000000
	.endobj	.Lk5k0
.Lboop:	.quad	0x00000000ffffffff
	.quad	0x00000000ffffffff
	.endobj	.Lboop
.Lpoly:	.quad	0x00000001db710641
	.quad	0x00000001f7011641
	.endobj	.Lpoly
	.previous

/*	crc32() w/ pclmul for #c per n where c ≈ 0.293ns
	N               x1        x8       x64    mBps
	------------------------------------------------------------
	1         4437.000    42.375    38.141      85
	1           45.000    39.375    38.234      85
	2           31.500    25.312    23.102     141
	3           25.667    19.792    17.911     181
	4           21.250    16.219    15.035     216
	7           18.429    12.946    11.712     277
	8           16.125    12.578    10.998     296
	15          12.867     9.925     9.161     355
	16          12.438     9.836     9.114     357
	31          11.194     8.528     8.149     399
	32          10.781     8.418     8.098     401
	63           9.063     7.780     7.647     425
	64           3.109     1.604     1.414    2299
	127          2.260     1.824     1.729    1880
	128          1.305     0.860     0.806    4033
	255          1.290     1.001     0.948    3428
	256          0.574     0.491     0.476    6822
	511          0.773     0.571     0.546    5956
	512          0.354     0.320     0.306   10613
	1023         0.425     0.365     0.347    9375
	1024         0.237     0.229     0.231   14097
	2047         0.278     0.251     0.246   13236
	2048         0.187     0.187     0.188   17306
	4095         0.229     0.200     0.194   16761
	4096         0.162     0.170     0.167   19438
	8191         0.182     0.173     0.178   18266
	8192         0.162     0.155     0.158   20560
	16383        0.156     0.162     0.154   21136
	16384        0.156     0.156     0.148   22005
	32767        0.163     0.149     0.149   21768
	32768        0.150     0.146     0.145   22491
	65535        0.158     0.141     0.141   23102
	65536        0.149     0.140     0.138   23478
	131071       0.150     0.145     0.141   23011
	131072       0.148     0.141     0.148   21892
	262143       0.151     0.148     0.147   22136
	262144       0.149     0.146     0.146   22298
	524287       0.150     0.149     0.149   21832
	524288       0.148     0.148     0.147   22043
	1048575      0.148     0.158     0.163   19913
	1048576      0.156     0.179     0.153   21186
	2097151      0.153     0.149     0.148   21979
	2097152      0.147     0.148     0.147   22040
	4194303      0.148     0.148     0.151   21482
	4194304      0.148     0.148     0.147   22061
	8388607      0.185     0.183     0.185   17536
	8388608      0.193     0.180     0.183   17769

	crc32() w/ 10+ year old cpus for #c per n where c ≈ 0.293ns
	N               x1        x8       x64    mBps
	------------------------------------------------------------
	1         4447.000    43.625    37.641      86
	1           41.000    37.125    37.609      86
	2           31.500    26.562    22.477     145
	3           25.000    20.125    17.422     187
	4           21.250    16.594    15.230     213
	7           16.714    13.089    11.717     277
	8           16.875    12.609    11.174     291
	15          12.733     9.958     9.339     348
	16          12.438     9.852     9.208     353
	31          10.935     8.617     8.164     398
	32          10.906     8.496     8.155     399
	63           9.095     7.819     7.692     423
	64           9.172     7.807     7.692     423
	127          8.165     7.531     7.438     437
	128          8.133     7.503     7.437     437
	255          7.714     7.329     7.293     446
	256          7.723     7.348     7.293     446
	511          7.434     7.253     7.223     450
	512          7.412     7.237     7.218     450
	1023         7.274     7.214     7.201     451
	1024         7.292     7.203     7.189     452
	2047         7.232     7.185     7.178     453
	2048         7.239     7.189     7.186     452
	4095         7.189     7.175     7.172     453
	4096         7.192     7.173     7.172     453
	8191         7.187     7.173     7.172     453
	8192         7.183     7.174     7.181     453
	16383        7.175     7.170     7.169     453
	16384        7.176     7.169     7.169     453
	32767        7.169     7.182     7.170     453
	32768        7.173     7.172     7.172     453
	65535        7.170     7.170     7.171     453
	65536        7.172     7.171     7.204     451
	131071       7.170     7.354     7.260     448
	131072       7.172     7.172     7.182     453
	262143       7.037     7.178     7.182     453
	262144       7.169     7.343     7.205     451
	524287       7.438     7.170     7.206     451
	524288       7.169     7.164     7.209     451
	1048575      6.995     7.119     7.158     454
	1048576      7.168     7.110     7.157     454
	2097151      7.057     7.058     7.065     460
	2097152      6.977     7.047     7.089     458
	4194303      7.017     7.504     7.030     462
	4194304      7.025     7.059     7.030     462
	8388607      7.082     6.980     6.997     464
	8388608      7.051     6.985     6.999     464 */

	.yoink	__FILE__