/*-*- mode:asm; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ │vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ ╞══════════════════════════════════════════════════════════════════════════════╡ │ Copyright 2020 Justine Alexandra Roberts Tunney │ │ │ │ This program is free software; you can redistribute it and/or modify │ │ it under the terms of the GNU General Public License as published by │ │ the Free Software Foundation; version 2 of the License. │ │ │ │ This program is distributed in the hope that it will be useful, but │ │ WITHOUT ANY WARRANTY; without even the implied warranty of │ │ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │ │ General Public License for more details. │ │ │ │ You should have received a copy of the GNU General Public License │ │ along with this program; if not, write to the Free Software │ │ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │ │ 02110-1301 USA │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/macros.h" / Multiplies matrices. 
/*
 * dgemm_ - matrix multiply routine, x86-64, AT&T syntax.
 *
 * NOTE(review): the instruction stream below looks like optimizing
 * compiler output. It is reproduced unchanged; only comments were
 * added.
 *
 * Calling convention: System V AMD64. Every parameter is used as an
 * address (by-reference arguments): six arrive in registers, seven
 * on the stack. After the prologue (six pushes plus sub of 248) the
 * first stack argument is found at 304(%rsp).
 *
 * The bracketed names are the conventional BLAS DGEMM parameter names.
 * They are an assumption - TODO confirm against the C prototype.
 *
 *   arg1   %rdi      -> %r15      option letter tested against "N","C","T"   [TRANSA]
 *   arg2   %rsi      -> %r13      option letter tested against "N","C","T"   [TRANSB]
 *   arg3   %rdx      -> (%rsp)    pointer to 32-bit int, kept in %r9d        [M]
 *   arg4   %rcx      -> 8(%rsp)   pointer to 32-bit int, kept in %eax        [N]
 *   arg5   %r8       -> %r14      pointer to 32-bit int, kept in %r13d       [K]
 *   arg6   %r9       -> 24(%rsp)  pointer to double, loaded into %xmm2       [ALPHA]
 *   arg7   304(%rsp) -> %rbx      base of a double array (read)              [A]
 *   arg8   312(%rsp) -> 40(%rsp)  pointer to 32-bit int (value cached)       [LDA]
 *   arg9   320(%rsp) -> %r11      base of a double array (read)              [B]
 *   arg10  328(%rsp) -> 32(%rsp)  pointer to 32-bit int (value cached)       [LDB]
 *   arg11  336(%rsp)              pointer to double                          [BETA]
 *   arg12  344(%rsp) -> %rbp      base of a double array (read and written)  [C]
 *   arg13  352(%rsp) -> 16(%rsp)  pointer to 32-bit int (value cached)       [LDC]
 *
 * Argument checking: the 32-bit local at 236(%rsp) receives a check
 * number and xerbla_(.LC5, &local, 6) is called, after which the
 * function returns. Check numbers stored by this code:
 *    1  arg1 matches none of "N", "C", "T"
 *    2  arg2 matches none of "N", "C", "T"
 *    3  *arg3 < 0
 *    4  *arg4 < 0
 *    5  *arg5 < 0
 *    8  *arg8  < max(1, rows1), rows1 = *arg3 if arg1 is "N" else *arg5
 *   10  *arg10 < max(1, rows2), rows2 = *arg5 if arg2 is "N" else *arg4
 *   13  *arg13 < max(1, *arg3)
 * A nonzero return from lsame_ is treated as "letter matches".
 *
 * Early exits (no store to arg12): *arg3 == 0, *arg4 == 0,
 * (*arg6 == 0 and *arg11 == 1), (*arg5 == 0 and *arg11 == 1).
 *
 * %eax is not set before ret, so no meaningful return value is produced
 * here (presumably the caller ignores it - verify).
 *
 * Stack slots are reused heavily; in particular (%rsp), 8(%rsp),
 * 16(%rsp) ... 64(%rsp) stop holding the values listed above once the
 * checks are done. Slot notes inside the body apply only to the path
 * they appear in.
 */
	.p2align 4
dgemm_:
	/* Prologue: save callee-saved registers, reserve 248 bytes of locals. */
	push %r15
	mov %rdi,%r15
	push %r14
	mov %r8,%r14
	push %r13
	mov %rsi,%r13
	mov $.LC1,%esi
	push %r12
	push %rbp
	push %rbx
	sub $248,%rsp
	/* Stash register arguments and cache the three int values behind arg13, arg10, arg8. */
	mov %rcx,8(%rsp)
	mov 352(%rsp),%rcx
	mov %rdx,(%rsp)
	mov $1,%edx
	mov 304(%rsp),%rbx
	mov (%rcx),%ecx
	mov 344(%rsp),%rbp
	mov %r9,24(%rsp)
	mov %ecx,16(%rsp)
	mov 328(%rsp),%rcx
	mov (%rcx),%ecx
	mov %ecx,32(%rsp)
	mov 312(%rsp),%rcx
	mov (%rcx),%ecx
	mov %ecx,40(%rsp)
	mov $1,%ecx
	/* lsame_(arg1, "N", 1, 1); result kept in %r12d. */
	call lsame_
	mov $1,%ecx
	mov $1,%edx
	mov %r13,%rdi
	mov %eax,%r12d
	mov $.LC1,%esi
	/* lsame_(arg2, "N", 1, 1); result kept in %r8d. */
	call lsame_
	test %r12d,%r12d
	/* %r11 is caller-saved, so arg9 is only loaded after the calls above. */
	mov 320(%rsp),%r11
	mov %eax,%r8d
	je .L2
	/* arg1 is "N": rows1 (%r10d) = *arg3, which is also kept in %r9d. */
	mov (%rsp),%rax
	mov (%rax),%r10d
	mov %r10d,%r9d
	test %r8d,%r8d
	jne .L3
	/* arg2 is not "N": clear the check number, rows2 (48(%rsp)) = *arg4. */
	movl $0,236(%rsp)
	mov 8(%rsp),%rax
	mov (%rax),%eax
	mov %eax,48(%rsp)
.L103:
	/* arg2 must be "C" or "T"; live caller-saved values are spilled around each call. */
	mov $1,%ecx
	mov $1,%edx
	mov $.LC2,%esi
	mov %r13,%rdi
	mov %r11,320(%rsp)
	mov %r8d,64(%rsp)
	mov %r10d,56(%rsp)
	call lsame_
	mov 56(%rsp),%r10d
	mov 64(%rsp),%r8d
	test %eax,%eax
	mov 320(%rsp),%r11
	jne .L156
	mov $1,%ecx
	mov $1,%edx
	mov $.LC3,%esi
	mov %r13,%rdi
	mov %r11,320(%rsp)
	mov %r8d,64(%rsp)
	mov %r10d,56(%rsp)
	call lsame_
	test %eax,%eax
	jne .L155
	/* Check 2: arg2 is none of "N", "C", "T". */
	movl $2,236(%rsp)
	jmp .L8
	.p2align 4,,10
	.p2align 3
.L2:
	/* arg1 is not "N": rows1 (%r10d) = *arg5. */
	mov %r11,320(%rsp)
	mov (%r14),%r10d
	mov %eax,64(%rsp)
	test %eax,%eax
	je .L5
	/* arg2 is "N": rows2 = *arg5; now verify arg1 is "C" (or "T" at .L145). */
	mov $1,%ecx
	mov $1,%edx
	mov $.LC2,%esi
	mov %r15,%rdi
	mov %r10d,48(%rsp)
	mov %r10d,56(%rsp)
	movl $0,236(%rsp)
	call lsame_
	mov 56(%rsp),%r10d
	mov 64(%rsp),%r8d
	test %eax,%eax
	mov 320(%rsp),%r11
	je .L145
.L156:
	/* Option letters accepted: %r9d = *arg3. */
	mov (%rsp),%rax
	mov (%rax),%r9d
.L6:
	test %r9d,%r9d
	js .L160
.L9:
	/* %eax = *arg4 (check 4), %r13d = *arg5 (check 5). %r13 no longer holds arg2. */
	mov 8(%rsp),%rax
	mov (%rax),%eax
	test %eax,%eax
	js .L161
	mov (%r14),%r13d
	test %r13d,%r13d
	js .L162
	/* Check 8: *arg8 >= max(1, rows1). %edx = 1 is reused by the checks at .L12. */
	mov 312(%rsp),%rdi
	mov $1,%edx
	test %r10d,%r10d
	mov %r10d,%ecx
	cmovle %edx,%ecx
	cmp %ecx,(%rdi)
	jge .L12
	movl $8,236(%rsp)
	jmp .L8
.L3:
	/* arg1 and arg2 are both "N": rows2 = *arg5; no further letter checks needed. */
	movl $0,236(%rsp)
	mov (%r14),%eax
	mov %eax,48(%rsp)
	test %r9d,%r9d
	jns .L9
.L160:
	/* Check 3: *arg3 < 0. */
	movl $3,236(%rsp)
.L8:
	/* Report: xerbla_(.LC5, &check_number, 6), then fall into the epilogue. */
	mov $6,%edx
	lea 236(%rsp),%rsi
	mov $.LC5,%edi
	call xerbla_
.L1:
	/* Common epilogue: release locals, restore callee-saved registers. */
	add $248,%rsp
	pop %rbx
	pop %rbp
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	ret
.L161:
	/* Check 4: *arg4 < 0. */
	movl $4,236(%rsp)
	jmp .L8
.L5:
	/* Neither arg1 nor arg2 is "N": rows2 = *arg4; arg1 must be "C" or "T". */
	mov 8(%rsp),%rax
	mov $1,%ecx
	mov $1,%edx
	mov %r15,%rdi
	mov $.LC2,%esi
	mov %r10d,56(%rsp)
	movl $0,236(%rsp)
	mov (%rax),%eax
	mov %eax,48(%rsp)
	call lsame_
	mov 56(%rsp),%r10d
	mov 64(%rsp),%r8d
	test %eax,%eax
	mov 320(%rsp),%r11
	jne .L103
	mov $1,%ecx
	mov $1,%edx
	mov $.LC3,%esi
	mov %r15,%rdi
	mov %r11,320(%rsp)
	mov %r8d,64(%rsp)
	mov %r10d,56(%rsp)
	call lsame_
	mov 56(%rsp),%r10d
	mov 64(%rsp),%r8d
	test %eax,%eax
	mov 320(%rsp),%r11
	jne .L103
.L104:
	/* Check 1: arg1 is none of "N", "C", "T". */
	movl $1,236(%rsp)
	jmp .L8
.L162:
	/* Check 5: *arg5 < 0. */
	movl $5,236(%rsp)
	jmp .L8
.L12:
	/* Check 10: *arg10 >= max(1, rows2); check 13: *arg13 >= max(1, *arg3). */
	mov 48(%rsp),%ecx
	mov 328(%rsp),%rdi
	test %ecx,%ecx
	cmovle %edx,%ecx
	cmp %ecx,(%rdi)
	jl .L163
	mov 352(%rsp),%rcx
	test %r9d,%r9d
	cmovg %r9d,%edx
	cmp %edx,(%rcx)
	jge .L14
	movl $13,236(%rsp)
	jmp .L8
.L145:
	/* arg1 was not "C" (arg2 is "N"): accept "T", otherwise check 1. */
	mov $1,%ecx
	mov $1,%edx
	mov $.LC3,%esi
	mov %r15,%rdi
	mov %r11,320(%rsp)
	mov %r8d,64(%rsp)
	mov %r10d,56(%rsp)
	call lsame_
	test %eax,%eax
	je .L104
.L155:
	/* Letter accepted after a "T" test: reload spilled values and %r9d = *arg3. */
	mov (%rsp),%rax
	mov 56(%rsp),%r10d
	mov 64(%rsp),%r8d
	mov 320(%rsp),%r11
	mov (%rax),%r9d
	jmp .L6
.L163:
	movl $10,236(%rsp)
	jmp .L8
	.p2align 4,,10
	.p2align 3
.L14:
	/*
	 * All checks passed. Live here: %r9d = *arg3, %eax = *arg4,
	 * %r13d = *arg5, %r12d / %r8d = the two "N" test results,
	 * %rbx = arg7, %r11 = arg9, %rbp = arg12.
	 * Return at once when *arg3 or *arg4 is zero.
	 */
	test %r9d,%r9d
	je .L1
	test %eax,%eax
	je .L1
	/* 48(%rsp) = max(*arg13, 0) widened to 64 bits: element stride between destination columns. */
	movslq 16(%rsp),%rcx
	mov $0,%edx
	pxor %xmm3,%xmm3
	test %rcx,%rcx
	cmovns %rcx,%rdx
	mov 24(%rsp),%rcx
	/* %xmm2 = *arg6, %xmm3 = 0.0. */
	movsd (%rcx),%xmm2
	mov %rdx,48(%rsp)
	ucomisd %xmm3,%xmm2
	jp .L119
	jne .L119
	/* *arg6 == 0: only *arg11 matters. %xmm1 = *arg11; nothing to do if it equals 1. */
	mov 336(%rsp),%rbx
	movsd (%rbx),%xmm1
	ucomisd .LC4(%rip),%xmm1
	jp .L100
	je .L1
.L100:
	/* *arg11 == 0 is handled by the memset loop at .L164/.L24. */
	ucomisd %xmm3,%xmm1
	jnp .L164
.L21:
	/*
	 * Scale the destination by *arg11: for each of the *arg4 columns
	 * (%edi counts 1 .. *arg4), multiply *arg3 doubles in place.
	 * .L27 handles two doubles per step, .L25 the odd final one.
	 */
	mov 48(%rsp),%rbx
	mov %r9d,%r8d
	mov %r9d,%r10d
	mov %r9d,%r11d
	shrl %r8d
	mov %rbp,%rsi
	add $1,%eax
	and $-2,%r10d
	lea 0(,%rbx,8),%r12
	sal $4,%r8
	or $1,%r11d
	mov $-1,%rcx
	mov $1,%edi
	movddup %xmm1,%xmm2
	.p2align 4,,10
	.p2align 3
.L28:
	cmp $1,%r9d
	je .L106
	lea (%rsi,%r8),%r13
	mov %rsi,%rdx
	.p2align 4,,10
	.p2align 3
.L27:
	movupd (%rdx),%xmm0
	add $16,%rdx
	mulpd %xmm2,%xmm0
	movups %xmm0,-16(%rdx)
	cmp %rdx,%r13
	jne .L27
	movslq %r11d,%rdx
	cmp %r9d,%r10d
	je .L29
.L25:
	add %rcx,%rdx
	lea 0(%rbp,%rdx,8),%rdx
	movsd (%rdx),%xmm0
	mulsd %xmm1,%xmm0
	movsd %xmm0,(%rdx)
.L29:
	add $1,%edi
	add %rbx,%rcx
	add %r12,%rsi
	cmp %edi,%eax
	jne .L28
	jmp .L1
.L119:
	/* *arg6 != 0 (or unordered): return only if *arg5 == 0 and *arg11 == 1 (see .L165). */
	test %r13d,%r13d
	jne .L20
	mov 336(%rsp),%rcx
	movsd .LC4(%rip),%xmm0
	ucomisd (%rcx),%xmm0
	jnp .L165
.L20:
	/*
	 * General case. Clamp the remaining two cached int values to >= 0:
	 *   56(%rsp) = max(*arg10, 0),  72(%rsp) = bitwise NOT of it
	 *   (%rsp)   = max(*arg8, 0),   80(%rsp) = bitwise NOT of it
	 * (%rsp) stops holding the arg3 pointer from here on.
	 * Then dispatch on the two "N" flags (%r8d for arg2, %r12d for arg1).
	 */
	movslq 32(%rsp),%rcx
	xor %edx,%edx
	test %rcx,%rcx
	cmovs %rdx,%rcx
	mov %rcx,%rsi
	mov %rcx,56(%rsp)
	notq %rsi
	mov %rsi,72(%rsp)
	movslq 40(%rsp),%rsi
	test %rsi,%rsi
	cmovns %rsi,%rdx
	mov %rdx,%rsi
	mov %rdx,(%rsp)
	mov %rdx,%rdi
	notq %rsi
	mov %rsi,80(%rsp)
	test %r8d,%r8d
	je .L30
	test %r12d,%r12d
	jne .L166
	/*
	 * Path: arg2 is "N", arg1 is not.
	 * NOTE(review): presumably C := alpha * transpose(A) * B + beta * C - confirm.
	 * Outer loop .L64 walks the *arg4 destination columns (counter in
	 * 8(%rsp), limit in 32(%rsp)); inner loop .L63 walks the *arg3
	 * entries of one column (%r9 up to %r10). %xmm5 = *arg11.
	 */
	mov 48(%rsp),%rcx
	add $1,%eax
	lea -1(%r9),%edx
	mov %r13d,%r8d
	mov %eax,32(%rsp)
	lea 8(%rbp,%rdx,8),%r10
	notq %rdx
	shrl %r8d
	sal $3,%rcx
	mov $1,%r14d
	mov %r13d,%ebp
	mov %r13d,%r12d
	mov %rcx,16(%rsp)
	mov 56(%rsp),%rcx
	mov %r11,%rdi
	and $-2,%ebp
	mov 336(%rsp),%rax
	sal $4,%r8
	or $1,%r12d
	lea 8(%rbx),%r15
	sal $3,%rcx
	mov %r14d,8(%rsp)
	mov (%rsp),%r14
	mov $-1,%rsi
	movsd (%rax),%xmm5
	lea 0(,%rdx,8),%rax
	mov %rcx,24(%rsp)
	mov %rax,40(%rsp)
	.p2align 4,,10
	.p2align 3
.L64:
	mov 40(%rsp),%rax
	mov $-1,%rcx
	lea (%rax,%r10),%r9
	.p2align 4,,10
	.p2align 3
.L63:
	/* Sum of products over *arg5 terms; special-cased for counts 0 and 1. */
	test %r13d,%r13d
	je .L111
	cmp $1,%r13d
	je .L112
	lea (%r15,%rcx,8),%rdx
	xor %eax,%eax
	pxor %xmm4,%xmm4
	.p2align 4,,10
	.p2align 3
.L58:
	/* Two products per step, accumulated in the two lanes of %xmm4. */
	movupd (%rdx,%rax),%xmm0
	movupd (%rdi,%rax),%xmm7
	add $16,%rax
	mulpd %xmm7,%xmm0
	addpd %xmm0,%xmm4
	cmp %r8,%rax
	jne .L58
	/* Horizontal add of the two lanes into %xmm1. */
	movapd %xmm4,%xmm1
	movslq %r12d,%rax
	unpckhpd %xmm4,%xmm1
	addpd %xmm4,%xmm1
	cmp %ebp,%r13d
	je .L55
.L56:
	/* Odd final term. */
	lea (%rcx,%rax),%rdx
	add %rsi,%rax
	movsd (%rbx,%rdx,8),%xmm0
	mulsd (%r11,%rax,8),%xmm0
	addsd %xmm0,%xmm1
.L55:
	/* Scale by *arg6; add *arg11 times the old value unless *arg11 == 0 (.L60). */
	mulsd %xmm2,%xmm1
	ucomisd %xmm3,%xmm5
	jp .L60
	jne .L60
.L157:
	movsd %xmm1,(%r9)
	add $8,%r9
	add %r14,%rcx
	cmp %r10,%r9
	jne .L63
	addl $1,8(%rsp)
	add 16(%rsp),%r10
	mov 8(%rsp),%eax
	add 56(%rsp),%rsi
	add 24(%rsp),%rdi
	cmp 32(%rsp),%eax
	jne .L64
	jmp .L1
	.p2align 4,,10
	.p2align 3
.L60:
	movsd (%r9),%xmm0
	mulsd %xmm5,%xmm0
	addsd %xmm0,%xmm1
	jmp .L157
	.p2align 4,,10
	.p2align 3
.L111:
	/* *arg5 == 0: the sum is 0.0. */
	movapd %xmm3,%xmm1
	jmp .L55
.L112:
	/* *arg5 == 1: skip the paired loop, do the single term at .L56. */
	mov $1,%eax
	movapd %xmm3,%xmm1
	jmp .L56
.L30:
	test %r12d,%r12d
	jne .L167
	/*
	 * Path: neither arg1 nor arg2 is "N".
	 * NOTE(review): presumably C := alpha * transpose(A) * transpose(B) + beta * C - confirm.
	 * Outer loop .L98 walks destination columns (counter in 64(%rsp),
	 * limit in 88(%rsp)); inner loop .L97 walks one column (%rsi up
	 * to %r14). %xmm4 = *arg11. Reads from arg9 are strided
	 * (%rdx = 8 * max(*arg10, 0) bytes between consecutive terms).
	 */
	sub $1,%eax
	lea -1(%r9),%esi
	mov 48(%rsp),%rcx
	movq $1,64(%rsp)
	add $2,%rax
	lea 8(%rbp,%rsi,8),%r14
	mov 56(%rsp),%rbp
	mov %r11,16(%rsp)
	mov %rax,88(%rsp)
	mov 336(%rsp),%rax
	sal $3,%rcx
	mov %rcx,80(%rsp)
	mov %rbp,%rcx
	lea 0(,%rbp,8),%rdx
	movsd (%rax),%xmm4
	mov (%rsp),%rax
	sal $4,%rcx
	sal $3,%rax
	mov %rax,8(%rsp)
	lea -1(%r13),%eax
	mov %eax,24(%rsp)
	mov %r13d,%eax
	shrl %eax
	sal $4,%rax
	mov %rax,40(%rsp)
	mov %r13d,%eax
	and $-2,%eax
	mov %eax,32(%rsp)
	mov %r13d,%eax
	or $1,%eax
	mov %eax,56(%rsp)
	mov %rsi,%rax
	notq %rax
	sal $3,%rax
	mov %rax,96(%rsp)
	.p2align 4,,10
	.p2align 3
.L98:
	mov 96(%rsp),%rax
	mov 72(%rsp),%rdi
	mov %rbx,%r10
	mov $-1,%r9
	lea (%rax,%r14),%rsi
	mov 64(%rsp),%rax
	mov %rax,%r12
	add %rbp,%rax
	add %rdi,%rax
	add %rdi,%r12
	mov %rax,48(%rsp)
	.p2align 4,,10
	.p2align 3
.L97:
	/* The paired loop is used only when *arg5 - 1 > 2 (unsigned); otherwise the scalar terms at .L90. */
	test %r13d,%r13d
	je .L117
	cmpl $2,24(%rsp)
	jbe .L118
	mov 40(%rsp),%rax
	mov 16(%rsp),%rdi
	pxor %xmm1,%xmm1
	lea (%rax,%r10),%r8
	mov %r10,%rax
	.p2align 4,,10
	.p2align 3
.L92:
	/* Gather two strided doubles into %xmm0 (low via movsd, high via movhpd), multiply with two contiguous ones. */
	movsd (%rdi),%xmm0
	movupd (%rax),%xmm6
	add $16,%rax
	movhpd (%rdi,%rdx),%xmm0
	add %rcx,%rdi
	mulpd %xmm6,%xmm0
	addpd %xmm0,%xmm1
	cmp %r8,%rax
	jne .L92
	movapd %xmm1,%xmm0
	unpckhpd %xmm1,%xmm0
	addpd %xmm0,%xmm1
	cmp 32(%rsp),%r13d
	je .L89
	mov 56(%rsp),%eax
.L90:
	/* Up to three remaining scalar terms, starting at 1-based index %eax. */
	movslq %eax,%r8
	mov %rbp,%rdi
	imul %r8,%rdi
	add %r9,%r8
	lea (%r12,%rdi),%r15
	movsd (%r11,%r15,8),%xmm0
	mulsd (%rbx,%r8,8),%xmm0
	lea 1(%rax),%r8d
	addsd %xmm0,%xmm1
	cmp %r8d,%r13d
	jl .L89
	add %rbp,%rdi
	movslq %r8d,%r8
	add $2,%eax
	lea (%r12,%rdi),%r15
	add %r9,%r8
	movsd (%r11,%r15,8),%xmm0
	mulsd (%rbx,%r8,8),%xmm0
	addsd %xmm0,%xmm1
	cmp %eax,%r13d
	jl .L89
	cltq
	add 48(%rsp),%rdi
	add %r9,%rax
	movsd (%r11,%rdi,8),%xmm0
	mulsd (%rbx,%rax,8),%xmm0
	addsd %xmm0,%xmm1
.L89:
	/* Scale by *arg6; add *arg11 times the old value unless *arg11 == 0 (.L94). */
	mulsd %xmm2,%xmm1
	ucomisd %xmm3,%xmm4
	jp .L94
	jne .L94
.L158:
	movsd %xmm1,(%rsi)
	add $8,%rsi
	add (%rsp),%r9
	add 8(%rsp),%r10
	cmp %r14,%rsi
	jne .L97
	addq $1,64(%rsp)
	add 80(%rsp),%r14
	addq $8,16(%rsp)
	mov 64(%rsp),%rax
	cmp 88(%rsp),%rax
	jne .L98
	jmp .L1
	.p2align 4,,10
	.p2align 3
.L94:
	movsd (%rsi),%xmm0
	mulsd %xmm4,%xmm0
	addsd %xmm0,%xmm1
	jmp .L158
	.p2align 4,,10
	.p2align 3
.L117:
	/* *arg5 == 0: the sum is 0.0. */
	movapd %xmm3,%xmm1
	jmp .L89
.L118:
	/* Short sum: start the scalar terms at index 1. */
	movapd %xmm3,%xmm1
	mov $1,%eax
	jmp .L90
.L164:
	/*
	 * Reached with *arg6 == 0 after comparing *arg11 with 0.0.
	 * If *arg11 != 0 go scale (.L21); otherwise zero the destination:
	 * one memset of 8 * *arg3 bytes per column, *arg4 columns.
	 * Loop state lives in callee-saved registers so it survives the call.
	 */
	jne .L21
	mov 48(%rsp),%r12
	lea 1(%rax),%r14d
	lea -1(%r9),%eax
	mov $1,%ebx
	lea 8(,%rax,8),%r13
	sal $3,%r12
	.p2align 4,,10
	.p2align 3
.L24:
	mov %rbp,%rdi
	mov %r13,%rdx
	xor %esi,%esi
	add $1,%ebx
	call memset
	add %r12,%rbp
	cmp %ebx,%r14d
	jne .L24
	jmp .L1
.L167:
	/*
	 * Path: arg1 is "N", arg2 is not.
	 * NOTE(review): presumably C := alpha * A * transpose(B) + beta * C - confirm.
	 * Outer loop .L77 walks destination columns (%r12 = current column,
	 * counter in 24(%rsp), limit in 64(%rsp)). Per column: zero it if
	 * *arg11 == 0 (.L168), scale it if *arg11 != 1 (.L123), then add
	 * scaled source columns from arg7, two at a time (.L82/.L88) with
	 * the leftovers one at a time (.L86/.L84).
	 * From here %ecx = *arg5 and %r13d = *arg3.
	 * %xmm5 = *arg11, %xmm6 = *arg11 in both lanes, %xmm7 = 1.0.
	 */
	sub $1,%eax
	mov 72(%rsp),%rsi
	mov %r9d,%r15d
	mov %rbp,%r12
	add $2,%rax
	mov 336(%rsp),%rcx
	movq $1,24(%rsp)
	shrl %r15d
	mov %rax,64(%rsp)
	mov %r9d,%eax
	sal $4,%r15
	movsd .LC4(%rip),%xmm7
	and $-2,%eax
	movsd (%rcx),%xmm5
	mov 48(%rsp),%rcx
	mov %r11,320(%rsp)
	mov %eax,8(%rsp)
	mov %r9d,%eax
	mov $-1,%r14
	or $1,%eax
	sal $3,%rcx
	movddup %xmm5,%xmm6
	mov %eax,16(%rsp)
	lea -1(%r9),%eax
	mov %rcx,40(%rsp)
	mov 80(%rsp),%rcx
	mov %eax,88(%rsp)
	lea 8(,%rax,8),%rax
	mov %rax,208(%rsp)
	lea (%rdx,%rdx),%rax
	add %rax,%rcx
	mov %rax,96(%rsp)
	mov 56(%rsp),%rax
	mov %rcx,128(%rsp)
	mov %rax,%rcx
	sal $4,%rcx
	mov %rcx,112(%rsp)
	mov %rax,%rcx
	lea (%rsi,%rax,2),%rax
	lea (%r11,%rax,8),%rax
	mov %rax,144(%rsp)
	mov %rdx,%rax
	sal $4,%rax
	mov %rax,136(%rsp)
	mov %rcx,%rax
	negq %rax
	sal $3,%rax
	mov %rax,152(%rsp)
	lea 0(,%rdx,8),%rax
	mov %rax,168(%rsp)
	mov %r9d,%eax
	shr $2,%eax
	sal $5,%rax
	mov %rax,104(%rsp)
	mov %r9d,%eax
	and $-4,%eax
	mov %eax,200(%rsp)
	add $1,%eax
	mov %eax,192(%rsp)
	lea 0(,%rcx,8),%rax
	mov %r13d,%ecx
	mov %r9d,%r13d
	mov %rax,216(%rsp)
	.p2align 4,,10
	.p2align 3
.L77:
	/* *arg11 == 0 -> .L168 (memset); *arg11 != 1 -> .L123 (scale); else straight to .L71. */
	ucomisd %xmm3,%xmm5
	jnp .L168
.L68:
	ucomisd %xmm7,%xmm5
	jp .L123
	jne .L123
.L71:
	/* Accumulation over the *arg5 source columns; pairs only when *arg5 > 2. */
	test %ecx,%ecx
	je .L69
	cmp $2,%ecx
	jle .L114
	mov 144(%rsp),%rax
	mov 24(%rsp),%rdi
	mov %r15,160(%rsp)
	mov $-1,%r10
	mov 128(%rsp),%rsi
	lea (%rax,%rdi,8),%r11
	mov 104(%rsp),%rdi
	mov %rbx,%rax
	mov %rsi,32(%rsp)
	mov $2,%esi
	mov %esi,%r15d
.L82:
	/* %xmm10, %xmm11 = *arg6 times two arg9 elements; flags from cmpl survive the movsd/mulsd before jbe. */
	mov 152(%rsp),%rsi
	cmpl $2,88(%rsp)
	movsd (%r11,%rsi),%xmm10
	mulsd %xmm2,%xmm10
	jbe .L169
	movsd (%r11),%xmm11
	mov 168(%rsp),%rsi
	movddup %xmm10,%xmm9
	xor %edx,%edx
	mulsd %xmm2,%xmm11
	lea (%rax,%rsi),%r8
	mov %r12,%rsi
	movddup %xmm11,%xmm8
	.p2align 4,,10
	.p2align 3
.L88:
	/* Four destination doubles per step: dst += f1 * col1 + f2 * col2. */
	movupd 16(%rax,%rdx),%xmm0
	movupd 16(%r8,%rdx),%xmm1
	add $32,%rsi
	movupd -16(%rsi),%xmm4
	mulpd %xmm9,%xmm0
	mulpd %xmm8,%xmm1
	addpd %xmm4,%xmm0
	movupd -32(%rsi),%xmm4
	addpd %xmm1,%xmm0
	movupd (%rax,%rdx),%xmm1
	mulpd %xmm9,%xmm1
	movups %xmm0,-16(%rsi)
	addpd %xmm4,%xmm1
	movupd (%r8,%rdx),%xmm4
	add $32,%rdx
	mulpd %xmm8,%xmm4
	addpd %xmm4,%xmm1
	movups %xmm1,-32(%rsi)
	cmp %rdi,%rdx
	jne .L88
	cmp %r13d,200(%rsp)
	je .L81
	mov 192(%rsp),%edx
.L79:
	/* Up to three leftover rows, handled one double at a time. */
	movslq %edx,%rsi
	lea (%r14,%rsi),%r8
	lea (%r10,%rsi),%r9
	add 32(%rsp),%rsi
	movsd (%rbx,%rsi,8),%xmm1
	movsd (%rbx,%r9,8),%xmm0
	lea 0(%rbp,%r8,8),%r8
	lea 1(%rdx),%esi
	mulsd %xmm11,%xmm1
	mulsd %xmm10,%xmm0
	addsd (%r8),%xmm0
	addsd %xmm1,%xmm0
	movsd %xmm0,(%r8)
	cmp %esi,%r13d
	jl .L81
	movslq %esi,%rsi
	add $2,%edx
	lea (%r10,%rsi),%r9
	lea (%r14,%rsi),%r8
	movsd (%rbx,%r9,8),%xmm0
	mov 32(%rsp),%r9
	lea 0(%rbp,%r8,8),%r8
	mulsd %xmm10,%xmm0
	add %r9,%rsi
	movsd (%rbx,%rsi,8),%xmm1
	addsd (%r8),%xmm0
	mulsd %xmm11,%xmm1
	addsd %xmm1,%xmm0
	movsd %xmm0,(%r8)
	cmp %edx,%r13d
	jl .L81
	movslq %edx,%rdx
	lea (%r14,%rdx),%rsi
	lea (%r10,%rdx),%r8
	add %r9,%rdx
	mulsd (%rbx,%rdx,8),%xmm11
	lea 0(%rbp,%rsi,8),%rsi
	mulsd (%rbx,%r8,8),%xmm10
	addsd (%rsi),%xmm10
	addsd %xmm10,%xmm11
	movsd %xmm11,(%rsi)
.L81:
	/* Advance to the next pair of source columns; %r15d is the pair counter. */
	mov 96(%rsp),%rdx
	lea 1(%r15),%esi
	add $2,%r15d
	add %rdx,32(%rsp)
	add 112(%rsp),%r11
	add 136(%rsp),%rax
	add %rdx,%r10
	cmp %r15d,%ecx
	jg .L82
	mov 160(%rsp),%r15
.L78:
	/* Leftover source columns %esi .. %ecx, one at a time; arg9 is reloaded from its spill slot. */
	mov 56(%rsp),%rdi
	movslq %esi,%rdx
	mov (%rsp),%r8
	lea 8(%rbx),%r10
	mov 72(%rsp),%rax
	add 24(%rsp),%rax
	imul %rdx,%rdi
	mov 216(%rsp),%r9
	imul %r8,%rdx
	add 80(%rsp),%rdx
	add %rdi,%rax
	mov 320(%rsp),%rdi
	lea (%rdi,%rax,8),%rdi
	.p2align 4,,10
	.p2align 3
.L86:
	movsd (%rdi),%xmm0
	mulsd %xmm2,%xmm0
	cmp $1,%r13d
	je .L116
	lea (%r10,%rdx,8),%r11
	movddup %xmm0,%xmm4
	xor %eax,%eax
	.p2align 4,,10
	.p2align 3
.L84:
	/* dst[i..i+1] += factor * col[i..i+1]. */
	movupd (%r11,%rax),%xmm1
	movupd (%r12,%rax),%xmm13
	mulpd %xmm4,%xmm1
	addpd %xmm13,%xmm1
	movups %xmm1,(%r12,%rax)
	add $16,%rax
	cmp %r15,%rax
	jne .L84
	cmp 8(%rsp),%r13d
	je .L85
	movslq 16(%rsp),%rax
.L83:
	/* Odd final row. */
	lea (%r14,%rax),%r11
	add %rdx,%rax
	lea 0(%rbp,%r11,8),%r11
	mulsd (%rbx,%rax,8),%xmm0
	addsd (%r11),%xmm0
	movsd %xmm0,(%r11)
.L85:
	add $1,%esi
	add %r9,%rdi
	add %r8,%rdx
	cmp %esi,%ecx
	jge .L86
.L69:
	/* Next destination column. */
	addq $1,24(%rsp)
	add 48(%rsp),%r14
	mov 24(%rsp),%rax
	add 40(%rsp),%r12
	cmp 64(%rsp),%rax
	jne .L77
	jmp .L1
.L116:
	mov $1,%eax
	jmp .L83
.L168:
	/*
	 * *arg11 == 0 (unless the compare was not-equal): zero the current
	 * destination column with memset. The call may clobber caller-saved
	 * registers, so live values are spilled first, and the constants in
	 * %xmm3 (0.0) and %xmm7 (1.0) are rebuilt afterwards.
	 */
	jne .L68
	mov 208(%rsp),%rdx
	xor %esi,%esi
	mov %r12,%rdi
	mov %ecx,204(%rsp)
	movaps %xmm6,176(%rsp)
	movsd %xmm5,160(%rsp)
	movsd %xmm2,32(%rsp)
	call memset
	mov .LC4(%rip),%rax
	movsd 32(%rsp),%xmm2
	pxor %xmm3,%xmm3
	movsd 160(%rsp),%xmm5
	mov 204(%rsp),%ecx
	movapd 176(%rsp),%xmm6
	movq %rax,%xmm7
	jmp .L71
.L123:
	/* *arg11 is neither 0 nor 1: scale the current destination column in place. */
	cmp $1,%r13d
	je .L113
	lea (%r15,%r12),%rdx
	mov %r12,%rax
	.p2align 4,,10
	.p2align 3
.L75:
	movupd (%rax),%xmm0
	add $16,%rax
	mulpd %xmm6,%xmm0
	movups %xmm0,-16(%rax)
	cmp %rdx,%rax
	jne .L75
	cmp 8(%rsp),%r13d
	je .L71
	movslq 16(%rsp),%rax
.L73:
	add %r14,%rax
	lea 0(%rbp,%rax,8),%rax
	movsd (%rax),%xmm0
	mulsd %xmm5,%xmm0
	movsd %xmm0,(%rax)
	jmp .L71
.L166:
	/*
	 * Path: arg1 and arg2 are both "N".
	 * NOTE(review): presumably C := alpha * A * B + beta * C - confirm.
	 * Same shape as the .L77 loop nest: outer loop .L43 per destination
	 * column (%r12, counter in 16(%rsp), limit in 32(%rsp)); zero via
	 * .L170, scale via .L121, then accumulate pairs of source columns
	 * (.L48/.L54) and leftovers (.L52/.L50). Here the arg9 factors are
	 * read from consecutive addresses (%r11 advances by 16 bytes per pair).
	 * From here %ecx = *arg5 and %r13d = *arg3.
	 */
	add $1,%eax
	mov %r9d,%r15d
	sal $3,%rcx
	mov %r9d,%r8d
	mov %eax,32(%rsp)
	mov %r9d,%eax
	shrl %r15d
	mov %rbp,%r12
	or $1,%eax
	mov %rcx,40(%rsp)
	sub $8,%r11
	mov %r13d,%ecx
	mov %eax,8(%rsp)
	lea -1(%r9),%eax
	sal $4,%r15
	and $-2,%r8d
	mov %eax,96(%rsp)
	lea 8(,%rax,8),%rax
	mov 336(%rsp),%rdx
	mov $-1,%r14
	mov %rax,192(%rsp)
	lea (%rdi,%rdi),%rax
	movsd .LC4(%rip),%xmm7
	mov %r9d,%r13d
	add %rax,%rsi
	mov %rax,88(%rsp)
	mov %rdi,%rax
	movsd (%rdx),%xmm5
	mov 48(%rsp),%rdx
	sal $4,%rdi
	lea 0(,%rax,8),%rax
	mov %rsi,72(%rsp)
	mov %rax,152(%rsp)
	mov %r9d,%eax
	movddup %xmm5,%xmm6
	shr $2,%eax
	sal $3,%rdx
	mov %rdi,104(%rsp)
	sal $5,%rax
	mov %rdx,56(%rsp)
	mov %rax,64(%rsp)
	mov %r9d,%eax
	and $-4,%eax
	movl $1,16(%rsp)
	mov %eax,144(%rsp)
	add $1,%eax
	mov %eax,160(%rsp)
	.p2align 4,,10
	.p2align 3
.L43:
	/* *arg11 == 0 -> .L170 (memset); *arg11 != 1 -> .L121 (scale); else straight to .L37. */
	ucomisd %xmm3,%xmm5
	jnp .L170
.L34:
	ucomisd %xmm7,%xmm5
	jp .L121
	jne .L121
.L37:
	test %ecx,%ecx
	je .L35
	cmp $2,%ecx
	jle .L108
	/* %r15, %r8d and %r11 are spilled because the pair loop reuses them. */
	mov 72(%rsp),%rdi
	mov %r15,128(%rsp)
	lea 16(%r11),%rsi
	mov %rbx,%rax
	mov $2,%edx
	mov %r8d,112(%rsp)
	mov $-1,%r10
	mov %rdi,24(%rsp)
	mov 64(%rsp),%rdi
	mov %edx,%r8d
	mov %r11,136(%rsp)
	mov %rsi,%r11
.L48:
	/* %xmm10, %xmm11 = *arg6 times two adjacent arg9 elements. */
	movsd -8(%r11),%xmm10
	cmpl $2,96(%rsp)
	mulsd %xmm2,%xmm10
	jbe .L171
	movsd (%r11),%xmm11
	mov 152(%rsp),%rsi
	movddup %xmm10,%xmm9
	xor %edx,%edx
	mulsd %xmm2,%xmm11
	lea (%rsi,%rax),%r9
	mov %r12,%rsi
	movddup %xmm11,%xmm8
	.p2align 4,,10
	.p2align 3
.L54:
	/* Four destination doubles per step: dst += f1 * col1 + f2 * col2. */
	movupd 16(%rax,%rdx),%xmm0
	movupd 16(%r9,%rdx),%xmm1
	add $32,%rsi
	movupd -16(%rsi),%xmm4
	mulpd %xmm9,%xmm0
	mulpd %xmm8,%xmm1
	addpd %xmm4,%xmm0
	movupd -32(%rsi),%xmm4
	addpd %xmm1,%xmm0
	movupd (%rax,%rdx),%xmm1
	mulpd %xmm9,%xmm1
	movups %xmm0,-16(%rsi)
	addpd %xmm4,%xmm1
	movupd (%r9,%rdx),%xmm4
	add $32,%rdx
	mulpd %xmm8,%xmm4
	addpd %xmm4,%xmm1
	movups %xmm1,-32(%rsi)
	cmp %rdi,%rdx
	jne .L54
	cmp %r13d,144(%rsp)
	je .L47
	mov 160(%rsp),%edx
.L45:
	/* Up to three leftover rows, handled one double at a time. */
	movslq %edx,%rsi
	lea (%r14,%rsi),%r9
	lea (%r10,%rsi),%r15
	add 24(%rsp),%rsi
	movsd (%rbx,%rsi,8),%xmm1
	movsd (%rbx,%r15,8),%xmm0
	lea 0(%rbp,%r9,8),%r9
	lea 1(%rdx),%esi
	mulsd %xmm11,%xmm1
	mulsd %xmm10,%xmm0
	addsd (%r9),%xmm0
	addsd %xmm1,%xmm0
	movsd %xmm0,(%r9)
	cmp %esi,%r13d
	jl .L47
	movslq %esi,%rsi
	add $2,%edx
	lea (%r10,%rsi),%r15
	lea (%r14,%rsi),%r9
	movsd (%rbx,%r15,8),%xmm0
	mov 24(%rsp),%r15
	lea 0(%rbp,%r9,8),%r9
	mulsd %xmm10,%xmm0
	add %r15,%rsi
	movsd (%rbx,%rsi,8),%xmm1
	addsd (%r9),%xmm0
	mulsd %xmm11,%xmm1
	addsd %xmm1,%xmm0
	movsd %xmm0,(%r9)
	cmp %edx,%r13d
	jl .L47
	movslq %edx,%rdx
	lea (%r14,%rdx),%rsi
	lea (%r10,%rdx),%r9
	add %r15,%rdx
	mulsd (%rbx,%rdx,8),%xmm11
	lea 0(%rbp,%rsi,8),%rsi
	mulsd (%rbx,%r9,8),%xmm10
	addsd (%rsi),%xmm10
	addsd %xmm10,%xmm11
	movsd %xmm11,(%rsi)
.L47:
	/* Advance to the next pair of source columns; %r8d is the pair counter. */
	mov 88(%rsp),%rsi
	lea 1(%r8),%edx
	add $2,%r8d
	add %rsi,24(%rsp)
	add 104(%rsp),%rax
	add $16,%r11
	add %rsi,%r10
	cmp %r8d,%ecx
	jg .L48
	mov 128(%rsp),%r15
	mov 112(%rsp),%r8d
	mov 136(%rsp),%r11
.L44:
	/* Leftover source columns %edx .. %ecx, one at a time. */
	mov (%rsp),%r9
	movslq %edx,%rdx
	lea 8(%rbx),%rdi
	mov %r9,%rsi
	imul %rdx,%rsi
	add 80(%rsp),%rsi
	.p2align 4,,10
	.p2align 3
.L52:
	movsd (%r11,%rdx,8),%xmm0
	mulsd %xmm2,%xmm0
	cmp $1,%r13d
	je .L110
	lea (%rdi,%rsi,8),%r10
	movddup %xmm0,%xmm4
	xor %eax,%eax
	.p2align 4,,10
	.p2align 3
.L50:
	/* dst[i..i+1] += factor * col[i..i+1]. */
	movupd (%r10,%rax),%xmm1
	movupd (%r12,%rax),%xmm12
	mulpd %xmm4,%xmm1
	addpd %xmm12,%xmm1
	movups %xmm1,(%r12,%rax)
	add $16,%rax
	cmp %rax,%r15
	jne .L50
	cmp %r8d,%r13d
	je .L51
	movslq 8(%rsp),%rax
.L49:
	/* Odd final row. */
	lea (%r14,%rax),%r10
	add %rsi,%rax
	lea 0(%rbp,%r10,8),%r10
	mulsd (%rbx,%rax,8),%xmm0
	addsd (%r10),%xmm0
	movsd %xmm0,(%r10)
.L51:
	add $1,%rdx
	add %r9,%rsi
	cmp %edx,%ecx
	jge .L52
.L35:
	/* Next destination column. */
	addl $1,16(%rsp)
	add 48(%rsp),%r14
	mov 16(%rsp),%eax
	add 56(%rsp),%r12
	add 40(%rsp),%r11
	cmp 32(%rsp),%eax
	jne .L43
	jmp .L1
.L110:
	mov $1,%eax
	jmp .L49
.L170:
	/*
	 * *arg11 == 0 (unless the compare was not-equal): zero the current
	 * destination column with memset. Live caller-saved values are
	 * spilled around the call; %xmm3 (0.0) and %xmm7 (1.0) are rebuilt.
	 */
	jne .L34
	mov 192(%rsp),%rdx
	xor %esi,%esi
	mov %r12,%rdi
	mov %r11,168(%rsp)
	mov %r8d,200(%rsp)
	mov %ecx,136(%rsp)
	movaps %xmm6,112(%rsp)
	movsd %xmm5,128(%rsp)
	movsd %xmm2,24(%rsp)
	call memset
	mov .LC4(%rip),%rax
	pxor %xmm3,%xmm3
	movsd 24(%rsp),%xmm2
	movsd 128(%rsp),%xmm5
	movapd 112(%rsp),%xmm6
	mov 136(%rsp),%ecx
	mov 200(%rsp),%r8d
	movq %rax,%xmm7
	mov 168(%rsp),%r11
	jmp .L37
.L121:
	/* *arg11 is neither 0 nor 1: scale the current destination column in place. */
	cmp $1,%r13d
	je .L107
	lea (%r15,%r12),%rdx
	mov %r12,%rax
	.p2align 4,,10
	.p2align 3
.L41:
	movupd (%rax),%xmm0
	add $16,%rax
	mulpd %xmm6,%xmm0
	movups %xmm0,-16(%rax)
	cmp %rax,%rdx
	jne .L41
	cmp %r8d,%r13d
	je .L37
	movslq 8(%rsp),%rax
.L39:
	add %r14,%rax
	lea 0(%rbp,%rax,8),%rax
	movsd (%rax),%xmm0
	mulsd %xmm5,%xmm0
	movsd %xmm0,(%rax)
	jmp .L37
	/* Out-of-line stubs: set a starting index of 1 and rejoin the scalar tail of a loop above. */
.L108:
	mov $1,%edx
	jmp .L44
.L114:
	mov $1,%esi
	jmp .L78
.L106:
	mov $1,%edx
	jmp .L25
.L171:
	movsd (%r11),%xmm11
	mov $1,%edx
	mulsd %xmm2,%xmm11
	jmp .L45
.L169:
	movsd (%r11),%xmm11
	mov $1,%edx
	mulsd %xmm2,%xmm11
	jmp .L79
.L107:
	mov $1,%eax
	jmp .L39
.L113:
	mov $1,%eax
	jmp .L73
.L165:
	/* Tail of the .L119 test: *arg5 == 0 and *arg11 == 1 means nothing to do. */
	je .L1
	jmp .L20
	.endfn dgemm_,globl
	/* 1.0, compared against *arg11. */
	.rodata.cst8
.LC4:
	.double 1
	/* Option letters handed to lsame_, and the routine name handed to xerbla_ (length 6). */
	.rodata.str1.1
.LC1:
	.string "N"
.LC2:
	.string "C"
.LC3:
	.string "T"
.LC5:
	.string "DGEMM "