cosmopolitan/third_party/blas/dgemm_.S

1103 lines
20 KiB
ArmAsm
Raw Normal View History

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi

Copyright 2020 Justine Alexandra Roberts Tunney

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
/ Multiplies matrices.
//
// Validates its arguments, reports the first bad one through xerbla_,
// and otherwise updates a column-major matrix of doubles in place.
// Every argument is passed by reference. The names used in these
// comments (transa, m, alpha, ...) follow the reference BLAS DGEMM
// signature. NOTE(review): the names are assumed; only the slots and
// the way each slot is used are visible in this file.
//
// Incoming slots (stack offsets are relative to %rsp after the
// prologue, i.e. six pushes plus 248 bytes of locals):
//
//	%rdi       arg1  (transa) char*, compared with "N", "C", "T"
//	%rsi       arg2  (transb) char*, compared with "N", "C", "T"
//	%rdx       arg3  (m)      int*
//	%rcx       arg4  (n)      int*
//	%r8        arg5  (k)      int*
//	%r9        arg6  (alpha)  double*
//	304(%rsp)  arg7  (a)      double*
//	312(%rsp)  arg8  (lda)    int*
//	320(%rsp)  arg9  (b)      double*
//	328(%rsp)  arg10 (ldb)    int*
//	336(%rsp)  arg11 (beta)   double*
//	344(%rsp)  arg12 (c)      double*, the only matrix written
//	352(%rsp)  arg13 (ldc)    int*
//
// Shorthand used below:
//	nota = lsame_(arg1,"N",1,1), kept in %r12d
//	notb = lsame_(arg2,"N",1,1), kept in %r8d and 64(%rsp)
//	nrowa = nota ? m : k, kept in %r10d during validation
//	nrowb = notb ? k : n, kept in 48(%rsp) during validation
//
// The error code handed to xerbla_ lives at 236(%rsp) and is the
// 1-based position of the offending argument: 1, 2, 3, 4, 5, 8, 10
// or 13.
//
// NOTE(review): string constants are loaded with 32-bit absolute
// immediates (mov $sym,%e..), which presumes a link layout that keeps
// them below 4GiB; confirm against the build flags.
.p2align 4
dgemm_: push %r15
mov %rdi,%r15
push %r14
mov %r8,%r14
push %r13
mov %rsi,%r13
mov $.LC1,%esi
push %r12
push %rbp
push %rbx
sub $248,%rsp
// Spill the pointer arguments and snapshot the three leading
// dimensions: 16(%rsp)=*arg13, 32(%rsp)=*arg10, 40(%rsp)=*arg8.
// %rbx = arg7 (a), %rbp = arg12 (c).
mov %rcx,8(%rsp)
mov 352(%rsp),%rcx
mov %rdx,(%rsp)
mov $1,%edx
mov 304(%rsp),%rbx
mov (%rcx),%ecx
mov 344(%rsp),%rbp
mov %r9,24(%rsp)
mov %ecx,16(%rsp)
mov 328(%rsp),%rcx
mov (%rcx),%ecx
mov %ecx,32(%rsp)
mov 312(%rsp),%rcx
mov (%rcx),%ecx
mov %ecx,40(%rsp)
mov $1,%ecx
// nota = lsame_(arg1, "N", 1, 1)
call lsame_
mov $1,%ecx
mov $1,%edx
mov %r13,%rdi
mov %eax,%r12d
mov $.LC1,%esi
// notb = lsame_(arg2, "N", 1, 1)
call lsame_
// The flags from this test survive the two moves below and feed the je.
test %r12d,%r12d
mov 320(%rsp),%r11
mov %eax,%r8d
je .L2
// nota is set: nrowa = *arg3.
mov (%rsp),%rax
mov (%rax),%r10d
mov %r10d,%r9d
test %r8d,%r8d
jne .L3
// nota set, notb clear: nrowb = *arg4, then arg2 must be "C" or "T".
movl $0,236(%rsp)
mov 8(%rsp),%rax
mov (%rax),%eax
mov %eax,48(%rsp)
// .L103: arg1 has been accepted; check arg2 against "C".
// %r11, %r8d and %r10d are caller-saved, so they are spilled around
// each call and reloaded afterwards.
.L103: mov $1,%ecx
mov $1,%edx
mov $.LC2,%esi
mov %r13,%rdi
mov %r11,320(%rsp)
mov %r8d,64(%rsp)
mov %r10d,56(%rsp)
call lsame_
mov 56(%rsp),%r10d
mov 64(%rsp),%r8d
test %eax,%eax
mov 320(%rsp),%r11
jne .L156
// Not "C": check arg2 against "T".
mov $1,%ecx
mov $1,%edx
mov $.LC3,%esi
mov %r13,%rdi
mov %r11,320(%rsp)
mov %r8d,64(%rsp)
mov %r10d,56(%rsp)
call lsame_
test %eax,%eax
jne .L155
// arg2 matched none of "N", "C", "T": error code 2.
movl $2,236(%rsp)
jmp .L8
// .L2: nota is clear, so nrowa = *arg5. %eax still holds notb here.
.p2align 4,,10
.p2align 3
.L2: mov %r11,320(%rsp)
mov (%r14),%r10d
mov %eax,64(%rsp)
test %eax,%eax
je .L5
// nota clear, notb set: nrowb = *arg5; arg1 must be "C" or "T".
mov $1,%ecx
mov $1,%edx
mov $.LC2,%esi
mov %r15,%rdi
mov %r10d,48(%rsp)
mov %r10d,56(%rsp)
movl $0,236(%rsp)
call lsame_
mov 56(%rsp),%r10d
mov 64(%rsp),%r8d
test %eax,%eax
mov 320(%rsp),%r11
je .L145
// .L156: both option characters accepted; %r9d = *arg3.
.L156: mov (%rsp),%rax
mov (%rax),%r9d
// .L6: *arg3 must not be negative (error code 3).
.L6: test %r9d,%r9d
js .L160
// .L9: *arg4 (kept in %eax) and *arg5 (kept in %r13d) must not be
// negative (error codes 4 and 5).
.L9: mov 8(%rsp),%rax
mov (%rax),%eax
test %eax,%eax
js .L161
mov (%r14),%r13d
test %r13d,%r13d
js .L162
// *arg8 must be at least max(1, nrowa), else error code 8.
// %edx keeps the constant 1 for the two checks at .L12.
mov 312(%rsp),%rdi
mov $1,%edx
test %r10d,%r10d
mov %r10d,%ecx
cmovle %edx,%ecx
cmp %ecx,(%rdi)
jge .L12
movl $8,236(%rsp)
jmp .L8
// .L3: nota and notb both set: nrowb = *arg5, then range-check *arg3.
.L3: movl $0,236(%rsp)
mov (%r14),%eax
mov %eax,48(%rsp)
test %r9d,%r9d
jns .L9
.L160: movl $3,236(%rsp)
// .L8: report the failure: xerbla_("DGEMM ", &code, 6), then return.
.L8: mov $6,%edx
lea 236(%rsp),%rsi
mov $.LC5,%edi
call xerbla_
// .L1: common epilogue; every exit path lands here.
.L1: add $248,%rsp
pop %rbx
pop %rbp
pop %r12
pop %r13
pop %r14
pop %r15
ret
.L161: movl $4,236(%rsp)
jmp .L8
// .L5: nota and notb both clear: nrowb = *arg4; arg1 must be "C" or
// "T", after which arg2 is validated at .L103.
.L5: mov 8(%rsp),%rax
mov $1,%ecx
mov $1,%edx
mov %r15,%rdi
mov $.LC2,%esi
mov %r10d,56(%rsp)
movl $0,236(%rsp)
mov (%rax),%eax
mov %eax,48(%rsp)
call lsame_
mov 56(%rsp),%r10d
mov 64(%rsp),%r8d
test %eax,%eax
mov 320(%rsp),%r11
jne .L103
mov $1,%ecx
mov $1,%edx
mov $.LC3,%esi
mov %r15,%rdi
mov %r11,320(%rsp)
mov %r8d,64(%rsp)
mov %r10d,56(%rsp)
call lsame_
mov 56(%rsp),%r10d
mov 64(%rsp),%r8d
test %eax,%eax
mov 320(%rsp),%r11
jne .L103
// arg1 matched none of "N", "C", "T": error code 1.
.L104: movl $1,236(%rsp)
jmp .L8
.L162: movl $5,236(%rsp)
jmp .L8
// .L12: *arg10 must be at least max(1, nrowb), else error code 10.
.L12: mov 48(%rsp),%ecx
mov 328(%rsp),%rdi
test %ecx,%ecx
cmovle %edx,%ecx
cmp %ecx,(%rdi)
jl .L163
// *arg13 must be at least max(1, *arg3), else error code 13.
mov 352(%rsp),%rcx
test %r9d,%r9d
cmovg %r9d,%edx
cmp %edx,(%rcx)
jge .L14
movl $13,236(%rsp)
jmp .L8
// .L145: nota clear, notb set, arg1 was not "C": try "T".
.L145: mov $1,%ecx
mov $1,%edx
mov $.LC3,%esi
mov %r15,%rdi
mov %r11,320(%rsp)
mov %r8d,64(%rsp)
mov %r10d,56(%rsp)
call lsame_
test %eax,%eax
je .L104
// .L155: reload the values spilled before the last lsame_ call and
// join the numeric checks with %r9d = *arg3.
.L155: mov (%rsp),%rax
mov 56(%rsp),%r10d
mov 64(%rsp),%r8d
mov 320(%rsp),%r11
mov (%rax),%r9d
jmp .L6
.L163: movl $10,236(%rsp)
jmp .L8
// .L14: all arguments accepted. Here %r9d = m, %eax = n, %r13d = k.
// Nothing to do when m or n is zero.
.p2align 4,,10
.p2align 3
.L14: test %r9d,%r9d
je .L1
test %eax,%eax
je .L1
// 48(%rsp) = max(0, ldc) as a 64-bit value; %xmm3 = 0.0;
// %xmm2 = *arg6 (alpha).
movslq 16(%rsp),%rcx
mov $0,%edx
pxor %xmm3,%xmm3
test %rcx,%rcx
cmovns %rcx,%rdx
mov 24(%rsp),%rcx
movsd (%rcx),%xmm2
mov %rdx,48(%rsp)
// alpha nonzero (or NaN) goes to .L119.
ucomisd %xmm3,%xmm2
jp .L119
jne .L119
// alpha == 0: only beta matters. %xmm1 = *arg11 (beta).
// beta == 1 leaves c untouched.
mov 336(%rsp),%rbx
movsd (%rbx),%xmm1
ucomisd .LC4(%rip),%xmm1
jp .L100
je .L1
// beta == 0 takes the memset path via .L164; anything else scales c.
.L100: ucomisd %xmm3,%xmm1
jnp .L164
// .L21: c := beta*c, column by column.
//	%rsi  = start of the current column of c
//	%r8   = (m>>1)*16, byte length of the two-wide part of a column
//	%r10d = m with bit 0 cleared; %r11d = m|1
//	%rcx  = -1 + column*ldc, index bias for the odd trailing row
//	%edi  = 1-based column counter, %eax = n+1 (loop end)
.L21: mov 48(%rsp),%rbx
mov %r9d,%r8d
mov %r9d,%r10d
mov %r9d,%r11d
shrl %r8d
mov %rbp,%rsi
add $1,%eax
and $-2,%r10d
lea 0(,%rbx,8),%r12
sal $4,%r8
or $1,%r11d
mov $-1,%rcx
mov $1,%edi
movddup %xmm1,%xmm2
.p2align 4,,10
.p2align 3
// A one-row column has no two-wide part; handle it as scalar (.L106).
.L28: cmp $1,%r9d
je .L106
lea (%rsi,%r8),%r13
mov %rsi,%rdx
.p2align 4,,10
.p2align 3
// Two doubles per iteration, multiplied by the broadcast beta.
.L27: movupd (%rdx),%xmm0
add $16,%rdx
mulpd %xmm2,%xmm0
movups %xmm0,-16(%rdx)
cmp %rdx,%r13
jne .L27
// Skip the scalar tail when m is even.
movslq %r11d,%rdx
cmp %r9d,%r10d
je .L29
// .L25: scale the single leftover element of this column.
.L25: add %rcx,%rdx
lea 0(%rbp,%rdx,8),%rdx
movsd (%rdx),%xmm0
mulsd %xmm1,%xmm0
movsd %xmm0,(%rdx)
// .L29: advance to the next column.
.L29: add $1,%edi
add %rbx,%rcx
add %r12,%rsi
cmp %edi,%eax
jne .L28
jmp .L1
// .L119: alpha is nonzero. With k == 0 and beta == 1 there is nothing
// to do (decided at .L165); otherwise continue at .L20.
.L119: test %r13d,%r13d
jne .L20
mov 336(%rsp),%rcx
movsd .LC4(%rip),%xmm0
ucomisd (%rcx),%xmm0
jnp .L165
// .L20: common kernel setup.
//	56(%rsp) = max(0, ldb), 72(%rsp) = its bitwise NOT (-ldb-1)
//	(%rsp)   = max(0, lda), 80(%rsp) = its bitwise NOT (-lda-1)
// %rcx, %rdx, %rdi and %rsi still carry these values when the
// dispatch below jumps to .L166 or .L167.
.L20: movslq 32(%rsp),%rcx
xor %edx,%edx
test %rcx,%rcx
cmovs %rdx,%rcx
mov %rcx,%rsi
mov %rcx,56(%rsp)
notq %rsi
mov %rsi,72(%rsp)
movslq 40(%rsp),%rsi
test %rsi,%rsi
cmovns %rsi,%rdx
mov %rdx,%rsi
mov %rdx,(%rsp)
mov %rdx,%rdi
notq %rsi
mov %rsi,80(%rsp)
// Dispatch on (nota, notb) to one of four kernels.
test %r8d,%r8d
je .L30
test %r12d,%r12d
jne .L166
// Kernel for nota clear, notb set. Each element of c is built from a
// dot product of one contiguous column of a and one contiguous column
// of b; in reference-BLAS terms this looks like
// c := alpha*a**T*b + beta*c (NOTE(review): confirm).
//	%r10     = one past the end of the current column of c
//	40(%rsp) = -m*8, so %r9 = %r10 - m*8 is the column start
//	%rdi     = current column of b, %r15 = a+8, %r14 = lda
//	%r8      = (k>>1)*16, %ebp = k with bit 0 cleared, %r12d = k|1
//	%rsi     = -1 + column*ldb, index bias for the scalar tail
//	%xmm5    = beta
//	8(%rsp)  = column counter, 32(%rsp) = n+1
mov 48(%rsp),%rcx
add $1,%eax
lea -1(%r9),%edx
mov %r13d,%r8d
mov %eax,32(%rsp)
lea 8(%rbp,%rdx,8),%r10
notq %rdx
shrl %r8d
sal $3,%rcx
mov $1,%r14d
mov %r13d,%ebp
mov %r13d,%r12d
mov %rcx,16(%rsp)
mov 56(%rsp),%rcx
mov %r11,%rdi
and $-2,%ebp
mov 336(%rsp),%rax
sal $4,%r8
or $1,%r12d
lea 8(%rbx),%r15
sal $3,%rcx
mov %r14d,8(%rsp)
mov (%rsp),%r14
mov $-1,%rsi
movsd (%rax),%xmm5
lea 0(,%rdx,8),%rax
mov %rcx,24(%rsp)
mov %rax,40(%rsp)
.p2align 4,,10
.p2align 3
// .L64: per column of c.
.L64: mov 40(%rsp),%rax
mov $-1,%rcx
lea (%rax,%r10),%r9
.p2align 4,,10
.p2align 3
// .L63: per element of the column. k == 0 gives a zero sum (.L111);
// k == 1 goes straight to the scalar step (.L112).
.L63: test %r13d,%r13d
je .L111
cmp $1,%r13d
je .L112
lea (%r15,%rcx,8),%rdx
xor %eax,%eax
pxor %xmm4,%xmm4
.p2align 4,,10
.p2align 3
// Two products per iteration, accumulated in %xmm4.
.L58: movupd (%rdx,%rax),%xmm0
movupd (%rdi,%rax),%xmm7
add $16,%rax
mulpd %xmm7,%xmm0
addpd %xmm0,%xmm4
cmp %r8,%rax
jne .L58
// Horizontal add of the two lanes; skip the tail when k is even.
movapd %xmm4,%xmm1
movslq %r12d,%rax
unpckhpd %xmm4,%xmm1
addpd %xmm4,%xmm1
cmp %ebp,%r13d
je .L55
// .L56: the one leftover product.
.L56: lea (%rcx,%rax),%rdx
add %rsi,%rax
movsd (%rbx,%rdx,8),%xmm0
mulsd (%r11,%rax,8),%xmm0
addsd %xmm0,%xmm1
// .L55: scale the sum by alpha; when beta == 0 the old value of c is
// not read at all, otherwise .L60 adds beta times the old value.
.L55: mulsd %xmm2,%xmm1
ucomisd %xmm3,%xmm5
jp .L60
jne .L60
// .L157: store, then step to the next row (next column of a).
.L157: movsd %xmm1,(%r9)
add $8,%r9
add %r14,%rcx
cmp %r10,%r9
jne .L63
// Next column of c and of b.
addl $1,8(%rsp)
add 16(%rsp),%r10
mov 8(%rsp),%eax
add 56(%rsp),%rsi
add 24(%rsp),%rdi
cmp 32(%rsp),%eax
jne .L64
jmp .L1
.p2align 4,,10
.p2align 3
.L60: movsd (%r9),%xmm0
mulsd %xmm5,%xmm0
addsd %xmm0,%xmm1
jmp .L157
.p2align 4,,10
.p2align 3
.L111: movapd %xmm3,%xmm1
jmp .L55
.L112: mov $1,%eax
movapd %xmm3,%xmm1
jmp .L56
// .L30: notb is clear; nota set goes to .L167.
.L30: test %r12d,%r12d
jne .L167
// Kernel for nota clear, notb clear. Each element of c is built from
// a dot product of a contiguous column of a with elements of b taken
// ldb apart; in reference-BLAS terms this looks like
// c := alpha*a**T*b**T + beta*c (NOTE(review): confirm).
//	64(%rsp) = column counter, 88(%rsp) = n+1
//	%r14     = one past the end of the current column of c
//	16(%rsp) = pointer into b, advanced by 8 bytes per column of c
//	%rbp     = ldb, %rdx = ldb*8, %rcx = ldb*16
//	8(%rsp)  = lda*8, 80(%rsp) = ldc*8
//	24(%rsp) = k-1, 40(%rsp) = (k>>1)*16
//	32(%rsp) = k with bit 0 cleared, 56(%rsp) = k|1
//	96(%rsp) = -m*8, %xmm4 = beta
sub $1,%eax
lea -1(%r9),%esi
mov 48(%rsp),%rcx
movq $1,64(%rsp)
add $2,%rax
lea 8(%rbp,%rsi,8),%r14
mov 56(%rsp),%rbp
mov %r11,16(%rsp)
mov %rax,88(%rsp)
mov 336(%rsp),%rax
sal $3,%rcx
mov %rcx,80(%rsp)
mov %rbp,%rcx
lea 0(,%rbp,8),%rdx
movsd (%rax),%xmm4
mov (%rsp),%rax
sal $4,%rcx
sal $3,%rax
mov %rax,8(%rsp)
lea -1(%r13),%eax
mov %eax,24(%rsp)
mov %r13d,%eax
shrl %eax
sal $4,%rax
mov %rax,40(%rsp)
mov %r13d,%eax
and $-2,%eax
mov %eax,32(%rsp)
mov %r13d,%eax
or $1,%eax
mov %eax,56(%rsp)
mov %rsi,%rax
notq %rax
sal $3,%rax
mov %rax,96(%rsp)
.p2align 4,,10
.p2align 3
// .L98: per column of c. %rsi = column start, %r10 = column of a,
// %r9 = -1 + row*lda, %r12 and 48(%rsp) are index biases into b.
.L98: mov 96(%rsp),%rax
mov 72(%rsp),%rdi
mov %rbx,%r10
mov $-1,%r9
lea (%rax,%r14),%rsi
mov 64(%rsp),%rax
mov %rax,%r12
add %rbp,%rax
add %rdi,%rax
add %rdi,%r12
mov %rax,48(%rsp)
.p2align 4,,10
.p2align 3
// .L97: per element. k == 0 gives a zero sum (.L117); k <= 3 is done
// entirely by the scalar steps (.L118).
.L97: test %r13d,%r13d
je .L117
cmpl $2,24(%rsp)
jbe .L118
mov 40(%rsp),%rax
mov 16(%rsp),%rdi
pxor %xmm1,%xmm1
lea (%rax,%r10),%r8
mov %r10,%rax
.p2align 4,,10
.p2align 3
// Two products per iteration: the pair from b is gathered with
// movsd/movhpd (elements ldb apart), the pair from a is contiguous.
.L92: movsd (%rdi),%xmm0
movupd (%rax),%xmm6
add $16,%rax
movhpd (%rdi,%rdx),%xmm0
add %rcx,%rdi
mulpd %xmm6,%xmm0
addpd %xmm0,%xmm1
cmp %r8,%rax
jne .L92
// Horizontal add; skip the tail when k is even.
movapd %xmm1,%xmm0
unpckhpd %xmm1,%xmm0
addpd %xmm0,%xmm1
cmp 32(%rsp),%r13d
je .L89
mov 56(%rsp),%eax
// .L90: up to three scalar products, starting at 1-based index %eax.
.L90: movslq %eax,%r8
mov %rbp,%rdi
imul %r8,%rdi
add %r9,%r8
lea (%r12,%rdi),%r15
movsd (%r11,%r15,8),%xmm0
mulsd (%rbx,%r8,8),%xmm0
lea 1(%rax),%r8d
addsd %xmm0,%xmm1
cmp %r8d,%r13d
jl .L89
add %rbp,%rdi
movslq %r8d,%r8
add $2,%eax
lea (%r12,%rdi),%r15
add %r9,%r8
movsd (%r11,%r15,8),%xmm0
mulsd (%rbx,%r8,8),%xmm0
addsd %xmm0,%xmm1
cmp %eax,%r13d
jl .L89
cltq
add 48(%rsp),%rdi
add %r9,%rax
movsd (%r11,%rdi,8),%xmm0
mulsd (%rbx,%rax,8),%xmm0
addsd %xmm0,%xmm1
// .L89: scale by alpha; add beta times the old value unless beta == 0.
.L89: mulsd %xmm2,%xmm1
ucomisd %xmm3,%xmm4
jp .L94
jne .L94
.L158: movsd %xmm1,(%rsi)
add $8,%rsi
add (%rsp),%r9
add 8(%rsp),%r10
cmp %r14,%rsi
jne .L97
// Next column of c.
addq $1,64(%rsp)
add 80(%rsp),%r14
addq $8,16(%rsp)
mov 64(%rsp),%rax
cmp 88(%rsp),%rax
jne .L98
jmp .L1
.p2align 4,,10
.p2align 3
.L94: movsd (%rsi),%xmm0
mulsd %xmm4,%xmm0
addsd %xmm0,%xmm1
jmp .L158
.p2align 4,,10
.p2align 3
.L117: movapd %xmm3,%xmm1
jmp .L89
.L118: movapd %xmm3,%xmm1
mov $1,%eax
jmp .L90
// .L164: alpha == 0 and beta is ordered. Nonzero beta scales c at
// .L21; beta == 0 zero-fills m*8 bytes of each of the n columns.
.L164: jne .L21
mov 48(%rsp),%r12
lea 1(%rax),%r14d
lea -1(%r9),%eax
mov $1,%ebx
lea 8(,%rax,8),%r13
sal $3,%r12
.p2align 4,,10
.p2align 3
// memset(column, 0, m*8), then step by ldc*8 bytes.
.L24: mov %rbp,%rdi
mov %r13,%rdx
xor %esi,%esi
add $1,%ebx
call memset
add %r12,%rbp
cmp %ebx,%r14d
jne .L24
jmp .L1
// .L167: kernel for nota set, notb clear. Each column of c is first
// prepared according to beta, then columns of a, each scaled by alpha
// times an element of b, are added to it. Successive elements of b
// are ldb apart. In reference-BLAS terms this looks like
// c := alpha*a*b**T + beta*c (NOTE(review): confirm).
//	24(%rsp)  = column counter, 64(%rsp) = n+1
//	%r12      = current column of c, %r14 = -1 + column*ldc
//	%ecx      = k, %r13d = m (set at the end of this setup)
//	%r15      = (m>>1)*16, 8(%rsp) = m with bit 0 cleared,
//	16(%rsp)  = m|1, 88(%rsp) = m-1, 208(%rsp) = m*8
//	104(%rsp) = (m>>2)*32, 200(%rsp) = m with bits 0-1 cleared,
//	192(%rsp) = that value plus 1
//	96(%rsp)  = 2*lda, 128(%rsp) = lda-1, 136(%rsp) = lda*16,
//	168(%rsp) = lda*8
//	112(%rsp) = ldb*16, 152(%rsp) = -ldb*8, 216(%rsp) = ldb*8,
//	144(%rsp) = b + (ldb-1)*8
//	%xmm5 = beta, %xmm6 = beta in both lanes, %xmm7 = 1.0
.L167: sub $1,%eax
mov 72(%rsp),%rsi
mov %r9d,%r15d
mov %rbp,%r12
add $2,%rax
mov 336(%rsp),%rcx
movq $1,24(%rsp)
shrl %r15d
mov %rax,64(%rsp)
mov %r9d,%eax
sal $4,%r15
movsd .LC4(%rip),%xmm7
and $-2,%eax
movsd (%rcx),%xmm5
mov 48(%rsp),%rcx
mov %r11,320(%rsp)
mov %eax,8(%rsp)
mov %r9d,%eax
mov $-1,%r14
or $1,%eax
sal $3,%rcx
movddup %xmm5,%xmm6
mov %eax,16(%rsp)
lea -1(%r9),%eax
mov %rcx,40(%rsp)
mov 80(%rsp),%rcx
mov %eax,88(%rsp)
lea 8(,%rax,8),%rax
mov %rax,208(%rsp)
lea (%rdx,%rdx),%rax
add %rax,%rcx
mov %rax,96(%rsp)
mov 56(%rsp),%rax
mov %rcx,128(%rsp)
mov %rax,%rcx
sal $4,%rcx
mov %rcx,112(%rsp)
mov %rax,%rcx
lea (%rsi,%rax,2),%rax
lea (%r11,%rax,8),%rax
mov %rax,144(%rsp)
mov %rdx,%rax
sal $4,%rax
mov %rax,136(%rsp)
mov %rcx,%rax
negq %rax
sal $3,%rax
mov %rax,152(%rsp)
lea 0(,%rdx,8),%rax
mov %rax,168(%rsp)
mov %r9d,%eax
shr $2,%eax
sal $5,%rax
mov %rax,104(%rsp)
mov %r9d,%eax
and $-4,%eax
mov %eax,200(%rsp)
add $1,%eax
mov %eax,192(%rsp)
lea 0(,%rcx,8),%rax
mov %r13d,%ecx
mov %r9d,%r13d
mov %rax,216(%rsp)
.p2align 4,,10
.p2align 3
// .L77: per column of c. beta == 0 zero-fills the column (.L168),
// beta != 1 scales it (.L123), beta == 1 leaves it as is.
.L77: ucomisd %xmm3,%xmm5
jnp .L168
.L68: ucomisd %xmm7,%xmm5
jp .L123
jne .L123
// .L71: accumulate. Nothing to add when k == 0; with k <= 2 all terms
// are handled one at a time starting from .L114.
.L71: test %ecx,%ecx
je .L69
cmp $2,%ecx
jle .L114
mov 144(%rsp),%rax
mov 24(%rsp),%rdi
mov %r15,160(%rsp)
mov $-1,%r10
mov 128(%rsp),%rsi
lea (%rax,%rdi,8),%r11
mov 104(%rsp),%rdi
mov %rbx,%rax
mov %rsi,32(%rsp)
mov $2,%esi
mov %esi,%r15d
// .L82: two terms per pass. %xmm10 and %xmm11 are alpha times two
// elements of b that sit ldb apart; %rax is the first of the two
// matching columns of a. With m <= 3 only scalar rows are used (.L169).
.L82: mov 152(%rsp),%rsi
cmpl $2,88(%rsp)
movsd (%r11,%rsi),%xmm10
mulsd %xmm2,%xmm10
jbe .L169
movsd (%r11),%xmm11
mov 168(%rsp),%rsi
movddup %xmm10,%xmm9
xor %edx,%edx
mulsd %xmm2,%xmm11
lea (%rax,%rsi),%r8
mov %r12,%rsi
movddup %xmm11,%xmm8
.p2align 4,,10
.p2align 3
// Four rows of c per iteration, each updated with both terms.
.L88: movupd 16(%rax,%rdx),%xmm0
movupd 16(%r8,%rdx),%xmm1
add $32,%rsi
movupd -16(%rsi),%xmm4
mulpd %xmm9,%xmm0
mulpd %xmm8,%xmm1
addpd %xmm4,%xmm0
movupd -32(%rsi),%xmm4
addpd %xmm1,%xmm0
movupd (%rax,%rdx),%xmm1
mulpd %xmm9,%xmm1
movups %xmm0,-16(%rsi)
addpd %xmm4,%xmm1
movupd (%r8,%rdx),%xmm4
add $32,%rdx
mulpd %xmm8,%xmm4
addpd %xmm4,%xmm1
movups %xmm1,-32(%rsi)
cmp %rdi,%rdx
jne .L88
// Done with this pair when m is a multiple of four.
cmp %r13d,200(%rsp)
je .L81
mov 192(%rsp),%edx
// .L79: up to three leftover rows, starting at 1-based row %edx.
.L79: movslq %edx,%rsi
lea (%r14,%rsi),%r8
lea (%r10,%rsi),%r9
add 32(%rsp),%rsi
movsd (%rbx,%rsi,8),%xmm1
movsd (%rbx,%r9,8),%xmm0
lea 0(%rbp,%r8,8),%r8
lea 1(%rdx),%esi
mulsd %xmm11,%xmm1
mulsd %xmm10,%xmm0
addsd (%r8),%xmm0
addsd %xmm1,%xmm0
movsd %xmm0,(%r8)
cmp %esi,%r13d
jl .L81
movslq %esi,%rsi
add $2,%edx
lea (%r10,%rsi),%r9
lea (%r14,%rsi),%r8
movsd (%rbx,%r9,8),%xmm0
mov 32(%rsp),%r9
lea 0(%rbp,%r8,8),%r8
mulsd %xmm10,%xmm0
add %r9,%rsi
movsd (%rbx,%rsi,8),%xmm1
addsd (%r8),%xmm0
mulsd %xmm11,%xmm1
addsd %xmm1,%xmm0
movsd %xmm0,(%r8)
cmp %edx,%r13d
jl .L81
movslq %edx,%rdx
lea (%r14,%rdx),%rsi
lea (%r10,%rdx),%r8
add %r9,%rdx
mulsd (%rbx,%rdx,8),%xmm11
lea 0(%rbp,%rsi,8),%rsi
mulsd (%rbx,%r8,8),%xmm10
addsd (%rsi),%xmm10
addsd %xmm10,%xmm11
movsd %xmm11,(%rsi)
// .L81: move on by two columns of a; repeat while k exceeds the pair
// counter in %r15d. %esi becomes the first term not yet handled.
.L81: mov 96(%rsp),%rdx
lea 1(%r15),%esi
add $2,%r15d
add %rdx,32(%rsp)
add 112(%rsp),%r11
add 136(%rsp),%rax
add %rdx,%r10
cmp %r15d,%ecx
jg .L82
mov 160(%rsp),%r15
// .L78: handle the remaining terms %esi..k one at a time.
// %rdi points at the element of b, %rdx is the index bias into a.
.L78: mov 56(%rsp),%rdi
movslq %esi,%rdx
mov (%rsp),%r8
lea 8(%rbx),%r10
mov 72(%rsp),%rax
add 24(%rsp),%rax
imul %rdx,%rdi
mov 216(%rsp),%r9
imul %r8,%rdx
add 80(%rsp),%rdx
add %rdi,%rax
mov 320(%rsp),%rdi
lea (%rdi,%rax,8),%rdi
.p2align 4,,10
.p2align 3
// %xmm0 = alpha times the element of b; m == 1 is scalar only (.L116).
.L86: movsd (%rdi),%xmm0
mulsd %xmm2,%xmm0
cmp $1,%r13d
je .L116
lea (%r10,%rdx,8),%r11
movddup %xmm0,%xmm4
xor %eax,%eax
.p2align 4,,10
.p2align 3
// Two rows per iteration.
.L84: movupd (%r11,%rax),%xmm1
movupd (%r12,%rax),%xmm13
mulpd %xmm4,%xmm1
addpd %xmm13,%xmm1
movups %xmm1,(%r12,%rax)
add $16,%rax
cmp %r15,%rax
jne .L84
cmp 8(%rsp),%r13d
je .L85
movslq 16(%rsp),%rax
// .L83: the one leftover row.
.L83: lea (%r14,%rax),%r11
add %rdx,%rax
lea 0(%rbp,%r11,8),%r11
mulsd (%rbx,%rax,8),%xmm0
addsd (%r11),%xmm0
movsd %xmm0,(%r11)
.L85: add $1,%esi
add %r9,%rdi
add %r8,%rdx
cmp %esi,%ecx
jge .L86
// .L69: next column of c.
.L69: addq $1,24(%rsp)
add 48(%rsp),%r14
mov 24(%rsp),%rax
add 40(%rsp),%r12
cmp 64(%rsp),%rax
jne .L77
jmp .L1
.L116: mov $1,%eax
jmp .L83
// .L168: beta == 0 (ordered): zero-fill this column with memset. The
// live caller-saved values are spilled around the call; the constants
// 0.0 and 1.0 are rebuilt afterwards.
.L168: jne .L68
mov 208(%rsp),%rdx
xor %esi,%esi
mov %r12,%rdi
mov %ecx,204(%rsp)
movaps %xmm6,176(%rsp)
movsd %xmm5,160(%rsp)
movsd %xmm2,32(%rsp)
call memset
mov .LC4(%rip),%rax
movsd 32(%rsp),%xmm2
pxor %xmm3,%xmm3
movsd 160(%rsp),%xmm5
mov 204(%rsp),%ecx
movapd 176(%rsp),%xmm6
movq %rax,%xmm7
jmp .L71
// .L123: beta is neither 0 nor 1: scale this column by beta, two rows
// at a time plus an odd tail.
.L123: cmp $1,%r13d
je .L113
lea (%r15,%r12),%rdx
mov %r12,%rax
.p2align 4,,10
.p2align 3
.L75: movupd (%rax),%xmm0
add $16,%rax
mulpd %xmm6,%xmm0
movups %xmm0,-16(%rax)
cmp %rdx,%rax
jne .L75
cmp 8(%rsp),%r13d
je .L71
movslq 16(%rsp),%rax
.L73: add %r14,%rax
lea 0(%rbp,%rax,8),%rax
movsd (%rax),%xmm0
mulsd %xmm5,%xmm0
movsd %xmm0,(%rax)
jmp .L71
// .L166: kernel for nota set, notb set. Same shape as the .L167
// kernel, except that successive elements of b are adjacent in
// memory. In reference-BLAS terms this looks like
// c := alpha*a*b + beta*c (NOTE(review): confirm).
//	16(%rsp)  = column counter, 32(%rsp) = n+1
//	%r12      = current column of c, %r14 = -1 + column*ldc
//	%r11      = current column of b minus 8 bytes, 40(%rsp) = ldb*8
//	%ecx      = k, %r13d = m
//	%r15      = (m>>1)*16, %r8d = m with bit 0 cleared,
//	8(%rsp)   = m|1, 96(%rsp) = m-1, 192(%rsp) = m*8
//	64(%rsp)  = (m>>2)*32, 144(%rsp) = m with bits 0-1 cleared,
//	160(%rsp) = that value plus 1
//	88(%rsp)  = 2*lda, 72(%rsp) = lda-1, 104(%rsp) = lda*16,
//	152(%rsp) = lda*8, 56(%rsp) = ldc*8
//	%xmm5 = beta, %xmm6 = beta in both lanes, %xmm7 = 1.0
.L166: add $1,%eax
mov %r9d,%r15d
sal $3,%rcx
mov %r9d,%r8d
mov %eax,32(%rsp)
mov %r9d,%eax
shrl %r15d
mov %rbp,%r12
or $1,%eax
mov %rcx,40(%rsp)
sub $8,%r11
mov %r13d,%ecx
mov %eax,8(%rsp)
lea -1(%r9),%eax
sal $4,%r15
and $-2,%r8d
mov %eax,96(%rsp)
lea 8(,%rax,8),%rax
mov 336(%rsp),%rdx
mov $-1,%r14
mov %rax,192(%rsp)
lea (%rdi,%rdi),%rax
movsd .LC4(%rip),%xmm7
mov %r9d,%r13d
add %rax,%rsi
mov %rax,88(%rsp)
mov %rdi,%rax
movsd (%rdx),%xmm5
mov 48(%rsp),%rdx
sal $4,%rdi
lea 0(,%rax,8),%rax
mov %rsi,72(%rsp)
mov %rax,152(%rsp)
mov %r9d,%eax
movddup %xmm5,%xmm6
shr $2,%eax
sal $3,%rdx
mov %rdi,104(%rsp)
sal $5,%rax
mov %rdx,56(%rsp)
mov %rax,64(%rsp)
mov %r9d,%eax
and $-4,%eax
movl $1,16(%rsp)
mov %eax,144(%rsp)
add $1,%eax
mov %eax,160(%rsp)
.p2align 4,,10
.p2align 3
// .L43: per column of c. beta == 0 zero-fills the column (.L170),
// beta != 1 scales it (.L121), beta == 1 leaves it as is.
.L43: ucomisd %xmm3,%xmm5
jnp .L170
.L34: ucomisd %xmm7,%xmm5
jp .L121
jne .L121
// .L37: accumulate. Nothing to add when k == 0; with k <= 2 all terms
// are handled one at a time starting from .L108.
.L37: test %ecx,%ecx
je .L35
cmp $2,%ecx
jle .L108
// %r15, %r8d and %r11 are reused as scratch inside the paired loop,
// so their kernel-wide values are parked on the stack first.
mov 72(%rsp),%rdi
mov %r15,128(%rsp)
lea 16(%r11),%rsi
mov %rbx,%rax
mov $2,%edx
mov %r8d,112(%rsp)
mov $-1,%r10
mov %rdi,24(%rsp)
mov 64(%rsp),%rdi
mov %edx,%r8d
mov %r11,136(%rsp)
mov %rsi,%r11
// .L48: two terms per pass. %xmm10 and %xmm11 are alpha times two
// adjacent elements of the current column of b. With m <= 3 only
// scalar rows are used (.L171).
.L48: movsd -8(%r11),%xmm10
cmpl $2,96(%rsp)
mulsd %xmm2,%xmm10
jbe .L171
movsd (%r11),%xmm11
mov 152(%rsp),%rsi
movddup %xmm10,%xmm9
xor %edx,%edx
mulsd %xmm2,%xmm11
lea (%rsi,%rax),%r9
mov %r12,%rsi
movddup %xmm11,%xmm8
.p2align 4,,10
.p2align 3
// Four rows of c per iteration, each updated with both terms.
.L54: movupd 16(%rax,%rdx),%xmm0
movupd 16(%r9,%rdx),%xmm1
add $32,%rsi
movupd -16(%rsi),%xmm4
mulpd %xmm9,%xmm0
mulpd %xmm8,%xmm1
addpd %xmm4,%xmm0
movupd -32(%rsi),%xmm4
addpd %xmm1,%xmm0
movupd (%rax,%rdx),%xmm1
mulpd %xmm9,%xmm1
movups %xmm0,-16(%rsi)
addpd %xmm4,%xmm1
movupd (%r9,%rdx),%xmm4
add $32,%rdx
mulpd %xmm8,%xmm4
addpd %xmm4,%xmm1
movups %xmm1,-32(%rsi)
cmp %rdi,%rdx
jne .L54
// Done with this pair when m is a multiple of four.
cmp %r13d,144(%rsp)
je .L47
mov 160(%rsp),%edx
// .L45: up to three leftover rows, starting at 1-based row %edx.
.L45: movslq %edx,%rsi
lea (%r14,%rsi),%r9
lea (%r10,%rsi),%r15
add 24(%rsp),%rsi
movsd (%rbx,%rsi,8),%xmm1
movsd (%rbx,%r15,8),%xmm0
lea 0(%rbp,%r9,8),%r9
lea 1(%rdx),%esi
mulsd %xmm11,%xmm1
mulsd %xmm10,%xmm0
addsd (%r9),%xmm0
addsd %xmm1,%xmm0
movsd %xmm0,(%r9)
cmp %esi,%r13d
jl .L47
movslq %esi,%rsi
add $2,%edx
lea (%r10,%rsi),%r15
lea (%r14,%rsi),%r9
movsd (%rbx,%r15,8),%xmm0
mov 24(%rsp),%r15
lea 0(%rbp,%r9,8),%r9
mulsd %xmm10,%xmm0
add %r15,%rsi
movsd (%rbx,%rsi,8),%xmm1
addsd (%r9),%xmm0
mulsd %xmm11,%xmm1
addsd %xmm1,%xmm0
movsd %xmm0,(%r9)
cmp %edx,%r13d
jl .L47
movslq %edx,%rdx
lea (%r14,%rdx),%rsi
lea (%r10,%rdx),%r9
add %r15,%rdx
mulsd (%rbx,%rdx,8),%xmm11
lea 0(%rbp,%rsi,8),%rsi
mulsd (%rbx,%r9,8),%xmm10
addsd (%rsi),%xmm10
addsd %xmm10,%xmm11
movsd %xmm11,(%rsi)
// .L47: move on by two columns of a and two elements of b; repeat
// while k exceeds the pair counter in %r8d. %edx becomes the first
// term not yet handled. The parked registers are restored on exit.
.L47: mov 88(%rsp),%rsi
lea 1(%r8),%edx
add $2,%r8d
add %rsi,24(%rsp)
add 104(%rsp),%rax
add $16,%r11
add %rsi,%r10
cmp %r8d,%ecx
jg .L48
mov 128(%rsp),%r15
mov 112(%rsp),%r8d
mov 136(%rsp),%r11
// .L44: handle the remaining terms %edx..k one at a time.
// %rsi is the index bias into a for the current term.
.L44: mov (%rsp),%r9
movslq %edx,%rdx
lea 8(%rbx),%rdi
mov %r9,%rsi
imul %rdx,%rsi
add 80(%rsp),%rsi
.p2align 4,,10
.p2align 3
// %xmm0 = alpha times the element of b; m == 1 is scalar only (.L110).
.L52: movsd (%r11,%rdx,8),%xmm0
mulsd %xmm2,%xmm0
cmp $1,%r13d
je .L110
lea (%rdi,%rsi,8),%r10
movddup %xmm0,%xmm4
xor %eax,%eax
.p2align 4,,10
.p2align 3
// Two rows per iteration.
.L50: movupd (%r10,%rax),%xmm1
movupd (%r12,%rax),%xmm12
mulpd %xmm4,%xmm1
addpd %xmm12,%xmm1
movups %xmm1,(%r12,%rax)
add $16,%rax
cmp %rax,%r15
jne .L50
cmp %r8d,%r13d
je .L51
movslq 8(%rsp),%rax
// .L49: the one leftover row.
.L49: lea (%r14,%rax),%r10
add %rsi,%rax
lea 0(%rbp,%r10,8),%r10
mulsd (%rbx,%rax,8),%xmm0
addsd (%r10),%xmm0
movsd %xmm0,(%r10)
.L51: add $1,%rdx
add %r9,%rsi
cmp %edx,%ecx
jge .L52
// .L35: next column of c and of b.
.L35: addl $1,16(%rsp)
add 48(%rsp),%r14
mov 16(%rsp),%eax
add 56(%rsp),%r12
add 40(%rsp),%r11
cmp 32(%rsp),%eax
jne .L43
jmp .L1
.L110: mov $1,%eax
jmp .L49
// .L170: beta == 0 (ordered): zero-fill this column with memset. The
// live caller-saved values are spilled around the call; the constants
// 0.0 and 1.0 are rebuilt afterwards.
.L170: jne .L34
mov 192(%rsp),%rdx
xor %esi,%esi
mov %r12,%rdi
mov %r11,168(%rsp)
mov %r8d,200(%rsp)
mov %ecx,136(%rsp)
movaps %xmm6,112(%rsp)
movsd %xmm5,128(%rsp)
movsd %xmm2,24(%rsp)
call memset
mov .LC4(%rip),%rax
pxor %xmm3,%xmm3
movsd 24(%rsp),%xmm2
movsd 128(%rsp),%xmm5
movapd 112(%rsp),%xmm6
mov 136(%rsp),%ecx
mov 200(%rsp),%r8d
movq %rax,%xmm7
mov 168(%rsp),%r11
jmp .L37
// .L121: beta is neither 0 nor 1: scale this column by beta, two rows
// at a time plus an odd tail.
.L121: cmp $1,%r13d
je .L107
lea (%r15,%r12),%rdx
mov %r12,%rax
.p2align 4,,10
.p2align 3
.L41: movupd (%rax),%xmm0
add $16,%rax
mulpd %xmm6,%xmm0
movups %xmm0,-16(%rax)
cmp %rax,%rdx
jne .L41
cmp %r8d,%r13d
je .L37
movslq 8(%rsp),%rax
.L39: add %r14,%rax
lea 0(%rbp,%rax,8),%rax
movsd (%rax),%xmm0
mulsd %xmm5,%xmm0
movsd %xmm0,(%rax)
jmp .L37
// Out-of-line entries for the small-size cases: each one seeds a
// 1-based start index of 1 and joins the matching scalar path.
.L108: mov $1,%edx
jmp .L44
.L114: mov $1,%esi
jmp .L78
.L106: mov $1,%edx
jmp .L25
.L171: movsd (%r11),%xmm11
mov $1,%edx
mulsd %xmm2,%xmm11
jmp .L45
.L169: movsd (%r11),%xmm11
mov $1,%edx
mulsd %xmm2,%xmm11
jmp .L79
.L107: mov $1,%eax
jmp .L39
.L113: mov $1,%eax
jmp .L73
// .L165: k == 0 and beta compared ordered against 1.0: return when
// they are equal, otherwise run the normal kernels.
.L165: je .L1
jmp .L20
// NOTE(review): .endfn is not defined in this file; presumably it
// comes from libc/macros.h and closes and exports the symbol.
.endfn dgemm_,globl
// Read-only constants referenced by dgemm_.
// NOTE(review): the bare .rodata.cst8 and .rodata.str1.1 lines are not
// standard directives as written; presumably they are section-switch
// macros or a mangled .section directive. Confirm against the
// original file.
.rodata.cst8
// 1.0: dgemm_ compares one of its double arguments against this value
// and also loads it as a constant.
.LC4: .double 1
.rodata.str1.1
// Option letters handed to lsame_ as its second argument.
.LC1: .string "N"
.LC2: .string "C"
.LC3: .string "T"
// Name handed to xerbla_ together with a length of 6; the trailing
// space is part of those six characters.
.LC5: .string "DGEMM "