/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ This program is free software; you can redistribute it and/or modify        │
│ it under the terms of the GNU General Public License as published by        │
│ the Free Software Foundation; version 2 of the License.                     │
│                                                                              │
│ This program is distributed in the hope that it will be useful, but         │
│ WITHOUT ANY WARRANTY; without even the implied warranty of                  │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU            │
│ General Public License for more details.                                    │
│                                                                              │
│ You should have received a copy of the GNU General Public License           │
│ along with this program; if not, write to the Free Software                 │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA               │
│ 02110-1301 USA                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.h"
	.source	__FILE__

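//	Vectorized conditional exchange of two int32 arrays.
//
//	Loads corresponding elements from both arrays, then writes the
//	elementwise minimum back to the first array and the maximum to
//	the second, using 8-lane AVX2 min/max for full groups of eight
//	and scalar cmov sequences for the short tail. (Parameter roles
//	are inferred from the code below, not from upstream djbsort.)
//
//	@param	rdi points to first int32 array
//	@param	rsi points to second int32 array
//	@param	rdx is element count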
	.p2align 4
minmax_vector:
	cmp	$7,%rdx
	jle	.L27
	test	$7,%dl
	je	.L5
	lea	-32(,%rdx,4),%rax
	lea	(%rdi,%rax),%rcx
	add	%rsi,%rax
	vmovdqu	(%rax),%ymm0
	vmovdqu	(%rcx),%ymm1
	and	$-8,%rdx
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm2,(%rcx)
	vmovdqu	%ymm0,(%rax)
	.p2align 4,,10
	.p2align 3
.L5:	vmovdqu	(%rsi),%ymm1
	vmovdqu	(%rdi),%ymm0
	add	$32,%rsi
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm2,(%rdi)
	vmovdqu	%ymm0,-32(%rsi)
	add	$32,%rdi
	sub	$8,%rdx
	jne	.L5
	vzeroupper
.L25:	ret
	.p2align 4,,10
	.p2align 3
.L27:	test	%rdx,%rdx
	jle	.L25
	mov	(%rdi),%eax
	cmp	(%rsi),%eax
	cmovg	(%rsi),%ecx
	cmovg	%eax,%eax
	mov	%ecx,(%rdi)
	mov	%eax,(%rsi)
	cmp	$1,%rdx
	je	.L25
	mov	4(%rdi),%eax
	cmp	4(%rsi),%eax
	cmovg	4(%rsi),%ecx
	cmovg	%eax,%eax
	mov	%ecx,4(%rdi)
	mov	%eax,4(%rsi)
	cmp	$2,%rdx
	je	.L25
	mov	8(%rdi),%eax
	cmp	8(%rsi),%eax
	cmovg	8(%rsi),%ecx
	cmovg	%eax,%eax
	mov	%ecx,8(%rdi)
	mov	%eax,8(%rsi)
	cmp	$3,%rdx
	je	.L25
	mov	12(%rdi),%eax
	cmp	12(%rsi),%eax
	cmovg	12(%rsi),%ecx
	cmovg	%eax,%eax
	mov	%ecx,12(%rdi)
	mov	%eax,12(%rsi)
	cmp	$4,%rdx
	je	.L25
	mov	16(%rdi),%eax
	cmp	16(%rsi),%eax
	cmovg	16(%rsi),%ecx
	cmovg	%eax,%eax
	mov	%ecx,16(%rdi)
	mov	%eax,16(%rsi)
	cmp	$5,%rdx
	je	.L25
	mov	20(%rdi),%eax
	cmp	20(%rsi),%eax
	cmovg	20(%rsi),%ecx
	cmovg	%eax,%eax
	mov	%ecx,20(%rdi)
	mov	%eax,20(%rsi)
	cmp	$7,%rdx
	jne	.L25
	mov	24(%rdi),%eax
	cmp	24(%rsi),%eax
	cmovg	24(%rsi),%edx
	cmovg	%eax,%eax
	mov	%edx,24(%rdi)
	mov	%eax,24(%rsi)
	ret
	.endfn	minmax_vector,globl

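//	Runs two merge stages over 512-byte blocks of int32s.
//
//	Each pass exchanges lanes at distance 64 elements, then the
//	results at distance 32 elements, unrolled four times per block.
//	(Description inferred from the load/store offsets below.)
//
//	@param	rdi points to int32 array
//	@param	rsi is element count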
	.p2align 4
int32_twostages_32:
	test	%rsi,%rsi
	jle	.L33
	lea	-128(%rsi),%rax
	dec	%rsi
	and	$-128,%rsi
	mov	%rax,%rdx
	sub	%rsi,%rdx
	jmp	.L30
	.p2align 4,,10
	.p2align 3
.L34:	add	$-128,%rax
.L30:	vmovdqu	256(%rdi),%ymm1
	vmovdqu	(%rdi),%ymm0
	vmovdqu	384(%rdi),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	128(%rdi),%ymm1
	add	$512,%rdi
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm0,-128(%rdi)
	vmovdqu	-224(%rdi),%ymm1
	vmovdqu	-480(%rdi),%ymm0
	vmovdqu	%ymm4,-512(%rdi)
	vmovdqu	%ymm2,-384(%rdi)
	vmovdqu	-96(%rdi),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	-352(%rdi),%ymm1
	vmovdqu	%ymm3,-256(%rdi)
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm0,-96(%rdi)
	vmovdqu	-192(%rdi),%ymm1
	vmovdqu	-448(%rdi),%ymm0
	vmovdqu	%ymm4,-480(%rdi)
	vmovdqu	%ymm2,-352(%rdi)
	vmovdqu	-64(%rdi),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	-320(%rdi),%ymm1
	vmovdqu	%ymm3,-224(%rdi)
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm0,-64(%rdi)
	vmovdqu	-160(%rdi),%ymm1
	vmovdqu	-416(%rdi),%ymm0
	vmovdqu	%ymm4,-448(%rdi)
	vmovdqu	%ymm2,-320(%rdi)
	vmovdqu	-32(%rdi),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	-288(%rdi),%ymm1
	vmovdqu	%ymm3,-192(%rdi)
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm4,-416(%rdi)
	vmovdqu	%ymm2,-288(%rdi)
	vmovdqu	%ymm3,-160(%rdi)
	vmovdqu	%ymm0,-32(%rdi)
	cmp	%rdx,%rax
	jne	.L34
	vzeroupper
.L33:	ret
	.endfn	int32_twostages_32,globl

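//	Runs three bitonic merge stages in a single pass.
//
//	For each group of 8*rdx elements it loads eight vectors rdx
//	elements apart and performs three rounds of min/max exchanges
//	before storing them back. (Roles and the return value are
//	inferred from the code below and its call sites.)
//
//	@param	rdi points to int32 array
//	@param	rsi is total element count
//	@param	rdx is merge distance in elements
//	@return	rax is count of leading elements processed
//		(a multiple of 8*rdx), or 0 if rsi < 8*rdx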
	.p2align 4
int32_threestages:
	push	%rbp
	mov	%rsp,%rbp
	push	%r15
	push	%r14
	lea	0(,%rdx,8),%r14
	push	%r13
	push	%r12
	push	%rbx
	and	$-32,%rsp
	sub	$32,%rsp
	mov	%rsi,16(%rsp)
	cmp	%r14,%rsi
	jl	.L41
	lea	-1(%rdx),%rax
	and	$-8,%rax
	lea	(%rdx,%rdx),%r8
	mov	%rax,8(%rsp)
	lea	(%r8,%rdx),%rcx
	lea	0(,%rdx,4),%rsi
	mov	%r14,%r9
	mov	%rdi,%r13
	lea	(%rsi,%rdx),%r11
	lea	(%rcx,%rcx),%r10
	sub	%rdx,%r9
	xor	%r12d,%r12d
	mov	%r14,%rbx
	lea	32(%rdi),%r15
	.p2align 4,,10
	.p2align 3
.L37:	mov	%r12,%rdi
	lea	(%rdx,%rdi),%rax
	mov	%rbx,24(%rsp)
	mov	%rbx,%r12
	cmp	%rax,%rdi
	jge	.L40
	lea	0(%r13,%rdi,4),%rax
	add	8(%rsp),%rdi
	lea	(%r15,%rdi,4),%rdi
	.p2align 4,,10
	.p2align 3
.L38:	vmovdqu	(%rax,%rsi,4),%ymm0
	vmovdqu	(%rax),%ymm6
	vmovdqu	(%rax,%rdx,4),%ymm1
	vpminsd	%ymm0,%ymm6,%ymm7
	vpmaxsd	%ymm0,%ymm6,%ymm6
	vmovdqu	(%rax,%r11,4),%ymm0
	vmovdqu	(%rax,%r9,4),%ymm8
	vpmaxsd	%ymm0,%ymm1,%ymm3
	vpminsd	%ymm0,%ymm1,%ymm2
	vmovdqu	(%rax,%r10,4),%ymm1
	vmovdqu	(%rax,%r8,4),%ymm0
	vpminsd	%ymm1,%ymm0,%ymm4
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	(%rax,%rcx,4),%ymm1
	vpminsd	%ymm8,%ymm1,%ymm5
	vpmaxsd	%ymm8,%ymm1,%ymm1
	vpminsd	%ymm4,%ymm7,%ymm8
	vpmaxsd	%ymm4,%ymm7,%ymm4
	vpminsd	%ymm5,%ymm2,%ymm7
	vpmaxsd	%ymm5,%ymm2,%ymm2
	vpminsd	%ymm0,%ymm6,%ymm5
	vpmaxsd	%ymm0,%ymm6,%ymm0
	vpminsd	%ymm1,%ymm3,%ymm6
	vpmaxsd	%ymm1,%ymm3,%ymm1
	vpminsd	%ymm7,%ymm8,%ymm9
	vpmaxsd	%ymm7,%ymm8,%ymm3
	vpminsd	%ymm2,%ymm4,%ymm8
	vpminsd	%ymm6,%ymm5,%ymm7
	vpmaxsd	%ymm2,%ymm4,%ymm2
	vpmaxsd	%ymm6,%ymm5,%ymm5
	vpminsd	%ymm1,%ymm0,%ymm4
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm9,(%rax)
	vmovdqu	%ymm3,(%rax,%rdx,4)
	vmovdqu	%ymm8,(%rax,%r8,4)
	vmovdqu	%ymm2,(%rax,%rcx,4)
	vmovdqu	%ymm7,(%rax,%rsi,4)
	vmovdqu	%ymm5,(%rax,%r11,4)
	vmovdqu	%ymm4,(%rax,%r10,4)
	vmovdqu	%ymm0,(%rax,%r9,4)
	add	$32,%rax
	cmp	%rax,%rdi
	jne	.L38
.L40:	add	%r14,%rbx
	cmp	%rbx,16(%rsp)
	jge	.L37
	vzeroupper
.L35:	mov	24(%rsp),%rax
	lea	-40(%rbp),%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	ret
.L41:	movq	$0,24(%rsp)
	jmp	.L35
	.endfn	int32_threestages,globl

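//	Finishes merging two sorted 8-int vectors held in ymm0/ymm1.
//
//	Performs the final lane exchanges of a 16-element bitonic merge
//	and stores the result; when esi is nonzero the output is bitwise
//	complemented first. (Inferred from the code below.)
//
//	@param	rdi points to destination int32[16]
//	@param	esi is complement flag
//	@param	ymm0,ymm1 hold the two input vectors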
	.p2align 4
merge16_finish:
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vperm2i128	$32,%ymm0,%ymm3,%ymm2
	vperm2i128	$49,%ymm0,%ymm3,%ymm0
	vpminsd	%ymm0,%ymm2,%ymm1
	vpmaxsd	%ymm0,%ymm2,%ymm0
	vpunpcklqdq	%ymm0,%ymm1,%ymm2
	vpunpckhqdq	%ymm0,%ymm1,%ymm0
	vpminsd	%ymm0,%ymm2,%ymm1
	vpmaxsd	%ymm0,%ymm2,%ymm2
	vpunpckldq	%ymm2,%ymm1,%ymm0
	vpunpckhdq	%ymm2,%ymm1,%ymm1
	vpunpcklqdq	%ymm1,%ymm0,%ymm3
	vpunpckhqdq	%ymm1,%ymm0,%ymm0
	vpminsd	%ymm3,%ymm0,%ymm2
	vpmaxsd	%ymm3,%ymm0,%ymm0
	vpunpckldq	%ymm0,%ymm2,%ymm1
	vpunpckhdq	%ymm0,%ymm2,%ymm0
	vperm2i128	$32,%ymm0,%ymm1,%ymm2
	vperm2i128	$49,%ymm0,%ymm1,%ymm0
	test	%esi,%esi
	je	.L46
	vpcmpeqd	%ymm1,%ymm1,%ymm1
	vpxor	%ymm1,%ymm2,%ymm2
	vpxor	%ymm1,%ymm0,%ymm0
.L46:	vmovdqu	%ymm2,(%rdi)
	vmovdqu	%ymm0,32(%rdi)
	ret
	.endfn	merge16_finish,globl

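//	Sorts int32 array whose length is a power of two.
//
//	AVX2 port of the djbsort bitonic sorting network: lengths 8, 16,
//	and 32 take straight-line paths, larger powers of two run staged
//	merge loops that call int32_twostages_32 and int32_threestages.
//	(Parameter roles inferred from the code below.)
//
//	@param	rdi points to int32 array
//	@param	rsi is element count (power of two, at least 8)
//	@param	edx is direction flag (toggles bitwise complement passes)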
	.p2align 4
djbsort$avx2_2power:
	push	%r13
	mov	%rdi,%r11
	lea	16(%rsp),%r13
	and	$-32,%rsp
	push	-8(%r13)
	push	%rbp
	mov	%rsp,%rbp
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	sub	$200,%rsp
	mov	%rsi,-144(%rbp)
	mov	%edx,-164(%rbp)
	cmp	$8,%rsi
	je	.L194
	cmpq	$16,-144(%rbp)
	je	.L195
	cmpq	$32,-144(%rbp)
	je	.L196
	mov	%rsi,%r15
	sar	$3,%r15
	test	%r15,%r15
	jle	.L197
	lea	-1(%r15),%rbx
	mov	%rbx,-200(%rbp)
	shr	$3,%rbx
	mov	%rbx,%rdx
	lea	32(%r11),%rbx
	lea	(%r15,%r15),%r8
	mov	%rbx,-120(%rbp)
	lea	0(,%r15,4),%rsi
	lea	(%r8,%r15),%rdi
	lea	0(,%r15,8),%rcx
	sal	$5,%rdx
	lea	(%rdi,%rdi),%r10
	lea	(%rsi,%r15),%r9
	sub	%r15,%rcx
	mov	%r11,%rax
	add	%rbx,%rdx
.L61:	vmovdqu	(%rax),%ymm0
	vmovdqu	(%rax,%rsi,4),%ymm2
	vmovdqu	(%rax,%r10,4),%ymm3
	vpminsd	%ymm2,%ymm0,%ymm4
	vpmaxsd	%ymm2,%ymm0,%ymm2
	vmovdqu	(%rax,%r8,4),%ymm0
	vpminsd	%ymm3,%ymm0,%ymm1
	vpmaxsd	%ymm3,%ymm0,%ymm0
	vpminsd	%ymm2,%ymm0,%ymm3
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vpminsd	%ymm4,%ymm1,%ymm2
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm1,%ymm3,%ymm4
	vpmaxsd	%ymm1,%ymm3,%ymm1
	vmovdqu	%ymm0,(%rax)
	vmovdqu	%ymm4,(%rax,%r8,4)
	vmovdqu	%ymm1,(%rax,%rsi,4)
	vmovdqu	%ymm2,(%rax,%r10,4)
	vmovdqu	(%rax,%r15,4),%ymm2
	vmovdqu	(%rax,%r9,4),%ymm0
	vmovdqu	(%rax,%rdi,4),%ymm4
	vpminsd	%ymm2,%ymm0,%ymm1
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vmovdqu	(%rax,%rcx,4),%ymm2
	vpminsd	%ymm4,%ymm2,%ymm3
	vpmaxsd	%ymm4,%ymm2,%ymm2
	vpminsd	%ymm3,%ymm1,%ymm4
	vpmaxsd	%ymm3,%ymm1,%ymm1
	vpminsd	%ymm2,%ymm0,%ymm3
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vpminsd	%ymm1,%ymm3,%ymm2
	vpmaxsd	%ymm1,%ymm3,%ymm1
	vmovdqu	%ymm4,(%rax,%r15,4)
	vmovdqu	%ymm1,(%rax,%rdi,4)
	vmovdqu	%ymm2,(%rax,%r9,4)
	vmovdqu	%ymm0,(%rax,%rcx,4)
	add	$32,%rax
	cmp	%rdx,%rax
	jne	.L61
.L62:	lea	0(,%r15,8),%rax
	sub	%r15,%rax
	lea	(%r15,%r15),%r12
	mov	%rax,%r9
	mov	-144(%rbp),%rax
	lea	0(,%r15,4),%rbx
	lea	(%r12,%r15),%r13
	lea	(%rbx,%r15),%r10
	lea	(%r13,%r13),%r14
	cmp	$127,%rax
	jg	.L59
	lea	64(%r11),%rdi
	dec	%rax
	mov	%rdi,-192(%rbp)
	mov	%rax,-176(%rbp)
.L60:	mov	-144(%rbp),%rdi
	mov	%r11,-208(%rbp)
	lea	(%r11,%rdi,4),%rax
	mov	%rax,-112(%rbp)
	mov	%rdi,%rax
	sar	$4,%rax
	cmp	$32,%rax
	sete	%dl
	cmp	$127,%rax
	mov	%rax,-80(%rbp)
	setg	%al
	or	%eax,%edx
	mov	-176(%rbp),%rax
	mov	%dl,-152(%rbp)
	shr	$4,%rax
	sal	$6,%rax
	add	-192(%rbp),%rax
	mov	%rax,-128(%rbp)
	mov	-200(%rbp),%rax
	movl	$3,-184(%rbp)
	shr	$3,%rax
	sal	$5,%rax
	add	-120(%rbp),%rax
	mov	%rax,-160(%rbp)
	movq	$4,-136(%rbp)
	mov	%r12,-200(%rbp)
	mov	%r13,-216(%rbp)
	mov	%r10,-224(%rbp)
	mov	%r9,-232(%rbp)
	vmovdqa	.LC1(%rip),%ymm11
	vmovdqa	.LC3(%rip),%ymm10
	vmovdqa	.LC2(%rip),%ymm12
	mov	%rbx,-192(%rbp)
	mov	%rdi,%rbx
.L63:	cmpq	$4,-136(%rbp)
	je	.L198
	cmpq	$2,-136(%rbp)
	je	.L91
	mov	-112(%rbp),%rdx
	mov	%r11,%rax
	cmp	-112(%rbp),%r11
	je	.L90
.L92:	vpxor	32(%rax),%ymm10,%ymm2
	vpxor	(%rax),%ymm10,%ymm1
	add	$64,%rax
	vperm2i128	$32,%ymm2,%ymm1,%ymm0
	vperm2i128	$49,%ymm2,%ymm1,%ymm1
	vpunpcklqdq	%ymm1,%ymm0,%ymm2
	vpunpckhqdq	%ymm1,%ymm0,%ymm0
	vpminsd	%ymm0,%ymm2,%ymm1
	vpmaxsd	%ymm0,%ymm2,%ymm2
	vpunpcklqdq	%ymm2,%ymm1,%ymm0
	vpunpckhqdq	%ymm2,%ymm1,%ymm1
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vperm2i128	$32,%ymm0,%ymm2,%ymm1
	vperm2i128	$49,%ymm0,%ymm2,%ymm0
	vmovdqu	%ymm1,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,%rdx
	jne	.L92
.L90:	cmpb	$0,-152(%rbp)
	mov	-80(%rbp),%r12
	je	.L89
	mov	%rbx,%r13
	mov	%r11,%rbx
	.p2align 4,,10
	.p2align 3
.L146:	mov	%r12,%rdx
	sar	$2,%rdx
	mov	%r13,%rsi
	mov	%rbx,%rdi
	vzeroupper
	sar	$3,%r12
	call	int32_threestages
	cmp	$127,%r12
	vmovdqa	.LC1(%rip),%ymm11
	vmovdqa	.LC3(%rip),%ymm10
	vmovdqa	.LC2(%rip),%ymm12
	jg	.L146
	cmp	$32,%r12
	je	.L146
	mov	%rbx,%r11
	mov	%r13,%rbx
.L89:	cmp	$15,%r12
	jle	.L94
	mov	-120(%rbp),%r13
	.p2align 4,,10
	.p2align 3
.L100:	mov	%r12,%rdx
	sar	%rdx
	test	%rbx,%rbx
	jle	.L95
	lea	(%rdx,%rdx),%rcx
	lea	-1(%rdx),%r9
	lea	(%rcx,%rdx),%rsi
	lea	0(,%rdx,4),%r10
	xor	%r8d,%r8d
	and	$-8,%r9
	.p2align 4,,10
	.p2align 3
.L96:	lea	(%rdx,%r8),%rax
	cmp	%rax,%r8
	jge	.L99
	lea	(%r9,%r8),%rdi
	lea	(%r11,%r8,4),%rax
	lea	0(%r13,%rdi,4),%rdi
	.p2align 4,,10
	.p2align 3
.L97:	vmovdqu	(%rax,%rcx,4),%ymm1
	vmovdqu	(%rax),%ymm0
	vmovdqu	(%rax,%rsi,4),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	(%rax,%rdx,4),%ymm1
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm4,(%rax)
	vmovdqu	%ymm2,(%rax,%rdx,4)
	vmovdqu	%ymm3,(%rax,%rcx,4)
	vmovdqu	%ymm0,(%rax,%rsi,4)
	add	$32,%rax
	cmp	%rdi,%rax
	jne	.L97
.L99:	add	%r10,%r8
	cmp	%r8,%rbx
	jg	.L96
.L95:	sar	$2,%r12
	cmp	$15,%r12
	jg	.L100
.L94:	cmp	$8,%r12
	je	.L101
.L104:	mov	%r11,%rax
	test	%r15,%r15
	jle	.L103
	mov	-160(%rbp),%r9
	mov	-192(%rbp),%rdx
	mov	-200(%rbp),%rcx
	mov	-216(%rbp),%rsi
	mov	-224(%rbp),%rdi
	mov	-232(%rbp),%r8
	.p2align 4,,10
	.p2align 3
.L102:	vmovdqu	(%rax,%r15,4),%ymm0
	vmovdqu	(%rax),%ymm1
	vmovdqu	(%rax,%rcx,4),%ymm2
	vmovdqu	(%rax,%rdi,4),%ymm4
	vmovdqu	(%rax,%rdx,4),%ymm7
	vpminsd	%ymm0,%ymm1,%ymm5
	vpmaxsd	%ymm0,%ymm1,%ymm1
	vmovdqu	(%rax,%rsi,4),%ymm0
	vmovdqu	(%rax,%r14,4),%ymm8
	vpminsd	%ymm0,%ymm2,%ymm3
	vpmaxsd	%ymm0,%ymm2,%ymm0
	vpminsd	%ymm4,%ymm7,%ymm2
	vpmaxsd	%ymm4,%ymm7,%ymm7
	vmovdqu	(%rax,%r8,4),%ymm4
	vpminsd	%ymm3,%ymm5,%ymm9
	vpminsd	%ymm4,%ymm8,%ymm6
	vpmaxsd	%ymm4,%ymm8,%ymm4
	vpmaxsd	%ymm3,%ymm5,%ymm5
	vpminsd	%ymm0,%ymm1,%ymm8
	vpminsd	%ymm6,%ymm2,%ymm3
	vpmaxsd	%ymm0,%ymm1,%ymm0
	vpmaxsd	%ymm6,%ymm2,%ymm1
	vpminsd	%ymm4,%ymm7,%ymm2
	vpmaxsd	%ymm4,%ymm7,%ymm4
	vpminsd	%ymm3,%ymm9,%ymm6
	vpminsd	%ymm2,%ymm8,%ymm7
	vpmaxsd	%ymm3,%ymm9,%ymm3
	vpmaxsd	%ymm2,%ymm8,%ymm2
	vpminsd	%ymm1,%ymm5,%ymm8
	vpmaxsd	%ymm1,%ymm5,%ymm1
	vpminsd	%ymm4,%ymm0,%ymm5
	vpmaxsd	%ymm4,%ymm0,%ymm0
	vmovdqu	%ymm6,(%rax)
	vmovdqu	%ymm7,(%rax,%r15,4)
	vmovdqu	%ymm8,(%rax,%rcx,4)
	vmovdqu	%ymm5,(%rax,%rsi,4)
	vmovdqu	%ymm3,(%rax,%rdx,4)
	vmovdqu	%ymm2,(%rax,%rdi,4)
	vmovdqu	%ymm1,(%rax,%r14,4)
	vmovdqu	%ymm0,(%rax,%r8,4)
	add	$32,%rax
	cmp	%rax,%r9
	jne	.L102
.L103:	sarq	-136(%rbp)
	decl	-184(%rbp)
	jne	.L63
	cmpq	$0,-144(%rbp)
	jle	.L113
	mov	-176(%rbp),%rax
	vpcmpeqd	%ymm4,%ymm4,%ymm4
	shr	$6,%rax
	sal	$8,%rax
	lea	256(%r11,%rax),%rdx
	mov	%r11,%rax
	jmp	.L112
.L199:	vpxor	%ymm4,%ymm7,%ymm7
	vpxor	%ymm4,%ymm2,%ymm2
	vpxor	%ymm4,%ymm1,%ymm1
	vpxor	%ymm4,%ymm0,%ymm0
.L111:	vperm2i128	$32,%ymm5,%ymm9,%ymm11
	vperm2i128	$32,%ymm6,%ymm10,%ymm3
	vperm2i128	$32,%ymm1,%ymm7,%ymm12
	vperm2i128	$32,%ymm0,%ymm2,%ymm8
	vperm2i128	$49,%ymm6,%ymm10,%ymm6
	vperm2i128	$49,%ymm5,%ymm9,%ymm9
	vperm2i128	$49,%ymm1,%ymm7,%ymm1
	vperm2i128	$49,%ymm0,%ymm2,%ymm0
	vpminsd	%ymm3,%ymm12,%ymm7
	vpmaxsd	%ymm11,%ymm8,%ymm2
	vpminsd	%ymm9,%ymm0,%ymm10
	vpminsd	%ymm6,%ymm1,%ymm5
	vpmaxsd	%ymm9,%ymm0,%ymm0
	vpmaxsd	%ymm3,%ymm12,%ymm3
	vpmaxsd	%ymm6,%ymm1,%ymm1
	vpminsd	%ymm11,%ymm8,%ymm12
	vpminsd	%ymm12,%ymm7,%ymm9
	vpmaxsd	%ymm12,%ymm7,%ymm6
	vpminsd	%ymm10,%ymm5,%ymm8
	vpminsd	%ymm2,%ymm3,%ymm7
	vpmaxsd	%ymm10,%ymm5,%ymm5
	vpmaxsd	%ymm2,%ymm3,%ymm3
	vpminsd	%ymm0,%ymm1,%ymm2
	vpmaxsd	%ymm0,%ymm1,%ymm1
	vpminsd	%ymm8,%ymm9,%ymm10
	vpmaxsd	%ymm5,%ymm6,%ymm0
	vpmaxsd	%ymm8,%ymm9,%ymm8
	vpminsd	%ymm2,%ymm7,%ymm9
	vpmaxsd	%ymm2,%ymm7,%ymm7
	vpminsd	%ymm5,%ymm6,%ymm2
	vpminsd	%ymm1,%ymm3,%ymm5
	vpmaxsd	%ymm1,%ymm3,%ymm3
	vpunpckldq	%ymm9,%ymm10,%ymm11
	vpunpckhdq	%ymm9,%ymm10,%ymm6
	vpunpckldq	%ymm7,%ymm8,%ymm1
	vpunpckldq	%ymm5,%ymm2,%ymm9
	vpunpckldq	%ymm3,%ymm0,%ymm10
	vpunpckhdq	%ymm5,%ymm2,%ymm2
	vpunpckhdq	%ymm3,%ymm0,%ymm0
	vpunpckhdq	%ymm7,%ymm8,%ymm5
	vpunpcklqdq	%ymm9,%ymm11,%ymm3
	vpunpcklqdq	%ymm2,%ymm6,%ymm8
	vpunpckhqdq	%ymm9,%ymm11,%ymm7
	vpunpckhqdq	%ymm2,%ymm6,%ymm6
	vpunpcklqdq	%ymm0,%ymm5,%ymm9
	vpunpcklqdq	%ymm10,%ymm1,%ymm2
	vpunpckhqdq	%ymm0,%ymm5,%ymm0
	vpunpckhqdq	%ymm10,%ymm1,%ymm1
	vperm2i128	$32,%ymm2,%ymm3,%ymm12
	vperm2i128	$32,%ymm1,%ymm7,%ymm11
	vperm2i128	$32,%ymm0,%ymm6,%ymm5
	vperm2i128	$49,%ymm2,%ymm3,%ymm3
	vperm2i128	$32,%ymm9,%ymm8,%ymm10
	vperm2i128	$49,%ymm1,%ymm7,%ymm2
	vperm2i128	$49,%ymm0,%ymm6,%ymm0
	vperm2i128	$49,%ymm9,%ymm8,%ymm1
	vmovdqu	%ymm12,(%rax)
	vmovdqu	%ymm11,32(%rax)
	vmovdqu	%ymm10,64(%rax)
	vmovdqu	%ymm5,96(%rax)
	vmovdqu	%ymm3,128(%rax)
	vmovdqu	%ymm2,160(%rax)
	vmovdqu	%ymm1,192(%rax)
	vmovdqu	%ymm0,224(%rax)
	add	$256,%rax
	cmp	%rdx,%rax
	je	.L113
.L112:	vmovdqu	32(%rax),%ymm0
	vmovdqu	(%rax),%ymm2
	vmovdqu	128(%rax),%ymm3
	vpunpckhdq	%ymm0,%ymm2,%ymm5
	vpunpckldq	%ymm0,%ymm2,%ymm7
	vmovdqu	96(%rax),%ymm0
	vmovdqu	64(%rax),%ymm2
	vmovdqu	224(%rax),%ymm9
	vpunpckldq	%ymm0,%ymm2,%ymm6
	vpunpckhdq	%ymm0,%ymm2,%ymm2
	vmovdqu	160(%rax),%ymm0
	mov	-164(%rbp),%ebx
	vpunpckldq	%ymm0,%ymm3,%ymm1
	vpunpckhdq	%ymm0,%ymm3,%ymm0
	vmovdqu	192(%rax),%ymm3
	vpunpcklqdq	%ymm6,%ymm7,%ymm10
	vpunpckldq	%ymm9,%ymm3,%ymm8
	vpunpckhdq	%ymm9,%ymm3,%ymm3
	vpunpckhqdq	%ymm6,%ymm7,%ymm7
	vpunpcklqdq	%ymm2,%ymm5,%ymm9
	vpunpcklqdq	%ymm8,%ymm1,%ymm6
	vpunpckhqdq	%ymm2,%ymm5,%ymm2
	vpunpckhqdq	%ymm8,%ymm1,%ymm1
	vpunpcklqdq	%ymm3,%ymm0,%ymm5
	vpunpckhqdq	%ymm3,%ymm0,%ymm0
	test	%ebx,%ebx
	jne	.L199
	vpxor	%ymm4,%ymm10,%ymm10
	vpxor	%ymm4,%ymm9,%ymm9
	vpxor	%ymm4,%ymm6,%ymm6
	vpxor	%ymm4,%ymm5,%ymm5
	jmp	.L111
.L91:	mov	-112(%rbp),%rdx
	cmp	%rdx,%r11
	je	.L90
	mov	%r11,%rax
.L93:	vpxor	32(%rax),%ymm11,%ymm2
	vpxor	(%rax),%ymm11,%ymm1
	add	$64,%rax
	vperm2i128	$32,%ymm2,%ymm1,%ymm0
	vperm2i128	$49,%ymm2,%ymm1,%ymm1
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vperm2i128	$32,%ymm0,%ymm2,%ymm1
	vperm2i128	$49,%ymm0,%ymm2,%ymm0
	vmovdqu	%ymm1,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,%rdx
	jne	.L93
	jmp	.L90
.L101:	test	%rbx,%rbx
	jle	.L104
	mov	%r11,%rax
.L105:	vmovdqu	32(%rax),%ymm1
	vmovdqu	(%rax),%ymm0
	add	$64,%rax
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm2,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,-128(%rbp)
	jne	.L105
	jmp	.L104
.L198:	mov	%r11,%rax
	cmp	-112(%rbp),%r11
	je	.L90
.L87:	vpxor	32(%rax),%ymm12,%ymm0
	vpxor	(%rax),%ymm12,%ymm1
	vmovdqu	%ymm0,32(%rax)
	vmovdqu	%ymm1,(%rax)
	add	$64,%rax
	cmp	%rax,-112(%rbp)
	jne	.L87
	jmp	.L90
.L113:	cmpb	$0,-152(%rbp)
	mov	-80(%rbp),%r13
	je	.L109
	mov	%r15,-112(%rbp)
	mov	-120(%rbp),%r15
.L145:	mov	-80(%rbp),%rdx
	sar	$2,%rdx
	cmpq	$0,-144(%rbp)
	jle	.L114
	lea	(%rdx,%rdx),%rdi
	lea	0(,%rdx,8),%r14
	lea	(%rdi,%rdx),%rcx
	lea	0(,%rdx,4),%rsi
	mov	%r14,%r8
	lea	-1(%rdx),%r13
	lea	(%rsi,%rdx),%r10
	lea	(%rcx,%rcx),%r9
	sub	%rdx,%r8
	xor	%r12d,%r12d
	and	$-8,%r13
	.p2align 4,,10
	.p2align 3
.L115:	lea	(%rdx,%r12),%rax
	cmp	%rax,%r12
	jge	.L118
	lea	0(%r13,%r12),%rbx
	lea	(%r11,%r12,4),%rax
	lea	(%r15,%rbx,4),%rbx
	.p2align 4,,10
	.p2align 3
.L116:	vmovdqu	(%rax,%rsi,4),%ymm0
	vmovdqu	(%rax),%ymm6
	vmovdqu	(%rax,%rdx,4),%ymm1
	vpminsd	%ymm0,%ymm6,%ymm7
	vpmaxsd	%ymm0,%ymm6,%ymm6
	vmovdqu	(%rax,%r10,4),%ymm0
	vmovdqu	(%rax,%r8,4),%ymm8
	vpmaxsd	%ymm0,%ymm1,%ymm3
	vpminsd	%ymm0,%ymm1,%ymm2
	vmovdqu	(%rax,%r9,4),%ymm1
	vmovdqu	(%rax,%rdi,4),%ymm0
	vpminsd	%ymm1,%ymm0,%ymm4
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	(%rax,%rcx,4),%ymm1
	vpminsd	%ymm8,%ymm1,%ymm5
	vpmaxsd	%ymm8,%ymm1,%ymm1
	vpminsd	%ymm4,%ymm7,%ymm8
	vpmaxsd	%ymm4,%ymm7,%ymm4
	vpminsd	%ymm5,%ymm2,%ymm7
	vpmaxsd	%ymm5,%ymm2,%ymm2
	vpminsd	%ymm0,%ymm6,%ymm5
	vpmaxsd	%ymm0,%ymm6,%ymm0
	vpminsd	%ymm1,%ymm3,%ymm6
	vpmaxsd	%ymm1,%ymm3,%ymm1
	vpminsd	%ymm7,%ymm8,%ymm9
	vpmaxsd	%ymm7,%ymm8,%ymm3
	vpminsd	%ymm2,%ymm4,%ymm8
	vpminsd	%ymm6,%ymm5,%ymm7
	vpmaxsd	%ymm2,%ymm4,%ymm2
	vpmaxsd	%ymm6,%ymm5,%ymm5
	vpminsd	%ymm1,%ymm0,%ymm4
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm9,(%rax)
	vmovdqu	%ymm3,(%rax,%rdx,4)
	vmovdqu	%ymm8,(%rax,%rdi,4)
	vmovdqu	%ymm2,(%rax,%rcx,4)
	vmovdqu	%ymm7,(%rax,%rsi,4)
	vmovdqu	%ymm5,(%rax,%r10,4)
	vmovdqu	%ymm4,(%rax,%r9,4)
	vmovdqu	%ymm0,(%rax,%r8,4)
	add	$32,%rax
	cmp	%rbx,%rax
	jne	.L116
.L118:	add	%r14,%r12
	cmp	%r12,-144(%rbp)
	jg	.L115
.L114:	sarq	$3,-80(%rbp)
	mov	-80(%rbp),%rax
	cmp	$127,%rax
	jg	.L145
	cmp	$32,%rax
	je	.L145
	mov	-112(%rbp),%r15
	mov	%rax,%r13
.L109:	cmp	$15,%r13
	jle	.L119
	mov	-144(%rbp),%r10
	mov	-120(%rbp),%r12
.L125:	mov	%r13,%rdx
	sar	%rdx
	test	%r10,%r10
	jle	.L120
	lea	(%rdx,%rdx),%rcx
	lea	-1(%rdx),%r9
	lea	(%rcx,%rdx),%rsi
	lea	0(,%rdx,4),%rbx
	xor	%r8d,%r8d
	and	$-8,%r9
	.p2align 4,,10
	.p2align 3
.L121:	lea	(%rdx,%r8),%rax
	cmp	%rax,%r8
	jge	.L124
	lea	(%r9,%r8),%rdi
	lea	(%r11,%r8,4),%rax
	lea	(%r12,%rdi,4),%rdi
	.p2align 4,,10
	.p2align 3
.L122:	vmovdqu	(%rax,%rcx,4),%ymm1
	vmovdqu	(%rax),%ymm0
	vmovdqu	(%rax,%rsi,4),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	(%rax,%rdx,4),%ymm1
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm4,(%rax)
	vmovdqu	%ymm2,(%rax,%rdx,4)
	vmovdqu	%ymm3,(%rax,%rcx,4)
	vmovdqu	%ymm0,(%rax,%rsi,4)
	add	$32,%rax
	cmp	%rax,%rdi
	jne	.L122
.L124:	add	%rbx,%r8
	cmp	%r8,%r10
	jg	.L121
.L120:	sar	$2,%r13
	cmp	$15,%r13
	jg	.L125
	mov	%r13,-80(%rbp)
.L119:	cmpq	$8,-80(%rbp)
	je	.L126
.L129:	test	%r15,%r15
	jle	.L192
	lea	(%r15,%r15),%rsi
	lea	(%rsi,%r15),%rdx
	lea	0(,%r15,4),%rcx
	lea	0(,%r15,8),%rax
	mov	-208(%rbp),%r9
	lea	(%rcx,%r15),%r8
	lea	(%rdx,%rdx),%rdi
	sub	%r15,%rax
	vpcmpeqd	%ymm6,%ymm6,%ymm6
.L132:	vmovdqu	(%r9,%r15,4),%ymm1
	vmovdqu	(%r9),%ymm0
	vmovdqu	(%r9,%r8,4),%ymm8
	vpmaxsd	%ymm0,%ymm1,%ymm4
	vpminsd	%ymm0,%ymm1,%ymm5
	vmovdqu	(%r9,%rdx,4),%ymm0
	vmovdqu	(%r9,%rsi,4),%ymm1
	vmovdqu	(%r9,%rdi,4),%ymm7
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm1
	vmovdqu	(%r9,%rcx,4),%ymm0
	mov	-164(%rbp),%r10d
	vpminsd	%ymm0,%ymm8,%ymm2
	vpmaxsd	%ymm0,%ymm8,%ymm8
	vmovdqu	(%r9,%rax,4),%ymm0
	vpminsd	%ymm7,%ymm0,%ymm10
	vpmaxsd	%ymm7,%ymm0,%ymm0
	vpminsd	%ymm10,%ymm2,%ymm9
	vpminsd	%ymm3,%ymm5,%ymm7
	vpmaxsd	%ymm10,%ymm2,%ymm2
	vpmaxsd	%ymm3,%ymm5,%ymm5
	vpminsd	%ymm1,%ymm4,%ymm3
	vpmaxsd	%ymm1,%ymm4,%ymm1
	vpminsd	%ymm0,%ymm8,%ymm4
	vpmaxsd	%ymm0,%ymm8,%ymm8
	vpminsd	%ymm4,%ymm3,%ymm11
	vpminsd	%ymm9,%ymm7,%ymm0
	vpmaxsd	%ymm4,%ymm3,%ymm3
	vpmaxsd	%ymm9,%ymm7,%ymm7
	vpminsd	%ymm8,%ymm1,%ymm4
	vpminsd	%ymm2,%ymm5,%ymm9
	vpmaxsd	%ymm8,%ymm1,%ymm1
	vpmaxsd	%ymm2,%ymm5,%ymm2
	vpunpckldq	%ymm3,%ymm11,%ymm10
	vpunpckhdq	%ymm2,%ymm9,%ymm5
	vpunpckhdq	%ymm3,%ymm11,%ymm3
	vpunpckldq	%ymm7,%ymm0,%ymm8
	vpunpckldq	%ymm2,%ymm9,%ymm11
	vpunpckhdq	%ymm7,%ymm0,%ymm0
	vpunpckldq	%ymm1,%ymm4,%ymm9
	vpunpckhdq	%ymm1,%ymm4,%ymm4
	vpunpcklqdq	%ymm5,%ymm0,%ymm2
	vpunpcklqdq	%ymm9,%ymm10,%ymm13
	vpunpcklqdq	%ymm4,%ymm3,%ymm12
	vpunpcklqdq	%ymm11,%ymm8,%ymm7
	vpunpckhqdq	%ymm9,%ymm10,%ymm1
	vpunpckhqdq	%ymm11,%ymm8,%ymm8
	vpunpckhqdq	%ymm4,%ymm3,%ymm4
	vpunpckhqdq	%ymm5,%ymm0,%ymm0
	vperm2i128	$32,%ymm12,%ymm2,%ymm10
	vperm2i128	$32,%ymm1,%ymm8,%ymm9
	vperm2i128	$32,%ymm4,%ymm0,%ymm5
	vperm2i128	$32,%ymm13,%ymm7,%ymm11
	vperm2i128	$49,%ymm13,%ymm7,%ymm3
	vperm2i128	$49,%ymm12,%ymm2,%ymm2
	vperm2i128	$49,%ymm1,%ymm8,%ymm1
	vperm2i128	$49,%ymm4,%ymm0,%ymm0
	test	%r10d,%r10d
	je	.L131
	vpxor	%ymm6,%ymm11,%ymm11
	vpxor	%ymm6,%ymm10,%ymm10
	vpxor	%ymm6,%ymm9,%ymm9
	vpxor	%ymm6,%ymm5,%ymm5
	vpxor	%ymm6,%ymm3,%ymm3
	vpxor	%ymm6,%ymm2,%ymm2
	vpxor	%ymm6,%ymm1,%ymm1
	vpxor	%ymm6,%ymm0,%ymm0
.L131:	vmovdqu	%ymm11,(%r9)
	vmovdqu	%ymm3,(%r9,%r15,4)
	vmovdqu	%ymm10,(%r9,%rsi,4)
	vmovdqu	%ymm2,(%r9,%rdx,4)
	vmovdqu	%ymm9,(%r9,%rcx,4)
	vmovdqu	%ymm1,(%r9,%r8,4)
	vmovdqu	%ymm5,(%r9,%rdi,4)
	vmovdqu	%ymm0,(%r9,%rax,4)
	add	$32,%r9
	cmp	%r9,-160(%rbp)
	jne	.L132
.L192:	vzeroupper
.L190:	add	$200,%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	lea	-16(%r13),%rsp
	pop	%r13
	ret
.L59:	dec	%rax
	mov	%rax,-176(%rbp)
	shr	$5,%rax
	sal	$7,%rax
	lea	128(%r11,%rax),%rax
	mov	%rax,-184(%rbp)
	vpcmpeqd	%ymm0,%ymm0,%ymm0
	mov	%r11,%rax
.L64:	vpxor	64(%rax),%ymm0,%ymm1
	vpxor	(%rax),%ymm0,%ymm2
	vmovdqu	%ymm1,64(%rax)
	vmovdqu	%ymm2,(%rax)
	sub	$-128,%rax
	cmp	-184(%rbp),%rax
	jne	.L64
	mov	-176(%rbp),%rdi
	lea	64(%r11),%rsi
	mov	%rdi,%rax
	shr	$4,%rax
	sal	$6,%rax
	add	%rsi,%rax
	mov	%rax,-208(%rbp)
	mov	%rdi,%rax
	shr	$6,%rax
	sal	$8,%rax
	lea	256(%r11,%rax),%rax
	mov	$4,%ecx
	mov	%r14,%r8
	mov	%rsi,-192(%rbp)
	mov	%rax,-216(%rbp)
	movq	$8,-112(%rbp)
	vpcmpeqd	%ymm11,%ymm11,%ymm11
	mov	%r10,%r14
	cmp	$64,%rcx
	je	.L200
.L68:	cmp	$32,%rcx
	je	.L201
	cmp	$16,%rcx
	je	.L74
	cmp	$8,%rcx
	je	.L202
.L76:	mov	-112(%rbp),%rdi
	xor	%edx,%edx
	lea	(%rdi,%rdi),%rax
	cmp	%r15,%rax
	mov	%rax,-152(%rbp)
	setne	%al
	movzbl	%al,%eax
	mov	%eax,-160(%rbp)
	lea	-1(%rdi),%rax
	sete	%dl
	and	$-8,%rax
	movq	$0,-136(%rbp)
	mov	%rax,-128(%rbp)
	mov	%rdi,%r10
	test	%r15,%r15
	jle	.L73
.L78:	mov	-112(%rbp),%rax
	mov	-136(%rbp),%rdi
	add	%r10,%rax
	cmp	%rdi,%rax
	jle	.L81
	mov	%rdi,%rsi
	.p2align 4,,10
	.p2align 3
.L84:	mov	%rsi,%rcx
	mov	%rsi,%rdi
	add	-112(%rbp),%rsi
	cmp	%rsi,%rcx
	jge	.L83
	lea	(%r11,%rcx,4),%rax
	mov	%rax,-80(%rbp)
	mov	-120(%rbp),%rax
	add	-128(%rbp),%rcx
	lea	(%rax,%rcx,4),%rcx
	mov	-80(%rbp),%rax
	.p2align 4,,10
	.p2align 3
.L80:	vmovdqu	(%rax),%ymm0
	vmovdqu	(%rax,%r15,4),%ymm15
	vmovdqu	(%rax,%r13,4),%ymm7
	vpminsd	%ymm0,%ymm15,%ymm6
	vpmaxsd	%ymm0,%ymm15,%ymm15
	vmovdqu	(%rax,%r12,4),%ymm0
	vmovdqu	(%rax,%r14,4),%ymm5
	vpminsd	%ymm0,%ymm7,%ymm1
	vpmaxsd	%ymm0,%ymm7,%ymm7
	vmovdqu	(%rax,%rbx,4),%ymm0
	vmovdqu	(%rax,%r9,4),%ymm4
	vpminsd	%ymm0,%ymm5,%ymm9
	vpmaxsd	%ymm0,%ymm5,%ymm5
	vmovdqu	(%rax,%r8,4),%ymm0
	vpminsd	%ymm1,%ymm6,%ymm8
	vpminsd	%ymm0,%ymm4,%ymm3
	vpmaxsd	%ymm0,%ymm4,%ymm4
	vpminsd	%ymm3,%ymm9,%ymm2
	vpmaxsd	%ymm4,%ymm5,%ymm0
	vpmaxsd	%ymm3,%ymm9,%ymm3
	vpmaxsd	%ymm1,%ymm6,%ymm6
	vpminsd	%ymm7,%ymm15,%ymm1
	vpmaxsd	%ymm7,%ymm15,%ymm15
	vpminsd	%ymm4,%ymm5,%ymm7
	vpminsd	%ymm2,%ymm8,%ymm14
	vpminsd	%ymm7,%ymm1,%ymm13
	vpminsd	%ymm3,%ymm6,%ymm12
	vpminsd	%ymm0,%ymm15,%ymm10
	vpmaxsd	%ymm3,%ymm6,%ymm6
	vpmaxsd	%ymm2,%ymm8,%ymm2
	vpmaxsd	%ymm7,%ymm1,%ymm1
	vpmaxsd	%ymm0,%ymm15,%ymm0
	vmovdqa	%ymm6,-80(%rbp)
	vmovdqa	%ymm6,%ymm3
	vmovdqa	%ymm14,%ymm9
	vmovdqa	%ymm2,%ymm5
	vmovdqa	%ymm13,%ymm8
	vmovdqa	%ymm1,%ymm4
	vmovdqa	%ymm12,%ymm7
	vmovdqa	%ymm10,%ymm6
	vmovdqa	%ymm0,%ymm15
	test	%edx,%edx
	je	.L79
	vpxor	-80(%rbp),%ymm11,%ymm3
	vpxor	%ymm14,%ymm11,%ymm9
	vpxor	%ymm13,%ymm11,%ymm8
	vpxor	%ymm12,%ymm11,%ymm7
	vpxor	%ymm10,%ymm11,%ymm6
	vpxor	%ymm2,%ymm11,%ymm5
	vpxor	%ymm1,%ymm11,%ymm4
	vpxor	%ymm0,%ymm11,%ymm15
.L79:	vmovdqu	%ymm9,(%rax)
	vmovdqu	%ymm8,(%rax,%r15,4)
	vmovdqu	%ymm7,(%rax,%r12,4)
	vmovdqu	%ymm6,(%rax,%r13,4)
	vmovdqu	%ymm5,(%rax,%rbx,4)
	vmovdqu	%ymm4,(%rax,%r14,4)
	vmovdqu	%ymm3,(%rax,%r8,4)
	vmovdqu	%ymm15,(%rax,%r9,4)
	add	$32,%rax
	cmp	%rax,%rcx
	jne	.L80
.L83:	xor	$1,%edx
	cmp	%rdi,%r10
	jg	.L84
.L81:	mov	-152(%rbp),%rdi
	xor	-160(%rbp),%edx
	add	%rdi,-136(%rbp)
	add	%rdi,%r10
	mov	-136(%rbp),%rax
	cmp	%rax,%r15
	jg	.L78
.L73:	mov	-112(%rbp),%rax
	sal	$4,%rax
	cmp	-144(%rbp),%rax
	je	.L203
	mov	-152(%rbp),%rax
	mov	%rax,%rcx
	sar	%rcx
	cmp	$254,%rax
	jle	.L66
	mov	%r8,-136(%rbp)
	mov	%r9,-160(%rbp)
	mov	%r15,-80(%rbp)
	mov	-144(%rbp),%r15
	mov	%rbx,-112(%rbp)
	mov	%r12,-128(%rbp)
	mov	%rcx,%rbx
	mov	%r11,%r12
.L67:	mov	%rbx,%rdx
	sar	$2,%rdx
	mov	%r15,%rsi
	mov	%r12,%rdi
	vzeroupper
	sar	$3,%rbx
	call	int32_threestages
	cmp	$127,%rbx
	vpcmpeqd	%ymm11,%ymm11,%ymm11
	jg	.L67
	mov	%rbx,%rcx
	mov	%r12,%r11
	mov	-80(%rbp),%r15
	mov	-112(%rbp),%rbx
	mov	-128(%rbp),%r12
	mov	-136(%rbp),%r8
	mov	-160(%rbp),%r9
.L66:	mov	-152(%rbp),%rax
	mov	%rax,-112(%rbp)
	cmp	$64,%rcx
	jne	.L68
.L200:	mov	-144(%rbp),%rsi
	mov	%r11,%rdi
	vzeroupper
	call	int32_twostages_32
	vpcmpeqd	%ymm11,%ymm11,%ymm11
.L74:	mov	%r11,%rax
.L69:	vmovdqu	64(%rax),%ymm1
	vmovdqu	(%rax),%ymm0
	vmovdqu	96(%rax),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	32(%rax),%ymm1
	sub	$-128,%rax
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm4,-128(%rax)
	vmovdqu	%ymm2,-96(%rax)
	vmovdqu	%ymm3,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	-184(%rbp),%rax
	jne	.L69
	jmp	.L76
.L202:	mov	%r11,%rax
.L77:	vmovdqu	32(%rax),%ymm0
	vmovdqu	(%rax),%ymm1
	add	$64,%rax
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm2,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,-208(%rbp)
	jne	.L77
	jmp	.L76
.L203:	mov	%r14,%r10
	mov	%r8,%r14
	jmp	.L60
.L201:	mov	%r11,%rax
.L71:	vmovdqu	128(%rax),%ymm0
	vmovdqu	(%rax),%ymm6
	vmovdqu	32(%rax),%ymm1
	vpminsd	%ymm0,%ymm6,%ymm7
	vpmaxsd	%ymm0,%ymm6,%ymm6
	vmovdqu	160(%rax),%ymm0
	vmovdqu	224(%rax),%ymm8
	vpminsd	%ymm0,%ymm1,%ymm5
	vpmaxsd	%ymm0,%ymm1,%ymm3
	vmovdqu	192(%rax),%ymm1
	vmovdqu	64(%rax),%ymm0
	add	$256,%rax
	vpminsd	%ymm1,%ymm0,%ymm4
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	-160(%rax),%ymm1
	vpminsd	%ymm8,%ymm1,%ymm2
	vpmaxsd	%ymm8,%ymm1,%ymm1
	vpminsd	%ymm4,%ymm7,%ymm8
	vpmaxsd	%ymm4,%ymm7,%ymm4
	vpminsd	%ymm2,%ymm5,%ymm7
	vpmaxsd	%ymm2,%ymm5,%ymm2
	vpminsd	%ymm0,%ymm6,%ymm5
	vpmaxsd	%ymm0,%ymm6,%ymm0
	vpminsd	%ymm1,%ymm3,%ymm6
	vpmaxsd	%ymm1,%ymm3,%ymm1
	vpminsd	%ymm7,%ymm8,%ymm9
	vpmaxsd	%ymm7,%ymm8,%ymm3
	vpminsd	%ymm2,%ymm4,%ymm8
	vpminsd	%ymm6,%ymm5,%ymm7
	vpmaxsd	%ymm2,%ymm4,%ymm2
	vpmaxsd	%ymm6,%ymm5,%ymm5
	vpminsd	%ymm1,%ymm0,%ymm4
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm9,-256(%rax)
	vmovdqu	%ymm3,-224(%rax)
	vmovdqu	%ymm8,-192(%rax)
	vmovdqu	%ymm2,-160(%rax)
	vmovdqu	%ymm7,-128(%rax)
	vmovdqu	%ymm5,-96(%rax)
	vmovdqu	%ymm4,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,-216(%rbp)
	jne	.L71
	jmp	.L76
.L194:	mov	4(%rdi),%eax
	mov	12(%rdi),%ebx
	cmp	(%rdi),%eax
	cmovg	(%rdi),%ecx
	cmovg	%eax,%r9d
	cmp	8(%rdi),%ebx
	cmovg	8(%rdi),%eax
	cmovg	%ebx,%edi
	cmp	%ecx,%eax
	cmovg	%ecx,%r8d
	cmovg	%eax,%eax
	cmp	%r9d,%edi
	cmovg	%r9d,%edx
	cmovg	%edi,%r9d
	cmp	%eax,%edx
	cmovg	%eax,%r12d
	cmovg	%edx,%r10d
	mov	20(%r11),%eax
	cmp	16(%r11),%eax
	cmovg	16(%r11),%esi
	cmovg	%eax,%ecx
	mov	28(%r11),%eax
	cmp	24(%r11),%eax
	cmovg	24(%r11),%edx
	cmovg	%eax,%edi
	cmp	%ecx,%edi
	cmovg	%ecx,%eax
	cmovg	%edi,%edi
	cmp	%esi,%edx
	cmovg	%esi,%ecx
	cmovg	%edx,%edx
	cmp	%r9d,%edi
	cmovg	%r9d,%ebx
	cmovg	%edi,%edi
	cmp	%edx,%eax
	cmovg	%edx,%esi
	cmovg	%eax,%edx
	mov	%edi,(%r11)
	cmp	%r12d,%esi
	cmovg	%r12d,%r9d
	cmovg	%esi,%esi
	cmp	%r10d,%edx
	cmovg	%r10d,%eax
	cmovg	%edx,%edx
	cmp	%esi,%ebx
	cmovg	%esi,%r13d
	cmovg	%ebx,%ebx
	cmp	%r8d,%ecx
	cmovg	%r8d,%esi
	cmovg	%ecx,%ecx
	cmp	%ecx,%eax
	cmovg	%ecx,%r8d
	cmovg	%eax,%eax
	mov	%esi,28(%r11)
	cmp	%edx,%ebx
	cmovg	%edx,%r12d
	cmovg	%ebx,%ecx
	cmp	%eax,%r13d
	cmovg	%eax,%ebx
	cmovg	%r13d,%edx
	mov	%ecx,4(%r11)
	cmp	%r8d,%r9d
	cmovg	%r8d,%r10d
	cmovg	%r9d,%eax
	mov	%r12d,8(%r11)
	mov	%edx,12(%r11)
	mov	%ebx,16(%r11)
	mov	%eax,20(%r11)
	mov	%r10d,24(%r11)
	jmp	.L190
.L126:	cmpq	$0,-144(%rbp)
	jle	.L129
	mov	%r11,%rax
.L130:	vmovdqu	32(%rax),%ymm1
	vmovdqu	(%rax),%ymm0
	add	$64,%rax
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm2,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,-128(%rbp)
	jne	.L130
	jmp	.L129
.L195:	vmovdqa	.LC0(%rip),%ymm1
	vmovdqa	.LC1(%rip),%ymm3
	vpxor	32(%rdi),%ymm1,%ymm2
	vpxor	(%rdi),%ymm1,%ymm1
	mov	-164(%rbp),%r14d
	vpunpckldq	%ymm2,%ymm1,%ymm0
	vpunpckhdq	%ymm2,%ymm1,%ymm1
	vpunpcklqdq	%ymm1,%ymm0,%ymm2
	vpunpckhqdq	%ymm1,%ymm0,%ymm0
	vpminsd	%ymm2,%ymm0,%ymm1
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vpxor	%ymm3,%ymm1,%ymm1
	vpxor	%ymm3,%ymm0,%ymm0
	vpunpckldq	%ymm0,%ymm1,%ymm4
	vpunpckhdq	%ymm0,%ymm1,%ymm0
	vpmaxsd	%ymm0,%ymm4,%ymm1
	vpminsd	%ymm0,%ymm4,%ymm2
	vpunpcklqdq	%ymm1,%ymm2,%ymm0
	vpunpckhqdq	%ymm1,%ymm2,%ymm2
	vpunpckldq	%ymm2,%ymm0,%ymm1
	vpunpckhdq	%ymm2,%ymm0,%ymm0
	vpunpcklqdq	%ymm0,%ymm1,%ymm4
	vpunpckhqdq	%ymm0,%ymm1,%ymm1
	vpminsd	%ymm4,%ymm1,%ymm2
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpunpckldq	%ymm1,%ymm2,%ymm0
	vpunpckhdq	%ymm1,%ymm2,%ymm1
	vpxor	%ymm3,%ymm1,%ymm1
	vpxor	%ymm3,%ymm0,%ymm0
	vperm2i128	$32,%ymm1,%ymm0,%ymm2
	vperm2i128	$49,%ymm1,%ymm0,%ymm0
	vpminsd	%ymm2,%ymm0,%ymm1
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vperm2i128	$32,%ymm0,%ymm1,%ymm3
	vperm2i128	$49,%ymm0,%ymm1,%ymm0
	vpminsd	%ymm3,%ymm0,%ymm2
	vpmaxsd	%ymm3,%ymm0,%ymm0
	vpunpcklqdq	%ymm0,%ymm2,%ymm1
	vpunpckhqdq	%ymm0,%ymm2,%ymm2
	vpunpckldq	%ymm2,%ymm1,%ymm0
	vpunpckhdq	%ymm2,%ymm1,%ymm1
	vpunpcklqdq	%ymm1,%ymm0,%ymm3
	vpunpckhqdq	%ymm1,%ymm0,%ymm0
	vpminsd	%ymm3,%ymm0,%ymm2
	vpmaxsd	%ymm3,%ymm0,%ymm0
	vpunpckldq	%ymm0,%ymm2,%ymm1
	vpunpckhdq	%ymm0,%ymm2,%ymm0
	vpunpcklqdq	%ymm0,%ymm1,%ymm2
	vpunpckhqdq	%ymm0,%ymm1,%ymm1
	vpcmpeqd	%ymm0,%ymm0,%ymm0
	test	%r14d,%r14d
	je	.L54
	vpxor	%ymm0,%ymm1,%ymm1
	mov	%edx,%esi
.L55:	vmovdqa	%ymm2,%ymm0
	mov	%r11,%rdi
.L193:	add	$200,%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	lea	-16(%r13),%rsp
	pop	%r13
	jmp	merge16_finish
.L197:	lea	-1(%r15),%rax
	mov	%rax,-200(%rbp)
	lea	32(%rdi),%rax
	mov	%rax,-120(%rbp)
	jmp	.L62
.L196:	mov	$1,%edx
	mov	$16,%esi
	mov	%rdi,-80(%rbp)
	call	djbsort$avx2_2power
	mov	-80(%rbp),%r11
	xor	%edx,%edx
	lea	64(%r11),%r12
	mov	$16,%esi
	mov	%r12,%rdi
	call	djbsort$avx2_2power
	mov	-80(%rbp),%r11
	mov	-164(%rbp),%r13d
	vmovdqu	(%r11),%ymm4
	vmovdqu	32(%r11),%ymm1
	vmovdqu	64(%r11),%ymm2
	vmovdqu	96(%r11),%ymm3
	test	%r13d,%r13d
	je	.L57
	vpcmpeqd	%ymm0,%ymm0,%ymm0
	vpxor	%ymm0,%ymm4,%ymm4
	vpxor	%ymm0,%ymm1,%ymm1
	vpxor	%ymm0,%ymm2,%ymm2
	vpxor	%ymm0,%ymm3,%ymm3
.L57:	mov	-164(%rbp),%esi
	vpmaxsd	%ymm1,%ymm3,%ymm5
	vpminsd	%ymm4,%ymm2,%ymm0
	vpminsd	%ymm1,%ymm3,%ymm1
	vpmaxsd	%ymm4,%ymm2,%ymm4
	mov	%r11,%rdi
	vmovdqa	%ymm4,-112(%rbp)
	vmovdqa	%ymm5,-80(%rbp)
	call	merge16_finish
	vmovdqa	-80(%rbp),%ymm5
	vmovdqa	-112(%rbp),%ymm4
	vmovdqa	%ymm5,%ymm1
	vmovdqa	%ymm4,%ymm0
	mov	%r12,%rdi
	jmp	.L193
.L54:	vpxor	%ymm0,%ymm2,%ymm2
	mov	%edx,%esi
	jmp	.L55
	.endfn	djbsort$avx2_2power,globl

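//	Sorts array of signed 32-bit integers.
//
//	Entry point for the AVX2 djbsort port: arrays of at most eight
//	elements are sorted with branchless cmov networks, power-of-two
//	lengths go straight to djbsort$avx2_2power, small other lengths
//	are copied into a stack buffer padded with INT32_MAX and sorted
//	there, and the rest are split into a power-of-two prefix plus a
//	recursively sorted remainder that minmax_vector passes merge.
//	(Call structure inferred from the code below.)
//
//	@param	rdi points to int32 array
//	@param	rsi is element count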
	.p2align 4
djbsort$avx2:
	push	%rbp
	mov	%rsp,%rbp
	push	%r15
	mov	%rdi,%r15
	push	%r14
	mov	%rsi,%r14
	push	%r13
	push	%r12
	push	%rbx
	and	$-32,%rsp
	sub	$1056,%rsp
	cmp	$8,%rsi
	jle	.L265
	blsr	%rsi,%rax
	je	.L220
	lea	-8(%rsi),%rax
	mov	%rax,8(%rsp)
	mov	$8,%ebx
	cmp	$8,%rax
	jle	.L266
	.p2align 4,,10
	.p2align 3
.L221:	mov	%rbx,%rax
	mov	%r14,%r12
	add	%rbx,%rbx
	sub	%rbx,%r12
	cmp	%rbx,%r12
	jg	.L221
	cmp	$128,%rbx
	jle	.L267
	mov	$1,%edx
	mov	%rbx,%rsi
	mov	%r15,%rdi
	call	djbsort$avx2_2power
	lea	(%r15,%rbx,4),%rdi
	mov	%r12,%rsi
	call	djbsort$avx2
	lea	32(%r15),%rax
	mov	%rax,16(%rsp)
	jmp	.L230
	.p2align 4,,10
	.p2align 3
.L228:	lea	0(%r13,%r12),%rdx
	add	%r11,%r12
	sub	%r9,%rdx
	lea	(%r15,%r12,4),%rsi
	mov	%r8,%rdi
	sar	$3,%rbx
	call	minmax_vector
	cmp	$63,%rbx
	jle	.L268
.L230:	mov	%rbx,%r12
	sar	$2,%r12
	mov	%r12,%rdx
	mov	%r14,%rsi
	mov	%r15,%rdi
	call	int32_threestages
	lea	0(,%r12,4),%rcx
	mov	%r14,%rdx
	sub	%rcx,%rdx
	lea	(%rcx,%rax),%r13
	lea	(%r15,%rax,4),%r8
	lea	(%r15,%r13,4),%rsi
	sub	%rax,%rdx
	mov	%r8,%rdi
	mov	%rsi,24(%rsp)
	mov	%rax,%r9
	mov	%rax,%r11
	call	minmax_vector
	cmp	%r14,%r13
	mov	24(%rsp),%rsi
	lea	(%r12,%r12),%r10
	jle	.L269
.L226:	mov	%r14,%r13
	sub	%r10,%r13
	lea	(%r11,%r12,2),%rax
	mov	%r13,%rdx
	sub	%r9,%rdx
	lea	(%r15,%rax,4),%rsi
	mov	%r8,%rdi
	call	minmax_vector
	add	%r9,%r10
	cmp	%r14,%r10
	jg	.L228
	mov	%r10,%rax
	sub	%r12,%rax
	mov	%r10,%r11
	lea	(%r15,%r10,4),%r8
	cmp	%rax,%r9
	jge	.L247
	sub	%r9,%rax
	dec	%rax
	and	$-8,%rax
	lea	(%r15,%r9,4),%rdx
	add	%rax,%r9
	mov	16(%rsp),%rax
	lea	(%rax,%r9,4),%rax
	.p2align 4,,10
	.p2align 3
.L229:	vmovdqu	(%rdx,%r12,4),%ymm0
	vmovdqu	(%rdx),%ymm1
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm2,(%rdx)
	vmovdqu	%ymm0,(%rdx,%r12,4)
	add	$32,%rdx
	cmp	%rdx,%rax
	jne	.L229
	mov	%r10,%r9
	vzeroupper
	jmp	.L228
	.p2align 4,,10
	.p2align 3
.L267:	mov	%rbx,%rdx
	sar	$2,%rdx
	sar	$3,%rbx
	lea	0(,%rax,4),%r12
	lea	32(%rsp),%r13
	cmp	%rbx,%rdx
	jle	.L224
	vmovdqa	.LC4(%rip),%ymm0
	.p2align 4,,10
	.p2align 3
.L225:	mov	%rbx,%rax
	sal	$5,%rax
	inc	%rbx
	vmovdqa	%ymm0,0(%r13,%rax)
	cmp	%rdx,%rbx
	jl	.L225
	vzeroupper
.L224:	sal	$2,%r14
	mov	%r14,%rdx
	mov	%r15,%rsi
	mov	%r13,%rdi
	call	memcpy
	xor	%edx,%edx
	mov	%r12,%rsi
	mov	%r13,%rdi
	call	djbsort$avx2_2power
	mov	%r14,%rdx
	mov	%r13,%rsi
	mov	%r15,%rdi
	call	memcpy
.L263:	lea	-40(%rbp),%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	ret
	.p2align 4,,10
	.p2align 3
.L269:	lea	(%r12,%r9),%rax
	cmp	%rax,%r9
	jge	.L246
	notq	%r11
	add	%r11,%rax
	and	$-8,%rax
	add	%rax,%r9
	mov	16(%rsp),%rax
	lea	(%r10,%r12),%rdx
	mov	%r8,%rdi
	lea	(%rax,%r9,4),%rax
	.p2align 4,,10
	.p2align 3
.L227:	vmovdqu	(%rdi),%ymm1
	vmovdqu	(%rdi,%r10,4),%ymm0
	vmovdqu	(%rdi,%r12,4),%ymm4
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	(%rdi,%rdx,4),%ymm1
	vpminsd	%ymm4,%ymm1,%ymm3
	vpmaxsd	%ymm4,%ymm1,%ymm1
	vpminsd	%ymm3,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm1,%ymm0,%ymm3
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vmovdqu	%ymm4,(%rdi)
	vmovdqu	%ymm2,(%rdi,%r12,4)
	vmovdqu	%ymm3,(%rdi,%r10,4)
	vmovdqu	%ymm0,(%rdi,%rdx,4)
	add	$32,%rdi
	cmp	%rdi,%rax
	jne	.L227
	mov	%rsi,%r8
	mov	%r13,%r11
	mov	%r13,%r9
	vzeroupper
	jmp	.L226
	.p2align 4,,10
	.p2align 3
.L268:	cmp	$32,%rbx
	je	.L270
	mov	%r15,%r10
	cmp	$16,%rbx
	je	.L249
	mov	$32,%ebx
	xor	%r11d,%r11d
	mov	$15,%eax
	xor	%r9d,%r9d
.L237:	cmp	%rax,%r14
	jle	.L239
	mov	%r9,%rax
	.p2align 4,,10
	.p2align 3
.L240:	vmovdqu	32(%r15,%rax,4),%ymm0
	vmovdqu	(%r15,%rax,4),%ymm2
	mov	%rax,%rdx
	vpminsd	%ymm0,%ymm2,%ymm1
	vpmaxsd	%ymm0,%ymm2,%ymm2
	vperm2i128	$32,%ymm2,%ymm1,%ymm0
	vperm2i128	$49,%ymm2,%ymm1,%ymm1
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vperm2i128	$32,%ymm0,%ymm2,%ymm1
	vperm2i128	$49,%ymm0,%ymm2,%ymm2
	vpunpcklqdq	%ymm2,%ymm1,%ymm0
	vpunpckhqdq	%ymm2,%ymm1,%ymm1
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vpunpckldq	%ymm0,%ymm2,%ymm1
	vpunpckhdq	%ymm0,%ymm2,%ymm2
	vpunpcklqdq	%ymm2,%ymm1,%ymm0
	vpunpckhqdq	%ymm2,%ymm1,%ymm1
	vpminsd	%ymm1,%ymm0,%ymm2
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vpunpckldq	%ymm0,%ymm2,%ymm1
	add	$31,%rdx
	vpunpckhdq	%ymm0,%ymm2,%ymm0
	vmovdqu	%ymm1,(%r15,%rax,4)
	vmovdqu	%ymm0,32(%r15,%rax,4)
	add	$16,%rax
	cmp	%rdx,%r14
	jg	.L240
	lea	-16(%r14),%rax
	sub	%r9,%rax
	lea	15(%r9),%rdx
	and	$-16,%rax
	cmp	%rdx,%r14
	mov	$0,%edx
	cmovle	%rdx,%rax
	lea	16(%r9,%rax),%r9
	mov	%r9,%r11
	lea	32(,%r9,4),%rbx
	lea	(%r15,%r9,4),%r10
	vzeroupper
.L239:	mov	8(%rsp),%rdx
	lea	(%r15,%rbx),%rsi
	sub	%r9,%rdx
	mov	%r10,%rdi
	call	minmax_vector
	lea	16(,%r11,4),%rax
	lea	7(%r9),%rdx
	lea	(%r15,%rax),%rsi
	cmp	%r14,%rdx
	jge	.L241
	mov	(%r10),%ebx
	cmp	(%rsi),%ebx
	cmovg	(%rsi),%ecx
	cmovg	%ebx,%edx
	mov	%ecx,(%r10)
	mov	%edx,(%rsi)
	lea	-12(%r15,%rax),%rbx
	lea	4(%r15,%rax),%rdi
	mov	(%rbx),%edx
	cmp	(%rdi),%edx
	cmovg	(%rdi),%ecx
	cmovg	%edx,%edx
	mov	%ecx,(%rbx)
	mov	%edx,(%rdi)
	lea	-8(%r15,%rax),%r11
	lea	8(%r15,%rax),%rdx
	mov	(%r11),%ecx
	cmp	(%rdx),%ecx
	cmovg	(%rdx),%r12d
	cmovg	%ecx,%ecx
	mov	%r12d,(%r11)
	mov	%ecx,(%rdx)
	lea	-4(%r15,%rax),%rcx
	lea	12(%r15,%rax),%rax
	mov	(%rcx),%r13d
	cmp	(%rax),%r13d
	cmovg	(%rax),%r8d
	cmovg	%r13d,%r13d
	mov	%r8d,(%rcx)
	mov	%r13d,(%rax)
	cmp	%r12d,(%r10)
	cmovg	%r12d,%r13d
	cmovg	(%r10),%r12d
	mov	%r13d,(%r10)
	mov	%r12d,(%r11)
	mov	(%rbx),%r8d
	cmp	(%rcx),%r8d
	cmovg	(%rcx),%r12d
	cmovg	%r8d,%r13d
	mov	%r12d,(%rbx)
	mov	%r13d,(%rcx)
	cmp	%r12d,(%r10)
	cmovg	%r12d,%r13d
	cmovg	(%r10),%r12d
	mov	%r13d,(%r10)
	mov	%r12d,(%rbx)
	mov	(%r11),%r8d
	cmp	(%rcx),%r8d
	cmovg	(%rcx),%ebx
	cmovg	%r8d,%r10d
	mov	%ebx,(%r11)
	mov	%r10d,(%rcx)
	lea	8(%r9),%r11
	mov	(%rsi),%ecx
	cmp	(%rdx),%ecx
	cmovg	(%rdx),%r10d
	cmovg	%ecx,%ecx
	mov	%r10d,(%rsi)
	mov	%ecx,(%rdx)
	mov	(%rdi),%ebx
	cmp	(%rax),%ebx
	cmovg	(%rax),%ecx
	cmovg	%ebx,%r10d
	mov	%ecx,(%rdi)
	mov	%r10d,(%rax)
	cmp	%ecx,(%rsi)
	cmovg	%ecx,%r10d
	cmovg	(%rsi),%ecx
	mov	%r10d,(%rsi)
	mov	%ecx,(%rdi)
	mov	(%rdx),%ecx
	cmp	(%rax),%ecx
	cmovg	(%rax),%esi
	cmovg	%ecx,%ecx
	mov	%esi,(%rdx)
	mov	%ecx,(%rax)
	lea	48(,%r9,4),%rax
	lea	(%r15,%rax),%rsi
	lea	-16(%r15,%rax),%r10
	mov	%r11,%r9
.L241:	lea	-4(%r14),%rdx
	sub	%r9,%rdx
	mov	%r10,%rdi
	call	minmax_vector
	lea	3(%r9),%rax
	cmp	%r14,%rax
	jge	.L242
	lea	8(,%r11,4),%rax
	lea	(%r15,%rax),%rdx
	mov	(%r10),%ecx
	cmp	(%rdx),%ecx
	cmovg	(%rdx),%esi
	cmovg	%ecx,%ecx
	mov	%esi,(%r10)
	mov	%ecx,(%rdx)
	lea	-4(%r15,%rax),%rsi
	lea	4(%r15,%rax),%rax
	mov	(%rsi),%ebx
	cmp	(%rax),%ebx
	cmovg	(%rax),%ecx
	cmovg	%ebx,%edi
	mov	%ecx,(%rsi)
	mov	%edi,(%rax)
	cmp	%ecx,(%r10)
	cmovg	%ecx,%edi
	cmovg	(%r10),%ecx
	mov	%edi,(%r10)
	mov	%ecx,(%rsi)
	add	$4,%r9
	mov	(%rdx),%ecx
	cmp	(%rax),%ecx
	cmovg	(%rax),%esi
	cmovg	%ecx,%ecx
	mov	%esi,(%rdx)
	mov	%ecx,(%rax)
.L242:	lea	2(%r9),%rax
	cmp	%r14,%rax
	jge	.L243
	lea	0(,%r9,4),%rax
	lea	(%r15,%rax),%rdx
	lea	8(%r15,%rax),%rax
	mov	(%rdx),%ecx
	cmp	(%rax),%ecx
	cmovg	(%rax),%esi
	cmovg	%ecx,%ecx
	mov	%esi,(%rdx)
	mov	%ecx,(%rax)
.L243:	lea	1(%r9),%rax
	cmp	%r14,%rax
	jge	.L263
	sal	$2,%r9
	lea	(%r15,%r9),%rdx
	lea	4(%r15,%r9),%rax
	mov	(%rdx),%ecx
	cmp	(%rax),%ecx
	cmovg	(%rax),%esi
	cmovg	%ecx,%ecx
	mov	%esi,(%rdx)
	mov	%ecx,(%rax)
	lea	-40(%rbp),%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	ret
	.p2align 4,,10
	.p2align 3
.L265:	je	.L271
	cmp	$7,%rsi
	je	.L272
	cmp	$6,%rsi
	je	.L273
	cmp	$5,%rsi
	je	.L274
	cmp	$4,%rsi
	je	.L275
	cmp	$3,%rsi
	je	.L276
	cmp	$2,%rsi
	jne	.L263
	mov	(%rdi),%edx
	mov	4(%rdi),%ecx
	jmp	.L217
	.p2align 4,,10
	.p2align 3
.L271:	mov	(%rdi),%ecx
	cmp	4(%rdi),%ecx
	cmovg	4(%rdi),%eax
	cmovg	%ecx,%ecx
	mov	%eax,%edx
	cmp	8(%rdi),%ecx
	cmovg	8(%rdi),%eax
	cmovg	%ecx,%ecx
	mov	%eax,%esi
	cmp	12(%rdi),%ecx
	cmovg	12(%rdi),%eax
	cmovg	%ecx,%ecx
	mov	%eax,%r9d
	cmp	16(%rdi),%ecx
	cmovg	16(%rdi),%eax
	cmovg	%ecx,%ecx
	mov	%eax,%edi
	cmp	20(%r15),%ecx
	cmovg	20(%r15),%eax
	cmovg	%ecx,%ecx
	mov	%eax,%r8d
	cmp	24(%r15),%ecx
	cmovg	24(%r15),%eax
	cmovg	%ecx,%ecx
	mov	%eax,%r10d
	cmp	28(%r15),%ecx
	cmovg	28(%r15),%eax
	cmovg	%ecx,%ecx
	mov	%ecx,28(%r15)
	mov	%eax,%r11d
.L207:	cmp	%esi,%edx
	cmovg	%esi,%ecx
	cmovg	%edx,%eax
	cmp	%r9d,%eax
	cmovg	%r9d,%esi
	cmovg	%eax,%eax
	cmp	%edi,%eax
	cmovg	%edi,%r9d
	cmovg	%eax,%eax
	cmp	%r8d,%eax
	cmovg	%r8d,%edi
	cmovg	%eax,%eax
	cmp	%r10d,%eax
	cmovg	%r10d,%r8d
	cmovg	%eax,%eax
	cmp	%r11d,%eax
	cmovg	%r11d,%r10d
	cmovg	%eax,%eax
	mov	%eax,24(%r15)
	mov	%ecx,%edx
.L209:	cmp	%esi,%edx
	cmovg	%esi,%ecx
	cmovg	%edx,%edx
	cmp	%r9d,%edx
	cmovg	%r9d,%esi
	cmovg	%edx,%eax
	cmp	%edi,%eax
	cmovg	%edi,%r9d
	cmovg	%eax,%eax
	cmp	%r8d,%eax
	cmovg	%r8d,%edi
	cmovg	%eax,%eax
	cmp	%r10d,%eax
	cmovg	%r10d,%r8d
	cmovg	%eax,%eax
	mov	%eax,20(%r15)
.L211:	cmp	%esi,%ecx
	cmovg	%esi,%edx
	cmovg	%ecx,%ecx
	cmp	%r9d,%ecx
	cmovg	%r9d,%esi
	cmovg	%ecx,%eax
	mov	%esi,%ecx
	cmp	%edi,%eax
	cmovg	%edi,%esi
	cmovg	%eax,%eax
	cmp	%r8d,%eax
	cmovg	%r8d,%edi
	cmovg	%eax,%eax
	mov	%eax,16(%r15)
.L213:	cmp	%ecx,%edx
	cmovg	%ecx,%eax
	cmovg	%edx,%edx
	cmp	%esi,%edx
	cmovg	%esi,%ecx
	cmovg	%edx,%edx
	cmp	%edi,%edx
	cmovg	%edi,%esi
	cmovg	%edx,%edx
	mov	%edx,12(%r15)
.L215:	cmp	%ecx,%eax
	cmovg	%ecx,%edx
	cmovg	%eax,%eax
	cmp	%esi,%eax
	cmovg	%esi,%ecx
	cmovg	%eax,%eax
	mov	%eax,8(%r15)
.L217:	cmp	%ecx,%edx
	cmovg	%ecx,%eax
	cmovg	%edx,%edx
	mov	%eax,(%r15)
	mov	%edx,4(%r15)
	lea	-40(%rbp),%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	ret
.L249:	mov	$64,%r12d
	mov	$32,%ebx
	xor	%r11d,%r11d
	mov	$31,%r8d
	xor	%r9d,%r9d
.L236:	lea	(%r15,%r9,4),%rax
	mov	%r9,%rcx
	cmp	%r8,%r14
	jle	.L235
	.p2align 4,,10
	.p2align 3
.L238:	vmovdqu	64(%rax),%ymm1
	vmovdqu	96(%rax),%ymm3
	vmovdqu	(%rax),%ymm0
	vmovdqu	32(%rax),%ymm2
	vpminsd	%ymm1,%ymm0,%ymm5
	vpmaxsd	%ymm1,%ymm0,%ymm0
	vpminsd	%ymm3,%ymm2,%ymm1
	vpmaxsd	%ymm3,%ymm2,%ymm2
	vpminsd	%ymm2,%ymm0,%ymm4
	vpminsd	%ymm1,%ymm5,%ymm3
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vpmaxsd	%ymm1,%ymm5,%ymm5
	vperm2i128	$32,%ymm0,%ymm4,%ymm2
	vperm2i128	$32,%ymm5,%ymm3,%ymm1
	vperm2i128	$49,%ymm0,%ymm4,%ymm0
	vperm2i128	$49,%ymm5,%ymm3,%ymm3
	vpminsd	%ymm3,%ymm1,%ymm5
	vpminsd	%ymm0,%ymm2,%ymm4
	vpmaxsd	%ymm3,%ymm1,%ymm1
	vpmaxsd	%ymm0,%ymm2,%ymm0
	vperm2i128	$32,%ymm1,%ymm5,%ymm3
	vperm2i128	$32,%ymm0,%ymm4,%ymm2
	vperm2i128	$49,%ymm1,%ymm5,%ymm5
	vperm2i128	$49,%ymm0,%ymm4,%ymm4
	vpunpcklqdq	%ymm5,%ymm3,%ymm1
	vpunpcklqdq	%ymm4,%ymm2,%ymm0
	vpunpckhqdq	%ymm5,%ymm3,%ymm3
	vpunpckhqdq	%ymm4,%ymm2,%ymm2
	vpminsd	%ymm3,%ymm1,%ymm5
	vpminsd	%ymm2,%ymm0,%ymm4
	vpmaxsd	%ymm3,%ymm1,%ymm1
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vpunpckldq	%ymm1,%ymm5,%ymm3
	vpunpckldq	%ymm0,%ymm4,%ymm2
	vpunpckhdq	%ymm1,%ymm5,%ymm5
	vpunpckhdq	%ymm0,%ymm4,%ymm4
	vpunpcklqdq	%ymm5,%ymm3,%ymm1
	vpunpcklqdq	%ymm4,%ymm2,%ymm0
	vpunpckhqdq	%ymm5,%ymm3,%ymm3
	vpunpckhqdq	%ymm4,%ymm2,%ymm2
	mov	%rcx,%rdx
	vpminsd	%ymm3,%ymm1,%ymm4
	vpmaxsd	%ymm3,%ymm1,%ymm1
	vpminsd	%ymm2,%ymm0,%ymm3
	vpmaxsd	%ymm2,%ymm0,%ymm0
	vpunpckldq	%ymm1,%ymm4,%ymm5
	vpunpckldq	%ymm0,%ymm3,%ymm2
	vpunpckhdq	%ymm1,%ymm4,%ymm1
	vpunpckhdq	%ymm0,%ymm3,%ymm0
	add	$63,%rdx
	vmovdqu	%ymm5,(%rax)
	vmovdqu	%ymm1,32(%rax)
	vmovdqu	%ymm2,64(%rax)
	vmovdqu	%ymm0,96(%rax)
	add	$32,%rcx
	sub	$-128,%rax
	cmp	%rdx,%r14
	jg	.L238
	lea	-32(%r14),%rax
	sub	%r9,%rax
	lea	31(%r9),%rdx
	and	$-32,%rax
	cmp	%rdx,%r14
	mov	$0,%edx
	cmovle	%rdx,%rax
	lea	32(%r9,%rax),%r9
	lea	64(,%r9,4),%r12
	mov	%r9,%r11
	lea	(%r15,%r9,4),%r10
	lea	-32(%r12),%rbx
	vzeroupper
.L235:	lea	-16(%r14),%rdx
	sub	%r9,%rdx
	lea	(%r15,%r12),%rsi
	mov	%r10,%rdi
	call	minmax_vector
	lea	15(%r9),%rax
	jmp	.L237
	.p2align 4,,10
	.p2align 3
.L220:	xor	%edx,%edx
	call	djbsort$avx2_2power
	lea	-40(%rbp),%rsp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	pop	%rbp
	ret
	.p2align 4,,10
	.p2align 3
.L246:	mov	%rsi,%r8
	mov	%r13,%r11
	mov	%r13,%r9
	jmp	.L226
	.p2align 4,,10
	.p2align 3
.L247:	mov	%r10,%r9
	jmp	.L228
	.p2align 4,,10
	.p2align 3
.L266:	vmovdqa	.LC4(%rip),%ymm0
	mov	$16,%r12d
	lea	32(%rsp),%r13
	vmovdqa	%ymm0,64(%rsp)
	vzeroupper
	jmp	.L224
	.p2align 4,,10
	.p2align 3
.L270:	cmp	$63,%r14
	jle	.L248
	lea	-64(%r14),%rcx
	shr	$6,%rcx
	mov	%rcx,%rdx
	sal	$8,%rdx
	mov	%r15,%rax
	lea	256(%r15,%rdx),%rdx
	.p2align 4,,10
	.p2align 3
.L233:	vmovdqu	128(%rax),%ymm0
	vmovdqu	(%rax),%ymm3
	vmovdqu	32(%rax),%ymm15
	vpminsd	%ymm0,%ymm3,%ymm13
	vpmaxsd	%ymm0,%ymm3,%ymm3
	vmovdqu	160(%rax),%ymm0
	vmovdqu	224(%rax),%ymm2
	vmovdqu	64(%rax),%ymm6
	vmovdqu	96(%rax),%ymm5
	vpminsd	%ymm0,%ymm15,%ymm4
	vpmaxsd	%ymm0,%ymm15,%ymm15
	vmovdqu	192(%rax),%ymm0
	add	$256,%rax
	vpminsd	%ymm0,%ymm6,%ymm1
	vpmaxsd	%ymm0,%ymm6,%ymm6
	vpminsd	%ymm2,%ymm5,%ymm0
	vpmaxsd	%ymm2,%ymm5,%ymm5
	vpminsd	%ymm0,%ymm4,%ymm11
	vpminsd	%ymm1,%ymm13,%ymm14
	vpmaxsd	%ymm0,%ymm4,%ymm4
	vpminsd	%ymm5,%ymm15,%ymm12
	vpminsd	%ymm6,%ymm3,%ymm0
	vpmaxsd	%ymm5,%ymm15,%ymm15
	vpmaxsd	%ymm6,%ymm3,%ymm3
	vpmaxsd	%ymm1,%ymm13,%ymm13
	vpminsd	%ymm15,%ymm3,%ymm8
	vpminsd	%ymm4,%ymm13,%ymm1
	vpminsd	%ymm12,%ymm0,%ymm5
	vpmaxsd	%ymm4,%ymm13,%ymm13
	vpminsd	%ymm11,%ymm14,%ymm2
	vpmaxsd	%ymm12,%ymm0,%ymm12
	vpmaxsd	%ymm11,%ymm14,%ymm14
	vpmaxsd	%ymm15,%ymm3,%ymm3
	vperm2i128	$32,%ymm14,%ymm2,%ymm11
	vperm2i128	$32,%ymm13,%ymm1,%ymm10
	vperm2i128	$32,%ymm12,%ymm5,%ymm9
	vperm2i128	$49,%ymm12,%ymm5,%ymm0
	vperm2i128	$32,%ymm3,%ymm8,%ymm4
	vperm2i128	$49,%ymm14,%ymm2,%ymm2
	vperm2i128	$49,%ymm13,%ymm1,%ymm1
	vperm2i128	$49,%ymm3,%ymm8,%ymm3
	vpminsd	%ymm2,%ymm11,%ymm15
	vpminsd	%ymm1,%ymm10,%ymm14
	vpmaxsd	%ymm2,%ymm11,%ymm2
	vpmaxsd	%ymm1,%ymm10,%ymm1
	vpminsd	%ymm0,%ymm9,%ymm13
	vpminsd	%ymm3,%ymm4,%ymm12
	vpmaxsd	%ymm0,%ymm9,%ymm0
	vpmaxsd	%ymm3,%ymm4,%ymm8
	vperm2i128	$49,%ymm2,%ymm15,%ymm11
	vperm2i128	$49,%ymm1,%ymm14,%ymm10
	vperm2i128	$49,%ymm0,%ymm13,%ymm9
	vperm2i128	$32,%ymm2,%ymm15,%ymm7
	vperm2i128	$32,%ymm1,%ymm14,%ymm6
	vperm2i128	$32,%ymm0,%ymm13,%ymm5
	vperm2i128	$32,%ymm8,%ymm12,%ymm4
	vperm2i128	$49,%ymm8,%ymm12,%ymm8
	vpunpcklqdq	%ymm11,%ymm7,%ymm3
	vpunpcklqdq	%ymm10,%ymm6,%ymm2
	vpunpcklqdq	%ymm9,%ymm5,%ymm1
	vpunpcklqdq	%ymm8,%ymm4,%ymm0
	vpunpckhqdq	%ymm11,%ymm7,%ymm7
	vpunpckhqdq	%ymm10,%ymm6,%ymm6
	vpunpckhqdq	%ymm9,%ymm5,%ymm5
	vpunpckhqdq	%ymm8,%ymm4,%ymm4
	vpminsd	%ymm3,%ymm7,%ymm11
	vpminsd	%ymm2,%ymm6,%ymm10
	vpminsd	%ymm1,%ymm5,%ymm9
	vpminsd	%ymm0,%ymm4,%ymm8
	vpmaxsd	%ymm3,%ymm7,%ymm7
	vpmaxsd	%ymm2,%ymm6,%ymm6
	vpmaxsd	%ymm1,%ymm5,%ymm5
	vpmaxsd	%ymm0,%ymm4,%ymm4
	vpunpckldq	%ymm7,%ymm11,%ymm3
	vpunpckldq	%ymm6,%ymm10,%ymm2
	vpunpckhdq	%ymm7,%ymm11,%ymm7
	vpunpckhdq	%ymm6,%ymm10,%ymm6
	vpunpckldq	%ymm5,%ymm9,%ymm1
	vpunpckldq	%ymm4,%ymm8,%ymm0
	vpunpckhdq	%ymm5,%ymm9,%ymm5
	vpunpckhdq	%ymm4,%ymm8,%ymm4
	vpunpcklqdq	%ymm7,%ymm3,%ymm10
	vpunpcklqdq	%ymm5,%ymm1,%ymm8
	vpunpckhqdq	%ymm7,%ymm3,%ymm3
	vpunpcklqdq	%ymm6,%ymm2,%ymm9
	vpunpcklqdq	%ymm4,%ymm0,%ymm7
	vpunpckhqdq	%ymm6,%ymm2,%ymm2
	vpunpckhqdq	%ymm5,%ymm1,%ymm1
	vpunpckhqdq	%ymm4,%ymm0,%ymm0
	vpminsd	%ymm8,%ymm1,%ymm5
	vpminsd	%ymm9,%ymm2,%ymm6
	vpminsd	%ymm7,%ymm0,%ymm4
	vpminsd	%ymm10,%ymm3,%ymm11
	vpmaxsd	%ymm8,%ymm1,%ymm1
	vpmaxsd	%ymm7,%ymm0,%ymm0
	vpmaxsd	%ymm10,%ymm3,%ymm3
	vpmaxsd	%ymm9,%ymm2,%ymm2
	vpunpckldq	%ymm2,%ymm6,%ymm7
	vpunpckldq	%ymm3,%ymm11,%ymm8
	vpunpckhdq	%ymm2,%ymm6,%ymm2
	vpunpckhdq	%ymm3,%ymm11,%ymm3
	vpunpckldq	%ymm1,%ymm5,%ymm6
	vpunpckhdq	%ymm1,%ymm5,%ymm1
	vpunpckldq	%ymm0,%ymm4,%ymm5
	vpunpckhdq	%ymm0,%ymm4,%ymm0
	vmovdqu	%ymm8,-256(%rax)
	vmovdqu	%ymm3,-224(%rax)
	vmovdqu	%ymm7,-192(%rax)
	vmovdqu	%ymm2,-160(%rax)
	vmovdqu	%ymm6,-128(%rax)
	vmovdqu	%ymm1,-96(%rax)
	vmovdqu	%ymm5,-64(%rax)
	vmovdqu	%ymm0,-32(%rax)
	cmp	%rax,%rdx
	jne	.L233
	lea	1(%rcx),%rax
	mov	%rax,%r9
	sal	$6,%r9
	lea	128(,%r9,4),%rcx
	sal	$8,%rax
	mov	%r9,%r11
	lea	(%r15,%rax),%r10
	lea	-96(%rcx),%rbx
	lea	-64(%rcx),%r12
	lea	31(%r9),%r8
	vzeroupper
.L232:	lea	-32(%r14),%rdx
	sub	%r9,%rdx
	lea	(%r15,%rcx),%rsi
	mov	%r10,%rdi
	call	minmax_vector
	jmp	.L236
.L272:	mov	(%rdi),%edx
	mov	4(%rdi),%esi
	mov	8(%rdi),%r9d
	mov	16(%r15),%r8d
	mov	12(%rdi),%edi
	mov	20(%r15),%r10d
	mov	24(%r15),%r11d
	jmp	.L207
.L248:	mov	%r15,%r10
	mov	$64,%r12d
	mov	$32,%ebx
	mov	$31,%r8d
	mov	$128,%ecx
	xor	%r11d,%r11d
	xor	%r9d,%r9d
	jmp	.L232
.L276:	mov	(%rdi),%eax
	mov	4(%rdi),%ecx
	mov	8(%rdi),%esi
	jmp	.L215
.L275:	mov	(%rdi),%edx
	mov	4(%rdi),%ecx
	mov	8(%rdi),%esi
	mov	12(%rdi),%edi
	jmp	.L213
.L274:	mov	(%rdi),%ecx
	mov	4(%rdi),%esi
	mov	8(%rdi),%r9d
	mov	16(%r15),%r8d
	mov	12(%rdi),%edi
	jmp	.L211
.L273:	mov	(%rdi),%edx
	mov	4(%rdi),%esi
	mov	8(%rdi),%r9d
	mov	16(%r15),%r8d
	mov	12(%rdi),%edi
	mov	20(%r15),%r10d
	jmp	.L209
	.endfn	djbsort$avx2,globl

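//	Constant masks for the merge networks: .LC0–.LC3 select which
//	32-bit lanes get complemented between merge rounds, and .LC4 is
//	a block of INT32_MAX values used to pad non-power-of-two inputs
//	in the stack buffer. (Purpose inferred from the uses above.)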
	.rodata.cst32
.LC0:	.quad	-1,0,-1,0
.LC1:	.quad	0,-1,-1,0
.LC2:	.quad	-1,-1,0,0
.LC3:	.quad	-4294967296,4294967295,-4294967296,4294967295
.LC4:	.quad	0x7fffffff7fffffff,0x7fffffff7fffffff
	.quad	0x7fffffff7fffffff,0x7fffffff7fffffff