#include "libc/macros.h"
.source __FILE__

/ D.J. Bernstein's outrageously fast integer sorting algorithm.
/
/ @param rdi is int32 array
/ @param rsi is number of elements in rdi
/ @note public domain
/ @see en.wikipedia.org/wiki/Sorting_network
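/
/ Everything below is built from a single branchless primitive:
/ the compare-exchange, done with cmp+cmovg on scalars and with
/ vpminsd/vpmaxsd on 8-lane vectors. A rough C sketch of that
/ primitive, for illustration only (it is not part of this file):
/
/     void minmax(int32_t *a, int32_t *b) {
/       int32_t lo = *b < *a ? *b : *a;  // what vpminsd computes
/       int32_t hi = *b < *a ? *a : *b;  // what vpmaxsd computes
/       *a = lo;
/       *b = hi;
/     }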
djbsort$avx2:
	push %rbp
	mov %rsp,%rbp
	push %r15
	push %r14
	push %r13
	mov %rsi,%r13
	push %r12
	mov %rdi,%r12
	push %rbx
	andq $-32,%rsp
	sub $1056,%rsp
	cmp $8,%rsi
	jg .L148
	jne .L149
	mov (%rdi),%eax
	mov 4(%rdi),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,(%rdi)
	mov 8(%rdi),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,4(%rdi)
	mov 12(%rdi),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,8(%rdi)
	mov 16(%rdi),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,12(%rdi)
	mov 20(%rdi),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,16(%rdi)
	mov 24(%rdi),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,20(%rdi)
	mov 28(%rdi),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,24(%rdi)
	mov %edx,28(%rdi)
	jmp .L150
.L149:	cmp $7,%rsi
	jne .L151
.L150:	mov (%r12),%edx
	mov 4(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,(%r12)
	mov 8(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,4(%r12)
	mov 12(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,8(%r12)
	mov 16(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,12(%r12)
	mov 20(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,16(%r12)
	mov 24(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,20(%r12)
	mov %edx,24(%r12)
	jmp .L152
.L151:	cmp $6,%rsi
	jne .L153
.L152:	mov (%r12),%eax
	mov 4(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,(%r12)
	mov 8(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,4(%r12)
	mov 12(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,8(%r12)
	mov 16(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,12(%r12)
	mov 20(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,16(%r12)
	mov %edx,20(%r12)
	jmp .L154
.L153:	cmp $5,%rsi
	jne .L155
.L154:	mov (%r12),%edx
	mov 4(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,(%r12)
	mov 8(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,4(%r12)
	mov 12(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,8(%r12)
	mov 16(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,12(%r12)
	mov %edx,16(%r12)
	jmp .L156
.L155:	cmp $4,%rsi
	jne .L157
.L156:	mov (%r12),%eax
	mov 4(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,(%r12)
	mov 8(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,4(%r12)
	mov 12(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,8(%r12)
	mov %edx,12(%r12)
	jmp .L158
.L157:	cmp $3,%rsi
	jne .L159
.L158:	mov (%r12),%edx
	mov 4(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,(%r12)
	mov 8(%r12),%edx
	cmp %edx,%eax
	mov %eax,%ecx
	cmovg %edx,%eax
	cmovg %ecx,%edx
	mov %eax,4(%r12)
	mov %edx,8(%r12)
	jmp .L160
.L159:	cmp $2,%rsi
	jne .L147
.L160:	mov (%r12),%edx
	mov 4(%r12),%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,(%r12)
	mov %eax,4(%r12)
	jmp .L147
.L148:	lea -1(%rsi),%rax
	mov $8,%ebx
	test %rsi,%rax
	jne .L162
	xor %edx,%edx
	call int32_sort_2power
	jmp .L147
.L162:	mov %r13,%r14
	sub %rbx,%r14
	cmp %rbx,%r14
	jle .L199
	add %rbx,%rbx
	jmp .L162
.L199:	cmp $128,%rbx
	jg .L164
	mov %rbx,%rax
	mov %rbx,%rdx
	vmovdqa .LC4(%rip),%ymm0
	sar $3,%rax
	sar $2,%rdx
.L165:	cmp %rdx,%rax
	jge .L200
	mov %rax,%rcx
	incq %rax
	salq $5,%rcx
	vmovdqa %ymm0,32(%rsp,%rcx)
	jmp .L165
.L200:	xor %eax,%eax
.L167:	mov (%r12,%rax,4),%edx
	mov %rax,%r14
	mov %edx,32(%rsp,%rax,4)
	lea 1(%rax),%rax
	cmp %rax,%r13
	jne .L167
	lea (%rbx,%rbx),%rsi
	xor %edx,%edx
	lea 32(%rsp),%rdi
	call int32_sort_2power
	xor %eax,%eax
.L168:	mov 32(%rsp,%rax,4),%ecx
	mov %rax,%rdx
	mov %ecx,(%r12,%rax,4)
	incq %rax
	cmp %rdx,%r14
	jne .L168
	jmp .L147
.L164:	mov %rbx,%rsi
	mov %r12,%rdi
	mov $1,%edx
	call int32_sort_2power
	lea (%r12,%rbx,4),%rdi
	mov %r14,%rsi
	call djbsort$avx2
.L175:	mov %rbx,%r14
	mov %r13,%rsi
	mov %r12,%rdi
	sar $2,%r14
	mov %r14,%rdx
	call int32_threestages
	lea 0(,%r14,4),%r10
	mov %r13,%rdx
	lea (%r10,%rax),%r11
	sub %r10,%rdx
	lea (%r12,%rax,4),%rdi
	mov %rax,%r9
	sub %rax,%rdx
	lea (%r12,%r11,4),%rsi
	call minmax_vector
	lea (%r14,%r14),%rax
	mov %rax,24(%rsp)
	cmp %r13,%r11
	jg .L169
	imul $-8,%r14,%rax
	lea (%r12,%r10),%rdx
	lea (%rdx,%r10),%rcx
	lea (%r14,%r9),%r15
	lea (%rcx,%r10),%rdi
	add %rdi,%rax
	lea (%rax,%r10),%rsi
	lea (%rsi,%r10),%r8
.L170:	cmp %r9,%r15
	jle .L201
	vmovdqu (%rcx,%r9,4),%ymm7
	vmovdqu (%rdi,%r9,4),%ymm6
	vpminsd (%r12,%r9,4),%ymm7,%ymm2
	vpminsd (%rdx,%r9,4),%ymm6,%ymm3
	vpmaxsd (%r12,%r9,4),%ymm7,%ymm0
	vpmaxsd (%rdx,%r9,4),%ymm6,%ymm1
	vpminsd %ymm3,%ymm2,%ymm4
	vpmaxsd %ymm3,%ymm2,%ymm2
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm0
	vmovdqu %ymm4,(%r12,%r9,4)
	vmovdqu %ymm2,(%rax,%r9,4)
	vmovdqu %ymm3,(%rsi,%r9,4)
	vmovdqu %ymm0,(%r8,%r9,4)
	add $8,%r9
	jmp .L170
.L201:	mov %r11,%r9
.L169:	mov 24(%rsp),%rax
	lea (%r14,%r14),%r15
	mov %r13,%r11
	lea (%r12,%r9,4),%rdi
	sub %r15,%r11
	add %r9,%rax
	mov %r11,%rdx
	lea (%r12,%rax,4),%rsi
	sub %r9,%rdx
	call minmax_vector
	lea (%r15,%r9),%rax
	cmp %r13,%rax
	jg .L172
	mov %rax,%rdx
	add %r12,%r10
	sub %r14,%rdx
.L173:	cmp %r9,%rdx
	jle .L202
	vmovdqu (%r10,%r9,4),%ymm6
	vpminsd (%r12,%r9,4),%ymm6,%ymm1
	vpmaxsd (%r12,%r9,4),%ymm6,%ymm0
	vmovdqu %ymm1,(%r12,%r9,4)
	vmovdqu %ymm0,(%r10,%r9,4)
	add $8,%r9
	jmp .L173
.L202:	mov %rax,%r9
.L172:	lea (%r11,%r14),%rdx
	add %r9,%r14
	lea (%r12,%r9,4),%rdi
	sar $3,%rbx
	sub %r9,%rdx
	lea (%r12,%r14,4),%rsi
	call minmax_vector
	cmp $63,%rbx
	jg .L175
	cmp $32,%rbx
	jne .L176
	mov %r12,%rax
	mov $63,%edx
.L177:	cmp %r13,%rdx
	jge .L203
	vmovdqu (%rax),%ymm6
	add $64,%rdx
	add $256,%rax
	vpminsd -128(%rax),%ymm6,%ymm10
	vpmaxsd -128(%rax),%ymm6,%ymm8
	vmovdqu -224(%rax),%ymm6
	vpminsd -96(%rax),%ymm6,%ymm3
	vpmaxsd -96(%rax),%ymm6,%ymm0
	vmovdqu -192(%rax),%ymm6
	vpminsd -64(%rax),%ymm6,%ymm2
	vpmaxsd -64(%rax),%ymm6,%ymm1
	vmovdqu -160(%rax),%ymm6
	vpmaxsd -32(%rax),%ymm6,%ymm4
	vpminsd -32(%rax),%ymm6,%ymm13
	vpminsd %ymm2,%ymm10,%ymm15
	vpminsd %ymm1,%ymm8,%ymm12
	vpminsd %ymm13,%ymm3,%ymm11
	vpminsd %ymm4,%ymm0,%ymm5
	vpmaxsd %ymm1,%ymm8,%ymm1
	vpmaxsd %ymm2,%ymm10,%ymm2
	vpmaxsd %ymm13,%ymm3,%ymm13
	vpmaxsd %ymm4,%ymm0,%ymm0
	vpminsd %ymm13,%ymm2,%ymm10
	vpminsd %ymm0,%ymm1,%ymm4
	vpminsd %ymm5,%ymm12,%ymm9
	vpminsd %ymm11,%ymm15,%ymm14
	vpmaxsd %ymm13,%ymm2,%ymm13
	vpmaxsd %ymm0,%ymm1,%ymm0
	vpmaxsd %ymm11,%ymm15,%ymm15
	vpmaxsd %ymm5,%ymm12,%ymm12
	vperm2i128 $32,%ymm13,%ymm10,%ymm6
	vperm2i128 $32,%ymm12,%ymm9,%ymm5
	vperm2i128 $32,%ymm0,%ymm4,%ymm8
	vperm2i128 $32,%ymm15,%ymm14,%ymm11
	vperm2i128 $49,%ymm0,%ymm4,%ymm0
	vperm2i128 $49,%ymm12,%ymm9,%ymm12
	vperm2i128 $49,%ymm15,%ymm14,%ymm14
	vperm2i128 $49,%ymm13,%ymm10,%ymm13
	vpminsd %ymm14,%ymm11,%ymm3
	vpminsd %ymm12,%ymm5,%ymm1
	vpminsd %ymm13,%ymm6,%ymm2
	vpmaxsd %ymm12,%ymm5,%ymm9
	vpmaxsd %ymm14,%ymm11,%ymm11
	vpminsd %ymm0,%ymm8,%ymm12
	vperm2i128 $32,%ymm9,%ymm1,%ymm5
	vpmaxsd %ymm0,%ymm8,%ymm8
	vpmaxsd %ymm13,%ymm6,%ymm10
	vperm2i128 $32,%ymm11,%ymm3,%ymm7
	vperm2i128 $32,%ymm10,%ymm2,%ymm6
	vperm2i128 $49,%ymm11,%ymm3,%ymm11
	vperm2i128 $49,%ymm10,%ymm2,%ymm10
	vperm2i128 $49,%ymm9,%ymm1,%ymm9
	vperm2i128 $32,%ymm8,%ymm12,%ymm4
	vperm2i128 $49,%ymm8,%ymm12,%ymm8
	vpunpcklqdq %ymm11,%ymm7,%ymm3
	vpunpcklqdq %ymm10,%ymm6,%ymm2
	vpunpcklqdq %ymm9,%ymm5,%ymm1
	vpunpcklqdq %ymm8,%ymm4,%ymm0
	vpunpckhqdq %ymm11,%ymm7,%ymm7
	vpunpckhqdq %ymm10,%ymm6,%ymm6
	vpunpckhqdq %ymm9,%ymm5,%ymm5
	vpunpckhqdq %ymm8,%ymm4,%ymm4
	vpminsd %ymm3,%ymm7,%ymm11
	vpminsd %ymm2,%ymm6,%ymm10
	vpminsd %ymm1,%ymm5,%ymm9
	vpminsd %ymm0,%ymm4,%ymm8
	vpmaxsd %ymm3,%ymm7,%ymm7
	vpmaxsd %ymm2,%ymm6,%ymm6
	vpmaxsd %ymm1,%ymm5,%ymm5
	vpunpckldq %ymm7,%ymm11,%ymm3
	vpmaxsd %ymm0,%ymm4,%ymm4
	vpunpckhdq %ymm7,%ymm11,%ymm7
	vpunpckldq %ymm6,%ymm10,%ymm2
	vpunpckldq %ymm5,%ymm9,%ymm1
	vpunpckhdq %ymm6,%ymm10,%ymm6
	vpunpckhdq %ymm5,%ymm9,%ymm5
	vpunpckldq %ymm4,%ymm8,%ymm0
	vpunpckhdq %ymm4,%ymm8,%ymm4
	vpunpcklqdq %ymm7,%ymm3,%ymm10
	vpunpcklqdq %ymm5,%ymm1,%ymm8
	vpunpckhqdq %ymm7,%ymm3,%ymm3
	vpunpcklqdq %ymm6,%ymm2,%ymm9
	vpunpcklqdq %ymm4,%ymm0,%ymm7
	vpunpckhqdq %ymm6,%ymm2,%ymm2
	vpunpckhqdq %ymm5,%ymm1,%ymm1
	vpunpckhqdq %ymm4,%ymm0,%ymm0
	vpminsd %ymm8,%ymm1,%ymm5
	vpminsd %ymm9,%ymm2,%ymm6
	vpminsd %ymm7,%ymm0,%ymm4
	vpminsd %ymm10,%ymm3,%ymm11
	vpmaxsd %ymm8,%ymm1,%ymm1
	vpmaxsd %ymm7,%ymm0,%ymm0
	vpmaxsd %ymm10,%ymm3,%ymm3
	vpmaxsd %ymm9,%ymm2,%ymm2
	vpunpckldq %ymm2,%ymm6,%ymm7
	vpunpckldq %ymm3,%ymm11,%ymm8
	vpunpckhdq %ymm2,%ymm6,%ymm2
	vpunpckhdq %ymm3,%ymm11,%ymm3
	vpunpckldq %ymm1,%ymm5,%ymm6
	vpunpckhdq %ymm1,%ymm5,%ymm1
	vmovdqu %ymm8,-256(%rax)
	vpunpckldq %ymm0,%ymm4,%ymm5
	vpunpckhdq %ymm0,%ymm4,%ymm0
	vmovdqu %ymm3,-224(%rax)
	vmovdqu %ymm7,-192(%rax)
	vmovdqu %ymm2,-160(%rax)
	vmovdqu %ymm6,-128(%rax)
	vmovdqu %ymm1,-96(%rax)
	vmovdqu %ymm5,-64(%rax)
	vmovdqu %ymm0,-32(%rax)
	jmp .L177
.L203:	mov %r13,%rdi
	mov %r13,%r9
	lea -32(%r13),%rdx
	shr $6,%rdi
	andq $-64,%r9
	salq $8,%rdi
	sub %r9,%rdx
	lea 128(%r12,%rdi),%rsi
	add %r12,%rdi
	call minmax_vector
	jmp .L180
.L176:	xor %r10d,%r10d
	cmp $16,%rbx
	jne .L181
	xor %r9d,%r9d
.L180:	lea 31(%r9),%rax
.L179:	cmp %r13,%rax
	jge .L204
	vmovdqu -124(%r12,%rax,4),%ymm6
	vpminsd -60(%r12,%rax,4),%ymm6,%ymm5
	vpmaxsd -60(%r12,%rax,4),%ymm6,%ymm0
	vmovdqu -92(%r12,%rax,4),%ymm6
	vpminsd -28(%r12,%rax,4),%ymm6,%ymm1
	vpmaxsd -28(%r12,%rax,4),%ymm6,%ymm2
	vpminsd %ymm1,%ymm5,%ymm3
	vpminsd %ymm2,%ymm0,%ymm4
	vpmaxsd %ymm1,%ymm5,%ymm5
	vpmaxsd %ymm2,%ymm0,%ymm0
	vperm2i128 $32,%ymm0,%ymm4,%ymm2
	vperm2i128 $32,%ymm5,%ymm3,%ymm1
	vperm2i128 $49,%ymm0,%ymm4,%ymm0
	vperm2i128 $49,%ymm5,%ymm3,%ymm3
	vpminsd %ymm0,%ymm2,%ymm4
	vpmaxsd %ymm0,%ymm2,%ymm0
	vpminsd %ymm3,%ymm1,%ymm5
	vpmaxsd %ymm3,%ymm1,%ymm1
	vperm2i128 $32,%ymm0,%ymm4,%ymm2
	vperm2i128 $32,%ymm1,%ymm5,%ymm3
	vperm2i128 $49,%ymm0,%ymm4,%ymm4
	vperm2i128 $49,%ymm1,%ymm5,%ymm5
	vpunpcklqdq %ymm5,%ymm3,%ymm1
	vpunpcklqdq %ymm4,%ymm2,%ymm0
	vpunpckhqdq %ymm5,%ymm3,%ymm3
	vpunpckhqdq %ymm4,%ymm2,%ymm2
	vpminsd %ymm3,%ymm1,%ymm5
	vpmaxsd %ymm3,%ymm1,%ymm1
	vpminsd %ymm2,%ymm0,%ymm4
	vpmaxsd %ymm2,%ymm0,%ymm0
	vpunpckldq %ymm1,%ymm5,%ymm3
	vpunpckldq %ymm0,%ymm4,%ymm2
	vpunpckhdq %ymm1,%ymm5,%ymm5
	vpunpckhdq %ymm0,%ymm4,%ymm4
	vpunpcklqdq %ymm5,%ymm3,%ymm1
	vpunpcklqdq %ymm4,%ymm2,%ymm0
	vpunpckhqdq %ymm5,%ymm3,%ymm3
	vpunpckhqdq %ymm4,%ymm2,%ymm2
	vpminsd %ymm3,%ymm1,%ymm4
	vpmaxsd %ymm3,%ymm1,%ymm1
	vpminsd %ymm2,%ymm0,%ymm3
	vpmaxsd %ymm2,%ymm0,%ymm0
	vpunpckldq %ymm1,%ymm4,%ymm5
	vpunpckldq %ymm0,%ymm3,%ymm2
	vpunpckhdq %ymm1,%ymm4,%ymm1
	vpunpckhdq %ymm0,%ymm3,%ymm0
	vmovdqu %ymm5,-124(%r12,%rax,4)
	vmovdqu %ymm1,-92(%r12,%rax,4)
	vmovdqu %ymm2,-60(%r12,%rax,4)
	vmovdqu %ymm0,-28(%r12,%rax,4)
	add $32,%rax
	jmp .L179
.L204:	mov %r13,%r10
	xor %edx,%edx
	lea 0(,%r9,4),%rax
	sub %r9,%r10
	mov %r10,%rdi
	andq $-32,%r10
	shr $5,%rdi
	cmp %r9,%r13
	cmovl %rdx,%r10
	salq $7,%rdi
	add %r9,%r10
	cmp %r9,%r13
	cmovl %rdx,%rdi
	lea -16(%r13),%rdx
	sub %r10,%rdx
	lea 64(%rax,%rdi),%rsi
	add %rax,%rdi
	add %r12,%rsi
	add %r12,%rdi
	call minmax_vector
.L181:	lea 15(%r10),%rax
.L183:	cmp %r13,%rax
	jge .L205
	vmovdqu -60(%r12,%rax,4),%ymm6
	vpmaxsd -28(%r12,%rax,4),%ymm6,%ymm2
	vpminsd -28(%r12,%rax,4),%ymm6,%ymm1
	vperm2i128 $32,%ymm2,%ymm1,%ymm0
	vperm2i128 $49,%ymm2,%ymm1,%ymm1
	vpminsd %ymm1,%ymm0,%ymm2
	vpmaxsd %ymm1,%ymm0,%ymm0
	vperm2i128 $32,%ymm0,%ymm2,%ymm1
	vperm2i128 $49,%ymm0,%ymm2,%ymm2
	vpunpcklqdq %ymm2,%ymm1,%ymm0
	vpunpckhqdq %ymm2,%ymm1,%ymm1
	vpminsd %ymm1,%ymm0,%ymm2
	vpmaxsd %ymm1,%ymm0,%ymm0
	vpunpckldq %ymm0,%ymm2,%ymm1
	vpunpckhdq %ymm0,%ymm2,%ymm2
	vpunpcklqdq %ymm2,%ymm1,%ymm0
	vpunpckhqdq %ymm2,%ymm1,%ymm1
	vpminsd %ymm1,%ymm0,%ymm2
	vpmaxsd %ymm1,%ymm0,%ymm0
	vpunpckldq %ymm0,%ymm2,%ymm1
	vpunpckhdq %ymm0,%ymm2,%ymm0
	vmovdqu %ymm1,-60(%r12,%rax,4)
	vmovdqu %ymm0,-28(%r12,%rax,4)
	add $16,%rax
	jmp .L183
.L205:	mov %r13,%r9
	xor %edx,%edx
	lea 0(,%r10,4),%rcx
	sub %r10,%r9
	mov %r9,%rax
	andq $-16,%r9
	shr $4,%rax
	cmp %r10,%r13
	cmovl %rdx,%r9
	salq $6,%rax
	add %r10,%r9
	cmp %r10,%r13
	cmovl %rdx,%rax
	lea -8(%r13),%rdx
	sub %r9,%rdx
	lea (%rax,%rcx),%r10
	lea 32(%rcx,%rax),%rsi
	add %r12,%r10
	add %r12,%rsi
	mov %r10,%rdi
	call minmax_vector
	lea 7(%r9),%rax
	cmp %r13,%rax
	jge .L185
	lea 16(,%r9,4),%rax
	mov (%r10),%ecx
	add $8,%r9
	lea -12(%r12,%rax),%r14
	lea (%r12,%rax),%rbx
	lea 4(%r12,%rax),%r11
	mov (%rbx),%edx
	lea 8(%r12,%rax),%r8
	cmp %edx,%ecx
	mov %ecx,%esi
	cmovg %edx,%ecx
	cmovg %esi,%edx
	mov %ecx,(%r10)
	mov %edx,(%rbx)
	mov (%r14),%ecx
	mov (%r11),%edx
	cmp %edx,%ecx
	mov %ecx,%esi
	cmovg %edx,%ecx
	cmovg %esi,%edx
	lea -8(%r12,%rax),%rsi
	mov %ecx,(%r14)
	mov %edx,(%r11)
	mov (%rsi),%ecx
	mov (%r8),%edx
	cmp %edx,%ecx
	mov %ecx,%edi
	cmovg %edx,%ecx
	cmovg %edi,%edx
	lea 12(%r12,%rax),%rdi
	mov %ecx,(%rsi)
	lea -4(%r12,%rax),%rcx
	mov %edx,(%r8)
	mov (%rcx),%edx
	mov (%rdi),%eax
	cmp %eax,%edx
	mov %edx,%r15d
	cmovg %eax,%edx
	cmovg %r15d,%eax
	mov %edx,(%rcx)
	mov %eax,(%rdi)
	mov (%r10),%edx
	mov (%rsi),%eax
	cmp %eax,%edx
	mov %edx,%r15d
	cmovg %eax,%edx
	cmovg %r15d,%eax
	mov %edx,(%r10)
	mov %eax,(%rsi)
	mov (%rcx),%eax
	mov (%r14),%edx
	cmp %eax,%edx
	mov %edx,%r15d
	cmovg %eax,%edx
	cmovg %r15d,%eax
	mov %edx,(%r14)
	mov %eax,(%rcx)
	mov (%r10),%edx
	mov (%r14),%eax
	cmp %eax,%edx
	mov %edx,%r15d
	cmovg %eax,%edx
	cmovg %r15d,%eax
	mov %edx,(%r10)
	mov %eax,(%r14)
	mov (%rsi),%edx
	mov (%rcx),%eax
	cmp %eax,%edx
	mov %edx,%r10d
	cmovg %eax,%edx
	cmovg %r10d,%eax
	mov %edx,(%rsi)
	mov %eax,(%rcx)
	mov (%rbx),%edx
	mov (%r8),%esi
	mov (%rdi),%ecx
	cmp %esi,%edx
	mov %edx,%eax
	cmovg %esi,%edx
	cmovg %eax,%esi
	mov (%r11),%eax
	cmp %ecx,%eax
	mov %eax,%r10d
	cmovg %ecx,%eax
	cmovg %r10d,%ecx
	cmp %eax,%edx
	mov %edx,%r10d
	cmovg %eax,%edx
	cmovg %r10d,%eax
	mov %edx,(%rbx)
	mov %esi,%edx
	mov %eax,(%r11)
	mov %ecx,%eax
	cmp %eax,%edx
	mov %edx,%ecx
	cmovg %eax,%edx
	cmovg %ecx,%eax
	mov %edx,(%r8)
	mov %eax,(%rdi)
.L185:	lea 4(%r9),%r10
	lea -4(%r13),%rdx
	lea 0(,%r10,4),%rbx
	sub %r9,%rdx
	lea -16(%r12,%rbx),%r11
	lea (%r12,%rbx),%rsi
	mov %r11,%rdi
	call minmax_vector
	lea 3(%r9),%rax
	cmp %r13,%rax
	jge .L186
	lea -8(%r12,%rbx),%rcx
	mov (%r11),%edx
	lea -12(%r12,%rbx),%rdi
	mov %r10,%r9
	mov (%rcx),%eax
	cmp %eax,%edx
	mov %edx,%esi
	cmovg %eax,%edx
	cmovg %esi,%eax
	lea -4(%r12,%rbx),%rsi
	mov %edx,(%r11)
	mov %eax,(%rcx)
	mov (%rdi),%edx
	mov (%rsi),%eax
	cmp %eax,%edx
	mov %edx,%r8d
	cmovg %eax,%edx
	cmovg %r8d,%eax
	mov %edx,(%rdi)
	mov %eax,(%rsi)
	mov (%rdi),%eax
	mov (%r11),%edx
	cmp %eax,%edx
	mov %edx,%r8d
	cmovg %eax,%edx
	cmovg %r8d,%eax
	mov %edx,(%r11)
	mov %eax,(%rdi)
	mov (%rcx),%edx
	mov (%rsi),%eax
	cmp %eax,%edx
	mov %edx,%edi
	cmovg %eax,%edx
	cmovg %edi,%eax
	mov %edx,(%rcx)
	mov %eax,(%rsi)
.L186:	lea 2(%r9),%rax
	cmp %r13,%rax
	jge .L187
	lea 0(,%r9,4),%rax
	lea (%r12,%rax),%rsi
	lea 8(%r12,%rax),%rcx
	mov (%rsi),%edx
	mov (%rcx),%eax
	cmp %eax,%edx
	mov %edx,%edi
	cmovg %eax,%edx
	cmovg %edi,%eax
	mov %edx,(%rsi)
	mov %eax,(%rcx)
.L187:	lea 1(%r9),%rax
	cmp %r13,%rax
	jge .L147
	salq $2,%r9
	lea (%r12,%r9),%rsi
	lea 4(%r12,%r9),%rcx
	mov (%rsi),%edx
	mov (%rcx),%eax
	cmp %eax,%edx
	mov %edx,%edi
	cmovg %eax,%edx
	cmovg %edi,%eax
	mov %edx,(%rsi)
	mov %eax,(%rcx)
.L147:	lea -40(%rbp),%rsp
	pop %rbx
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	pop %rbp
	ret
	.endfn djbsort$avx2,globl,hidden
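
/ Compare-exchanges two int32 arrays element by element.
/
/ Lane-wise minima are stored back to the first array and maxima
/ to the second, with a scalar loop for fewer than eight elements
/ and 8-lane AVX2 min/max otherwise; a description inferred from
/ the code and its call sites above.
/
/ @param rdi is first int32 array
/ @param rsi is second int32 array
/ @param rdx is number of elements in each array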
minmax_vector:
	cmp $7,%rdx
	jg .L13
.L2:	test %rdx,%rdx
	jle .L15
	mov (%rdi),%ecx
	mov (%rsi),%eax
	add $4,%rdi
	add $4,%rsi
	cmp %eax,%ecx
	mov %ecx,%r8d
	cmovg %eax,%ecx
	cmovg %r8d,%eax
	decq %rdx
	mov %ecx,-4(%rdi)
	mov %eax,-4(%rsi)
	jmp .L2
.L15:	ret
.L13:	testb $7,%dl
	je .L6
	lea -32(,%rdx,4),%rax
	andq $-8,%rdx
	lea (%rdi,%rax),%rcx
	add %rsi,%rax
	vmovdqu (%rax),%ymm2
	vpminsd (%rcx),%ymm2,%ymm1
	vpmaxsd (%rcx),%ymm2,%ymm0
	vmovdqu %ymm1,(%rcx)
	vmovdqu %ymm0,(%rax)
.L6:	xor %eax,%eax
.L7:	vmovdqu (%rdi,%rax),%ymm4
	vpminsd (%rsi,%rax),%ymm4,%ymm1
	vpmaxsd (%rsi,%rax),%ymm4,%ymm0
	vmovdqu %ymm1,(%rdi,%rax)
	vmovdqu %ymm0,(%rsi,%rax)
	add $32,%rax
	sub $8,%rdx
	jne .L7
	ret
	.endfn minmax_vector
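
/ Performs two merging stages of the sorting network.
/
/ Within each block of 128 int32s, pairs at distance 64 and then
/ distance 32 are compare-exchanged eight lanes at a time; this
/ description is inferred from the code, which the caller invokes
/ for the 64-element stage.
/
/ @param rdi is int32 array
/ @param rsi is number of elements in rdi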
int32_twostages_32:
	sub $-128,%rdi
.L17:	lea -128(%rdi),%rax
	test %rsi,%rsi
	jle .L21
.L18:	vmovdqu (%rax),%ymm5
	vmovdqu 128(%rax),%ymm7
	add $32,%rax
	vpminsd 352(%rax),%ymm7,%ymm3
	vpminsd 224(%rax),%ymm5,%ymm2
	vpmaxsd 224(%rax),%ymm5,%ymm0
	vpmaxsd 352(%rax),%ymm7,%ymm1
	vpminsd %ymm3,%ymm2,%ymm4
	vpmaxsd %ymm3,%ymm2,%ymm2
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm0
	vmovdqu %ymm4,-32(%rax)
	vmovdqu %ymm2,96(%rax)
	vmovdqu %ymm3,224(%rax)
	vmovdqu %ymm0,352(%rax)
	cmp %rax,%rdi
	jne .L18
	add $-128,%rsi
	add $512,%rdi
	jmp .L17
.L21:	ret
	.endfn int32_twostages_32
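
/ Performs three merging stages of the sorting network.
/
/ Within each block of 8q int32s, pairs at distances 4q, 2q, and
/ q are compare-exchanged eight lanes at a time. The contract,
/ inferred from the call sites: q is a multiple of eight, and rax
/ appears to return how many elements fell into whole 8q blocks.
/
/ @param rdi is int32 array
/ @param rsi is number of elements in rdi
/ @param rdx is the stage distance q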
int32_threestages:
	push %rbp
	imul $-24,%rdx,%r8
	lea 0(,%rdx,8),%rax
	mov %rsp,%rbp
	push %r15
	push %r14
	push %r13
	push %r12
	push %rbx
	andq $-32,%rsp
	sub $64,%rsp
	mov %rax,56(%rsp)
	lea 0(,%rdx,4),%rax
	lea (%rdi,%rax),%rcx
	mov %rsi,8(%rsp)
	lea (%rcx,%rax),%rsi
	lea (%rsi,%rax),%r9
	lea (%r9,%rax),%r11
	lea (%r11,%rax),%r12
	lea (%r12,%rax),%r14
	lea (%r14,%rax),%r15
	lea (%r15,%r8),%rbx
	mov %rbx,40(%rsp)
	add %rax,%rbx
	lea (%rbx,%rax),%r10
	mov %rbx,32(%rsp)
	lea (%r10,%rax),%rbx
	lea (%rbx,%rax),%r13
	lea 0(%r13,%rax),%r8
	mov %r8,24(%rsp)
	add %r8,%rax
	mov %rax,16(%rsp)
	xor %eax,%eax
.L23:	mov 56(%rsp),%r8
	add %rax,%r8
	mov %r8,48(%rsp)
	cmp 8(%rsp),%r8
	jg .L28
.L25:	cmp %rdx,%rax
	jge .L29
	vmovdqu (%rdi,%rax,4),%ymm3
	vmovdqu (%rsi,%rax,4),%ymm6
	vpminsd (%r11,%rax,4),%ymm3,%ymm7
	vpmaxsd (%r11,%rax,4),%ymm3,%ymm4
	vpmaxsd (%r14,%rax,4),%ymm6,%ymm0
	vmovdqu (%rcx,%rax,4),%ymm3
	vmovdqu (%rsi,%rax,4),%ymm5
	vpminsd (%r12,%rax,4),%ymm3,%ymm2
	vpmaxsd (%r12,%rax,4),%ymm3,%ymm1
	vpminsd (%r14,%rax,4),%ymm5,%ymm5
	vmovdqu (%r9,%rax,4),%ymm3
	vpminsd (%r15,%rax,4),%ymm3,%ymm6
	vpmaxsd (%r15,%rax,4),%ymm3,%ymm3
	vpminsd %ymm5,%ymm7,%ymm8
	mov 40(%rsp),%r8
	vpmaxsd %ymm5,%ymm7,%ymm5
	vpminsd %ymm6,%ymm2,%ymm7
	vpminsd %ymm7,%ymm8,%ymm9
	vpmaxsd %ymm6,%ymm2,%ymm2
	vpminsd %ymm0,%ymm4,%ymm6
	vpmaxsd %ymm0,%ymm4,%ymm0
	vmovdqu %ymm9,(%rdi,%rax,4)
	vpminsd %ymm3,%ymm1,%ymm4
	vpmaxsd %ymm3,%ymm1,%ymm1
	vpmaxsd %ymm7,%ymm8,%ymm3
	vpminsd %ymm2,%ymm5,%ymm7
	vmovdqu %ymm3,(%r8,%rax,4)
	mov 32(%rsp),%r8
	vpmaxsd %ymm2,%ymm5,%ymm2
	vpminsd %ymm4,%ymm6,%ymm5
	vpmaxsd %ymm4,%ymm6,%ymm6
	vpminsd %ymm1,%ymm0,%ymm4
	vmovdqu %ymm7,(%r8,%rax,4)
	mov 24(%rsp),%r8
	vpmaxsd %ymm1,%ymm0,%ymm0
	vmovdqu %ymm2,(%r10,%rax,4)
	vmovdqu %ymm5,(%rbx,%rax,4)
	vmovdqu %ymm6,0(%r13,%rax,4)
	vmovdqu %ymm4,(%r8,%rax,4)
	mov 16(%rsp),%r8
	vmovdqu %ymm0,(%r8,%rax,4)
	add $8,%rax
	jmp .L25
.L29:	mov 48(%rsp),%rax
	add 56(%rsp),%rdx
	jmp .L23
.L28:	lea -40(%rbp),%rsp
	pop %rbx
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	pop %rbp
	ret
	.endfn int32_threestages
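
/ Finishes merging sixteen int32s held in %ymm0 and %ymm1.
/
/ The last merge stages run entirely in registers before the
/ 64-byte result is stored; when %esi is nonzero every bit is
/ complemented first, which appears to be how this file encodes
/ the descending direction.
/
/ @param rdi is int32 array receiving the sixteen results
/ @param esi is nonzero to complement the output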
merge16_finish:
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm0
	vperm2i128 $32,%ymm0,%ymm3,%ymm2
	vperm2i128 $49,%ymm0,%ymm3,%ymm0
	vpminsd %ymm0,%ymm2,%ymm1
	vpmaxsd %ymm0,%ymm2,%ymm0
	vpunpcklqdq %ymm0,%ymm1,%ymm2
	vpunpckhqdq %ymm0,%ymm1,%ymm0
	vpminsd %ymm0,%ymm2,%ymm1
	vpmaxsd %ymm0,%ymm2,%ymm2
	vpunpckldq %ymm2,%ymm1,%ymm0
	vpunpckhdq %ymm2,%ymm1,%ymm1
	vpunpcklqdq %ymm1,%ymm0,%ymm3
	vpunpckhqdq %ymm1,%ymm0,%ymm0
	vpminsd %ymm3,%ymm0,%ymm2
	vpmaxsd %ymm3,%ymm0,%ymm0
	vpunpckldq %ymm0,%ymm2,%ymm1
	vpunpckhdq %ymm0,%ymm2,%ymm0
	vperm2i128 $32,%ymm0,%ymm1,%ymm2
	vperm2i128 $49,%ymm0,%ymm1,%ymm0
	test %esi,%esi
	je .L31
	vpcmpeqd %ymm1,%ymm1,%ymm1
	vpxor %ymm1,%ymm2,%ymm2
	vpxor %ymm1,%ymm0,%ymm0
.L31:	vmovdqu %ymm2,(%rdi)
	vmovdqu %ymm0,32(%rdi)
	ret
	.endfn merge16_finish
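
/ Sorts an int32 array whose length is a power of two.
/
/ The cases n=8, n=16, and n=32 are handled by straight-line
/ code; larger powers of two iterate the merging helpers above.
/ The third parameter is inferred from the call sites: nonzero
/ appears to request the reversed order used to build bitonic
/ sequences for merging.
/
/ @param rdi is int32 array
/ @param rsi is number of elements (a power of two, at least 8)
/ @param edx is nonzero to sort in reverse order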
int32_sort_2power:
	push %r13
	lea 16(%rsp),%r13
	andq $-32,%rsp
	push -8(%r13)
	push %rbp
	mov %rsp,%rbp
	push %r15
	push %r14
	push %r13
	push %r12
	mov %rdi,%r12
	push %rbx
	sub $264,%rsp
	mov %edx,-116(%rbp)
	cmp $8,%rsi
	jne .L36
	mov 4(%rdi),%edx
	mov (%rdi),%r8d
	mov 8(%rdi),%ecx
	mov 28(%r12),%r9d
	cmp %r8d,%edx
	mov %edx,%eax
	cmovg %r8d,%edx
	cmovg %eax,%r8d
	mov 12(%rdi),%eax
	cmp %ecx,%eax
	mov %eax,%esi
	cmovg %ecx,%eax
	cmovg %esi,%ecx
	cmp %r8d,%ecx
	mov %ecx,%esi
	cmovg %r8d,%ecx
	cmovg %esi,%r8d
	cmp %edx,%eax
	mov %eax,%esi
	cmovg %edx,%eax
	cmovg %esi,%edx
	mov 20(%rdi),%esi
	mov %edx,%r10d
	mov 16(%rdi),%edi
	cmp %r10d,%ecx
	mov %ecx,%edx
	cmovg %r10d,%ecx
	cmovg %edx,%r10d
	cmp %edi,%esi
	mov %esi,%edx
	cmovg %edi,%esi
	cmovg %edx,%edi
	mov 24(%r12),%edx
	cmp %edx,%r9d
	mov %r9d,%r11d
	cmovg %edx,%r9d
	cmovg %r11d,%edx
	cmp %edi,%edx
	mov %edx,%r11d
	cmovg %edi,%edx
	cmovg %r11d,%edi
	cmp %esi,%r9d
	mov %r9d,%r11d
	cmovg %esi,%r9d
	cmovg %r11d,%esi
	cmp %esi,%edx
	mov %edx,%r11d
	cmovg %esi,%edx
	cmovg %r11d,%esi
	cmp %r8d,%edi
	mov %edi,%r11d
	cmovg %r8d,%edi
	cmovg %r11d,%r8d
	cmp %ecx,%edx
	mov %edx,%r11d
	cmovg %ecx,%edx
	cmovg %r11d,%ecx
	mov %r8d,(%r12)
	cmp %ecx,%edi
	mov %edi,%r11d
	cmovg %ecx,%edi
	cmovg %r11d,%ecx
	cmp %r10d,%esi
	mov %esi,%r11d
	cmovg %r10d,%esi
	cmovg %r11d,%r10d
	cmp %eax,%r9d
	mov %r9d,%r11d
	cmovg %eax,%r9d
	cmovg %r11d,%eax
	cmp %eax,%esi
	mov %esi,%r11d
	cmovg %eax,%esi
	cmovg %r11d,%eax
	mov %r9d,28(%r12)
	cmp %r10d,%ecx
	mov %ecx,%r11d
	cmovg %r10d,%ecx
	cmovg %r11d,%r10d
	cmp %eax,%edi
	mov %edi,%r11d
	cmovg %eax,%edi
	cmovg %r11d,%eax
	mov %r10d,4(%r12)
	cmp %esi,%edx
	mov %edx,%r11d
	cmovg %esi,%edx
	cmovg %r11d,%esi
	mov %ecx,8(%r12)
	mov %eax,12(%r12)
	mov %edi,16(%r12)
	mov %esi,20(%r12)
	mov %edx,24(%r12)
	jmp .L35
.L36:	mov %rsi,%r15
	cmp $16,%rsi
	jne .L38
	vmovdqa .LC0(%rip),%ymm0
	vpxor 32(%rdi),%ymm0,%ymm2
	vpxor (%rdi),%ymm0,%ymm0
	vmovdqa .LC1(%rip),%ymm4
	cmp $0,-116(%rbp)
	vpunpckldq %ymm2,%ymm0,%ymm1
	vpunpckhdq %ymm2,%ymm0,%ymm0
	vpunpcklqdq %ymm0,%ymm1,%ymm3
	vpunpckhqdq %ymm0,%ymm1,%ymm1
	vpminsd %ymm3,%ymm1,%ymm2
	vpmaxsd %ymm3,%ymm1,%ymm1
	vpxor %ymm4,%ymm2,%ymm2
	vpxor %ymm4,%ymm1,%ymm1
	vpunpckldq %ymm1,%ymm2,%ymm0
	vpunpckhdq %ymm1,%ymm2,%ymm1
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm1
	vpunpcklqdq %ymm1,%ymm3,%ymm2
	vpunpckhqdq %ymm1,%ymm3,%ymm3
	vpunpckldq %ymm3,%ymm2,%ymm1
	vpunpckhdq %ymm3,%ymm2,%ymm2
	vpunpcklqdq %ymm2,%ymm1,%ymm0
	vpunpckhqdq %ymm2,%ymm1,%ymm1
	vpminsd %ymm0,%ymm1,%ymm2
	vpmaxsd %ymm0,%ymm1,%ymm1
	vpunpckldq %ymm1,%ymm2,%ymm0
	vpunpckhdq %ymm1,%ymm2,%ymm1
	vpxor %ymm4,%ymm1,%ymm1
	vpxor %ymm4,%ymm0,%ymm0
	vperm2i128 $32,%ymm1,%ymm0,%ymm3
	vperm2i128 $49,%ymm1,%ymm0,%ymm0
	vpminsd %ymm3,%ymm0,%ymm2
	vpmaxsd %ymm3,%ymm0,%ymm0
	vperm2i128 $32,%ymm0,%ymm2,%ymm1
	vperm2i128 $49,%ymm0,%ymm2,%ymm0
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm2
	vpunpcklqdq %ymm2,%ymm3,%ymm1
	vpunpckhqdq %ymm2,%ymm3,%ymm2
	vpunpckldq %ymm2,%ymm1,%ymm0
	vpunpckhdq %ymm2,%ymm1,%ymm2
	vpunpcklqdq %ymm2,%ymm0,%ymm1
	vpunpckhqdq %ymm2,%ymm0,%ymm0
	vpminsd %ymm1,%ymm0,%ymm2
	vpmaxsd %ymm1,%ymm0,%ymm0
	vpunpckldq %ymm0,%ymm2,%ymm1
	vpunpckhdq %ymm0,%ymm2,%ymm0
	vpunpcklqdq %ymm0,%ymm1,%ymm2
	vpunpckhqdq %ymm0,%ymm1,%ymm1
	vpcmpeqd %ymm0,%ymm0,%ymm0
	je .L39
	vpxor %ymm0,%ymm1,%ymm1
	jmp .L40
.L39:	vpxor %ymm0,%ymm2,%ymm2
.L40:	mov -116(%rbp),%esi
	vmovdqa %ymm2,%ymm0
	mov %r12,%rdi
	jmp .L134
.L38:	cmp $32,%rsi
	jne .L41
	mov $1,%edx
	mov $16,%esi
	lea 64(%r12),%r13
	call int32_sort_2power
	xor %edx,%edx
	mov $16,%esi
	mov %r13,%rdi
	call int32_sort_2power
	cmp $0,-116(%rbp)
	vmovdqu (%r12),%ymm4
	vmovdqu 32(%r12),%ymm1
	vmovdqu 64(%r12),%ymm2
	vmovdqu 96(%r12),%ymm3
	je .L42
	vpcmpeqd %ymm0,%ymm0,%ymm0
	vpxor %ymm0,%ymm4,%ymm4
	vpxor %ymm0,%ymm1,%ymm1
	vpxor %ymm0,%ymm2,%ymm2
	vpxor %ymm0,%ymm3,%ymm3
.L42:	mov -116(%rbp),%esi
	vpmaxsd %ymm1,%ymm3,%ymm5
	vpminsd %ymm4,%ymm2,%ymm0
	mov %r12,%rdi
	vpmaxsd %ymm4,%ymm2,%ymm4
	vpminsd %ymm1,%ymm3,%ymm1
	vmovdqa %ymm5,-80(%rbp)
	vmovdqa %ymm4,-112(%rbp)
	call merge16_finish
	vmovdqa -80(%rbp),%ymm5
	mov -116(%rbp),%esi
	mov %r13,%rdi
	vmovdqa -112(%rbp),%ymm4
	vmovdqa %ymm5,%ymm1
	vmovdqa %ymm4,%ymm0
.L134:	add $264,%rsp
	pop %rbx
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	pop %rbp
	lea -16(%r13),%rsp
	pop %r13
	jmp merge16_finish
.L41:	mov %rsi,%rax
	sar $3,%rax
	mov %rax,-80(%rbp)
	lea 0(,%rax,4),%r13
	salq $3,%rax
	imul $-20,-80(%rbp),%rdx
	lea (%rdi,%rax),%rdi
	lea (%rdi,%rax),%rsi
	lea (%rsi,%rax),%rcx
	add %rcx,%rdx
	lea (%rdx,%rax),%r9
	lea (%r9,%rax),%r8
	add %r8,%rax
	mov %rax,-136(%rbp)
	mov %rax,%r10
	xor %eax,%eax
.L43:	cmp -80(%rbp),%rax
	jge .L135
	add $32,%rdi
	add $32,%rsi
	add $32,%rcx
	add $32,%rdx
	vmovdqu (%r12,%rax,4),%ymm5
	add $32,%r9
	add $32,%r8
	add $32,%r10
	vpminsd -32(%rsi),%ymm5,%ymm4
	vpmaxsd -32(%rsi),%ymm5,%ymm2
	vmovdqu -32(%rdi),%ymm5
	vpminsd -32(%rcx),%ymm5,%ymm1
	vpmaxsd -32(%rcx),%ymm5,%ymm0
	vpminsd %ymm2,%ymm0,%ymm3
	vpmaxsd %ymm2,%ymm0,%ymm0
	vpminsd %ymm4,%ymm1,%ymm2
	vpmaxsd %ymm4,%ymm1,%ymm1
	vmovdqu %ymm0,(%r12,%rax,4)
	add $8,%rax
	vpminsd %ymm1,%ymm3,%ymm4
	vpmaxsd %ymm1,%ymm3,%ymm1
	vmovdqu %ymm4,-32(%rdi)
	vmovdqu %ymm1,-32(%rsi)
	vmovdqu %ymm2,-32(%rcx)
	vmovdqu -32(%r8),%ymm5
	vmovdqu -32(%r10),%ymm6
	vpminsd -32(%rdx),%ymm5,%ymm1
	vpminsd -32(%r9),%ymm6,%ymm3
	vpmaxsd -32(%r9),%ymm6,%ymm2
	vpmaxsd -32(%rdx),%ymm5,%ymm0
	vpminsd %ymm3,%ymm1,%ymm4
	vpmaxsd %ymm3,%ymm1,%ymm1
	vpminsd %ymm2,%ymm0,%ymm3
	vpmaxsd %ymm2,%ymm0,%ymm0
	vmovdqu %ymm4,-32(%rdx)
	vpminsd %ymm1,%ymm3,%ymm2
	vpmaxsd %ymm1,%ymm3,%ymm1
	vmovdqu %ymm1,-32(%r9)
	vmovdqu %ymm2,-32(%r8)
	vmovdqu %ymm0,-32(%r10)
	jmp .L43
.L135:	imul $-24,-80(%rbp),%rax
	mov %rax,-128(%rbp)
	cmp $127,%r15
	jg .L105
.L63:	lea (%r12,%r15,4),%rax
	vmovdqa .LC1(%rip),%ymm10
	movl $3,-272(%rbp)
	mov $4,%r14d
	mov %rax,-144(%rbp)
	mov %r15,%rax
	vmovdqa .LC3(%rip),%ymm11
	sar $4,%rax
	vmovdqa .LC2(%rip),%ymm12
	mov %rax,-112(%rbp)
	mov -136(%rbp),%rax
	add -128(%rbp),%rax
	mov %rax,-200(%rbp)
	add %r13,%rax
	mov %rax,-192(%rbp)
	add %r13,%rax
	mov %rax,-184(%rbp)
	add %r13,%rax
	mov %rax,-176(%rbp)
	add %r13,%rax
	mov %rax,-168(%rbp)
	add %r13,%rax
	mov %rax,-160(%rbp)
	add %r13,%rax
	mov %rax,-152(%rbp)
	jmp .L46
.L105:	xor %eax,%eax
	vpcmpeqd %ymm0,%ymm0,%ymm0
.L45:	vpxor 64(%r12,%rax,4),%ymm0,%ymm1
	vpxor (%r12,%rax,4),%ymm0,%ymm2
	vmovdqu %ymm1,64(%r12,%rax,4)
	vmovdqu %ymm2,(%r12,%rax,4)
	add $32,%rax
	cmp %rax,%r15
	jg .L45
	mov -136(%rbp),%r14
	add -128(%rbp),%r14
	mov $8,%ebx
	vpcmpeqd %ymm10,%ymm10,%ymm10
	lea (%r14,%r13),%rax
	mov %rax,-296(%rbp)
	add %r13,%rax
	lea (%rax,%r13),%r11
	mov %rax,-176(%rbp)
	lea (%r11,%r13),%rax
	mov %rax,-288(%rbp)
	add %r13,%rax
	mov %rax,-144(%rbp)
	add %r13,%rax
	mov %rax,-112(%rbp)
	add -128(%rbp),%rax
	mov %rax,-200(%rbp)
	add %r13,%rax
	mov %rax,-192(%rbp)
	add %r13,%rax
	mov %rax,-184(%rbp)
	add %r13,%rax
	mov %rax,-168(%rbp)
	add %r13,%rax
	mov %rax,-160(%rbp)
	add %r13,%rax
	mov %rax,-152(%rbp)
	add %r13,%rax
	mov %rax,-280(%rbp)
.L64:	mov %rbx,%rcx
	sarq %rcx
.L47:	cmp $127,%rcx
	jle .L136
	mov %rcx,%rdx
	mov %r15,%rsi
	mov %r12,%rdi
	mov %r11,-272(%rbp)
	sar $2,%rdx
	mov %rcx,-240(%rbp)
	call int32_threestages
	mov -240(%rbp),%rcx
	mov -272(%rbp),%r11
	vpcmpeqd %ymm10,%ymm10,%ymm10
	sar $3,%rcx
	jmp .L47
.L136:	cmp $64,%rcx
	jne .L49
	mov %r15,%rsi
	mov %r12,%rdi
	mov %r11,-240(%rbp)
	call int32_twostages_32
	mov -240(%rbp),%r11
	vpcmpeqd %ymm10,%ymm10,%ymm10
.L54:	xor %eax,%eax
	jmp .L50
.L49:	cmp $32,%rcx
	jne .L51
	mov %r12,%rax
	xor %edx,%edx
.L52:	vmovdqu (%rax),%ymm7
	vmovdqu 32(%rax),%ymm5
	add $64,%rdx
	add $256,%rax
	vpminsd -128(%rax),%ymm7,%ymm8
	vpmaxsd -128(%rax),%ymm7,%ymm4
	vpminsd -96(%rax),%ymm5,%ymm1
	vpmaxsd -96(%rax),%ymm5,%ymm0
	vmovdqu -192(%rax),%ymm6
	vmovdqu -160(%rax),%ymm7
	vpminsd -64(%rax),%ymm6,%ymm5
	vpmaxsd -32(%rax),%ymm7,%ymm2
	vpmaxsd -64(%rax),%ymm6,%ymm3
	vmovdqu -160(%rax),%ymm6
	vpminsd -32(%rax),%ymm6,%ymm6
	vpminsd %ymm5,%ymm8,%ymm7
	vpmaxsd %ymm5,%ymm8,%ymm5
	vpminsd %ymm6,%ymm1,%ymm8
	vpmaxsd %ymm6,%ymm1,%ymm1
	vpminsd %ymm3,%ymm4,%ymm6
	vpmaxsd %ymm3,%ymm4,%ymm3
	vpminsd %ymm2,%ymm0,%ymm4
	vpmaxsd %ymm2,%ymm0,%ymm0
	vpminsd %ymm8,%ymm7,%ymm9
	vpmaxsd %ymm8,%ymm7,%ymm2
	vpminsd %ymm1,%ymm5,%ymm7
	vpmaxsd %ymm1,%ymm5,%ymm1
	vmovdqu %ymm9,-256(%rax)
	vpminsd %ymm4,%ymm6,%ymm5
	vpmaxsd %ymm4,%ymm6,%ymm6
	vmovdqu %ymm2,-224(%rax)
	vpminsd %ymm0,%ymm3,%ymm4
	vpmaxsd %ymm0,%ymm3,%ymm3
	vmovdqu %ymm5,-128(%rax)
	vmovdqu %ymm7,-192(%rax)
	vmovdqu %ymm1,-160(%rax)
	vmovdqu %ymm6,-96(%rax)
	vmovdqu %ymm4,-64(%rax)
	vmovdqu %ymm3,-32(%rax)
	cmp %rdx,%r15
	jg .L52
.L56:	lea (%rbx,%rbx),%rdx
	xor %ecx,%ecx
	cmp -80(%rbp),%rdx
	setne %al
	sete %cl
	mov %rdx,%r8
	xor %esi,%esi
	movzbl %al,%eax
	mov %eax,-204(%rbp)
	jmp .L53
.L51:	cmp $16,%rcx
	jne .L131
	jmp .L54
.L50:	vmovdqu (%r12,%rax,4),%ymm5
	vmovdqu 32(%r12,%rax,4),%ymm6
	vpminsd 64(%r12,%rax,4),%ymm5,%ymm2
	vpminsd 96(%r12,%rax,4),%ymm6,%ymm3
	vpmaxsd 64(%r12,%rax,4),%ymm5,%ymm0
	vpmaxsd 96(%r12,%rax,4),%ymm6,%ymm1
	vpminsd %ymm3,%ymm2,%ymm4
	vpmaxsd %ymm3,%ymm2,%ymm2
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm0
	vmovdqu %ymm4,(%r12,%rax,4)
	vmovdqu %ymm2,32(%r12,%rax,4)
	vmovdqu %ymm3,64(%r12,%rax,4)
	vmovdqu %ymm0,96(%r12,%rax,4)
	add $32,%rax
	cmp %rax,%r15
	jg .L50
	jmp .L56
.L131:	cmp $8,%rcx
	jne .L56
	xor %eax,%eax
.L57:	vmovdqu 32(%r12,%rax,4),%ymm7
	vpmaxsd (%r12,%rax,4),%ymm7,%ymm0
	vpminsd (%r12,%rax,4),%ymm7,%ymm1
	vmovdqu %ymm0,32(%r12,%rax,4)
	vmovdqu %ymm1,(%r12,%rax,4)
	add $16,%rax
	cmp %rax,%r15
	jg .L57
	jmp .L56
.L59:	mov -176(%rbp),%r10
	vmovdqu (%r12,%rax,4),%ymm5
	vpminsd (%r14,%rax,4),%ymm5,%ymm6
	vpmaxsd (%r14,%rax,4),%ymm5,%ymm15
	vmovdqu (%r10,%rax,4),%ymm5
	mov -296(%rbp),%r10
	vmovdqu (%r10,%rax,4),%ymm7
	mov -288(%rbp),%r10
	vmovdqa %ymm5,-240(%rbp)
	vmovdqa %ymm7,-272(%rbp)
	vmovdqu (%r10,%rax,4),%ymm7
	mov -112(%rbp),%r10
	vmovdqa -272(%rbp),%ymm5
	vpminsd -240(%rbp),%ymm5,%ymm1
	vpmaxsd -240(%rbp),%ymm5,%ymm5
	vmovdqa %ymm7,-240(%rbp)
	vmovdqa -240(%rbp),%ymm4
	vpmaxsd (%r11,%rax,4),%ymm4,%ymm0
	vmovdqu (%r10,%rax,4),%ymm4
	vpminsd %ymm1,%ymm6,%ymm8
	mov -144(%rbp),%r10
	vmovdqa -240(%rbp),%ymm7
	vpmaxsd %ymm1,%ymm6,%ymm6
	vpminsd %ymm5,%ymm15,%ymm1
	vmovdqa %ymm4,-240(%rbp)
	vpminsd (%r11,%rax,4),%ymm7,%ymm7
	vpmaxsd %ymm5,%ymm15,%ymm15
	vmovdqu (%r10,%rax,4),%ymm4
	vmovdqa %ymm4,-272(%rbp)
	vmovdqa -272(%rbp),%ymm4
	vpminsd -240(%rbp),%ymm4,%ymm3
	vpmaxsd -240(%rbp),%ymm4,%ymm4
	vpminsd %ymm3,%ymm7,%ymm2
	vpmaxsd %ymm3,%ymm7,%ymm3
	vpminsd %ymm4,%ymm0,%ymm7
	vpmaxsd %ymm4,%ymm0,%ymm0
	vpminsd %ymm2,%ymm8,%ymm14
	vpminsd %ymm7,%ymm1,%ymm13
	vpminsd %ymm3,%ymm6,%ymm12
	vpminsd %ymm0,%ymm15,%ymm11
	vmovdqa %ymm14,%ymm9
	vpmaxsd %ymm3,%ymm6,%ymm6
	vpmaxsd %ymm2,%ymm8,%ymm2
	vmovdqa %ymm13,%ymm8
	vpmaxsd %ymm7,%ymm1,%ymm1
	vpmaxsd %ymm0,%ymm15,%ymm0
	vmovdqa %ymm6,-240(%rbp)
	vmovdqa %ymm2,%ymm5
	vmovdqa -240(%rbp),%ymm3
	vmovdqa %ymm1,%ymm4
	vmovdqa %ymm12,%ymm7
	vmovdqa %ymm11,%ymm6
	vmovdqa %ymm0,%ymm15
	test %ecx,%ecx
	je .L58
	vpxor %ymm14,%ymm10,%ymm9
	vpxor %ymm13,%ymm10,%ymm8
	vpxor %ymm12,%ymm10,%ymm7
	vpxor %ymm11,%ymm10,%ymm6
	vpxor %ymm2,%ymm10,%ymm5
	vpxor %ymm1,%ymm10,%ymm4
	vpxor %ymm3,%ymm10,%ymm3
	vpxor %ymm0,%ymm10,%ymm15
.L58:	mov -200(%rbp),%r10
	vmovdqu %ymm9,(%r12,%rax,4)
	vmovdqu %ymm8,(%r10,%rax,4)
	mov -192(%rbp),%r10
	vmovdqu %ymm7,(%r10,%rax,4)
	mov -184(%rbp),%r10
	vmovdqu %ymm6,(%r10,%rax,4)
	mov -168(%rbp),%r10
	vmovdqu %ymm5,(%r10,%rax,4)
	mov -160(%rbp),%r10
	vmovdqu %ymm4,(%r10,%rax,4)
	mov -152(%rbp),%r10
	vmovdqu %ymm3,(%r10,%rax,4)
	mov -280(%rbp),%r10
	vmovdqu %ymm15,(%r10,%rax,4)
	add $8,%rax
.L60:	cmp %rax,%rdi
	jg .L59
	xor $1,%ecx
	lea (%rdx,%r9),%rdi
.L62:	mov %rdi,%r9
	sub %rbx,%r9
	mov %r9,%rax
	cmp %r9,%r8
	jg .L60
	xor -204(%rbp),%ecx
	add %rdx,%rsi
	add %rdx,%r8
.L53:	cmp -80(%rbp),%rsi
	jge .L61
	lea (%rsi,%rbx),%rdi
	jmp .L62
.L61:	salq $4,%rbx
	cmp %r15,%rbx
	je .L63
	mov %rdx,%rbx
	jmp .L64
.L46:	cmp $4,%r14
	jne .L132
	mov %r12,%rax
.L65:	cmp -144(%rbp),%rax
	je .L72
	vpxor 32(%rax),%ymm12,%ymm0
	vpxor (%rax),%ymm12,%ymm1
	add $64,%rax
	vmovdqu %ymm1,-64(%rax)
	vmovdqu %ymm0,-32(%rax)
	jmp .L65
.L72:	mov -112(%rbp),%rbx
	jmp .L68
.L132:	mov %r12,%rax
	cmp $2,%r14
	jne .L70
.L69:	cmp -144(%rbp),%rax
	je .L72
	vpxor 32(%rax),%ymm10,%ymm2
	vpxor (%rax),%ymm10,%ymm1
	add $64,%rax
	vperm2i128 $32,%ymm2,%ymm1,%ymm0
	vperm2i128 $49,%ymm2,%ymm1,%ymm1
	vpminsd %ymm1,%ymm0,%ymm2
	vpmaxsd %ymm1,%ymm0,%ymm0
	vperm2i128 $32,%ymm0,%ymm2,%ymm1
	vperm2i128 $49,%ymm0,%ymm2,%ymm0
	vmovdqu %ymm1,-64(%rax)
	vmovdqu %ymm0,-32(%rax)
	jmp .L69
.L70:	cmp -144(%rbp),%rax
	je .L72
	vpxor 32(%rax),%ymm11,%ymm2
	vpxor (%rax),%ymm11,%ymm1
	add $64,%rax
	vperm2i128 $32,%ymm2,%ymm1,%ymm0
	vperm2i128 $49,%ymm2,%ymm1,%ymm1
	vpunpcklqdq %ymm1,%ymm0,%ymm2
	vpunpckhqdq %ymm1,%ymm0,%ymm0
	vpminsd %ymm0,%ymm2,%ymm1
	vpmaxsd %ymm0,%ymm2,%ymm2
	vpunpcklqdq %ymm2,%ymm1,%ymm0
	vpunpckhqdq %ymm2,%ymm1,%ymm1
	vpminsd %ymm1,%ymm0,%ymm2
	vpmaxsd %ymm1,%ymm0,%ymm0
	vperm2i128 $32,%ymm0,%ymm2,%ymm1
	vperm2i128 $49,%ymm0,%ymm2,%ymm0
	vmovdqu %ymm1,-64(%rax)
	vmovdqu %ymm0,-32(%rax)
	jmp .L70
.L137:	cmp $32,%rbx
	jne .L75
.L74:	mov %rbx,%rdx
	mov %r15,%rsi
	mov %r12,%rdi
	sar $3,%rbx
	sar $2,%rdx
	call int32_threestages
	vmovdqa .LC2(%rip),%ymm12
	vmovdqa .LC3(%rip),%ymm11
	vmovdqa .LC1(%rip),%ymm10
.L68:	cmp $127,%rbx
	jle .L137
	jmp .L74
.L139:	sar $2,%rbx
.L75:	cmp $15,%rbx
	jle .L138
	mov %rbx,%rcx
	xor %esi,%esi
	sarq %rcx
	imul $-8,%rcx,%rdi
	lea 0(,%rcx,4),%rdx
	lea (%r12,%rdx),%r11
	lea (%r11,%rdx),%r10
	lea (%r10,%rdx),%r8
	lea (%rdi,%r8),%rax
	lea (%rax,%rdx),%r9
	mov %rax,-136(%rbp)
	lea (%r9,%rdx),%rax
	mov %rax,-240(%rbp)
.L76:	cmp %r15,%rsi
	jge .L139
	mov %rsi,%rax
.L78:	cmp %rcx,%rax
	jge .L140
	vmovdqu (%r12,%rax,4),%ymm6
	vmovdqu (%r11,%rax,4),%ymm5
	vpminsd (%r10,%rax,4),%ymm6,%ymm2
	vpminsd (%r8,%rax,4),%ymm5,%ymm3
	mov -136(%rbp),%rdi
	vpmaxsd (%r10,%rax,4),%ymm6,%ymm0
	vpmaxsd (%r8,%rax,4),%ymm5,%ymm1
	vpminsd %ymm3,%ymm2,%ymm4
	vpmaxsd %ymm3,%ymm2,%ymm2
	vmovdqu %ymm4,(%r12,%rax,4)
	vmovdqu %ymm2,(%rdi,%rax,4)
	mov -240(%rbp),%rdi
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm0
	vmovdqu %ymm3,(%r9,%rax,4)
	vmovdqu %ymm0,(%rdi,%rax,4)
	add $8,%rax
	jmp .L78
.L140:	add %rdx,%rsi
	add %rdx,%rcx
	jmp .L76
.L138:	cmp $8,%rbx
	je .L109
.L83:	mov -152(%rbp),%rdx
	mov -160(%rbp),%rcx
	xor %eax,%eax
	mov -168(%rbp),%rsi
	mov -176(%rbp),%rdi
	mov -184(%rbp),%r8
	mov -192(%rbp),%r9
	mov -200(%rbp),%r10
	jmp .L81
.L109:	xor %eax,%eax
.L80:	cmp %r15,%rax
	jge .L83
	vmovdqu (%r12,%rax,4),%ymm5
	vpminsd 32(%r12,%rax,4),%ymm5,%ymm1
	vpmaxsd 32(%r12,%rax,4),%ymm5,%ymm0
	vmovdqu %ymm1,(%r12,%rax,4)
	vmovdqu %ymm0,32(%r12,%rax,4)
	add $16,%rax
	jmp .L80
.L81:	cmp -80(%rbp),%rax
	jge .L141
	vmovdqu (%rdi),%ymm7
	add $32,%r10
	add $32,%r9
	add $32,%r8
	add $32,%rdi
	add $32,%rsi
	add $32,%rcx
	add $32,%rdx
	vmovdqu (%r12,%rax,4),%ymm5
	vmovdqu -32(%r9),%ymm6
	vpminsd -32(%r10),%ymm5,%ymm3
	vpmaxsd -32(%r10),%ymm5,%ymm1
	vpminsd -32(%r8),%ymm6,%ymm2
	vpmaxsd -32(%r8),%ymm6,%ymm0
	vpminsd -32(%rsi),%ymm7,%ymm7
	vmovdqu -32(%rcx),%ymm5
	vmovdqu -32(%rdi),%ymm6
	vpmaxsd -32(%rdx),%ymm5,%ymm4
	vpminsd %ymm2,%ymm3,%ymm9
	vpmaxsd -32(%rsi),%ymm6,%ymm8
	vpminsd -32(%rdx),%ymm5,%ymm6
	vpminsd %ymm0,%ymm1,%ymm13
	vpmaxsd %ymm2,%ymm3,%ymm2
	vpminsd %ymm6,%ymm7,%ymm5
	vpminsd %ymm4,%ymm8,%ymm3
	vpmaxsd %ymm6,%ymm7,%ymm6
	vpmaxsd %ymm0,%ymm1,%ymm0
	vpmaxsd %ymm4,%ymm8,%ymm4
	vpminsd %ymm5,%ymm9,%ymm1
	vpminsd %ymm6,%ymm2,%ymm8
	vpminsd %ymm3,%ymm13,%ymm7
	vmovdqu %ymm1,(%r12,%rax,4)
	add $8,%rax
	vpmaxsd %ymm6,%ymm2,%ymm2
	vpmaxsd %ymm5,%ymm9,%ymm5
	vmovdqu %ymm7,-32(%r10)
	vpminsd %ymm4,%ymm0,%ymm6
	vpmaxsd %ymm3,%ymm13,%ymm3
	vmovdqu %ymm8,-32(%r9)
	vpmaxsd %ymm4,%ymm0,%ymm0
	vmovdqu %ymm6,-32(%r8)
	vmovdqu %ymm5,-32(%rdi)
	vmovdqu %ymm3,-32(%rsi)
	vmovdqu %ymm2,-32(%rcx)
	vmovdqu %ymm0,-32(%rdx)
	jmp .L81
.L141:	sarq %r14
	decl -272(%rbp)
	jne .L46
	mov %r12,%rax
	xor %edx,%edx
	vpcmpeqd %ymm5,%ymm5,%ymm5
.L85:	cmp %r15,%rdx
	jge .L89
	vmovdqu (%rax),%ymm7
	vpunpckldq 32(%rax),%ymm7,%ymm12
	vpunpckhdq 32(%rax),%ymm7,%ymm6
	vmovdqu 64(%rax),%ymm7
	vpunpckldq 96(%rax),%ymm7,%ymm2
	vpunpckhdq 96(%rax),%ymm7,%ymm4
	vmovdqu 128(%rax),%ymm7
	vpunpckldq 160(%rax),%ymm7,%ymm1
	vpunpckhdq 160(%rax),%ymm7,%ymm0
	vpunpcklqdq %ymm2,%ymm12,%ymm8
	vpunpcklqdq %ymm4,%ymm6,%ymm9
	cmp $0,-116(%rbp)
	vmovdqu 192(%rax),%ymm7
	vpunpckhqdq %ymm2,%ymm12,%ymm12
	vpunpckhqdq %ymm4,%ymm6,%ymm4
	vpunpckldq 224(%rax),%ymm7,%ymm10
	vpunpckhdq 224(%rax),%ymm7,%ymm3
	vpunpcklqdq %ymm10,%ymm1,%ymm11
	vpunpckhqdq %ymm10,%ymm1,%ymm1
	vpunpcklqdq %ymm3,%ymm0,%ymm7
	vpunpckhqdq %ymm3,%ymm0,%ymm0
	je .L86
	vpxor %ymm5,%ymm12,%ymm12
	vpxor %ymm5,%ymm4,%ymm4
	vpxor %ymm5,%ymm1,%ymm1
	vpxor %ymm5,%ymm0,%ymm0
	jmp .L87
.L86:	vpxor %ymm5,%ymm8,%ymm8
	vpxor %ymm5,%ymm9,%ymm9
	vpxor %ymm5,%ymm11,%ymm11
	vpxor %ymm5,%ymm7,%ymm7
.L87:	vperm2i128 $32,%ymm11,%ymm8,%ymm3
	vperm2i128 $32,%ymm1,%ymm12,%ymm6
	vperm2i128 $32,%ymm7,%ymm9,%ymm10
	add $64,%rdx
	vperm2i128 $32,%ymm0,%ymm4,%ymm13
	vperm2i128 $49,%ymm11,%ymm8,%ymm11
	vperm2i128 $49,%ymm7,%ymm9,%ymm9
	add $256,%rax
	vperm2i128 $49,%ymm1,%ymm12,%ymm1
	vperm2i128 $49,%ymm0,%ymm4,%ymm0
	vpmaxsd %ymm6,%ymm3,%ymm2
	vpminsd %ymm6,%ymm3,%ymm4
	vpminsd %ymm1,%ymm11,%ymm7
	vpmaxsd %ymm13,%ymm10,%ymm3
	vpminsd %ymm13,%ymm10,%ymm8
	vpmaxsd %ymm1,%ymm11,%ymm1
	vpminsd %ymm0,%ymm9,%ymm10
	vpmaxsd %ymm0,%ymm9,%ymm0
	vpminsd %ymm8,%ymm4,%ymm11
	vpminsd %ymm3,%ymm2,%ymm9
	vpmaxsd %ymm8,%ymm4,%ymm8
	vpminsd %ymm10,%ymm7,%ymm6
	vpmaxsd %ymm10,%ymm7,%ymm4
	vpmaxsd %ymm3,%ymm2,%ymm2
	vpminsd %ymm0,%ymm1,%ymm3
	vpmaxsd %ymm0,%ymm1,%ymm1
	vpminsd %ymm6,%ymm11,%ymm10
	vpmaxsd %ymm6,%ymm11,%ymm0
	vpminsd %ymm3,%ymm9,%ymm7
	vpmaxsd %ymm3,%ymm9,%ymm6
	vpminsd %ymm4,%ymm8,%ymm3
	vpminsd %ymm1,%ymm2,%ymm9
	vpmaxsd %ymm4,%ymm8,%ymm4
	vpunpckldq %ymm7,%ymm10,%ymm8
	vpmaxsd %ymm1,%ymm2,%ymm2
	vpunpckhdq %ymm7,%ymm10,%ymm7
	vpunpckldq %ymm9,%ymm3,%ymm1
	vpunpckhdq %ymm9,%ymm3,%ymm3
	vpunpckldq %ymm6,%ymm0,%ymm9
	vpunpckhdq %ymm6,%ymm0,%ymm6
	vpunpckldq %ymm2,%ymm4,%ymm0
	vpunpckhdq %ymm2,%ymm4,%ymm2
	vpunpcklqdq %ymm3,%ymm7,%ymm10
	vpunpcklqdq %ymm1,%ymm8,%ymm4
	vpunpcklqdq %ymm0,%ymm9,%ymm13
	vpunpckhqdq %ymm1,%ymm8,%ymm8
	vpunpckhqdq %ymm3,%ymm7,%ymm3
	vpunpckhqdq %ymm0,%ymm9,%ymm1
	vpunpcklqdq %ymm2,%ymm6,%ymm7
	vpunpckhqdq %ymm2,%ymm6,%ymm0
	vperm2i128 $32,%ymm13,%ymm4,%ymm12
	vperm2i128 $32,%ymm1,%ymm8,%ymm11
	vperm2i128 $32,%ymm0,%ymm3,%ymm6
	vperm2i128 $32,%ymm7,%ymm10,%ymm9
	vperm2i128 $49,%ymm13,%ymm4,%ymm4
	vmovdqu %ymm12,-256(%rax)
	vperm2i128 $49,%ymm1,%ymm8,%ymm1
	vperm2i128 $49,%ymm7,%ymm10,%ymm2
	vperm2i128 $49,%ymm0,%ymm3,%ymm0
	vmovdqu %ymm11,-224(%rax)
	vmovdqu %ymm9,-192(%rax)
	vmovdqu %ymm6,-160(%rax)
	vmovdqu %ymm4,-128(%rax)
	vmovdqu %ymm1,-96(%rax)
	vmovdqu %ymm2,-64(%rax)
	vmovdqu %ymm0,-32(%rax)
	jmp .L85
.L142:	cmp $32,-112(%rbp)
	jne .L94
.L93:	mov -112(%rbp),%rcx
	sar $2,%rcx
	lea 0(,%rcx,4),%rdx
	lea 0(,%rcx,8),%rax
	mov %rcx,-136(%rbp)
	lea (%r12,%rdx),%r9
	mov %rax,-184(%rbp)
	imul $-24,%rcx,%rax
	lea (%r9,%rdx),%r14
	lea (%r14,%rdx),%rsi
	lea (%rsi,%rdx),%rbx
	lea (%rbx,%rdx),%r10
	lea (%r10,%rdx),%r8
	lea (%r8,%rdx),%rdi
	add %rdi,%rax
	mov %rax,-176(%rbp)
	add %rdx,%rax
	mov %rax,-168(%rbp)
	add %rdx,%rax
	lea (%rax,%rdx),%r11
	mov %rax,-160(%rbp)
	lea (%r11,%rdx),%rax
	mov %rax,-200(%rbp)
	add %rdx,%rax
	add %rax,%rdx
	mov %rax,-144(%rbp)
	mov %rdx,-192(%rbp)
.L90:	mov -136(%rbp),%rax
	sub %rcx,%rax
	cmp %rax,%r15
	jg .L92
	sarq $3,-112(%rbp)
.L89:	cmp $127,-112(%rbp)
	jle .L142
	jmp .L93
.L92:	cmp -136(%rbp),%rax
	jge .L143
	vmovdqu (%r12,%rax,4),%ymm6
	vpminsd (%rbx,%rax,4),%ymm6,%ymm7
	vpmaxsd (%rbx,%rax,4),%ymm6,%ymm4
	vmovdqu (%r9,%rax,4),%ymm6
	vpminsd (%r10,%rax,4),%ymm6,%ymm1
	vpmaxsd (%r10,%rax,4),%ymm6,%ymm0
	vmovdqu (%r14,%rax,4),%ymm6
	vpminsd (%r8,%rax,4),%ymm6,%ymm5
	vpmaxsd (%r8,%rax,4),%ymm6,%ymm3
	vmovdqu (%rsi,%rax,4),%ymm6
	vpminsd (%rdi,%rax,4),%ymm6,%ymm6
	vpminsd %ymm5,%ymm7,%ymm9
	vmovdqu (%rsi,%rax,4),%ymm2
	vpmaxsd %ymm5,%ymm7,%ymm5
	mov -176(%rbp),%rdx
	vpminsd %ymm3,%ymm4,%ymm8
	vpminsd %ymm6,%ymm1,%ymm7
	vpmaxsd %ymm3,%ymm4,%ymm3
	vpminsd %ymm7,%ymm9,%ymm10
	vpmaxsd %ymm7,%ymm9,%ymm4
	vpmaxsd (%rdi,%rax,4),%ymm2,%ymm2
	vpmaxsd %ymm6,%ymm1,%ymm1
	vmovdqu %ymm10,(%r12,%rax,4)
	vmovdqu %ymm4,(%rdx,%rax,4)
	mov -168(%rbp),%rdx
	vpminsd %ymm1,%ymm5,%ymm9
	vpmaxsd %ymm1,%ymm5,%ymm1
	vpminsd %ymm2,%ymm0,%ymm6
	vpmaxsd %ymm2,%ymm0,%ymm0
	vmovdqu %ymm9,(%rdx,%rax,4)
	vpminsd %ymm6,%ymm8,%ymm7
	vpmaxsd %ymm6,%ymm8,%ymm2
	mov -160(%rbp),%rdx
	vpminsd %ymm0,%ymm3,%ymm5
	vpmaxsd %ymm0,%ymm3,%ymm3
	vmovdqu %ymm1,(%rdx,%rax,4)
	mov -200(%rbp),%rdx
	vmovdqu %ymm7,(%r11,%rax,4)
	vmovdqu %ymm2,(%rdx,%rax,4)
	mov -144(%rbp),%rdx
	vmovdqu %ymm5,(%rdx,%rax,4)
	mov -192(%rbp),%rdx
	vmovdqu %ymm3,(%rdx,%rax,4)
	add $8,%rax
	jmp .L92
.L143:	mov -184(%rbp),%rdx
	add %rdx,-136(%rbp)
	jmp .L90
.L145:	sarq $2,-112(%rbp)
.L94:	cmp $15,-112(%rbp)
	jle .L144
	mov -112(%rbp),%rcx
	xor %esi,%esi
	sarq %rcx
	imul $-8,%rcx,%rdi
	lea 0(,%rcx,4),%rdx
	lea (%r12,%rdx),%r11
	lea (%r11,%rdx),%r10
	lea (%r10,%rdx),%r8
	add %r8,%rdi
	lea (%rdi,%rdx),%r9
	lea (%r9,%rdx),%rbx
.L95:	cmp %r15,%rsi
	jge .L145
	mov %rsi,%rax
.L97:	cmp %rcx,%rax
	jge .L146
	vmovdqu (%r12,%rax,4),%ymm5
	vpminsd (%r10,%rax,4),%ymm5,%ymm2
	vpmaxsd (%r10,%rax,4),%ymm5,%ymm0
	vmovdqu (%r11,%rax,4),%ymm5
	vpminsd (%r8,%rax,4),%ymm5,%ymm3
	vpmaxsd (%r8,%rax,4),%ymm5,%ymm1
	vpminsd %ymm3,%ymm2,%ymm4
	vpmaxsd %ymm3,%ymm2,%ymm2
	vpminsd %ymm1,%ymm0,%ymm3
	vpmaxsd %ymm1,%ymm0,%ymm0
	vmovdqu %ymm4,(%r12,%rax,4)
	vmovdqu %ymm2,(%rdi,%rax,4)
	vmovdqu %ymm3,(%r9,%rax,4)
	vmovdqu %ymm0,(%rbx,%rax,4)
	add $8,%rax
	jmp .L97
.L146:	add %rdx,%rsi
	add %rdx,%rcx
	jmp .L95
.L144:	cmp $8,-112(%rbp)
	je .L111
.L102:	mov -152(%rbp),%rdx
	add -128(%rbp),%rdx
	xor %ecx,%ecx
	vpcmpeqd %ymm6,%ymm6,%ymm6
	lea (%rdx,%r13),%r10
	lea (%r10,%r13),%r9
	lea (%r9,%r13),%r8
	lea (%r8,%r13),%rdi
	lea (%rdi,%r13),%rsi
	lea (%rsi,%r13),%rax
	jmp .L100
.L111:	xor %eax,%eax
.L99:	cmp %r15,%rax
	jge .L102
	vmovdqu (%r12,%rax,4),%ymm5
	vpminsd 32(%r12,%rax,4),%ymm5,%ymm1
	vpmaxsd 32(%r12,%rax,4),%ymm5,%ymm0
	vmovdqu %ymm1,(%r12,%rax,4)
	vmovdqu %ymm0,32(%r12,%rax,4)
	add $16,%rax
	jmp .L99
.L104:	vmovdqu (%r10),%ymm7
	vmovdqu (%r12,%rcx,4),%ymm4
	vpminsd (%r9),%ymm7,%ymm3
	vpminsd (%rdx),%ymm4,%ymm5
	vpmaxsd (%r9),%ymm7,%ymm2
	vpmaxsd (%rdx),%ymm4,%ymm4
	vmovdqu (%r8),%ymm7
	vmovdqu (%rsi),%ymm14
	vpminsd %ymm3,%ymm5,%ymm11
	vpmaxsd %ymm3,%ymm5,%ymm3
	vpminsd (%rdi),%ymm7,%ymm1
	vpminsd %ymm2,%ymm4,%ymm10
	cmp $0,-116(%rbp)
	vpmaxsd (%rdi),%ymm7,%ymm0
	vmovdqu (%rsi),%ymm7
	vpmaxsd %ymm2,%ymm4,%ymm2
	vpminsd (%rax),%ymm7,%ymm7
	vpmaxsd (%rax),%ymm14,%ymm9
	vpminsd %ymm7,%ymm1,%ymm8
	vpmaxsd %ymm7,%ymm1,%ymm1
	vpminsd %ymm9,%ymm0,%ymm7
	vpmaxsd %ymm9,%ymm0,%ymm0
	vpminsd %ymm8,%ymm11,%ymm5
	vpminsd %ymm1,%ymm3,%ymm9
	vpminsd %ymm7,%ymm10,%ymm12
	vpmaxsd %ymm1,%ymm3,%ymm3
	vpminsd %ymm0,%ymm2,%ymm4
	vpmaxsd %ymm8,%ymm11,%ymm8
	vpmaxsd %ymm0,%ymm2,%ymm2
	vpmaxsd %ymm7,%ymm10,%ymm7
	vpunpckldq %ymm8,%ymm5,%ymm11
	vpunpckldq %ymm7,%ymm12,%ymm10
	vpunpckhdq %ymm8,%ymm5,%ymm8
	vpunpckhdq %ymm7,%ymm12,%ymm7
	vpunpckhdq %ymm3,%ymm9,%ymm5
	vpunpckldq %ymm2,%ymm4,%ymm1
	vpunpckldq %ymm3,%ymm9,%ymm0
	vpunpckhdq %ymm2,%ymm4,%ymm4
	vpunpcklqdq %ymm0,%ymm11,%ymm3
	vpunpckhqdq %ymm0,%ymm11,%ymm9
	vpunpcklqdq %ymm5,%ymm8,%ymm2
	vpunpcklqdq %ymm4,%ymm7,%ymm11
	vpunpckhqdq %ymm5,%ymm8,%ymm5
	vpunpcklqdq %ymm1,%ymm10,%ymm12
	vpunpckhqdq %ymm4,%ymm7,%ymm0
	vpunpckhqdq %ymm1,%ymm10,%ymm1
	vperm2i128 $32,%ymm11,%ymm2,%ymm8
	vperm2i128 $32,%ymm12,%ymm3,%ymm10
	vperm2i128 $32,%ymm1,%ymm9,%ymm7
	vperm2i128 $32,%ymm0,%ymm5,%ymm4
	vperm2i128 $49,%ymm12,%ymm3,%ymm3
	vperm2i128 $49,%ymm11,%ymm2,%ymm2
	vperm2i128 $49,%ymm1,%ymm9,%ymm1
	vperm2i128 $49,%ymm0,%ymm5,%ymm0
	je .L103
	vpxor %ymm6,%ymm10,%ymm10
	vpxor %ymm6,%ymm8,%ymm8
	vpxor %ymm6,%ymm7,%ymm7
	vpxor %ymm6,%ymm4,%ymm4
	vpxor %ymm6,%ymm3,%ymm3
	vpxor %ymm6,%ymm2,%ymm2
	vpxor %ymm6,%ymm1,%ymm1
	vpxor %ymm6,%ymm0,%ymm0
.L103:	add $32,%rdx
	add $32,%r10
	add $32,%r9
	add $32,%r8
	vmovdqu %ymm10,(%r12,%rcx,4)
	add $32,%rdi
	add $8,%rcx
	add $32,%rsi
	vmovdqu %ymm3,-32(%rdx)
	add $32,%rax
	vmovdqu %ymm8,-32(%r10)
	vmovdqu %ymm2,-32(%r9)
	vmovdqu %ymm7,-32(%r8)
	vmovdqu %ymm1,-32(%rdi)
	vmovdqu %ymm4,-32(%rsi)
	vmovdqu %ymm0,-32(%rax)
.L100:	cmp -80(%rbp),%rcx
	jl .L104
.L35:	add $264,%rsp
	pop %rbx
	pop %r12
	pop %r13
	pop %r14
	pop %r15
	pop %rbp
	lea -16(%r13),%rsp
	pop %r13
	ret
	.endfn int32_sort_2power
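
/ Constants for the kernels above: .LC0 through .LC3 look like
/ the lane-complement masks that set each merge stage's direction,
/ and .LC4 is a run of INT32_MAX fill words, which sort last when
/ a short input is padded out to a power of two on the stack.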
	.rodata.cst32
.LC0:	.quad -1,0,-1,0
.LC1:	.quad 0,-1,-1,0
.LC2:	.quad -1,-1,0,0
.LC3:	.quad -4294967296,4294967295,-4294967296,4294967295
.LC4:	.quad 0x7fffffff7fffffff
	.quad 0x7fffffff7fffffff
	.quad 0x7fffffff7fffffff
	.quad 0x7fffffff7fffffff