#include "third_party/chibicc/chibicc.h" #define GP_MAX 6 #define FP_MAX 8 int depth; static char argreg8[][5] = {"%dil", "%sil", "%dl", "%cl", "%r8b", "%r9b"}; static char argreg16[][5] = {"%di", "%si", "%dx", "%cx", "%r8w", "%r9w"}; static char argreg32[][5] = {"%edi", "%esi", "%edx", "%ecx", "%r8d", "%r9d"}; static char argreg64[][5] = {"%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9"}; static Obj *current_fn; static char *lastline; FILE *output_stream; void flushln(void) { if (!lastline) return; fputs(lastline, output_stream); fputc('\n', output_stream); free(lastline); lastline = NULL; } static void processln(char *nextline) { if (lastline) { // unsophisticated optimization pass to reduce asm noise a little bit if ((!strcmp(lastline, "\txor\t%eax,%eax") && !strcmp(nextline, "\tcltq")) || (!strcmp(lastline, "\tmov\t$0x1,%eax") && !strcmp(nextline, "\tcltq")) || (!strcmp(lastline, "\tmovslq\t(%rax),%rax") && !strcmp(nextline, "\tcltq"))) { free(nextline); } else if (!strcmp(lastline, "\tmov\t(%rax),%rax") && !strcmp(nextline, "\tpush\t%rax")) { free(lastline); free(nextline); lastline = strdup("\tpush\t(%rax)"); } else if (!strcmp(lastline, "\tmov\t$0x1,%eax") && !strcmp(nextline, "\tpush\t%rax")) { free(lastline); free(nextline); lastline = strdup("\tpush\t$1"); } else if (!strcmp(lastline, "\tpush\t(%rax)") && !strcmp(nextline, "\tpop\t%rdi")) { free(lastline); free(nextline); lastline = strdup("\tmov\t(%rax),%rdi"); } else if (!strcmp(lastline, "\tpush\t%rax") && !strcmp(nextline, "\tpop\t%rdi")) { free(lastline); free(nextline); lastline = strdup("\tmov\t%rax,%rdi"); } else { flushln(); lastline = nextline; } } else { lastline = nextline; } } static void emitlin(char *nextline) { processln(strdup(nextline)); } void println(char *fmt, ...) { va_list ap; char *nextline; va_start(ap, fmt); emitlin(xvasprintf(fmt, ap)); va_end(ap); } int count(void) { static int i = 1; return i++; } void push(void) { emitlin("\tpush\t%rax"); depth++; } void pop(char *arg) { println("\tpop\t%s", arg); depth--; fflush(output_stream); DCHECK_GE(depth, 0); } void push2(void) { emitlin("\tpush\t%rdx"); emitlin("\tpush\t%rax"); depth++; depth++; } void pop2(char *a, char *b) { println("\tpop\t%s", a); println("\tpop\t%s", b); depth--; depth--; DCHECK_GE(depth, 0); } void pushreg(char *arg) { println("\tpush\t%%%s", arg); depth++; } void popreg(char *arg) { println("\tpop\t%%%s", arg); depth--; DCHECK_GE(depth, 0); } static const char *nameof(Obj *obj) { if (obj->asmname) { return obj->asmname; } else { return obj->name; } } static void pushx(void) { emitlin("\tsub\t$16,%rsp"); emitlin("\tmovdqu\t%xmm0,(%rsp)"); depth += 2; } static void popx(int reg) { println("\tmovdqu\t(%%rsp),%%xmm%d", reg); emitlin("\tadd\t$16,%rsp"); depth -= 2; DCHECK_GE(depth, 0); } static void pushf(Type *ty) { if (ty->vector_size == 16) { pushx(); } else { emitlin("\tpush\t%rax"); emitlin("\tmovsd\t%xmm0,(%rsp)"); depth++; } } static void popf(Type *ty, int reg) { if (ty->vector_size == 16) { popx(reg); } else { println("\tmovsd\t(%%rsp),%%xmm%d", reg); emitlin("\tadd\t$8,%rsp"); depth--; } } static void print_profiling_nop(void) { emitlin("\t.byte\t0x0f,0x1f,0x44,0x00,0x00"); } static void print_visibility(Obj *obj) { if (obj->visibility) { if (!strcmp(obj->visibility, "hidden")) { println("\t.hidden\t%s", nameof(obj)); } else if (!strcmp(obj->visibility, "protected")) { println("\t.protected %s", nameof(obj)); } else { println("\t.globl\t%s", nameof(obj)); } } else if (obj->is_static) { println("\t.local\t%s", nameof(obj)); } else { println("\t.globl\t%s", nameof(obj)); } if (obj->is_weak) { println("\t.weak\t%s", nameof(obj)); } } static void print_align(int align) { if (align > 1) println("\t.align\t%d", align); } void print_loc(int64_t file, int64_t line) { static int64_t lastfile = -1; static int64_t lastline = -1; char *locbuf, *p; if (file != lastfile || line != lastline) { locbuf = malloc(2 + 4 + 1 + 20 + 1 + 20 + 1); p = stpcpy(locbuf, "\t.loc\t"); p += int64toarray_radix10(file, p); *p++ = ' '; int64toarray_radix10(line, p); emitlin(locbuf); lastfile = file; lastline = line; } } static void print_mov_imm(int64_t val, const char *reg64, const char *reg32) { if (val) { if (val != -1) { if (0 < val && val <= INT32_MAX) { println("\tmov\t$%#lx,%s", val, reg32); } else if (INT32_MIN <= val && val < 0) { println("\tmov\t$%#lx,%s", (int64_t)(int32_t)val, reg64); } else { println("\tmov\t$%#lx,%s", val, reg64); } } else { println("\tor\t$-1,%s", reg64); } } else { println("\txor\t%s,%s", reg32, reg32); } } static bool is_quotable_string(const char *s, size_t n) { size_t i; if (!n) return false; if (s[n - 1]) return false; for (i = 0; i < n - 1; ++i) { if (s[i] == '"') return false; if (s[i] == '\\') return false; if (s[i] < 0x20 || s[i] >= 0x7f) { return false; } } return true; } static char *reg_dx(int sz) { switch (sz) { case 1: return "%dl"; case 2: return "%dx"; case 4: return "%edx"; case 8: return "%rdx"; } UNREACHABLE(); } static char *reg_ax(int sz) { switch (sz) { case 1: return "%al"; case 2: return "%ax"; case 4: return "%eax"; case 8: return "%rax"; } UNREACHABLE(); } // Compute the absolute address of a given node. // It's an error if a given node does not reside in memory. // asm() wants this to not clobber flags or regs other than rax. void gen_addr(Node *node) { switch (node->kind) { case ND_VAR: // Variable-length array, which is always local. if (node->var->ty->kind == TY_VLA) { println("\tmov\t%d(%%rbp),%%rax", node->var->offset); return; } // Local variable if (node->var->is_local) { println("\tlea\t%d(%%rbp),%%rax", node->var->offset); return; } if (opt_pic) { if (node->var->is_tls) { // Dynamic thread-local variable println("\tmov\t%%fs:0,%%rax"); println("\tmov\t%s@dtpoff(%%rax),%%rax", nameof(node->var)); return; } // Function or global variable println("\tmov\t%s@gotpcrel(%%rip),%%rax", nameof(node->var)); return; } // Static thread-local variable if (node->var->is_tls) { println("\tmov\t%%fs:%s@tpoff,%%rax", nameof(node->var)); return; } // Here, we generate an absolute address of a function or a global // variable. Even though they exist at a certain address at runtime, // their addresses are not known at link-time for the following // two reasons. // // - Address randomization: Executables are loaded to memory as a // whole but it is not known what address they are loaded to. // Therefore, at link-time, relative address in the same // exectuable (i.e. the distance between two functions in the // same executable) is known, but the absolute address is not // known. // // - Dynamic linking: Dynamic shared objects (DSOs) or .so files // are loaded to memory alongside an executable at runtime and // linked by the runtime loader in memory. We know nothing // about addresses of global stuff that may be defined by DSOs // until the runtime relocation is complete. // // In order to deal with the former case, we use RIP-relative // addressing, denoted by `(%rip)`. For the latter, we obtain an // address of a stuff that may be in a shared object file from the // Global Offset Table using `@GOTPCREL(%rip)` notation. // Function if (node->ty->kind == TY_FUNC) { if (!opt_pic) { println("\tmov\t$%s,%%eax", nameof(node->var)); } else if (node->var->is_definition) { println("\tlea\t%s(%%rip),%%rax", nameof(node->var)); } else { println("\tmov\t%s@gotpcrel(%%rip),%%rax", nameof(node->var)); } return; } // Global variable if (opt_pic) { println("\tlea\t%s(%%rip),%%rax", nameof(node->var)); } else { println("\tmov\t$%s,%%eax", nameof(node->var)); } return; case ND_CAST: gen_expr(node->lhs); gen_cast(node->lhs->ty, node->ty); return; case ND_DEREF: gen_expr(node->lhs); return; case ND_COMMA: gen_expr(node->lhs); gen_addr(node->rhs); return; case ND_ASSIGN: case ND_COND: if (node->ty->kind == TY_STRUCT || node->ty->kind == TY_UNION) { gen_expr(node); return; } break; case ND_MEMBER: gen_addr(node->lhs); if (node->member->offset) { println("\tlea\t%d(%%rax),%%rax", node->member->offset); } return; case ND_FUNCALL: if (node->ret_buffer) { gen_expr(node); return; } break; case ND_VLA_PTR: println("\tlea\t%d(%%rbp),%%rax", node->var->offset); return; default: error_tok(node->tok, "not an lvalue %d", node->kind); } } // Load a value from where %rax is pointing to. static void load(Type *ty) { switch (ty->kind) { case TY_ARRAY: case TY_STRUCT: case TY_UNION: case TY_FUNC: case TY_VLA: // If it is an array, do not attempt to load a value to the // register because in general we can't load an entire array to a // register. As a result, the result of an evaluation of an array // becomes not the array itself but the address of the array. // This is where "array is automatically converted to a pointer to // the first element of the array in C" occurs. return; case TY_FLOAT: if (ty->vector_size == 16) { if (ty->align >= 16) { emitlin("\tmovaps\t(%rax),%xmm0"); } else { emitlin("\tmovdqu\t(%rax),%xmm0"); } } else { emitlin("\tmovss\t(%rax),%xmm0"); return; } case TY_DOUBLE: if (ty->vector_size == 16) { if (ty->align >= 16) { emitlin("\tmovapd\t(%rax),%xmm0"); } else { emitlin("\tmovdqu\t(%rax),%xmm0"); } } else { emitlin("\tmovsd\t(%rax),%xmm0"); } return; case TY_LDOUBLE: emitlin("\tfldt\t(%rax)"); return; case TY_INT128: emitlin("\tmov\t8(%rax),%rdx"); emitlin("\tmov\t(%rax),%rax"); return; default: break; } char *insn = ty->is_unsigned ? "movz" : "movs"; // When we load a char or a short value to a register, we always // extend them to the size of int, so we can assume the lower half of // a register always contains a valid value. The upper half of a // register for char, short and int may contain garbage. When we load // a long value to a register, it simply occupies the entire register. if (ty->size == 1) { println("\t%sbl\t(%%rax),%%eax", insn); } else if (ty->size == 2) { println("\t%swl\t(%%rax),%%eax", insn); } else if (ty->size == 4) { emitlin("\tmovslq\t(%rax),%rax"); } else { emitlin("\tmov\t(%rax),%rax"); } } static void gen_memcpy(size_t size) { switch (size) { case 0: break; case 1: emitlin("\tmov\t(%rax),%r8b"); emitlin("\tmov\t%r8b,(%rdi)"); break; case 2: emitlin("\tmov\t(%rax),%r8w"); emitlin("\tmov\t%r8w,(%rdi)"); break; case 3: emitlin("\tmov\t(%rax),%r8w"); emitlin("\tmov\t%r8w,(%rdi)"); emitlin("\tmov\t2(%rax),%r8b"); emitlin("\tmov\t%r8b,2(%rdi)"); break; case 4: emitlin("\tmov\t(%rax),%r8d"); emitlin("\tmov\t%r8d,(%rdi)"); break; case 5 ... 7: emitlin("\tmov\t(%rax),%r8d"); println("\tmov\t%d(%%rax),%%r9d", size - 4); emitlin("\tmov\t%r8d,(%rdi)"); println("\tmov\t%%r9d,%d(%%rdi)", size - 4); break; case 8: emitlin("\tmov\t(%rax),%r8"); emitlin("\tmov\t%r8,(%rdi)"); break; case 9 ... 15: emitlin("\tmov\t(%rax),%r8"); println("\tmov\t%d(%%rax),%%r9", size - 8); emitlin("\tmov\t%r8,(%rdi)"); println("\tmov\t%%r9,%d(%%rdi)", size - 8); break; case 16: emitlin("\tmovdqu\t(%rax),%xmm2"); emitlin("\tmovdqu\t%xmm2,(%rdi)"); break; case 17 ... 32: emitlin("\tmovdqu\t(%rax),%xmm2"); println("\tmovdqu\t%d(%%rax),%%xmm3", size - 16); emitlin("\tmovdqu\t%xmm2,(%rdi)"); println("\tmovdqu\t%%xmm3,%d(%%rdi)", size - 16); break; default: emitlin("\tmov\t%rax,%rsi"); print_mov_imm(size, "%rcx", "%ecx"); emitlin("\trep movsb"); break; } } // Store %rax to an address that the stack top is pointing to. static void store(Type *ty) { pop("%rdi"); switch (ty->kind) { case TY_STRUCT: case TY_UNION: gen_memcpy(ty->size); return; case TY_FLOAT: if (ty->vector_size == 16) { if (ty->align >= 16) { emitlin("\tmovaps\t%xmm0,(%rdi)"); } else { emitlin("\tmovdqu\t%xmm0,(%rdi)"); } } else { emitlin("\tmovss\t%xmm0,(%rdi)"); } return; case TY_DOUBLE: if (ty->vector_size == 16) { if (ty->align >= 16) { emitlin("\tmovapd\t%xmm0,(%rdi)"); } else { emitlin("\tmovdqu\t%xmm0,(%rdi)"); } } else { emitlin("\tmovsd\t%xmm0,(%rdi)"); } return; case TY_LDOUBLE: emitlin("\tfstpt\t(%rdi)"); return; case TY_INT128: emitlin("\tmov\t%rax,(%rdi)"); emitlin("\tmov\t%rdx,8(%rdi)"); return; } if (ty->vector_size == 16) { if (ty->align >= 16) { emitlin("\tmovdqa\t%xmm0,(%rdi)"); } else { emitlin("\tmovdqu\t%xmm0,(%rdi)"); } } if (ty->size == 1) { emitlin("\tmov\t%al,(%rdi)"); } else if (ty->size == 2) { emitlin("\tmov\t%ax,(%rdi)"); } else if (ty->size == 4) { emitlin("\tmov\t%eax,(%rdi)"); } else { emitlin("\tmov\t%rax,(%rdi)"); } } void cmp_zero(Type *ty) { switch (ty->kind) { case TY_FLOAT: emitlin("\txorps\t%xmm1,%xmm1"); emitlin("\tucomiss\t%xmm1,%xmm0"); return; case TY_DOUBLE: emitlin("\txorpd\t%xmm1,%xmm1"); emitlin("\tucomisd\t%xmm1,%xmm0"); return; case TY_LDOUBLE: emitlin("\tfldz"); emitlin("\tfucomip"); emitlin("\tfstp\t%st"); return; case TY_INT128: emitlin("\tmov\t%rax,%r11"); emitlin("\tor\t%rdx,%r11"); return; } if (is_integer(ty) && ty->size <= 4) { emitlin("\ttest\t%eax,%eax"); } else { emitlin("\ttest\t%rax,%rax"); } } // Structs or unions equal or smaller than 16 bytes are passed // using up to two registers. // // If the first 8 bytes contains only floating-point type members, // they are passed in an XMM register. Otherwise, they are passed // in a general-purpose register. // // If a struct/union is larger than 8 bytes, the same rule is // applied to the the next 8 byte chunk. // // This function returns true if `ty` has only floating-point // members in its byte range [lo, hi). static bool has_flonum(Type *ty, int lo, int hi, int offset) { if (ty->kind == TY_STRUCT || ty->kind == TY_UNION) { for (Member *mem = ty->members; mem; mem = mem->next) { if (!has_flonum(mem->ty, lo, hi, offset + mem->offset)) { return false; } } return true; } if (ty->kind == TY_ARRAY) { for (int i = 0; i < ty->array_len; i++) { if (!has_flonum(ty->base, lo, hi, offset + ty->base->size * i)) { return false; } } return true; } return offset < lo || hi <= offset || ty->kind == TY_FLOAT || ty->kind == TY_DOUBLE; } static bool has_flonum1(Type *ty) { return has_flonum(ty, 0, 8, 0); } static bool has_flonum2(Type *ty) { return has_flonum(ty, 8, 16, 0); } static void push_struct(Type *ty) { int sz = ROUNDUP(ty->size, 8); println("\tsub\t$%d,%%rsp", sz); depth += sz / 8; for (int i = 0; i < ty->size; i++) { println("\tmov\t%d(%%rax),%%r10b", i); println("\tmov\t%%r10b,%d(%%rsp)", i); } } static void push_args2(Node *args, bool first_pass) { if (!args) return; push_args2(args->next, first_pass); if ((first_pass && !args->pass_by_stack) || (!first_pass && args->pass_by_stack)) { return; } gen_expr(args); switch (args->ty->kind) { case TY_STRUCT: case TY_UNION: push_struct(args->ty); break; case TY_FLOAT: case TY_DOUBLE: pushf(args->ty); break; case TY_LDOUBLE: emitlin("\tsub\t$16,%rsp"); emitlin("\tfstpt\t(%rsp)"); depth += 2; break; case TY_INT128: push2(); break; default: push(); } if (args->realign_stack) { pushreg("rbx"); } } // Load function call arguments. Arguments are already evaluated and // stored to the stack as local variables. What we need to do in this // function is to load them to registers or push them to the stack as // specified by the x86-64 psABI. Here is what the spec says: // // - Up to 6 arguments of integral type are passed using RDI, RSI, // RDX, RCX, R8 and R9. // // - Up to 8 arguments of floating-point type are passed using XMM0 to // XMM7. // // - If all registers of an appropriate type are already used, push an // argument to the stack in the right-to-left order. // // - Each argument passed on the stack takes 8 bytes, and the end of // the argument area must be aligned to a 16 byte boundary. // // - If a function is variadic, set the number of floating-point type // arguments to RAX. static int push_args(Node *node) { int stack = 0, gp = 0, fp = 0; // If the return type is a large struct/union, the caller passes // a pointer to a buffer as if it were the first argument. if (node->ret_buffer && node->ty->size > 16) gp++; // Load as many arguments to the registers as possible. for (Node *arg = node->args; arg; arg = arg->next) { Type *ty = arg->ty; switch (ty->kind) { case TY_STRUCT: case TY_UNION: if (ty->size > 16) { arg->pass_by_stack = true; stack += ROUNDUP(ty->size, 8) / 8; } else { bool fp1 = has_flonum1(ty); bool fp2 = has_flonum2(ty); if (fp + fp1 + fp2 < FP_MAX && gp + !fp1 + !fp2 < GP_MAX) { fp = fp + fp1 + fp2; gp = gp + !fp1 + !fp2; } else { arg->pass_by_stack = true; stack += ROUNDUP(ty->size, 8) / 8; } } break; case TY_FLOAT: case TY_DOUBLE: if (fp++ >= FP_MAX) { if ((stack & 1) && arg->ty->vector_size == 16) { arg->realign_stack = true; ++stack; } arg->pass_by_stack = true; ++stack; } break; case TY_LDOUBLE: arg->pass_by_stack = true; stack += 2; break; case TY_INT128: if (gp + 1 >= GP_MAX) { if (stack & 1) { arg->realign_stack = true; ++stack; } arg->pass_by_stack = true; stack += 2; } else { gp += 2; } break; default: if (gp++ >= GP_MAX) { arg->pass_by_stack = true; stack++; } break; } } if ((depth + stack) % 2 == 1) { emitlin("\tsub\t$8,%rsp"); depth++; stack++; } push_args2(node->args, true); push_args2(node->args, false); // If the return type is a large struct/union, the caller passes // a pointer to a buffer as if it were the first argument. if (node->ret_buffer && node->ty->size > 16) { println("\tlea\t%d(%%rbp),%%rax", node->ret_buffer->offset); push(); } return stack; } static void copy_ret_buffer(Obj *var) { /* todo(jart): wat */ Type *ty = var->ty; int gp = 0, fp = 0; if (has_flonum1(ty)) { assert(ty->size == 4 || 8 <= ty->size); if (ty->size == 4) { println("\tmovss\t%%xmm0,%d(%%rbp)", var->offset); } else { println("\tmovsd\t%%xmm0,%d(%%rbp)", var->offset); } fp++; } else { for (int i = 0; i < MIN(8, ty->size); i++) { println("\tmov\t%%al,%d(%%rbp)", var->offset + i); emitlin("\tshr\t$8,%rax"); } gp++; } if (ty->size > 8) { if (has_flonum2(ty)) { assert(ty->size == 12 || ty->size == 16); if (ty->size == 12) { println("\tmovss\t%%xmm%d,%d(%%rbp)", fp, var->offset + 8); } else { println("\tmovsd\t%%xmm%d,%d(%%rbp)", fp, var->offset + 8); } } else { char *reg1 = (gp == 0) ? "%al" : "%dl"; char *reg2 = (gp == 0) ? "%rax" : "%rdx"; for (int i = 8; i < MIN(16, ty->size); i++) { println("\tmov\t%s,%d(%%rbp)", reg1, var->offset + i); println("\tshr\t$8,%s", reg2); } } } } static void copy_struct_reg(void) { /* todo(jart): wat */ Type *ty = current_fn->ty->return_ty; int gp = 0, fp = 0; emitlin("\tmov\t%rax,%rdi"); if (has_flonum(ty, 0, 8, 0)) { assert(ty->size == 4 || 8 <= ty->size); if (ty->size == 4) { emitlin("\tmovss\t(%rdi),%xmm0"); } else if (ty->size == 8) { emitlin("\tmovsd\t(%rdi),%xmm0"); } else { emitlin("\tmovdqu\t(%rdi),%xmm0"); } fp++; } else { emitlin("\txor\t%eax,%eax"); for (int i = MIN(8, ty->size) - 1; i >= 0; i--) { emitlin("\tshl\t$8,%rax"); println("\tmov\t%d(%%rdi),%%al", i); } gp++; } if (ty->size > 8) { if (has_flonum(ty, 8, 16, 0)) { assert(ty->size == 12 || ty->size == 16); if (ty->size == 4) { println("\tmovss\t8(%%rdi),%%xmm%d", fp); } else { println("\tmovsd\t8(%%rdi),%%xmm%d", fp); } } else { char *reg1 = (gp == 0) ? "%al" : "%dl"; char *reg2 = (gp == 0) ? "%rax" : "%rdx"; char *reg3 = (gp == 0) ? "%eax" : "%edx"; println("\txor\t%s,%s", reg3, reg3); for (int i = MIN(16, ty->size) - 1; i >= 8; i--) { println("\tshl\t$8,%s", reg2); println("\tmov\t%d(%%rdi),%s", i, reg1); } } } } static void copy_struct_mem(void) { Type *ty = current_fn->ty->return_ty; Obj *var = current_fn->params; println("\tmov\t%d(%%rbp),%%rdi", var->offset); for (int i = 0; i < ty->size; i++) { println("\tmov\t%d(%%rax),%%dl", i); println("\tmov\t%%dl,%d(%%rdi)", i); } } static void builtin_alloca(void) { // Align size to 16 bytes. emitlin("\tadd\t$15,%rdi"); emitlin("\tand\t$-16,%rdi"); // Shift the temporary area by %rdi. println("\tmov\t%d(%%rbp),%%rcx", current_fn->alloca_bottom->offset); emitlin("\ \tsub\t%rsp,%rcx\n\ \tmov\t%rsp,%rax\n\ \tsub\t%rdi,%rsp\n\ \tmov\t%rsp,%rdx\n\ 1:\n\ \ttest\t%rcx,%rcx\n\ \tje\t2f\n\ \tmov\t(%rax),%r8b\n\ \tmov\t%r8b,(%rdx)\n\ \tinc\t%rdx\n\ \tinc\t%rax\n\ \tdec\t%rcx\n\ \tjmp\t1b\n\ 2:"); // Move alloca_bottom pointer. println("\tmov\t%d(%%rbp),%%rax", current_fn->alloca_bottom->offset); emitlin("\tsub\t%rdi,%rax"); println("\tmov\t%%rax,%d(%%rbp)", current_fn->alloca_bottom->offset); } static void gen_comis(Node *node, const char *op, int a, int b, const char *pred) { if (a != b) { gen_expr(node->args->next); pushf(node->args->next->ty); gen_expr(node->args); popf(node->args->next->ty, 1); } else { gen_expr(node->args); } emitlin("\txor\t%eax,%eax"); println("\t%s\t%%xmm%d,%%xmm%d", op, a, b); println("\tset%s\t%%al", pred); } static bool gen_builtin_funcall(Node *node, const char *name) { if (!strcmp(name, "alloca")) { gen_expr(node->args); emitlin("\tmov\t%rax,%rdi"); builtin_alloca(); return true; } else if (!strcmp(name, "trap")) { emitlin("\tint3"); return true; } else if (!strcmp(name, "unreachable")) { emitlin("\tud2"); return true; } else if (!strcmp(name, "frame_address")) { if (is_const_expr(node->args) && !eval(node->args)) { emitlin("\tmov\t%rbp,%rax"); } else { error_tok(node->args->tok, "must be 0"); } return true; } else if (!strcmp(name, "ctz")) { gen_expr(node->args); emitlin("\tbsf\t%eax,%eax"); return true; } else if (!strcmp(name, "ctzl") || !strcmp(name, "ctzll")) { gen_expr(node->args); emitlin("\tbsf\t%rax,%rax"); return true; } else if (!strcmp(name, "clz")) { gen_expr(node->args); emitlin("\ \tbsr\t%eax,%eax\n\ \txor\t$31,%eax"); return true; } else if (!strcmp(name, "clzl") || !strcmp(name, "clzll")) { gen_expr(node->args); emitlin("\ \tbsr\t%rax,%rax\n\ \txor\t$63,%eax"); return true; } else if (!strcmp(name, "ffs") || !strcmp(name, "ffsl") || !strcmp(name, "ffsll")) { char regprefix; gen_expr(node->args); emitlin("\tor\t$-1,%edi"); regprefix = endswith(name, "l") ? 'r' : 'e'; println("\tbsf\t%%%cax,%%%cax", regprefix, regprefix); emitlin("\tcmovz\t%edi,%eax"); emitlin("\tinc\t%eax"); return true; } else if (!strcmp(name, "bswap16")) { gen_expr(node->args); emitlin("\txchg\t%al,%ah"); return true; } else if (!strcmp(name, "bswap32")) { gen_expr(node->args); emitlin("\tbswap\t%eax"); return true; } else if (!strcmp(name, "bswap64")) { gen_expr(node->args); emitlin("\tbswap\t%rax"); return true; } else if (!strcmp(name, "popcount")) { gen_expr(node->args); if (opt_popcnt) { emitlin("\tpopcnt\t%eax,%eax"); } else { emitlin("\tmov\t%eax,%edi"); emitlin("\tcall\t__popcountsi2"); } return true; } else if (!strcmp(name, "popcountl") || !strcmp(name, "popcountll")) { gen_expr(node->args); if (opt_popcnt) { emitlin("\tpopcnt\t%rax,%rax"); } else { emitlin("\tmov\t%rax,%rdi"); emitlin("\tcall\t__popcountdi2"); } return true; } else if (!strcmp(name, "memcpy")) { if (is_const_expr(node->args->next->next)) { gen_expr(node->args); push(); gen_expr(node->args->next); emitlin("\tmov\t(%rsp),%rdi"); gen_memcpy(eval(node->args->next->next)); pop("%rax"); return true; } } else if (!strcmp(name, "isgreater")) { gen_comis(node, "comisd", 1, 0, "a"); return true; } else if (!strcmp(name, "isgreaterequal")) { gen_comis(node, "comisd", 1, 0, "ae"); return true; } else if (!strcmp(name, "isless")) { gen_comis(node, "comisd", 0, 1, "a"); return true; } else if (!strcmp(name, "islessequal")) { gen_comis(node, "comisd", 0, 1, "ae"); return true; } else if (!strcmp(name, "islessgreater")) { gen_comis(node, "ucomisd", 1, 0, "ne"); return true; } else if (!strcmp(name, "isunordered")) { gen_comis(node, "ucomisd", 1, 0, "p"); return true; } else if (!strcmp(name, "isnan")) { gen_comis(node, "ucomisd", 0, 0, "p"); return true; } else if (!strcmp(name, "nanf")) { emitlin("\ \tmov\t$0x7fc00000,%eax\n\ \tmovd\t%eax,%xmm0"); return true; } else if (!strcmp(name, "nan")) { emitlin("\ \tmov\t$0x7fffffffffffffff,%rax\n\ \tmovq\t%rax,%xmm0"); return true; } else if (!strcmp(name, "nanl")) { emitlin("\ \tpush\t$0x7fc00000\n\ \tflds\t(%rsp)\n\ \tpop\t%rax"); return true; } else if (!strcmp(name, "inff")) { emitlin("\ \tmov\t$0x7f800000,%eax\n\ \tmovd\t%eax,%xmm0"); return true; } else if (!strcmp(name, "inf")) { emitlin("\ \tmov\t$0x7ff0000000000000,%rax\n\ \tmovq\t%rax,%xmm0"); return true; } else if (!strcmp(name, "infl")) { emitlin("\ \tpush\t$0x7f800000\n\ \tflds\t(%rsp)\n\ \tpop\t%rax"); return true; } else if (!strcmp(name, "isinf")) { gen_expr(node->args); emitlin("\ \tmov\t$0x7fffffffffffffff,%rax\n\ \tmovq\t%rax,%xmm1\n\ \tandps\t%xmm1,%xmm0\n\ \tmov\t$0x7fefffffffffffff,%rax\n\ \tmovq\t%rax,%xmm1\n\ \txor\t%eax,%eax\n\ \tcomisd\t%xmm1,%xmm0\n\ \tseta\t%al"); return true; } else if (!strcmp(name, "isfinite")) { gen_expr(node->args); emitlin("\ \tmov\t$0x7fffffffffffffff,%rax\n\ \tmovq\t%rax,%xmm1\n\ \tandps\t%xmm1,%xmm0\n\ \tmov\t$0x7fefffffffffffff,%rax\n\ \tmovq\t%rax,%xmm1\n\ \txor\t%eax,%eax\n\ \tcomisd\t%xmm0,%xmm1\n\ \tsetnb\t%al"); return true; } else if (!strcmp(name, "signbitf")) { gen_expr(node->args); emitlin("\ \tmovd\t%xmm0,%eax\n\ \tand\t$-2147483648,%eax"); return true; } else if (!strcmp(name, "signbit")) { gen_expr(node->args); emitlin("\ \tmovmskpd\t%xmm0,%eax\n\ \tand\t$1,%eax"); return true; } else if (!strcmp(name, "signbitl")) { gen_expr(node->args); emitlin("\ \tfxam\n\ \tfnstsw\t%ax\n\ \tfstp\t%st\n\ \tand\t$0x200,%eax"); return true; } return false; } static int GetSseIntSuffix(Type *ty) { switch (ty->kind) { case TY_CHAR: return 'b'; case TY_SHORT: return 'w'; case TY_INT: return 'd'; case TY_LONG: return 'q'; default: UNREACHABLE(); } } static bool IsOverflowArithmetic(Node *node) { return (node->kind == ND_ADD || node->kind == ND_SUB || node->kind == ND_MUL || node->kind == ND_NEG) && node->overflow; } static void HandleOverflow(const char *ax) { pop("%rdi"); println("\tmov\t%s,(%%rdi)", ax); emitlin("\tseto\t%al"); emitlin("\tmovzbl\t%al,%eax"); } // Generate code for a given node. void gen_expr(Node *node) { char fbuf[32]; print_loc(node->tok->file->file_no, node->tok->line_no); switch (node->kind) { case ND_NULL_EXPR: return; case ND_NUM: { switch (node->ty->kind) { case TY_FLOAT: { union { float f32; uint32_t u32; } u = {node->fval}; g_ffmt_p(fbuf, &u.f32, 7, sizeof(fbuf), 0); println("\tmov\t$%#x,%%eax\t# float %s", u.u32, fbuf); emitlin("\tmovq\t%rax,%xmm0"); return; } case TY_DOUBLE: { union { double f64; uint64_t u64; } u = {node->fval}; g_dfmt_p(fbuf, &u.f64, 16, sizeof(fbuf), 0); println("\tmov\t$%#lx,%%rax\t# double %s", u.u64, fbuf); emitlin("\tmovq\t%rax,%xmm0"); return; } case TY_LDOUBLE: { union { long double f80; uint64_t u64[2]; } u; memset(&u, 0, sizeof(u)); u.f80 = node->fval; g_xfmt_p(fbuf, &u.f80, 19, sizeof(fbuf), 0); println("\tmov\t$%lu,%%rax\t# long double %s", u.u64[0], fbuf); emitlin("\tmov\t%rax,-16(%rsp)"); println("\tmov\t$%lu,%%rax", u.u64[1]); emitlin("\tmov\t%rax,-8(%rsp)"); emitlin("\tfldt\t-16(%rsp)"); return; } case TY_INT128: print_mov_imm(node->val, "%rax", "%eax"); if (node->ty->is_unsigned) { emitlin("\txor\t%edx,%edx"); } else { emitlin("\tcqto"); } return; default: print_mov_imm(node->val, "%rax", "%eax"); return; } } case ND_NEG: if (IsOverflowArithmetic(node)) { gen_expr(node->overflow); push(); } gen_expr(node->lhs); switch (node->ty->kind) { case TY_FLOAT: emitlin("\tmov\t$-2147483648,%eax"); emitlin("\tmovd\t%eax,%xmm1"); if (node->ty->vector_size == 16) { emitlin("\tpshufd\t$0,%xmm1"); } emitlin("\txorps\t%xmm1,%xmm0"); return; case TY_DOUBLE: emitlin("\tmov\t$1,%eax"); emitlin("\tror\t%rax"); emitlin("\tmovq\t%rax,%xmm1"); if (node->ty->vector_size == 16) { emitlin("\tpshufd\t$0b01000100,%xmm1"); } emitlin("\txorpd\t%xmm1,%xmm0"); return; case TY_LDOUBLE: emitlin("\tfchs"); return; case TY_INT128: emitlin("\tneg\t%rax"); emitlin("\tadc\t$0,%rdx"); emitlin("\tneg\t%rdx"); return; } char *ax; if (node->lhs->ty->kind == TY_LONG || node->lhs->ty->base) { ax = "%rax"; } else { ax = "%eax"; } println("\tneg\t%s", ax); if (IsOverflowArithmetic(node)) { HandleOverflow(ax); } return; case ND_VAR: gen_addr(node); load(node->ty); return; case ND_MEMBER: { gen_addr(node); load(node->ty); Member *mem = node->member; if (mem->is_bitfield) { println("\tshl\t$%d,%%rax", 64 - mem->bit_width - mem->bit_offset); if (mem->ty->is_unsigned) { println("\tshr\t$%d,%%rax", 64 - mem->bit_width); } else { println("\tsar\t$%d,%%rax", 64 - mem->bit_width); } } return; } case ND_DEREF: gen_expr(node->lhs); load(node->ty); return; case ND_ADDR: gen_addr(node->lhs); return; case ND_ASSIGN: gen_addr(node->lhs); push(); gen_expr(node->rhs); if (node->lhs->kind == ND_MEMBER && node->lhs->member->is_bitfield) { // If the lhs is a bitfield, we need to read the current value // from memory and merge it with a new value. emitlin("\tmov\t%rax,%r8"); Member *mem = node->lhs->member; emitlin("\tmov\t%rax,%rdi"); println("\tshl\t$%d,%%rdi", 64 - mem->bit_width); println("\tshr\t$%d,%%rdi", 64 - mem->bit_width - mem->bit_offset); emitlin("\tmov\t(%rsp),%rax"); load(mem->ty); unsigned long mask = ((1ul << mem->bit_width) - 1) << mem->bit_offset; println("\tmov\t$%#lx,%%r9", ~mask); emitlin("\tand\t%r9,%rax"); emitlin("\tor\t%rdi,%rax"); store(node->ty); emitlin("\tmov\t%r8,%rax"); return; } store(node->ty); return; case ND_STMT_EXPR: for (Node *n = node->body; n; n = n->next) gen_stmt(n); return; case ND_COMMA: gen_expr(node->lhs); gen_expr(node->rhs); return; case ND_CAST: gen_expr(node->lhs); gen_cast(node->lhs->ty, node->ty); return; case ND_MEMZERO: switch (node->var->ty->size) { case 1: println("\tmovb\t$0,%d(%%rbp)", node->var->offset); break; case 2: println("\tmovw\t$0,%d(%%rbp)", node->var->offset); break; case 4: println("\tmovl\t$0,%d(%%rbp)", node->var->offset); break; case 8: println("\tmovq\t$0,%d(%%rbp)", node->var->offset); break; case 9 ... 16: emitlin("\txor\t%eax,%eax"); println("\tmov\t%%rax,%d(%%rbp)", node->var->offset); println("\tmov\t%%rax,%d(%%rbp)", node->var->offset + 8 - (16 - node->var->ty->size)); break; default: // `rep stosb` is equivalent to `memset(%rdi, %al, %rcx)`. print_mov_imm(node->var->ty->size, "%rcx", "%ecx"); println("\tlea\t%d(%%rbp),%%rdi", node->var->offset); emitlin("\txor\t%eax,%eax"); emitlin("\trep stosb"); break; } return; case ND_COND: { int c = count(); gen_expr(node->cond); cmp_zero(node->cond->ty); println("\tje\t.L.else.%d", c); gen_expr(node->then); println("\tjmp\t.L.end.%d", c); println(".L.else.%d:", c); gen_expr(node->els); println(".L.end.%d:", c); return; } case ND_NOT: gen_expr(node->lhs); cmp_zero(node->lhs->ty); emitlin("\tsete\t%al"); emitlin("\tmovzbl\t%al,%rax"); return; case ND_BITNOT: gen_expr(node->lhs); if (node->lhs->ty->kind == TY_INT128) { emitlin("\tnot\t%rax"); emitlin("\tnot\t%rdx"); } else if (node->lhs->ty->vector_size == 16) { emitlin("\tpcmpeqd\t%xmm1,%xmm1"); emitlin("\tpxor\t%xmm1,%xmm0"); } else { emitlin("\tnot\t%rax"); } return; case ND_LOGAND: { int c = count(); gen_expr(node->lhs); cmp_zero(node->lhs->ty); println("\tje\t.L.false.%d", c); gen_expr(node->rhs); cmp_zero(node->rhs->ty); println("\tje\t.L.false.%d", c); emitlin("\tmov\t$1,%eax"); println("\tjmp\t.L.end.%d", c); println(".L.false.%d:", c); emitlin("\txor\t%eax,%eax"); println(".L.end.%d:", c); return; } case ND_LOGOR: { int c = count(); gen_expr(node->lhs); cmp_zero(node->lhs->ty); println("\tjne\t.L.true.%d", c); gen_expr(node->rhs); cmp_zero(node->rhs->ty); println("\tjne\t.L.true.%d", c); emitlin("\txor\t%eax,%eax"); println("\tjmp\t.L.end.%d", c); println(".L.true.%d:", c); emitlin("\tmov\t$1,%eax"); println(".L.end.%d:", c); return; } case ND_FUNCALL: { const char *funcname = NULL; if (node->lhs->kind == ND_VAR) { if (startswith(nameof(node->lhs->var), "__builtin_")) { funcname = nameof(node->lhs->var) + 10; if (gen_builtin_funcall(node, funcname)) { return; } } else if (!opt_no_builtin) { if (gen_builtin_funcall(node, nameof(node->lhs->var))) { return; } } } int stack_args = push_args(node); if (!funcname) { if (node->lhs->kind == ND_VAR && !node->lhs->var->is_local && !node->lhs->var->is_tls && node->lhs->ty->kind == TY_FUNC) { if (!opt_pic || node->lhs->var->is_definition) { funcname = nameof(node->lhs->var); } else { funcname = xasprintf("%s@gotpcrel(%%rip)", nameof(node->lhs->var)); } } else { gen_expr(node->lhs); funcname = NULL; } } int gp = 0, fp = 0; // If the return type is a large struct/union, the caller passes // a pointer to a buffer as if it were the first argument. if (node->ret_buffer && node->ty->size > 16) { pop(argreg64[gp++]); } for (Node *arg = node->args; arg; arg = arg->next) { Type *ty = arg->ty; switch (ty->kind) { case TY_STRUCT: case TY_UNION: if (ty->size > 16) continue; bool fp1 = has_flonum1(ty); bool fp2 = has_flonum2(ty); if (fp + fp1 + fp2 < FP_MAX && gp + !fp1 + !fp2 < GP_MAX) { if (fp1) { popf(ty, fp++); } else { pop(argreg64[gp++]); } if (ty->size > 8) { if (fp2) { popf(ty, fp++); } else { pop(argreg64[gp++]); } } } break; case TY_FLOAT: case TY_DOUBLE: if (fp < FP_MAX) { popf(ty, fp++); } break; case TY_LDOUBLE: break; case TY_INT128: if (gp + 1 < GP_MAX) { int a = gp++; int b = gp++; pop2(argreg64[a], argreg64[b]); } break; default: if (gp < GP_MAX) { pop(argreg64[gp++]); } break; } } if (funcname) { if (node->lhs->ty->is_variadic) { print_mov_imm(fp, "%rax", "%eax"); } println("\tcall\t%s", funcname); } else { if (!node->lhs->ty->is_variadic) { emitlin("\tcall\t*%rax"); } else { emitlin("\tmov\t%rax,%r10"); print_mov_imm(fp, "%rax", "%eax"); emitlin("\tcall\t*%r10"); } } if (stack_args) { println("\tadd\t$%d,%%rsp", stack_args * 8); } depth -= stack_args; // It looks like the most significant 48 or 56 bits in RAX may // contain garbage if a function return type is short or bool/char, // respectively. We clear the upper bits here. switch (node->ty->kind) { case TY_BOOL: emitlin("\tmovzbl\t%al,%eax"); return; case TY_CHAR: if (node->ty->is_unsigned) { emitlin("\tmovzbl\t%al,%eax"); } else { emitlin("\tmovsbl\t%al,%eax"); } return; case TY_SHORT: if (node->ty->is_unsigned) { emitlin("\tmovzwl\t%ax,%eax"); } else { emitlin("\tcwtl"); } return; } // If the return type is a small struct, a value is returned // using up to two registers. if (node->ret_buffer && node->ty->size <= 16) { copy_ret_buffer(node->ret_buffer); println("\tlea\t%d(%%rbp),%%rax", node->ret_buffer->offset); } return; } case ND_LABEL_VAL: if (opt_pic) { println("\tlea\t%s(%%rip),%%rax", node->unique_label); } else { println("\tmov\t$%s,%%eax", node->unique_label); } return; case ND_CAS: { gen_expr(node->cas_addr); push(); gen_expr(node->cas_new); push(); gen_expr(node->cas_old); emitlin("\tmov\t%rax,%r8"); load(node->cas_old->ty->base); pop("%rdx"); // new pop("%rdi"); // addr int sz = node->cas_addr->ty->base->size; println("\tlock cmpxchg %s,(%%rdi)", reg_dx(sz)); emitlin("\tsete\t%cl"); emitlin("\tje\t1f"); println("\tmov\t%s,(%%r8)", reg_ax(sz)); emitlin("1:"); emitlin("\tmovzbl\t%cl,%eax"); return; } case ND_EXCH: { gen_expr(node->lhs); push(); gen_expr(node->rhs); pop("%rdi"); int sz = node->lhs->ty->base->size; println("\txchg\t%s,(%%rdi)", reg_ax(sz)); return; } case ND_FPCLASSIFY: gen_fpclassify(node->fpc); return; } switch (node->lhs->ty->kind) { case TY_FLOAT: case TY_DOUBLE: { gen_expr(node->rhs); pushf(node->rhs->ty); gen_expr(node->lhs); popf(node->rhs->ty, 1); char sd = node->lhs->ty->kind == TY_FLOAT ? 's' : 'd'; char ps = node->lhs->ty->vector_size == 16 ? 'p' : 's'; switch (node->kind) { case ND_ADD: println("\tadd%c%c\t%%xmm1,%%xmm0", ps, sd); return; case ND_SUB: println("\tsub%c%c\t%%xmm1,%%xmm0", ps, sd); return; case ND_MUL: println("\tmul%c%c\t%%xmm1,%%xmm0", ps, sd); return; case ND_DIV: println("\tdiv%c%c\t%%xmm1,%%xmm0", ps, sd); return; case ND_EQ: case ND_NE: case ND_LT: case ND_LE: println("\tucomis%c\t%%xmm0,%%xmm1", sd); if (node->kind == ND_EQ) { emitlin("\ \tsete\t%al\n\ \tsetnp\t%dl\n\ \tand\t%dl,%al"); } else if (node->kind == ND_NE) { emitlin("\ \tsetne\t%al\n\ \tsetp\t%dl\n\ \tor\t%dl,%al"); } else if (node->kind == ND_LT) { emitlin("\tseta\t%al"); } else { emitlin("\tsetae\t%al"); } emitlin("\tand\t$1,%al"); emitlin("\tmovzbl\t%al,%eax"); return; } error_tok(node->tok, "invalid expression"); } case TY_LDOUBLE: { gen_expr(node->lhs); gen_expr(node->rhs); switch (node->kind) { case ND_ADD: emitlin("\tfaddp"); return; case ND_SUB: emitlin("\tfsubrp"); return; case ND_MUL: emitlin("\tfmulp"); return; case ND_DIV: emitlin("\tfdivrp"); return; case ND_EQ: case ND_NE: case ND_LT: case ND_LE: emitlin("\tfcomip"); emitlin("\tfstp\t%st"); if (node->kind == ND_EQ) emitlin("\tsete\t%al"); else if (node->kind == ND_NE) emitlin("\tsetne\t%al"); else if (node->kind == ND_LT) emitlin("\tseta\t%al"); else emitlin("\tsetae\t%al"); emitlin("\tmovzbl\t%al,%eax"); return; } error_tok(node->tok, "invalid expression"); } } if (IsOverflowArithmetic(node)) { gen_expr(node->overflow); push(); } if (node->lhs->ty->vector_size == 16) { gen_expr(node->rhs); pushx(); gen_expr(node->lhs); popx(1); } else if (node->rhs->ty->kind == TY_INT128) { gen_expr(node->rhs); push2(); gen_expr(node->lhs); pop2("%rdi", "%rsi"); } else if (!opt_pic && is_const_expr(node->rhs)) { /* shortcut path for immediates */ char **label = NULL; uint64_t val = eval2(node->rhs, &label); gen_expr(node->lhs); if (label) { println("\tmov\t$%s%+ld,%%edi", *label, val); } else { print_mov_imm(val, "%rdi", "%edi"); } } else { gen_expr(node->rhs); push(); gen_expr(node->lhs); pop("%rdi"); } char *ax, *di, *dx; if (node->lhs->ty->kind == TY_LONG || node->lhs->ty->base) { ax = "%rax"; di = "%rdi"; dx = "%rdx"; } else { ax = "%eax"; di = "%edi"; dx = "%edx"; } switch (node->kind) { case ND_ADD: if (node->lhs->ty->kind == TY_INT128) { emitlin("\tadd\t%rdi,%rax"); emitlin("\tadc\t%rsi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { println("\tpadd%c\t%%xmm1,%%xmm0", GetSseIntSuffix(node->lhs->ty)); } else { println("\tadd\t%s,%s", di, ax); } break; case ND_SUB: if (node->lhs->ty->kind == TY_INT128) { emitlin("\tsub\t%rdi,%rax"); emitlin("\tsbb\t%rsi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { println("\tpsub%c\t%%xmm1,%%xmm0", GetSseIntSuffix(node->lhs->ty)); } else { println("\tsub\t%s,%s", di, ax); } break; case ND_MUL: if (node->lhs->ty->kind == TY_INT128) { emitlin("\ \timul\t%rdi,%rdx\n\ \timul\t%rax,%rsi\n\ \tadd\t%rdx,%rsi\n\ \tmul\t%rdi\n\ \tadd\t%rsi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { switch (node->lhs->ty->kind) { case TY_CHAR: emitlin("\ \tmovaps\t%xmm1,%xmm2\n\ \tmovaps\t%xmm0,%xmm3\n\ \tpunpcklbw %xmm0,%xmm3\n\ \tpunpcklbw %xmm1,%xmm2\n\ \tpmullw\t%xmm3,%xmm2\n\ \tpunpckhbw %xmm0,%xmm0\n\ \tpcmpeqd\t%xmm3,%xmm3\n\ \tpunpckhbw %xmm1,%xmm1\n\ \tpmullw\t%xmm0,%xmm1\n\ \tandps\t%xmm3,%xmm2\n\ \tmovaps\t%xmm2,%xmm0\n\ \tandps\t%xmm3,%xmm1\n\ \tpackuswb %xmm1,%xmm0"); break; case TY_SHORT: emitlin("\tpmullw\t%xmm1,%xmm0"); break; case TY_INT: if (opt_sse4) { emitlin("\tpmulld\t%xmm1,%xmm0"); } else { emitlin("\ \tmovaps\t%xmm0,%xmm2\n\ \tpsrlq\t$32,%xmm0\n\ \tpmuludq\t%xmm1,%xmm2\n\ \tpsrlq\t$32,%xmm1\n\ \tpmuludq\t%xmm1,%xmm0\n\ \tpshufd\t$8,%xmm2,%xmm2\n\ \tpshufd\t$8,%xmm0,%xmm1\n\ \tpunpckldq %xmm1,%xmm2\n\ \tmovaps\t%xmm2,%xmm0"); } case TY_LONG: emitlin("\ \tmovaps\t%xmm1,%xmm2\n\ \tmovaps\t%xmm0,%xmm3\n\ \tpmuludq\t%xmm1,%xmm3\n\ \tmovaps\t%xmm2,%xmm4\n\ \tmovaps\t%xmm0,%xmm1\n\ \tpsrlq\t$32,%xmm4\n\ \tpsrlq\t$32,%xmm1\n\ \tpmuludq\t%xmm4,%xmm0\n\ \tpmuludq\t%xmm2,%xmm1\n\ \tpaddq\t%xmm0,%xmm1\n\ \tpsllq\t$32,%xmm1\n\ \tpaddq\t%xmm1,%xmm3\n\ \tmovaps\t%xmm3,%xmm0"); break; default: UNREACHABLE(); } } else { println("\timul\t%s,%s", di, ax); } break; case ND_DIV: case ND_REM: if (node->lhs->ty->kind == TY_INT128) { bool skew; if ((skew = (depth & 1))) { emitlin("\tsub\t$8,%rsp"); depth++; } emitlin("\ \tmov\t%rsi,%rcx\n\ \tmov\t%rdx,%rsi\n\ \tmov\t%rdi,%rdx\n\ \tmov\t%rax,%rdi"); if (node->kind == ND_DIV) { if (node->ty->is_unsigned) { emitlin("\tcall\t__udivti3"); } else { emitlin("\tcall\t__divti3"); } } else { if (node->ty->is_unsigned) { emitlin("\tcall\t__umodti3"); } else { emitlin("\tcall\t__modti3"); } } if (skew) { emitlin("\tadd\t$8,%rsp"); depth--; } } else if (node->lhs->ty->vector_size == 16) { error_tok(node->tok, "no div/rem sse instruction"); } else { if (node->ty->is_unsigned) { emitlin("\txor\t%edx,%edx"); println("\tdiv\t%s", di); } else { if (node->lhs->ty->size == 8) { emitlin("\tcqto"); } else { emitlin("\tcltd"); } println("\tidiv\t%s", di); } if (node->kind == ND_REM) { emitlin("\tmov\t%rdx,%rax"); } } break; case ND_BINAND: if (node->lhs->ty->kind == TY_INT128) { emitlin("\tand\t%rdi,%rax"); emitlin("\tand\t%rsi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { emitlin("\tpand\t%xmm1,%xmm0"); } else { println("\tand\t%s,%s", di, ax); } break; case ND_BINOR: if (node->lhs->ty->kind == TY_INT128) { emitlin("\tor\t%rdi,%rax"); emitlin("\tor\t%rsi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { emitlin("\tpor\t%xmm1,%xmm0"); } else { println("\tor\t%s,%s", di, ax); } break; case ND_BINXOR: if (node->lhs->ty->kind == TY_INT128) { emitlin("\txor\t%rdi,%rax"); emitlin("\txor\t%rsi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { emitlin("\tpxor\t%xmm1,%xmm0"); } else { println("\txor\t%s,%s", di, ax); } break; case ND_EQ: case ND_NE: case ND_LT: case ND_LE: if (node->lhs->ty->kind == TY_INT128) { switch (node->kind) { case ND_EQ: emitlin("\ \txor\t%rax,%rdi\n\ \txor\t%rdx,%rsi\n\ \tor\t%rsi,%rdi\n\ \tsete\t%al"); break; case ND_NE: emitlin("\ \txor\t%rax,%rdi\n\ \txor\t%rdx,%rsi\n\ \tor\t%rsi,%rdi\n\ \tsetne\t%al"); break; case ND_LT: if (node->lhs->ty->is_unsigned) { emitlin("\ \tcmp\t%rdi,%rax\n\ \tmov\t%rdx,%rax\n\ \tsbb\t%rsi,%rax\n\ \tsetc\t%al"); } else { emitlin("\ \tcmp\t%rdi,%rax\n\ \tmov\t%rdx,%rax\n\ \tsbb\t%rsi,%rax\n\ \tsetl\t%al"); } break; case ND_LE: if (node->lhs->ty->is_unsigned) { emitlin("\ \tcmp\t%rax,%rdi\n\ \tsbb\t%rdx,%rsi\n\ \tsetnc\t%al"); } else { emitlin("\ \tcmp\t%rax,%rdi\n\ \tsbb\t%rdx,%rsi\n\ \tsetge\t%al"); } break; } } else if (node->lhs->ty->vector_size == 16) { switch (node->kind) { case ND_EQ: switch (node->lhs->ty->kind) { case TY_CHAR: case TY_SHORT: case TY_INT: println("\tpcmpeq%c\t%%xmm1,%%xmm0", GetSseIntSuffix(node->lhs->ty)); break; default: error_tok(node->tok, "todo sse eq"); } break; case ND_NE: switch (node->lhs->ty->kind) { case TY_CHAR: case TY_SHORT: case TY_INT: println("\tpcmpeq%c\t%%xmm1,%%xmm0", GetSseIntSuffix(node->lhs->ty)); emitlin("\tpcmpeqd\t%xmm1,%xmm1\n" "\tandnps\t%xmm1,%xmm0"); break; default: error_tok(node->tok, "todo sse ne"); } break; case ND_LT: if (node->lhs->ty->is_unsigned) { switch (node->lhs->ty->kind) { case TY_CHAR: case TY_SHORT: println("\tpsubus%c\t%%xmm0,%%xmm1", GetSseIntSuffix(node->lhs->ty)); emitlin("\tmovaps\t%xmm1,%xmm0\n" "\txorps\t%xmm1,%xmm1"); println("\tpcmpeq%c\t%%xmm1,%%xmm0", GetSseIntSuffix(node->lhs->ty)); emitlin("\tpcmpeqd\t%xmm1,%xmm1\n" "\tandnps\t%xmm1,%xmm0"); break; default: error_tok(node->tok, "todo sse ltu"); } } else { switch (node->lhs->ty->kind) { case TY_CHAR: case TY_SHORT: case TY_INT: println("\tpcmpgt%c\t%%xmm0,%%xmm1", GetSseIntSuffix(node->lhs->ty)); emitlin("\tmovaps\t%xmm1,%xmm0"); break; default: error_tok(node->tok, "todo sse lt"); } } break; case ND_LE: if (node->lhs->ty->is_unsigned) { switch (node->lhs->ty->kind) { case TY_CHAR: emitlin("\tpminub\t%xmm0,%xmm1\n" "\tpcmpeqb\t%xmm1,%xmm0"); break; case TY_SHORT: emitlin("\tpsubusw\t%xmm1,%xmm0\n" "\txorps\t%xmm1,%xmm1\n" "\tpcmpeqw\t%xmm1,%xmm0"); break; case TY_INT: emitlin("\tmov\t$-2147483648,%eax\n" "\tmovd\t%eax,%xmm2\n" "\tpshufd\t$0,%xmm2"); emitlin("\tpcmpgtd\t%xmm1,%xmm0"); emitlin("\tpcmpeqd\t%xmm1,%xmm1\n" "\tandnps\t%xmm1,%xmm0"); break; default: error_tok(node->tok, "todo sse leu"); } } else { switch (node->lhs->ty->kind) { case TY_SHORT: emitlin("\tpminsw\t%xmm0,%xmm1\n" "\tpcmpeqw\t%xmm1,%xmm0"); break; case TY_CHAR: case TY_INT: println("\tpcmpgt%c\t%%xmm1,%%xmm0", GetSseIntSuffix(node->lhs->ty)); emitlin("\tpcmpeqd\t%xmm1,%xmm1\n" "\tandnps\t%xmm1,%xmm0"); break; default: error_tok(node->tok, "todo sse le"); } } break; } } else { println("\tcmp\t%s,%s", di, ax); if (node->kind == ND_EQ) { emitlin("\tsete\t%al"); } else if (node->kind == ND_NE) { emitlin("\tsetne\t%al"); } else if (node->kind == ND_LT) { if (node->lhs->ty->is_unsigned) { emitlin("\tsetb\t%al"); } else { emitlin("\tsetl\t%al"); } } else if (node->kind == ND_LE) { if (node->lhs->ty->is_unsigned) { emitlin("\tsetbe\t%al"); } else { emitlin("\tsetle\t%al"); } } } emitlin("\tmovzbl\t%al,%eax"); break; case ND_SHL: emitlin("\tmov\t%edi,%ecx"); if (node->lhs->ty->kind == TY_INT128) { emitlin("\ \tshld\t%cl,%rax,%rdx\n\ \tshl\t%cl,%rax\n\ \txor\t%edi,%edi\n\ \tand\t$64,%cl\n\ \tcmovne\t%rax,%rdx\n\ \tcmovne\t%rdi,%rax"); } else if (node->lhs->ty->vector_size == 16) { error_tok(node->tok, "todo sse shl"); } else { println("\tshl\t%%cl,%s", ax); } break; case ND_SHR: emitlin("\tmov\t%edi,%ecx"); if (node->lhs->ty->is_unsigned) { if (node->lhs->ty->kind == TY_INT128) { emitlin("\ \tshrd\t%cl,%rdx,%rax\n\ \tshr\t%cl,%rdx\n\ \txor\t%edi,%edi\n\ \tand\t$64,%cl\n\ \tcmovne\t%rdx,%rax\n\ \tcmovne\t%rdi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { error_tok(node->tok, "todo sse shr"); } else { println("\tshr\t%%cl,%s", ax); } } else { if (node->lhs->ty->kind == TY_INT128) { emitlin("\ \tshrd\t%cl,%rdx,%rax\n\ \tsar\t%cl,%rdx\n\ \tmov\t%rdx,%rdi\n\ \tsar\t$63,%rdi\n\ \tand\t$64,%cl\n\ \tcmovne\t%rdx,%rax\n\ \tcmovne\t%rdi,%rdx"); } else if (node->lhs->ty->vector_size == 16) { error_tok(node->tok, "todo sse sar"); } else { println("\tsar\t%%cl,%s", ax); } } break; default: error_tok(node->tok, "invalid expression"); } if (IsOverflowArithmetic(node)) { HandleOverflow(ax); } } void gen_stmt(Node *node) { print_loc(node->tok->file->file_no, node->tok->line_no); switch (node->kind) { case ND_IF: { int c = count(); gen_expr(node->cond); cmp_zero(node->cond->ty); println("\tje\t.L.else.%d", c); gen_stmt(node->then); println("\tjmp\t.L.end.%d", c); println(".L.else.%d:", c); if (node->els) gen_stmt(node->els); println(".L.end.%d:", c); return; } case ND_FOR: { int c = count(); if (node->init) gen_stmt(node->init); println(".L.begin.%d:", c); if (node->cond) { gen_expr(node->cond); cmp_zero(node->cond->ty); println("\tje\t%s", node->brk_label); } gen_stmt(node->then); println("%s:", node->cont_label); if (node->inc) gen_expr(node->inc); println("\tjmp\t.L.begin.%d", c); println("%s:", node->brk_label); return; } case ND_DO: { int c = count(); println(".L.begin.%d:", c); gen_stmt(node->then); println("%s:", node->cont_label); gen_expr(node->cond); cmp_zero(node->cond->ty); println("\tjne\t.L.begin.%d", c); println("%s:", node->brk_label); return; } case ND_SWITCH: // TODO(jart): 64-bit? gen_expr(node->cond); for (Node *n = node->case_next; n; n = n->case_next) { char *ax = (node->cond->ty->size == 8) ? "%rax" : "%eax"; char *di = (node->cond->ty->size == 8) ? "%rdi" : "%edi"; if (n->begin == n->end) { println("\tcmp\t$%ld,%s", n->begin, ax); println("\tje\t%s", n->label); continue; } // [GNU] Case ranges println("\tmov\t%s,%s", ax, di); println("\tsub\t$%ld,%s", n->begin, di); println("\tcmp\t$%ld,%s", n->end - n->begin, di); println("\tjbe\t%s", n->label); } if (node->default_case) { println("\tjmp\t%s", node->default_case->label); } println("\tjmp\t%s", node->brk_label); gen_stmt(node->then); println("%s:", node->brk_label); return; case ND_CASE: println("%s:", node->label); gen_stmt(node->lhs); return; case ND_BLOCK: for (Node *n = node->body; n; n = n->next) gen_stmt(n); return; case ND_GOTO: println("\tjmp\t%s", node->unique_label); return; case ND_GOTO_EXPR: gen_expr(node->lhs); emitlin("\tjmp\t*%rax"); return; case ND_LABEL: println("%s:", node->unique_label); gen_stmt(node->lhs); return; case ND_RETURN: if (node->lhs) { gen_expr(node->lhs); Type *ty = node->lhs->ty; switch (ty->kind) { case TY_STRUCT: case TY_UNION: if (ty->size <= 16) { copy_struct_reg(); } else { copy_struct_mem(); } break; } } println("\tjmp\t.L.return.%s", nameof(current_fn)); return; case ND_EXPR_STMT: gen_expr(node->lhs); return; case ND_ASM: gen_asm(node->azm); return; } error_tok(node->tok, "invalid statement"); } // Assign offsets to local variables. static void assign_lvar_offsets(Obj *prog) { for (Obj *fn = prog; fn; fn = fn->next) { if (!fn->is_function) continue; // If a function has many parameters, some parameters are // inevitably passed by stack rather than by register. // The first passed-by-stack parameter resides at RBP+16. int top = 16; int bottom = 0; int gp = 0, fp = 0; // Assign offsets to pass-by-stack parameters. for (Obj *var = fn->params; var; var = var->next) { Type *ty = var->ty; switch (ty->kind) { case TY_STRUCT: case TY_UNION: if (ty->size <= 16) { bool fp1 = has_flonum(ty, 0, 8, 0); bool fp2 = has_flonum(ty, 8, 16, 8); if (fp + fp1 + fp2 < FP_MAX && gp + !fp1 + !fp2 < GP_MAX) { fp = fp + fp1 + fp2; gp = gp + !fp1 + !fp2; continue; } } break; case TY_FLOAT: case TY_DOUBLE: if (fp++ < FP_MAX) continue; break; case TY_LDOUBLE: break; case TY_INT128: gp++; if (gp++ < GP_MAX) continue; default: if (gp++ < GP_MAX) continue; } top = ROUNDUP(top, 8); var->offset = top; top += var->ty->size; } // Assign offsets to pass-by-register parameters and local variables. for (Obj *var = fn->locals; var; var = var->next) { if (var->offset) continue; // AMD64 System V ABI has a special alignment rule for an array of // length at least 16 bytes. We need to align such array to at least // 16-byte boundaries. See p.14 of // https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-draft.pdf. int align = (var->ty->kind == TY_ARRAY && var->ty->size >= 16) ? MAX(16, var->align) : var->align; bottom += var->ty->size; bottom = ROUNDUP(bottom, align); var->offset = -bottom; } fn->stack_size = ROUNDUP(bottom, 16); } } static void emit_data(Obj *prog) { for (Obj *var = prog; var; var = var->next) { if (var->is_function || !var->is_definition) continue; flushln(); fputc('\n', output_stream); print_visibility(var); int align = (var->ty->kind == TY_ARRAY && var->ty->size >= 16) ? MAX(16, var->align) : var->align; if (opt_common && var->is_tentative) { println("\t.comm\t%s,%d,%d", nameof(var), var->ty->size, align); } else { if (var->section) { println("\t.section %s,\"aw\",@%s", var->section, var->init_data ? "progbits" : "nobits"); } else if (var->is_tls) { println("\t.section .t%s,\"awT\",@%s", var->init_data ? "progbits" : "nobits", var->init_data ? "data" : "bss"); #if 0 /* TODO: unflag if assigned to array */ } else if (align <= 1 && var->is_string_literal) { println("\t.section .rodata.str1.1,\"aSM\",@progbits,1"); #endif } else if (opt_data_sections) { println("\tsection .%s.%s", var->init_data ? "data" : "bss", nameof(var)); } else { println("\t.%s", var->init_data ? "data" : "bss"); } print_align(align); println("\t.type\t%s,@object", nameof(var)); println("\t.size\t%s,%d", nameof(var), var->ty->size); println("%s:", nameof(var)); if (var->init_data) { int pos = 0; Relocation *rel = var->rel; if (!rel && is_quotable_string(var->init_data, var->ty->size)) { println("\t.asciz\t\"%s\"", var->init_data); } else { while (pos < var->ty->size) { if (rel && rel->offset == pos) { assert(pos + 8 <= var->ty->size); println("\t.quad\t%s%+ld", *rel->label, rel->addend); rel = rel->next; pos += 8; } else { println("\t.byte\t%d", var->init_data[pos++]); } } } } else { println("\t.zero\t%d", var->ty->size); } } } } static void store_fp(int r, int offset, int sz) { switch (sz) { case 4: println("\tmovss\t%%xmm%d,%d(%%rbp)", r, offset); return; case 8: println("\tmovsd\t%%xmm%d,%d(%%rbp)", r, offset); return; case 16: println("\tmovaps\t%%xmm%d,%d(%%rbp)", r, offset); return; } UNREACHABLE(); } static void store_gp(int r, int offset, int sz) { switch (sz) { case 1: println("\tmov\t%s,%d(%%rbp)", argreg8[r], offset); return; case 2: println("\tmov\t%s,%d(%%rbp)", argreg16[r], offset); return; case 4: println("\tmov\t%s,%d(%%rbp)", argreg32[r], offset); return; case 8: println("\tmov\t%s,%d(%%rbp)", argreg64[r], offset); return; default: for (int i = 0; i < sz; i++) { println("\tmov\t%s,%d(%%rbp)", argreg8[r], offset + i); println("\tshr\t$8,%s", argreg64[r]); } return; } } static void emit_text(Obj *prog) { for (Obj *fn = prog; fn; fn = fn->next) { if (!fn->is_function || !fn->is_definition) continue; // No code is emitted for "static inline" functions // if no one is referencing them. if (!fn->is_live) continue; flushln(); fputc('\n', output_stream); if (fn->section) { println("\t.section %s,\"a\",@progbits", fn->section); } else if (opt_function_sections) { println("\tsection .text.%s", nameof(fn)); } else { emitlin("\t.text"); } print_visibility(fn); print_align(fn->align); println("\t.type\t%s,@function", nameof(fn)); println("%s:", nameof(fn)); current_fn = fn; // Prologue emitlin("\tpush\t%rbp"); emitlin("\tmov\t%rsp,%rbp"); if (opt_nop_mcount) { print_profiling_nop(); } else if (opt_fentry) { emitlin("\tcall\t__fentry__@gotpcrel(%rip)"); } else if (opt_pg) { emitlin("\tcall\tmcount@gotpcrel(%rip)"); } else { print_profiling_nop(); } println("\tsub\t$%d,%%rsp", fn->stack_size); println("\tmov\t%%rsp,%d(%%rbp)", fn->alloca_bottom->offset); // Save arg registers if function is variadic if (fn->va_area) { int gp = 0, fp = 0; for (Obj *var = fn->params; var; var = var->next) { if (is_flonum(var->ty)) { fp++; } else { gp++; } } int off = fn->va_area->offset; // va_elem println("\tmovl\t$%d,%d(%%rbp)", gp * 8, off); // gp_offset println("\tmovl\t$%d,%d(%%rbp)", fp * 8 + 48, off + 4); // fp_offset println("\tmov\t%%rbp,%d(%%rbp)", off + 8); // overflow_arg_area println("\taddq\t$16,%d(%%rbp)", off + 8); println("\tmov\t%%rbp,%d(%%rbp)", off + 16); // reg_save_area println("\taddq\t$%d,%d(%%rbp)", off + 24, off + 16); // __reg_save_area__ println("\tmov\t%%rdi,%d(%%rbp)", off + 24); println("\tmov\t%%rsi,%d(%%rbp)", off + 32); println("\tmov\t%%rdx,%d(%%rbp)", off + 40); println("\tmov\t%%rcx,%d(%%rbp)", off + 48); println("\tmov\t%%r8,%d(%%rbp)", off + 56); println("\tmov\t%%r9,%d(%%rbp)", off + 64); println("\tmovsd\t%%xmm0,%d(%%rbp)", off + 72); println("\tmovsd\t%%xmm1,%d(%%rbp)", off + 80); println("\tmovsd\t%%xmm2,%d(%%rbp)", off + 88); println("\tmovsd\t%%xmm3,%d(%%rbp)", off + 96); println("\tmovsd\t%%xmm4,%d(%%rbp)", off + 104); println("\tmovsd\t%%xmm5,%d(%%rbp)", off + 112); println("\tmovsd\t%%xmm6,%d(%%rbp)", off + 120); println("\tmovsd\t%%xmm7,%d(%%rbp)", off + 128); } // Save passed-by-register arguments to the stack int gp = 0, fp = 0; for (Obj *var = fn->params; var; var = var->next) { if (var->offset > 0) continue; Type *ty = var->ty; switch (ty->kind) { case TY_STRUCT: case TY_UNION: assert(ty->size <= 16); if (has_flonum(ty, 0, 8, 0)) { store_fp(fp++, var->offset, MIN(8, ty->size)); } else { store_gp(gp++, var->offset, MIN(8, ty->size)); } if (ty->size > 8) { if (has_flonum(ty, 8, 16, 0)) { store_fp(fp++, var->offset + 8, ty->size - 8); } else { store_gp(gp++, var->offset + 8, ty->size - 8); } } break; case TY_FLOAT: case TY_DOUBLE: store_fp(fp++, var->offset, ty->size); break; case TY_INT128: store_gp(gp++, var->offset + 0, 8); store_gp(gp++, var->offset + 8, 8); break; default: store_gp(gp++, var->offset, ty->size); } } // Emit code gen_stmt(fn->body); assert(!depth); // [https://www.sigbus.info/n1570#5.1.2.2.3p1] The C spec defines // a special rule for the main function. Reaching the end of the // main function is equivalent to returning 0, even though the // behavior is undefined for the other functions. if (strcmp(nameof(fn), "main") == 0) { emitlin("\txor\t%eax,%eax"); } // Epilogue println(".L.return.%s:", nameof(fn)); if (fn->is_noreturn) { emitlin("\tud2"); } else { emitlin("\tleave"); emitlin("\tret"); } println("\t.size\t%s,.-%s", nameof(fn), nameof(fn)); if (fn->is_constructor) { emitlin("\t.section .ctors,\"aw\",@progbits"); emitlin("\t.align\t8"); println("\t.quad\t%s", nameof(fn)); } if (fn->is_destructor) { emitlin("\t.section .dtors,\"aw\",@progbits"); emitlin("\t.align\t8"); println("\t.quad\t%s", nameof(fn)); } } } static void emit_staticasms(StaticAsm *a) { if (!a) return; emit_staticasms(a->next); gen_asm(a->body); } void codegen(Obj *prog, FILE *out) { output_stream = out; File **files = get_input_files(); for (int i = 0; files[i]; i++) { println("\t.file\t%d %`'s", files[i]->file_no, files[i]->name); } assign_lvar_offsets(prog); emit_staticasms(staticasms); emit_data(prog); emit_text(prog); flushln(); }