diff --git a/third_party/chibicc/alloc.c b/third_party/chibicc/alloc.c new file mode 100644 index 00000000..9f5cf9a6 --- /dev/null +++ b/third_party/chibicc/alloc.c @@ -0,0 +1,44 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2020 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. 
│ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "third_party/chibicc/chibicc.h" + +long alloc_node_count; +long alloc_token_count; +long alloc_obj_count; +long alloc_type_count; + +Node *alloc_node(void) { + ++alloc_node_count; + return calloc(1, sizeof(Node)); +} + +Token *alloc_token(void) { + ++alloc_token_count; + return calloc(1, sizeof(Token)); +} + +Obj *alloc_obj(void) { + ++alloc_obj_count; + return calloc(1, sizeof(Obj)); +} + +Type *alloc_type(void) { + ++alloc_type_count; + return calloc(1, sizeof(Type)); +} diff --git a/third_party/chibicc/chibicc.c b/third_party/chibicc/chibicc.c index 3428af16..4f0ccab0 100644 --- a/third_party/chibicc/chibicc.c +++ b/third_party/chibicc/chibicc.c @@ -141,7 +141,16 @@ static char *quote_makefile(char *s) { static void PrintMemoryUsage(void) { struct mallinfo mi; mi = mallinfo(); + fprintf(stderr, "\n"); fprintf(stderr, "allocated %,ld bytes of memory\n", mi.arena); + fprintf(stderr, "allocated %,ld nodes (%,ld bytes)\n", alloc_node_count, + sizeof(Node) * alloc_node_count); + fprintf(stderr, "allocated %,ld tokens (%,ld bytes)\n", alloc_token_count, + sizeof(Token) * alloc_token_count); + fprintf(stderr, "allocated %,ld objs (%,ld bytes)\n", alloc_obj_count, + sizeof(Obj) * alloc_obj_count); + fprintf(stderr, "allocated %,ld types (%,ld bytes)\n", alloc_type_count, + sizeof(Type) * alloc_type_count); } static void strarray_push_comma(StringArray *a, char *s) { diff --git a/third_party/chibicc/chibicc.h b/third_party/chibicc/chibicc.h index 916af0c4..ba1ddb9d 100644 --- a/third_party/chibicc/chibicc.h +++ b/third_party/chibicc/chibicc.h @@ -62,14 +62,14 @@ void strarray_push(StringArray *, char *); // tokenize.c // -// Token typedef enum { - TK_RESERVED, // Keywords or punctuators - TK_IDENT, // Identifiers - TK_STR, // String literals - TK_NUM, // Numeric literals - TK_PP_NUM, // Preprocessing numbers - TK_EOF, // End-of-file markers + TK_IDENT, // Identifiers + TK_PUNCT, 
// Punctuators + TK_KEYWORD, // Keywords + TK_STR, // String literals + TK_NUM, // Numeric literals + TK_PP_NUM, // Preprocessing numbers + TK_EOF, // End-of-file markers } TokenKind; struct File { @@ -81,15 +81,14 @@ struct File { int line_delta; }; -// Token type -struct Token { - TokenKind kind; // Token kind +struct thatispacked Token { + Token *next; // Next token int len; // Token length int line_no; // Line number int line_delta; // Line number + TokenKind kind; // Token kind bool at_bol; // True if this token is at beginning of line bool has_space; // True if this token follows a space character - Token *next; // Next token char *loc; // Token location Type *ty; // Used if TK_NUM or TK_STR File *file; // Source location @@ -518,7 +517,7 @@ int encode_utf8(char *, uint32_t); uint32_t decode_utf8(char **, char *); bool is_ident1(uint32_t); bool is_ident2(uint32_t); -int str_width(char *, int); +int display_width(char *, int); // // hashmap.c @@ -564,6 +563,20 @@ extern bool opt_sse4; extern bool opt_verbose; extern char *base_file; +// +// alloc.c +// + +extern long alloc_node_count; +extern long alloc_token_count; +extern long alloc_obj_count; +extern long alloc_type_count; + +Node *alloc_node(void); +Token *alloc_token(void); +Obj *alloc_obj(void); +Type *alloc_type(void); + COSMOPOLITAN_C_END_ #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ #endif /* COSMOPOLITAN_THIRD_PARTY_CHIBICC_CHIBICC_H_ */ diff --git a/third_party/chibicc/codegen.c b/third_party/chibicc/codegen.c index c6c74a05..1c5fef7c 100644 --- a/third_party/chibicc/codegen.c +++ b/third_party/chibicc/codegen.c @@ -2219,7 +2219,7 @@ static void emit_data(Obj *prog) { int align = (var->ty->kind == TY_ARRAY && var->ty->size >= 16) ? 
MAX(16, var->align) : var->align; - if (opt_common && var->is_tentative && !var->is_tls) { + if (opt_common && var->is_tentative) { println("\t.comm\t%s,%d,%d", nameof(var), var->ty->size, align); } else { if (var->section) { @@ -2410,10 +2410,10 @@ static void emit_text(Obj *prog) { // Emit code gen_stmt(fn->body); assert(!depth); - // The C spec defines a special rule for the main function. - // Reaching the end of the main function is equivalent to - // returning 0, even though the behavior is undefined for the - // other functions. See C11 5.1.2.2.3. + // [https://www.sigbus.info/n1570#5.1.2.2.3p1] The C spec defines + // a special rule for the main function. Reaching the end of the + // main function is equivalent to returning 0, even though the + // behavior is undefined for the other functions. if (strcmp(nameof(fn), "main") == 0) { emitlin("\txor\t%eax,%eax"); } diff --git a/third_party/chibicc/hashmap.c b/third_party/chibicc/hashmap.c index 8406732e..5a0d5a83 100644 --- a/third_party/chibicc/hashmap.c +++ b/third_party/chibicc/hashmap.c @@ -2,7 +2,10 @@ #include "third_party/chibicc/chibicc.h" -#define TOMBSTONE ((void *)-1) // Represents a deleted hash entry +#define INIT_SIZE 16 // initial hash bucket size +#define LOW_WATERMARK 50 // keep usage below 50% after rehashing +#define HIGH_WATERMARK 70 // perform rehash when usage exceeds 70% +#define TOMBSTONE ((void *)-1) // represents deleted hash table entry static uint64_t fnv_hash(char *s, int len) { uint64_t hash = 0xcbf29ce484222325; @@ -24,7 +27,8 @@ static void rehash(HashMap *map) { } } size_t cap = map->capacity; - while ((nkeys * 100) / cap >= 50) cap = cap * 2; + while ((nkeys * 100) / cap >= LOW_WATERMARK) cap = cap * 2; + assert(cap > 0); // Create a new hashmap and copy all key-values. 
HashMap map2 = {}; map2.buckets = calloc(cap, sizeof(HashEntry)); @@ -56,9 +60,11 @@ static HashEntry *get_entry(HashMap *map, char *key, int keylen) { static HashEntry *get_or_insert_entry(HashMap *map, char *key, int keylen) { if (!map->buckets) { - map->buckets = calloc((map->capacity = 16), sizeof(HashEntry)); + map->buckets = calloc(INIT_SIZE, sizeof(HashEntry)); + map->capacity = INIT_SIZE; + } else if ((map->used * 100) / map->capacity >= HIGH_WATERMARK) { + rehash(map); } - if ((map->used * 100) / map->capacity >= 70) rehash(map); uint64_t hash = fnv_hash(key, keylen); for (int i = 0; i < map->capacity; i++) { HashEntry *ent = &map->buckets[(hash + i) & (map->capacity - 1)]; diff --git a/third_party/chibicc/parse.c b/third_party/chibicc/parse.c index 11a8327b..05bbf301 100644 --- a/third_party/chibicc/parse.c +++ b/third_party/chibicc/parse.c @@ -25,21 +25,13 @@ typedef struct Scope Scope; // Scope for local variables, global variables, typedefs // or enum constants typedef struct { - char *name; - int depth; Obj *var; Type *type_def; Type *enum_ty; int enum_val; } VarScope; -// Scope for struct, union or enum tags -typedef struct { - char *name; - int depth; - Type *ty; -} TagScope; - +// Represents a block scope. struct Scope { Scope *next; // C has two block scopes; one is for variables/typedefs and @@ -103,10 +95,6 @@ static Obj *globals; static Scope *scope = &(Scope){}; -// scope_depth is incremented by one at the beginning of a block -// scope and decremented by one at the end of a block scope. -static int scope_depth; - // Points to the function object the parser is currently parsing. static Obj *current_fn; @@ -173,12 +161,10 @@ static void enter_scope(void) { Scope *sc = calloc(1, sizeof(Scope)); sc->next = scope; scope = sc; - scope_depth++; } static void leave_scope(void) { scope = scope->next; - scope_depth--; } // Find a variable by name. 
@@ -190,16 +176,16 @@ static VarScope *find_var(Token *tok) { return NULL; } -static TagScope *find_tag(Token *tok) { +static Type *find_tag(Token *tok) { for (Scope *sc = scope; sc; sc = sc->next) { - TagScope *sc2 = hashmap_get2(&sc->tags, tok->loc, tok->len); - if (sc2) return sc2; + Type *ty = hashmap_get2(&sc->tags, tok->loc, tok->len); + if (ty) return ty; } return NULL; } Node *new_node(NodeKind kind, Token *tok) { - Node *node = calloc(1, sizeof(Node)); + Node *node = alloc_node(); node->kind = kind; node->tok = tok; return node; @@ -252,7 +238,7 @@ static Node *new_vla_ptr(Obj *var, Token *tok) { Node *new_cast(Node *expr, Type *ty) { add_type(expr); - Node *node = calloc(1, sizeof(Node)); + Node *node = alloc_node(); node->kind = ND_CAST; node->tok = expr->tok; node->lhs = expr; @@ -262,8 +248,6 @@ Node *new_cast(Node *expr, Type *ty) { static VarScope *push_scope(char *name) { VarScope *sc = calloc(1, sizeof(VarScope)); - sc->name = name; - sc->depth = scope_depth; hashmap_put(&scope->vars, name, sc); return sc; } @@ -303,7 +287,7 @@ static Initializer *new_initializer(Type *ty, bool is_flexible) { } static Obj *new_var(char *name, Type *ty) { - Obj *var = calloc(1, sizeof(Obj)); + Obj *var = alloc_obj(); var->name = name; var->ty = ty; var->align = ty->align; @@ -330,9 +314,7 @@ static Obj *new_gvar(char *name, Type *ty) { static char *new_unique_name(void) { static int id = 0; - char *buf = calloc(1, 20); - sprintf(buf, ".L..%d", id++); - return buf; + return xasprintf(".L..%d", id++); } static Obj *new_anon_gvar(Type *ty) { @@ -360,11 +342,7 @@ static Type *find_typedef(Token *tok) { } static void push_tag_scope(Token *tok, Type *ty) { - TagScope *sc = calloc(1, sizeof(TagScope)); - sc->name = strndup(tok->loc, tok->len); - sc->depth = scope_depth; - sc->ty = ty; - hashmap_put2(&scope->tags, tok->loc, tok->len, sc); + hashmap_put2(&scope->tags, tok->loc, tok->len, ty); } // Consumes token if equal to STR or __STR__. 
@@ -599,9 +577,14 @@ static Token *thing_attributes(Token *tok, void *arg) { error_tok(tok, "unknown function attribute"); } -// typespec = typename typename* -// typename = "void" | "_Bool" | "char" | "short" | "int" | "long" -// | struct-decl | union-decl | typedef-name +// declspec = ("void" | "_Bool" | "char" | "short" | "int" | "long" +// | "typedef" | "static" | "extern" | "inline" +// | "_Thread_local" | "__thread" +// | "signed" | "unsigned" +// | struct-decl | union-decl | typedef-name +// | enum-specifier | typeof-specifier +// | "const" | "volatile" | "auto" | "register" | "restrict" +// | "__restrict" | "__restrict__" | "_Noreturn")+ // // The order of typenames in a type-specifier doesn't matter. For // example, `int long static` means the same as `static long int`. @@ -614,7 +597,7 @@ static Token *thing_attributes(Token *tok, void *arg) { // while keeping the "current" type object that the typenames up // until that point represent. When we reach a non-typename token, // we returns the current type object. -static Type *typespec(Token **rest, Token *tok, VarAttr *attr) { +static Type *declspec(Token **rest, Token *tok, VarAttr *attr) { // We use a single integer as counters for all typenames. // For example, bits 0 and 1 represents how many times we saw the // keyword "void" so far. With this, we can use a switch statement @@ -851,7 +834,7 @@ static Token *static_assertion(Token *tok) { } // func-params = ("void" | param ("," param)* ("," "...")?)? 
")" -// param = typespec declarator +// param = declspec declarator static Type *func_params(Token **rest, Token *tok, Type *ty) { if (EQUAL(tok, "void") && EQUAL(tok->next, ")")) { *rest = tok->next->next; @@ -868,7 +851,7 @@ static Type *func_params(Token **rest, Token *tok, Type *ty) { skip(tok, ')'); break; } - Type *ty2 = typespec(&tok, tok, NULL); + Type *ty2 = declspec(&tok, tok, NULL); ty2 = declarator(&tok, tok, ty2); Token *name = ty2->name; if (ty2->kind == TY_ARRAY) { @@ -935,8 +918,8 @@ static Type *declarator(Token **rest, Token *tok, Type *ty) { ty = pointers(&tok, tok, ty); if (EQUAL(tok, "(")) { Token *start = tok; - Type ignore = {}; - declarator(&tok, tok->next, &ignore); + Type dummy = {}; + declarator(&tok, start->next, &dummy); tok = skip(tok, ')'); ty = type_suffix(rest, tok, ty); ty = declarator(&tok, start->next, ty); @@ -959,8 +942,8 @@ static Type *abstract_declarator(Token **rest, Token *tok, Type *ty) { ty = pointers(&tok, tok, ty); if (EQUAL(tok, "(")) { Token *start = tok; - Type ignore = {}; - abstract_declarator(&tok, tok->next, &ignore); + Type dummy = {}; + abstract_declarator(&tok, start->next, &dummy); tok = skip(tok, ')'); ty = type_suffix(rest, tok, ty); return abstract_declarator(&tok, start->next, ty); @@ -968,9 +951,9 @@ static Type *abstract_declarator(Token **rest, Token *tok, Type *ty) { return type_suffix(rest, tok, ty); } -// type-name = typespec abstract-declarator +// type-name = declspec abstract-declarator static Type *typename(Token **rest, Token *tok) { - Type *ty = typespec(&tok, tok, NULL); + Type *ty = declspec(&tok, tok, NULL); return abstract_declarator(rest, tok, ty); } @@ -1003,11 +986,11 @@ static Type *enum_specifier(Token **rest, Token *tok) { tok = tok->next; } if (tag && !EQUAL(tok, "{")) { - TagScope *sc = find_tag(tag); - if (!sc) error_tok(tag, "unknown enum type"); - if (sc->ty->kind != TY_ENUM) error_tok(tag, "not an enum tag"); + Type *ty = find_tag(tag); + if (!ty) error_tok(tag, "unknown enum 
type"); + if (ty->kind != TY_ENUM) error_tok(tag, "not an enum tag"); *rest = tok; - return sc->ty; + return ty; } tok = skip(tok, '{'); // Read an enum-list. @@ -1070,8 +1053,8 @@ static Node *new_alloca(Node *sz) { return node; } -// declaration = typespec (declarator ("=" expr)? ("," declarator ("=" -// expr)?)*)? ";" +// declaration = declspec (declarator ("=" expr)? +// ("," declarator ("=" expr)?)*)? ";" static Node *declaration(Token **rest, Token *tok, Type *basety, VarAttr *attr) { Node head = {}; @@ -1363,9 +1346,11 @@ static void struct_initializer1(Token **rest, Token *tok, Initializer *init) { // struct-initializer2 = initializer ("," initializer)* static void struct_initializer2(Token **rest, Token *tok, Initializer *init, Member *mem) { + bool first = true; for (; mem && !is_end(tok); mem = mem->next) { Token *start = tok; - if (mem != init->ty->members) tok = skip(tok, ','); + if (!first) tok = skip(tok, ','); + first = false; if (EQUAL(tok, "[") || EQUAL(tok, ".")) { *rest = start; return; @@ -1389,6 +1374,7 @@ static void union_initializer(Token **rest, Token *tok, Initializer *init) { init->mem = init->ty->members; if (EQUAL(tok, "{")) { initializer2(&tok, tok->next, init->children[0]); + CONSUME(&tok, tok, ","); *rest = skip(tok, '}'); } else { initializer2(rest, tok, init->children[0]); @@ -1769,7 +1755,7 @@ static Node *stmt(Token **rest, Token *tok) { brk_label = node->brk_label = new_unique_name(); cont_label = node->cont_label = new_unique_name(); if (is_typename(tok)) { - Type *basety = typespec(&tok, tok, NULL); + Type *basety = declspec(&tok, tok, NULL); node->init = declaration(&tok, tok, basety, NULL); } else { node->init = expr_stmt(&tok, tok); @@ -1872,7 +1858,7 @@ static Node *compound_stmt(Token **rest, Token *tok) { while (!EQUAL(tok, "}")) { if (is_typename(tok) && !EQUAL(tok->next, ":")) { VarAttr attr = {}; - Type *basety = typespec(&tok, tok, &attr); + Type *basety = declspec(&tok, tok, &attr); if (attr.is_typedef) { tok = 
parse_typedef(tok, basety); continue; @@ -2565,30 +2551,14 @@ static Node *mul(Token **rest, Token *tok) { } } -// compound-literal = initializer "}" -static Node *compound_literal(Token **rest, Token *tok, Type *ty, - Token *start) { - if (scope_depth == 0) { - Obj *var = new_anon_gvar(ty); - gvar_initializer(rest, tok, var); - return new_var_node(var, start); - } - Obj *var = new_lvar(new_unique_name(), ty); - Node *lhs = lvar_initializer(rest, tok, var); - Node *rhs = new_var_node(var, tok); - return new_binary(ND_COMMA, lhs, rhs, tok); -} - -// cast = "(" type-name ")" "{" compound-literal -// | "(" type-name ")" cast -// | unary +// cast = "(" type-name ")" cast | unary static Node *cast(Token **rest, Token *tok) { if (EQUAL(tok, "(") && is_typename(tok->next)) { Token *start = tok; Type *ty = typename(&tok, tok->next); tok = skip(tok, ')'); // compound literal - if (EQUAL(tok, "{")) return compound_literal(rest, tok, ty, start); + if (EQUAL(tok, "{")) return unary(rest, start); // type cast Node *node = new_cast(cast(rest, tok), ty); node->tok = start; @@ -2612,9 +2582,10 @@ static Node *unary(Token **rest, Token *tok) { return new_unary(ND_ADDR, lhs, tok); } if (EQUAL(tok, "*")) { - // [C18 6.5.3.2p4] This is an oddity in the C spec, but dereferencing - // a function shouldn't do anything. If foo is a function, `*foo`, - // `**foo` or `*****foo` are all equivalent to just `foo`. + // [https://www.sigbus.info/n1570#6.5.3.2p4] This is an oddity + // in the C spec, but dereferencing a function shouldn't do + // anything. If foo is a function, `*foo`, `**foo` or `*****foo` + // are all equivalent to just `foo`. 
Node *node = cast(rest, tok->next); add_type(node); if (node->ty->kind == TY_FUNC) return node; @@ -2640,14 +2611,14 @@ static Node *unary(Token **rest, Token *tok) { return postfix(rest, tok); } -// struct-members = (typespec declarator ("," declarator)* ";")* +// struct-members = (declspec declarator ("," declarator)* ";")* static void struct_members(Token **rest, Token *tok, Type *ty) { Member head = {}; Member *cur = &head; int idx = 0; while (!EQUAL(tok, "}")) { VarAttr attr = {}; - Type *basety = typespec(&tok, tok, &attr); + Type *basety = declspec(&tok, tok, &attr); bool first = true; // Anonymous struct member if ((basety->kind == TY_STRUCT || basety->kind == TY_UNION) && @@ -2708,8 +2679,8 @@ static Type *struct_union_decl(Token **rest, Token *tok) { } if (tag && !EQUAL(tok, "{")) { *rest = tok; - TagScope *sc = find_tag(tag); - if (sc) return sc->ty; + Type *ty2 = find_tag(tag); + if (ty2) return ty2; ty->size = -1; push_tag_scope(tag, ty); return ty; @@ -2721,10 +2692,10 @@ static Type *struct_union_decl(Token **rest, Token *tok) { if (tag) { // If this is a redefinition, overwrite a previous type. // Otherwise, register the struct type. 
- TagScope *sc = find_tag(tag); - if (sc && sc->depth == scope_depth) { - *sc->ty = *ty; - return sc->ty; + Type *ty2 = hashmap_get2(&scope->tags, tag->loc, tag->len); + if (ty2) { + *ty2 = *ty; + return ty2; } push_tag_scope(tag, ty); } @@ -2837,7 +2808,8 @@ static Node *new_inc_dec(Node *node, Token *tok, int addend) { node->ty); } -// postfix = ident "(" func-args ")" postfix-tail* +// postfix = "(" type-name ")" "{" initializer-list "}" +// | ident "(" func-args ")" postfix-tail* // | primary postfix-tail* // // postfix-tail = "[" expr "]" @@ -2847,6 +2819,21 @@ static Node *new_inc_dec(Node *node, Token *tok, int addend) { // | "++" // | "--" static Node *postfix(Token **rest, Token *tok) { + if (EQUAL(tok, "(") && is_typename(tok->next)) { + // Compound literal + Token *start = tok; + Type *ty = typename(&tok, tok->next); + tok = skip(tok, ')'); + if (scope->next == NULL) { + Obj *var = new_anon_gvar(ty); + gvar_initializer(rest, tok, var); + return new_var_node(var, start); + } + Obj *var = new_lvar("", ty); + Node *lhs = lvar_initializer(rest, tok, var); + Node *rhs = new_var_node(var, tok); + return new_binary(ND_COMMA, lhs, rhs, start); + } Node *node = primary(&tok, tok); for (;;) { if (EQUAL(tok, "(")) { @@ -2961,7 +2948,7 @@ static Node *generic_selection(Token **rest, Token *tok) { return ret; } -// primary = "(" "{" stmt stmt* "}" ")" +// primary = "(" "{" stmt+ "}" ")" // | "(" expr ")" // | "sizeof" "(" type-name ")" // | "sizeof" unary @@ -3367,8 +3354,9 @@ static Token *function(Token *tok, Type *basety, VarAttr *attr) { fn->va_area = new_lvar("__va_area__", array_of(ty_char, 136)); fn->alloca_bottom = new_lvar("__alloca_size__", pointer_to(ty_char)); tok = skip(tok, '{'); - // [C18 6.4.2.2] "__func__" is automatically defined as a - // local variable containing the current function name. + // [https://www.sigbus.info/n1570#6.4.2.2p1] "__func__" is + // automatically defined as a local variable containing the + // current function name. 
push_scope("__func__")->var = new_string_literal(fn->name, array_of(ty_char, strlen(fn->name) + 1)); // [GNU] __FUNCTION__ is yet another name of __func__. @@ -3401,7 +3389,7 @@ static Token *global_variable(Token *tok, Type *basety, VarAttr *attr) { if (attr->align) var->align = attr->align; if (EQUAL(tok, "=")) { gvar_initializer(&tok, tok->next, var); - } else if (!attr->is_extern) { + } else if (!attr->is_extern && !attr->is_tls) { var->is_tentative = true; } } @@ -3537,7 +3525,7 @@ Obj *parse(Token *tok) { } VarAttr attr = {}; tok = attribute_list(tok, &attr, thing_attributes); - Type *basety = typespec(&tok, tok, &attr); + Type *basety = declspec(&tok, tok, &attr); if (attr.is_typedef) { tok = parse_typedef(tok, basety); continue; diff --git a/third_party/chibicc/preprocess.c b/third_party/chibicc/preprocess.c index a0cabf9d..89fc4f5a 100644 --- a/third_party/chibicc/preprocess.c +++ b/third_party/chibicc/preprocess.c @@ -96,7 +96,7 @@ static Token *skip_line(Token *tok) { } static Token *copy_token(Token *tok) { - Token *t = calloc(1, sizeof(Token)); + Token *t = alloc_token(); *t = *tok; t->next = NULL; return t; @@ -234,9 +234,8 @@ static Token *copy_line(Token **rest, Token *tok) { } static Token *new_num_token(int val, Token *tmpl) { - char buf[30]; - sprintf(buf, "%d\n", val); - return tokenize(new_file(tmpl->file->name, tmpl->file->file_no, strdup(buf))); + char *buf = xasprintf("%d\n", val); + return tokenize(new_file(tmpl->file->name, tmpl->file->file_no, buf)); } static Token *read_const_expr(Token **rest, Token *tok) { @@ -270,10 +269,10 @@ static long eval_const_expr(Token **rest, Token *tok) { Token *expr = read_const_expr(rest, tok->next); expr = preprocess2(expr); if (expr->kind == TK_EOF) error_tok(start, "no expression"); - // [C18 6.10.1.4] The standard requires we replace remaining - // non-macro identifiers with "0" before evaluating a constant - // expression. For example, `#if foo` is equivalent to `#if 0` - // if foo is not defined. 
+ // [https://www.sigbus.info/n1570#6.10.1p4] The standard requires + // we replace remaining non-macro identifiers with "0" before + // evaluating a constant expression. For example, `#if foo` is + // equivalent to `#if 0` if foo is not defined. for (Token *t = expr; t->kind != TK_EOF; t = t->next) { if (t->kind == TK_IDENT) { Token *next = t->next; @@ -453,8 +452,7 @@ static Token *stringize(Token *hash, Token *arg) { // Concatenate two tokens to create a new token. static Token *paste(Token *lhs, Token *rhs) { // Paste the two tokens. - char *buf = calloc(1, lhs->len + rhs->len + 1); - sprintf(buf, "%.*s%.*s", lhs->len, lhs->loc, rhs->len, rhs->loc); + char *buf = xasprintf("%.*s%.*s", lhs->len, lhs->loc, rhs->len, rhs->loc); // Tokenize the resulting string. Token *tok = tokenize(new_file(lhs->file->name, lhs->file->file_no, buf)); if (tok->next->kind != TK_EOF) @@ -706,7 +704,7 @@ static char *detect_include_guard(Token *tok) { return NULL; } -static Token *include_file(Token *tok, char *path) { +static Token *include_file(Token *tok, char *path, Token *filename_tok) { // Check for "#pragma once" if (hashmap_get(&pragma_once, path)) return tok; // If we read the same file before, and if the file was guarded @@ -716,7 +714,8 @@ static Token *include_file(Token *tok, char *path) { char *guard_name = hashmap_get(&include_guards, path); if (guard_name && hashmap_get(&macros, guard_name)) return tok; Token *tok2 = tokenize_file(path); - if (!tok2) error_tok(tok, "%s: cannot open file: %s", path, strerror(errno)); + if (!tok2) + error_tok(filename_tok, "%s: cannot open file: %s", path, strerror(errno)); guard_name = detect_include_guard(tok2); if (guard_name) hashmap_put(&include_guards, path, guard_name); return append(tok2, tok); @@ -760,19 +759,19 @@ static Token *preprocess2(Token *tok) { char *path = xasprintf("%s/%s", dirname(strdup(start->file->name)), filename); if (fileexists(path)) { - tok = include_file(tok, path); + tok = include_file(tok, path, 
start->next->next); continue; } } char *path = search_include_paths(filename); - tok = include_file(tok, path ? path : filename); + tok = include_file(tok, path ? path : filename, start->next->next); continue; } if (EQUAL(tok, "include_next")) { bool ignore; char *filename = read_include_filename(&tok, tok->next, &ignore); char *path = search_include_next(filename); - tok = include_file(tok, path ? path : filename); + tok = include_file(tok, path ? path : filename, start->next->next); continue; } if (EQUAL(tok, "define")) { @@ -914,17 +913,13 @@ static char *format_date(struct tm *tm) { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", }; - char buf[30]; - sprintf(buf, "\"%s %2d %d\"", mon[tm->tm_mon], tm->tm_mday, - tm->tm_year + 1900); - return strdup(buf); + return xasprintf("\"%s %2d %d\"", mon[tm->tm_mon], tm->tm_mday, + tm->tm_year + 1900); } // __TIME__ is expanded to the current time, e.g. "13:34:03". static char *format_time(struct tm *tm) { - char buf[30]; - sprintf(buf, "\"%02d:%02d:%02d\"", tm->tm_hour, tm->tm_min, tm->tm_sec); - return strdup(buf); + return xasprintf("\"%02d:%02d:%02d\"", tm->tm_hour, tm->tm_min, tm->tm_sec); } void init_macros(void) { @@ -1302,11 +1297,11 @@ static void join_adjacent_string_literals(Token *tok) { } // Second pass: concatenate adjacent string literals. 
for (Token *tok1 = tok; tok1->kind != TK_EOF;) { - Token *tok2 = tok1->next; - if (tok1->kind != TK_STR || tok2->kind != TK_STR) { + if (tok1->kind != TK_STR || tok1->next->kind != TK_STR) { tok1 = tok1->next; continue; } +#if 0 assert(tok1->ty->base->size == tok2->ty->base->size); Token *t = copy_token(tok1); t->ty = @@ -1317,6 +1312,25 @@ static void join_adjacent_string_literals(Token *tok) { tok2->str, tok2->ty->size); t->len = strlen(t->loc); *tok1 = *t; +#else + Token *tok2 = tok1->next; + while (tok2->kind == TK_STR) tok2 = tok2->next; + int len = tok1->ty->array_len; + for (Token *t = tok1->next; t != tok2; t = t->next) { + len = len + t->ty->array_len - 1; + } + char *buf = calloc(tok1->ty->base->size, len); + int i = 0; + for (Token *t = tok1; t != tok2; t = t->next) { + memcpy(buf + i, t->str, t->ty->size); + i = i + t->ty->size - t->ty->base->size; + } + *tok1 = *copy_token(tok1); + tok1->ty = array_of(tok1->ty->base, len); + tok1->str = buf; + tok1->next = tok2; + tok1 = tok2; +#endif } } diff --git a/third_party/chibicc/test/hog_test.c b/third_party/chibicc/test/hog_test.c deleted file mode 100644 index 961a27f5..00000000 --- a/third_party/chibicc/test/hog_test.c +++ /dev/null @@ -1,4 +0,0 @@ -main(void) { - void *p; - p = "hello"; -} diff --git a/third_party/chibicc/test/initializer_test.c b/third_party/chibicc/test/initializer_test.c index b119851d..c8914f2c 100644 --- a/third_party/chibicc/test/initializer_test.c +++ b/third_party/chibicc/test/initializer_test.c @@ -445,6 +445,15 @@ int main() { }; x.a; })); + ASSERT(1, ({ + union { + int a; + char b; + } x = { + 1, + }; + x.a; + })); ASSERT(2, ({ enum { x, diff --git a/third_party/chibicc/test/struct_test.c b/third_party/chibicc/test/struct_test.c index 810996d8..7d82571f 100644 --- a/third_party/chibicc/test/struct_test.c +++ b/third_party/chibicc/test/struct_test.c @@ -392,6 +392,24 @@ int main() { } x = {1}, y = {2}; (0 ? 
x : y).a; })); + ASSERT(2, ({ + struct { + int a; + } x = {1}, y = {2}; + (x = y).a; + })); + ASSERT(1, ({ + struct { + int a; + } x = {1}, y = {2}; + (1 ? x : y).a; + })); + ASSERT(2, ({ + struct { + int a; + } x = {1}, y = {2}; + (0 ? x : y).a; + })); return 0; } diff --git a/third_party/chibicc/tokenize.c b/third_party/chibicc/tokenize.c index 3e89c881..1e2eb576 100644 --- a/third_party/chibicc/tokenize.c +++ b/third_party/chibicc/tokenize.c @@ -38,7 +38,7 @@ static void verror_at(char *filename, char *input, int line_no, char *loc, int indent = fprintf(stderr, "%s:%d: ", filename, line_no); fprintf(stderr, "%.*s\n", (int)(end - line), line); // Show the error message. - int pos = str_width(line, loc - line) + indent; + int pos = display_width(line, loc - line) + indent; fprintf(stderr, "%*s", pos, ""); // print pos spaces. fprintf(stderr, "^ "); vfprintf(stderr, fmt, ap); @@ -53,6 +53,7 @@ void error_at(char *loc, char *fmt, ...) { va_list ap; va_start(ap, fmt); verror_at(current_file->name, current_file->contents, line_no, loc, fmt, ap); + va_end(ap); exit(1); } @@ -64,7 +65,7 @@ void error_tok(Token *tok, char *fmt, ...) { verror_at(t->file->name, t->file->contents, t->line_no, t->loc, fmt, ap); va_end(ap); } - va_end(va); + va_end(ap); exit(1); } @@ -73,6 +74,7 @@ void warn_tok(Token *tok, char *fmt, ...) { va_start(ap, fmt); verror_at(tok->file->name, tok->file->contents, tok->line_no, tok->loc, fmt, ap); + va_end(ap); } static int is_space(int c) { @@ -103,9 +105,9 @@ Token *skip(Token *tok, char op) { } } -// Create a new token and add it as the next token of `cur`. +// Create a new token. 
static Token *new_token(TokenKind kind, char *start, char *end) { - Token *tok = calloc(1, sizeof(Token)); + Token *tok = alloc_token(); tok->kind = kind; tok->loc = start; tok->len = end - start; @@ -117,18 +119,17 @@ static Token *new_token(TokenKind kind, char *start, char *end) { return tok; } -// Read an identifier and returns a pointer pointing to the end -// of an identifier. -// -// Returns null if p does not point to a valid identifier. -static char *read_ident(char *p) { +// Read an identifier and returns the length of it. +// If p does not point to a valid identifier, 0 is returned. +static int read_ident(char *start) { + char *p = start; uint32_t c = decode_utf8(&p, p); - if (!is_ident1(c)) return NULL; + if (!is_ident1(c)) return 0; for (;;) { char *q; c = decode_utf8(&q, p); if (!('a' <= c && c <= 'f') && !is_ident2(c)) { - return p; + return p - start; } p = q; } @@ -140,6 +141,19 @@ static int from_hex(char c) { return c - 'A' + 10; } +// Read a punctuator token from p and returns its length. +static int read_punct(char *p) { + static char *kw[] = {"<<=", ">>=", "...", "==", "!=", "<=", ">=", "->", + "+=", "-=", "*=", "/=", "++", "--", "%=", "&=", + "|=", "^=", "&&", "||", "<<", ">>", "##"}; + for (int i = 0; i < sizeof(kw) / sizeof(*kw); i++) { + if (startswith(p, kw[i])) { + return strlen(kw[i]); + } + } + return ispunct(*p) ? 1 : 0; +} + static bool is_keyword(Token *tok) { static HashMap map; if (map.capacity == 0) { @@ -190,6 +204,17 @@ static int read_escaped_char(char **new_pos, char *p) { return c; } *new_pos = p + 1; + // Escape sequences are defined using themselves here. E.g. + // '\n' is implemented using '\n'. This tautological definition + // works because the compiler that compiles our compiler knows + // what '\n' actually is. In other words, we "inherit" the ASCII + // code of '\n' from the compiler that compiles our compiler, + // so we don't have to teach the actual code here. 
+ // + // This fact has huge implications not only for the correctness + // of the compiler but also for the security of the generated code. + // For more info, read "Reflections on Trusting Trust" by Ken Thompson. + // https://github.com/rui314/chibicc/wiki/thompson1984.pdf switch (*p) { case 'a': return '\a'; @@ -217,7 +242,7 @@ static int read_escaped_char(char **new_pos, char *p) { static char *string_literal_end(char *p) { char *start = p; for (; *p != '"'; p++) { - if (*p == '\0') error_at(start, "unclosed string literal"); + if (*p == '\n' || *p == '\0') error_at(start, "unclosed string literal"); if (*p == '\\') p++; } return p; @@ -225,7 +250,7 @@ static char *string_literal_end(char *p) { static Token *read_string_literal(char *start, char *quote) { char *end = string_literal_end(quote + 1); - char *buf = calloc(1, end - quote); + char *buf = calloc(2, end - quote); int len = 0; for (char *p = quote + 1; p < end;) { if (*p == '\\') @@ -409,7 +434,7 @@ static void convert_pp_number(Token *tok) { void convert_pp_tokens(Token *tok) { for (Token *t = tok; t->kind != TK_EOF; t = t->next) { if (is_keyword(t)) - t->kind = TK_RESERVED; + t->kind = TK_KEYWORD; else if (t->kind == TK_PP_NUM) convert_pp_number(t); } @@ -546,34 +571,17 @@ Token *tokenize(File *file) { continue; } // Identifier or keyword - char *q; - if ((q = read_ident(p)) != NULL) { - cur = cur->next = new_token(TK_IDENT, p, q); - p = q; + int ident_len = read_ident(p); + if (ident_len) { + cur = cur->next = new_token(TK_IDENT, p, p + ident_len); + p += cur->len; continue; } - // Three-letter punctuators - if (LOOKINGAT(p, "<<=") || LOOKINGAT(p, ">>=") || LOOKINGAT(p, "...")) { - cur = cur->next = new_token(TK_RESERVED, p, p + 3); - p += 3; - continue; - } - // Two-letter punctuators - if (LOOKINGAT(p, "==") || LOOKINGAT(p, "!=") || LOOKINGAT(p, "<=") || - LOOKINGAT(p, ">=") || LOOKINGAT(p, "->") || LOOKINGAT(p, "+=") || - LOOKINGAT(p, "-=") || LOOKINGAT(p, "*=") || LOOKINGAT(p, "/=") || - 
- LOOKINGAT(p, "++") || LOOKINGAT(p, "--") || LOOKINGAT(p, "%=") || - LOOKINGAT(p, "&=") || LOOKINGAT(p, "|=") || LOOKINGAT(p, "^=") || - LOOKINGAT(p, "&&") || LOOKINGAT(p, "||") || LOOKINGAT(p, "<<") || - LOOKINGAT(p, ">>") || LOOKINGAT(p, "##")) { - cur = cur->next = new_token(TK_RESERVED, p, p + 2); - p += 2; - continue; - } - // Single-letter punctuators - if (ispunct(*p)) { - cur = cur->next = new_token(TK_RESERVED, p, p + 1); - p++; + // Punctuators + int punct_len = read_punct(p); + if (punct_len) { + cur = cur->next = new_token(TK_PUNCT, p, p + punct_len); + p += cur->len; continue; } error_at(p, "invalid token"); @@ -665,6 +673,7 @@ static void remove_backslash_newline(char *p) { p[j++] = p[i++]; } } + for (; n > 0; n--) p[j++] = '\n'; p[j] = '\0'; } @@ -710,6 +719,11 @@ static void convert_universal_chars(char *p) { Token *tokenize_file(char *path) { char *p = read_file(path); if (!p) return NULL; + // UTF-8 texts may start with a 3-byte "BOM" marker sequence. + // If it exists, just skip it because those bytes are useless. + // (It is actually not recommended to add BOM markers to UTF-8 + // texts, but it's not uncommon particularly on Windows.)
+ if (!memcmp(p, "\xef\xbb\xbf", 3)) p += 3; canonicalize_newline(p); remove_backslash_newline(p); convert_universal_chars(p); diff --git a/third_party/chibicc/type.c b/third_party/chibicc/type.c index 743082aa..fb8f8538 100644 --- a/third_party/chibicc/type.c +++ b/third_party/chibicc/type.c @@ -18,7 +18,7 @@ Type ty_double[1] = {{TY_DOUBLE, 8, 8}}; Type ty_ldouble[1] = {{TY_LDOUBLE, 16, 16}}; static Type *new_type(TypeKind kind, int size, int align) { - Type *ty = calloc(1, sizeof(Type)); + Type *ty = alloc_type(); ty->kind = kind; ty->size = size; ty->align = align; @@ -77,7 +77,7 @@ bool is_compatible(Type *t1, Type *t2) { } Type *copy_type(Type *ty) { - Type *ret = calloc(1, sizeof(Type)); + Type *ret = alloc_type(); *ret = *ty; ret->origin = ty; return ret; diff --git a/third_party/chibicc/unicode.c b/third_party/chibicc/unicode.c index 64d1dcab..0ff0b840 100644 --- a/third_party/chibicc/unicode.c +++ b/third_party/chibicc/unicode.c @@ -66,9 +66,9 @@ static bool in_range(uint32_t *range, uint32_t c) { return false; } -// C11 allows not only ASCII but some multibyte characters in certan -// Unicode ranges to be used in an identifier. See C11 Annex D for the -// details. +// [https://www.sigbus.info/n1570#D] C11 allows not only ASCII but +// some multibyte characters in certain Unicode ranges to be used in an +// identifier. // // This function returns true if a given character is acceptable as // the first character of an identifier. @@ -108,7 +108,7 @@ bool is_ident2(uint32_t c) { // Returns the number of columns needed to display a given // string in a fixed-width font. -int str_width(char *p, int len) { +int display_width(char *p, int len) { char *start = p; int w = 0; while (p - start < len) {