cosmopolitan/third_party/ctags/ruby.c

/*
 *   $Id: ruby.c 571 2007-06-24 23:32:14Z elliotth $
 *
 *   Copyright (c) 2000-2001, Thaddeus Covert <sahuagin@mediaone.net>
 *   Copyright (c) 2002 Matthias Veit <matthias_veit@yahoo.de>
 *   Copyright (c) 2004 Elliott Hughes <enh@acm.org>
 *
 *   This source code is released for free distribution under the terms of the
 *   GNU General Public License.
 *
 *   This module contains functions for generating tags for Ruby language
 *   files.
 */
#include "third_party/ctags/general.h"
/* must always come first */
#include "third_party/ctags/entry.h"
#include "third_party/ctags/parse.h"
#include "third_party/ctags/read.h"
#include "third_party/ctags/vstring.h"

/*
 *   DATA DECLARATIONS
 */
/*
 * Kinds of Ruby tags this parser can produce. The non-negative values are
 * used as indices into RubyKinds[], so their numbering must follow the
 * order of that table.
 */
typedef enum {
  K_UNDEFINED = -1, /* nothing taggable was recognized */
  K_CLASS = 0,
  K_METHOD = 1,
  K_MODULE = 2,
  K_SINGLETON = 3
} rubyKind;

/*
 *   DATA DEFINITIONS
 */
/*
 * Descriptions of the tag kinds, one row per non-negative rubyKind value
 * and in the same order, because emitRubyTag() indexes this table with a
 * rubyKind. emitRubyTag() reads the 'letter' and 'name' members of a row;
 * the first and last initializers are presumably the "enabled" flag and
 * a plural description -- confirm against kindOption in parse.h.
 */
static kindOption RubyKinds[] = {
    {TRUE, 'c', "class", "classes"},                      /* K_CLASS */
    {TRUE, 'f', "method", "methods"},                     /* K_METHOD */
    {TRUE, 'm', "module", "modules"},                     /* K_MODULE */
    {TRUE, 'F', "singleton method", "singleton methods"}}; /* K_SINGLETON */

/*
 * Stack of the scopes entered so far in the file being parsed. It is
 * created at the start of findRubyTags() and deleted at its end. Tagged
 * definitions push their name (emitRubyTag()), anonymous blocks push an
 * empty string (enterUnnamedScope()), and an 'end' keyword pops the most
 * recent entry.
 */
static stringList* nesting = 0;

/*
 *   FUNCTION DEFINITIONS
 */

/*
 * Returns a string describing the scope in 'list'.
 * We record the current scope as a list of entered scopes.
 * Scopes corresponding to 'if' statements and the like are
 * represented by empty strings. Scopes corresponding to
 * modules and classes are represented by the name of the
 * module or class.
 */
static vString* stringListToScope(const stringList* list) {
  unsigned int i;
  unsigned int chunks_output = 0;
  vString* result = vStringNew();
  const unsigned int max = stringListCount(list);
  for (i = 0; i < max; ++i) {
    vString* chunk = stringListItem(list, i);
    if (vStringLength(chunk) > 0) {
      vStringCatS(result, (chunks_output++ > 0) ? "." : "");
      vStringCatS(result, vStringValue(chunk));
    }
  }
  return result;
}

/*
 * Attempts to advance '*s' past 'literal'.
 *
 * The match only counts when 'literal' is followed by the end of the
 * string, a whitespace character or '(' -- that is, when it forms a
 * complete token rather than the prefix of a longer word.
 *
 * Returns TRUE and moves '*s' just past the literal on success; returns
 * FALSE and leaves '*s' where it was otherwise.
 */
static boolean canMatch(const unsigned char** s, const char* literal) {
  const size_t literal_length = strlen(literal);
  unsigned char next_char;

  if (strncmp((const char*)*s, literal, literal_length) != 0) {
    return FALSE;
  }
  /* Only inspect the following character once the prefix is known to
   * match. Before that, the remaining input may be shorter than
   * 'literal', and indexing 'literal_length' bytes ahead would read past
   * its terminating NUL. After a successful comparison the byte is at
   * worst the terminator itself.
   */
  next_char = (*s)[literal_length];
  if (!(next_char == 0 || isspace(next_char) || next_char == '(')) {
    return FALSE;
  }
  *s += literal_length;
  return TRUE;
}

/*
 * Tries to read a Ruby operator method name (such as "[]=" or "<=>") at
 * '*cp'. Each candidate is tested with canMatch(), so an operator is only
 * accepted when it is followed by the end of the line, whitespace or '('.
 *
 * On success the operator is appended to 'name', '*cp' is moved past it
 * and TRUE is returned. Otherwise FALSE is returned and neither 'name'
 * nor '*cp' is changed.
 */
static boolean parseRubyOperator(vString* name, const unsigned char** cp) {
  static const char* RUBY_OPERATORS[] = {
      "[]", "[]=", "**", "!",   "~",  "+@", "-@", "*",  "/", "%",
      "+",  "-",   ">>", "<<",  "&",  "^",  "|",  "<=", "<", ">",
      ">=", "<=>", "==", "===", "!=", "=~", "!~", "`",  0};
  const char** candidate;

  /* The table is terminated by a null pointer. */
  for (candidate = RUBY_OPERATORS; *candidate != 0; ++candidate) {
    if (!canMatch(cp, *candidate)) {
      continue;
    }
    vStringCatS(name, *candidate);
    return TRUE;
  }
  return FALSE;
}

/*
 * Writes a tag called 'name' of kind 'kind', scoped by whatever named
 * scopes are currently recorded in 'nesting'.
 *
 * Afterwards a copy of 'name' is pushed onto 'nesting', so the definition
 * just tagged becomes the innermost scope until its 'end' is seen, and
 * 'name' is cleared ready for reuse.
 *
 * 'kind' indexes RubyKinds[] and therefore must not be K_UNDEFINED.
 */
static void emitRubyTag(vString* name, rubyKind kind) {
  const kindOption* const kind_info = &RubyKinds[kind];
  vString* enclosing;
  tagEntryInfo entry;

  vStringTerminate(name);

  /* Work out the scope before 'name' itself is pushed below. */
  enclosing = stringListToScope(nesting);

  initTagEntry(&entry, vStringValue(name));
  entry.kindName = kind_info->name;
  entry.kind = kind_info->letter;
  if (vStringLength(enclosing) != 0) {
    /* The scope is always reported as a "class" scope, whatever the
     * kinds of the enclosing definitions actually are.
     */
    entry.extensionFields.scope[0] = "class";
    entry.extensionFields.scope[1] = vStringValue(enclosing);
  }
  makeTagEntry(&entry);
  vStringDelete(enclosing);

  stringListAdd(nesting, vStringNewCopy(name));
  vStringClear(name);
}

/* Tests whether 'ch' is a character in 'list'. */
static boolean charIsIn(char ch, const char* list) {
  return (strchr(list, ch) != 0);
}

/*
 * Moves '*cp' forward to the first character for which isspace() is
 * false. The terminating NUL is not whitespace, so the scan never runs
 * past the end of the string.
 */
static void skipWhitespace(const unsigned char** cp) {
  const unsigned char* cursor = *cp;

  for (; isspace(*cursor); ++cursor) {
    /* nothing to do: the loop header does the skipping */
  }
  *cp = cursor;
}

/*
 * Copies the characters forming an identifier from '*cp' into 'name',
 * leaving '*cp' pointing to the character after the identifier. Leading
 * whitespace is skipped first.
 *
 * 'kind' is the kind of definition the caller expects. The return value
 * is the kind actually found:
 *   - K_UNDEFINED for an anonymous (singleton) class, "class << X";
 *     nothing is copied in that case;
 *   - K_SINGLETON when a K_METHOD name contains a period ("def self.x");
 *     only the part after the period is left in 'name';
 *   - 'kind' itself otherwise.
 * 'name' may be left empty when no identifier characters are present.
 */
static rubyKind parseIdentifier(const unsigned char** cp, vString* name,
                                rubyKind kind) {
  /* Method names are slightly different to class and variable names.
   * A method name may optionally end with a question mark, exclamation
   * point or equals sign. These are all part of the name.
   * A method name may also contain a period if it's a singleton method.
   * Once that period has been consumed, the remaining (singleton) name
   * can still carry one of the suffix characters, e.g. "def self.ok?".
   */
  const boolean is_method = (kind == K_METHOD || kind == K_SINGLETON);
  const char* also_ok;

  if (kind == K_METHOD) {
    also_ok = "_.?!=";
  } else if (kind == K_SINGLETON) {
    also_ok = "_?!=";
  } else {
    also_ok = "_";
  }

  skipWhitespace(cp);

  /* Check for an anonymous (singleton) class such as "class << HTTP". */
  if (kind == K_CLASS && **cp == '<' && *(*cp + 1) == '<') {
    return K_UNDEFINED;
  }

  /* Check for operators such as "def []=(key, val)". */
  if (is_method && parseRubyOperator(name, cp)) {
    return kind;
  }

  /* Copy the identifier into 'name'. */
  while (**cp != 0 && (isalnum(**cp) || charIsIn(**cp, also_ok))) {
    char last_char = **cp;

    vStringPut(name, last_char);
    ++*cp;

    /* Recognize singleton methods: what was read so far is the receiver
     * ("self."), so drop it and read the real name after the period.
     */
    if (kind == K_METHOD && last_char == '.') {
      vStringTerminate(name);
      vStringClear(name);
      return parseIdentifier(cp, name, K_SINGLETON);
    }

    /* Recognize characters which mark the end of a method name. */
    if (is_method && charIsIn(last_char, "?!=")) {
      break;
    }
  }
  return kind;
}

/*
 * Called with '*cp' just past a "module", "class" or "def" keyword.
 * Reads the name that follows and emits a tag for it.
 *
 * Nothing happens unless the keyword is followed by whitespace. No tag
 * is emitted either when no usable name can be read. That is the case
 * for code like this:
 *
 *    %w(self.clfloor clfloor).each do |name|
 *        module_eval <<-"end;"
 *            def #{name}(x, y=1)
 *                q, r = x.divmod(y)
 *                q = q.to_i
 *                return q, r
 *            end
 *        end;
 *    end
 *
 * and for anonymous classes:
 *
 *    class << HTTP
 *
 * It is not clear what kind of tags should be created for those, so for
 * now we don't create any.
 */
static void readAndEmitTag(const unsigned char** cp, rubyKind expected_kind) {
  vString* name;
  rubyKind actual_kind;

  if (!isspace(**cp)) {
    return;
  }

  name = vStringNew();
  actual_kind = parseIdentifier(cp, name, expected_kind);
  if (actual_kind != K_UNDEFINED && vStringLength(name) != 0) {
    emitRubyTag(name, actual_kind);
  }
  vStringDelete(name);
}

static void enterUnnamedScope(void) {
  stringListAdd(nesting, vStringNewInit(""));
}

/*
 * Parser entry point: reads the current input file line by line with
 * fileReadLine() and emits tags for the modules, classes and methods it
 * finds, tracking the scopes entered and left in 'nesting'.
 *
 * 'nesting' is created on entry and destroyed (and reset to NULL) on
 * exit, so it is only valid while this function is running.
 */
static void findRubyTags(void) {
  const unsigned char* line;
  boolean inMultiLineComment = FALSE;

  nesting = stringListNew();

  /* FIXME: this whole scheme is wrong, because Ruby isn't line-based.
   * You could perfectly well write:
   *
   *  def
   *  method
   *   puts("hello")
   *  end
   *
   * if you wished, and this function would fail to recognize anything.
   */
  while ((line = fileReadLine()) != NULL) {
    const unsigned char* cp = line;

    /* "=begin" and "=end" are only recognized in the first column. */
    if (canMatch(&cp, "=begin")) {
      inMultiLineComment = TRUE;
      continue;
    }
    if (canMatch(&cp, "=end")) {
      inMultiLineComment = FALSE;
      continue;
    }
    /* Everything between "=begin" and "=end" is comment text: it must
     * neither produce tags nor open or close scopes.
     */
    if (inMultiLineComment) {
      continue;
    }

    skipWhitespace(&cp);

    /* Avoid mistakenly starting a scope for modifiers such as
     *
     *   return if <exp>
     *
     * by only recognizing these keywords at the start of a line.
     *
     * FIXME: this is fooled by code such as
     *
     *   result = if <exp>
     *               <a>
     *            else
     *               <b>
     *            end
     *
     * FIXME: we're also fooled if someone does something heinous such as
     *
     *   puts("hello") \
     *       unless <exp>
     */
    if (canMatch(&cp, "case") || canMatch(&cp, "for") || canMatch(&cp, "if") ||
        canMatch(&cp, "unless") || canMatch(&cp, "until") ||
        canMatch(&cp, "while")) {
      enterUnnamedScope();
    }

    /*
     * "module M", "class C" and "def m" should only be at the beginning
     * of a line.
     */
    if (canMatch(&cp, "module")) {
      readAndEmitTag(&cp, K_MODULE);
    } else if (canMatch(&cp, "class")) {
      readAndEmitTag(&cp, K_CLASS);
    } else if (canMatch(&cp, "def")) {
      readAndEmitTag(&cp, K_METHOD);
    }

    /* Scan the rest of the line for keywords that open or close scopes. */
    while (*cp != '\0') {
      /* FIXME: we don't cope with here documents,
       * or regular expression literals, or ... you get the idea.
       * Hopefully, the restriction above that insists on seeing
       * definitions at the starts of lines should keep us out of
       * mischief.
       */
      if (isspace(*cp)) {
        ++cp;
      } else if (*cp == '#') {
        /* FIXME: this is wrong, but there *probably* won't be a
         * definition after an interpolated string (where # doesn't
         * mean 'comment').
         */
        break;
      } else if (canMatch(&cp, "begin") || canMatch(&cp, "do")) {
        enterUnnamedScope();
      } else if (canMatch(&cp, "end") && stringListCount(nesting) > 0) {
        /* Leave the most recent scope. */
        vStringDelete(stringListLast(nesting));
        stringListRemoveLast(nesting);
      } else if (*cp == '"') {
        /* Skip string literals.
         * FIXME: should cope with escapes and interpolation.
         */
        do {
          ++cp;
        } while (*cp != 0 && *cp != '"');
        /* Step over the closing quote as well; otherwise the next
         * iteration would take it for the start of another literal and
         * swallow the rest of the line.
         */
        if (*cp == '"') {
          ++cp;
        }
      } else if (*cp != '\0') {
        /* Skip one character plus any identifier characters that follow
         * it, so keywords are only looked for at the start of a word.
         */
        do
          ++cp;
        while (isalnum(*cp) || *cp == '_');
      }
    }
  }
  stringListDelete(nesting);
  nesting = NULL; /* don't leave a dangling pointer behind */
}

/*
 * Creates the parser definition for Ruby: the language is named "Ruby",
 * is selected for files with the extensions "rb" and "ruby", produces
 * the tag kinds listed in RubyKinds[] and is parsed by findRubyTags().
 */
extern parserDefinition* RubyParser(void) {
  static const char* const extensions[] = {"rb", "ruby", NULL};
  parserDefinition* const ruby = parserNew("Ruby");

  ruby->parser = findRubyTags;
  ruby->extensions = extensions;
  ruby->kindCount = KIND_COUNT(RubyKinds);
  ruby->kinds = RubyKinds;
  return ruby;
}

/* vi:set tabstop=4 shiftwidth=4: */