cosmopolitan/third_party/ctags/tex.c

/*
 *	 $Id: tex.c 666 2008-05-15 17:47:31Z dfishburn $
 *
 *	 Copyright (c) 2008, David Fishburn
 *
 *	 This source code is released for free distribution under the
 *       terms of the GNU General Public License.
 *
 *	 This module contains functions for generating tags for TeX
 *       language files.
 *
 *	 Tex language reference:
 *		 http://en.wikibooks.org/wiki/TeX#The_Structure_of_TeX
 */
#include "third_party/ctags/general.h"
/* must always come first */
#include "third_party/ctags/debug.h"
#include "third_party/ctags/entry.h"
#include "third_party/ctags/keyword.h"
#include "third_party/ctags/parse.h"
#include "third_party/ctags/read.h"
#include "third_party/ctags/routines.h"
#include "third_party/ctags/vstring.h"

/*
 *	 MACROS
 */
/* Evaluates to a boolean: does the token's lexical type equal `t`? */
#define isType(token, t)    (boolean)((token)->type == (t))
/* Evaluates to a boolean: does the token's keyword ID equal `k`? */
#define isKeyword(token, k) (boolean)((token)->keyword == (k))

/*
 *	 DATA DECLARATIONS
 */

/*
 * Codes carried through the setjmp()/longjmp() pair on the Exception jump
 * buffer.  ExceptionNone is zero, which is also what setjmp() yields when it
 * is invoked directly rather than through longjmp().
 */
typedef enum eException {
  ExceptionNone, /* no exception pending */
  ExceptionEOF   /* the tokenizer ran into the end of the input */
} exception_t;

/*
 * Used to specify type of keyword.
 */
typedef enum eKeywordId {
  KEYWORD_NONE = -1,     /* token is not one of the recognised keywords */
  KEYWORD_chapter,       /* \chapter */
  KEYWORD_section,       /* \section */
  KEYWORD_subsection,    /* \subsection */
  KEYWORD_subsubsection, /* \subsubsection */
  KEYWORD_part,          /* \part */
  KEYWORD_paragraph,     /* \paragraph */
  KEYWORD_subparagraph,  /* \subparagraph */
  KEYWORD_include        /* \include */
} keywordId;

/*	Used to determine whether keyword is valid for the token language and
 *	what its ID is.
 */
typedef struct sKeywordDesc {
  const char *name; /* keyword text, written without the leading backslash */
  keywordId id;     /* ID registered for that text via addKeyword() */
} keywordDesc;

/*
 * Lexical classes assigned by readToken().  TOKEN_CHARACTER and
 * TOKEN_QUESTION_MARK are declared but never assigned by the code visible in
 * this file.
 */
typedef enum eTokenType {
  TOKEN_UNDEFINED,     /* initial state, or a character with no class of its own */
  TOKEN_CHARACTER,
  TOKEN_CLOSE_PAREN,   /* ) */
  TOKEN_COMMA,         /* , */
  TOKEN_KEYWORD,       /* backslash word found in the keyword table */
  TOKEN_OPEN_PAREN,    /* ( */
  TOKEN_IDENTIFIER,    /* plain word, or backslash word that is not a keyword */
  TOKEN_STRING,        /* text delimited by ' or " */
  TOKEN_OPEN_CURLY,    /* { */
  TOKEN_CLOSE_CURLY,   /* } */
  TOKEN_OPEN_SQUARE,   /* [ */
  TOKEN_CLOSE_SQUARE,  /* ] */
  TOKEN_QUESTION_MARK,
  TOKEN_STAR           /* * */
} tokenType;

/*
 * One lexical token plus the source location recorded for it.  Instances are
 * created by newToken() and released by deleteToken(), which also own the
 * two vString members.
 */
typedef struct sTokenInfo {
  tokenType type;           /* lexical class set by readToken() */
  keywordId keyword;        /* keyword ID, or KEYWORD_NONE */
  vString *string;          /* token text; cleared at the start of readToken() */
  vString *scope;           /* optional prefix; makeTexTag() emits "scope.string"
                               when it is non-empty */
  unsigned long lineNumber; /* value of getSourceLineNumber() when recorded */
  fpos_t filePosition;      /* value of getInputFilePosition() when recorded */
} tokenInfo;

/*
 *	DATA DEFINITIONS
 */

/*
 * Language ID handed to initialize() for this parser and used for keyword
 * registration and lookup.  NOTE(review): despite the "_js" suffix this is
 * the TeX parser's language ID; the name looks like a leftover from another
 * parser.
 */
static langType Lang_js;

/*
 * Jump buffer armed in findTexTags(); readToken() longjmp()s to it with
 * ExceptionEOF when the input is exhausted.
 */
static jmp_buf Exception;

/*
 * Tag kinds produced by this parser.  Each value is used as an index into
 * TexKinds[], so the order here must match the order of that table.
 */
typedef enum {
  TEXTAG_CHAPTER,
  TEXTAG_SECTION,
  TEXTAG_SUBSECTION,
  TEXTAG_SUBSUBSECTION,
  TEXTAG_PART,
  TEXTAG_PARAGRAPH,
  TEXTAG_SUBPARAGRAPH,
  TEXTAG_INCLUDE,
  TEXTAG_COUNT /* number of kinds; initialize() asserts TexKinds[] has this many */
} texKind;

/*
 * Kind table, indexed by texKind.  The code in this file reads the
 * `enabled`, `letter` and `name` members of each entry.
 * NOTE(review): the positional initialisers are presumably
 * {enabled, letter, name, description} -- confirm against the kindOption
 * declaration.
 */
static kindOption TexKinds[] = {{TRUE, 'c', "chapter", "chapters"},
                                {TRUE, 's', "section", "sections"},
                                {TRUE, 'u', "subsection", "subsections"},
                                {TRUE, 'b', "subsubsection", "subsubsections"},
                                {TRUE, 'p', "part", "parts"},
                                {TRUE, 'P', "paragraph", "paragraphs"},
                                {TRUE, 'G', "subparagraph", "subparagraphs"},
                                {TRUE, 'i', "include", "includes"}};

/*
 * Keywords recognised after a backslash.  buildTexKeywordHash() registers
 * every entry with addKeyword(); readToken() later resolves words through
 * analyzeToken().
 */
static const keywordDesc TexKeywordTable[] = {
    /* keyword text (no backslash), keyword ID */
    {"chapter", KEYWORD_chapter},
    {"section", KEYWORD_section},
    {"subsection", KEYWORD_subsection},
    {"subsubsection", KEYWORD_subsubsection},
    {"part", KEYWORD_part},
    {"paragraph", KEYWORD_paragraph},
    {"subparagraph", KEYWORD_subparagraph},
    {"include", KEYWORD_include}};

/*
 *	 FUNCTION DEFINITIONS
 */

static boolean isIdentChar(const int c) {
  return (boolean)(isalpha(c) || isdigit(c) || c == '$' || c == '_' ||
                   c == '#' || c == '-' || c == '.');
}

/*
 * Registers every entry of TexKeywordTable with the keyword hash under the
 * language ID stored in Lang_js.
 */
static void buildTexKeywordHash(void) {
  const keywordDesc *entry = TexKeywordTable;
  const keywordDesc *const end =
      TexKeywordTable + sizeof(TexKeywordTable) / sizeof(TexKeywordTable[0]);

  while (entry != end) {
    addKeyword(entry->name, Lang_js, (int)entry->id);
    ++entry;
  }
}

/*
 * Allocates a token in its initial state: undefined type, no keyword, empty
 * string and scope buffers, and the current source line / file position.
 * The caller releases it with deleteToken().
 */
static tokenInfo *newToken(void) {
  tokenInfo *const fresh = xMalloc(1, tokenInfo);

  /* Buffers owned by the token. */
  fresh->string = vStringNew();
  fresh->scope = vStringNew();

  /* Classification starts out blank. */
  fresh->type = TOKEN_UNDEFINED;
  fresh->keyword = KEYWORD_NONE;

  /* Snapshot of where the reader currently is. */
  fresh->lineNumber = getSourceLineNumber();
  fresh->filePosition = getInputFilePosition();

  return fresh;
}

/*
 * Releases a token obtained from newToken(): both string buffers first,
 * then the token structure itself.
 */
static void deleteToken(tokenInfo *const token) {
  vStringDelete(token->scope);
  vStringDelete(token->string);
  eFree(token);
}

/*
 *	 Tag generation functions
 */

/*
 * Emits a tag named after the token's string, using the line number and file
 * position stored in the token and the name/letter of TexKinds[kind].
 * Nothing is emitted when that kind is disabled.
 */
static void makeConstTag(tokenInfo *const token, const texKind kind) {
  tagEntryInfo entry;

  if (!TexKinds[kind].enabled) {
    return;
  }

  initTagEntry(&entry, vStringValue(token->string));

  entry.kindName = TexKinds[kind].name;
  entry.kind = TexKinds[kind].letter;
  entry.lineNumber = token->lineNumber;
  entry.filePosition = token->filePosition;

  makeTagEntry(&entry);
}

/*
 * Emits a tag for `token` of the given kind, unless that kind is disabled.
 *
 * When the token carries a non-empty scope, the token's string is first
 * rewritten in place to "<scope>.<string>", so the caller sees the qualified
 * name afterwards.
 */
static void makeTexTag(tokenInfo *const token, texKind kind) {
  if (!TexKinds[kind].enabled) {
    return;
  }

  if (vStringLength(token->scope) > 0) {
    /* Build the qualified name in a scratch buffer, then swap it in. */
    vString *const qualified = vStringNew();

    vStringCopy(qualified, token->scope);
    vStringCatS(qualified, ".");
    vStringCatS(qualified, vStringValue(token->string));
    vStringTerminate(qualified);

    vStringCopy(token->string, qualified);
    vStringDelete(qualified);
  }

  makeConstTag(token, kind);
}

/*
 *	 Parsing functions
 */

/*
 * Collects the body of a quoted string into `string`.
 *
 * Called after the opening quote has been read.  Characters are appended
 * until the closing `delimiter` or the end of input is reached; the
 * delimiter itself is consumed but not stored.  A backslash is dropped and
 * the character that follows it is stored verbatim, so an escaped delimiter
 * does not end the string.
 *
 * End of input directly after a backslash ends the string as well; EOF is
 * never stored as a character.
 */
static void parseString(vString *const string, const int delimiter) {
  for (;;) {
    int c = fileGetc();

    if (c == EOF) {
      break;
    }
    if (c == '\\') {
      c = fileGetc(); /* This may be a ' or ". */
      if (c == EOF) {
        break; /* dangling backslash at end of file: nothing to store */
      }
      vStringPut(string, c);
      continue;
    }
    if (c == delimiter) {
      break;
    }
    vStringPut(string, c);
  }
  vStringTerminate(string);
}

/*
 *	Reads a word that begins with "firstChar" and appends it to "string".
 *	Characters are taken while isIdentChar() accepts them.  The character
 *	that ends the word is consumed when it is whitespace and pushed back
 *	onto the input otherwise.
 */
static void parseIdentifier(vString *const string, const int firstChar) {
  int next;

  Assert(isIdentChar(firstChar));
  vStringPut(string, firstChar);

  for (next = fileGetc(); isIdentChar(next); next = fileGetc()) {
    vStringPut(string, next);
  }
  vStringTerminate(string);

  if (isspace(next)) {
    return; /* the whitespace terminator is simply dropped */
  }
  fileUngetc(next); /* unget non-identifier character */
}

/*
 * Reads the next token from the input file into `token`.
 *
 * The token's type and keyword are reset and its string is cleared; the
 * scope is left untouched.  Tabs, spaces and newlines are skipped, and an
 * unescaped '%' discards the rest of its line as a comment.
 *
 * This function does not return at end of input: it longjmp()s to the
 * Exception buffer with ExceptionEOF.
 *
 * Backslash handling:
 *   - backslash + letter: a control word.  Its name, without the backslash,
 *     is stored in token->string and looked up in the keyword table; the
 *     token becomes TOKEN_KEYWORD or TOKEN_IDENTIFIER.
 *   - backslash + any other character: a TeX control symbol such as \%, \{,
 *     \}, \\, \' or \".  The escaped character is consumed here and the
 *     token is left as TOKEN_UNDEFINED with an empty string.  It must not be
 *     pushed back: re-reading it as an ordinary character would turn \% into
 *     a comment start, \} into a group end and \' or \" into a string
 *     opener.
 *   - backslash + end of input: the EOF is pushed back so that the next
 *     call raises ExceptionEOF.
 */
static void readToken(tokenInfo *const token) {
  int c;

  token->type = TOKEN_UNDEFINED;
  token->keyword = KEYWORD_NONE;
  vStringClear(token->string);

getNextChar:
  /* Skip blanks; the location is refreshed on every character read. */
  do {
    c = fileGetc();
    token->lineNumber = getSourceLineNumber();
    token->filePosition = getInputFilePosition();
  } while (c == '\t' || c == ' ' || c == '\n');

  switch (c) {
    case EOF:
      longjmp(Exception, (int)ExceptionEOF);
      break;
    case '(':
      token->type = TOKEN_OPEN_PAREN;
      break;
    case ')':
      token->type = TOKEN_CLOSE_PAREN;
      break;
    case ',':
      token->type = TOKEN_COMMA;
      break;
    case '{':
      token->type = TOKEN_OPEN_CURLY;
      break;
    case '}':
      token->type = TOKEN_CLOSE_CURLY;
      break;
    case '[':
      token->type = TOKEN_OPEN_SQUARE;
      break;
    case ']':
      token->type = TOKEN_CLOSE_SQUARE;
      break;
    case '*':
      token->type = TOKEN_STAR;
      break;

    case '\'':
    case '"':
      token->type = TOKEN_STRING;
      parseString(token->string, c);
      token->lineNumber = getSourceLineNumber();
      token->filePosition = getInputFilePosition();
      break;

    case '\\':
      /*
       * All Tex tags start with a backslash.
       * Check if the next character is an alpha character
       * else it is not a potential tex tag.
       */
      c = fileGetc();
      if (isalpha(c)) {
        parseIdentifier(token->string, c);
        token->lineNumber = getSourceLineNumber();
        token->filePosition = getInputFilePosition();
        token->keyword = analyzeToken(token->string, Lang_js);
        if (isKeyword(token, KEYWORD_NONE))
          token->type = TOKEN_IDENTIFIER;
        else
          token->type = TOKEN_KEYWORD;
      } else if (c == EOF) {
        fileUngetc(c); /* let the next call report the end of input */
      }
      /* Otherwise: control symbol; the escaped character stays consumed. */
      break;

    case '%':
      fileSkipToCharacter('\n'); /* % are single line comments */
      goto getNextChar;
      break;

    default:
      if (!isIdentChar(c))
        token->type = TOKEN_UNDEFINED;
      else {
        parseIdentifier(token->string, c);
        token->lineNumber = getSourceLineNumber();
        token->filePosition = getInputFilePosition();
        token->type = TOKEN_IDENTIFIER;
      }
      break;
  }
}

/*
 * Makes `dest` a copy of `src`: classification, recorded location, and the
 * contents of both string buffers.  `dest` keeps its own buffers; only
 * their contents are overwritten.
 */
static void copyToken(tokenInfo *const dest, tokenInfo *const src) {
  /* Plain-value members. */
  dest->type = src->type;
  dest->keyword = src->keyword;
  dest->lineNumber = src->lineNumber;
  dest->filePosition = src->filePosition;

  /* Buffer contents. */
  vStringCopy(dest->string, src->string);
  vStringCopy(dest->scope, src->scope);
}

/*
 *	 Scanning functions
 */

/*
 * Parses the arguments that follow a sectioning keyword and emits one tag of
 * the given kind.
 *
 * On entry `token` is the keyword token.  On return `token` is the last
 * token this function read.  The return value is always TRUE.
 *
 * Tex tags are of these formats:
 *   \keyword{any number of words}
 *   \keyword[short desc]{any number of words}
 *   \keyword*[short desc]{any number of words}
 *
 * When a short description is present, the tag name is built from the
 * identifiers inside the square brackets and the braces are left to the
 * caller's token loop.  Otherwise the tag name is built from the text of
 * the tokens inside the curly braces, joined with single spaces.
 *
 * NOTE(review): readToken() leaves via longjmp() at end of input, so a file
 * that ends inside the brackets or braces skips the cleanup at the bottom
 * and the two local allocations are not released.
 */
static boolean parseTag(tokenInfo *const token, texKind kind) {
  tokenInfo *const name = newToken();
  vString *const fullname = vStringNew();
  boolean useLongName = TRUE;

  vStringClear(fullname);

  if (isType(token, TOKEN_KEYWORD)) {
    copyToken(name, token);
    readToken(token);
  }

  /*
   * The star of a starred form comes directly after the keyword, before any
   * optional short description, so it has to be skipped first.
   */
  if (isType(token, TOKEN_STAR)) {
    readToken(token);
  }

  if (isType(token, TOKEN_OPEN_SQUARE)) {
    useLongName = FALSE;

    readToken(token);
    while (!isType(token, TOKEN_CLOSE_SQUARE)) {
      if (isType(token, TOKEN_IDENTIFIER)) {
        if (vStringLength(fullname) > 0) vStringCatS(fullname, " ");
        vStringCatS(fullname, vStringValue(token->string));
      }
      readToken(token);
    }
    vStringTerminate(fullname);
    vStringCopy(name->string, fullname);
    makeTexTag(name, kind);
  }

  if (isType(token, TOKEN_OPEN_CURLY)) {
    readToken(token);
    while (!isType(token, TOKEN_CLOSE_CURLY)) {
      /*
       * Every token kind contributes its text, but punctuation tokens carry
       * an empty string; skipping those keeps stray separators out of the
       * tag name.
       */
      if (useLongName && vStringLength(token->string) > 0) {
        if (vStringLength(fullname) > 0) vStringCatS(fullname, " ");
        vStringCatS(fullname, vStringValue(token->string));
      }
      readToken(token);
    }
    if (useLongName) {
      vStringTerminate(fullname);
      vStringCopy(name->string, fullname);
      makeTexTag(name, kind);
    }
  }

  deleteToken(name);
  vStringDelete(fullname);
  return TRUE;
}

/*
 * Main token loop: reads tokens one after another and hands every
 * recognised sectioning keyword to parseTag() with the matching tag kind.
 *
 * The loop has no exit of its own; control leaves it when readToken()
 * longjmp()s at end of input.
 */
static void parseTexFile(tokenInfo *const token) {
  for (;;) {
    texKind kind;

    readToken(token);
    if (!isType(token, TOKEN_KEYWORD)) {
      continue;
    }

    /* Translate the keyword into the tag kind it produces. */
    switch (token->keyword) {
      case KEYWORD_chapter:
        kind = TEXTAG_CHAPTER;
        break;
      case KEYWORD_section:
        kind = TEXTAG_SECTION;
        break;
      case KEYWORD_subsection:
        kind = TEXTAG_SUBSECTION;
        break;
      case KEYWORD_subsubsection:
        kind = TEXTAG_SUBSUBSECTION;
        break;
      case KEYWORD_part:
        kind = TEXTAG_PART;
        break;
      case KEYWORD_paragraph:
        kind = TEXTAG_PARAGRAPH;
        break;
      case KEYWORD_subparagraph:
        kind = TEXTAG_SUBPARAGRAPH;
        break;
      case KEYWORD_include:
        kind = TEXTAG_INCLUDE;
        break;
      default:
        continue; /* keyword without a tag kind: move on to the next token */
    }

    parseTag(token, kind);
  }
}

/*
 * Parser initialisation hook: remembers the language ID assigned to this
 * parser and registers the keyword table under it.
 */
static void initialize(const langType language) {
  /* TexKinds[] must provide exactly one entry per texKind value. */
  Assert(sizeof(TexKinds) == TEXTAG_COUNT * sizeof(TexKinds[0]));

  /* The ID has to be stored before the keywords are registered with it. */
  Lang_js = language;
  buildTexKeywordHash();
}

static void findTexTags(void) {
  tokenInfo *const token = newToken();
  exception_t exception;

  exception = (exception_t)(setjmp(Exception));
  while (exception == ExceptionNone) parseTexFile(token);

  deleteToken(token);
}

/*
 * Creates the parser definition structure for the "Tex" language: files
 * with the "tex" extension, the TexKinds kind table, and the
 * hand-written tokenizer in this file (rather than regex patterns).
 */
extern parserDefinition *TexParser(void) {
  static const char *const extensions[] = {"tex", NULL};
  parserDefinition *const parser = parserNew("Tex");

  /* Tag kinds offered by this parser. */
  parser->kinds = TexKinds;
  parser->kindCount = KIND_COUNT(TexKinds);

  /* File name matching. */
  parser->extensions = extensions;

  /* Callbacks. */
  parser->initialize = initialize;
  parser->parser = findTexTags;

  return parser;
}
/* vi:set tabstop=4 shiftwidth=4 noexpandtab: */