326 lines
7.5 KiB
C
326 lines
7.5 KiB
C
|
/*-*- mode:c;indent-tabs-mode:t;c-basic-offset:8;tab-width:8;coding:utf-8 -*-│
|
||
|
│vi: set et ft=c ts=8 sw=8 fenc=utf-8 :vi│
|
||
|
└─────────────────────────────────────────────────────────────────────────────*/
|
||
|
/* clang-format off */
|
||
|
/* $OpenBSD: ccl.c,v 1.8 2015/11/19 22:55:13 tedu Exp $ */
|
||
|
|
||
|
/* ccl - routines for character classes */
|
||
|
|
||
|
/* Copyright (c) 1990 The Regents of the University of California. */
|
||
|
/* All rights reserved. */
|
||
|
|
||
|
/* This code is derived from software contributed to Berkeley by */
|
||
|
/* Vern Paxson. */
|
||
|
|
||
|
/* The United States Government has rights in this work pursuant */
|
||
|
/* to contract no. DE-AC03-76SF00098 between the United States */
|
||
|
/* Department of Energy and the University of California. */
|
||
|
|
||
|
/* This file is part of flex. */
|
||
|
|
||
|
/* Redistribution and use in source and binary forms, with or without */
|
||
|
/* modification, are permitted provided that the following conditions */
|
||
|
/* are met: */
|
||
|
|
||
|
/* 1. Redistributions of source code must retain the above copyright */
|
||
|
/* notice, this list of conditions and the following disclaimer. */
|
||
|
/* 2. Redistributions in binary form must reproduce the above copyright */
|
||
|
/* notice, this list of conditions and the following disclaimer in the */
|
||
|
/* documentation and/or other materials provided with the distribution. */
|
||
|
|
||
|
/* Neither the name of the University nor the names of its contributors */
|
||
|
/* may be used to endorse or promote products derived from this software */
|
||
|
/* without specific prior written permission. */
|
||
|
|
||
|
/* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
|
||
|
/* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
|
||
|
/* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
|
||
|
/* PURPOSE. */
|
||
|
|
||
|
#include "libc/str/str.h"
|
||
|
#include "flexdef.h"
|
||
|
|
||
|
/* return true if the chr is in the ccl. Takes negation into account. */
|
||
|
static bool
|
||
|
ccl_contains(const int cclp, const int ch)
|
||
|
{
|
||
|
int ind, len, i;
|
||
|
|
||
|
len = ccllen[cclp];
|
||
|
ind = cclmap[cclp];
|
||
|
|
||
|
for (i = 0; i < len; ++i)
|
||
|
if (ccltbl[ind + i] == ch)
|
||
|
return !cclng[cclp];
|
||
|
|
||
|
return cclng[cclp];
|
||
|
}
|
||
|
|
||
|
|
||
|
/* ccladd - add a single character to a ccl */
|
||
|
|
||
|
void
|
||
|
ccladd(cclp, ch)
|
||
|
int cclp;
|
||
|
int ch;
|
||
|
{
|
||
|
int ind, len, newpos, i;
|
||
|
|
||
|
check_char(ch);
|
||
|
|
||
|
len = ccllen[cclp];
|
||
|
ind = cclmap[cclp];
|
||
|
|
||
|
/* check to see if the character is already in the ccl */
|
||
|
|
||
|
for (i = 0; i < len; ++i)
|
||
|
if (ccltbl[ind + i] == ch)
|
||
|
return;
|
||
|
|
||
|
/* mark newlines */
|
||
|
if (ch == nlch)
|
||
|
ccl_has_nl[cclp] = true;
|
||
|
|
||
|
newpos = ind + len;
|
||
|
|
||
|
if (newpos >= current_max_ccl_tbl_size) {
|
||
|
current_max_ccl_tbl_size += MAX_CCL_TBL_SIZE_INCREMENT;
|
||
|
|
||
|
++num_reallocs;
|
||
|
|
||
|
ccltbl = reallocate_Character_array(ccltbl,
|
||
|
current_max_ccl_tbl_size);
|
||
|
}
|
||
|
ccllen[cclp] = len + 1;
|
||
|
ccltbl[newpos] = ch;
|
||
|
}
|
||
|
|
||
|
/* dump_cclp - same thing as list_character_set, but for cclps. */
|
||
|
|
||
|
static void
|
||
|
dump_cclp(FILE * file, int cclp)
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
putc('[', file);
|
||
|
|
||
|
for (i = 0; i < csize; ++i) {
|
||
|
if (ccl_contains(cclp, i)) {
|
||
|
int start_char = i;
|
||
|
|
||
|
putc(' ', file);
|
||
|
|
||
|
fputs(readable_form(i), file);
|
||
|
|
||
|
while (++i < csize && ccl_contains(cclp, i));
|
||
|
|
||
|
if (i - 1 > start_char)
|
||
|
/* this was a run */
|
||
|
fprintf(file, "-%s",
|
||
|
readable_form(i - 1));
|
||
|
|
||
|
putc(' ', file);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
putc(']', file);
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/* ccl_set_diff - create a new ccl as the set difference of the two given ccls. */
|
||
|
int
|
||
|
ccl_set_diff(int a, int b)
|
||
|
{
|
||
|
int d, ch;
|
||
|
|
||
|
/* create new class */
|
||
|
d = cclinit();
|
||
|
|
||
|
/*
|
||
|
* In order to handle negation, we spin through all possible chars,
|
||
|
* addding each char in a that is not in b. (This could be O(n^2),
|
||
|
* but n is small and bounded.)
|
||
|
*/
|
||
|
for (ch = 0; ch < csize; ++ch)
|
||
|
if (ccl_contains(a, ch) && !ccl_contains(b, ch))
|
||
|
ccladd(d, ch);
|
||
|
|
||
|
/* debug */
|
||
|
if (0) {
|
||
|
fprintf(stderr, "ccl_set_diff (");
|
||
|
fprintf(stderr, "\n ");
|
||
|
dump_cclp(stderr, a);
|
||
|
fprintf(stderr, "\n ");
|
||
|
dump_cclp(stderr, b);
|
||
|
fprintf(stderr, "\n ");
|
||
|
dump_cclp(stderr, d);
|
||
|
fprintf(stderr, "\n)\n");
|
||
|
}
|
||
|
return d;
|
||
|
}
|
||
|
|
||
|
/* ccl_set_union - create a new ccl as the set union of the two given ccls. */
|
||
|
int
|
||
|
ccl_set_union(int a, int b)
|
||
|
{
|
||
|
int d, i;
|
||
|
|
||
|
/* create new class */
|
||
|
d = cclinit();
|
||
|
|
||
|
/* Add all of a */
|
||
|
for (i = 0; i < ccllen[a]; ++i)
|
||
|
ccladd(d, ccltbl[cclmap[a] + i]);
|
||
|
|
||
|
/* Add all of b */
|
||
|
for (i = 0; i < ccllen[b]; ++i)
|
||
|
ccladd(d, ccltbl[cclmap[b] + i]);
|
||
|
|
||
|
/* debug */
|
||
|
if (0) {
|
||
|
fprintf(stderr, "ccl_set_union (%d + %d = %d", a, b, d);
|
||
|
fprintf(stderr, "\n ");
|
||
|
dump_cclp(stderr, a);
|
||
|
fprintf(stderr, "\n ");
|
||
|
dump_cclp(stderr, b);
|
||
|
fprintf(stderr, "\n ");
|
||
|
dump_cclp(stderr, d);
|
||
|
fprintf(stderr, "\n)\n");
|
||
|
}
|
||
|
return d;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* cclinit - return an empty ccl */
|
||
|
|
||
|
int
|
||
|
cclinit()
|
||
|
{
|
||
|
if (++lastccl >= current_maxccls) {
|
||
|
current_maxccls += MAX_CCLS_INCREMENT;
|
||
|
|
||
|
++num_reallocs;
|
||
|
|
||
|
cclmap =
|
||
|
reallocate_integer_array(cclmap, current_maxccls);
|
||
|
ccllen =
|
||
|
reallocate_integer_array(ccllen, current_maxccls);
|
||
|
cclng = reallocate_integer_array(cclng, current_maxccls);
|
||
|
ccl_has_nl =
|
||
|
reallocate_bool_array(ccl_has_nl,
|
||
|
current_maxccls);
|
||
|
}
|
||
|
if (lastccl == 1)
|
||
|
/* we're making the first ccl */
|
||
|
cclmap[lastccl] = 0;
|
||
|
|
||
|
else
|
||
|
/*
|
||
|
* The new pointer is just past the end of the last ccl.
|
||
|
* Since the cclmap points to the \first/ character of a ccl,
|
||
|
* adding the length of the ccl to the cclmap pointer will
|
||
|
* produce a cursor to the first free space.
|
||
|
*/
|
||
|
cclmap[lastccl] =
|
||
|
cclmap[lastccl - 1] + ccllen[lastccl - 1];
|
||
|
|
||
|
ccllen[lastccl] = 0;
|
||
|
cclng[lastccl] = 0; /* ccl's start out life un-negated */
|
||
|
ccl_has_nl[lastccl] = false;
|
||
|
|
||
|
return lastccl;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* cclnegate - negate the given ccl */
|
||
|
|
||
|
void
|
||
|
cclnegate(cclp)
|
||
|
int cclp;
|
||
|
{
|
||
|
cclng[cclp] = 1;
|
||
|
ccl_has_nl[cclp] = !ccl_has_nl[cclp];
|
||
|
}
|
||
|
|
||
|
|
||
|
/* list_character_set - list the members of a set of characters in CCL form
|
||
|
*
|
||
|
* Writes to the given file a character-class representation of those
|
||
|
* characters present in the given CCL. A character is present if it
|
||
|
* has a non-zero value in the cset array.
|
||
|
*/
|
||
|
|
||
|
void
|
||
|
list_character_set(file, cset)
|
||
|
FILE *file;
|
||
|
int cset[];
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
putc('[', file);
|
||
|
|
||
|
for (i = 0; i < csize; ++i) {
|
||
|
if (cset[i]) {
|
||
|
int start_char = i;
|
||
|
|
||
|
putc(' ', file);
|
||
|
|
||
|
fputs(readable_form(i), file);
|
||
|
|
||
|
while (++i < csize && cset[i]);
|
||
|
|
||
|
if (i - 1 > start_char)
|
||
|
/* this was a run */
|
||
|
fprintf(file, "-%s",
|
||
|
readable_form(i - 1));
|
||
|
|
||
|
putc(' ', file);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
putc(']', file);
|
||
|
}
|
||
|
|
||
|
/** Determines if the range [c1-c2] is unambiguous in a case-insensitive
|
||
|
* scanner. Specifically, if a lowercase or uppercase character, x, is in the
|
||
|
* range [c1-c2], then we require that UPPERCASE(x) and LOWERCASE(x) must also
|
||
|
* be in the range. If not, then this range is ambiguous, and the function
|
||
|
* returns false. For example, [@-_] spans [a-z] but not [A-Z]. Beware that
|
||
|
* [a-z] will be labeled ambiguous because it does not include [A-Z].
|
||
|
*
|
||
|
* @param c1 the lower end of the range
|
||
|
* @param c2 the upper end of the range
|
||
|
* @return true if [c1-c2] is not ambiguous for a caseless scanner.
|
||
|
*/
|
||
|
bool
|
||
|
range_covers_case(int c1, int c2)
|
||
|
{
|
||
|
int i, o;
|
||
|
|
||
|
for (i = c1; i <= c2; i++) {
|
||
|
if (has_case(i)) {
|
||
|
o = reverse_case(i);
|
||
|
if (o < c1 || c2 < o)
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
/** Reverse the case of a character, if possible.
|
||
|
* @return c if case-reversal does not apply.
|
||
|
*/
|
||
|
int
|
||
|
reverse_case(int c)
|
||
|
{
|
||
|
return isupper(c) ? tolower(c) : (islower(c) ? toupper(c) : c);
|
||
|
}
|
||
|
|
||
|
/** Return true if c is uppercase or lowercase. */
|
||
|
bool
|
||
|
has_case(int c)
|
||
|
{
|
||
|
return (isupper(c) || islower(c)) ? true : false;
|
||
|
}
|