cosmopolitan/examples/ispell.c

#if 0
/*─────────────────────────────────────────────────────────────────╗
│ To the extent possible under law, Justine Tunney has waived      │
│ all copyright and related or neighboring rights to this file,    │
│ as it is written in the following disclaimers:                   │
│   • http://unlicense.org/                                        │
│   • http://creativecommons.org/publicdomain/zero/1.0/            │
╚─────────────────────────────────────────────────────────────────*/
#endif
#include "libc/alg/alg.h"
#include "libc/alg/arraylist.h"
#include "libc/bits/bits.h"
#include "libc/calls/calls.h"
#include "libc/conv/conv.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/macros.h"
#include "libc/mem/mem.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/str/tpdecode.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/x/x.h"

/* NOTE(review): presumably forces the linker to keep the runtime support
   that lets fopen() resolve the "zip:" path used by LoadWords() —
   confirm against the STATIC_YOINK / zip_uri_support definitions */
STATIC_YOINK("zip_uri_support");

/**
 * @fileoverview Simple Interactive Spell Checker.
 *
 * This is an attempt to get spell checking to work in Emacs across
 * platforms. While the computer science behind spell checking is very
 * simple, unfortunately Emacs probes all these System Five spell progs
 * similar to how websites have been known to probe User-Agent strings.
 *
 * Here's how we believe the repl works:
 *
 *     $ make -j8 o//examples/ispell.com
 *     $ o//examples/ispell.com
 *     @(#) Cosmopolitan Ispell Version 3.4.00 8 Feb 2015
 *     word: hello
 *     ok
 *     word: héllo
 *     how about: hills, hello, hilly, jello
 *     word: lova
 *     how about: diva, dona, dora, dove, elva, fora, hove, iota
 *     word: hecruhecrue
 *     not found
 *
 * The dictionary for this program is embedded as a text file within the
 * zip structure of the binary. It can be edited after distribution.
 *
 * It's possible to go even fancier than what this code is doing, by
 * using cmudict phonemes, bloom filters, unicode tables e.g. e vs. é,
 * and even doing ML similar to Google's online spell checker.
 *
 * TODO: Figure out why Emacs rejects this interface.
 */

/* cost WordDistance() adds for each position where only one of the two
   words still has a character left */
#define MISSING_LETTER_DISTANCE  5
/* exclusive upper bound: ConsiderWord() keeps a word only if its
   distance to the query is strictly less than this */
#define MAX_NEARBY_WORD_DISTANCE 6
/* most suggestions SpellChecker() prints for one query */
#define MAX_NEARBY_RESULTS       8

/* dictionary stream; opened and closed inside LoadWords() */
FILE *f;
/* getline() buffer shared by LoadWords() and SpellChecker() */
char *line;
/* size bookkeeping that getline() keeps for `line` */
size_t linesize;
/* word currently being checked; set by SpellChecker() and read by
   ConsiderWord() */
const char *query;
struct critbit0 words; /* does O(log 𝑛) fast path lookup */

/**
 * List of suggestion candidates for the current query.
 *
 * Entries are added by ConsiderWord() through append(), the list is
 * reset and sorted by FindNearbyWords(), and SpellChecker() prints
 * entries from the front.
 */
struct NearbyWords {
  /* i counts the entries in use (reset to 0 before each search); n is
     presumably the capacity tracked by append() — confirm in
     libc/alg/arraylist.h */
  size_t i, n;
  struct WordMatch {
    /* WordDistance() between this word and the query */
    long dist;
    /* the dictionary word as it was handed to ConsiderWord() */
    char *word;
  } * p;
} nearby;

/**
 * Scores how different two NUL-terminated UTF-8 words are.
 *
 * Both strings are walked in lockstep, one decoded character at a time.
 * A position where only one word still has a character costs
 * MISSING_LETTER_DISTANCE. A position where hamming() reports a
 * difference costs that value plus the number of such mismatches seen
 * earlier, so every further mismatch is more expensive than the last.
 *
 * @param a is the first word
 * @param b is the second word
 * @return accumulated distance; zero when no position added any cost
 */
long WordDistance(const char *a, const char *b) {
  long total;
  int lena, lenb;
  wint_t ca, cb;
  unsigned long bits, mismatches;
  total = 0;
  mismatches = 0;
  for (;;) {
    /* tpdecode() parses one multibyte character; abs() keeps the
       step positive when it reports -1 */
    lena = abs(tpdecode(a, &ca));
    lenb = abs(tpdecode(b, &cb));
    if (ca && cb) {
      bits = hamming(ca, cb);
      if (bits) {
        total += bits + mismatches;
        ++mismatches;
      }
    } else if (ca || cb) {
      total += MISSING_LETTER_DISTANCE;
    } else {
      break; /* both words are exhausted */
    }
    /* a word that already ended stays parked on its terminator */
    if (ca) a += lena;
    if (cb) b += lenb;
  }
  return total;
}

intptr_t ConsiderWord(const char *word, void *arg) {
  long dist;
  if ((dist = WordDistance(word, query)) < MAX_NEARBY_WORD_DISTANCE) {
    append(&nearby, &((struct WordMatch){dist, word}));
  }
  return 0;
}

/**
 * Orders two matches for qsort() so smaller distances come first.
 *
 * qsort() requires a three-way result (negative, zero, positive).
 * Returning only 0 or 1 gives it an inconsistent ordering, and since
 * SpellChecker() prints just the front of the sorted list, the closest
 * matches have to sort to the front.
 *
 * @param a is the left-hand match
 * @param b is the right-hand match
 * @return negative if a is closer than b, positive if farther, else 0
 */
int CompareWords(const struct WordMatch *a, const struct WordMatch *b) {
  /* branch-free three-way compare; avoids overflow of a->dist - b->dist */
  return (a->dist > b->dist) - (a->dist < b->dist);
}

/**
 * Rebuilds the `nearby` candidate list for the current `query`.
 *
 * Clears the previous results, runs ConsiderWord() over the dictionary
 * by asking for every entry with the empty prefix, then sorts what was
 * collected using CompareWords().
 */
void FindNearbyWords(void) {
  /* drop the previous query's entries; storage in nearby.p is kept */
  nearby.i = 0;
  critbit0_allprefixed(&words, "", ConsiderWord, NULL);
  /* cast adapts the typed comparator to qsort()'s void-pointer one */
  qsort(nearby.p, nearby.i, sizeof(*nearby.p), (void *)CompareWords);
}

/**
 * Runs the interactive prompt loop on stdin and stdout.
 *
 * Prints the version banner, then for each line read: lowercases and
 * chomps it into `query`, answers "ok" if the dictionary contains it,
 * otherwise prints up to MAX_NEARBY_RESULTS entries from the front of
 * the `nearby` list, or "not found" when that list is empty. Every
 * prompt round ends with a blank CRLF line. Both streams are closed
 * once input stops.
 */
void SpellChecker(void) {
  size_t i, shown;
  printf("@(#) Cosmopolitan Ispell Version 3.4.00 8 Feb 2015\r\n");
  /* feof() alone stays false after a read error, so ferror() is checked
     too; otherwise a failing stdin would be prompted forever */
  while (!feof(stdin) && !ferror(stdin)) {
    printf("word: ");
    fflush(stdout); /* prompt has no newline, so push it out now */
    if (getline(&line, &linesize, stdin) > 0) {
      query = strtolower(chomp(line));
      if (critbit0_contains(&words, query)) {
        printf("ok\r\n");
      } else {
        FindNearbyWords();
        if (nearby.i) {
          shown = MIN(MAX_NEARBY_RESULTS, nearby.i);
          printf("how about: ");
          for (i = 0; i < shown; ++i) {
            if (i) printf(", ");
            fputs(nearby.p[i].word, stdout);
          }
          printf("\r\n");
        } else {
          printf("not found\r\n");
        }
      }
    }
    printf("\r\n");
  }
  CHECK_NE(-1, fclose(stdin));
  CHECK_NE(-1, fclose(stdout));
}

/**
 * Fills the `words` set from the dictionary file.
 *
 * Opens "zip:usr/share/dict/words", inserts each line after chomping
 * and lowercasing it, and closes the stream. Failure to open or close
 * the file is caught by the CHECK macros.
 */
void LoadWords(void) {
  f = fopen("zip:usr/share/dict/words", "r");
  CHECK_NOTNULL(f);
  for (;;) {
    if (getline(&line, &linesize, f) <= 0) break;
    critbit0_insert(&words, strtolower(chomp(line)));
  }
  CHECK_NE(-1, fclose(f));
}

/**
 * Program entry point: loads the dictionary, then runs the prompt loop.
 *
 * @param argc is unused
 * @param argv is unused
 * @return 0 once SpellChecker() returns
 */
int main(int argc, char *argv[]) {
  /* NOTE(review): presumably installs crash-report handlers — confirm
     in libc/log/log.h */
  showcrashreports();
  LoadWords();
  SpellChecker();
  return 0;
}