diff --git a/examples/examples.mk b/examples/examples.mk
index 2cd6e636..e510698d 100644
--- a/examples/examples.mk
+++ b/examples/examples.mk
@@ -114,6 +114,18 @@ o/$(MODE)/examples/hellojs.com.dbg: \
 		$(APE)
 	@$(APELINK)
 
+o/$(MODE)/examples/ispell.com.dbg:				\
+		$(EXAMPLES_DEPS)				\
+		o/$(MODE)/examples/ispell.o			\
+		o/$(MODE)/usr/share/dict/words.zip.o		\
+		o/$(MODE)/examples/examples.pkg			\
+		$(CRT)						\
+		$(APE)
+	@$(APELINK)
+
+o/$(MODE)/usr/share/dict/words: usr/share/dict/words.gz
+	$(GZ) $(ZFLAGS) -d <$< >$@
+
 .PHONY: o/$(MODE)/examples
 o/$(MODE)/examples:						\
 		o/$(MODE)/examples/package			\
diff --git a/examples/ispell.c b/examples/ispell.c
new file mode 100644
index 00000000..816b83cf
--- /dev/null
+++ b/examples/ispell.c
@@ -0,0 +1,187 @@
+#if 0
+/*─────────────────────────────────────────────────────────────────╗
+│ To the extent possible under law, Justine Tunney has waived      │
+│ all copyright and related or neighboring rights to this file,    │
+│ as it is written in the following disclaimers:                   │
+│   • http://unlicense.org/                                        │
+│   • http://creativecommons.org/publicdomain/zero/1.0/            │
+╚─────────────────────────────────────────────────────────────────*/
+#endif
+#include "libc/alg/alg.h"
+#include "libc/alg/arraylist.h"
+#include "libc/bits/bits.h"
+#include "libc/calls/calls.h"
+#include "libc/conv/conv.h"
+#include "libc/log/check.h"
+#include "libc/log/log.h"
+#include "libc/macros.h"
+#include "libc/mem/mem.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+#include "libc/str/tpdecode.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/x/x.h"
+
+/**
+ * @fileoverview Simple Interactive Spell Checker.
+ *
+ * This is an attempt to get spell checking to work in Emacs across
+ * platforms. While the computer science behind spell checking is very
+ * simple, unfortunately Emacs probes all these System Five spell progs
+ * similar to how websites have been known to probe User-Agent strings.
+ *
+ * Here's how we believe the repl works:
+ *
+ *     $ make -j8 o//examples/ispell.com
+ *     $ o//examples/ispell.com
+ *     @(#) Cosmopolitan Ispell Version 3.4.00 8 Feb 2015
+ *     word: hello
+ *     ok
+ *     word: héllo
+ *     how about: hills, hello, hilly, jello
+ *     word: lova
+ *     how about: diva, dona, dora, dove, elva, fora, hove, iota
+ *     word: hecruhecrue
+ *     not found
+ *
+ * The dictionary for this program is embedded as a text file within the
+ * zip structure of the binary. It can be edited after distribution.
+ *
+ * It's possible to go even fancier than what this code is doing, by
+ * using cmudict phonemes, bloom filters, unicode tables e.g. e vs. é,
+ * and even doing ML similar to Google's online spell checker.
+ *
+ * TODO: Figure out why Emacs rejects this interface.
+ */
+
+#define MISSING_LETTER_DISTANCE 5
+#define MAX_NEARBY_WORD_DISTANCE 6
+#define MAX_NEARBY_RESULTS 8
+
+FILE *f;
+char *line;
+size_t linesize;
+const char *query;
+struct critbit0 words; /* does O(log 𝑛) fast path lookup */
+
+struct NearbyWords {
+  size_t i, n;
+  struct WordMatch {
+    long dist;
+    const char *word;
+  } * p;
+} nearby;
+
+/**
+ * Scores how different two UTF-8 strings are; lower means closer.
+ *
+ * Code points are compared pairwise by position. Each differing pair
+ * costs its bit-level hamming distance plus a penalty that grows with
+ * every earlier mismatch. Once one string runs out, each leftover code
+ * point in the other costs MISSING_LETTER_DISTANCE.
+ */
+long WordDistance(const char *a, const char *b) {
+  long dist;
+  int gota, gotb;
+  unsigned long h, p;
+  wint_t chara, charb;
+  dist = p = 0;
+  for (;;) {
+    gota = abs(tpdecode(a, &chara)); /* parses utf-8 multibyte characters */
+    gotb = abs(tpdecode(b, &charb)); /* abs() handles -1, always yields � */
+    if (!chara && !charb) break;
+    if (!chara || !charb) {
+      dist += MISSING_LETTER_DISTANCE;
+    } else if ((h = hamming(chara, charb))) {
+      dist += h + p++; /* penalize multiple mismatched letters */
+    }
+    if (chara) a += gota;
+    if (charb) b += gotb;
+  }
+  return dist;
+}
+
+/**
+ * Appends word to nearby if its distance from query is small enough.
+ *
+ * @param arg is unused
+ * @return 0 always
+ */
+intptr_t ConsiderWord(const char *word, void *arg) {
+  long dist;
+  if ((dist = WordDistance(word, query)) < MAX_NEARBY_WORD_DISTANCE) {
+    append(&nearby, &((struct WordMatch){dist, word}));
+  }
+  return 0;
+}
+
+/**
+ * Orders matches by ascending distance so qsort() puts the closest first.
+ *
+ * @return negative, zero, or positive, as qsort() comparators must
+ */
+int CompareWords(const struct WordMatch *a, const struct WordMatch *b) {
+  return (a->dist > b->dist) - (a->dist < b->dist);
+}
+
+/**
+ * Resets nearby, offers every entry of words to ConsiderWord() using an
+ * empty prefix, then sorts what was collected with CompareWords().
+ */
+void FindNearbyWords(void) {
+  nearby.i = 0;
+  critbit0_allprefixed(&words, "", ConsiderWord, NULL);
+  qsort(nearby.p, nearby.i, sizeof(struct WordMatch), (void *)CompareWords);
+}
+
+/**
+ * Runs the prompt loop until stdin reaches end of file or fails.
+ */
+void SpellChecker(void) {
+  size_t i;
+  printf("@(#) Cosmopolitan Ispell Version 3.4.00 8 Feb 2015\r\n");
+  while (!feof(stdin) && !ferror(stdin)) { /* stop on eof or error */
+    printf("word: ");
+    fflush(stdout);
+    if (getline(&line, &linesize, stdin) > 0) {
+      query = strtolower(chomp(line));
+      if (critbit0_contains(&words, query)) {
+        printf("ok\r\n");
+      } else {
+        FindNearbyWords();
+        if (nearby.i) {
+          printf("how about: ");
+          for (i = 0; i < MIN(MAX_NEARBY_RESULTS, nearby.i); ++i) {
+            if (i) printf(", ");
+            fputs(nearby.p[i].word, stdout);
+          }
+          printf("\r\n");
+        } else {
+          printf("not found\r\n");
+        }
+      }
+    }
+    printf("\r\n");
+  }
+  CHECK_NE(-1, fclose(stdin));
+  CHECK_NE(-1, fclose(stdout));
+}
+
+/**
+ * Reads zip:usr/share/dict/words line by line, inserting each chomped
+ * and lowercased line into words.
+ */
+void LoadWords(void) {
+  CHECK_NOTNULL((f = fopen("zip:usr/share/dict/words", "r")));
+  while (getline(&line, &linesize, f) > 0) {
+    critbit0_insert(&words, strtolower(chomp(line)));
+  }
+  CHECK_NE(-1, fclose(f));
+}
+
+int main(int argc, char *argv[]) {
+  showcrashreports();
+  LoadWords();
+  SpellChecker();
+  return 0;
+}
diff --git a/libc/bits/bits.h b/libc/bits/bits.h
index 68e01224..3a550e5c 100644
--- a/libc/bits/bits.h
+++ b/libc/bits/bits.h
@@ -36,6 +36,7 @@ uint64_t bitreverse64(uint64_t) libcesque pureconst;
 unsigned long roundup2pow(unsigned long) libcesque pureconst;
 unsigned long roundup2log(unsigned long) libcesque pureconst;
 unsigned long rounddown2pow(unsigned long) libcesque pureconst;
+unsigned long hamming(unsigned long, unsigned long) pureconst;
 
 /*───────────────────────────────────────────────────────────────────────────│─╗
 │ cosmopolitan § bits » no assembly required                               ─╬─│┼
diff --git a/libc/bits/hamming.c b/libc/bits/hamming.c
index 5f74b472..38beb33a 100644
--- a/libc/bits/hamming.c
+++ b/libc/bits/hamming.c
@@ -19,6 +19,9 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/bits/bits.h"
 
-unsigned long(hamming)(unsigned long x, unsigned long y) {
+/**
+ * Counts number of different bits.
+ */
+unsigned long hamming(unsigned long x, unsigned long y) {
   return popcount(x ^ y);
 }
diff --git a/test/libc/str/tpdecode_test.c b/test/libc/str/tpdecode_test.c
index 11c360eb..deb82cb6 100644
--- a/test/libc/str/tpdecode_test.c
+++ b/test/libc/str/tpdecode_test.c
@@ -30,6 +30,12 @@
 
 wint_t wc;
 
+TEST(tpdecode, testEmptyString_consumesNulTerminator) {
+  wc = 123;
+  EXPECT_EQ(1, tpdecode("", &wc));
+  EXPECT_EQ(0, wc);
+}
+
 TEST(tpdecode, testGlyph) {
   EXPECT_EQ(u'→', PROGN(ASSERT_EQ(3, tpdecode("→", &wc)), wc));
   EXPECT_EQ(L'𐌰', PROGN(ASSERT_EQ(4, tpdecode("𐌰𐌱𐌲𐌳", &wc)), wc));
diff --git a/usr/share/dict/words.gz b/usr/share/dict/words.gz
new file mode 100644
index 00000000..df125841
Binary files /dev/null and b/usr/share/dict/words.gz differ