Exercise 6.1 - getword

Question

Our version of getword does not properly handle underscores, string constants, comments, or preprocessor control lines. Write a better version.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/*
 * keytab: the C keywords together with a running count of how often each
 * one has been seen.  binsearch() relies on the entries being in ascending
 * strcmp() order, so new entries must be inserted in sorted position.
 */
struct key {
    char *word; /* keyword text */
    int count;  /* occurrences seen so far */
} keytab[] = {
    {"auto", 0},     {"break", 0},    {"case", 0},     {"char", 0},
    {"const", 0},    {"continue", 0}, {"default", 0},  {"do", 0},
    {"double", 0},   {"else", 0},     {"enum", 0},     {"extern", 0},
    {"float", 0},    {"for", 0},      {"goto", 0},     {"if", 0},
    {"int", 0},      {"long", 0},     {"register", 0}, {"return", 0},
    {"short", 0},    {"signed", 0},   {"sizeof", 0},   {"static", 0},
    {"struct", 0},   {"switch", 0},   {"typedef", 0},  {"union", 0},
    {"unsigned", 0}, {"void", 0},     {"volatile", 0}, {"while", 0},
};

/* mygetword: read the next word or single character from input (defined below) */
int mygetword(char *, int);

/* binsearch: look up a word in a table of struct key entries (defined below) */
int binsearch(char *, struct key *, int);

/* number of entries in keytab */
#define NKEYS (sizeof(keytab) / sizeof(keytab[0]))
/* size of the word buffer that main passes to mygetword */
#define MAXWORD 100

/*
 * main: count C keywords.
 *
 * Reads tokens from standard input with mygetword() until EOF, bumps the
 * count of every token found in keytab, then prints each keyword that
 * occurred at least once, preceded by its count.  Always returns 0.
 */
int main(int argc, char *argv[]) {
    int n;
    size_t i;
    char word[MAXWORD];

    (void)argc; /* command-line arguments are not used */
    (void)argv;

    while (mygetword(word, MAXWORD) != EOF) {
        /* The cast keeps isalpha() defined for bytes above 127 when plain
         * char is signed. */
        if (isalpha((unsigned char)word[0])) {
            n = binsearch(word, keytab, (int)NKEYS);
            if (n >= 0)
                keytab[n].count++;
        }
    }
    for (i = 0; i < NKEYS; i++) {
        if (keytab[i].count > 0)
            printf("%4d %s\n", keytab[i].count, keytab[i].word);
    }
    return 0;
}

/*
 * binsearch: find word in lkeytab[0] ... lkeytab[n-1].
 *
 * The table must be sorted in ascending strcmp() order.  Returns the index
 * of the matching entry, or -1 if word is not present (including n <= 0).
 */
int binsearch(char *word, struct key lkeytab[], int n) {
    int low = 0;
    int high = n - 1;

    while (low <= high) {
        /* low + (high - low) / 2 cannot overflow, unlike (low + high) / 2 */
        int mid = low + (high - low) / 2;
        int cond = strcmp(word, lkeytab[mid].word);

        if (cond < 0)
            high = mid - 1;
        else if (cond > 0)
            low = mid + 1;
        else
            return mid;
    }
    return -1;
}

/*
 * mygetword: get the next word or character from input.
 *
 * White space and comments (both block and line style) are skipped.
 * After that, one token is consumed per call:
 *   - a run of letters, digits and underscores (an identifier, keyword or
 *     number) is copied into word, so an underscore no longer splits a name;
 *   - a string or character constant is consumed up to its closing quote
 *     (honouring backslash escapes; an unterminated constant ends at the
 *     end of the line) and word holds just the opening quote;
 *   - a preprocessor line starting with '#' is consumed to the end of the
 *     line (following backslash-newline continuations) and word holds "#";
 *   - any other character is returned by itself.
 *
 * word receives at most lim - 1 characters plus the terminating '\0';
 * an over-long token is truncated and the excess is discarded.
 *
 * Returns the first character of the token, or EOF at end of input (in
 * which case word is the empty string).  EOF is never pushed back.
 */
int mygetword(char *word, int lim) {
    int c, first, prev, getch(void);
    void ungetch(int);
    int len = 0; /* characters stored in word so far */

    /* Skip white space and comments; a comment counts as white space. */
    for (;;) {
        do {
            c = getch();
        } while (c != EOF && isspace((unsigned char)c));
        if (c != '/')
            break;
        c = getch();
        if (c == '/') { /* line comment: runs to the end of the line */
            while ((c = getch()) != EOF && c != '\n')
                ;
        } else if (c == '*') { /* block comment: runs to the first star-slash */
            prev = 0;
            while ((c = getch()) != EOF && !(prev == '*' && c == '/'))
                prev = c;
        } else { /* a lone slash, e.g. the division operator */
            if (c != EOF)
                ungetch(c);
            c = '/';
            break;
        }
        if (c == EOF) /* input ended inside a comment */
            break;
    }

    if (c == EOF) {
        if (lim > 0)
            word[0] = '\0';
        return EOF;
    }

    first = c;
    if (len < lim - 1)
        word[len++] = (char)c;

    if (isalnum((unsigned char)c) || c == '_') {
        /* identifier, keyword or number */
        while ((c = getch()) != EOF &&
               (isalnum((unsigned char)c) || c == '_')) {
            if (len < lim - 1)
                word[len++] = (char)c;
        }
        if (c != EOF)
            ungetch(c);
    } else if (c == '"' || c == '\'') {
        /* string or character constant: first holds the quote to match */
        while ((c = getch()) != EOF && c != first && c != '\n') {
            if (c == '\\' && getch() == EOF) /* skip the escaped character */
                break;
        }
    } else if (c == '#') {
        /* preprocessor line; a backslash before the newline continues it */
        prev = 0;
        while ((c = getch()) != EOF && !(c == '\n' && prev != '\\'))
            prev = c;
    }

    if (lim > 0)
        word[len] = '\0';
    return first;
}

#define BUFSIZE 100

/*
 * Push-back stack shared by getch and ungetch.  The elements are int, not
 * char, so that EOF and characters above 127 come back from getch() with
 * exactly the value that was handed to ungetch().
 */
int buf[BUFSIZE];
int bufp = 0; /* next free position in buf */

/* getch: get a (possibly pushed back) character; returns EOF at end of input */
int getch(void) { return (bufp > 0) ? buf[--bufp] : getchar(); }

/* ungetch: push c back on input; reports and drops c if the stack is full */
void ungetch(int c) {
    if (bufp >= BUFSIZE)
        printf("ungetch: too many characters\n");
    else
        buf[bufp++] = c;
}

This is the program from Section 6.3 of the book, which implements the original getword (named mgetword here).

#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define BUFSIZE 100 /* capacity of the push-back buffer */
#define MAXWORD 100 /* size of the word buffer used by main */
#define NKEYS (sizeof keytab / sizeof(struct key)) /* entries in keytab */

/*
 * Buffer for ungetch.  The elements are int, not char, so that EOF and
 * characters above 127 keep their exact value while they are stored.
 */
int buf[BUFSIZE];
int bufp = 0;               /* next free position in buf */

/* Figure out why printf is a special case */
/*
 * Keyword table: each entry pairs a word with the number of times it has
 * been seen.
 * NOTE(review): every entry except the trailing "printf" is in ascending
 * strcmp() order; binsearch() depends on that order.
 */
struct key {
    char *word;
    int count;
} keytab[] = {
    {"auto", 0},     {"break", 0},   {"case", 0},
    {"char", 0},     {"const", 0},   {"continue", 0},
    {"default", 0},  {"unsigned", 0}, {"void", 0},
    {"volatile", 0}, {"while", 0},   {"printf", 0},
};

/* binsearch: look up a word in a table of struct key entries */
int binsearch(char *, struct key *, int);
/* getch: get a (possibly pushed back) character */
int getch(void);
/* mgetword: get next word or character from input */
int mgetword(char *, int);
/* ungetch: push a character back on input */
void ungetch(int c);

/* getch: hand back the most recently pushed-back character if there is
 * one, otherwise read the next character from standard input */
int getch(void)
{
    if (bufp > 0)
        return buf[--bufp];
    return getchar();
}

/* ungetch: store c in the push-back buffer so the next getch() returns it;
 * when the buffer is already full, c is dropped and a message is printed */
void ungetch(int c)
{
    if (bufp < BUFSIZE) {
        buf[bufp++] = c;
        return;
    }
    printf("ungetch: too many characters \n");
}

/*
 * binsearch: find word in tab[0]...tab[n-1].
 *
 * tab must be sorted in ascending strcmp() order; an entry that is out of
 * order may not be found.  Returns the index of the match, or -1 if word
 * is not found (including n <= 0).
 */
int binsearch(char *word, struct key tab[], int n) {
    int low = 0;
    int high = n - 1;

    while (low <= high) {
        /* written this way so the midpoint cannot overflow int */
        int mid = low + (high - low) / 2;
        int cond = strcmp(word, tab[mid].word);

        if (cond < 0)
            high = mid - 1;
        else if (cond > 0)
            low = mid + 1;
        else
            return mid;
    }
    return -1;
}

/*
 * mgetword: get next word or character from input.
 *
 * Skips leading white space.  If the next character is a letter, it and
 * the letters and digits that follow are stored in word; otherwise word
 * holds just that one character.  word is always '\0'-terminated and never
 * receives more than lim - 1 characters; when a word is longer than that,
 * the rest is left in the input.  lim must be at least 2.
 *
 * Returns the first character of the word, the character itself for a
 * non-letter, or EOF at end of input (word is then the empty string).
 */
int mgetword(char *word, int lim) {
    int c;
    char *w = word;

    /* The casts keep the <ctype.h> calls defined for bytes above 127. */
    do {
        c = getch();
    } while (c != EOF && isspace((unsigned char)c));

    if (c != EOF)
        *w++ = (char)c;
    if (c == EOF || !isalpha((unsigned char)c)) {
        *w = '\0';
        return c;
    }

    for (;;) {
        if (w - word >= lim - 1) /* full: keep one slot for the '\0' */
            break;
        c = getch();
        if (c == EOF) /* never push EOF back */
            break;
        if (!isalnum((unsigned char)c)) {
            ungetch(c);
            break;
        }
        *w++ = (char)c;
    }
    *w = '\0';
    return word[0];
}

/*
 * main: count C keywords.
 *
 * Reads tokens with mgetword() until EOF, increments the count of each
 * token that binsearch() finds in keytab, then prints every entry whose
 * count is non-zero, preceded by that count.  Always returns 0.
 */
int main(int argc, char *argv[]) {
    int n;
    size_t i;
    char word[MAXWORD];

    (void)argc; /* command-line arguments are not used */
    (void)argv;

    while (mgetword(word, MAXWORD) != EOF) {
        /* The cast keeps isalpha() defined when plain char is signed and
         * the input contains bytes above 127. */
        if (isalpha((unsigned char)word[0])) {
            n = binsearch(word, keytab, (int)NKEYS);
            if (n >= 0)
                keytab[n].count++;
        }
    }
    for (i = 0; i < NKEYS; i++) {
        if (keytab[i].count > 0)
            printf("%4d %s\n", keytab[i].count, keytab[i].word);
    }
    return 0;
}

Explanation

This program counts the C keywords in its input and prints each keyword that occurs at least once, preceded by its count.

$ ./ex_6.1_getword
this is a short sentence.
   1 short