Exercise 6.2 - Identical Variables

Question

Write a program that reads a C program and prints in alphabetical order each group of variable names that are identical in the first 6 characters, but different somewhere thereafter. Don’t count words within strings and comments. Make 6 a parameter that can be set from the command line.

/*
 * Exercise 6-2 in K&R2 on page 143.  Original By: Barrett Drawdy
 * Write a program that reads a C program and prints in alphabetical
 * order each group of variable names that are identical in the first
 * 6 characters, but different somewhere thereafter.  Don't count words
 * within strings and comments.  Make 6 a parameter that can be set
 * from the command line.
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXWORD 1000       /* longest word that can be read by getword */
#define DEFAULT_COMP_LEN 6 /* default length to compare */

/*
 * tnode: Tree node from K&R2 page 140.  Words are initially read into
 * the tree by getword.
 */
struct tnode {
    char *word;
    int count;
    struct tnode *left;
    struct tnode *right;
};

/*
 * simroot: Part of a linked list of pointers to simword lists with
 * a common root.
 */
struct simroot {
    struct simword *firstword; /* points to the list of words */
    struct simroot *nextroot;  /* points to the next node in the list */
};

/*
 * simword: Part of a linked list of words with a common root.  Points
 * to the word in the tnodes.
 */
struct simword {
    char *word;               /* points to the word in the tree */
    int count;                /* copied from the tree */
    struct simword *nextword; /* next node */
};

struct tnode *addtree(struct tnode *, char *);

void treeprint(const struct tnode *);

int mgetword(char *, int);

struct simroot *addroot(struct simroot *, struct tnode *, int);

struct simroot *parse(struct tnode *, int);

void printlist(struct simroot *, int);

void printwords(struct simword *);

int main(int argc, char *argv[]) {
    struct tnode *root;
    char word[MAXWORD];
    struct simroot *listroot;
    int len;

    /* get all the words */
    root = NULL;
    while (mgetword(word, MAXWORD) != 'x')
        if (isalpha(word[0]))
            root = addtree(root, word);

    if (argc == 1)
        len = DEFAULT_COMP_LEN;
    else if (argc == 2)
        len = atoi(argv[1]);
    else {
        printf("Incorrect number of arguments.\n");
        return 1;
    }

    /* parse the tree to find similar words and populate list */
    listroot = NULL;
    listroot = parse(root, len);
    treeprint(root); /* prints all the words */
    printf("\nDuplicate list:\n\n");
    printlist(listroot, len); /* prints the similar words */

    return 0;
} /* end of main() */

/*
 * parse: Reads the tree created by getword.  It finds words that are
 * similar in the first len letters and places them in the list structure
 * that it creates.
 */
struct simroot *parse(struct tnode *node, int len) {
    struct tnode *temp; /* pointer to the children of the node */
    int this_len;       /* length of the word in the current node */
    static struct simroot *root = NULL; /* points to the root of the tree */

    if (node == NULL)
        return NULL;

    this_len = strlen(node->word);

    parse(node->left, len); /* start in the left subtree */

    temp = node->left; /* find the closest left child and compare */
    if (temp != NULL) {
        while (temp->right != NULL)
            temp = temp->right;
        /* if the word matches, put both words in the list */
        if (this_len >= len && strncmp(temp->word, node->word, len) == 0) {
            root = addroot(root, temp, len);
            addroot(root, node, len);
        }
    }

    temp = node->right; /* find the closest right child and compare */
    if (temp != NULL) {
        while (temp->left != NULL)
            temp = temp->left;
        /* if the word matches, put both words in the list */
        if (this_len >= len && strncmp(temp->word, node->word, len) == 0) {
            root = addroot(root, node, len);
            addroot(root, temp, len);
        }
    }

    parse(node->right, len); /* continue on to right subtree */

    return root;
}

/*
 * printlist: Prints the roots that were similar, followed by the list
 * of words.
 */
void printlist(struct simroot *p, int len) {
    int i;
    if (p == NULL)
        return;
    for (i = 0; i < len; ++i) /* print the root */
        putchar(p->firstword->word[i]);
    putchar('\n');
    printwords(p->firstword);    /* print the list of words */
    printlist(p->nextroot, len); /* print the next root/list */
}

/*
 * printword: Prints the list of words with the same roots.
 */
void printwords(struct simword *p) {
    printf("  %4d %s\n", p->count, p->word);
    if (p->nextword != NULL)
        printwords(p->nextword);
}

struct tnode *talloc(void);

char *mstrdup(char *);

struct simword *walloc(struct simword *, struct tnode *);

void addword(struct simword *, struct tnode *);

/*
 * addroot: When a node n is passed to addroot, n->word is compared to the
 * first word in ps list.  If they have are the same for the first 'len'
 * characters, then the word is added to that list.  Otherwise, it is passed
 * along the simroots until it reaches the end, where a new simroot is
 * created if the word has a new root.
 */
struct simroot *addroot(struct simroot *p, struct tnode *n, int len) {
    /* end of list, create a new root */
    if (p == NULL) {
        p = (struct simroot *) malloc(sizeof(struct simroot));
        p->nextroot = NULL;
        p->firstword = walloc(p->firstword, n);
    }
        /* word belongs to this list, add it */
    else if (strncmp(p->firstword->word, n->word, len) == 0)
        addword(p->firstword, n);
        /* haven't found the right root or end yet */
    else
        p->nextroot = addroot(p->nextroot, n, len);
    return p;
}

/*
 * addword: Compares the word from n to the word in p.  If they are the same,
 * the word was already added and it returns.  This continues down the list
 * until the end, where a new node is created if the word is new.
 */
void addword(struct simword *p, struct tnode *n) {
    /* word was already added */
    if (strcmp(p->word, n->word) == 0)
        return;
    /* end of list. create a new node */
    if (p->nextword == NULL)
        p->nextword = walloc(p->nextword, n);
        /* haven't reached the end yet */
    else
        addword(p->nextword, n);
}

/* walloc: Creates a new simword node. */
struct simword *walloc(struct simword *p, struct tnode *n) {
    p = (struct simword *) malloc(sizeof(struct simword));
    if (p != NULL) {
        p->word = n->word;
        p->count = n->count;
        p->nextword = NULL;
    }
    return p;
}

/* mgetword from Ex6.1 */

#define IN 1
#define OUT 0

int mgetword(char *word, int lim) {
    int c, d, getch(void), comment, string, directive;
    void ungetch(int);
    char *w = word;

    comment = string = directive = OUT;

    while (isspace(c = getch()));

    /* Check if inside a comment */

    if (c == '/') {
        if ((d = getch()) == '*') {
            comment = IN;
        } else {
            comment = OUT;
            ungetch(d);
        }
    }

    /* Check if inside a quote */

    if (c == '\"') {
        string = IN;
    }

    /* Check if inside a directive */

    if (c == '#') {
        directive = IN;
    }

    if (c == '\\') {
        c = getch(); /* ignore the \\ character */
    }

    if (comment == OUT && string == OUT && directive == OUT) {

        if (c != EOF)
            *w++ = c;

        if (!isalnum(c) && c != '_') {
            *w = '\0';
            return c;
        }

        for (; --lim > 0; w++) {
            *w = getch();
            if (!isalnum(*w) && *w != '_') {
                ungetch(*w);
                break;
            }
        }
        *w = '\0';
        return word[0];
    } else if (comment == IN) {
        *w++ = c;
        *w++ = d;

        while ((*w++ = c = getch())) {
            if (c == '*') {
                if ((c = getch()) == '/') {
                    *w++ = c;
                    comment = OUT;
                    break;
                } else {
                    ungetch(c);
                }
            }
        }
        *w = '\0';

    } else if (string == IN) {
        *w++ = c;
        while ((*w++ = getch()) != '\"') {
            if (*w == '\\') /* Take care of escaped quotes */
                *w++ = getch();
        }
        string = OUT;
        *w = '\0';
    } else if (directive == IN) {
        *w++ = c;
        while ((*w++ = getch()) != '\n') {
            if (c == '\\') { /* Take care of continuation line escape */
                *w++ = getch();
            }
        }
        directive = OUT;
        *w = '\0';
    }

    return c;
}

/***************************************************************************
 *                    All code below here is from K&R2.                    *
 ***************************************************************************/

/*
 * addtree: From K&R2 page 141.
 * Adds a node containing w, at or below node p.
 */
struct tnode *addtree(struct tnode *p, char *w) {
    int cond;

    if (p == NULL) { /* new word */
        p = talloc();
        p->word = mstrdup(w);
        p->count = 1;
        p->left = p->right = NULL;
    } else if ((cond = strcmp(w, p->word)) == 0)
        p->count++;
    else if (cond < 0)
        p->left = addtree(p->left, w);
    else
        p->right = addtree(p->right, w);
    return p;
}

/* treeprint: From K&R2 page 142. Prints tree p in-order. */
void treeprint(const struct tnode *p) {
    if (p != NULL) {
        treeprint(p->left);
        printf("%4d %s\n", p->count, p->word);
        treeprint(p->right);
    }
}

/* talloc: From K&R2 page 142. Makes a tnode. */
struct tnode *talloc(void) {
    return (struct tnode *) malloc(sizeof(struct tnode));
}

/* strdup: From K&R2 page 143. Makes a duplicate of s. */
char *mstrdup(char *s) {
    char *p;
    p = (char *) malloc(strlen(s) + 1);
    if (p != NULL)
        strcpy(p, s);
    return p;
}

/*
 * getch and ungetch are from K&R2, page 79
 */
#define BUFSIZE 100

char buf[BUFSIZE]; /* buffer for ungetch() */
int bufp = 0;      /* next free position in buf */

int getch(void) { /* get a (possibly pushed back) character */
    return (bufp > 0) ? buf[--bufp] : getchar();
}

void ungetch(int c) { /* push character back on input */
    if (bufp >= BUFSIZE)
        printf("ungetch: too many characters\n");
    else
        buf[bufp++] = c;
    return;
}

Explanation

This program reads a C program and groups similar list of variable names as similar words list. It parses the C program and stores the variables names in a binary tree, then constructs a similar word list which have a common prefix length.