/*

 * word_cnt.c

 * This program reads a file and counts the words in it.

 */

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

#include "tree.h"

 

#define MAX_WORD  200

 

/* A structure for storing a word along with its number of occurrences: */

struct _WordCount

{

  char  *word;

  int   count;

};

 

typedef struct _WordCount WordCount;

 

/* A comparison function for two WordCount pointers: */

int comp_word (const void *p1, const void *p2)

{

  const WordCount   *p_wc1;

  const WordCount   *p_wc2;

 

  /* Convert pointers and compare the words they contain. */

  p_wc1 = ((const WordCount *)p1);

  p_wc2 = ((const WordCount *)p2);

 

  return (strcmp (p_wc2->word, p_wc1->word));

}

 

/* Enumeration for a state: */

enum _State

{

  OUTSIDE_WORD,

  INSIDE_WORD

};

 

typedef enum _State State;

 

/* Prototypes: */

static void update_word (char *word, Tree *p_tree);

 

/* ------------------------------------------------------------------------

 * The main:

 */

int main (int argc, char **argv)

{

  const char *delims = " \t\n\r`~!@#$%^&*()-=+|\\[]{};:\",.<>/?";

                                   /* All characters that may be delimiters. */

  FILE       *p_file;              /* The input file. */

  char       word [MAX_WORD + 1];  /* The current word. */

  int        l_word;               /* Its length. */

  State      state = OUTSIDE_WORD; /* Are we inside or outside a word. */

  int        c;                    /* The current character read. */

  int        is_delim;             /* Is this character a delimiter. */

  Tree       *p_tree;              /* The tree containing the word count. */

  const Node *p_node;              /* The current tree node we scan. */

  int        n_total;              /* Total occurrences of all words. */

 

  /* Open the input file. */

  if (argc < 2)

  {

    printf ("Usage: %s <input file>\n", argv[0]);

    return (1);

  }

 

  p_file = fopen (argv[1], "r");

 

  if (p_file == NULL)

  {

    printf ("Failed to open '%s'.\n", argv[1]);

    return (1);

  }

 

  /* Initialize the tree. */

  p_tree = tree_init (comp_word, 1);

 

  /* Read the input file, character by character. */

  while ((c = fgetc (p_file)) != EOF)

  {

    /* Check whether the current character is a delimiter. */

    is_delim = (strchr (delims, c) != NULL);

 

    /* Act according to the current state. */

    if (state == OUTSIDE_WORD)

    {

      /* We are outside a word: Ignore any delimiters, but pay attention

         to the case that a new word has just started. */

      if (! is_delim)

      {

        word[0] = c;

        l_word = 1;

        state = INSIDE_WORD;

      }

    }

    else

    {

      /* We are inside a word. */

      if (! is_delim)

      {

        /* Current character is not a delimiter - append it to the word. */

        word[l_word] = c;

        l_word++;

      }

      else

      {

        /* We reached the end of the word. */

        state = OUTSIDE_WORD;

 

        /* Create a new WordCount object and update the tree. */

        if (l_word > 0)

        {

          word[l_word] = '\0';

          update_word (word, p_tree);

        }

      }

    }

  }

 

  /* Close the input file. */

  fclose (p_file);

 

  /* Go over all words in the tree and print their number of occurrences. */

  n_total = 0;

 

  for (p_node = tree_min (p_tree);

       p_node != NULL;

       p_node = tree_next (p_node))

  {

    WordCount  *p_wc = (WordCount *)(p_node->p_object);

 

    printf ("%s  :  %d\n", p_wc->word, p_wc->count);

    n_total += p_wc->count;

  }

 

  /* Print the summary. */

  printf ("\n**************** Summary: ****************\n");

  printf ("Found %d unique words,\nwith %d total occurences.\n",

        p_tree->n_objects, n_total);

  printf ("******************************************\n");

 

  /* Clear the tree and free memory. */

  tree_reset (p_tree);

  free (p_tree);

 

  return (0);

}

 

/* ------------------------------------------------------------------------

 * Function: update_word

 * Purpose:  Update the number of occurrences of the word.

 * Input:    word   - The word to update.

 * In/Out:   p_tree - The binary tree of WordCount objects.

 * Returns:  Nothing.

 */

static void update_word (char *word, Tree *p_tree)

{

  int         l_word;          /* The length of the word. */

  WordCount   wc_query;        /* An object used to query the tree. */

  const Node  *p_node;         /* The node that contains the desired word. */

  int         i;

 

  /* Change the word to lower case, in order to be case insensitive. */

  l_word = strlen (word);

 

  for (i = 0; i < l_word; i++)

    word[i] = tolower (word[i]);

 

  /* Create a temporary WordCount object and use it to query the tree

     and find the word in it. */

  wc_query.word = word;

  p_node = tree_find (p_tree, &wc_query);

 

  if (p_node != NULL)

  {

    /* The word exists in the tree - increment its counter. */

    WordCount   *p_existing_wc;

 

    p_existing_wc = (WordCount *)p_node->p_object;

    p_existing_wc->count++;

  }

  else

  {

    /* Create a new WordCount object and insert it to the tree. */

    WordCount   *p_new_wc;

 

    p_new_wc = (WordCount *) malloc (sizeof(WordCount));

    p_new_wc->word = (char *) malloc (sizeof(char) * (strlen(word) + 1));

    strcpy (p_new_wc->word, word);

    p_new_wc->count = 1;

 

    tree_insert (p_tree, p_new_wc);

  }

 

  return;

}