#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/* Open Source (c) 2000 Dr. Richard S. Wallace
 * Terms: GPL published by the Free Software Foundation
 * classify.c:
 * Usage: classify 
 * Version: 0.03-1
 * Last modified: July 06, 2003 by David Calinski
 * Compile with:
 * gcc classify.c -Wall -lm -o classify
 * Test with:
 * cat dialog.txt | classify
 */

// The classify program:
// 1. reads the AIML files (default root file = B.aiml)
// 2. classifies lines from the stdin

// The basic idea of this program is to convert the pattern
// set into a compact tree representation.  Then the classifier
// can match an input sentence in essentially constant time.
//  
// The easiest way to understand the tree is visually,
// using a simple example.  
// Suppose the pattern set consists of just these patterns:

// *
// WHAT * 
// WHAT * IS
// WHAT ARE *
// WHAT IS *

// Then the tree could be depicted as:

//
// ROOT->'*'
//     |
//     \->'W'->'H'->'A'->'T'->' '->'*'
//                            |     |           
//                            |     \->' '->'I'->'S'
//                            |
//                            |->'I'->'S'->'*'
//                            |
//                            \->'A'->'R'->'E'->'*'
//
//
// Notice that the tree compressed all the common
// prefix letters into single nodes.  The leaf nodes
// (not shown) contain the pattern strings (for now).

// The heart of this program is a tree that
// stores the patterns in a very compact form.
// The tree is made up of a collection of objects
// called nodes.  Each node represents either a
// pattern (at the terminal nodes) or
// an array of pointers to possible successor
// nodes from that node.
// Each node has at most 29 branches, for A-Z, *, _ and ' '.
// From the root node, an input may follow one of the 29
// branches to either eventually reach a terminal node
// or fail to match.
// The array has constant size 32 with indices
// arranged as follows:
// 0 SPACE 
// 1 * STAR
// 2-27 A-Z
// 28 _ UNDER
// 29-31 unused.

#define A_INDEX 2
#define Z_INDEX 27
#define STAR_INDEX 1
#define SPACE_INDEX 0
#define UNDER_INDEX 28

// struct Node is the tree node abstraction.
// Each node stands for one pattern character: a letter A-Z,
// a SPACE, or one of the wild-cards * and _.  The successor
// node is selected by the next character of the pattern, so a
// node has at most 29 live branches (A-Z, _, * and SPACE) in
// its 32-slot branch table.

struct Node {
	char *pattern;		// text stored at a terminal node; null if INTERNAL node
	char *fname;		// file the pattern came from; null if INTERNAL node
	int actcnt;		// activation count (bumped each time the node matches)
	struct Node *next[32];	// successors, indexed SPACE_INDEX .. UNDER_INDEX
};
typedef struct Node Node;

// (Should be able to use a union struct to reduce memory
// requirements at leaf nodes.)

// forward function declarations:
// (the parameterless functions are declared with (void) so that
// they are real prototypes and calls with arguments are rejected)
void classify(void); // classify the lines read from stdin
void read_aiml_file (char *filename); // load the patterns of an AIML file
int match (char *input, Node *node); // match input
void insert (char *pattern, Node *node, char *filename); // insert pattern
int starmatch (char *input, Node *node); // match input with * node
void traverse (Node *node, int depth, int command); // visit nodes
void analyze_tree(void); // statistics of node tree
void substitute (char *pattern, char *target, char *line); // replace pattern by target in line

// Size of every line buffer below and of the per-level
// statistics arrays:
#define MAXLENGTH 256
char star[MAXLENGTH];  // the value of "*" (written by starmatch, printed by classify)

char sentence[MAXLENGTH];  // buffers input lines and patterns; insert() stores a copy at terminal nodes
char aiml_file[MAXLENGTH]; // NOTE(review): not referenced by any code visible in this file -- confirm before removing
char matching_pattern[MAXLENGTH]; // the matching pattern
char matching_filename[MAXLENGTH]; // the matching pattern's filename

// Globals:
Node *root; // root node of tree
int patcnt=0; // number of patterns (counted by read_aiml_file, also incremented by traverse)
double logpatcnt; // log of above; NOTE(review): read by traverse() but no visible code assigns it
int maxcnt=0; // maximum activation count
int dupcnt=0; // number of duplicates found
int nodecnt=0; // number of nodes allocated
int branchsum=0; // total number of branches
int nodecntl[MAXLENGTH]; // number of nodes at level l
int branchsuml[MAXLENGTH]; // # branches at each level l

// We do the same NEWNODE steps over & over.
// NEWNODE allocates one zero-filled Node, stores it in the
// local variable `newnode` of the function that uses the macro,
// and counts it in nodecnt, so every node allocated is counted.
// The program exits with an error message if memory runs out.
// The do { } while (0) wrapper makes the macro a single
// statement, so it is safe in an unbraced if/else.

#define NEWNODE \
	do { \
		newnode = calloc(1, sizeof *newnode); \
		if (newnode == NULL) { \
			fprintf(stderr, "classify: out of memory\n"); \
			exit(EXIT_FAILURE); \
		} \
		nodecnt++; \
	} while (0)

// macro to check for alphabeticalness (upper-case A-Z only).
// The argument is parenthesized so that an expression argument
// is compared as a whole; note that it is evaluated twice.
#define ALPHA(c) ((c) >= 'A' && (c) <= 'Z')

// MAIN
// Creates the root node, grows the pattern tree from B.aiml,
// reports the node and pattern counts, then classifies the
// lines of the standard input.
// argc and argv are not used.  Always returns 0.
int main (int argc, char *argv[]) {
	Node *newnode;  // assigned by NEWNODE
	(void)argc;
	(void)argv;
	// sizeof yields a size_t, which needs %zu (not %d)
	printf ("Node size = %zu\n", sizeof(Node));
	NEWNODE; // create the root node
	root = newnode;
	read_aiml_file ("B.aiml");
	printf ("%d nodes created\n", nodecnt);
	printf ("%d patterns read\n", patcnt);
	classify();
	//traverse(root, 0, 2);
	return 0;
} // MAIN

// classify:
// Reads dialog lines from the standard input and classifies
// each one against the pattern tree.
//
// A line is classified only if it contains "t:" followed by at
// least one more character and does not contain "Robot:"; the
// text after the first "t:" (leading spaces skipped) is the
// input.  '+' and the digits are spelled out as words, runs of
// spaces are collapsed and trailing spaces are trimmed before
// the input is matched.
//
// For every classified line one line is written to stdout:
//   <matching pattern> : <input> (star=<star>) [<filename>]
// where the (star=...) part is left out when star is empty.
// Pattern and filename are empty when nothing matched.
// Characters beyond MAXLENGTH-1 on a line are dropped, and a
// last line that does not end in '\n' is not classified.

void classify(void) {
	// Replacements, applied in this order: two-digit numbers
	// come before single digits so that "21" is spelled
	// TWENTY ONE rather than TWO ONE; the double-space collapse
	// comes last to tidy up the padding of the inserted words.
	static char *const spelled[][2] = {
		{"+", " PLUS "},
		{"20", " TWENTY "},
		{"21", " TWENTY ONE "},
		{"22", " TWENTY TWO "},
		{"23", " TWENTY THREE "},
		{"24", " TWENTY FOUR "},
		{"25", " TWENTY FIVE "},
		{"26", " TWENTY SIX "},
		{"27", " TWENTY SEVEN "},
		{"28", " TWENTY EIGHT "},
		{"29", " TWENTY NINE "},
		{"10", " TEN "},
		{"11", " ELEVEN "},
		{"12", " TWELVE "},
		{"13", " THIRTEEN "},
		{"14", " FOURTEEN "},
		{"15", " FIFTEEN "},
		{"16", " SIXTEEN "},
		{"17", " SEVENTEEN "},
		{"18", " EIGHTEEN "},
		{"19", " NINETEEN "},
		{"0", " ZERO "},
		{"1", " ONE "},
		{"2", " TWO "},
		{"3", " THREE "},
		{"4", " FOUR "},
		{"5", " FIVE "},
		{"6", " SIX "},
		{"7", " SEVEN "},
		{"8", " EIGHT "},
		{"9", " NINE "},
		{"  ", " "}
	};
	const size_t nspelled = sizeof spelled / sizeof spelled[0];
	char buffer[MAXLENGTH];	// normalized copy of the input text
	int c;			// input char; an int so EOF is distinct from every char
	char *p = sentence;	// write position in sentence
	char *tag;		// position of "t:" in sentence
	char *text;		// input text following "t:"
	size_t len = 0;		// characters stored for the current line
	size_t blen;		// length of buffer while trimming
	size_t i;

	// PHASE II:
	// read lines from the standard input and classify.
	// write patterns and lines to the stdout.
	while ((c = getchar()) != EOF) {
		if (c != '\n') {
			// keep one byte free for the terminator
			if (len < MAXLENGTH - 1) {
				*p++ = (char)c; // advance pointer
				len++;
			}
			continue;
		}

		*p = 0;
		memset (matching_pattern, 0, MAXLENGTH);
		memset (matching_filename, 0, MAXLENGTH);
		memset (star, 0, MAXLENGTH);
		tag = strstr (sentence, "t:");
		if (strstr (sentence, "Robot:") == NULL && tag != NULL && tag[2] != 0) {
			text = tag + 2;
			while (*text == ' ')
				text++; // skip any leading ' '
			memset (buffer, 0, MAXLENGTH);
			strcpy (buffer, text);
			for (i = 0; i < nspelled; i++)
				substitute (spelled[i][0], spelled[i][1], buffer);
			// Trim trailing spaces.  The length test keeps an empty
			// buffer (input made of spaces only) from being indexed
			// at -1.
			// NOTE(review): the "c=' '" line looks like a leftover
			// trace; it is kept so that the output is unchanged.
			blen = strlen (buffer);
			while (blen > 0 && buffer[blen-1] == ' ') {
				printf("c='%c'\n",buffer[blen-1]);
				buffer[--blen] = 0;
			}
			// the result is reported through matching_pattern,
			// matching_filename and star
			(void)match(buffer, root);
			if (*star != 0)
				printf ("%s : %s (star=%s) [%s]\n", matching_pattern, buffer, star, matching_filename);
			else
				printf ("%s : %s [%s]\n", matching_pattern, buffer, matching_filename);
		}
		len = 0;
		memset (sentence, 0, MAXLENGTH);
		p = sentence; // reset pointer
	}
} // CLASSIFY

// traverse the tree depth-first
// calculate statistics & print patterns
// command = 1 : print all patterns
// command = 2 : print only activated patterns
// command = 3 : print only NON-activated patterns
// any other command (analyze_tree passes 0) : statistics only
//
// node  = root of the subtree to visit (must not be null)
// depth = level of node; the caller passes 0 for the root
// Side effects: increments patcnt for every terminal node,
// branchsum for every branch, and the per-level counters
// nodecntl[] and branchsuml[] (levels that do not fit in those
// arrays are visited but not counted).
// NOTE(review): command 2 reads logpatcnt, which no code
// visible in this file assigns -- confirm it is set before
// relying on the printed value.

void traverse (Node *node, int depth, int command) {
	int i; // branch index
	int counted = (depth >= 0 && depth < MAXLENGTH); // level fits the statistics arrays
	double entropy;

	if (counted)
		nodecntl[depth]++; // this node exists at this depth
	if (node->pattern) { // TERMINAL node
		patcnt++; // count all patterns
		switch (command) {
		case 1:
			printf ("%s\n", node->pattern);
			break;
		case 2:
			if (node->actcnt > 0) {
				entropy = logpatcnt - log((double)(node->actcnt));
				printf ("%2.2f %s %d \n",entropy, node->pattern,node->actcnt);
			}
			break;
		case 3:
			// the format has a single %s; the activation count is
			// known to be zero here and is not passed
			if (node->actcnt == 0)
				printf ("%s zero activation\n", node->pattern);
			break;
		default:
			break;
		}
	}
	for (i = 0; i < 32; i++) { // for all possible branches:
		if (node->next[i]) { // if the branch exists
			branchsum++;
			if (counted)
				branchsuml[depth]++;
			traverse (node->next[i], depth+1, command); // RECURSIVE case
		}
	} // for
} // traverse


// insert a new pattern into the tree:
//
// pattern  = the pattern text; one node is followed (or created)
//            per character, starting below `node`
// node     = node to start from (callers pass the root)
// filename = name of the file the pattern came from; a copy is
//            stored at the terminal node
//
// The text stored at the terminal node is a copy of the global
// `sentence` (the line currently being read), not of `pattern`.
// If the terminal node already holds a pattern, the new one is a
// duplicate: dupcnt is incremented and nothing else changes.
// Only A-Z, SPACE, * and _ have branches; at the first other
// character the pattern is abandoned (the nodes already created
// for its prefix stay in the tree, no terminal is stored).
// The program exits if memory runs out while storing a pattern.

void insert (char *pattern, Node *node, char *filename) {
	Node *newnode = 0;	// assigned by NEWNODE
	char c;			// current pattern character
	int index;		// branch index of that character

	for (; (c = *pattern) != 0; pattern++) {
		if (c == ' ')
			index = SPACE_INDEX;
		else if (c == '*')
			index = STAR_INDEX;
		else if (c == '_')
			index = UNDER_INDEX;
		else if (ALPHA(c))
			index = A_INDEX+(c-'A');
		else
			return; // nonalphabetical character: give up on this pattern

		if (node->next[index] == 0) {
			NEWNODE;
			node->next[index] = newnode;
		}
		node = node->next[index];
	}

	// End of the pattern: `node` is its terminal node.
	if (node->pattern) {
		// Duplicate.  Stop here; carrying on would add a bogus
		// SPACE branch and read past the end of the pattern.
		dupcnt++;
		return;
	}
	node->pattern = malloc(strlen(sentence)+1);
	node->fname = malloc(strlen(filename)+1);
	if (node->pattern == NULL || node->fname == NULL) {
		fprintf(stderr, "classify: out of memory\n");
		exit(EXIT_FAILURE);
	}
	strcpy (node->pattern, sentence);
	strcpy (node->fname, filename);
} // insert()


// The routines match() and starmatch() are mutually
// recursive functions to locate the matching pattern
// for a given input.

// match() examines the input character by character, following
// the branches of the tree much like insert() does.
//
// input = the rest of the input still to be matched
// node  = the node reached so far
// Returns 1 if a pattern matched, 0 otherwise.  On a match that
// ends at this node, the node's pattern and filename are copied
// to matching_pattern and matching_filename, its activation
// count is incremented and maxcnt is updated.
//
// Branches are tried in this order: SPACE, '_', the letter A-Z,
// '*'.  Input characters other than A-Z and SPACE (lower case,
// digits, punctuation) have no branch of their own and can only
// be absorbed by a wild-card.

int match (char *input, Node *node) {
	char c = *input; // the first char of input
	Node *nextnode; // successor node

	if (c == 0 && node->pattern) { // end of input at a TERMINAL node
		strcpy (matching_pattern, node->pattern);
		strcpy (matching_filename, node->fname);
		node->actcnt++;
		if (node->actcnt > maxcnt)
			maxcnt = node->actcnt;
		return 1; // finished with this input
	}

	nextnode = node->next[SPACE_INDEX]; // SPACE ' ' case
	if (nextnode && c == ' ' && match (input + 1, nextnode))
		return 1;

	nextnode = node->next[UNDER_INDEX]; // UNDER '_' case
	if (nextnode && starmatch (input, nextnode))
		return 1;

	// A-Z case.  The branch index is only valid for a letter, so
	// test the character before using it to index next[].
	if (ALPHA(c)) {
		nextnode = node->next[A_INDEX+(c-'A')];
		if (nextnode && match (input+1, nextnode))
			return 1;
	}

	nextnode = node->next[STAR_INDEX]; // STAR '*' case
	if (nextnode && starmatch(input, nextnode))
		return 1;

	return 0;
}

// starmatch
// match the sub-pattern starting with '_' or '*'
//
// input = the input from the point where the wild-card begins
// node  = the wild-card node itself
// Returns 1 on a match, 0 otherwise.
//
// If the wild-card is followed by a SPACE in the tree, the
// wild-card is given one word of the input, then two, and so on,
// and the rest of the pattern is tried on the remaining words;
// the first split that matches wins.  Only a remaining word that
// starts with A-Z is tried.  On success `star` receives the words
// given to this wild-card; when the rest of the pattern contains
// another wild-card, its value is overwritten by this one.
//
// Otherwise, if the wild-card node is a TERMINAL node, the
// wild-card takes all of the remaining input (which may be
// empty): the node's pattern and filename are copied to
// matching_pattern and matching_filename, its activation count
// is incremented, maxcnt is updated and `star` receives the
// remaining input.

int starmatch (char *input, Node *node) {
	Node *spacenode = node->next[SPACE_INDEX]; // node for any ' ' following '*'
	Node *nextnode;		// next node after ' '
	char *nextword;		// successive words of the input
	char c;			// first char of next word
	size_t starlen;		// length of the text given to the wild-card

	if (spacenode) {	// this * may be followed by a SPACE ' '
		nextword = input;	// begin with input but skip 1 word for '*'
		while (*nextword) {
			while (*nextword != ' ' && *nextword != 0)
				nextword++; // skip one word
			if (*nextword == ' ')
				nextword++;
			c = *nextword;		// first char of nextword
			if (!ALPHA(c))
				continue;
			nextnode = spacenode->next[A_INDEX + (c - 'A')];
			if (nextnode && match (nextword+1, nextnode)) {
				// nextword is preceded by the space that was just
				// skipped, so the wild-card text is everything from
				// input up to, not including, that space.
				starlen = (size_t)(nextword - input) - 1;
				if (starlen >= MAXLENGTH)
					starlen = MAXLENGTH - 1;
				memcpy (star, input, starlen);
				star[starlen] = 0;
				return 1;
			}
		}
	} // if SPACE follows * or _

	if (node->pattern && strlen (node->pattern) > 0) {
		node->actcnt++;
		if (node->actcnt > maxcnt)
			maxcnt = node->actcnt;
		strcpy (matching_pattern, node->pattern);
		strcpy (matching_filename, node->fname);
		strcpy (star, input);
		return 1;
	} // TERMINAL * or _
	return 0;  // NO star match
} // starmatch()

// tree analysis:
void analyze_tree() {
	int i; // array index
	double ratio = 0.0;  // tree stats
	double product = 1.0;
	double maxprod = 0.0;
	int maxproddepth = 0;
	patcnt = 0;
	traverse (root, 0, 0); // tour the tree
	printf ("%d patterns in tree\n", patcnt);
	
	// analyze tree:
	for (i = 0; i < MAXLENGTH-1; i++) {
			if (nodecntl[i] > 0) {
			ratio = (double)(branchsuml[i]) / ((double)nodecntl[i]);
			product *= ratio;
			//  printf("%d (%d/%d)=%f %f\n",i,branchsuml[i],nodecntl[i],ratio,product);
			if (product > maxprod) {
				maxprod = product;
				maxproddepth = i;
			}
		}
	}
	product = 1.0;
	for (i = 0; i < 32; i++) {
		ratio = (double)(branchsuml[i])/((double)nodecntl[i]);
		product *= ratio;
		printf("%d (%d/%d)=%f %f\n", i, branchsuml[i], nodecntl[i], ratio, product);
	}
 // end tree analysis
}

// read_aiml_file:
// Reads the AIML file `filename` line by line and grows the
// pattern tree.
// - A line containing <load filename="X"/> makes the function
//   read file X recursively.
// - A line containing <pattern> and no <that> has the text
//   between <pattern> and </pattern> inserted into the tree,
//   after every <name/> in it is replaced by ALICE; patcnt is
//   incremented for each such line.
// If the file cannot be opened, "no file <filename>" is printed
// to stdout and the function returns.
// Characters beyond MAXLENGTH-1 on a line are dropped, and a
// last line that does not end in '\n' is not processed.
// The global `sentence` holds the current line; insert() stores
// a copy of it at the terminal node.

void read_aiml_file (char *filename) {
	char buffer[MAXLENGTH];  // text extracted from the current line
	char subfile[MAXLENGTH]; // name of a file to load recursively
	int c; // input char; an int so EOF is distinct from every char
	char *p; // write position in sentence
	char *tag; // start of the text following a tag
	char *end; // closing delimiter of that text
	size_t len = 0; // characters stored for the current line
	FILE *fp; // pattern file

	//printf ("Reading %s\n",filename);
	fp = fopen (filename,"r");
	if (!fp) {
		printf ("no file %s\n", filename);
		return;
	}

	// When called recursively, sentence still holds the caller's
	// <load> line.  Clear it, or a short first line of this file
	// would leave that tag visible and the load would be repeated
	// without end.
	memset (sentence, 0, MAXLENGTH);
	memset (buffer, 0, MAXLENGTH);
	p = sentence; // set p to beginning of sentence

	while ((c = getc(fp)) != EOF) {
		if (c != '\n') {
			// keep one byte free for the terminator
			if (len < MAXLENGTH - 1) {
				*p++ = (char)c; // advance pointer
				len++;
			}
			continue;
		}

		*p = 0; // end of line
		if ((tag = strstr (sentence, "<load filename=")) != NULL) {
			tag += strlen ("<load filename=");
			if (*tag != 0)
				tag++; // skip the opening quote
			strcpy (buffer, tag);
			if ((end = strstr (buffer, "\"/>")) != NULL)
				*end = 0;
			strcpy (subfile, buffer);
			read_aiml_file (subfile);
		}
		else if ((tag = strstr (sentence, "<pattern>")) != NULL && strstr (sentence, "<that>") == NULL) {
			strcpy (buffer, tag + strlen ("<pattern>"));
			if ((end = strstr (buffer, "</pattern>")) != NULL)
				*end = '\0';
			// printf("%s\n",buffer);
			patcnt++;
			substitute("<name/>","ALICE", buffer);
			insert(buffer, root, filename); // grow the tree
		}
		len = 0; // set line length to zero
		p = sentence; // reset pointer
		memset (sentence, 0, MAXLENGTH);
		memset (buffer, 0, MAXLENGTH);
	} // while reading characters c
	fclose(fp); // done with AIML file
}

char sbuffer[MAXLENGTH];  // scratch copy of the line being edited

// substitute:
// Replaces, in place, the occurrences of `pattern` in `line` by
// `target`.  After each replacement the search starts again at
// the beginning of the line, so text produced by a replacement
// is searched too.  The function stops when `pattern` no longer
// occurs, or as soon as the length of `line` plus the length of
// `target` reaches MAXLENGTH.
void substitute (char *pattern, char *target, char *line) {
	char *hit;	// first occurrence of pattern in line
	size_t offset;	// index of that occurrence
	size_t patlen;	// length of pattern
	size_t tgtlen;	// length of target

	for (;;) {
		hit = strstr(line, pattern);
		if (hit == 0)
			break;
		tgtlen = strlen(target);
		if (strlen(line) + tgtlen >= MAXLENGTH)
			break;
		patlen = strlen(pattern);
		offset = (size_t)(hit - line);

		// keep the old text, then rebuild the tail of the line
		// behind the inserted target
		bzero(sbuffer, MAXLENGTH);
		strcpy(sbuffer, line);
		strcpy(hit, target);
		strcpy(hit + tgtlen, sbuffer + offset + patlen);
	}
} // substitute


// changelog:

// May 28, 2000: improved memory usage by allocating less space
// per pattern.
// May 29, 2000: fixed multi-word bug in starmatch with loop

/* by David Calinski: 
 * July 06, 2003: cleaning (I mean recoding it to my coding style :P)
 		- proper idents (tabs instead of spaces), added some comment,
		deleted some obvious ones,
		fixed info "howto compile" to use gcc with -lm and -Wall,
		removed some unused variables, fixes to gcc warnings (one still not fixed).
		(BTW: classify, by now, is by no means usable to Aine)
 */

