/* fuzzymatch.c - used by spellchecker */

/* http://www.snippets.org/snippets/portable/APPROX+C.php3 */

/* 
NOTE:
	The code in this file (up to *** BLAH ***)
	was initially written by John Rex,
	and as far as I know it was put into the public domain, OR it is
	simply free in the sense of having no restrictions.
	(This code is on snippets.org, where only such free code exists,
	and on many other web sites as "free".)
	I have optimized it for Aine's needs.
	
	- David Calinski
*/


/***************************************************************
 *
 * Fuzzy string searching subroutines
 *
 * Author:    John Rex
 * Date:      August, 1988
 * References: (1) Computer Algorithms, by Sara Baase
 *                 Addison-Wesley, 1988, pp 242-4.
 *             (2) Hall PAV, Dowling GR: "Approximate string matching",
 *                 ACM Computing Surveys, 12:381-402, 1980.
 *
 * Verified on:
 *    Datalite, DeSmet, Ecosoft, Lattice, MetaWare, MSC, Turbo, Watcom
 *
 * Compile time preprocessor switches:
 *    DEBUG - if defined, include test driver
 *
 * Usage:
 *
 *    char *pattern, *text;  - search for pattern in text
 *    char *start;
 *    int howclose;
 *
 *    (In this adapted version the degree of allowed mismatch is the
 *    compile-time constant DEGREE instead of a parameter, and the
 *    `end' output of App_next() was removed.)
 *
 *    void App_init(pattern, text);     - setup routine
 *    void App_next(&start, &howclose); - find next match
 *
 *    - searching is done when App_next() returns start==NULL
 *
 **************************************************************/

#include "fuzzymatch.h"

/* local, static data */

/* Search strings handed to App_init(). */
static char *Text;       /* text being searched                   */
static char *Pattern;    /* pattern being looked for              */

static int Textloc;      /* current search position in Text       */
static int Plen;         /* length of Pattern                     */

#define DEGREE	 1       /* max degree of allowed mismatch        */

/*
 * Backing store for the four work arrays below. App_init() carves it into
 * four consecutive slices of Plen + 1 ints each, so a pattern of the
 * maximum token length (63) needs 4 * 64 = 256 ints: the whole array.
 */
static int mem[256];

static int *Ldiff;       /* difference array, left-hand column    */
static int *Rdiff;       /* difference array, right-hand column   */
static int *Loff;        /* match-start offsets, left-hand column */
static int *Roff;        /* match-start offsets, right-hand column */


/*
 * App_init - prepare a new approximate search for pattern inside text.
 *
 * Stores both string pointers (the strings are not copied), records the
 * pattern length in Plen, carves the four work arrays out of mem[] and
 * resets the search position so that the first App_next() call starts at
 * text[0].
 *
 * Preconditions (checked with assert):
 *   - pattern is not empty;
 *   - the four work arrays of Plen + 1 ints each fit into mem[].
 */
static void App_init (char *pattern, char *text)
{
	int i;

	/* save parameters */
	Text = text;
	Pattern = pattern;

	Plen = (int) strlen(pattern);
	assert (Plen > 0);
	/* Ldiff, Rdiff, Loff and Roff each take Plen + 1 slots of mem[];
	   a longer pattern would write past the end of the array */
	assert ((size_t) 4 * ((size_t) Plen + 1) <= sizeof mem / sizeof mem[0]);

	/* lay the four arrays out back to back inside mem[] */
	Ldiff = mem;
	Rdiff = Ldiff + Plen + 1;
	Loff  = Rdiff + Plen + 1;
	Roff  = Loff  + Plen + 1;

	/* initial values for the right-hand column; App_next() swaps it to
	   the left-hand side before computing its first column */
	for (i = 0; i <= Plen; i++) {
		Rdiff[i] = i;
		Roff[i] = 1;
	}

	Textloc = -1; /* current offset into Text; App_next() pre-increments */
}

/*
 * App_next - advance the search prepared by App_init() to the next match.
 *
 * Text is consumed one character at a time, continuing after the position
 * reached by the previous call. For each character a new right-hand column
 * of the difference table is computed from the previous (left-hand)
 * column, and the offset column is updated alongside it.
 *
 * When the bottom cell of the new column is <= DEGREE the function returns
 * with
 *   *start    = Text + Textloc + Roff[Plen]
 *   *howclose = Rdiff[Plen]
 * When the end of Text is reached it returns with *start == NULL and
 * leaves *howclose untouched.
 */
static INLINE void App_next (char **start, int *howclose) {
	int *swap;
	int row;

	*start = NULL;

	for (;;) {
		char cur;

		/* step to the next text character; stop at the terminator */
		Textloc++;
		cur = Text[Textloc];
		if (cur == '\0')
			return;

		/* the previous right-hand column becomes the left-hand one and
		   the old left-hand storage is reused for the new column */
		swap = Ldiff;
		Ldiff = Rdiff;
		Rdiff = swap;
		Rdiff[0] = 0;   /* top (boundary) row */

		/* the offset arrays are swapped the same way */
		swap = Loff;
		Loff = Roff;
		Roff = swap;
		Roff[1] = 0;

		/* walk down the pattern, filling Rdiff[1..Plen]; each cell is
		   the minimum over its three already-known neighbours */
		row = 0;
		do {
			int cell = Ldiff[row] + ((Pattern[row] != cur) ? 1 : 0);
			const int from_left = Ldiff[row + 1] + 1;
			const int from_above = Rdiff[row] + 1;

			if (from_left < cell)
				cell = from_left;
			if (from_above < cell)
				cell = from_above;

			row++;
			Rdiff[row] = cell;
		} while (row < Plen);

		/* update the offset array: its values are added to the current
		   location to determine the beginning of the mismatched
		   substring */
		for (row = 2; row <= Plen; row++) {
			const int here = Rdiff[row];
			int offset;

			if (Ldiff[row - 1] < here)
				offset = Loff[row - 1] - 1;
			else if (Rdiff[row - 1] < here)
				offset = Roff[row - 1];
			else if (Ldiff[row] < here)
				offset = Loff[row] - 1;
			else /* Ldiff[row - 1] == Rdiff[row] */
				offset = Loff[row - 1] - 1;

			Roff[row] = offset;
		}

		/* not close enough yet: try the next text character */
		if (Rdiff[Plen] > DEGREE)
			continue;

		/* approximate match found */
		*start = Text + Textloc + Roff[Plen];
		*howclose = Rdiff[Plen];
		return;
	}
}


/*** BLAH ***/


/* The code below is from WinAlice by Jacco Bikker
   (with my modifications though, mainly to make it work with a plain C compiler).
   Jacco once gave me permission to use his WinAlice code in my project.
- David
*/

/*
 * fuzzymatch - score how closely text resembles pattern (lower is better).
 *
 * Searches text for approximate occurrences of pattern (App_next() only
 * reports occurrences with at most DEGREE mismatches), keeps the smallest
 * mismatch count seen, and adds the absolute difference between the two
 * string lengths. If no occurrence is reported the mismatch part keeps its
 * initial value of 100.
 *
 * pattern must be non-empty (asserted in App_init()).
 * Returns 0 only for an exact occurrence of equal length.
 */
int INLINE fuzzymatch (char *pattern, char *text)
{
	int howclose = 100, best = 100;
	int text_len;
	char *begin;

#ifdef DEBUG
	printf ("fuzzymatch got, pattern=[%s], text=[%s]\n", pattern, text);
#endif
	/* Take the length as int up front: subtracting Plen from the size_t
	   returned by strlen() would be done in unsigned arithmetic and wrap
	   around whenever text is shorter than the pattern. */
	text_len = (int) strlen (text);

	App_init (pattern, text);
	do {
		if (howclose < best)
			best = howclose;
		App_next (&begin, &howclose);
	} while (begin != NULL);
#ifdef DEBUG
	if (best < 5)
		printf ("fuzzymatch with pattern=[%s], and text=[%s], returns = best(%d) + abs (strlen (text) (%d) - Plen(%d))\n",
				pattern, text, best, text_len, Plen);
#endif
	return best + abs (text_len - Plen);
}

/*
 * spellchecker - replace misspelled tokens in text with the closest
 * dictionary entry.
 *
 * For each of the `tokens` entries in token[] the file
 * data/brain/dictionary.txt is scanned from the start; every line (without
 * its trailing newline) is scored against the token with fuzzymatch() and
 * the lowest-scoring line is remembered. The token is replaced in text via
 * replace() only when the best score is 1 or 2: a score of 0 is a perfect
 * match and needs no correction, 3 or more is not corrected.
 *
 * Failure to allocate the file buffer or to open the dictionary is
 * reported through aine_error() and the function returns without
 * touching text.
 */
void spellchecker (char *text) /* call here *after* calling tokenizer() */
{
	FILE *dict;
	char *p, buffer[102], bestword[102];
	int best, score, i;
	char *fbuf = malloc (sizeof(char) * BIG_FILE_BUFFER);

	if (fbuf == NULL) {
		aine_error ("Not enough memory for file buffer in spellchecker()\n");
		return;
	}

	dict = fopen ("data/brain/dictionary.txt", "r");
#ifdef DEBUG
	printf ("spellchecker gots=[%s] (tokens=%d)\n", text, tokens);
#endif
	if (!dict) {
		aine_error ("Cannot open data/brain/dictionary.txt\n");
		free (fbuf);
		return;
	}

	/* large user-supplied stream buffer: the file is re-read per token */
	setvbuf (dict, fbuf, _IOFBF, BIG_FILE_BUFFER);

	for (i = 0; i < tokens; i++) {
		/* Skip empty tokens. The test must look at the first character:
		   comparing token[i] itself with '\0' compares the token's
		   address with a null pointer and never detects an empty string.
		   (fuzzymatch() requires a non-empty pattern.) */
		if (token[i][0] == '\0') continue;
		best = 100;
		rewind(dict);
		while (fgets (buffer, 100, dict )) {
			if ((p=strchr(buffer,'\n')))
				*p = '\0';
			score = fuzzymatch (token[i], buffer);
			if (score < best) {
				best = score;
				strcpy (bestword, buffer);
				if (score == 0) break; /* found a perfect match */
			}
		}
		/* bestword is only valid when some line scored below 100,
		   which the best < 3 test guarantees */
		if ((best < 3) && (best > 0)) {
			replace (text, token[i], bestword);
#ifdef DEBUG
			printf ("spellchecker corrects [%s] to [%s]\n", token[i], bestword);
#endif
		}
	}
	/* close the stream before releasing the buffer it is using */
	fclose (dict);
	free (fbuf);
	return;
}

/*
 * tokenizer - split text into words, stored in token[] with the count in
 * `tokens`.
 *
 * A word is a run of alphabetic characters; unless POLISH_LANG is defined
 * an apostrophe also counts as a word character, so "DON'T" is one token.
 * Digits are neither stored nor treated as separators. Any other
 * character ends the current word.
 *
 * Scanning stops at the end of text, when the current word reaches 63
 * characters, or when 63 words have been completed; a word still in
 * progress at that point is terminated and counted.
 */
void tokenizer (char *text) /* call here *after* uppercase(text) ! */
{
	register char *pos = text;
	int word = 0, wlen = 0;
	Bool got_a_word = FALSE;

	if (*pos == '\0') {
		tokens = 0;
		return;
	}
	do {
		/* <ctype.h> functions are only defined for EOF and values
		   representable as unsigned char; a plain char may be negative
		   for non-ASCII characters */
		const unsigned char ch = (unsigned char) *pos;
#ifdef POLISH_LANG
		const int word_char = isalpha(ch);
#else
		const int word_char = isalpha(ch) || ch == '\'';
#endif

		if (word_char) {
			token[word][wlen++] = *pos;
			got_a_word = TRUE;
		} else if (!isalnum(ch) && got_a_word) {
			/* separator after a word: close the word. This is an
			   else-branch so that a character just stored as part of
			   the word (the apostrophe) does not also end it. */
			token[word++][wlen] = '\0';
			wlen = 0;
			got_a_word = FALSE;
		}
		if ((wlen == 63) || (word == 63)) break;
		pos++;
	} while (*pos != '\0');

	/* close a word that was still open when scanning stopped */
	if (got_a_word)
		token[word++][wlen] = '\0';
	tokens = word;
}


/* Jacco's code for extracting he/she variables:
   
void analyzer()
{
        // For the moment: Try to fill 'he' and 'she' variables
        char* buffer = new char[1024];
        FILE* names = fopen( "literature/malename.txt", "r" );
        BOOL found = FALSE;
	int i;
        while ((!feof( names )) && (!found))
        {
                fgets( buffer, 1024, names );
                buffer[strlen( buffer ) - 1] = 0;
                for ( int i = 0; i < tokens; i++ ) if (!stricmp( buffer, token[i]))
                {
                        strcpy( var[4], buffer );
                        found = true;
                        break;
                }
        }
        fclose( names);
        names = fopen( "literature/femalename.txt", "r" );
        found = FALSE;
        while ((!feof( names )) && (!found)) {
                fgets( buffer, 1024, names );
                buffer[strlen( buffer ) - 1] = 0;
                for (i = 0; i < tokens; i++)  if (!str_case_cmp (buffer, token[i])) {
                        strcpy (var[5], buffer);
                        found = TRUE;
                        break;
                }
        }
        fclose ( names);
        delete [] buffer;
}


*/

