/* Aine knowledge files compiler

 License: GPL
 This is part of the Aine Project.
 For copyrights and license (GPL) - see main.c in main source dir
 
*/

#include <stdarg.h>
#include "aine_c.h"

/* Local boolean type. Any FALSE/TRUE macros defined earlier are dropped
   first so the enumerators below can take those names. */
#undef FALSE
#undef TRUE
typedef enum { FALSE, TRUE } bool;

/* Compiled output files (opened in main) and, last in the list, the brain
   source file currently being processed (opened in splitaiml). */
FILE *indexpos_f, *inputs_a_f, *inputs_a_t, *inputs_d_f, *inputs_d_t, *aine;

/* Stream positions captured in prepare(): entries 0..25 are positions in
   inputs_a_f for the letters A..Z, entries 26..51 the same for inputs_a_t.
   All 53 entries are written to indexpos_f. */
fpos_t chunk[53]; /* 26 * 2 + 1 */

char *topic;   /* Topic holds our topic which we convert to conditional tags */
short topic_count; /* incremented by replacer() for every "<topic=" tag seen */
/* List of brain files to compile, the topic table, the intermediate
   (unsorted) pattern files and the template output file; all opened in main. */
FILE *index_f, *topicfile, *unsorted_a_f, *unsorted_a_t, *unsorted_d_f,
*unsorted_d_t, *outputs_f;
bool comment = FALSE; /* TRUE while replacer() is inside an open "<!--" comment */
bool output_check; /* TRUE after a "+[" pattern until its "-[" template is read */
unsigned long size; /* number of categories the bot has */


/* Report a fatal compiler error and abort.
 *
 * The printf-style message is written to stderr followed by ".\n". Every
 * file this compiler creates under data/brain/ is then removed (after a
 * failure they are most probably broken) and the process exits with
 * status 1. This function never returns.
 */
void panic (char *s, ...) {
	/* Removal order is kept as a table so the list is easy to audit. */
	static const char *const created_files[] = {
		"data/brain/inputs_a_f",
		"data/brain/inputs_a_t",
		"data/brain/inputs_d_f",
		"data/brain/inputs_d_t",
		"data/brain/outputs",
		"data/brain/topics",
		"data/brain/indexpos",
		"data/brain/unsorted_a_t",
		"data/brain/unsorted_a_f",
		"data/brain/unsorted_d_t",
		"data/brain/unsorted_d_f"
	};
	size_t k;
	va_list args;

	va_start (args, s);
	vfprintf (stderr, s, args);
	va_end (args);
	fputs (".\n", stderr);

	for (k = 0; k < sizeof created_files / sizeof created_files[0]; k++)
		remove (created_files[k]);

	exit (1);
}


/* Length of the pattern part of a "topic}pattern<..." record.
 *
 * Counts the characters from the first '}' (inclusive) up to, but not
 * including, the next '<' or the end of the string.
 * Returns 0 when the string contains no '}'.
 */
short str_len_aine (char *s)
{
	const char *brace = strchr (s, '}');
	int span;

	if (brace == NULL)
		return 0;
	span = (int) strcspn (brace, "<");
	return span;
}


/* Number of characters preceding the first '<' in s (and without topic).
 * Returns 0 when s contains no '<'.
 */
short str_len_up_to_that (char *s)
{
	const char *lt = strchr (s, '<');

	if (lt == NULL)
		return 0;
	return (short) (lt - s);
}

/* Writes one "topic}pattern&...<$=pos" text record to out in binary form.
 *
 * Record layout, in this order:
 *   int   topic id            (the number in front of '}')
 *   int   payload length      (everything after '}', including the NUL)
 *   int   pattern length      (characters after '}' and before the first '&')
 *   char  payload[payload length]
 *
 * s is modified in place: a trailing newline is cut off and the '}' is
 * replaced by a NUL. As in the original scan, a '}' in the very first
 * column is not treated as the separator.
 *
 * A record that lacks the '}' separator or the '&' marker is malformed;
 * it is reported on stderr and skipped instead of being dereferenced.
 */
void binary_writer (FILE *out, char *s)
{
	char *newline, *brace, *payload, *amp;
	int topic_id, payload_len, pattern_len;

	if ((newline = strchr (s, '\n')))
		*newline = '\0';

	/* s + 1 is only a valid start when the string is non-empty */
	brace = (s[0] != '\0') ? strchr (s + 1, '}') : NULL;
	if (!brace) {
		fprintf (stderr, "binary_writer: skipping record without '}': %s\n", s);
		return;
	}
	*brace = '\0';
	topic_id = atoi (s);

	payload = brace + 1;
	amp = strchr (payload, '&');
	if (!amp) {
		fprintf (stderr, "binary_writer: skipping record without '&': %s\n", payload);
		return;
	}
	payload_len = (int) strlen (payload) + 1;	/* NUL is stored too */
	pattern_len = (int) (amp - payload);

	fwrite (&topic_id, sizeof (int), 1, out);
	fwrite (&payload_len, sizeof (int), 1, out);
	fwrite (&pattern_len, sizeof (int), 1, out);
	fwrite (payload, sizeof (char), (size_t) payload_len, out);
}


/* Appends four sentinel ("warden") records with topic id -1 to out.
 *
 * Each record uses the layout produced by binary_writer: topic id,
 * payload length (including the NUL), pattern length, payload bytes.
 *
 * The payload is now written in full. Previously the header announced
 * strlen("blabla<$=0") + 1 = 11 bytes but only the 6 pattern bytes were
 * emitted, so a record did not match its own header.
 * NOTE(review): the reader is not part of this file - confirm it consumes
 * "payload length" bytes per record, as binary_writer's records require.
 */
void wardens_writer (FILE *out) {
	static const char bla[] = "blabla<$=0";
	const int topic_id = -1;
	const int payload_len = (int) sizeof bla;	/* 10 characters + NUL */
	const int pattern_len = 6;			/* strlen ("blabla") */
	int j;

	for (j = 0; j < 4; j++) {
		fwrite (&topic_id, sizeof(int), 1, out);
		fwrite (&payload_len, sizeof(int), 1, out);
		fwrite (&pattern_len, sizeof(int), 1, out);
		fwrite (bla, sizeof(char), (size_t) payload_len, out);
	}
}


/* Normalises one line of brain source in place.
 *
 * - Blanks the line while inside a "<!-- ... -->" comment; the global
 *   `comment` flag carries the state across lines.
 * - Rewrites long or alternative tag spellings into the short forms
 *   listed below.
 * - On a "<topic=...>" tag: stores the upper-cased topic name in the global
 *   `topic`, increments `topic_count` and appends a (length, name, id)
 *   entry to `topicfile`. "</topic>" clears the current topic.
 * - Strips the topic tags, tabs, carriage returns and newlines.
 *
 * Lines shorter than two characters are left untouched.
 */
void replacer (char *line)
{
	char *pos;
	short i;

	if (line[0] == '\0' || line[1] == '\0')
		return;

	/* comment handling: a closing "-->" ends a comment opened earlier */
	if (comment && strstr (line, "-->")) {
		line[0] = '\0';
		comment = FALSE;
	}
	/* comment opened and closed on the same line */
	if (strstr (line, "<!--") && strstr (line, "-->"))
		line[0] = '\0';
	/* comment opened here and continuing on following lines */
	if (strstr (line, "<!--")) comment = TRUE;
	if (comment)
		line[0] = '\0';

	replace (line, "<R>", "<r><l>");
	replace (line, "</R>", "</l></r>");
	replace_all (line, "<*=0>", "<*>");
	replace_all (line, "<star/>", "<*>");
	replace_all (line, "{*}", "{<*>}");

	/* next 3 are rather only for my (Polish) bot...
	   I don't think you will need them */
	replace_all (line, "<plec>", "<c=gender><k=on>");
	replace_all (line, "<ona>", "</k><k>");
	replace_all (line, "</plec>", "</k></c>");

	if ((pos = strstr (line, "<topic="))) {
		topic_count++;
		if (topic_count > 999) {
			fputs ("ERROR - more than 1000 topics are not allowed", stderr);
			exit (1);
		}
		if (pos[7] == '\"') pos++;	/* skip an opening quote */
		strcpy (topic, pos + 7);
		/* Cut the name at the closing '>'. strcspn is used instead of
		   dereferencing strchr's result, which is NULL when the tag is
		   not closed on this line; in that case only the line ending
		   is removed. */
		topic[strcspn (topic, ">\r\n")] = '\0';
		replace (topic, "\"", "");
		uppercase (topic);
		i = strlen (topic) + 1;		/* stored including the NUL */
		fwrite (&i, sizeof(short), 1, topicfile);
		fwrite (topic, sizeof(char), i, topicfile);
		fwrite (&topic_count, sizeof(short), 1, topicfile);
	}
	if (strstr (line, "</topic>"))
		topic[0] = '\0';
	strremove (line, "<topic=", ">");
	replace (line, "</topic>", "");
	replace_all (line, "\t","");
	replace_all (line, "\r","");
	replace_all (line, "\n","");
	replace_all (line, "<input/>", "<input=1>");
	replace_all (line, "<input index=", "<input=");
	replace_all (line, "<person/>", "<person><*></person>");
	replace_all (line, "<person2/>", "<person2><*></person2>");
	replace_all (line, "<personf/>", "<personf><*></personf>");
	replace_all (line, "<sr/>", "{<*>}");
	replace_all (line, "</*>", "{<*>}");
}


/* Appends the look-ahead line `next` to the category being assembled in
 * `line`, normalises the result again and reads a fresh look-ahead line
 * from the current source file (`aine`).
 *
 * exhausted - non-zero when the previous read already hit end of file;
 *             needing yet another line then means the category is never
 *             closed, which is fatal.
 * Returns non-zero when no further look-ahead line could be read.
 */
static int join_lookahead (char *line, char *next, int exhausted, int linenr)
{
	if (exhausted)
		panic ("Unterminated category before line %d (unexpected end of file)", linenr);
	if (strlen (line) + strlen (next) >= (size_t) MAX_LINE_SIZE)
		panic ("Category before line %d is too long", linenr);
	strcat (line, next);
	replacer (line);
	return fgets (next, MAX_LINE_SIZE, aine) == NULL;
}

/* Reads one brain source file and splits it into the intermediate files.
 *
 * "+[ ... ]" patterns go to one of the four unsorted_* files, depending on
 * whether they carry a "<t>...</t>" (that) part and whether they contain a
 * wildcard ('*' or '_'). The following "-[ ... ]" template is appended to
 * outputs_f and its offset in that file is added to the pattern's record
 * as "<$=offset".
 *
 * Before parsing, the file is padded so that it ends in three newlines
 * (the parser works with one line of look-ahead), syntax-checked with
 * check_ainel() and fed to dictionary().
 *
 * Returns 1 both on success and when src cannot be opened (the file is
 * then skipped with a message). Malformed input is fatal (panic).
 */
int splitaiml (char *src)
{		/* prepares the temp file */
	unsigned long size_this = 0;
	int patterns = 0, templates = 0;
	char *line, *next;
	char *wild, *that;
	int wildcard_in_pattern;
	long tpos;
	int linenr = -1;
	int exhausted = 0;
	bool atomic = FALSE, that_used = FALSE;
	unsigned short int i, j;
	FILE *dest;

	aine = fopen (src, "r+");  /* into the template and pattern files. */
	if (!aine) {
		printf ("%s not found, skipping!\n", src);
		return (1);
	}

	/* here check last two lines if they are blank lines */
	fseek (aine, -3, SEEK_END);
	for (j=0, i=0; i < 3; i++) {
		if (getc(aine) != '\n')
			j++;
	}
	/* we put the lines if the user forget */
	if (j != 0) {
		/* an update stream needs a positioning call between reading
		   and writing */
		fseek (aine, 0, SEEK_END);
		for (i = 0; i < j; i++)
			fputc ('\n', aine);
		fflush (aine);
	}
	/* check syntax */
	printf ("Checking %s ...", src);
	fflush (stdout);
	check_ainel (aine);

	/* fill dictionary */
	printf ("Added %d words to dict.\n", dictionary (aine));

	printf ("Adding %s ...", src);

	line = (char*) malloc (sizeof(char)*MAX_LINE_SIZE);
	next = (char*) malloc (sizeof(char)*MAX_LINE_SIZE);
	if (!line || !next)
		panic ("Out of memory while reading %s", src);
	line[0] = '\0';
	next[0] = '\0';
	fgets (line, MAX_LINE_SIZE, aine);

	while (fgets (next, MAX_LINE_SIZE, aine)) {
		/* we now make the temp.txt into unsorted.txt
		 * and templates.txt also we exchange some tags for shorthand tags. */
		linenr++;
		replacer (line);

		/*  PART WITH   T H A T  */
		if (strstr (line, "+[") && ( strstr (line, "<t>") || strstr (next, "<t>"))) {
			size++;
			size_this++;
			if (output_check)
				panic ("Error before line %d\n", linenr);
			output_check = TRUE;

			that_used = TRUE;
			while (!strstr (line, "</t>")) {
				exhausted = join_lookahead (line, next, exhausted, linenr);
				linenr++;
			}

			patterns++;
			/* what we need to print out.. is line till that. */
			*strstr (line,"</t>") = '\0';
			replace (line, "+[","");

			replace (line, "?]", "&?]");
			replace (line, "!]", "&!]");
			replace (line, ".]", "&.]");

			if (strchr (line, '&'))
				replace (line, "]", "");
			else
				replace (line, "]", "&");

			/* The category is a "default" one when its first wildcard
			   ('*' preferred over '_') sits in the pattern, i.e. before
			   the "<t>" part. Decide before "<t>" is rewritten. */
			wild = strchr (line, '*');
			if (!wild)
				wild = strchr (line, '_');
			that = strstr (line, "<t>");
			wildcard_in_pattern = (wild && that && wild < that);

			replace (line, "<t>", "<"); /* added 13/03/2003 */

			if (wildcard_in_pattern) {
				fprintf (unsorted_d_t, "%s", line);
				atomic = FALSE;
			}
			else {
				fprintf (unsorted_a_t, "%s", line);
				atomic = TRUE;
			}
		}
		/* PART *without* THAT */
		else
		if (strstr (line, "+[")) {
			size++;
			size_this++;
			if (output_check)
				panic ("Error before line %d\n", linenr);
			output_check = TRUE;

			that_used = FALSE;
			while (!strchr (line, ']')) {
				exhausted = join_lookahead (line, next, exhausted, linenr);
				linenr++;
			}
			patterns++;
			*(strchr (line, ']') + 1) = '\0';
			replace (line, "+[", "");

			replace (line, ".", "&.");
			replace (line, "?", "&?");
			replace (line, "!", "&!");

			if (replace (line, "]", "")  &&  !strchr (line, '&') )
				strcat (line, "&");

			if (strchr (line, '*') || strchr (line, '_')) {
				/* default patterns carry their topic id; 0 = no topic */
				fprintf (unsorted_d_f, "%d}%s",
					 topic[0] == 0 ? 0 : topic_count, line);
				atomic = FALSE;
			}
			else {
				fprintf (unsorted_a_f, "%s", line);
				atomic = TRUE;
			}
		}
		/* TEMPLATE belonging to the pattern read just before */
		else
		if (strstr (line, "-[")) {
			if (!output_check)
				panic ("Error before line %d\n", linenr);
			output_check = FALSE;
			while (!strchr (line, ']')) {
				exhausted = join_lookahead (line, next, exhausted, linenr);
				linenr++;
			}

			/* offset of this template inside outputs_f; ftell yields a
			   plain number on every platform, unlike fpos_t */
			tpos = ftell (outputs_f);
			if (tpos < 0)
				panic ("Cannot determine position in data/brain/outputs");
			templates++;
			if (templates != patterns)
				panic ("ERROR - Pattern/Template count mismatch");
			*strchr (line, ']') = '\0';
			replace (line, "-[", "");
			fprintf (outputs_f, "%s\n", line);

			/* finish the record in the file the pattern was written to */
			if (atomic)
				dest = that_used ? unsorted_a_t : unsorted_a_f;
			else
				dest = that_used ? unsorted_d_t : unsorted_d_f;
			fprintf (dest, "<$=%ld\n", tpos);
		}
		strcpy (line, next);
	}
	printf ("OK, added %lu patterns\n", size_this);
	free (line);
	free (next);
	fclose (aine);
	return (1);
}


/* Closes a finished intermediate file and reopens it for reading.
 * Terminates the program with status 1 when the file cannot be reopened.
 */
static FILE *reopen_for_reading (FILE *old, const char *path)
{
	FILE *f;

	fclose (old);
	f = fopen (path, "r");
	if (!f) {
		fprintf (stderr, "Cannot reopen %s\n", path);
		exit (1);
	}
	return f;
}

/* Copies the atomic patterns from `unsorted` to `sorted`, grouped by first
 * letter A..Z. Within a letter, patterns containing '.', '!' or '?' come
 * first, the others second.
 *
 * letter_pos      - receives, per letter, the position in `sorted` where
 *                   that letter's group starts (26 entries).
 * leftovers       - gets every line that does not start with A..Z,
 *                   prefixed with leftover_prefix.
 * buffer          - scratch space of MAX_LINE_SIZE bytes.
 */
static void sort_atomic_patterns (FILE *unsorted, FILE *sorted, FILE *leftovers,
				  const char *leftover_prefix, fpos_t *letter_pos,
				  char *buffer)
{
	short letter, pass;
	int punctuated;

	for (letter = 'A'; letter <= 'Z'; letter++) {
		putchar (letter);
		fgetpos (sorted, &letter_pos[letter - 'A']);
		/* pass 0 emits only patterns with a ? or ! or . so these are
		   always first; pass 1 emits the rest */
		for (pass = 0; pass < 2; pass++) {
			rewind (unsorted);
			while (fgets (buffer, MAX_LINE_SIZE, unsorted)) {
				if (buffer[0] == letter) {
					punctuated = strchr (buffer, '.') || strchr (buffer, '!')
						  || strchr (buffer, '?');
					if (pass == 0 ? punctuated : !punctuated)
						fputs (buffer, sorted);
					continue;
				}
				/* hand non-alphabetic patterns over exactly once */
				if (pass == 1 && letter == 'Z'
				    && (buffer[0] < 'A'  ||  buffer[0] > 'Z'))
					fprintf (leftovers, "%s%s", leftover_prefix, buffer);
			}
		}
	}
}

/* Compiles every brain file listed in index_f and sorts the result.
 *
 * 1. Runs splitaiml() on each file name read from index_f.
 * 2. Sorts the atomic patterns (without / with "that") by first letter into
 *    inputs_a_f / inputs_a_t, terminates each with two "eof" records and
 *    writes the per-letter start positions (chunk[]) to indexpos_f.
 * 3. Writes the default (wildcard) patterns without "that" to inputs_d_f in
 *    binary form, longest patterns first, topic 0 before the other topics
 *    within each length.
 * 4. Writes the default patterns with "that" to inputs_d_t, longest first.
 *
 * Returns 1 on success, 0 when splitaiml() reports failure. Running out of
 * memory or failing to reopen an intermediate file terminates the program.
 */
int prepare (void)
{
	enum { LONGEST = 80 };	/* patterns of this length or more sort first */
	short i;
	short len;
	int star, underscore;
	char aline[256];
	char *buffer = (char*) malloc (sizeof (char) * MAX_LINE_SIZE);

	topic = (char*) malloc (sizeof(char)*MAX_VARVAL_SIZE);
	if (!buffer || !topic) {
		fputs ("Out of memory\n", stderr);
		exit (1);
	}

	output_check = FALSE;

	puts ("\nLoading list files to load: brain_index.txt\nLoading AINE brain files:");

	while (fgets (aline, 256, index_f)) {
		aline[strcspn (aline, "\n")] = 0;
		if (!splitaiml (aline)) {
			/* do not leak the work buffers on the error path */
			free (buffer);
			free (topic);
			topic = NULL;
			return (0);
		}
		memset (aline, 0, 256);
	}

	/*   S O R T I N G    */

	/* SORTING "ATOMIC *without* THAT" */
	puts ("\nSorting patterns. Please wait.");
	puts ("Sorting unsorted_a_f");
	unsorted_a_f = reopen_for_reading (unsorted_a_f, "data/brain/unsorted_a_f");
	/* non-alphabetic patterns join the default patterns under topic 0 */
	sort_atomic_patterns (unsorted_a_f, inputs_a_f, unsorted_d_f, "0}",
			      &chunk[0], buffer);

	fputs ("eof&<$=0\n", inputs_a_f);
	/* we need second 'eof' as well (loop unroll in matchers.c) */
	fputs ("eof2&<$=0", inputs_a_f);

	putchar ('\n');

	/* SORTING "ATOMIC WITH THAT" */
	unsorted_a_t = reopen_for_reading (unsorted_a_t, "data/brain/unsorted_a_t");
	puts ("Sorting unsorted_a_t.");
	sort_atomic_patterns (unsorted_a_t, inputs_a_t, unsorted_d_t, "",
			      &chunk[26], buffer);

	fputs ("eof&<Nothing<$=0\n", inputs_a_t);
	fputs ("eof2&<Nothing<$=0", inputs_a_t); /* we need two eof's (loop unroll) */
	putchar ('\n');

	/* 26*2 letters of alphabet + 1; written as whole fpos_t objects, so no
	   access to implementation-private members is needed */
	fwrite (chunk, sizeof(fpos_t), 53, indexpos_f);

	/* SORTING "DEFAULT *without* THAT" (but with topic!) */
	puts ("Sorting unsorted_d_f. This may take a while...");
	unsorted_d_f = reopen_for_reading (unsorted_d_f, "data/brain/unsorted_d_f");
	for (i = LONGEST; i > 0; i--) {
		/* first put topic0: */
		rewind (unsorted_d_f);
		while (fgets (buffer, MAX_LINE_SIZE, unsorted_d_f)) {
			if (buffer[0] != '0')
				continue;
			len = str_len_aine (buffer);
			star = strchr (buffer, '*') != NULL;
			underscore = strchr (buffer, '_') != NULL;
			if (i == LONGEST && (len >= LONGEST || underscore
			    || (!star && !underscore))) {
				replace_all (buffer, "_", "*");
				binary_writer (inputs_d_f, buffer);
			}
			else if (len == i && star && !underscore)
				binary_writer (inputs_d_f, buffer);
		}
		/* then rest of topics: */
		rewind (unsorted_d_f);
		while (fgets (buffer, MAX_LINE_SIZE, unsorted_d_f)) {
			if (buffer[0] == '0')
				continue;
			len = str_len_aine (buffer);
			underscore = strchr (buffer, '_') != NULL;
			if (i == LONGEST && (len >= LONGEST || underscore)) {
				replace_all (buffer, "_", "*");
				binary_writer (inputs_d_f, buffer);
			}
			else if (len == i && !underscore)
				binary_writer (inputs_d_f, buffer);
		}
	}
	putchar ('\n');

	/* SORTING "DEFAULT WITH THAT" */
	unsorted_d_t = reopen_for_reading (unsorted_d_t, "data/brain/unsorted_d_t");
	puts ("Sorting unsorted_d_t");
	for (i = LONGEST; i > 0; i--) {
		rewind (unsorted_d_t);
		while (fgets (buffer, MAX_LINE_SIZE, unsorted_d_t)) {
			len = str_len_up_to_that (buffer);
			star = strchr (buffer, '*') != NULL;
			if (i == LONGEST && (len >= LONGEST || !star))
				fputs (buffer, inputs_d_t);
			else if (len == i && star)
				fputs (buffer, inputs_d_t);
		}
	}
	free (buffer);
	free (topic);
	return (1);
}


/* Opens one of the compiler's working files, or terminates with status 2.
 * The message says "create" for write modes and "open" otherwise.
 */
static FILE *open_brain_file (const char *path, const char *mode)
{
	FILE *f = fopen (path, mode);

	if (!f) {
		fprintf (stderr, "Cannot %s %s",
			 mode[0] == 'w' ? "create" : "open", path);
		exit (2);
	}
	return f;
}

/* Entry point of the brain compiler.
 *
 * Opens brain_index.txt and all output/intermediate files under
 * data/brain/, runs prepare() to compile and sort the patterns, appends
 * the sentinel records to inputs_d_f, runs the duplicate check on the four
 * compiled pattern files, removes the intermediate unsorted_* files and
 * saves the dictionary.
 *
 * Every fopen() result is checked, so a missing or read-only data/brain
 * directory yields an error message instead of a NULL stream being used.
 */
int main (void)
{
#ifdef HAVE_LOCALE_H
	setlocale (LC_ALL, "");
#endif
	index_f    = fopen ("brain_index.txt", "r");
	if (!index_f) {
		fputs ("brain_index.txt not found!\n", stderr);
		exit (1);
	}
	puts ("I must have permission to write to /data, where I will drop compiled files for Aine");

	inputs_a_f = open_brain_file ("data/brain/inputs_a_f", "w");
	inputs_a_t = open_brain_file ("data/brain/inputs_a_t", "w");
	inputs_d_f = open_brain_file ("data/brain/inputs_d_f", "wb");
	inputs_d_t = open_brain_file ("data/brain/inputs_d_t", "w");

	setvbuf (inputs_a_t , NULL , _IOFBF , 16384 );
	setvbuf (inputs_a_f , NULL , _IOFBF , 16384);
	setvbuf (inputs_d_t , NULL , _IOFBF , 16384 );
	setvbuf (inputs_d_f , NULL , _IOFBF , 16384 );

	outputs_f  = open_brain_file ("data/brain/outputs",    "w");
	indexpos_f = open_brain_file ("data/brain/indexpos",   "wb");
	topicfile  = open_brain_file ("data/brain/topics",     "wb");

	unsorted_a_t = open_brain_file ("data/brain/unsorted_a_t",  "w");
	unsorted_a_f = open_brain_file ("data/brain/unsorted_a_f",  "w");
	unsorted_d_t = open_brain_file ("data/brain/unsorted_d_t",  "w");
	unsorted_d_f = open_brain_file ("data/brain/unsorted_d_f",  "w");

	size = 0;
	if (prepare()) puts ("OK");
	else puts ("Error");

	printf ("Total patterns = %lu\n", size);

	fclose (inputs_a_f);
	fclose (inputs_a_t);
	/* sentinel records go after the last real default pattern */
	wardens_writer (inputs_d_f);
	fclose (inputs_d_f);
	fclose (inputs_d_t);

	/* reopen the compiled files for the duplicate check */
	inputs_a_f = open_brain_file ("data/brain/inputs_a_f", "r");
	inputs_a_t = open_brain_file ("data/brain/inputs_a_t", "r");
	inputs_d_f = open_brain_file ("data/brain/inputs_d_f", "rb");
	inputs_d_t = open_brain_file ("data/brain/inputs_d_t", "r");

	puts ("Checking for duplicate patterns...");
	puts ("NOTE: you may have some \"false\" alerts about");
	puts ("\tduplicate patterns that, in fact, has different topics.");
	puts ("\tWhy? Aine cares for topic only in categories without \"that\"");
	puts ("\tbut with a wildcard. (It's most sensible approach, making Aine pretty fast,");
	puts ("\tand making \"topic\" feature works, at the same time,");
	puts ("\ton categories where \"topic\" feature is most useful).");

	/* the second argument is 1 only for the binary file */
	duplicate_checker (inputs_a_f, 0);
	duplicate_checker (inputs_a_t, 0);
	duplicate_checker (inputs_d_f, 1);
	duplicate_checker (inputs_d_t, 0);

	puts ("OK");
	fclose (index_f);

	fclose (inputs_a_f);
	fclose (inputs_a_t);
	fclose (inputs_d_f);
	fclose (inputs_d_t);

	fclose (outputs_f);
	fclose (indexpos_f);
	fclose (topicfile);

	fclose (unsorted_a_t);
	fclose (unsorted_a_f);
	fclose (unsorted_d_f);
	fclose (unsorted_d_t);

	/* the intermediate files are no longer needed */
	remove ("data/brain/unsorted_a_t");
	remove ("data/brain/unsorted_a_f");
	remove ("data/brain/unsorted_d_t");
	remove ("data/brain/unsorted_d_f");
	save_dictionary();

	return 0;
}

