/*
 * SearchSieve.java. Copyright (c) 2000, David Faden. All rights reserved (for now).
 *
 * This code is distributed without warranty of any kind.
 */

//SearchToHTML c1999 The Gilbert Post by David Faden
//The applet and code are distributed as linkware...
//If you use this applet or a variant on its code,
//include a link to The Gilbert Post, 
//http://www.geocities.com/Athens/Parthenon/1911
//The Gilbert Post and David Faden take no responsibility
//for anything bad that happens as a result of using this applet.
//Please send reports of problems to gilbertnews@hotmail.com, anyway, though.


//Partial modification record
// August 16, 2000: Corrected some inaccurate comments.
//
// August 17, 2000: Removed method isSpace(char). Added the 
// isWordChar(int) method to be used as negative replacement.
// isWordChar uses table look-up rather than a loop so it should be
// substantially faster. Added method setWordChar(char, boolean) 
// to match isWordChar(int).
// I consider this to be an interim version of isWordChar - the final
// one will use some sort of hash and correctly handle the full range of
// Unicode characters.
//
// August 17, 2000: changed the names of the methods numOfMatches() and keyLength()
// to getNumOfMatches() and getKeyLength() respectively. Documented the undocumented.

/**
 * The SearchSieve class sifts for a key in a sequence
 * of Java (Unicode) characters fed to the sieve with addChar.
 *
 * @author David Faden
 * @version 0.9a August 17, 2000
 */
public class SearchSieve {
  private int numOfMatches;//how many matches have been found?
  private int kposition;//current position in key
  private char[] key;//search word/phrase
  private boolean exact;//must the word be an exact match or
  //can it be part of a larger word
  
  private char prevc;//the previous char examined
  
  private char[] buffer;//store chars as one sequence is examined in case
  //the key starts at another spot within the chain
  private int bufferLength;
  private boolean buffering;
  private char[] temp;
  
  /**
   * Word character table used for fast look-up.
   */
  private static boolean[] wordCharTable;
  //XXX! This table excludes multi-byte characters. A search for a better
  //solution is under way. DF.
  static {
    wordCharTable = new boolean[256];
    for (int i = 0; i < 256; i++) {
        if (i >= 'A' && i <= 'Z' ||
            i >= 'a' && i <= 'z' ||
            i >= '0' && i <= '9' ||
            i >= 0xC0) //LATIN CAPITAL LETTER A WITH GRAVE and up
            wordCharTable[i] = true;
        else
            wordCharTable[i] = false;
            //I believe that the Java Lang Spec calls for
            //booleans to have an uninitialized value of false anyway but...
    }
  }
  
  
  public SearchSieve(char[] c, boolean bexact) {
    setKey(c, bexact);
  }
  
  public void setKey(char[] c, boolean bexact) {
    exact=bexact;
    key=new char[c.length];
    System.arraycopy(c,0,key,0,c.length);
    buffer=new char[key.length];
    temp=new char[buffer.length];//temp is created here to 
    //avoid the overhead of recreating it each time it is needed
    
    reset();
  }
    
    
  public void reset() {
    kposition=0;
    numOfMatches=0;
    prevc='\0'; //null according to the Unicode table, nul according to others
    bufferLength=0;//clear the buffer
    buffering=false;
  }
  
  public int getKeyLength() {
    return key.length;
  }
  
  /**
   * Returns true if the sequence of chars added to the SearchSieve,
   * including <code>c</code>, constitute a match. For exact searches,
   * the last char of a match must be a non-word char.
   */
  public final boolean addChar(char c) {
    //If there's a match, return true.
    boolean matchfound=false;//left unchanged if nothing found
    boolean eatthebuffer=false;
    if (exact) {
      if (kposition == key.length) {
        //found a match
        if (!isWordChar(c)) {
          numOfMatches++;
          matchfound=true;
        }
        kposition=0;
        eatthebuffer=true;
      }
      else if(kposition==0) {
        if (c==key[0] && !isWordChar(prevc)) 
            kposition++;
      }
      else if (c == key[kposition]) 
        kposition++;
      else {
        //found no match
        kposition=0;
        eatthebuffer=true;
      }
    }
    //Inexact match check
    else {
      if (kposition == 0) { 
        if (c == key[0]) 
            kposition++;
      }
      else if (c == key[kposition]) 
        kposition++;
      //Special case space characters - consider any two to match.
      //XXX! Character.isSpace(char) is deprecated in JDK 1.1 but our target
      //JDK is 1.0.
      else if (Character.isSpace(c) && Character.isSpace(key[kposition]))
        kposition++;
      else {
        //no match found
        kposition=0;
        eatthebuffer=true;
      }
      if (kposition == key.length) {
        numOfMatches++;
        kposition=0;
        eatthebuffer=true;
        matchfound=true;
      }
    }
    //a kposition of 1 indicates that one char match has been found
    if(kposition>1 && !buffering && c==key[0]) {
      if(!exact) {
        bufferLength=0;
        buffering=true;
      }//if exact, matches have to be surrounded by spaces
      else if(!isWordChar(prevc)) {
        bufferLength=0;
        buffering=true;
      }
    }
    if(buffering) {
      if(c==key[bufferLength]){
        buffer[bufferLength]=c;
        bufferLength++;
      }
      else {
        bufferLength=0;
        buffering=false;
      }
    }
    if(eatthebuffer && bufferLength>0) eatTheBuffer();//feed the buffer back into the
    //search system on failures and finds
    prevc=c;
    return matchfound;
  }
  
  //Feed the buffer through addChar(char) as if it were being externally entered.
  //No match can ever be found in the buffer alone, so there is
  //no need to check the return of addChar(char).
  //Huge problem with this method which will cause fatal error...
  //Buffer itself may contain a second ambiguous char...
  //However, if the buffer doesn't match up, it is overwritten
  //When the next call to eatTheBuffer is made
  //Idea, what if temp is made internal to eatTheBuffer?
  //This could lead to a major memory leak, though...
  //Fixed...breath easy now!
  private final void eatTheBuffer() {
    //reset search
    kposition=0;
    System.arraycopy(buffer,0,temp,0,bufferLength);
    int tempLength=bufferLength;//save the value of bufferLength
    bufferLength=0; //reset the buffer
    buffering=false;
    //Don't bother checking addChar(char) return value - It can never be true
    //in this case.
    for (int i = 0; i < tempLength; i++) 
        addChar(temp[i]);
  }
  
  public final int getNumOfMatches() {
    return numOfMatches;
  }
  
    /**
     * Returns true if SearchSieves consider <code>c</code> to
     * be part of a word. 
     * <p>
     * Currently, this method is limited to 
     * characters &lt; 256 and will return false for all
     * values out of range. This failing will only affect
     * "exact" searches, potentially returning false positives.
     *
     * @param c - The char to test.
     * @return True if <code>c</code> is a word char.
     */
    public static final boolean isWordChar(int c) {
        if (c < 0 || c > 255)
            return false;
        return wordCharTable[c];
    }
    
    /**
     * Set whether SearchSieves should consider <code>c</code>
     * to be part of a word.
     * <p>
     * Currently, this method is limited to char &lt; 256 and will
     * will throw an IllegalArgumentException for out of range char.
     *
     * @param c - The char to set.
     * @param isWord - A true value will cause SearchSieves to consider c part of a word.
     * @exception IllegalArgumentException - Thrown if c > 255.
     */
    public static final void setWordChar(char c, boolean isWord) {
        int i = c;
        //XXX! Remember to chop this section once the code is fully
        //internationalized.
        if (i == 0 || i > 255)
            throw new IllegalArgumentException("Out of range.");
        wordCharTable[i] = isWord;
    }
  
}
