<!-- #include file="CURLExtractor.asp" -->

<%

'Spiders a web site and stores the title and searchable (tag-free) text of
'every page in the SQL Server table [Yider].
'
'Typical use: call Constructor with a connection string, call
'StoreTextThroughoutDomain once per request until spidering is complete,
'then call Destructor.
'
'The class relies on names that are not defined in this file and are
'presumably supplied by CURLExtractor.asp or another include (TODO confirm):
'CURLExtractor, CreateRecordset, InArrayStr, NewLine,
'ResponseWriteCommaDelimitedArray and the FULL_TEXT flag.
class CHTMLTextExtractor

  'm_count_invalid_text - number of pages skipped in this pass because they contained invalid text
  'm_count              - number of URLs processed in this pass
  'm_database           - the open ADODB.Connection
  'm_heading_required   - true until PrintResults has written its heading
  'm_max_urls           - maximum number of URLs to process in one pass
  'm_proceed            - set to false when a page contains one of the m_too_busy_text phrases
  Private m_count_invalid_text, m_count, m_database, m_heading_required, m_max_urls, m_proceed

  'm_too_busy_text - optional array of phrases; spidering is abandoned when a page contains one
  'm_wait          - seconds the browser waits before resubmitting the form for the next pass
  Public m_too_busy_text, m_wait
  
  'true if you want existing URL's to be updated
  Public m_update
  

  'remove all data (does nothing when the [Yider] table does not exist)
  public sub Clear
  
    if DatabaseTableExists then
      m_database.Execute "begin tran truncate table [Yider] commit tran"
    end if
    
  end sub

  
  'initialises the counters and flags and opens the database connection
  'sql_server_connection is the connection string handed to ADODB.Connection.Open
  public sub Constructor(sql_server_connection)
  
    m_update = true
    m_proceed = true
    m_count_invalid_text = 0
    m_heading_required = true
    m_count = 0
    m_max_urls = 0
    m_wait = 30
        
    set m_database = Server.CreateObject("ADODB.Connection")
    m_database.Open sql_server_connection
    
  end sub


  'closes and releases the database connection
  'safe to call when Constructor was never called or when Destructor has already run
  public sub Destructor

    if IsObject(m_database) then
      if not m_database is Nothing then

        '0 is adStateClosed; closing an already closed connection raises an error
        if m_database.State <> 0 then
          m_database.Close
        end if

        set m_database = Nothing
      end if
    end if

  end sub
  
  
  'returns true if [sysobjects] contains an object named 'Yider'
  private function DatabaseTableExists
    Dim exists, recordset
    
    exists = false
    set recordset = m_database.Execute("select [id] from [sysobjects] where [name]='Yider'")
      
    if not recordset.eof then
      exists = true
    end if
      
    recordset.Close
    set recordset = Nothing
    
    DatabaseTableExists = exists
    
  end function
  
  
  'extracts the title and searchable text from the raw html in 'text' and stores
  'them against 'url' in [Yider]
  'url          - the URL the html was fetched from; it must already have a row in [Yider]
  'text         - the raw html; it is overwritten with the searchable text
  'invalid_text - optional array of phrases; a page containing one is marked as parsed
  '               but its title and text are not stored
  'sets m_proceed to false (and stores nothing) when the text contains one of
  'the m_too_busy_text phrases
  private sub ExtractSearchableText(url, text, invalid_text)

    'size of the [title] column created by MakeYiderTable
    Const TITLE_COLUMN_SIZE = 900

    Dim count, ret, safe_url, store_text, title
    
    ret = GetTitleAndText(text)
    title = Left(ret(0), TITLE_COLUMN_SIZE) 'a longer title would not fit in the column
    text = ret(1)

    'double the apostrophes so the URL cannot terminate the SQL string literal
    safe_url = replace(url, "'", "''")
    
    if IsArray(m_too_busy_text) then
    
      if InArrayStr(text, m_too_busy_text) then
        m_proceed = false
      end if
      
    end if
          
    
    if m_proceed then

      'the text is stored unless an invalid_text array was supplied and matches
      store_text = true

      if IsArray(invalid_text) then

        if InArrayStr(text, invalid_text) then
          store_text = false
        end if

      end if

      if store_text then

        m_database.Execute "update [Yider] set [title]='" & replace(title, "'", "''") & "', [text]='" & replace(text, "'", "''") & "', [parsed]=1 where [url]='" & safe_url & "'"
        PrintResults "<br>" & url & " was parsed"

      else

        'mark the URL as parsed so that it is not fetched again
        m_database.Execute "update [Yider] set [parsed]=1 where [url]='" & safe_url & "'"

        PrintResults "<br>*Invalid text: " & url & " was not added to the database because it contains invalid text"
        m_count_invalid_text = m_count_invalid_text + 1

      end if
      
    else
      
      PrintResults "<br><br>Spidering was abandoned because " & url & " contained one of the following words/phrases:"

      if IsArray(m_too_busy_text) then

        for count = 0 to UBound(m_too_busy_text)
          response.write " '" & m_too_busy_text(count) & "'"
          
          if count <  UBound(m_too_busy_text) then
            response.write ","
          end if
        next

      end if
        
    end if
      
  end sub


  'not a very good way to determine whether a database has full text enabled,
  'but I'm not sure where this information is stored
  'returns true if [sysfulltextcatalogs] contains at least one row
  private function FullTextEabled
    Dim enabled, recordset
    
    enabled = false
    
    CreateRecordset recordset
    
    recordset.Open "select ftcatid from sysfulltextcatalogs", m_database
    
    if not recordset.eof then
      enabled = true
    end if
    
    recordset.Close
    set recordset = Nothing
    
    FullTextEabled = enabled
  
  end function
  
  
  'returns the text between the first <tag> and the first </tag> that follows it
  'the tag name is matched without regard to case; an opening tag that carries
  'attributes is not recognised
  'returns "" if either tag is missing
  private function GetTagValue(html, tag)
    Dim close_tag, html_copy, open_tag, pos_end, pos_start, tag_value, value_start
    
    tag_value = ""
    html_copy = lcase(html)
    open_tag = "<" & lcase(tag) & ">"
    close_tag = "</" & lcase(tag) & ">"
    
    pos_start = InStr(html_copy, open_tag)
    
    if pos_start <> 0 then
      value_start = pos_start + Len(open_tag)

      'search after the opening tag only; a stray closing tag earlier in the
      'document would otherwise produce a negative length
      pos_end = InStr(value_start, html_copy, close_tag)
      
      if pos_end <> 0 then
        tag_value = Mid(html, value_start, pos_end - value_start)
      end if
    end if
    
    GetTagValue = tag_value
    
  end function

  
  'html is the raw html from an url; it is overwritten with the searchable text
  'returns an array
  '(0) is the value of the title tag
  '(1) is the searchable text within html
  public function GetTitleAndText(html)
  
    Dim title, position
    
    title = GetTagValue(html, "title")
    
    'get rid of all line ends, including the bare CR or LF line ends that some servers send
    html = replace(html, vbCrLf, " ")
    html = replace(html, vbCr, " ")
    html = replace(html, vbLf, " ")

    'change every form of <br> to a carriage return so that it doesn't get stripped
    html = replace(html, "<br>", vbCrLf, 1, -1, vbTextCompare)
    html = replace(html, "<br/>", vbCrLf, 1, -1, vbTextCompare)
    html = replace(html, "<br />", vbCrLf, 1, -1, vbTextCompare)

    'remove all content between <script></script> tags, whatever their case
    html = ReplaceTag(html, "script", "")
    
    'remove comments
    position = 1
    while position <> 0 
      position = StripBetween(position, "<!--", "-->", false, html)
    wend

    'remove drop down lists together with their options
    position = 1
    while position <> 0
      position = StripBetween(position, "<select", "</select>", true, html)
    wend

    'replace every remaining tag with a space
    position = 1
    while position <> 0
      position = StripBetween(position, "<", ">", true, html)
    wend

    html = replace(html, "&nbsp;", " ") 'must convert &nbsp; to spaces before removing duplicate spaces
    html = replace(html, Chr(9), " ") 'convert tabs to spaces before removing duplicate spaces
    html = RemoveDuplicateStrings(html, " ") 'remove duplicate spaces
    html = replace(html, " " & vbCrLf, vbCrLf) 'remove spaces immediately before line breaks
    html = RemoveDuplicateStrings(html, vbCrLf) 'remove duplicate <br>'s (remember that these were converted to vbCrLf's)
    html = replace(html, vbCrLf, vbCrLf & "<br>")
    
    GetTitleAndText = Array(title, html)

  end function
  
  
  'fetches and stores every URL in [Yider] that has not been parsed yet
  'stops when no unparsed URLs are left, when m_max_urls URLs have been
  'processed in this pass or when a page contains "too busy" text
  'urlextractor is the CURLExtractor that fetches each page
  'invalid_url_strings is not used here
  public sub GetTextThroughoutDatabase(valid_file_extensions, valid_domains, invalid_text, invalid_url_strings, urlextractor)
  
    Dim finished, html, recordset

    CreateRecordset recordset
    finished = false

    'the query is repeated because parsing a page may add new unparsed rows
    '(presumably ExtractHREFsFromURL inserts the links it finds - TODO confirm)
    while not finished and m_proceed and m_count < m_max_urls

      recordset.Open "select [url] from [Yider] where [parsed]=0", m_database
      
      if recordset.eof then
        finished = true
      end if
      
      while not recordset.eof and m_proceed and m_count < m_max_urls

        html = urlextractor.ExtractHREFsFromURL(recordset(0), valid_file_extensions, valid_domains)
        ExtractSearchableText recordset(0), html, invalid_text
        recordset.MoveNext
        m_count = m_count + 1
        
      wend
    
      recordset.Close
    wend
    
    set recordset = Nothing
    
  end sub

  
  'runs one spidering pass: parses start_url if required, then up to max_urls
  'unparsed URLs from [Yider], and writes a summary to the response
  public sub GetTextThroughoutDomain(start_url, valid_file_extensions, valid_domains, max_urls, invalid_text, invalid_url_strings)
    Dim html, recordset, urlextractor
    
    m_max_urls = max_urls
    
    CreateRecordset recordset

    set urlextractor = new CURLExtractor
    urlextractor.Constructor m_update, m_database
    
    urlextractor.m_invalid_url_strings = invalid_url_strings
    
    html = urlextractor.ExtractHREFsFromURL(start_url, valid_file_extensions, valid_domains)
    
    if ParsingRequired(start_url, recordset) then
      ExtractSearchableText start_url, html, invalid_text
      m_count = 1
    else
      m_count = 0
    end if
   
    GetTextThroughoutDatabase valid_file_extensions, valid_domains, invalid_text, invalid_url_strings, urlextractor
    
    if FULL_TEXT then
      'on error resume next
      'm_database.Execute("exec sp_fulltext_table @tabname='Yider', @action='start_incremental'")
      m_database.Execute("exec sp_fulltext_catalog 'YiderFullText', 'start_incremental'")
    end if

    
    if m_count <> 0 then
      PrintResults "<br><br>" & m_count & " URL's were spidered in this pass"
    else
      'nothing was crawled so the "crawled through" heading must not be printed
      m_heading_required = false
      PrintResults "<br><br>No URL's have been spidered."
    end if
      
    if IsArray(invalid_text) and m_count_invalid_text <> 0 then
      response.write "<br>" & m_count_invalid_text & " URLs contained some of the invalid text in <b>"
      ResponseWriteCommaDelimitedArray invalid_text
      response.write "</b>"
    end if
    
    set recordset = Nothing
        
    urlextractor.Destructor
    set urlextractor = Nothing

  end sub

  
  'creates the [Yider] table and its primary key
  'when FULL_TEXT is set, also creates the full text catalog and indexes the [text] column
  sub MakeYiderTable
  
    if FULL_TEXT then
      m_database.Execute "if not exists (select * from dbo.sysfulltextcatalogs) exec sp_fulltext_database @action = 'enable'"
      m_database.Execute "if not exists (select * from dbo.sysfulltextcatalogs where name = N'YiderFullText') exec sp_fulltext_catalog @ftcat = 'YiderFullText', @action = 'create'"
    end if
    
    m_database.Execute "begin tran CREATE TABLE [Yider] ([key] [int] IDENTITY (1, 1) NOT NULL, [url] [varchar] (900) NOT NULL, [title] [varchar] (900) NOT NULL, [text] [text] NOT NULL, [parsed] [bit] NOT NULL, [fulltext_timestamp] [timestamp] NOT NULL) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] commit tran"
    m_database.Execute "begin tran ALTER TABLE [Yider] WITH NOCHECK ADD CONSTRAINT [PK_Yider] PRIMARY KEY  CLUSTERED ([key]) ON [PRIMARY] commit tran"

    if FULL_TEXT then
      m_database.Execute "exec sp_fulltext_table @tabname = 'Yider', @action = 'create', @ftcat = 'YiderFullText', @keyname = 'PK_Yider'"
      m_database.Execute "exec sp_fulltext_column @tabname = 'Yider', @colname = 'text', @action = 'add'"
    end if

  end sub
  
  
  'returns true if start_url has no row in [Yider] or its row is not marked as parsed
  'recordset is a closed recordset supplied by the caller; it is closed again on return
  function ParsingRequired(start_url, recordset)
    Dim required
    
    'double the apostrophes so the URL cannot terminate the SQL string literal
    recordset.Open "select [parsed] from [Yider] where [url]='" & replace(start_url, "'", "''") & "'", m_database
    
    if recordset.eof then
      required = true
    else
      if recordset(0) = false then
        required = true
      else
        required = false
      end if
    end if
    
    recordset.Close
    
    ParsingRequired = required
    
  end function
  
  
  'writes str to the response, preceded by a heading the first time it is called
  sub PrintResults(str)
  
    if m_heading_required then      
      response.write "<br><br>The Yider crawled through the following URL's:<br>"
      m_heading_required = false
    end if
      
    response.write str
        
  end sub
  

  'remove duplicates of the string 'duplicate_string' in 'str'
  'eg RemoveDuplicateStrings("aaa11bbb11ccc", "1") returns "aaa1bbb1ccc"
  'str is modified in place as well as returned
  'an empty 'duplicate_string' leaves str unchanged
  'tested for
  'str = RemoveDuplicateStrings("", "b")
  'str = RemoveDuplicateStrings("b", "b")
  'str = RemoveDuplicateStrings("bb", "b")
  'str = RemoveDuplicateStrings("bbb", "b")
  'str = RemoveDuplicateStrings("1bbb", "b")
  'str = RemoveDuplicateStrings("bbb3", "b")
  'str = RemoveDuplicateStrings("1bbb3", "b")
  'str = RemoveDuplicateStrings("1bb3bb5", "b")
  'str = RemoveDuplicateStrings("1bb3bb5b", "b")
  'str = RemoveDuplicateStrings("1bb3bb5bb", "b")
  'str = RemoveDuplicateStrings("b1bb3bb5bb", "b")
  'str = RemoveDuplicateStrings("bb1bb3bb5bb", "b")
  'str = RemoveDuplicateStrings("bb1bb3bb5bb", "bb")
  'str = RemoveDuplicateStrings("bbb1bb3bb5bb", "bb")
  'str = RemoveDuplicateStrings("bbbb1bb3bb5bb", "bb")
  private function RemoveDuplicateStrings(str, duplicate_string)
    Dim doubled

    'InStr finds an empty string in any non-empty string, so without this
    'guard the loop below would never end
    if Len(duplicate_string) > 0 then

      doubled = duplicate_string & duplicate_string

      'a single replace halves each run, so repeat until no doubled string is left
      do while InStr(str, doubled) <> 0
        str = replace(str, doubled, duplicate_string)
      loop

    end if
    
    RemoveDuplicateStrings = str
    
  end function  

  
  'replaces everything from the start of an opening tag to the end of its closing tag with 'replacement'
  'eg ReplaceTag("here comes a <a href="""">tag</a>", "a", "poo")
  'returns "here comes a poo"
  'the tag name is matched without regard to case
  'str is modified in place as well as returned
  'a tag that is never closed is left untouched, as is everything after it
  'note that the opening tag is matched by prefix, so "a" also matches "<abbr"
  private function ReplaceTag(str, tag_name, replacement)
    Dim close_marker, close_start, open_marker, tag_end, tag_start

    open_marker = "<" & tag_name
    close_marker = "</" & tag_name

    tag_start = InStr(1, str, open_marker, vbTextCompare)
    
    do while tag_start <> 0

      close_start = InStr(tag_start + Len(open_marker), str, close_marker, vbTextCompare)

      'no closing tag: nothing more can be replaced, and repeating the search
      'would find the same opening tag for ever
      if close_start = 0 then exit do

      tag_end = InStr(close_start, str, ">")

      'the closing tag is cut off before its ">"
      if tag_end = 0 then exit do

      str = Left(str, tag_start - 1) & replacement & Mid(str, tag_end + 1)

      'search from the start again so that a tag formed by joining the two
      'halves is found as well
      tag_start = InStr(1, str, open_marker, vbTextCompare)
      
    loop
     
    ReplaceTag = str
    
  end function
  
  
  'returns true if [Yider] contains no unparsed URLs
  private function SpideringComplete
    Dim complete, recordset
    
    CreateRecordset recordset
    
    recordset.Open "select top 1 [key] from [Yider] where [parsed]=0", m_database
    
    if recordset.eof then
      complete = true
    else
      complete = false
    end if
    
    recordset.Close
    set recordset = Nothing
    
    SpideringComplete = complete
    
  end function

  
  'entry point for one page request: creates the [Yider] table if necessary,
  'runs one spidering pass of at most max_urls URLs and reports the progress
  'while unparsed URLs remain, the output includes a script that submits the
  'form named "population" after m_wait seconds to start the next pass
  public sub StoreTextThroughoutDomain(url, valid_file_extensions, valid_domains, invalid_text, invalid_url_strings, max_urls)
  
    Dim recordset
            
    if not DatabaseTableExists then
      MakeYiderTable
    end if
    
    GetTextThroughoutDomain url, valid_file_extensions, valid_domains, max_urls, invalid_text, invalid_url_strings
    CreateRecordset recordset
    recordset.Open "select count(*) from [Yider] where [parsed]=1", m_database
    
    if not SpideringComplete then
    
      response.Write NewLine(0) & "<script language=""JavaScript1.2"">"
      'appending "000" converts the seconds in m_wait to milliseconds
      response.Write NewLine(0) & "window.setTimeout('document.population.submit()', " & m_wait & "000);"
      response.Write NewLine(0) & "</script>"

      response.Write NewLine(0) & "<input type=""hidden"" name=""auto_populate"" value=""1"">"
      response.Write NewLine(0) & "<br><br>PLEASE WAIT - Spidering is incomplete for " & url & " and will recommence in " & m_wait & " seconds"
      response.Write NewLine(0) & "<br>" & recordset(0) & " URL's parsed so far..."
    else
    
      response.Write NewLine(0) & "<br><br>FINISHED - Spidering is complete for " & url
      response.Write NewLine(0) & "<br>" & recordset(0) & " URL's parsed"

    end if
    
    recordset.Close
    set recordset = Nothing
    
  end sub
  

  'removes the text from the first occurrence of 'first_str' at or after 'position'
  'up to and including the next occurrence of 'last_str'
  'a single space is left in its place if 'add_space' is true
  'str is modified in place; the comparison is case sensitive
  'returns the position from which the next search should start (the last
  'character before the removed text, or 1 if the text was at the very start)
  'returns 0 if nothing was removed or if the removed text ended with the last
  'character of str (this is only detected for a one character 'last_str')
  ''position' is passed by reference and receives the same value
  public function StripBetween(position, first_str, last_str, add_space, byref str)
    Dim length, first_str_pos, last_str_pos, str_left, str_right
    
    length = Len(str)

    first_str_pos = InStr(position, str, first_str)
    
    if first_str_pos = 0 then

      position = 0

    else
    'we found the first character
      last_str_pos = InStr(first_str_pos + 1, str, last_str)

      if last_str_pos = 0 then
      'the text is never closed so leave it alone
        position = 0
      
      else
        str_left = Left(str, first_str_pos - 1)
        str_right = Right(str, Len(str) - (last_str_pos + Len(last_str) - 1))
        
        if add_space then
          str = str_left + " " + str_right
        else
          str = str_left + str_right
        end if
                
        if last_str_pos = length then
        'the last character is at the end of the string
          position = 0
        else
          position = Len(str_left)
                    
          'InStr does not accept a start position of 0
          if position = 0 then
            position = 1
          end if
        end if
        
      end if
      
    end if

    StripBetween = position
      
  end function
  
  
  'returns false only when m_update is false and the url already has a row in [Yider]
  private function UpdateRequired(url)
    Dim query, recordset, required

    required = true
            
    if not m_update then

      'the recordset is only needed when the table has to be queried
      set recordset = Server.CreateObject("ADODB.Recordset")
      recordset.CursorType = adOpenForwardOnly
      recordset.CacheSize = 1
        
      query =  "select [key] from [Yider] where [url] = '" & replace(url, "'", "''") & "'"
      recordset.Open query, m_database

      if not recordset.eof then
       required = false
      end if
      
      recordset.Close
      set recordset = Nothing

    end if
    
    UpdateRequired = required
    
  end function

end class

%>



<%
  'text = extractor.ExtractHREFsFromURL("http://127.0.0.1/spider/page1.htm", Array("htm", "html", "asp", "php", "php3"), Array("spider"))
  'text = extractor.ExtractHREFsFromURL("http://127.0.0.1/esolutions/index.asp", Array("htm", "html", "asp", "php", "php3"), Array("127.0.0.1/esolutions"))
  'text = extractor.ExtractHREFsFromURL("http://www.ninemsn.com.au", Array("htm", "html", "asp", "php", "php3"), Array("kineticdesign"))
  'text = extractor.ExtractHREFsFromURL("http://www.d-i-g.com.au", Array("htm", "html", "asp", "php", "php3"), Array("d-i-g"))
  'text = extractor.ExtractHREFsFromURL("http://www.martindawessystems.com", Array("htm", "html", "asp", "php", "php3"), Array("martindawessystems", "dise3g"))
  'text = extractor.GetTextAtURL("http://127.0.0.1/esolutions/index.asp")
  
  'extractor.GetTextThroughoutDomain "http://127.0.0.1/esolutions/index.asp", Array("htm", "html", "asp", "php", "php3"), Array("127.0.0.1/esolutions")
%>  

