<!-- #include file="adovbs.inc" -->
<!-- #include file="functions.asp" -->

<%

class CURLExtractor

  Private m_database, m_update, m_winHttp
  Public m_invalid_url_strings


  'records 'url' in the [Yider] table
  ''url' is the address to store
  ''examined' is true when the page behind 'url' has already been parsed
  'a url that is not yet in the table is inserted with [parsed] set to 1 or 0
  'a url that is already there is only touched to flip [parsed] from false to true
  private sub AddToDatabase(url, examined)
  
    Dim parsed, recordset, safe_url
    
    if examined then
      parsed = 1
    else
      parsed = 0
    end if
    
    'the url comes from fetched html or from the caller, so double any single quote
    'before it is embedded in a sql string literal; a local copy is used because
    'vbscript passes arguments byref by default
    safe_url = Replace(url, "'", "''")
    
    'CreateRecordset is not defined in this file
    CreateRecordset recordset

    recordset.Open "select [parsed] from [Yider] where [url]='" & safe_url & "'", m_database
    
    if recordset.eof then
      m_database.Execute "begin tran insert into [Yider] values ( '" & safe_url & "', '', '', " & parsed & ", DEFAULT) commit tran"
    else
    
      'never downgrade a row that is already marked as parsed
      if recordset(0) = false and examined then
        m_database.Execute "begin tran update [Yider] set [parsed]=1 where [url]='" & safe_url & "' commit tran"
      end if
    
    end if

    recordset.Close
    set recordset = Nothing
    
  end sub

  
  'initialises the extractor; call once before any other method
  ''update' is stored in m_update
  ''database' is the connection object used by AddToDatabase
  public sub Constructor(update, byref database)
  
    'WinHTTP 5.0 (ProgID "WinHttp.WinHttpRequest.5") is no longer shipped with windows,
    'so try the 5.1 component first and fall back to the legacy ProgID
    On Error Resume Next
    set m_winHttp = Server.CreateObject("WinHttp.WinHttpRequest.5.1")
    
    if Err.Number <> 0 then
      Err.Clear
      'error trapping is switched off again so that a failure of the fallback
      'reaches the caller exactly as it did before
      On Error GoTo 0
      set m_winHttp = Server.CreateObject("WinHttp.WinHttpRequest.5")
    end if
    
    On Error GoTo 0
    
    m_update = update
    set m_database = database
        
  end sub


  'drops the references taken in Constructor
  'the http request object goes first, then the database connection
  Public Sub Destructor
    Set m_winHttp = Nothing
    Set m_database = Nothing
  End Sub
  
  
  'decides whether 'href' may be followed
  'when 'valid_domains' is not an array there is no restriction and the answer is true
  'otherwise 'href' must contain "http://" and must be accepted by InArrayStr
  '(InArrayStr is not defined in this file)
  Private Function DomainValid(href, valid_domains)

    If Not IsArray(valid_domains) Then
      DomainValid = True

    ElseIf InStr(href, "http://") = 0 Then
      DomainValid = False

    ElseIf InArrayStr(href, valid_domains) Then
      DomainValid = True

    Else
      DomainValid = False
    End If

  End Function

  

  'eg GetBaseURL("http://www.esolutions.com.au") returns http://www.esolutions.com.au
  'eg GetBaseURL("http://www.esolutions.com.au/") returns http://www.esolutions.com.au
  'eg GetBaseURL("http://www.esolutions.com.au/page10.htm") returns http://www.esolutions.com.au
  'eg GetBaseURL("http://www.esolutions.com.au/page10") returns http://www.esolutions.com.au/page10
  'eg GetBaseURL("http://www.esolutions.com.au/page10/") returns http://www.esolutions.com.au/page10
  'backslashes in 'url' are treated as forward slashes
  'url must contain "http://" at least
  'the caller's variable is left untouched
  private function GetBaseURL(url)
  
    Dim normalized
    
    'work on a copy: vbscript passes arguments byref by default, so assigning
    'to 'url' would silently rewrite the caller's variable
    normalized = Replace(url, "\", "/")
   
    'position 8 is the first character after "http://"
    if InStr(8, normalized, "/") <> 0 then
      GetBaseURL = GetBaseURLWithSlashes(normalized)
    else
      GetBaseURL = normalized
    end if
    
  end function

  
  'strips a trailing slash or a trailing file name from 'url'
  'GetBaseURLWithSlashes("http://www.esolutions.com.au/")           -> http://www.esolutions.com.au
  'GetBaseURLWithSlashes("http://www.esolutions.com.au/page10.htm") -> http://www.esolutions.com.au
  'GetBaseURLWithSlashes("http://www.esolutions.com.au/page10")     -> http://www.esolutions.com.au/page10
  'GetBaseURLWithSlashes("http://www.esolutions.com.au/page10/")    -> http://www.esolutions.com.au/page10
  'precondition: 'url' has a '/' somewhere after 'http://'
  Function GetBaseURLWithSlashes(url)

    Dim slash_at, dot_at

    'a trailing slash is simply dropped
    If Right(url, 1) = "/" Then
      GetBaseURLWithSlashes = Left(url, Len(url) - 1)
      Exit Function
    End If

    'a dot after the last slash means the last segment is a file name,
    'so everything from that slash onwards is cut off
    slash_at = InStrRev(url, "/")
    dot_at = InStr(slash_at, url, ".")

    If dot_at > slash_at Then
      GetBaseURLWithSlashes = Left(url, slash_at - 1)
    Else
      GetBaseURLWithSlashes = url
    End If

  End Function
  
  
  'returns the directory part of 'url', always ending in '/'
  'http://www.esolutions.com.au/          -> http://www.esolutions.com.au/
  'http://www.esolutions.com.au           -> http://www.esolutions.com.au/
  'http://www.esolutions.com.au/index.asp -> http://www.esolutions.com.au/
  'http://www.esolutions.com.au/index     -> http://www.esolutions.com.au/index/
  '(a last segment without a dot is taken to be a directory)
  'backslashes in 'url' are treated as forward slashes
  'the caller's variable is left untouched
  private function GetDirectory(url)
    Dim directory, pos_slash, pos_dot
        
    'work on a copy: vbscript passes arguments byref by default, so assigning
    'to 'url' would silently rewrite the caller's variable
    directory = Replace(url, "\", "/")
    
    pos_slash = InStrRev(directory, "/")
    pos_dot = InStrRev(directory, ".")
    
    'a dot after the last slash is a file name, unless that slash is the second
    'one of the "//" after the scheme, in which case the dot belongs to the host
    if pos_dot > pos_slash and InStrRev(directory, "//") <> pos_slash - 1 then
      directory = Left(directory, pos_slash)
      
    elseif Right(directory, 1) <> "/" then
      directory = directory & "/"
    end if
    
    GetDirectory = directory
    
  end function
  
  
  'returns the file extension of 'href' without the dot, or "" when there is none
  'a query string (everything from the first '?') is ignored
  'eg "index.asp" and "index.asp?id=1" both return "asp"; "page?id=1" returns ""
  'the extension is whatever follows the last dot, so a url whose only dots are
  'in the host name returns the tail of the host plus the path
  private function GetFileExtension(href)
    Dim path, pos_dot, pos_question
    
    pos_question = InStr(href, "?")

    if pos_question <> 0 then
      path = Left(href, pos_question - 1)
    else
      path = href
    end if
    
    pos_dot = InStrRev(path, ".")
    
    'without this check a dot-less path in front of a '?' would be returned
    'whole as the "extension"
    if pos_dot <> 0 then
      GetFileExtension = Mid(path, pos_dot + 1)
    else
      GetFileExtension = ""
    end if
    
  end function
  
  
  'returns the fully qualified url of 'href' as found on the page 'url'
  'a href containing "http://" or "https://" is taken as already absolute
  'any other href is appended to the directory of 'url'
  'a leading '/' on the href is dropped first, so the href is resolved against
  'that directory and not against the site root
  'GetFullyQualifiedURL("http://www.esolutions.com.au/articles", "index.asp") - http://www.esolutions.com.au/articles/index.asp
  'GetFullyQualifiedURL("http://www.esolutions.com.au/articles", "/index.asp") - http://www.esolutions.com.au/articles/index.asp
  'GetFullyQualifiedURL("http://www.esolutions.com.au/articles", "../index.asp") - http://www.esolutions.com.au/index.asp
  'GetFullyQualifiedURL("http://www.esolutions.com.au/articles/", "index.asp") - http://www.esolutions.com.au/articles/index.asp
  'GetFullyQualifiedURL("http://www.esolutions.com.au/articles/", "/index.asp") - http://www.esolutions.com.au/articles/index.asp
  'GetFullyQualifiedURL("http://www.esolutions.com.au/articles/", "../index.asp") - http://www.esolutions.com.au/index.asp
  'in every case the result is passed through RemoveDotDotReferencing
  'neither 'url' nor 'href' is modified for the caller
  private function GetFullyQualifiedURL(url, href)
    Dim base_url, fq_url, link
 
    'vbscript passes arguments byref by default and the helpers called below may
    'assign to what they are given, so only local copies are handed on
    base_url = url
    link = href
 
    if InStr(link, "http://") = 0 and InStr(link, "https://") = 0 then
    
      if Left(link, 1) = "/" then
        link = Mid(link, 2)
      end if

      fq_url = RemoveDotDotReferencing(GetDirectory(base_url) & link)
    
    else
    
      fq_url = RemoveDotDotReferencing(link)
      
    end if
        
    GetFullyQualifiedURL = fq_url
      
  end function

  
  'scans 'html' for every occurrence of tag_attribute="..." and stores the links it accepts
  ''url' is the fully qualified name of the page 'html' was fetched from
  'eg http://www.somewhere.com/page.htm
  ''html' is the text of that page
  ''valid_file_extensions' is an array of acceptable file extensions for a link
  'eg Array("htm", "html", "asp", "php", "php3")
  ''valid_domains' is passed to DomainValid to restrict which links are followed
  ''tag_attribute' is the attribute name to look for, eg "href" or "src"
  'only double-quoted attribute values are recognised and the match is case sensitive
  'a link is stored (AddToDatabase, not yet parsed) when IsHREF accepts its position,
  'its extension is accepted, DomainValid accepts the fully qualified url and
  'InStrArray(fq_url, m_invalid_url_strings) is false
  '(InArrayStrExact and InStrArray are not defined in this file)
  private sub GetHREFs(url, html, valid_file_extensions, valid_domains, tag_attribute)
    Dim file_extension, fq_url, href, in_str, marker, page_url, pos_end, pos_start, pos_value, valid
    
    marker = tag_attribute & "="""
    pos_start = InStr(html, marker)
                 
    while pos_start <> 0
    
      'first character of the attribute value
      pos_value = pos_start + Len(marker)
      pos_end = InStr(pos_value, html, """")
      
      if pos_end = 0 then
      
        'unterminated attribute: there is no further double quote in the page, so
        'no later attribute can be complete either; stop instead of letting Mid
        'fail on a negative length
        pos_start = 0
        
      else
      
        href = Mid(html, pos_value, pos_end - pos_value)
        file_extension = GetFileExtension(href)
        
        if IsHREF(html, pos_start) and InArrayStrExact(file_extension, valid_file_extensions) then
        
          'arguments are byref by default in vbscript and the url helpers may
          'assign to theirs, so hand over a scratch copy and keep 'url' intact
          page_url = url
          fq_url = GetFullyQualifiedURL(page_url, href)

          valid = DomainValid(fq_url, valid_domains)
          in_str = InStrArray(fq_url, m_invalid_url_strings)

          if valid and not in_str then
            AddToDatabase fq_url, false
          end if
          
        end if
        
        pos_start = InStr(pos_start + 1, html, marker)
        
      end if
      
    wend
            
  end sub


  'entry point: stores 'start_url', fetches it and harvests its links
  ''start_url' must contain "http://", otherwise a message is written and the response is ended
  ''valid_file_extensions' and 'valid_domains' are handed on to GetURLsDirect
  'returns whatever GetURLsDirect returns for 'start_url'
  Public Function ExtractHREFsFromURL(start_url, valid_file_extensions, valid_domains)
    Dim page_text

    If InStr(start_url, "http://") <> 0 Then
      AddToDatabase start_url, False
      page_text = GetURLsDirect(start_url, valid_file_extensions, valid_domains)
    Else
      Response.Write "The url you supplied must contain a ""http://"" e.g. http://www.esolutions.com.au"
      Response.End
    End If

    ExtractHREFsFromURL = page_text

  End Function
  
  
  'fetches 'url' and stores the links found in its href="..." and src="..." attributes
  ''url' is the fully qualified name of the page where links are to be extracted
  'from eg http://www.somewhere.com/page.htm
  ''valid_file_extensions' is an array of acceptable file extensions for a link
  'eg Array("htm", "html", "asp", "php", "php3")
  ''valid_domains' restricts which links are kept eg Array("esolutions")
  'returns the text of the page, or "" when the request failed; in that case a
  'message is written to the response and no links are extracted
  public function GetURLsDirect(url, valid_file_extensions, valid_domains)
   
    Dim html, request_failed
    
    html = ""
                                            
    'without error trapping a failed request aborts the procedure before the
    'failure can be reported, so trap it for the duration of the request only
    On Error Resume Next
    
    m_winHttp.open "GET", url, false
    m_winHttp.send()
    
    if Err.Number = 0 then
      html = m_winHttp.ResponseText
    end if
    
    request_failed = (Err.Number <> 0)
    Err.Clear
    
    On Error GoTo 0
        
    if request_failed then
    
      'the url is echoed into the page, so encode it
      Response.Write "<br>The url " & Server.HTMLEncode(url) & " cannot be found"
      
    else
    
      GetHREFs url, html, valid_file_extensions, valid_domains, "href"
      GetHREFs url, html, valid_file_extensions, valid_domains, "src"
      
    end if
        
    GetURLsDirect = html
          
  end function


  'tells whether the attribute starting at 'pos_start' sits inside a real tag
  'text such as &lt; href="index.asp" &gt; only shows a tag and does not count,
  'whereas <a href="index.asp">ref</a> does
  'the test looks at what closes the tag: the first '>' from 'pos_start' onwards must
  'exist and must not come after the first "&gt;"
  Private Function IsHREF(html, pos_start)
    Dim entity_at, bracket_at

    entity_at = InStr(pos_start, html, "&gt;")
    bracket_at = InStr(pos_start, html, ">")

    If bracket_at = 0 Then
      'nothing closes the tag at all
      IsHREF = False

    ElseIf entity_at = 0 Then
      'a real '>' and no escaped one
      IsHREF = True

    Else
      'both are present: the real bracket has to come first
      IsHREF = (bracket_at <= entity_at)
    End If

  End Function
  
  
  'some urls are of the form http://www.esolutions.com.au/articles/../index.asp
  'or http://www.esolutions.com.au/stuff/../articles/../index.asp
  'both are exactly equivalent to http://www.esolutions.com.au/index.asp
  'this function returns the converted url
  'backslashes become forward slashes and a "//" after the scheme becomes "/"
  'a ".." that would climb above the site root is dropped and the host is kept
  'eg http://www.esolutions.com.au/../index.asp gives http://www.esolutions.com.au/index.asp
  'url must start with 'http://' or 'https://'
  'the caller's variable is left untouched
  public function RemoveDotDotReferencing(url)
    Dim fixed, pos_dotdot, pos_host, pos_slash, scheme, start
    
    'work on a copy: vbscript passes arguments byref by default, so assigning
    'to 'url' would silently rewrite the caller's variable
    fixed = Replace(url, "\", "/")
    
    if InStr(fixed, "https:") <> 0 then
      scheme = "https://"
    else
      scheme = "http://"
    end if
    
    'position of the first character after the scheme
    pos_host = Len(scheme) + 1
    
    'Replace with a start position returns only the text from that position on,
    'so the scheme has to be put back in front
    fixed = scheme & Replace(fixed, "//", "/", pos_host)
    
    start = 1
    pos_dotdot = InStr(start, fixed, "/..")
        
    while pos_dotdot <> 0
    
      'slash that opens the segment in front of the "/.."
      pos_slash = InStrRev(fixed, "/", pos_dotdot - 1)

      if pos_slash < pos_host then
      
        'that slash belongs to the "//" of the scheme, so the segment in front is
        'the host itself: only the "/.." is removed
        fixed = Left(fixed, pos_dotdot - 1) & Mid(fixed, pos_dotdot + 3)
        start = pos_dotdot
        
      else
      
        'remove the segment together with the "/.." that cancels it
        fixed = Left(fixed, pos_slash - 1) & Mid(fixed, pos_dotdot + 3)
        start = pos_slash
        
      end if
      
      pos_dotdot = InStr(start, fixed, "/..")
      
    wend
      
    RemoveDotDotReferencing = fixed
    
  end function
  
end class


%>
