Ir para conteúdo
Fórum Script Brasil
  • 0

(Resolvido) Web Scraping - ler página html


mariolopes

Pergunta

4 respostas a esta questão

Posts Recomendados

  • 0

pesquise nesses links

https://www.google.com.br/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=web+scraping+delphi

https://www.google.com.br/webhp?sourceid=chrome-instant&ion=1&espv=2&ie=UTF-8#q=ler%20arquivo%20html%20delphi

 

exemplo

program WebScrape;
 
{$APPTYPE CONSOLE}
 
{.$DEFINE DEBUG}
 
uses
  Classes,
  Winsock;
 
 
{ Connects to hostName on TCP port 80, sends an HTTP/1.1 GET request for
  resource and stores the raw response (status line, headers and body) in
  HTTPResponse, one list entry per line.

  Parameters:
    hostName     - host name or dotted IPv4 address of the server
    resource     - absolute path of the resource, e.g. '/index.html'
    HTTPResponse - receives the response; it is always cleared first

  Returns True when at least one line was received.  Returns False when
  Winsock could not be initialised, no socket could be created, the host
  name could not be resolved, or the connect/send call failed. }
function DoHTTPGET(const hostName: PAnsiChar; const resource: PAnsiChar; HTTPResponse: TStrings): Boolean;
const
  Port: integer = 80;
  CRLF = #13#10; // carriage return/line feed
var
  WSAData: TWSAData;
  Sock: TSocket;
  SockAddrIn: TSockAddrIn;
  IPAddress: PHostEnt;
  bytesIn: integer;
  // The wire format is bytes, so the buffer and the request are declared
  // with single-byte types.  With Char/string a Unicode compiler would
  // send UTF-16 data (and only half of it, as Length counts characters).
  inBuffer: array [0..1023] of AnsiChar;
  Req: AnsiString;
  Chunk: AnsiString;
  RawResponse: AnsiString;
begin
  Result := False;
  HTTPResponse.Clear;

  { Initialise use of the Windows Sockets DLL, requesting version 1.1
    ($101) which every Winsock implementation supports.  WSACleanup must
    only be called after a successful start-up, so bail out on failure. }
  if Winsock.WSAStartUp($101, WSAData) <> 0 then
    Exit;
  try

    // Create a TCP stream socket (AF_INET / SOCK_STREAM / IPPROTO_TCP)
    Sock := WinSock.Socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

    // Nothing to close if the socket could not be created
    if (Sock = INVALID_SOCKET) then
      Exit;

    try
      // Populate socket address structure; zero it first so that the
      // padding field (sin_zero) does not contain stack garbage
      FillChar(SockAddrIn, SizeOf(SockAddrIn), 0);
      SockAddrIn.sin_family := AF_INET;
      SockAddrIn.sin_port := htons(Port);
      SockAddrIn.sin_addr.s_addr := inet_addr(hostName);

      if (SockAddrIn.sin_addr.s_addr = INADDR_NONE) then
        begin
          { hostName is not a dotted IP address, so it has to be
            resolved as a domain name }
          IPAddress := Winsock.gethostbyname(hostName);

          // Quit if we didn't get an IP Address
          if (IPAddress = nil) then
            Exit;

          // Use the first address returned for the host
          SockAddrIn.sin_addr.s_addr := PLongint(IPAddress^.h_addr_list^)^;
        end;

      // Try to connect to host
      if (Winsock.connect(Sock, SockAddrIn, SizeOf(SockAddrIn)) = SOCKET_ERROR) then
        Exit;

      { Compose the request.  Every line is terminated with CRLF:
          - request line: method, resource path and HTTP version
          - Host: the only header required by HTTP 1.1
          - Connection: close, because persistent connections are the
            HTTP 1.1 default and the read loop below relies on the
            server closing the connection to signal the end of data
          - an empty line that ends the request }
      Req := 'GET ' + AnsiString(resource) + ' HTTP/1.1' + CRLF +
             'Host: ' + AnsiString(hostName) + CRLF +
             'Connection: close' + CRLF +
             CRLF;

      // Try to send the request to the host
      if (Winsock.send(Sock, Req[1], Length(Req), 0) = SOCKET_ERROR) then
        Exit;

      { Read until the peer closes the connection (0) or an error occurs
        (SOCKET_ERROR, which is negative).  The bytes are accumulated in
        one buffer and split into lines only once at the end: assigning
        HTTPResponse.Text after every chunk would insert a line break at
        each chunk boundary and re-parse the whole response every time. }
      RawResponse := '';
      repeat
        bytesIn := Winsock.recv(Sock, inBuffer, SizeOf(inBuffer), 0);
        if bytesIn > 0 then
          begin
            SetString(Chunk, PAnsiChar(@inBuffer[0]), bytesIn);
            RawResponse := RawResponse + Chunk;
          end;
      until bytesIn <= 0;

      HTTPResponse.Text := string(RawResponse);

      // Our list of response strings should contain at least 1 line
      Result := HTTPResponse.Count > 0;

    finally
      // Close our socket
      Winsock.closesocket(Sock);
    end;

  finally
    { Deregister from the Windows Sockets implementation so that it can
      free any resources allocated on our behalf. }
    Winsock.WSACleanup;
  end;

end;
 
{ Locates and returns the UTC time from the response to the request sent
  to http://tycho.usno.navy.mil/cgi-bin/timer.pl

  HTTPResponse contains both the HTTP headers and the HTML served up by
  the requested resource, one entry per line.

  Returns:
    - the text between the leading 4-character tag and the end of the
      'UTC' marker of the first line containing 'UTC', when the status
      line contains '200'
    - an empty string when the status is 200 but no line contains 'UTC'
    - 'HTTP Error: ' followed by the status line for any other status
    - 'HTTP Error: empty response' when the list has no lines at all }
function ParseResponse(HTTPResponse: TStrings): string;
const
  Marker = 'UTC';
  TagLen = 4; // length of the leading '<BR>' that is skipped
var
  i: Integer;
  MarkerPos: Integer;
begin
  Result := '';

  // Guard against an empty list: HTTPResponse[0] would raise otherwise
  if HTTPResponse.Count = 0 then
    begin
      Result := 'HTTP Error: empty response';
      Exit;
    end;

  { Check first line for server response code
    We want something like this: HTTP/1.1 200 OK }
  if Pos('200',HTTPResponse[0]) > 0 then
    begin
      for i := 0 to Pred(HTTPResponse.Count) do
        begin
          { The line we're looking for is something like this:
            <BR>May. 04. 21:55:19 UTC	Universal Time }
          MarkerPos := Pos(Marker,HTTPResponse[i]);

          if MarkerPos > 0 then
            begin
              { Copy from just after the tag up to and including the
                marker.  The count is measured from the start column
                (TagLen + 1), not from column 1, so no character after
                'UTC' is picked up. }
              Result := Copy(HTTPResponse[i], TagLen + 1,
                             MarkerPos + Length(Marker) - 1 - TagLen);
              Break;
            end;

        end;
    end
    else
    Result := 'HTTP Error: '+HTTPResponse[0];
end;
 
 
const
  // Server and resource that are requested by the program body below
  host: PAnsiChar = 'tycho.usno.navy.mil';
  res : PAnsiChar = '/cgi-bin/timer.pl';

var
  // Receives the lines of the HTTP response
  Response: TStrings;

begin
  // TStringList is the concrete TStrings descendant used as storage
  Response := TStringList.Create;
  try
    // Report a failed HTTP GET, otherwise show what came back
    if not DoHTTPGET(host, res, Response) then
      Writeln('Error retrieving data')
    else
      begin
        {$IFDEF DEBUG}
        // Debug build: dump the complete response to the console
        Writeln(Response.Text);
        {$ELSE}
        // Normal build: print only the value extracted by the parser
        Writeln(ParseResponse(Response));
        {$ENDIF DEBUG}
      end;
  finally
    Response.Free;
  end;

  // Wait for Enter so that the console window stays open
  Readln;
end.
 
 


Example using Indy's IdHTTP component.

program ReadUTCTime;

{$APPTYPE CONSOLE}

uses SysUtils, Classes, IdHTTP;

{ Downloads http://tycho.usno.navy.mil/cgi-bin/timer.pl with Indy's
  TIdHTTP and writes the first line of the response body that contains
  'UTC' to the console.  Nothing is written when no line matches. }
var
  s: string;
  lHTTP: TIdHTTP;
  lReader: TStringReader;
begin
  lHTTP := TIdHTTP.Create(nil);
  try
    lReader := TStringReader.Create(lHTTP.Get('http://tycho.usno.navy.mil/cgi-bin/timer.pl'));
    // The reader gets its own try/finally: if Get raises, lReader was
    // never assigned and must not be freed.
    try
      // NOTE(review): Peek is documented to return -1 at end of input,
      // so ">= 0" keeps reading past a NUL character - confirm
      while lReader.Peek >= 0 do
      begin
        s := lReader.ReadLine;
        if Pos('UTC', s) > 0 then
        begin
          Writeln(s);
          Break;
        end;
      end;
    finally
      lReader.Free;
    end;
  finally
    lHTTP.Free;
  end;
end.

abraço

Link para o comentário
Compartilhar em outros sites

Participe da discussão

Você pode postar agora e se registrar depois. Se você já tem uma conta, acesse agora para postar com sua conta.

Visitante
Responder esta pergunta...

×   Você colou conteúdo com formatação.   Remover formatação

  Apenas 75 emoticons são permitidos.

×   Seu link foi incorporado automaticamente.   Exibir como um link em vez disso

×   Seu conteúdo anterior foi restaurado.   Limpar Editor

×   Você não pode colar imagens diretamente. Carregar ou inserir imagens do URL.



  • Estatísticas dos Fóruns

    • Tópicos
      152k
    • Posts
      651,7k
×
×
  • Criar Novo...