
Usage: HTML::Parser::parse(self, chunk)

From: Matej Kovacic
Date: April 10, 2001 06:08
Subject: Usage: HTML::Parser::parse(self, chunk)
Message ID: OIEOJJFOOGNHPHBNOADBAEIACCAA.matej.kovacic@uni-lj.si
Hi!

I have written a program in Perl which reads URLs from a file, does some
simple analysis of each one, and extracts all the links. The links are written
to the file results.net, and information about each URL to results.out.
I ran the program, and after analysing several thousand pages I got this
error:
===
Uncaught exception from user code:
        Usage: HTML::Parser::parse(self, chunk) at jure.pl line ---.

main::analyse('http://alibaba.ijs.si/ME/CD/docs/CES1/dtd2html/cesAna/distributo...') called at jure.pl line 103

===
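
The message itself is not a complaint about the HTML being parsed: it comes from the argument check in HTML::Parser's compiled (XS) code, which dies with exactly this text when parse() is called without its chunk argument. A minimal sketch that provokes the same message (the second call is deliberately wrong; the sample markup is made up):

#!/usr/bin/perl -w
use strict;
use HTML::LinkExtor;   # subclass of HTML::Parser, same parse() method

my $p = HTML::LinkExtor->new;
$p->parse('<a href="index.html">ok</a>');   # fine: self plus one chunk
$p->parse();   # dies: Usage: HTML::Parser::parse(self, chunk)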
Here is the program:

#!/usr/bin/perl -w
use CGI;

use HTML::LinkExtor;
use LWP::Simple;
use HTTP::Response;
use LWP;
use LWP::UserAgent;
use URI::URL;
use strict;
use diagnostics;

# HTTP response codes in a hash (RFC 2068)
###
my %statuscode = (
	100   => 'Continue',
	101   => 'Switching Protocols',
	200   => 'OK',
	201   => 'Created',
	202   => 'Accepted',
	203   => 'Non-Authoritative Information',
	204   => 'No Content',
	205   => 'Reset Content',
	206   => 'Partial Content',
	300   => 'Multiple Choices',
	301   => 'Moved Permanently',
	302   => 'Moved Temporarily',
	303   => 'See Other',
	304   => 'Not Modified',
	305   => 'Use Proxy',
	400   => 'Bad Request',
	401   => 'Unauthorized',
	402   => 'Payment Required',
	403   => 'Forbidden',
	404   => 'Not Found',
	405   => 'Method Not Allowed',
	406   => 'Not Acceptable',
	407   => 'Proxy Authentication Required',
	408   => 'Request Time-out',
	409   => 'Conflict',
	410   => 'Gone',
	411   => 'Length Required',
	412   => 'Precondition Failed',
	413   => 'Request Entity Too Large',
	414   => 'Request-URI Too Large',
	415   => 'Unsupported Media Type',
	500   => 'Internal Server Error',
	501   => 'Not Implemented',
	502   => 'Bad Gateway',
	503   => 'Service Unavailable',
	504   => 'Gateway Time-out',
	505   => 'HTTP Version not supported'
	);

my $filesdir = "files";
$| = 1;

# print "Content-type: text/html\n\n";

BEGIN {
      open(OUT,">results/result.out") || print "Error!";
      open(NET,">results/result.net") || print "Error!";
}
END {
      close(NET);
      close(OUT);
}

  my $file = "si-url.txt";
  if ($file ne '') {

      print "Analysing file: $file";
      open(FILE,"$filesdir/$file") || print "Error - no file $file.";
      local $\ = undef;
      my @file = <FILE>;
      close(FILE);

      print "START:\n";

      my $count = 1;
      foreach my $main_line (@file) {
        $/="\r\n";
        chomp($main_line);
        $/="\n";
        my $base_url = "$main_line";
        print "Line number: $count\n";
        if ($count > 1665) {
          &analyse($base_url);
        }
        $count++;
      }
  }


###################
# Analyse page!
###################
sub analyse {

  my $url=shift;

  chomp($url);

  my $browser = LWP::UserAgent->new();
  $browser->agent("MatejKovacicGregaPetric/InternetResearchProject");

  my $webdoc = $browser->request(HTTP::Request->new(GET => $url));

  my $responsecode = $webdoc->code;

  if ($webdoc->is_success) {

    # COUNT images
    @main::images=();
    @main::images = $webdoc->content =~
	m{
	 <  \s*   img
	 \s*   ["|']?  (  [^\s>'"]+  )   ['|"]?
	 }xgm;
    $main::num_img=@main::images;

    my $string = $webdoc->content;
    $string =~ s/\n//g; # remove all newlines!
    $string =~ s/\r//g; # remove all carriage returns!
    $string =~ s/\t//g; # remove all tabs!

    # LENGTH of page
    my $size1 = length($string); # get the size of a string!

    $string =~ s/<([^>]|\n)*>//g; # remove all HTML tags
    $string =~ s/  //g; # remove all double spaces!

    # LENGTH of text on the page
    my $size2 = length($string); # get the size of a shortened string!

    print OUT "$url <> $responsecode <> $size1 <> $size2 <> ",$webdoc->base,
	" <> ",$webdoc->content_type," <> ",$webdoc->title,
	    " <> $main::num_img\n";

    # EXTRACT links
    $main::base_url = $webdoc->base;
    my $parser = HTML::LinkExtor->new(undef, $main::base_url);
    $parser->parse(get($main::base_url))->eof;

    @main::links = $parser->links;
    my %seen;

    foreach my $linkarray (@main::links) {
      @main::element=();
      @main::element  = @$linkarray;
      my $elt_type = shift @main::element;
      while (@main::element) {
        my ($attr_name , $attr_value) = splice(@main::element, 0, 2);
        $seen{$attr_value}++;
      }
    }

    @main::arr = sort keys %seen;

    my $i = 0;
    for (sort keys %seen) {
      print NET "$main::base_url -> $main::arr[$i]\n";
      $i++;
    }

  }
  else {
    print OUT "$url <> $responsecode <> . <> . <> . <> . <> . <> .\n";
  }
}

====
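
The only place the listing calls parse() is $parser->parse(get($main::base_url))->eof, which re-fetches, via LWP::Simple::get(), a page the program has already downloaded into $webdoc. If that second fetch fails, get() returns nothing useful, and depending on the LWP and HTML::Parser releases the parse() call can end up with no chunk argument at all, which would produce exactly the usage error quoted above. A small standalone sketch of a guarded version of that step, reusing the response instead of fetching twice (the URL and the warning text are illustrative, not part of the original program):

#!/usr/bin/perl -w
use strict;
use HTML::LinkExtor;
use LWP::UserAgent;
use HTTP::Request;

my $url     = 'http://www.example.org/';   # placeholder URL
my $browser = LWP::UserAgent->new;
my $webdoc  = $browser->request(HTTP::Request->new(GET => $url));

if ($webdoc->is_success) {
    my $content = $webdoc->content;                      # reuse, don't re-fetch
    my $parser  = HTML::LinkExtor->new(undef, $webdoc->base);
    if (defined $content && length $content) {
        $parser->parse($content)->eof;                   # chunk is always defined here
        for my $link ($parser->links) {
            my ($tag, %attrs) = @$link;
            print "$_\n" for values %attrs;              # absolute link targets
        }
    }
}
else {
    warn "fetch failed for $url: ", $webdoc->status_line, "\n";
}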
Do you have any idea how I could solve this problem?

I have to tell you I am quite new to Perl programming... but I am trying
hard.

Thanks in advance.

bye, Matej

