develooper Front page | perl.libwww | Postings from October 1999

Re: Performance improvements for HTML::TokeParser

Thread Previous
From:
Gisle Aas
Date:
October 29, 1999 08:46
Subject:
Re: Performance improvements for HTML::TokeParser
Message ID:
m3d7tys6cv.fsf@eik.g.aas.no
"Jeff Sparkes" <jsparkes@internetivity.com> writes:

> I'm using TokeParser to parse large (100k+) HTML files, and it was very slow.
> The EFFICIENCY section of HTML::Parser suggests passing in text in small chunks,
> but TokeParser::new passed in the whole file at once.

Good point.

> I modified it to pass in a line at a time, and now TokeParser is
> about 5 times faster for my test cases.

I got similar numbers.

> It might be better to use substr to break it into chunks, but I couldn't get it to
> work as well.

I made it work with the enclosed patch.  It will break the string
into chunks as we go.  This avoids creating a huge @{$self->{tokens}}
array up front.  A user-visible change with my patch is that you are
not allowed to change the string until all tokens have been
extracted.  I hope to get away with that.

Regards,
Gisle



> My diff is attached.
> *** /usr/local/lib/site_perl/HTML/TokeParser.pm	Wed Jun  9 10:20:02 1999
> --- TP.pm	Fri Oct 22 15:34:59 1999
> ***************
> *** 1,4 ****
> ! package HTML::TokeParser;
>   
>   # $Id: TokeParser.pm,v 2.5 1999/06/09 10:20:02 gisle Exp $
>   
> --- 1,4 ----
> ! package TP;
>   
>   # $Id: TokeParser.pm,v 2.5 1999/06/09 10:20:02 gisle Exp $
>   

I did not apply this part :-)

> ***************
> *** 24,30 ****
>       $self->{tokens} = [];
>       $self->{textify} = {img => "alt", applet => "alt"};
>       if (ref($file) eq "SCALAR") {
> ! 	$self->parse($$file);
>   	$self->eof;
>       } else {
>   	$self->{file} = $file;
> --- 24,33 ----
>       $self->{tokens} = [];
>       $self->{textify} = {img => "alt", applet => "alt"};
>       if (ref($file) eq "SCALAR") {
> ! 	my $line;
> ! 	foreach $line (split(/\n/, $$file)) {
> ! 	    $self->parse($line);
> ! 	}
>   	$self->eof;
>       } else {
>   	$self->{file} = $file;

My patch:

Index: lib/HTML/TokeParser.pm
===================================================================
RCS file: /home/cvs/aas/perl/mods/html-parser/lib/HTML/TokeParser.pm,v
retrieving revision 2.5
retrieving revision 2.6
diff -u -p -r2.5 -r2.6
--- lib/HTML/TokeParser.pm	1999/06/09 10:20:02	2.5
+++ lib/HTML/TokeParser.pm	1999/10/29 12:00:41	2.6
@@ -1,10 +1,10 @@
 package HTML::TokeParser;
 
-# $Id: TokeParser.pm,v 2.5 1999/06/09 10:20:02 gisle Exp $
+# $Id: TokeParser.pm,v 2.6 1999/10/29 12:00:41 gisle Exp $
 
 require HTML::Parser;
 @ISA=qw(HTML::Parser);
-$VERSION = sprintf("%d.%02d", q$Revision: 2.5 $ =~ /(\d+)\.(\d+)/);
+$VERSION = sprintf("%d.%02d", q$Revision: 2.6 $ =~ /(\d+)\.(\d+)/);
 
 use strict;
 use Carp qw(croak);
@@ -24,10 +24,10 @@ sub new
     $self->{tokens} = [];
     $self->{textify} = {img => "alt", applet => "alt"};
     if (ref($file) eq "SCALAR") {
-	$self->parse($$file);
-	$self->eof;
+	$self->{toke_scalar} = $file;
+	$self->{toke_scalarpos}  = 0;
     } else {
-	$self->{file} = $file;
+	$self->{toke_file} = $file;
     }
     $self;
 }
@@ -43,14 +43,36 @@ for (qw(declaration start end text comme
 sub get_token
 {
     my $self = shift;
-    while (!@{$self->{tokens}} && $self->{file}) {
-	# must try to parse more of the file
-	my $buf;
-	if (read($self->{file}, $buf, 512)) {
-	    $self->parse($buf);
-	} else {
-	    $self->eof;
-	    delete $self->{file};
+    while (!@{$self->{tokens}} && !$self->{toke_eof}) {
+	if (my $f = $self->{toke_file}) {
+	    # must try to parse more from the file
+	    my $buf;
+	    if (read($f, $buf, 512)) {
+		$self->parse($buf);
+	    } else {
+		$self->eof;
+		$self->{toke_eof}++;
+		delete $self->{toke_file};
+	    }
+	}
+	elsif (my $sref = $self->{toke_scalar}) {
+	    # must try to parse more from the scalar
+	    my $pos = $self->{toke_scalarpos};
+	    my $chunk = substr($$sref, $pos, 512);
+	    $self->parse($chunk);
+	    $pos += length($chunk);
+	    if ($pos < length($$sref)) {
+		$self->{toke_scalarpos} = $pos;
+	    }
+	    else {
+		$self->eof;
+		$self->{toke_eof}++;
+		delete $self->{toke_scalar};
+		delete $self->{toke_scalarpos};
+	    }
+	}
+	else {
+	    die;
 	}
     }
     shift @{$self->{tokens}};

Thread Previous


nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About