develooper Front page | perl.recdescent | Postings from October 2009

C Code Parser Using Recursive Descent

From:
Rahul Jain
Date:
October 9, 2009 05:12
Subject:
C Code Parser Using Recursive Descent
Message ID:
754544.10862.qm@web50211.mail.re2.yahoo.com
#! /usr/bin/perl -w
# stat-comments.pl by Teodor Zlatanov, tzz@iglou.com
# March 26, 2000

# A script to evaluate the readability of comments
# embedded in C++.  Utilizes code from demo-decomment.pl,
# which is included with the Parse::RecDescent module.
# Uses the Lingua::EN::Fathom module to evaluate text
# readability.

# ORIGINAL BY Helmut Jarausch 
# EXTENDED BY Damian Conway AND Helmut Jarausch
# POLISHED BY Teodor Zlatanov


use strict;
use Parse::RecDescent;
use Lingua::EN::Fathom;

use vars qw/ $Grammar /;

# Build the parser from the grammar text that the BEGIN block further down
# stores in $Grammar.  Direct method-call syntax is used instead of the
# indirect "new Class ARGS" form, which perl can mis-parse.
my $parser = Parse::RecDescent->new($Grammar) or die "invalid grammar";

# Slurp the input in a single read: from the command-line file(s) if any
# were given, otherwise from the sample program after __DATA__.  $/ is
# localised inside the do-block so the record separator is restored after
# the read rather than left undefined for the rest of the program.
# NOTE(review): the read happens in scalar context, so with several files on
# the command line only the first one is returned -- confirm that is intended.
my $text = do {
    local $/;
    @ARGV ? <> : <DATA>;
};

# The "program" start rule returns a hash reference with the keys code,
# comments and strings; a false result means the parse failed.
my $parts = $parser->program($text) or die "malformed C program";

# Only work with comment text of length > 0.
die "No comments found in input" unless length $parts->{comments};

# Convert every comment mark to a period, so separate comments are
# separate sentences, if well-formed.  Lingua::EN::Fathom is quite
# good at figuring out what sentences are valid, so an extra period
# in the text won't affect the overall counts.
# The three substitutions are deliberately applied one after another; a
# single combined alternation could pick different delimiters when the
# marks overlap (for example "*//").
$parts->{comments} =~ s#//#. #g;
$parts->{comments} =~ s#/\*#. #g;
$parts->{comments} =~ s#\*/#. #g;

# We can now evaluate the comments (stored in $parts->{comments}).
my $fathom = Lingua::EN::Fathom->new;
$fathom->analyse_block($parts->{comments});

# Voila, the readability report!
print $fathom->report;
  
# Parse::RecDescent grammar for splitting C/C++ source into code, comments
# and string literals.  It is assigned inside a BEGIN block so that $Grammar
# is already populated when the top-level statements above (which build the
# parser from it) run, even though this block appears later in the file.
# The heredoc is single-quoted, so nothing in it is interpolated by perl;
# the text is handed to Parse::RecDescent verbatim.
#
# Overview of the rules:
#   program - the start rule: with whitespace skipping switched off
#             (<skip:''>) it matches one or more "part"s and returns a hash
#             reference { code, comments, strings }.
#   part    - one of: comment, C_code, string (tried in that order).
#   C_code  - a run of text containing no '"' and no comment opener; the
#             matched text is appended to $Code.
#   string  - a double-quoted literal; the whole literal is appended to
#             $Code and its contents (without the quotes) are pushed onto
#             @Strings.
#   comment - either a "//" comment up to and including its newline (a
#             newline is appended to $Code), or a "/* ... */" comment plus
#             trailing blanks/tabs (a single space is appended to $Code).
#             In both cases the matched text, including any leading
#             whitespace and the delimiters, is appended to $Comments.
#
# NOTE(review): the extra "program" productions that carry <rulevar: ...>
# or <reject> presumably exist only to declare the rule-local variables and
# never match themselves; the regex text after some of them looks like
# deliberate filler -- confirm against the Parse::RecDescent documentation.
# NOTE(review): $WithinComment is declared but not referenced by any rule
# visible here.
# NOTE(review): the "//" alternative requires a terminating newline, so a
# "//" comment on a final line with no newline is not matched by it.
BEGIN
{ $Grammar=<<'EOF';

program : <rulevar: local $WithinComment=0>
program : <rulevar: local $Comments = ""> /this shouldn't be here :-/
program : <reject>
program : <reject> /with prejudice/
program : <rulevar: local $Code = "">
program : <rulevar: local @Strings>

program	: <skip:''> part(s)
		{ { code=>$Code, comments=>$Comments, strings=>[@Strings]} }

part	: comment
        | C_code
        | string

C_code  : m{(			
	      [^"/]+		# one or more non-delimiters
	      (			# then (optionally)...
	       /		# a potential comment delimiter
	       [^*/]		# which is not an actual delimiter
	      )?		# 
	    )+			# all repeated once or more
	   }x
		{ $Code .= $item[1] }

string	: m{"			# a leading delimiter
	    ((			# zero or more...
	      \\.		# escaped anything
	      |			# or
	      [^"]		# anything but a delimiter
	     )*
	    )
	    "}x
		{ $Code .= $item[1]; push @Strings, $1 }


comment	: m{\s*			# optional whitespace
	    //			# comment delimiter
	    [^\n]*		# anything except a newline
	    \n			# then a newline
	   }x
		{ $Code .= "\n"; $Comments .= $item[1] }

	| m{\s*			# optional whitespace
	    /\*			# comment opener
	    (?:[^*]+|\*(?!/))*	# anything except */
	    \*/		        # comment closer
            ([ \t]*)?           # trailing blanks or tabs
	   }x	
		{ $Code .= " "; $Comments .= $item[1] }

EOF
}
__DATA__
program test; // for decomment

// using Parse::RecDescent

/*
 We should raise the indices quite a bit with this text section,
 because it will actually include sentences and structure.  See,
 the problem with most C/C++ programs is that they use comments
 that are very short and convey little information.
*/
 
int main()
{
/* this should
   be removed
*/
  char *cp1 = "";
  char *cp2 = "cp2";
  int i;  // a counter
          // remove this line altogehter
  int k;  
      int more_indented;  // keep indentation
      int l;  /* a loop
             variable */
      // should be completely removed

  char *str = "/* ceci n'est pas un commentaire */";
  return 0;
}



nntp.perl.org: Perl Programming lists via nntp and http.
Comments to Ask Bjørn Hansen at ask@perl.org | Group listing | About