#!/usr/bin/perl

##########################################################################
#  
#  xlint.pl
#
#  This program acts as an error-tolerant XML parser. It reports all the
#  detected errors as it parses through the XML document and never
#  terminates before it reaches the end of the document.
#
#  Author : Yuhui Jin
#           Juan Fernando Arguello
#           Stanford University
#  Last Modified : December 24, 2003
#
#  Acknowledgement: This work is supported by US Air Force and the DARPA 
#  DAML project "OntoAgents" (01IN901C0).
#
#
##########################################################################


############## Main program ##############

# Define global variables

## Error types to label each error, used in error reporting
@errType = ("Syntax error for the xml declaration", #0
	    "Expect the attribute for version info", #1
	    "Invalid version number assignment",     #2	
	    "Invalid encoding name assignment", #3
	    "Invalid assignment for standalone attribute", #4
	    "Attribute not expected at this location", #5
	    "Syntax error for the tag", #6
	    "Duplicate DocType declaration", #7
	    "Missing the start tag", #8
	    "Missing the end tag", #9
	    "Syntax error for the comment", #10
	    "Syntax error for the processing instruction", #11
	    "Expect white space", #12
	    "Syntax error for attribute assignment" #13
	    );

## Line number of the current line being parsed	
$linenum = 0;		

## Store the line being parsed
$line = "";

## The position in the line being parsed
$pos = 0;

## The flag for verbose mode of error reporting.
## If it is set, error context consisting of N characters 
## will be shown as well as the position of the error.
## The default is off.
$verbose = 0;

## The length of the error context
$context = 30;

## The counter for total number of errors
$errorCount = 0;

## Cache the line being procesed for reporting error in context
$cachedCurLine = "";

# Check correct command line usage of the parser
if ($#ARGV < 0) {
   print "USAGE: ";
   print "perl xlint.pl <file_name> [-v |-v <number_of_chars>]\n";
   print "<file_name>: the absolute or relative file name of the XML document to be parsed.\n";
   print "-v: the verbose mode where a context of N characters around the error ".
       "position is displayed. The default value for N is 30.\n";
   print "<number_of_chars>: the value for N, the length of error context.\n";
   exit(1);
} 

## get input file name 
$infile = $ARGV[0];

## open source file
open(INPUT, "$infile") || die "Can't open file \"$infile\".\n";

## check whether we have verbose mode set.
if (($#ARGV >= 1) && ($ARGV[1] eq "-v")) {
    $verbose = 1;

    ## check whether we have the context length set
    if ($#ARGV >= 2) {
	$context = int($ARGV[2]); 
    }

}

## start parsing the document
&parseDoc();

## report error count
&printErrorSummary();

exit(0);


############## Subroutines ##############

# Root of the parser

sub parseDoc() {
  
  &parseProlog();

  &parseBody();
  
  close(INPUT); 

  &cleanupStack();
}


# Parse the prolog part of the document
#
# prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
# XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
# Misc ::= Comment | PI | S
# Comment ::= '<!--' ((char - '-') | ('-' (char - '-')))* '-->'
# PI ::= '<?' PITarget (char* - '?>')? '?>'
# S ::= (#x20 | #x9 | #xD | #xA)+     (i.e., white space characters)
#
# More in the XML specification 1.0 at www.w3.org
sub parseProlog() {
  my ($err, $doctype, $depth, $stop, $more);
  
  $line = &nonBlankLine();
  $pos = 0;
  
  if (defined($line)) {

	## parse the xmlDecl part	  
	if ($line =~ /(.*)<\?xml(.*)/) {
	    $pos = length($1) + 5;
	    $more = $2;

	    if ($1 =~ /[\s]*(.*)[\s]*/) {		  	  
		if (length($1) != 0) {
		    ## wrong syntax
		    &addError($errType[0], $linenum, 0, $cachedCurLine);
		}
	    }

	    $line = &parseXMLDecl($more, $pos);
	}
  }
    
  if ($line =~ /[\s]*(.*)[\s]*/) {
	$line = $1;
  }
  
  if (length($line) == 0) {
  	$line = &nonBlankLine();
  	$pos = 0;			
  }
	
  $stop = 0;
  $doctype = 0;
  ## parse Misc, which could be either comment, PI or doctypedecl;
  ## and doctypedecl can exist at most once.
  while ((defined($line)) && (!$stop)) {
	
	## parse tags  
	if ($line =~ /([^<>]*)(<.*)/) {
	    $line = $2;
	    
	    ## Parse doctypedecl start tags <!DOCTYPE...>
	    if ($line =~ /(<!DOCTYPE[^<>]*>)(.*)/) {
		$line = $2;
		$pos = $pos + length($1);
		
		if ($doctype == 0) {
		    $doctype = 1;
		    ## Todo: handle nested DTD in multiple lines
		} else {
		    ##  duplicate doctype 
		    &addError($errType[7], $linenum, $pos, $cachedCurLine);  
		}		
	    } 
	    
	    ## Parse comment <!-- ... -->
	    elsif (($line =~ /^<!--/) && ($line =~ /(<!--.*-->)(.*)/)) {
		$line = $2;
		$pos = $pos + length($1);
	    } 

	    ## catch incomplete comment starting with <!
	    elsif (($line =~ /^<!/) && ($line =~ /(<!.*>)(.*)/)) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete comment
		&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## catch incomplete comment end with ->
	    elsif ($line =~ /(<.*->)(.*)/) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete comment
		&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## parse PI <? ... ?>
	    elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*\?>)(.*)/)) {
		$line = $2;		
		$pos = $pos + length($1);		
	    } 

	    ## catch incomplete PI starting with <?
	    elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*>)(.*)/)) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete PI
		&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## catch incomplete PI end with ?>
	    elsif ($line =~ /(<.*\?>)(.*)/) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete PI
		&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## assume the beginning of element body, need to exit the subroutine   
	    else {
		$stop = 1;
	    }  
	}
	
	# missing <, skip to the next <
	elsif ($line =~ /([^<>]*)>(.*)/) {
	    $line = $2;
	    $pos = $pos + length($1) + 1;

	    ## wrong syntax for a tag
	    &addError($errType[6]." for \"".$1."\"", $linenum, $pos-1, $cachedCurLine);	    	   
	}

	## skip to new line since no < or > (encountering test data)	
	else {
  	  $line = &nonBlankLine();
  	  $pos = 0;		
	} 	
	  	
  }
  
}

# Parse the XMLDecl part 
sub parseXMLDecl() {
  my ($decl, $pos) = @_;
  my ($err, $more);
  
  $err = 0;
  
  ## parse the versionInfo
  if ($decl =~ /(([\s]*)version[\s]*=[\s]*)(.*)/) {

	  if (length($2) == 0) {
	      ## need whitespace
	      &addError($errType[12], $linenum, $pos, $cachedCurLine); 
	  }

	  ## parse the version number
	  $pos = $pos + length($1);
	  $more = $3;
	  
	  if ($more =~ /(("|')?([a-zA-Z0-9_\.:]|-)+("|')?)(.*)/) {
		$pos = $pos + length($1);
		$more = $5; 		  
	  } else {
		  ## wrong version number format
		  &addError($errType[2], $linenum, $pos, $cachedCurLine);
		  $err = 1;
	  }
	    	
	  ## parse the optional EncodingDecl	  
	  if (!$err) { 
	  	if ($more =~ /(([\s]*)encoding[\s]*=[\s]*)(.*)/) {
		  	
	          if (length($2) == 0) {
	              ## need whitespace
	              &addError($errType[12], $linenum, $pos, $cachedCurLine);
	          }

		  $pos = $pos + length($1);
		  $more = $3;
		  if ($more =~ /(("|')([A-Za-z]([A-Za-z0-9\._]|-)*)("|'))(.*)/) {			  
			$pos = $pos + length($1);			
			$more = $6;
		  } else {
		  	## wrong encoding name
		  	&addError($errType[3], $linenum, $pos, $cachedCurLine); 
		  	$err = 1;
		  }
	    }
      }
	
	  ## parse the optional SDDecl
	  if (!$err) {
		if ($more =~ /(([\s]*)standalone[\s]*=[\s]*)(.*)/) {
	  	
	          if (length($2) == 0) {
	              ## need whitespace

	              &addError($errType[12], $linenum, $pos, $cachedCurLine);
	          }

		  $pos = $pos + length($1);
		  $more = $3;
		  if ($more =~ /(("|')(yes|no)("|'))(.*)/) {
			$pos = $pos + $1;
			$more = $5;
		  } else {
		  	## wrong Standalone Document Declaration
		  	&addError($errType[4], $linenum, $pos, $cachedCurLine); 
		  	$err = 1; 			  
		  }		    
  	    }
	  }

  	  if (!$err) {
  	  	if ($more =~ /(.*)\?>(.*)/) {
	  	  
	  	  $tmp = $pos;
	  	  $pos = $pos + length($1) + 2;	 
	  	  $more = $2; 	  
	  	  	  	  
	  	  if ($1 =~ /[\s]*(.*)[\s]*/) {		  	  
		  	if (length($1) != 0) {
			  	## wrong attributes
		  		&addError($errType[5], $linenum, $tmp, $cachedCurLine);
		  		$err = 1;
	  		}
	  	  }
  	  	} else {
	  		## missing end tag
	  		&addError($errType[6], $linenum, $pos, $cachedCurLine);
	  		$err = 1;
	  	}	
  	  }
  	  
  } else {
	## wrong version attribute
	&addError($errType[1], $linenum, $pos, $cachedCurLine);
	$err = 1;
  }
  
  return $more;
}


# Parse the document body containing nested tagged content,
# comments and processing instructions.
sub parseBody() {

    while (defined($line)) {

	## parse tag starting with <.
	if ($line =~ /([^<>]*)(<.*)/) {

	    $line = $2;
	    $pos = $pos + length($1);
	    
	    ## "<!-- ... -->": parse the comment
	    if (($line =~ /^<!--/) && ($line =~ /(<!--.*-->)(.*)/)) {
		$line = $2;
		$pos = $pos + length($1);		
	    }

	    ## catch incomplete comment starting with <!
	    elsif (($line =~ /^<!/) && ($line =~ /(<!.*>)(.*)/)) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete comment
		&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## catch incomplete comment end with ->
	    elsif ($line =~ /(<.*->)(.*)/) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete comment
		&addError($errType[10], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## parse PI <? ... ?>
	    elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*\?>)(.*)/)) {
		$line = $2;		
		$pos = $pos + length($1);		
	    } 

	    ## catch incomplete PI starting with <?
	    elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*>)(.*)/)) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete PI
		&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## catch incomplete PI end with ?>
	    elsif ($line =~ /(<.*\?>)(.*)/) {
		$line = $2;
		$pos = $pos + length($1);
		## incomplete PI
		&addError($errType[11], $linenum, $pos-1, $cachedCurLine);
	    }	    

	    ## "< ... >": parse the tag content
	    elsif ($line =~ /<([^<>]*)>(.*)/) { 	    
		$content = $1;
		$line = $2;
		$prevpos = $pos; # cache the pos for error reporting on attribute-value pairs
		$pos = $pos + length($1) + 2;

		## "< ... />": parse an empty tag (with no matching tags)
		if ($content =~ /\/$/) {	    
		    
		}

		## "</ ... >": parse and match the tag with existing ones.
		elsif ($content =~ /^\//) {

		    if ($content =~ /\/(.*)/) {
 
			## check whether there is a match with a start tag in the stack
			$depth = &locateTagname($1);
			if ($depth >= 0) {
			    &popFrom($depth+1);
			    &pop();
			} else {
			    ## miss the start tag
			    &addError($errType[8]." for \"".$content."\"", $linenum, $pos-1, $cachedCurLine);
			}
		
		    }     
		}

		## "< ... >": parse the start tag
		else { 
		
		    ## check whether we have attribute-value pairs
		    if ($content =~ /([^\s]+)(\s+)(.*)/) {
			## set the tag name 
			$content = $1;

			## parse the attribute-value pair list
			&parseAttributes($3, $prevpos + length($1) + length($2) + 1);
		    }

		    &push($content, $linenum, $pos);
		
		}	    
	    }
	    
	    ## incomplete tag
	    else {
		## wrong syntax for a tag
		&addError($errType[6], $linenum, $pos, $cachedCurLine);

		if ($line =~ /(<[^<>]*)(<.*)/) { 
		    $line = $2;
		    $pos = $pos + length($1);
		} 

		## skip to new line if no < detected (encountering test data)  
		else {
		    $line = &nonBlankLine();
		    $pos = 0;				    
		} 			
	    }
	}

	## parse tag starting with > (missing <)
	## We assume a tag will not be seperated on different lines.
	elsif ($line =~ /([^<>]*)>(.*)/) {
	    $line = $2;
	    $pos = $pos + length($1) + 1;

	    ## wrong syntax for a tag
	    &addError($errType[6], $linenum, $pos-1, $cachedCurLine);
	}

	## skip to new line if no < or > is detected (encountering test data)			
	else {
  	  $line = &nonBlankLine();
  	  $pos = 0;		

	} 	

    } # while loop

}


# Parse the attribute-value pairs

sub parseAttributes() {
    my ($list, $p) = @_;

    ## we scan the list backwards from the end to the front, 
    ## verifying the format of each attribute-value pair. This is 
    ## because the pattern matching is greedy.

    ## set position to the end of the list as we start backwards
    $p = $p + length($list);

    while (length($list) != 0) {
	
	## extract the attribute-value pair at the end of the list
	if ($list =~ /(.*)([^\s\"=]+\s*=\s*\"[^\"]*\")(.*)/) {

	    $list = $1;
	    $tmp = $p - length($3);
	    $p = $p - length($2) - length($3);

	    ## check whether there is content after the matched pair
	    if ($3 =~ /[\s]*(.*)/) {
		if (length($1) != 0) {
		    ## report error on attribute assignment syntax
		    &addError($errType[13], $linenum, $tmp, $cachedCurLine);
		}
	    }	    	    

	    ## remove any white space before the list
	    ## making the list empty if it only consists of white spaces
	    if ($list =~ /[\s]*(.*)/) {
		$list = $1;
	    }	    

	} else {
	    ## report error if the list is not empty but does not contain
	    ## any attribute-value pair
	    $p = $p - length($list);
	    &addError($errType[13], $linenum, $p, $cachedCurLine);
	    $list = ""; ## to exit the loop
	}

    } # while loop
    
}



# Return the next non-blank line in the document

sub nonBlankLine() {
  my $line = <INPUT>;
  my $isBlank = 0;
  
  ## while loop for getting a non-blank line
  while (defined($line) && ($isBlank == 0)) {
    $linenum++;    
    
    if ($line =~ /\n$/) {
	chop($line);
    }
    
    ## remove any while space characters (space, tab, newline, etc.) 
    ## before the string
    if ($line =~ /[\s]*(.*)[\s]*/) {
      $line = $1;
    }
    
    if (length($line) != 0) {
      $isBlank = 1;
    }
    else {
      $line = <INPUT>;      
    }
    
  }  
  
  ## cache the line to be processed for reporting error context.
  $cachedCurLine = $line;

  return $line;
}


# Add a new error to the error log
sub addError() {
  my ($et, $en, $p, $cl) = @_;	
  my ($offset, $len, $substr);

  $errorCount++;
  print "Line $en, Col $p:  \t$et.";

  ## check whether we need to report in verbose mode
  if ($verbose) {
      $offset = $p - $context / 2;
      if ($offset < 0) { $offset = 0; }

      $len = $context;
      if ($offset + $len > length($cl)) {
	  $len = length($cl) - $offset - 1;
      }

      $substr = substr($cl, $offset, $len);
      print "\t(...$substr...)";
  }
  
  print "\n";

}

## Print a summary of errors
sub printErrorSummary() {
    if ($errorCount > 1) {
	print "$errorCount errors found.\n";
    } else {
	print "$errorCount error found.\n";
    }
}


##### Stack Implementation ######

# Push the record into the Stack
sub push() {
  my ($tn, $ln, $p) = @_;

  ## chop off spaces around tag name  
  if ($tn =~ /[\s]*(.*)[\s]*/) {
      $tn = $1;
  }
  
  @tagname[$top] = $tn;
  @linenum[$top] = $ln;
  @pos[$top] = $p;

  ## save the current line implicitly for reporting error context.
  @cachedLine[$top] = $cachedCurLine;

  $top++;
}


# Pop the top record from the Stack
sub pop() {
  $top--;	
}


# Return 1 if Stack is empty, 0 otherwise
sub empty() {
  if (top == 0) {
      return 1;
  }	else {
      return 0;
  }
}


# Return the Stack depth of the given tagname
sub locateTagname() {
  my($tn) = @_;
  my($d);
  $d = $top-1;
  $found = 0;
  
  ## chop off spaces around tag name  
  if ($tn =~ /[\s]*(.*)[\s]/) {
      $tn = $1;
  }

  while (($d >= 0) && (!$found)) {	  
      if ($tagname[$d] eq $tn) {
	  $found = 1;
      } else {
	  $d--;
      }
  }
  
  return $d;	
}


# Pop the records from the given depth to the top of the Stack
sub popFrom() {
  my($d) = @_;
  my($i);
  
  if (($d >= 0) && ($d < $top)) {
	  
      ## pop each of the records from given depth up to the top
      ## for each records, report an error of missing end tag
      for ($i=$d; $i<$top; $i++) {
	  ## missing end tag
	  &addError($errType[9]." for \"".$tagname[$i]."\"",
		    $linenum[$i], $pos[$i], $cachedLine[$i]);
      }
      
      $top = $d;	  
  }		
}


# Clean up the Stack before we exit the program - check whether we have
# any unmatched records in the stack. Report error for each of them.
sub cleanupStack() {

    for ($i=0; $i<$top; $i++) {
	## missing end tag
	&addError($errType[9]." for \"".$tagname[$i]."\"", $linenum[$i],
		  $pos[$i], $cachedLine[$i]);
    }

    $top = 0;
}


# Print the stack content for debugging
sub printStack() {
    print "Stack: ";
    for ($i=0; $i<$top; $i++) {
	print "($tagname[$i], $linenum[$i], $pos[$i])\t";
    } 
    print "\n";
}