#!/usr/bin/perl ########################################################################## # # xlint.pl # # This program acts as an error-tolerant XML parser. It reports all the # detected errors as it parses through the XML document and never # terminates before it reaches the end of the document. # # Author : Yuhui Jin # Juan Fernando Arguello # Stanford University # Last Modified : December 24, 2003 # # Acknowledgement: This work is supported by US Air Force and the DARPA # DAML project "OntoAgents" (01IN901C0). # # ########################################################################## ############## Main program ############## # Define global variables ## Error types to label each error, used in error reporting @errType = ("Syntax error for the xml declaration", #0 "Expect the attribute for version info", #1 "Invalid version number assignment", #2 "Invalid encoding name assignment", #3 "Invalid assignment for standalone attribute", #4 "Attribute not expected at this location", #5 "Syntax error for the tag", #6 "Duplicate DocType declaration", #7 "Missing the start tag", #8 "Missing the end tag", #9 "Syntax error for the comment", #10 "Syntax error for the processing instruction", #11 "Expect white space", #12 "Syntax error for attribute assignment" #13 ); ## Line number of the current line being parsed $linenum = 0; ## Store the line being parsed $line = ""; ## The position in the line being parsed $pos = 0; ## The flag for verbose mode of error reporting. ## If it is set, error context consisting of N characters ## will be shown as well as the position of the error. ## The default is off. $verbose = 0; ## The length of the error context $context = 30; ## The counter for total number of errors $errorCount = 0; ## Cache the line being procesed for reporting error in context $cachedCurLine = ""; # Check correct command line usage of the parser if ($#ARGV < 0) { print "USAGE: "; print "perl xlint.pl <file_name> [-v |-v <number_of_chars>]\n"; print "<file_name>: the absolute or relative file name of the XML document to be parsed.\n"; print "-v: the verbose mode where a context of N characters around the error ". "position is displayed. The default value for N is 30.\n"; print "<number_of_chars>: the value for N, the length of error context.\n"; exit(1); } ## get input file name $infile = $ARGV[0]; ## open source file open(INPUT, "$infile") || die "Can't open file \"$infile\".\n"; ## check whether we have verbose mode set. if (($#ARGV >= 1) && ($ARGV[1] eq "-v")) { $verbose = 1; ## check whether we have the context length set if ($#ARGV >= 2) { $context = int($ARGV[2]); } } ## start parsing the document &parseDoc(); ## report error count &printErrorSummary(); exit(0); ############## Subroutines ############## # Root of the parser sub parseDoc() { &parseProlog(); &parseBody(); close(INPUT); &cleanupStack(); } # Parse the prolog part of the document # # prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? # XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' # Misc ::= Comment | PI | S # Comment ::= '<!--' ((char - '-') | ('-' (char - '-')))* '-->' # PI ::= '<?' PITarget (char* - '?>')? '?>' # S ::= (#x20 | #x9 | #xD | #xA)+ (i.e., white space characters) # # More in the XML specification 1.0 at www.w3.org sub parseProlog() { my ($err, $doctype, $depth, $stop, $more); $line = &nonBlankLine(); $pos = 0; if (defined($line)) { ## parse the xmlDecl part if ($line =~ /(.*)<\?xml(.*)/) { $pos = length($1) + 5; $more = $2; if ($1 =~ /[\s]*(.*)[\s]*/) { if (length($1) != 0) { ## wrong syntax &addError($errType[0], $linenum, 0, $cachedCurLine); } } $line = &parseXMLDecl($more, $pos); } } if ($line =~ /[\s]*(.*)[\s]*/) { $line = $1; } if (length($line) == 0) { $line = &nonBlankLine(); $pos = 0; } $stop = 0; $doctype = 0; ## parse Misc, which could be either comment, PI or doctypedecl; ## and doctypedecl can exist at most once. while ((defined($line)) && (!$stop)) { ## parse tags if ($line =~ /([^<>]*)(<.*)/) { $line = $2; ## Parse doctypedecl start tags <!DOCTYPE...> if ($line =~ /(<!DOCTYPE[^<>]*>)(.*)/) { $line = $2; $pos = $pos + length($1); if ($doctype == 0) { $doctype = 1; ## Todo: handle nested DTD in multiple lines } else { ## duplicate doctype &addError($errType[7], $linenum, $pos, $cachedCurLine); } } ## Parse comment <!-- ... --> elsif (($line =~ /^<!--/) && ($line =~ /(<!--.*-->)(.*)/)) { $line = $2; $pos = $pos + length($1); } ## catch incomplete comment starting with <! elsif (($line =~ /^<!/) && ($line =~ /(<!.*>)(.*)/)) { $line = $2; $pos = $pos + length($1); ## incomplete comment &addError($errType[10], $linenum, $pos-1, $cachedCurLine); } ## catch incomplete comment end with -> elsif ($line =~ /(<.*->)(.*)/) { $line = $2; $pos = $pos + length($1); ## incomplete comment &addError($errType[10], $linenum, $pos-1, $cachedCurLine); } ## parse PI <? ... ?> elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*\?>)(.*)/)) { $line = $2; $pos = $pos + length($1); } ## catch incomplete PI starting with <? elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*>)(.*)/)) { $line = $2; $pos = $pos + length($1); ## incomplete PI &addError($errType[11], $linenum, $pos-1, $cachedCurLine); } ## catch incomplete PI end with ?> elsif ($line =~ /(<.*\?>)(.*)/) { $line = $2; $pos = $pos + length($1); ## incomplete PI &addError($errType[11], $linenum, $pos-1, $cachedCurLine); } ## assume the beginning of element body, need to exit the subroutine else { $stop = 1; } } # missing <, skip to the next < elsif ($line =~ /([^<>]*)>(.*)/) { $line = $2; $pos = $pos + length($1) + 1; ## wrong syntax for a tag &addError($errType[6]." for \"".$1."\"", $linenum, $pos-1, $cachedCurLine); } ## skip to new line since no < or > (encountering test data) else { $line = &nonBlankLine(); $pos = 0; } } } # Parse the XMLDecl part sub parseXMLDecl() { my ($decl, $pos) = @_; my ($err, $more); $err = 0; ## parse the versionInfo if ($decl =~ /(([\s]*)version[\s]*=[\s]*)(.*)/) { if (length($2) == 0) { ## need whitespace &addError($errType[12], $linenum, $pos, $cachedCurLine); } ## parse the version number $pos = $pos + length($1); $more = $3; if ($more =~ /(("|')?([a-zA-Z0-9_\.:]|-)+("|')?)(.*)/) { $pos = $pos + length($1); $more = $5; } else { ## wrong version number format &addError($errType[2], $linenum, $pos, $cachedCurLine); $err = 1; } ## parse the optional EncodingDecl if (!$err) { if ($more =~ /(([\s]*)encoding[\s]*=[\s]*)(.*)/) { if (length($2) == 0) { ## need whitespace &addError($errType[12], $linenum, $pos, $cachedCurLine); } $pos = $pos + length($1); $more = $3; if ($more =~ /(("|')([A-Za-z]([A-Za-z0-9\._]|-)*)("|'))(.*)/) { $pos = $pos + length($1); $more = $6; } else { ## wrong encoding name &addError($errType[3], $linenum, $pos, $cachedCurLine); $err = 1; } } } ## parse the optional SDDecl if (!$err) { if ($more =~ /(([\s]*)standalone[\s]*=[\s]*)(.*)/) { if (length($2) == 0) { ## need whitespace &addError($errType[12], $linenum, $pos, $cachedCurLine); } $pos = $pos + length($1); $more = $3; if ($more =~ /(("|')(yes|no)("|'))(.*)/) { $pos = $pos + $1; $more = $5; } else { ## wrong Standalone Document Declaration &addError($errType[4], $linenum, $pos, $cachedCurLine); $err = 1; } } } if (!$err) { if ($more =~ /(.*)\?>(.*)/) { $tmp = $pos; $pos = $pos + length($1) + 2; $more = $2; if ($1 =~ /[\s]*(.*)[\s]*/) { if (length($1) != 0) { ## wrong attributes &addError($errType[5], $linenum, $tmp, $cachedCurLine); $err = 1; } } } else { ## missing end tag &addError($errType[6], $linenum, $pos, $cachedCurLine); $err = 1; } } } else { ## wrong version attribute &addError($errType[1], $linenum, $pos, $cachedCurLine); $err = 1; } return $more; } # Parse the document body containing nested tagged content, # comments and processing instructions. sub parseBody() { while (defined($line)) { ## parse tag starting with <. if ($line =~ /([^<>]*)(<.*)/) { $line = $2; $pos = $pos + length($1); ## "<!-- ... -->": parse the comment if (($line =~ /^<!--/) && ($line =~ /(<!--.*-->)(.*)/)) { $line = $2; $pos = $pos + length($1); } ## catch incomplete comment starting with <! elsif (($line =~ /^<!/) && ($line =~ /(<!.*>)(.*)/)) { $line = $2; $pos = $pos + length($1); ## incomplete comment &addError($errType[10], $linenum, $pos-1, $cachedCurLine); } ## catch incomplete comment end with -> elsif ($line =~ /(<.*->)(.*)/) { $line = $2; $pos = $pos + length($1); ## incomplete comment &addError($errType[10], $linenum, $pos-1, $cachedCurLine); } ## parse PI <? ... ?> elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*\?>)(.*)/)) { $line = $2; $pos = $pos + length($1); } ## catch incomplete PI starting with <? elsif (($line =~ /^<\?/) && ($line =~ /(<\?.*>)(.*)/)) { $line = $2; $pos = $pos + length($1); ## incomplete PI &addError($errType[11], $linenum, $pos-1, $cachedCurLine); } ## catch incomplete PI end with ?> elsif ($line =~ /(<.*\?>)(.*)/) { $line = $2; $pos = $pos + length($1); ## incomplete PI &addError($errType[11], $linenum, $pos-1, $cachedCurLine); } ## "< ... >": parse the tag content elsif ($line =~ /<([^<>]*)>(.*)/) { $content = $1; $line = $2; $prevpos = $pos; # cache the pos for error reporting on attribute-value pairs $pos = $pos + length($1) + 2; ## "< ... />": parse an empty tag (with no matching tags) if ($content =~ /\/$/) { } ## "</ ... >": parse and match the tag with existing ones. elsif ($content =~ /^\//) { if ($content =~ /\/(.*)/) { ## check whether there is a match with a start tag in the stack $depth = &locateTagname($1); if ($depth >= 0) { &popFrom($depth+1); &pop(); } else { ## miss the start tag &addError($errType[8]." for \"".$content."\"", $linenum, $pos-1, $cachedCurLine); } } } ## "< ... >": parse the start tag else { ## check whether we have attribute-value pairs if ($content =~ /([^\s]+)(\s+)(.*)/) { ## set the tag name $content = $1; ## parse the attribute-value pair list &parseAttributes($3, $prevpos + length($1) + length($2) + 1); } &push($content, $linenum, $pos); } } ## incomplete tag else { ## wrong syntax for a tag &addError($errType[6], $linenum, $pos, $cachedCurLine); if ($line =~ /(<[^<>]*)(<.*)/) { $line = $2; $pos = $pos + length($1); } ## skip to new line if no < detected (encountering test data) else { $line = &nonBlankLine(); $pos = 0; } } } ## parse tag starting with > (missing <) ## We assume a tag will not be seperated on different lines. elsif ($line =~ /([^<>]*)>(.*)/) { $line = $2; $pos = $pos + length($1) + 1; ## wrong syntax for a tag &addError($errType[6], $linenum, $pos-1, $cachedCurLine); } ## skip to new line if no < or > is detected (encountering test data) else { $line = &nonBlankLine(); $pos = 0; } } # while loop } # Parse the attribute-value pairs sub parseAttributes() { my ($list, $p) = @_; ## we scan the list backwards from the end to the front, ## verifying the format of each attribute-value pair. This is ## because the pattern matching is greedy. ## set position to the end of the list as we start backwards $p = $p + length($list); while (length($list) != 0) { ## extract the attribute-value pair at the end of the list if ($list =~ /(.*)([^\s\"=]+\s*=\s*\"[^\"]*\")(.*)/) { $list = $1; $tmp = $p - length($3); $p = $p - length($2) - length($3); ## check whether there is content after the matched pair if ($3 =~ /[\s]*(.*)/) { if (length($1) != 0) { ## report error on attribute assignment syntax &addError($errType[13], $linenum, $tmp, $cachedCurLine); } } ## remove any white space before the list ## making the list empty if it only consists of white spaces if ($list =~ /[\s]*(.*)/) { $list = $1; } } else { ## report error if the list is not empty but does not contain ## any attribute-value pair $p = $p - length($list); &addError($errType[13], $linenum, $p, $cachedCurLine); $list = ""; ## to exit the loop } } # while loop } # Return the next non-blank line in the document sub nonBlankLine() { my $line = <INPUT>; my $isBlank = 0; ## while loop for getting a non-blank line while (defined($line) && ($isBlank == 0)) { $linenum++; if ($line =~ /\n$/) { chop($line); } ## remove any while space characters (space, tab, newline, etc.) ## before the string if ($line =~ /[\s]*(.*)[\s]*/) { $line = $1; } if (length($line) != 0) { $isBlank = 1; } else { $line = <INPUT>; } } ## cache the line to be processed for reporting error context. $cachedCurLine = $line; return $line; } # Add a new error to the error log sub addError() { my ($et, $en, $p, $cl) = @_; my ($offset, $len, $substr); $errorCount++; print "Line $en, Col $p: \t$et."; ## check whether we need to report in verbose mode if ($verbose) { $offset = $p - $context / 2; if ($offset < 0) { $offset = 0; } $len = $context; if ($offset + $len > length($cl)) { $len = length($cl) - $offset - 1; } $substr = substr($cl, $offset, $len); print "\t(...$substr...)"; } print "\n"; } ## Print a summary of errors sub printErrorSummary() { if ($errorCount > 1) { print "$errorCount errors found.\n"; } else { print "$errorCount error found.\n"; } } ##### Stack Implementation ###### # Push the record into the Stack sub push() { my ($tn, $ln, $p) = @_; ## chop off spaces around tag name if ($tn =~ /[\s]*(.*)[\s]*/) { $tn = $1; } @tagname[$top] = $tn; @linenum[$top] = $ln; @pos[$top] = $p; ## save the current line implicitly for reporting error context. @cachedLine[$top] = $cachedCurLine; $top++; } # Pop the top record from the Stack sub pop() { $top--; } # Return 1 if Stack is empty, 0 otherwise sub empty() { if (top == 0) { return 1; } else { return 0; } } # Return the Stack depth of the given tagname sub locateTagname() { my($tn) = @_; my($d); $d = $top-1; $found = 0; ## chop off spaces around tag name if ($tn =~ /[\s]*(.*)[\s]/) { $tn = $1; } while (($d >= 0) && (!$found)) { if ($tagname[$d] eq $tn) { $found = 1; } else { $d--; } } return $d; } # Pop the records from the given depth to the top of the Stack sub popFrom() { my($d) = @_; my($i); if (($d >= 0) && ($d < $top)) { ## pop each of the records from given depth up to the top ## for each records, report an error of missing end tag for ($i=$d; $i<$top; $i++) { ## missing end tag &addError($errType[9]." for \"".$tagname[$i]."\"", $linenum[$i], $pos[$i], $cachedLine[$i]); } $top = $d; } } # Clean up the Stack before we exit the program - check whether we have # any unmatched records in the stack. Report error for each of them. sub cleanupStack() { for ($i=0; $i<$top; $i++) { ## missing end tag &addError($errType[9]." for \"".$tagname[$i]."\"", $linenum[$i], $pos[$i], $cachedLine[$i]); } $top = 0; } # Print the stack content for debugging sub printStack() { print "Stack: "; for ($i=0; $i<$top; $i++) { print "($tagname[$i], $linenum[$i], $pos[$i])\t"; } print "\n"; }