From 81a8ec22194b44ff37e066b04b22c0967d0bba06 Mon Sep 17 00:00:00 2001 From: mikesamuel Date: Wed, 23 May 2007 04:07:43 +0000 Subject: [PATCH] fixed issue 12: implemented lexing of regular expression literals using an approach based on JavaScript's lexical grammar to decide when a / begins a regexp literal. This is more conservative than JavaScript since I don't attempt to handle lexically valid but syntactically invalid JavaScript. There is one case where a regexp literal in a syntactically valid JavaScript program will not be recognized: for (var fieldName in /foo/) { ... }. I have never seen this in practice. Someone might iterate over a regexp to enumerate its parenthetical matches, but they would have to assign the regexp to a variable first, since JavaScript does not allow pooling of regexp literals. --- src/prettify.js | 81 ++++++++++++++++++++++++++++-- tests/prettify_test.html | 104 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 178 insertions(+), 7 deletions(-) diff --git a/src/prettify.js b/src/prettify.js index 576611f1..dba5493e 100644 --- a/src/prettify.js +++ b/src/prettify.js @@ -239,6 +239,51 @@ function PR_endsWith(s, suffix) { suffix == s.substring(s.length - suffix.length, s.length); } +/** a set of tokens that can precede a regular expression literal in javascript. + * http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full + * list, but I've removed ones that might be problematic when seen in languages + * that don't support regular expression literals. + * + *

Specifically, I've removed any keywords that can't precede a regexp + * literal in a syntactically legal javascript program, and I've removed the + * "in" keyword since it's not a keyword in many languages, and might be used + * as a count of inches. + * @private + */ +var REGEXP_PRECEDER_PATTERN = (function () { + var preceders = [ + "!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=", + "&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=", + "->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";", + "<", "<<", "<<=", "<=", "=", "==", "===", ">", + ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", + "^", "^=", "^^", "^^=", "{", "|", "|=", "||", + "||=", "~", "break", "case", "continue", "delete", + "do", "else", "finally", "instanceof", + "return", "throw", "try", "typeof" + ]; + var pattern = '(?:' + + '(?:(?:^|[^0-9\.])\\.{1,3})|' + // a dot that's not part of a number + '(?:(?:^|[^\\+])\\+)|' + // allow + but not ++ + '(?:(?:^|[^\\-])-)' // allow - but not -- + ; + for (var i = 0; i < preceders.length; ++i) { + var preceder = preceders[i]; + if (PR_isWordChar(preceder.charAt(0))) { + pattern += '|\\b' + preceder; + } else { + pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1'); + } + } + pattern += ')\\s*$'; // matches at end + return new RegExp(pattern); + // CAVEAT: this does not properly handle the case where a regular expression + // immediately follows another since a regular expression may have flags + // for case-sensitivity and the like. Having regexp tokens adjacent is not + // valid in any language I'm aware of, so I'm punting. + // TODO: maybe style special characters inside a regexp as punctuation. + })(); + /** true iff prefix matches the first prefix characters in chars[0:len]. 
* @private */ @@ -679,6 +724,8 @@ function PR_splitStringAndCommentTokens(chunks) { var state = 0; // FSM state variable var delim = -1; // string delimiter var k = 0; // absolute position of beginning of current chunk + var lookBehind = []; // the last 16 characters processed collapsing space + var lastCh = ''; for (var ci = 0, nc = chunks.length; ci < nc; ++ci) { var chunk = chunks[ci]; @@ -699,8 +746,8 @@ function PR_splitStringAndCommentTokens(chunks) { } else if (ch == '/') { state = 3; } else if (ch == '#') { - tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN)); state = 4; + tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN)); } } else if (1 == state) { if (ch == delim) { @@ -719,10 +766,21 @@ function PR_splitStringAndCommentTokens(chunks) { state = 5; tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN)); } else { - state = 0; - // next loop will reenter state 0 without same value of i, so - // ch will be reconsidered as start of new token. - next = i; + // check the last token and see if we should treat this as the start + // of a regular expression literal. + if ((!lookBehind.length || + REGEXP_PRECEDER_PATTERN.test(lookBehind.join('')))) { + // treat regular expression as a string with delimiter / + state = 1; + delim = '/'; + tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN)); + } else { + state = 0; + // next loop will reenter state 0 without same value of i, so + // ch will be reconsidered as start of new token. + next = i; + continue; + } } } else if (4 == state) { if (ch == '\r' || ch == '\n') { @@ -737,10 +795,23 @@ function PR_splitStringAndCommentTokens(chunks) { if (ch == '/') { state = 0; tokenEnds.push(new PR_TokenEnd(k + next, PR_COMMENT)); + continue; // skip lookbehind } else if (ch != '*') { state = 5; } } + + // push char on lookbehind if it's not a comment token. Don't + // waste space with lots of space ; just leave enough to indicate + // boundaries. 
+ if (3 > state || state > 6) { + var isSpace = PR_isSpaceChar(ch); + if (!(lastCh === ' ' && isSpace)) { + if (lookBehind.length > 16) { lookBehind.shift(); } + lastCh = isSpace ? ' ' : ch; + lookBehind.push(lastCh); + } + } } } k += s.length; diff --git a/tests/prettify_test.html b/tests/prettify_test.html index 93d9fa0c..99eaba29 100644 --- a/tests/prettify_test.html +++ b/tests/prettify_test.html @@ -150,6 +150,47 @@

Javascript

document.write(fib(10)); +

Issue 12 - Javascript Regular Expressions

+
+/foo/;  // a slash starting a line treated as a regexp beginning
+"foo".match(/fo+$/);
+// this line comment not treated as a regular expressions
+"foo /bar/".test(/"baz"/);  // test string and regexp boundaries
+var division = /\b\d+\/\d+/g;  // test char sets and escaping of specials
+var allSpecials = /([^\(\)\[\]\{\}\-\?\+\*\.\^\$\/]+)\\/;
+
+// test that slash used in numeric context treated as an operator
+1 / 2;
+1. / x;
+x / y;
+(x) / y;
+1 /* foo */ / 2;
+1 /* foo *// 2;
+1/2;
+1./x;
+x/y;
+(x)/y;
+
+// test split over two lines.  line comment should not fool it
+1//
+/2;
+
+x++/y;
+x--/y;
+x[y] / z;
+f() / n;
+
+// test that slash after non postfix operator is start of regexp
+log('matches = ' + /foo/.test(foo));
+
+// test keyword preceders
+return /a regexp/;
+division = notreturn / not_a_regexp / 2;  // keyword suffix does not match
+
+// & not used as prefix operator in javascript but this should still work
+&/foo/;
+
+

Perl

 #!/usr/bin/perl
@@ -747,7 +788,7 @@ 

Bug 8 - tabs mangled

'      `END`COM// PHP has a plethora of comment types' + '`END`PLN
' + '      `END`COM\/* What is a
' + - '         "plethora"? */`END`PLN
' + + '         "plethora"? *\/`END`PLN
' + '      `END`KWDfunction`END`PLN fib`END`PUN(`END' + '`PLN$n`END`PUN)`END`PLN `END`PUN{`END`PLN
' + '        `END`COM# I don\'t know.`END`PLN
' + @@ -827,7 +868,66 @@

Bug 8 - tabs mangled

'`END`PLNeleven`END`PLN  `END`TYPTwelve`END`PLN  `END' + '`PLNthirteen`END`PLN        `END' + '`TYPFourteen`END`PLN        fifteen `END`' + - 'PUN|`END') + 'PUN|`END'), + issue12: ( + '`STR/foo/`END`PUN;`END`PLN  `END`COM// a slash starting a line ' + + 'treated as a regexp beginning`END`PLN
' + + '`END`STR"foo"`END`PUN.`END`PLNmatch`END`PUN(`END`STR/fo+$/`END' + + '`PUN);`END`PLN
' + + '`END`COM// this line comment not treated as a regular expressions`END' + + '`PLN
' + + '`END`STR"foo /bar/"`END`PUN.`END`PLNtest`END`PUN(`END`STR/"baz"/`END' + + '`PUN);`END`PLN  `END`COM// test string and regexp boundaries' + + '`END`PLN
' + + '`END`KWDvar`END`PLN division `END`PUN=`END`PLN `END' + + '`STR/\\b\\d+\\/\\d+/`END`PLNg`END`PUN;`END`PLN  `END' + + '`COM// test char sets and escaping of specials`END`PLN
' + + '`END`KWDvar`END`PLN allSpecials `END`PUN=`END`PLN `END' + + '`STR/([^\\(\\)\\[\\]\\{\\}\\-\\?\\+\\*\\.\\^\\$\\/]+)\\\\/`END' + + '`PUN;`END`PLN
' + + '
' + + '`END`COM// test that slash used in numeric context treated as an ' + + 'operator`END`PLN
' + + '`END`LIT1`END`PLN `END`PUN/`END`PLN `END`LIT2`END`PUN;`END`PLN
' + + '`END`LIT1`END`PUN.`END`PLN `END`PUN/`END`PLN x`END`PUN;`END`PLN
' + + 'x `END`PUN/`END`PLN y`END`PUN;`END`PLN
' + + '`END`PUN(`END`PLNx`END`PUN)`END`PLN `END`PUN/`END`PLN y`END`PUN;`END' + + '`PLN
' + + '`END`LIT1`END`PLN `END`COM/* foo */`END`PLN `END`PUN/`END`PLN `END' + + '`LIT2`END`PUN;`END`PLN
' + + '`END`LIT1`END`PLN `END`COM/* foo */`END`PUN/`END`PLN `END`LIT2`END' + + '`PUN;`END`PLN
' + + '`END`LIT1`END`PUN/`END`LIT2`END`PUN;`END`PLN
' + + '`END`LIT1`END`PUN./`END`PLNx`END`PUN;`END`PLN
' + + 'x`END`PUN/`END`PLNy`END`PUN;`END`PLN
' + + '`END`PUN(`END`PLNx`END`PUN)/`END`PLNy`END`PUN;`END`PLN
' + + '
' + + '`END`COM// test split over two lines.  line comment should not ' + + 'fool it`END`PLN
' + + '`END`LIT1`END`COM//`END`PLN
' + + '`END`PUN/`END`LIT2`END`PUN;`END`PLN
' + + '
' + + 'x`END`PUN++/`END`PLNy`END`PUN;`END`PLN
' + + 'x`END`PUN--/`END`PLNy`END`PUN;`END`PLN
' + + 'x`END`PUN[`END`PLNy`END`PUN]`END`PLN `END`PUN/`END`PLN z`END`PUN;`END' + + '`PLN
' + + 'f`END`PUN()`END`PLN `END`PUN/`END`PLN n`END`PUN;`END`PLN
' + + '
' + + '`END`COM// test that slash after non postfix operator is start of ' + + 'regexp`END`PLN
' + + 'log`END`PUN(`END`STR\'matches = \'`END`PLN `END`PUN+`END`PLN `END' + + '`STR/foo/`END`PUN.`END`PLNtest`END`PUN(`END`PLNfoo`END`PUN));`END' + + '`PLN
' + + '
' + + '`END`COM// test keyword preceders`END`PLN
' + + '`END`KWDreturn`END`PLN `END`STR/a regexp/`END`PUN;`END`PLN
' + + 'division `END`PUN=`END`PLN notreturn `END`PUN/`END`PLN not_a_regexp ' + + '`END`PUN/`END`PLN `END`LIT2`END`PUN;`END`PLN  `END`COM// ' + + 'keyword suffix does not match`END`PLN
' + + '
' + + '`END`COM// & not used as prefix operator in javascript but this ' + + 'should still work`END`PLN
' + + '`END`PUN&`END`STR/foo/`END`PUN;`END') };