diff --git a/src/prettify.js b/src/prettify.js index 576611f1..dba5493e 100644 --- a/src/prettify.js +++ b/src/prettify.js @@ -239,6 +239,51 @@ function PR_endsWith(s, suffix) { suffix == s.substring(s.length - suffix.length, s.length); } +/** a pattern matching text that ends, ignoring trailing whitespace, in a token that can precede a regular expression literal in javascript. + * http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full + * list, but I've removed ones that might be problematic when seen in languages + * that don't support regular expression literals. + * + *
Specifically, I've removed any keywords that can't precede a regexp + * literal in a syntactically legal javascript program, and I've removed the + * "in" keyword since it's not a keyword in many languages, and might be used + * as a count of inches. + * @private + */ +var REGEXP_PRECEDER_PATTERN = (function () { + var preceders = [ + "!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=", + "&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=", + "->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";", + "<", "<<", "<<=", "<=", "=", "==", "===", ">", + ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", + "^", "^=", "^^", "^^=", "{", "|", "|=", "||", + "||=", "~", "break", "case", "continue", "delete", + "do", "else", "finally", "instanceof", + "return", "throw", "try", "typeof" + ]; + var pattern = '(?:' + + '(?:(?:^|[^0-9\.])\\.{1,3})|' + // a dot that's not part of a number + '(?:(?:^|[^\\+])\\+)|' + // allow + but not ++ + '(?:(?:^|[^\\-])-)' // allow - but not -- + ; + for (var i = 0; i < preceders.length; ++i) { + var preceder = preceders[i]; + if (PR_isWordChar(preceder.charAt(0))) { + pattern += '|\\b' + preceder; + } else { + pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1'); + } + } + pattern += ')\\s*$'; // matches at end + return new RegExp(pattern); + // CAVEAT: this does not properly handle the case where a regular expression + // immediately follows another since a regular expression may have flags + // for case-sensitivity and the like. Having regexp tokens adjacent is not + // valid in any language I'm aware of, so I'm punting. + // TODO: maybe style special characters inside a regexp as punctuation. + })(); + /** true iff prefix matches the first prefix characters in chars[0:len]. 
* @private */ @@ -679,6 +724,8 @@ function PR_splitStringAndCommentTokens(chunks) { var state = 0; // FSM state variable var delim = -1; // string delimiter var k = 0; // absolute position of beginning of current chunk + var lookBehind = []; // the last 16 characters processed collapsing space + var lastCh = ''; for (var ci = 0, nc = chunks.length; ci < nc; ++ci) { var chunk = chunks[ci]; @@ -699,8 +746,8 @@ function PR_splitStringAndCommentTokens(chunks) { } else if (ch == '/') { state = 3; } else if (ch == '#') { - tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN)); state = 4; + tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN)); } } else if (1 == state) { if (ch == delim) { @@ -719,10 +766,21 @@ function PR_splitStringAndCommentTokens(chunks) { state = 5; tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN)); } else { - state = 0; - // next loop will reenter state 0 without same value of i, so - // ch will be reconsidered as start of new token. - next = i; + // check the last token and see if we should treat this as the start + // of a regular expression literal. + if ((!lookBehind.length || + REGEXP_PRECEDER_PATTERN.test(lookBehind.join('')))) { + // treat regular expression as a string with delimiter / + state = 1; + delim = '/'; + tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN)); + } else { + state = 0; + // next loop will reenter state 0 without same value of i, so + // ch will be reconsidered as start of new token. + next = i; + continue; + } } } else if (4 == state) { if (ch == '\r' || ch == '\n') { @@ -737,10 +795,23 @@ function PR_splitStringAndCommentTokens(chunks) { if (ch == '/') { state = 0; tokenEnds.push(new PR_TokenEnd(k + next, PR_COMMENT)); + continue; // skip lookbehind } else if (ch != '*') { state = 5; } } + + // push char on lookbehind if it's not a comment token. Don't + // waste space with lots of space ; just leave enough to indicate + // boundaries. 
+ if (3 > state || state > 6) { + var isSpace = PR_isSpaceChar(ch); + if (!(lastCh === ' ' && isSpace)) { + if (lookBehind.length > 16) { lookBehind.shift(); } + lastCh = isSpace ? ' ' : ch; + lookBehind.push(lastCh); + } + } } } k += s.length; diff --git a/tests/prettify_test.html b/tests/prettify_test.html index 93d9fa0c..99eaba29 100644 --- a/tests/prettify_test.html +++ b/tests/prettify_test.html @@ -150,6 +150,47 @@
+/foo/; // a slash starting a line treated as a regexp beginning +"foo".match(/fo+$/); +// this line comment not treated as a regular expressions +"foo /bar/".test(/"baz"/); // test string and regexp boundaries +var division = /\b\d+\/\d+/g; // test char sets and escaping of specials +var allSpecials = /([^\(\)\[\]\{\}\-\?\+\*\.\^\$\/]+)\\/; + +// test that slash used in numeric context treated as an operator +1 / 2; +1. / x; +x / y; +(x) / y; +1 /* foo */ / 2; +1 /* foo *// 2; +1/2; +1./x; +x/y; +(x)/y; + +// test split over two lines. line comment should not fool it +1// +/2; + +x++/y; +x--/y; +x[y] / z; +f() / n; + +// test that slash after non postfix operator is start of regexp +log('matches = ' + /foo/.test(foo)); + +// test keyword preceders +return /a regexp/; +division = notreturn / not_a_regexp / 2; // keyword suffix does not match + +// & not used as prefix operator in javascript but this should still work +&/foo/; ++
#!/usr/bin/perl @@ -747,7 +788,7 @@Bug 8 - tabs mangled
' `END`COM// PHP has a plethora of comment types' + '`END`PLN
' + ' `END`COM\/* What is a
' + - ' "plethora"? */`END`PLN
' + + ' "plethora"? *\/`END`PLN
' + ' `END`KWDfunction`END`PLN fib`END`PUN(`END' + '`PLN$n`END`PUN)`END`PLN `END`PUN{`END`PLN
' + ' `END`COM# I don\'t know.`END`PLN
' + @@ -827,7 +868,66 @@Bug 8 - tabs mangled
'`END`PLNeleven`END`PLN `END`TYPTwelve`END`PLN `END' + '`PLNthirteen`END`PLN `END' + '`TYPFourteen`END`PLN fifteen `END`' + - 'PUN|`END') + 'PUN|`END'), + issue12: ( + '`STR/foo/`END`PUN;`END`PLN `END`COM// a slash starting a line ' + + 'treated as a regexp beginning`END`PLN
' + + '`END`STR"foo"`END`PUN.`END`PLNmatch`END`PUN(`END`STR/fo+$/`END' + + '`PUN);`END`PLN
' + + '`END`COM// this line comment not treated as a regular expressions`END' + + '`PLN
' + + '`END`STR"foo /bar/"`END`PUN.`END`PLNtest`END`PUN(`END`STR/"baz"/`END' + + '`PUN);`END`PLN `END`COM// test string and regexp boundaries' + + '`END`PLN
' + + '`END`KWDvar`END`PLN division `END`PUN=`END`PLN `END' + + '`STR/\\b\\d+\\/\\d+/`END`PLNg`END`PUN;`END`PLN `END' + + '`COM// test char sets and escaping of specials`END`PLN
' + + '`END`KWDvar`END`PLN allSpecials `END`PUN=`END`PLN `END' + + '`STR/([^\\(\\)\\[\\]\\{\\}\\-\\?\\+\\*\\.\\^\\$\\/]+)\\\\/`END' + + '`PUN;`END`PLN
' + + '
' + + '`END`COM// test that slash used in numeric context treated as an ' + + 'operator`END`PLN
' + + '`END`LIT1`END`PLN `END`PUN/`END`PLN `END`LIT2`END`PUN;`END`PLN
' + + '`END`LIT1`END`PUN.`END`PLN `END`PUN/`END`PLN x`END`PUN;`END`PLN
' + + 'x `END`PUN/`END`PLN y`END`PUN;`END`PLN
' + + '`END`PUN(`END`PLNx`END`PUN)`END`PLN `END`PUN/`END`PLN y`END`PUN;`END' + + '`PLN
' + + '`END`LIT1`END`PLN `END`COM/* foo */`END`PLN `END`PUN/`END`PLN `END' + + '`LIT2`END`PUN;`END`PLN
' + + '`END`LIT1`END`PLN `END`COM/* foo */`END`PUN/`END`PLN `END`LIT2`END' + + '`PUN;`END`PLN
' + + '`END`LIT1`END`PUN/`END`LIT2`END`PUN;`END`PLN
' + + '`END`LIT1`END`PUN./`END`PLNx`END`PUN;`END`PLN
' + + 'x`END`PUN/`END`PLNy`END`PUN;`END`PLN
' + + '`END`PUN(`END`PLNx`END`PUN)/`END`PLNy`END`PUN;`END`PLN
' + + '
' + + '`END`COM// test split over two lines. line comment should not ' + + 'fool it`END`PLN
' + + '`END`LIT1`END`COM//`END`PLN
' + + '`END`PUN/`END`LIT2`END`PUN;`END`PLN
' + + '
' + + 'x`END`PUN++/`END`PLNy`END`PUN;`END`PLN
' + + 'x`END`PUN--/`END`PLNy`END`PUN;`END`PLN
' + + 'x`END`PUN[`END`PLNy`END`PUN]`END`PLN `END`PUN/`END`PLN z`END`PUN;`END' + + '`PLN
' + + 'f`END`PUN()`END`PLN `END`PUN/`END`PLN n`END`PUN;`END`PLN
' + + '
' + + '`END`COM// test that slash after non postfix operator is start of ' + + 'regexp`END`PLN
' + + 'log`END`PUN(`END`STR\'matches = \'`END`PLN `END`PUN+`END`PLN `END' + + '`STR/foo/`END`PUN.`END`PLNtest`END`PUN(`END`PLNfoo`END`PUN));`END' + + '`PLN
' + + '
' + + '`END`COM// test keyword preceders`END`PLN
' + + '`END`KWDreturn`END`PLN `END`STR/a regexp/`END`PUN;`END`PLN
' + + 'division `END`PUN=`END`PLN notreturn `END`PUN/`END`PLN not_a_regexp ' + + '`END`PUN/`END`PLN `END`LIT2`END`PUN;`END`PLN `END`COM// ' + + 'keyword suffix does not match`END`PLN
' + + '
' + + '`END`COM// & not used as prefix operator in javascript but this ' + + 'should still work`END`PLN
' + + '`END`PUN&`END`STR/foo/`END`PUN;`END') };