Skip to content

Commit

Permalink
Merge pull request #122 from nicolo-ribaudo/fix-unicode-refs
Browse files Browse the repository at this point in the history
Fix parsing of group references in unicode mode
  • Loading branch information
jviereck committed Dec 14, 2021
2 parents 450cb3f + 15cf7a5 commit f09b912
Show file tree
Hide file tree
Showing 3 changed files with 122 additions and 17 deletions.
45 changes: 28 additions & 17 deletions parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -851,16 +851,6 @@

var res, match, from = pos;

if (hasUnicodeFlag) {
if (res = matchReg(/^\d/)) {
if (res[0] !== "0" || (res = matchReg(/^\d/)) ) {
bail("Invalid decimal escape in unicode mode", null, from, pos);
}
return createEscaped('null', 0x0000, '0', 1);
}
return false;
}

if (res = matchReg(/^(?!0)\d+/)) {
match = res[0];
var refIdx = parseInt(res[0], 10);
Expand All @@ -878,6 +868,16 @@
// a second time with the total normal-groups.
backrefDenied.push(refIdx);

// \1 octal escapes are disallowed in unicode mode, but they might
// be references to groups which haven't been parsed yet.
// We must parse a second time to determine if \1 is a reference
// or an octal escape, and then we can report the error.
if (firstIteration) {
shouldReparse = true;
} else {
bailOctalEscapeIfUnicode(from, pos);
}

// Reset the position again, as maybe only parts of the previous
// matched numbers are actual octal numbers. E.g. in '019' only
// the '01' should be matched.
Expand All @@ -904,6 +904,9 @@
// /\091/.exec('\091')[0].length === 3
else if (res = matchReg(/^[0-7]{1,3}/)) {
match = res[0];
if (match !== '0') {
bailOctalEscapeIfUnicode(from, pos);
}
if (/^0{1,3}$/.test(match)) {
// If they are all zeros, then only take the first one.
return createEscaped('null', 0x0000, '0', match.length);
Expand All @@ -914,6 +917,12 @@
return false;
}

// Reports an "Invalid decimal escape" error through `bail` when the pattern
// is being parsed with the `u` or `v` flag; otherwise it is a no-op.
//
// from, pos - forwarded unchanged to `bail` as its last two arguments
//             (presumably the start/end offsets of the offending escape --
//             confirm against `bail`'s definition).
function bailOctalEscapeIfUnicode(from, pos) {
  var inUnicodeMode = hasUnicodeFlag || hasUnicodeSetFlag;
  if (!inUnicodeMode) {
    return;
  }
  bail("Invalid decimal escape in unicode mode", null, from, pos);
}

function parseCharacterClassEscape() {
// CharacterClassEscape :: one of d D s S w W
var res;
Expand Down Expand Up @@ -1500,6 +1509,7 @@
var backrefDenied = [];
var closedCaptureCounter = 0;
var firstIteration = true;
var shouldReparse = false;
var hasUnicodeFlag = (flags || "").indexOf("u") !== -1;
var hasUnicodeSetFlag = (flags || "").indexOf("v") !== -1;
var pos = 0;
Expand Down Expand Up @@ -1532,13 +1542,14 @@
// the total number of capture groups set.
//
// SEE: https://github.com/jviereck/regjsparser/issues/70
for (var i = 0; i < backrefDenied.length; i++) {
if (backrefDenied[i] <= closedCaptureCounter) {
// Parse the input a second time.
pos = 0;
firstIteration = false;
return parseDisjunction();
}
shouldReparse = shouldReparse || backrefDenied.some(function (ref) {
return ref <= closedCaptureCounter;
});
if (shouldReparse) {
// Parse the input a second time.
pos = 0;
firstIteration = false;
return parseDisjunction();
}

return result;
Expand Down
6 changes: 6 additions & 0 deletions test/test-data-unicode-set.json
Original file line number Diff line number Diff line change
Expand Up @@ -1493,5 +1493,11 @@
9
],
"raw": "[^\\q{AB}]"
},
"\\1": {
"type": "error",
"name": "SyntaxError",
"message": "Invalid decimal escape in unicode mode at position 1\n \\1\n ^",
"input": "\\1"
}
}
88 changes: 88 additions & 0 deletions test/test-data-unicode.json
Original file line number Diff line number Diff line change
Expand Up @@ -1140,5 +1140,93 @@
2
],
"raw": "\\0"
},
"(.)\\1": {
"type": "alternative",
"body": [
{
"type": "group",
"behavior": "normal",
"body": [
{
"type": "dot",
"range": [
1,
2
],
"raw": "."
}
],
"range": [
0,
3
],
"raw": "(.)"
},
{
"type": "reference",
"matchIndex": 1,
"range": [
3,
5
],
"raw": "\\1"
}
],
"range": [
0,
5
],
"raw": "(.)\\1"
},
"\\1(.)": {
"type": "alternative",
"body": [
{
"type": "reference",
"matchIndex": 1,
"range": [
0,
2
],
"raw": "\\1"
},
{
"type": "group",
"behavior": "normal",
"body": [
{
"type": "dot",
"range": [
3,
4
],
"raw": "."
}
],
"range": [
2,
5
],
"raw": "(.)"
}
],
"range": [
0,
5
],
"raw": "\\1(.)"
},
"(.)\\2": {
"type": "error",
"name": "SyntaxError",
"message": "Invalid decimal escape in unicode mode at position 4\n (.)\\2\n ^",
"input": "(.)\\2"
},
"\\2(.)": {
"type": "error",
"name": "SyntaxError",
"message": "Invalid decimal escape in unicode mode at position 1\n \\2(.)\n ^",
"input": "\\2(.)"
}
}

0 comments on commit f09b912

Please sign in to comment.