From f13852851997a57e48b146cdc5d6f9964076ec65 Mon Sep 17 00:00:00 2001 From: haoqixu Date: Fri, 7 Nov 2025 16:12:24 +0800 Subject: [PATCH 1/4] add testcases --- _packages/api/test/api.test.ts | 26 ++++++++++++++++++ internal/api/encoder/encoder_test.go | 18 +++++++++++++ .../encodeSourceFileWithUnicodeEscapes.txt | 27 +++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt diff --git a/_packages/api/test/api.test.ts b/_packages/api/test/api.test.ts index a01b029fac..70aac1aaad 100644 --- a/_packages/api/test/api.test.ts +++ b/_packages/api/test/api.test.ts @@ -11,6 +11,7 @@ import { isTemplateHead, isTemplateMiddle, isTemplateTail, + isStringLiteral, } from "@typescript/ast"; import assert from "node:assert"; import { @@ -113,6 +114,31 @@ describe("SourceFile", () => { }); }); +test("unicode escapes", () => { + const srcFiles = { + "/src/1.ts": `"😃"`, + "/src/2.ts": `"\\ud83d\\ude03"`, // this is "😃" + } + + const api = spawnAPI({ + "/tsconfig.json": "{}", + ...srcFiles, + }); + const project = api.loadProject("/tsconfig.json"); + + Object.keys(srcFiles).forEach((file) => { + const sourceFile = project.getSourceFile(file); + assert.ok(sourceFile); + + sourceFile.forEachChild(function visit(node) { + if (isStringLiteral(node)) { + assert.equal(node.text, "😃"); + } + node.forEachChild(visit); + }); + }) +}); + test("Object equality", () => { const api = spawnAPI(); const project = api.loadProject("/tsconfig.json"); diff --git a/internal/api/encoder/encoder_test.go b/internal/api/encoder/encoder_test.go index 4d1be79d36..232c0fec37 100644 --- a/internal/api/encoder/encoder_test.go +++ b/internal/api/encoder/encoder_test.go @@ -35,6 +35,24 @@ func TestEncodeSourceFile(t *testing.T) { }) } +func TestEncodeSourceFileWithUnicodeEscapes(t *testing.T) { + t.Parallel() + sourceFile := parser.ParseSourceFile(ast.SourceFileParseOptions{ + FileName: "/test.ts", + Path: "/test.ts", + }, `let 
a = "😃"; let b = "\ud83d\ude03"; let c = "\udc00\ud83d\ude03"; let d = "\ud83d\ud83d\ude03"`, core.ScriptKindTS) + t.Run("baseline", func(t *testing.T) { + t.Parallel() + buf, err := encoder.EncodeSourceFile(sourceFile, "") + assert.NilError(t, err) + + str := formatEncodedSourceFile(buf) + baseline.Run(t, "encodeSourceFileWithUnicodeEscapes.txt", str, baseline.Options{ + Subfolder: "api", + }) + }) +} + func BenchmarkEncodeSourceFile(b *testing.B) { repo.SkipIfNoTypeScriptSubmodule(b) filePath := filepath.Join(repo.TypeScriptSubmodulePath, "src/compiler/checker.ts") diff --git a/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt b/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt new file mode 100644 index 0000000000..a54f51c7de --- /dev/null +++ b/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt @@ -0,0 +1,27 @@ +KindSourceFile [0, 98), i=1, next=0 + NodeList [0, 98), i=2, next=27 + KindVariableStatement [0, 15), i=3, next=9 + KindVariableDeclarationList [0, 14), i=4, next=0 + NodeList [3, 14), i=5, next=0 + KindVariableDeclaration [3, 14), i=6, next=0 + KindIdentifier "a" [3, 5), i=7, next=8 + KindStringLiteral "😃" [7, 14), i=8, next=0 + KindVariableStatement [15, 39), i=9, next=15 + KindVariableDeclarationList [15, 38), i=10, next=0 + NodeList [19, 38), i=11, next=0 + KindVariableDeclaration [19, 38), i=12, next=0 + KindIdentifier "b" [19, 21), i=13, next=14 + KindStringLiteral "��" [23, 38), i=14, next=0 + KindVariableStatement [39, 69), i=15, next=21 + KindVariableDeclarationList [39, 68), i=16, next=0 + NodeList [43, 68), i=17, next=0 + KindVariableDeclaration [43, 68), i=18, next=0 + KindIdentifier "c" [43, 45), i=19, next=20 + KindStringLiteral "���" [47, 68), i=20, next=0 + KindVariableStatement [69, 98), i=21, next=0 + KindVariableDeclarationList [69, 98), i=22, next=0 + NodeList [73, 98), i=23, next=0 + KindVariableDeclaration [73, 98), i=24, next=0 + KindIdentifier "d" [73, 75), i=25, 
next=26 + KindStringLiteral "���" [77, 98), i=26, next=0 + KindEndOfFile [98, 98), i=27, next=0 From bc121e4604fbf86bdfabbd84d8b2bb5c480c5db7 Mon Sep 17 00:00:00 2001 From: haoqixu Date: Fri, 7 Nov 2025 16:59:21 +0800 Subject: [PATCH 2/4] Fix scanning of valid surrogate pairs --- internal/scanner/scanner.go | 7 +++++++ internal/scanner/utilities.go | 19 +++++++++++++++++++ .../encodeSourceFileWithUnicodeEscapes.txt | 6 +++--- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index 1ee64b62f1..72ae4f7ca6 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -1629,6 +1629,13 @@ func (s *Scanner) scanEscapeSequence(flags EscapeSequenceScanningFlags) string { codePoint := s.scanUnicodeEscape(flags&EscapeSequenceScanningFlagsReportInvalidEscapeErrors != 0) if codePoint < 0 { return s.text[start:s.pos] + } else if codePointIsHighSurrogate(codePoint) && s.char() == '\\' && s.charAt(1) == 'u' { + savedPos := s.pos + nextCodePoint := s.scanUnicodeEscape(flags&EscapeSequenceScanningFlagsReportInvalidEscapeErrors != 0) + if codePointIsLowSurrogate(nextCodePoint) { + return string(surrogatePairToCodepoint(codePoint, nextCodePoint)) + } + s.pos = savedPos // restore position because we do not consume nextCodePoint } return string(codePoint) case 'x': diff --git a/internal/scanner/utilities.go b/internal/scanner/utilities.go index 06ba7cf666..06b29a1e2a 100644 --- a/internal/scanner/utilities.go +++ b/internal/scanner/utilities.go @@ -8,6 +8,25 @@ import ( "github.com/microsoft/typescript-go/internal/core" ) +const ( + surr1 = 0xd800 + surr2 = 0xdc00 + surr3 = 0xe000 + surrSelf = 0x10000 +) + +func codePointIsHighSurrogate(r rune) bool { + return surr1 <= r && r < surr2 +} + +func codePointIsLowSurrogate(r rune) bool { + return surr2 <= r && r < surr3 +} + +func surrogatePairToCodepoint(r1, r2 rune) rune { + return (r1-surr1)<<10 | (r2 - surr2) + surrSelf +} + func 
tokenIsIdentifierOrKeyword(token ast.Kind) bool { return token >= ast.KindIdentifier } diff --git a/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt b/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt index a54f51c7de..93102c786a 100644 --- a/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt +++ b/testdata/baselines/reference/api/encodeSourceFileWithUnicodeEscapes.txt @@ -11,17 +11,17 @@ KindSourceFile [0, 98), i=1, next=0 NodeList [19, 38), i=11, next=0 KindVariableDeclaration [19, 38), i=12, next=0 KindIdentifier "b" [19, 21), i=13, next=14 - KindStringLiteral "��" [23, 38), i=14, next=0 + KindStringLiteral "😃" [23, 38), i=14, next=0 KindVariableStatement [39, 69), i=15, next=21 KindVariableDeclarationList [39, 68), i=16, next=0 NodeList [43, 68), i=17, next=0 KindVariableDeclaration [43, 68), i=18, next=0 KindIdentifier "c" [43, 45), i=19, next=20 - KindStringLiteral "���" [47, 68), i=20, next=0 + KindStringLiteral "�😃" [47, 68), i=20, next=0 KindVariableStatement [69, 98), i=21, next=0 KindVariableDeclarationList [69, 98), i=22, next=0 NodeList [73, 98), i=23, next=0 KindVariableDeclaration [73, 98), i=24, next=0 KindIdentifier "d" [73, 75), i=25, next=26 - KindStringLiteral "���" [77, 98), i=26, next=0 + KindStringLiteral "�😃" [77, 98), i=26, next=0 KindEndOfFile [98, 98), i=27, next=0 From 9117cb982919cce78923d6ac0b0c0db199d19689 Mon Sep 17 00:00:00 2001 From: haoqixu Date: Fri, 7 Nov 2025 17:27:02 +0800 Subject: [PATCH 3/4] npx hereby format --- _packages/api/test/api.test.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/_packages/api/test/api.test.ts b/_packages/api/test/api.test.ts index 70aac1aaad..1a097cf1b3 100644 --- a/_packages/api/test/api.test.ts +++ b/_packages/api/test/api.test.ts @@ -8,10 +8,10 @@ import { cast, isImportDeclaration, isNamedImports, + isStringLiteral, isTemplateHead, isTemplateMiddle, isTemplateTail, - isStringLiteral, } from
"@typescript/ast"; import assert from "node:assert"; import { @@ -118,7 +118,7 @@ test("unicode escapes", () => { const srcFiles = { "/src/1.ts": `"😃"`, "/src/2.ts": `"\\ud83d\\ude03"`, // this is "😃" - } + }; const api = spawnAPI({ "/tsconfig.json": "{}", @@ -126,7 +126,7 @@ test("unicode escapes", () => { }); const project = api.loadProject("/tsconfig.json"); - Object.keys(srcFiles).forEach((file) => { + Object.keys(srcFiles).forEach(file => { const sourceFile = project.getSourceFile(file); assert.ok(sourceFile); @@ -136,7 +136,7 @@ test("unicode escapes", () => { } node.forEachChild(visit); }); - }) + }); }); test("Object equality", () => { From c949dc5b4ba03345510912ca26d7bcaacf46a39c Mon Sep 17 00:00:00 2001 From: haoqixu Date: Mon, 10 Nov 2025 13:44:34 +0800 Subject: [PATCH 4/4] add testcases --- .../reference/compiler/unicodeEscapes.js | 47 +++++++++++++++++++ .../reference/compiler/unicodeEscapes.symbols | 33 +++++++++++++ .../reference/compiler/unicodeEscapes.types | 39 +++++++++++++++ .../tests/cases/compiler/unicodeEscapes.tsx | 19 ++++++++ 4 files changed, 138 insertions(+) create mode 100644 testdata/baselines/reference/compiler/unicodeEscapes.js create mode 100644 testdata/baselines/reference/compiler/unicodeEscapes.symbols create mode 100644 testdata/baselines/reference/compiler/unicodeEscapes.types create mode 100644 testdata/tests/cases/compiler/unicodeEscapes.tsx diff --git a/testdata/baselines/reference/compiler/unicodeEscapes.js b/testdata/baselines/reference/compiler/unicodeEscapes.js new file mode 100644 index 0000000000..511517eb10 --- /dev/null +++ b/testdata/baselines/reference/compiler/unicodeEscapes.js @@ -0,0 +1,47 @@ +//// [tests/cases/compiler/unicodeEscapes.tsx] //// + +//// [unicodeEscapes.tsx] +// high-low surrogate pair - the "correct" case +export const highLow = "\ud83d\ude03" as const; + +// high surrogate +export const high = "\ud83d" as const; + +// low surrogate +export const low = "\ude03" as const; + +// two high surrogates
+export const highHigh = "\ud83d\ud83d" as const; + +// two low surrogates +export const lowLow = "\ude03\ude03" as const; + +// swapped expected order of surrogates +export const lowHigh = "\ude03\ud83d" as const; + + +//// [unicodeEscapes.js] +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.lowHigh = exports.lowLow = exports.highHigh = exports.low = exports.high = exports.highLow = void 0; +// high-low surrogate pair - the "correct" case +exports.highLow = "\ud83d\ude03"; +// high surrogate +exports.high = "\ud83d"; +// low surrogate +exports.low = "\ude03"; +// two high surrogates +exports.highHigh = "\ud83d\ud83d"; +// two low surrogates +exports.lowLow = "\ude03\ude03"; +// swapped expected order of surrogates +exports.lowHigh = "\ude03\ud83d"; + + +//// [unicodeEscapes.d.ts] +export declare const highLow: "😃"; +export declare const high: "�"; +export declare const low: "�"; +export declare const highHigh: "��"; +export declare const lowLow: "��"; +export declare const lowHigh: "��"; diff --git a/testdata/baselines/reference/compiler/unicodeEscapes.symbols b/testdata/baselines/reference/compiler/unicodeEscapes.symbols new file mode 100644 index 0000000000..38d3027a44 --- /dev/null +++ b/testdata/baselines/reference/compiler/unicodeEscapes.symbols @@ -0,0 +1,33 @@ +//// [tests/cases/compiler/unicodeEscapes.tsx] //// + +=== unicodeEscapes.tsx === +// high-low surrogate pair - the "correct" case +export const highLow = "\ud83d\ude03" as const; +>highLow : Symbol(highLow, Decl(unicodeEscapes.tsx, 1, 12)) +>const : Symbol(const) + +// high surrogate +export const high = "\ud83d" as const; +>high : Symbol(high, Decl(unicodeEscapes.tsx, 4, 12)) +>const : Symbol(const) + +// low surrogate +export const low = "\ude03" as const; +>low : Symbol(low, Decl(unicodeEscapes.tsx, 7, 12)) +>const : Symbol(const) + +// two high surrogates +export const highHigh = "\ud83d\ud83d" as const; +>highHigh : Symbol(highHigh,
Decl(unicodeEscapes.tsx, 10, 12)) +>const : Symbol(const) + +// two low surrogates +export const lowLow = "\ude03\ude03" as const; +>lowLow : Symbol(lowLow, Decl(unicodeEscapes.tsx, 13, 12)) +>const : Symbol(const) + +// swapped expected order of surrogates +export const lowHigh = "\ude03\ud83d" as const; +>lowHigh : Symbol(lowHigh, Decl(unicodeEscapes.tsx, 16, 12)) +>const : Symbol(const) + diff --git a/testdata/baselines/reference/compiler/unicodeEscapes.types b/testdata/baselines/reference/compiler/unicodeEscapes.types new file mode 100644 index 0000000000..101ed66c3c --- /dev/null +++ b/testdata/baselines/reference/compiler/unicodeEscapes.types @@ -0,0 +1,39 @@ +//// [tests/cases/compiler/unicodeEscapes.tsx] //// + +=== unicodeEscapes.tsx === +// high-low surrogate pair - the "correct" case +export const highLow = "\ud83d\ude03" as const; +>highLow : "😃" +>"\ud83d\ude03" as const : "😃" +>"\ud83d\ude03" : "😃" + +// high surrogate +export const high = "\ud83d" as const; +>high : "�" +>"\ud83d" as const : "�" +>"\ud83d" : "�" + +// low surrogate +export const low = "\ude03" as const; +>low : "�" +>"\ude03" as const : "�" +>"\ude03" : "�" + +// two high surrogates +export const highHigh = "\ud83d\ud83d" as const; +>highHigh : "��" +>"\ud83d\ud83d" as const : "��" +>"\ud83d\ud83d" : "��" + +// two low surrogates +export const lowLow = "\ude03\ude03" as const; +>lowLow : "��" +>"\ude03\ude03" as const : "��" +>"\ude03\ude03" : "��" + +// swapped expected order of surrogates +export const lowHigh = "\ude03\ud83d" as const; +>lowHigh : "��" +>"\ude03\ud83d" as const : "��" +>"\ude03\ud83d" : "��" + diff --git a/testdata/tests/cases/compiler/unicodeEscapes.tsx b/testdata/tests/cases/compiler/unicodeEscapes.tsx new file mode 100644 index 0000000000..c77066fbd1 --- /dev/null +++ b/testdata/tests/cases/compiler/unicodeEscapes.tsx @@ -0,0 +1,19 @@ +// @declaration: true + +// high-low surrogate pair - the "correct" case +export const highLow = "\ud83d\ude03" as const; + +//
high surrogate +export const high = "\ud83d" as const; + +// low surrogate +export const low = "\ude03" as const; + +// two high surrogates +export const highHigh = "\ud83d\ud83d" as const; + +// two low surrogates +export const lowLow = "\ude03\ude03" as const; + +// swapped expected order of surrogates +export const lowHigh = "\ude03\ud83d" as const;