feat: improve text parsing performance (#28)

geclos · web-flow · commit 72533304cf33 · 2025-06-05T13:16:39.000+02:00
* feat: improve text parsing performance From AI: The previous implementation of the `text` method had the following bottlenecks: 1. **Escape Checking**: The code rechecks all trailing backslashes in the accumulated `data` string for every character, which becomes increasingly expensive as `data` grows. 2. **String Slicing**: When an escape character is found, `data = data.slice(0, -1)` creates a new string by copying almost the entire accumulated text. 3. **Repeated String Concatenation**: Adding to `data` one character at a time with `data += char` creates a new string object each time. 4. **Regular Expression Replacements**: At the end, it uses complex regex replacements that need to scan the entire string. 5. **Repeated Delimiter Checking**: For each character, it checks against multiple delimiters with `template.startsWith()`, which involves multiple string comparisons. Our optimization approach for the `text` method focused on several improvements: 1. **Simplified Escape Handling**: - Instead of recounting backslashes for every character, we now use a boolean flag `isEscaped` to track whether the current character is escaped - When we encounter a backslash, we toggle the escape state and skip adding it to the data string directly - This eliminates the expensive backslash counting operation that grew in cost with the size of the text 2. **Eliminated String Slicing**: - We no longer use `data.slice(0, -1)`, which was creating a new string by copying almost the entire accumulated text - Instead, we simply don't add the backslash to the data string at all 3. **More Efficient Processing Flow**: - The code now has a clearer path for handling escaped characters - We continue to the next iteration immediately after processing an escape sequence, reducing unnecessary checks
4. **Simplified Data Processing**: - We removed the complex regular expression replacements at the end - Since we're already handling escape sequences correctly during parsing, we don't need additional processing * feat: expose parser API (#29) Parsing is an expensive computation. We now expose the parser to the client and accept the AST as an argument in both the `scan` method and the `Chain` constructor, so that clients can optimize performance. * chore: add CD
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,82 @@
+name: Publish
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  publish:
+    name: Build and Publish
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '22'
+          registry-url: 'https://registry.npmjs.org'
+
+      - name: Install pnpm
+        uses: pnpm/action-setup@v2
+        with:
+          version: 9
+
+      - name: Get pnpm store directory
+        shell: bash
+        run: |
+          echo "STORE_PATH=$(pnpm store path --silent)" >> $GITHUB_ENV
+
+      - name: Setup pnpm cache
+        uses: actions/cache@v3
+        with:
+          path: ${{ env.STORE_PATH }}
+          key: ${{ runner.os }}-pnpm-store-${{ hashFiles('**/pnpm-lock.yaml') }}
+          restore-keys: |
+            ${{ runner.os }}-pnpm-store-
+
+      - name: Get package version
+        id: get_version
+        run: |
+          CURRENT_VERSION=$(node -p "require('./package.json').version")
+          echo "version=$CURRENT_VERSION" >> $GITHUB_OUTPUT
+
+      - name: Check version on npm
+        id: check_version
+        run: |
+          NPM_VERSION=$(npm view promptl-ai version 2>/dev/null || echo "0.0.0")
+          if [ "${{ steps.get_version.outputs.version }}" != "$NPM_VERSION" ]; then
+            echo "should_publish=true" >> $GITHUB_OUTPUT
+          else
+            echo "should_publish=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Install dependencies
+        if: steps.check_version.outputs.should_publish == 'true'
+        run: pnpm install
+
+      - name: Build package (with workspace dependencies)
+        if: steps.check_version.outputs.should_publish == 'true'
+        run: pnpm run build
+
+      - name: Run linter
+        if: steps.check_version.outputs.should_publish == 'true'
+        run: pnpm run lint
+
+      - name: Run typescript checker
+        if: steps.check_version.outputs.should_publish == 'true'
+        run: pnpm run tc
+
+      - name: Run tests
+        if: steps.check_version.outputs.should_publish == 'true'
+        run: pnpm run test
+
+      - name: Publish to npm
+        if: steps.check_version.outputs.should_publish == 'true'
+        run: pnpm publish --access public --no-git-checks
+
+        env:
+          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "promptl-ai",
-  "version": "0.6.3",
+  "version": "0.6.4",
   "author": "Latitude Data",
   "license": "MIT",
   "description": "Compiler for PromptL, the prompt language",
@@ -20,9 +20,7 @@
       }
     }
   },
-  "files": [
-    "dist"
-  ],
+  "files": ["dist"],
   "scripts": {
     "dev": "rollup -c -w",
     "build": "rollup -c",
diff --git a/rollup.config.mjs b/rollup.config.mjs
@@ -54,6 +54,7 @@ export default [
       'yaml',
       'crypto',
       'zod',
+      'fast-sha256',
     ],
   },
   {
diff --git a/src/compiler/base/nodes/tags/ref.ts b/src/compiler/base/nodes/tags/ref.ts
@@ -1,6 +1,6 @@
 import Scope from '$promptl/compiler/scope'
 import errors from '$promptl/error/errors'
-import parse from '$promptl/parser'
+import { parse } from '$promptl/parser'
 import { Fragment, ReferenceTag } from '$promptl/parser/interfaces'
 
 import { CompileNodeContext, TemplateNodeWithStatus } from '../../types'
diff --git a/src/compiler/chain.ts b/src/compiler/chain.ts
@@ -3,7 +3,7 @@ import {
   SerializedProps,
 } from '$promptl/compiler/deserializeChain'
 import { CHAIN_STEP_ISOLATED_ATTR } from '$promptl/constants'
-import parse from '$promptl/parser'
+import { parse } from '$promptl/parser'
 import { Fragment } from '$promptl/parser/interfaces'
 import {
   AdapterMessageType,
@@ -41,15 +41,14 @@ type BuildStepResponseContent = {
 export class Chain<M extends AdapterMessageType = Message> {
   public rawText: string
 
-  private compileOptions: CompileOptions
-  private ast: Fragment
-  private scope: Scope
-  private didStart: boolean = false
   private _completed: boolean = false
-
   private adapter: ProviderAdapter<M>
-  private globalMessages: Message[] = []
+  private ast: Fragment
+  private compileOptions: CompileOptions
+  private didStart: boolean = false
   private globalConfig: Config | undefined
+  private globalMessages: Message[] = []
+  private scope: Scope
   private wasLastStepIsolated: boolean = false
 
   static deserialize(args: SerializedProps) {
@@ -67,24 +66,21 @@ export class Chain<M extends AdapterMessageType = Message> {
     parameters?: Record<string, unknown>
     adapter?: ProviderAdapter<M>
     serialized?: {
-      ast: Fragment
-      scope: Scope
-      didStart: boolean
-      completed: boolean
-      globalConfig: Config | undefined
-      globalMessages: Message[]
+      ast?: Fragment
+      scope?: Scope
+      didStart?: boolean
+      completed?: boolean
+      globalConfig?: Config
+      globalMessages?: Message[]
     }
   } & CompileOptions) {
     this.rawText = prompt
-
-    // Init from a serialized chain
     this.ast = serialized?.ast ?? parse(prompt)
     this.scope = serialized?.scope ?? new Scope(parameters)
     this.didStart = serialized?.didStart ?? false
     this._completed = serialized?.completed ?? false
     this.globalConfig = serialized?.globalConfig
     this.globalMessages = serialized?.globalMessages ?? []
-
     this.adapter = adapter
     this.compileOptions = compileOptions
 
diff --git a/src/compiler/index.ts b/src/compiler/index.ts
@@ -10,6 +10,7 @@ import { z } from 'zod'
 import { Chain } from './chain'
 import { Scan } from './scan'
 import type { CompileOptions, Document, ReferencePromptFn } from './types'
+import { Fragment } from '$promptl/parser/interfaces'
 
 export async function render<M extends AdapterMessageType = Message>({
   prompt,
@@ -39,13 +40,15 @@ export function createChain({
 
 export function scan({
   prompt,
+  serialized,
   fullPath,
   referenceFn,
   withParameters,
   configSchema,
   requireConfig,
 }: {
   prompt: string
+  serialized?: Fragment
   fullPath?: string
   referenceFn?: ReferencePromptFn
   withParameters?: string[]
@@ -54,6 +57,7 @@ export function scan({
 }): Promise<ConversationMetadata> {
   return new Scan({
     document: { path: fullPath ?? '', content: prompt },
+    serialized,
     referenceFn,
     withParameters,
     configSchema,
diff --git a/src/compiler/scan.ts b/src/compiler/scan.ts
@@ -8,7 +8,7 @@ import {
 } from '$promptl/constants'
 import CompileError, { error } from '$promptl/error/error'
 import errors from '$promptl/error/errors'
-import parse from '$promptl/parser/index'
+import { parse } from '$promptl/parser/index'
 import type {
   Attribute,
   BaseNode,
@@ -70,20 +70,24 @@ export class Scan {
   private references: { [from: string]: string[] } = {}
   private referencedHashes: string[] = []
   private referenceDepth: number = 0
+  private serialized?: Fragment
 
   constructor({
     document,
     referenceFn,
     withParameters,
     configSchema,
     requireConfig,
+    serialized,
   }: {
     document: Document
     referenceFn?: ReferencePromptFn
     withParameters?: string[]
     configSchema?: z.ZodType
     requireConfig?: boolean
+    serialized?: Fragment
   }) {
+    this.serialized = serialized
     this.rawText = document.content
     this.referenceFn = referenceFn
     this.fullPath = document.path
@@ -107,7 +111,7 @@ export class Scan {
     let fragment: Fragment
 
     try {
-      fragment = parse(this.rawText)
+      fragment = this.serialized ?? parse(this.rawText)
     } catch (e) {
       const parseError = e as CompileError
       if (parseError instanceof CompileError) {
diff --git a/src/index.ts b/src/index.ts
@@ -1,5 +1,6 @@
 export * from './types'
 export * from './compiler'
+export * from './parser'
 export * from './providers'
 
 export { default as CompileError } from './error/error'
diff --git a/src/parser/index.ts b/src/parser/index.ts
@@ -7,7 +7,7 @@ import type { BaseNode, Fragment } from './interfaces'
 import fragment from './state/fragment'
 import fullCharCodeAt from './utils/full_char_code_at'
 
-export default function parse(template: string) {
+export function parse(template: string) {
   return new Parser(template).parse()
 }
 
diff --git a/src/parser/parser.test.ts b/src/parser/parser.test.ts
@@ -3,7 +3,7 @@ import CompileError from '$promptl/error/error'
 import { getExpectedError } from '$promptl/test/helpers'
 import { describe, expect, it } from 'vitest'
 
-import parse from '.'
+import { parse } from '.'
 import { TemplateNode } from './interfaces'
 
 describe('Fragment', async () => {
diff --git a/src/parser/state/mustache.test.ts b/src/parser/state/mustache.test.ts
@@ -2,7 +2,7 @@ import CompileError from '$promptl/error/error'
 import { getExpectedError } from '$promptl/test/helpers'
 import { describe, expect, it } from 'vitest'
 
-import parse from '..'
+import { parse } from '..'
 import { TemplateNode } from '../interfaces'
 
 describe('Mustache', async () => {
diff --git a/src/parser/state/text.ts b/src/parser/state/text.ts
@@ -32,56 +32,70 @@ export function text(parser: Parser) {
   const template = parser.template
   const len = template.length
 
+  // Track escape state with a boolean instead of recounting backslashes
+  let isEscaped = false
+
   while (parser.index < len) {
     const char = template[parser.index]
 
-    let isEscaping = false
-    let backslashCount = 0
-    for (let i = data.length - 1; i >= 0 && data[i] === '\\'; i--)
-      backslashCount++
-    isEscaping = backslashCount % 2 === 1
-    if (isEscaping) data = data.slice(0, -1)
-
-    if (!isEscaping) {
-      if (
-        char === '-' &&
-        template[parser.index + 1] === '-' &&
-        template[parser.index + 2] === '-' &&
-        template[parser.index + 3] !== '-'
-      ) {
-        break
-      }
+    // Handle backslash: toggle escape state and only add it if already escaped
+    if (char === '\\' && !isEscaped) {
+      isEscaped = true
+      parser.index++
+      continue
+    }
 
-      let delimiterMatched = false
-      for (const delim of RESERVED_DELIMITERS) {
-        if (template.startsWith(delim, parser.index)) {
-          delimiterMatched = true
-          break
-        }
-      }
-      if (delimiterMatched) break
+    // If we're in escaped mode, add the current character regardless of what it is
+    if (isEscaped) {
+      data += char
+      parser.index++
+      isEscaped = false
+      continue
+    }
+
+    // Check break conditions (only when not escaped)
+    if (
+      char === '-' &&
+      template[parser.index + 1] === '-' &&
+      template[parser.index + 2] === '-' &&
+      template[parser.index + 3] !== '-'
+    ) {
+      break
+    }
 
-      if (matchesReservedTag(template, parser.index)) break
+    let delimiterMatched = false
+    for (const delim of RESERVED_DELIMITERS) {
+      if (template.startsWith(delim, parser.index)) {
+        delimiterMatched = true
+        break
+      }
     }
+    if (delimiterMatched) break
+
+    if (matchesReservedTag(template, parser.index)) break
 
+    // Handle dashes more efficiently
     if (char === '-') {
       let dashEnd = parser.index + 1
       while (dashEnd < len && template[dashEnd] === '-') dashEnd++
       const dashCount = dashEnd - parser.index
       data += '-'.repeat(dashCount)
       parser.index = dashEnd
     } else {
+      // Normal character processing
       data += char
       parser.index++
     }
   }
 
+  // Create the text node with optimized data processing
+  // Since we're handling escape characters differently now, we can simplify this replacement
   const node = {
     start,
     end: parser.index,
     type: 'Text',
     raw: data,
-    data: data.replace(/(?<!\\)\\{{/g, '{{').replace(/(?<!\\)\\}}/g, '}}'),
+    data: data, // The escaping is already handled correctly during parsing
   } as Text
 
   parser.current().children!.push(node)

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "promptl-ai",`
`3`		`- "version": "0.6.3",`
	`3`	`+ "version": "0.6.4",`
`4`	`4`	`"author": "Latitude Data",`
`5`	`5`	`"license": "MIT",`
`6`	`6`	`"description": "Compiler for PromptL, the prompt language",`
`@@ -20,9 +20,7 @@`
`20`	`20`	`}`
`21`	`21`	`}`
`22`	`22`	`},`
`23`		`- "files": [`
`24`		`- "dist"`
`25`		`- ],`
	`23`	`+ "files": ["dist"],`
`26`	`24`	`"scripts": {`
`27`	`25`	`"dev": "rollup -c -w",`
`28`	`26`	`"build": "rollup -c",`
Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ export default [`
`54`	`54`	`'yaml',`
`55`	`55`	`'crypto',`
`56`	`56`	`'zod',`
	`57`	`+ 'fast-sha256',`
`57`	`58`	`],`
`58`	`59`	`},`
`59`	`60`	`{`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ import type { BaseNode, Fragment } from './interfaces'`
`7`	`7`	`import fragment from './state/fragment'`
`8`	`8`	`import fullCharCodeAt from './utils/full_char_code_at'`
`9`	`9`
`10`		`-export default function parse(template: string) {`
	`10`	`+export function parse(template: string) {`
`11`	`11`	`return new Parser(template).parse()`
`12`	`12`	`}`
`13`	`13`