honojs-middleware/packages/ua-blocker/src/escape.ts

// Character classes used by the `RegExp.escape` polyfill in this file. Each
// pattern is tested against a single UTF-16 code unit, hence the `^` anchors.
// The sets follow the ECMAScript `RegExp.escape` specification as implemented
// by core-js.

// ASCII digit or letter; only consulted for the very first character.
const FIRST_DIGIT_OR_ASCII = /^[0-9a-z]/i
// Regular-expression syntax characters plus the solidus. These are the
// characters that must be neutralised with a leading backslash.
const SYNTAX_SOLIDUS = /^[$()*+./?[\\\]^{|}]/
// Remaining ASCII punctuators (including `%` and `-`) and everything `\s`
// matches, i.e. whitespace and line terminators.
const OTHER_PUNCTUATORS_AND_WHITESPACES = /^[!"#%&',\-:;<=>@`~\s]/

// Characters that `$escape` emits as a backslash followed by the mapped
// character. Only sequences that keep the same meaning everywhere in a pattern
// (and under every flag) are listed:
// - NUL is not listed: `\0` followed by a digit is read as an octal escape, or
//   rejected outright in Unicode mode.
// - Backspace is not listed: outside a character class `\b` is the
//   word-boundary assertion, not a literal backspace.
// - Quotes are not listed: `\"` and `\'` are invalid identity escapes when the
//   pattern is compiled with the `u` or `v` flag.
const ControlEscape: { [key: string]: string } = {
  '\t': 't', // Horizontal tab
  '\n': 'n', // Line feed (new line)
  '\v': 'v', // Vertical tab
  '\f': 'f', // Form feed
  '\r': 'r', // Carriage return
  '\\': '\\', // Backslash (kept here so it is always emitted as `\\`)
}

/**
 * Encodes one UTF-16 code unit as a hexadecimal regular-expression escape.
 *
 * Code units up to 0xFF become `\xHH`; everything above (including lone
 * surrogates) becomes `\uHHHH`. Both forms are valid with and without the
 * `u`/`v` flags and always match exactly the original code unit. Callers can
 * therefore use this for any character that must not appear verbatim, such as
 * a leading letter or digit that could otherwise extend a preceding escape.
 *
 * @param char - String whose first code unit is encoded.
 * @returns The escape sequence, or an empty string when `char` is empty.
 */
function _escapeChar(char: string): string {
  if (char.length === 0) {
    // Nothing to encode; avoids formatting NaN from charCodeAt(0).
    return ''
  }
  const hex = char.charCodeAt(0).toString(16)
  return hex.length < 3 ? '\\x' + hex.padStart(2, '0') : '\\u' + hex.padStart(4, '0')
}

/**
 * Fallback implementation of `RegExp.escape`, modelled on the core-js polyfill.
 *
 * The input is walked one UTF-16 code unit at a time. Each unit is either
 * copied verbatim, prefixed with a backslash, or handed to `_escapeChar`.
 * A well-formed surrogate pair is copied as a whole.
 *
 * @param S - Text to be embedded literally in a regular expression.
 * @returns The escaped text; an empty string for empty input.
 */
function $escape(S: string): string {
  const pieces: string[] = []
  let pos = 0

  while (pos < S.length) {
    const unit = S.charAt(pos)
    const code = unit.charCodeAt(0)

    if (pos === 0 && FIRST_DIGIT_OR_ASCII.test(unit)) {
      // Leading ASCII letter or digit.
      pieces.push(_escapeChar(unit))
    } else if (Object.prototype.hasOwnProperty.call(ControlEscape, unit)) {
      // Has a dedicated single-character escape.
      pieces.push('\\' + ControlEscape[unit])
    } else if (SYNTAX_SOLIDUS.test(unit)) {
      pieces.push('\\' + unit)
    } else if (OTHER_PUNCTUATORS_AND_WHITESPACES.test(unit)) {
      pieces.push(_escapeChar(unit))
    } else if ((code & 0xf800) !== 0xd800) {
      // Ordinary code unit outside the surrogate range.
      pieces.push(unit)
    } else if (
      code < 0xdc00 &&
      pos + 1 < S.length &&
      (S.charCodeAt(pos + 1) & 0xfc00) === 0xdc00
    ) {
      // High surrogate directly followed by a low surrogate: keep the pair
      // intact and skip over its second half.
      pieces.push(unit, S.charAt(pos + 1))
      pos++
    } else {
      // Lone surrogate (low without a high, or high without a low).
      pieces.push(_escapeChar(unit))
    }

    pos++
  }

  return pieces.join('')
}

/**
 * Minimal typing for a static `escape` member: takes a string and returns a
 * string. Used in this file to type the lookup of `RegExp.escape`.
 */
interface RegExpConstructor {
  escape: (str: string) => string
}

// Use the runtime's own `RegExp.escape` when it exists; otherwise fall back to
// the local `$escape` implementation.
//@ts-expect-error: RegExp.escape is only included in Node.js v24+ and Deno v2.3+
const builtinEscape = RegExp.escape as RegExpConstructor['escape'] | undefined

export const escape = builtinEscape ? builtinEscape : $escape
feat: introduce UA Blocker Middleware (#1181) * feat: create AI bot blocker middleware * chore: add changeset * fix: uppercase function called too late * chore: don't version-control robots.json * chore: track `robots.json` * ci: add `@hono/ai-robots-txt` workflow script * fix: change initial version * feat: add automatic `robots.json` sync from upstream * feat!: change package name and architecture * refactor(ua-blocker): prebuild compiled regex (#1) * fix: add json data files to tsconfig * chore: rename workflow files * fix: test if string _contains_ "Yes" It might be a markdown link, so not the exact string, but "[Yes](<link>)" * fix: tests reflect the fixed "Yes" check * feat: move generator back to prebuild generated regex should use the version-controled robots.json, not directly the upstream file * chore: add .zed * chore: remove unused files * fix: properly setup workspace before running scripts * chore: remove `prebuild` script from `build`, `typecheck`, and `test` * chore: run `getrobotstxt` and `prebuild` * fix: export `RegExp`s, not `string[]`s * chore: mention RegExp and uppercase matching in docs * fix: adapt tests to regex exports * chore: add tests for direct regex passing * chore: format code --------- Co-authored-by: Jonathan Haines <jonno.haines@gmail.com> 2025-06-14 05:00:42 +08:00			`// Constants derived from typical core-js patterns for string escaping`
			`// (e.g., related to JS_STRING_ESCAPE or similar proposals)`
			`const FIRST_DIGIT_OR_ASCII = /^[0-9A-Za-z]/`
			`const SYNTAX_SOLIDUS = /[/]/`
			const OTHER_PUNCTUATORS_AND_WHITESPACES = /[!"#$&'()*+,./:;<=>?@[\]^`{\|}~\s]/

			`// Control characters to escape, mapping to their escape sequence character`
			`const ControlEscape: { [key: string]: string } = {`
			`'\0': '0', // Null character`
			`'\b': 'b', // Backspace`
			`'\t': 't', // Horizontal tab`
			`'\n': 'n', // Line feed (new line)`
			`'\v': 'v', // Vertical tab`
			`'\f': 'f', // Form feed`
			`'\r': 'r', // Carriage return`
			`'"': '"', // Double quote`
			`"'": "'", // Single quote`
			`'\\': '\\', // Backslash`
			`}`

			`/**`
			`* Escapes a character for use in a string literal.`
			`* Primarily handles:`
			`* - Control characters (0x00-0x1F) -> \u00XX`
			`* - Unpaired surrogates -> \uDXXX`
			`* Other characters are returned as is, as their escaping is handled by`
			`* ControlEscape, SYNTAX_SOLIDUS, or they are passed through.`
			`*/`
			`function _escapeChar(char: string): string {`
			`const charCode = char.charCodeAt(0)`

			`if (charCode < 0x20) {`
			`// ASCII Control characters`
			`return '\\u' + charCode.toString(16).padStart(4, '0')`
			`}`
			`if (charCode >= 0xd800 && charCode <= 0xdfff) {`
			`// Surrogates (typically unpaired if passed to this function directly)`
			`return '\\u' + charCode.toString(16).padStart(4, '0')`
			`}`
			`// For other characters (printable ASCII, other Unicode symbols) that might`
			`// be passed via FIRST_DIGIT_OR_ASCII or OTHER_PUNCTUATORS_AND_WHITESPACES,`
			`// they should remain themselves if they don't require \uXXXX escaping.`
			`return char`
			`}`

			`/**`
			* `RegExp.escape` polyfill inspired by core-js
			`*/`
			`function $escape(S: string): string {`
			`const length = S.length`
			`if (length === 0) {`
			`return ''`
			`}`
			`// Pre-allocate array; join is efficient for this.`
			`const result = new Array<string>(length)`

			`for (let i = 0; i < length; i++) {`
			`const chr = S.charAt(i)`

			`if (i === 0 && FIRST_DIGIT_OR_ASCII.exec(chr)) {`
			`result[i] = _escapeChar(chr)`
			`} else if (Object.prototype.hasOwnProperty.call(ControlEscape, chr)) {`
			`result[i] = '\\' + ControlEscape[chr]`
			`} else if (SYNTAX_SOLIDUS.exec(chr)) {`
			`result[i] = '\\' + chr`
			`} else if (OTHER_PUNCTUATORS_AND_WHITESPACES.exec(chr)) {`
			`result[i] = _escapeChar(chr)`
			`} else {`
			`const charCode = chr.charCodeAt(0)`
			`// Single UTF-16 code unit (non-surrogate)`
			`if ((charCode & 0xf800) !== 0xd800) {`
			`result[i] = chr`
			`}`
			`// Unpaired surrogate`
			`else if (`
			`charCode >= 0xdc00 \|\| // Low surrogate (implies it's unpaired if we are here)`
			`i + 1 >= length \|\| // High surrogate at end of string`
			`(S.charCodeAt(i + 1) & 0xfc00) !== 0xdc00 // High surrogate not followed by low surrogate`
			`) {`
			`result[i] = _escapeChar(chr)`
			`}`
			`// Surrogate pair`
			`else {`
			`result[i] = chr // High surrogate`
			`i++ // Advance to include the low surrogate`
			`result[i] = S.charAt(i) // Low surrogate`
			`}`
			`}`
			`}`

			`return result.join('')`
			`}`

			`interface RegExpConstructor {`
			`escape(str: string): string`
			`}`

			`//@ts-expect-error: RegExp.escape is only included in Node.js v24+ and Deno v2.3+`
			`export const escape = (RegExp.escape as RegExpConstructor['escape']) \|\| $escape`