// honojs-middleware/packages/ua-blocker/src/ai-bots.test.ts

import { Hono } from 'hono'
import { aiBots, nonRespectingAiBots, AI_ROBOTS_TXT, useAiRobotsTxt } from './ai-bots'
import {
  ALL_BOTS,
  NON_RESPECTING_BOTS,
  ROBOTS_TXT,
  ALL_BOTS_REGEX,
  NON_RESPECTING_BOTS_REGEX,
} from './generated'

describe('AI Bots module', () => {
  // Checks on the `aiBots` export: identity with the generated regex,
  // basic RegExp shape, and matching of a few upper-cased agent names.
  describe('aiBots export', () => {
    it('Should export ALL_BOTS_REGEX from generated', () => {
      // `toBe` is an identity check: the export must be the generated object itself.
      expect(aiBots).toBe(ALL_BOTS_REGEX)
    })

    it('Should be a RegExp object', () => {
      const isRegExp = aiBots instanceof RegExp
      expect(isRegExp).toBe(true)

      const patternLength = aiBots.source.length
      expect(patternLength).toBeGreaterThan(0)

      // The string form must look like a slash-delimited literal.
      const literalForm = aiBots.toString()
      expect(literalForm).toMatch(/^\/.*\/$/)
    })

    it('Should include known AI bots', () => {
      // Agent names are probed in upper case, exactly as listed here.
      const knownAgents = ['GPTBOT', 'CLAUDEBOT', 'BYTESPIDER', 'CHATGPT-USER']
      for (const agent of knownAgents) {
        expect(aiBots.test(agent)).toBe(true)
      }
    })

    it('Should be properly formatted as a regex', () => {
      // The pattern is expected to contain at least one alternation bar.
      const pattern = aiBots.source
      expect(pattern).toContain('|')
    })
  })

  // Checks on the `nonRespectingAiBots` export: identity with the generated
  // regex, RegExp shape, and how it relates to the broader `aiBots` pattern.
  describe('nonRespectingAiBots export', () => {
    it('Should export NON_RESPECTING_BOTS_REGEX from generated', () => {
      expect(nonRespectingAiBots).toBe(NON_RESPECTING_BOTS_REGEX)
    })

    it('Should be a RegExp object', () => {
      const isRegExp = nonRespectingAiBots instanceof RegExp
      expect(isRegExp).toBe(true)

      const patternLength = nonRespectingAiBots.source.length
      expect(patternLength).toBeGreaterThan(0)

      const literalForm = nonRespectingAiBots.toString()
      expect(literalForm).toMatch(/^\/.*\/$/)
    })

    it('Should be a subset of aiBots', () => {
      // Every name in the NON_RESPECTING_BOTS list, once upper-cased,
      // must be matched by the aiBots pattern.
      for (const bot of NON_RESPECTING_BOTS) {
        const upperCased = bot.toUpperCase()
        expect(aiBots.test(upperCased)).toBe(true)
      }
    })

    it('Should include known non-respecting bots', () => {
      const expectedMatches = ['BYTESPIDER', 'IASKSPIDER/2.0']
      for (const agent of expectedMatches) {
        expect(nonRespectingAiBots.test(agent)).toBe(true)
      }
    })

    it('Should not include known respecting bots', () => {
      const expectedMisses = ['GPTBOT', 'CHATGPT-USER']
      for (const agent of expectedMisses) {
        expect(nonRespectingAiBots.test(agent)).toBe(false)
      }
    })

    it('Should have a pattern that is shorter than aiBots pattern', () => {
      const fullLength = aiBots.source.length
      const subsetLength = nonRespectingAiBots.source.length
      expect(subsetLength).toBeLessThan(fullLength)
    })
  })

  // Checks on the `AI_ROBOTS_TXT` export: identity with the generated text
  // and the line-level layout of the robots.txt content.
  describe('AI_ROBOTS_TXT export', () => {
    it('Should export ROBOTS_TXT from generated', () => {
      expect(AI_ROBOTS_TXT).toBe(ROBOTS_TXT)
    })

    it('Should be a non-empty string', () => {
      const kind = typeof AI_ROBOTS_TXT
      expect(kind).toBe('string')
      expect(AI_ROBOTS_TXT.length).toBeGreaterThan(0)
    })

    it('Should contain User-agent directives', () => {
      expect(AI_ROBOTS_TXT).toContain('User-agent:')
    })

    it('Should contain Disallow directive', () => {
      expect(AI_ROBOTS_TXT).toContain('Disallow: /')
    })

    it('Should have proper robots.txt format', () => {
      const rows = AI_ROBOTS_TXT.split('\n')

      // At least one User-agent row must exist.
      const agentRowCount = rows.filter((row) => row.startsWith('User-agent:')).length
      expect(agentRowCount).toBeGreaterThan(0)

      // A row that is exactly the blanket Disallow must exist and must not be the first row.
      const disallowPosition = rows.indexOf('Disallow: /')
      expect(disallowPosition).toBeGreaterThan(0)

      // The text ends with a newline, so the final split element is empty.
      const [finalRow] = rows.slice(-1)
      expect(finalRow).toBe('')
    })

    it('Should include all AI bots', () => {
      // Each listed bot needs its own User-agent entry in the text.
      for (const bot of ALL_BOTS) {
        expect(AI_ROBOTS_TXT).toContain(`User-agent: ${bot}`)
      }
    })
  })

  // Checks on `useAiRobotsTxt()`: it yields a middleware function which,
  // once mounted on a Hono app, answers with the AI_ROBOTS_TXT body.
  describe('useAiRobotsTxt function', () => {
    // Builds a fresh Hono app with the robots middleware mounted on each given path.
    const mountRobots = (...paths: string[]) => {
      const app = new Hono()
      for (const path of paths) {
        app.use(path, useAiRobotsTxt())
      }
      return app
    }

    it('Should return a function', () => {
      const handler = useAiRobotsTxt()
      expect(typeof handler).toBe('function')
    })

    it('Should create working Hono middleware', async () => {
      const app = mountRobots('/robots.txt')

      const response = await app.request('/robots.txt')
      expect(response.status).toBe(200)
    })

    it('Should serve the correct robots.txt content', async () => {
      const app = mountRobots('/robots.txt')

      const response = await app.request('/robots.txt')
      const body = await response.text()

      expect(body).toBe(AI_ROBOTS_TXT)
    })

    it('Should set correct content type', async () => {
      const app = mountRobots('/robots.txt')

      const response = await app.request('/robots.txt')
      const contentType = response.headers.get('Content-Type')
      expect(contentType).toBe('text/plain; charset=UTF-8')
    })

    it('Should work with different paths', async () => {
      const app = mountRobots('/custom-robots.txt', '/api/robots.txt')

      const customRes = await app.request('/custom-robots.txt')
      const nestedRes = await app.request('/api/robots.txt')

      expect(customRes.status).toBe(200)
      expect(nestedRes.status).toBe(200)
      expect(await customRes.text()).toBe(AI_ROBOTS_TXT)
      expect(await nestedRes.text()).toBe(AI_ROBOTS_TXT)
    })

    it('Should not interfere with other routes', async () => {
      const app = mountRobots('/robots.txt')
      app.get('/other', (c) => c.text('other content'))
      app.get('/api/data', (c) => c.json({ data: 'test' }))

      // Issue all three requests first, then inspect each response.
      const robotsResponse = await app.request('/robots.txt')
      const textResponse = await app.request('/other')
      const jsonResponse = await app.request('/api/data')

      expect(robotsResponse.status).toBe(200)
      expect(await robotsResponse.text()).toBe(AI_ROBOTS_TXT)

      expect(textResponse.status).toBe(200)
      expect(await textResponse.text()).toBe('other content')

      expect(jsonResponse.status).toBe(200)
      expect(await jsonResponse.json()).toEqual({ data: 'test' })
    })

    it('Should handle requests with different methods', async () => {
      const app = mountRobots('/robots.txt')
      const target = '/robots.txt'

      // GET: status and body are checked.
      const viaGet = await app.request(target, { method: 'GET' })
      expect(viaGet.status).toBe(200)
      expect(await viaGet.text()).toBe(AI_ROBOTS_TXT)

      // HEAD: only status and content type are checked.
      const viaHead = await app.request(target, { method: 'HEAD' })
      expect(viaHead.status).toBe(200)
      expect(viaHead.headers.get('Content-Type')).toBe('text/plain; charset=UTF-8')

      // POST: the handler is mounted with `app.use`, so it is expected to answer here as well.
      const viaPost = await app.request(target, { method: 'POST' })
      expect(viaPost.status).toBe(200)
      expect(await viaPost.text()).toBe(AI_ROBOTS_TXT)
    })

    it('Should serve consistent content across multiple requests', async () => {
      const app = mountRobots('/robots.txt')

      // Start five requests up front, then await them together.
      const pending = Array.from({ length: 5 }, () => app.request('/robots.txt'))
      const settled = await Promise.all(pending)
      const bodies = await Promise.all(settled.map((response) => response.text()))

      for (const response of settled) {
        expect(response.status).toBe(200)
        expect(response.headers.get('Content-Type')).toBe('text/plain; charset=UTF-8')
      }

      for (const body of bodies) {
        expect(body).toBe(AI_ROBOTS_TXT)
      }

      // Collapsing the bodies into a Set must leave exactly one distinct value.
      const distinctBodies = new Set(bodies)
      expect(distinctBodies.size).toBe(1)
    })
  })

  // Exercises the robots.txt middleware side by side with `uaBlocker`,
  // in both registration orders.
  describe('Integration tests', () => {
    it('Should work together with uaBlocker middleware', async () => {
      // `uaBlocker` is loaded lazily from the package entry point.
      const { uaBlocker } = await import('./index')

      // robots.txt is registered first, then the blocker on every path.
      const app = new Hono()
      app.use('/robots.txt', useAiRobotsTxt())
      app.use('*', uaBlocker({ blocklist: nonRespectingAiBots }))
      app.get('/', (c) => c.text('Hello World'))

      // robots.txt is served with the expected body.
      const robotsResponse = await app.request('/robots.txt')
      expect(robotsResponse.status).toBe(200)
      expect(await robotsResponse.text()).toBe(AI_ROBOTS_TXT)

      // A user agent on the non-respecting list is rejected with 403.
      const bytespiderHeaders = { 'User-Agent': 'Bytespider' }
      const rejected = await app.request('/', { headers: bytespiderHeaders })
      expect(rejected.status).toBe(403)

      // A user agent outside the non-respecting list reaches the route handler.
      const gptBotHeaders = { 'User-Agent': 'GPTBot' }
      const accepted = await app.request('/', { headers: gptBotHeaders })
      expect(accepted.status).toBe(200)
      expect(await accepted.text()).toBe('Hello World')
    })

    it('Should work with demo pattern', async () => {
      const { uaBlocker } = await import('./index')

      // Same registration order as demo.ts: blocker first, robots.txt second.
      const app = new Hono()
      app.use('*', uaBlocker({ blocklist: nonRespectingAiBots }))
      app.use('/robots.txt', useAiRobotsTxt())
      app.get('/', (c) => c.text('Hello World'))

      const robotsResponse = await app.request('/robots.txt')
      expect(robotsResponse.status).toBe(200)
      expect(robotsResponse.headers.get('Content-Type')).toBe('text/plain; charset=UTF-8')

      // A request that sets no User-Agent header still gets the home page.
      const homeResponse = await app.request('/')
      expect(homeResponse.status).toBe(200)
      expect(await homeResponse.text()).toBe('Hello World')
    })
  })

  // Cross-checks the generated robots.txt text against the generated bot list.
  describe('Data consistency validation', () => {
    it('Should have robots.txt that matches bot lists', () => {
      // Extract the agent name from every `User-agent:` line of the text.
      const userAgentLines = AI_ROBOTS_TXT.split('\n')
        .filter((line) => line.startsWith('User-agent:'))
        .map((line) => line.replace('User-agent: ', ''))

      // `Array.prototype.sort` sorts in place. ALL_BOTS is a shared module-level
      // array, so sort a copy instead of reordering it for every other consumer.
      // `userAgentLines` is a fresh local array and is safe to sort directly.
      const expectedBots = [...ALL_BOTS].sort()

      expect(userAgentLines.sort()).toEqual(expectedBots)
    })
  })
})