From 47cb4c85693ebe77d3b8b55f6e1e59778f671a15 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 16 Jul 2025 12:16:43 +0900 Subject: [PATCH] chore(ua-blocker): update robots.json from upstream (#1309) Co-authored-by: yusukebe <10682+yusukebe@users.noreply.github.com> --- .changeset/auto-sync-robots.md | 5 +++++ packages/ua-blocker/src/data/robots.json | 7 +++++++ packages/ua-blocker/src/generated.ts | 9 +++++---- 3 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 .changeset/auto-sync-robots.md diff --git a/.changeset/auto-sync-robots.md b/.changeset/auto-sync-robots.md new file mode 100644 index 00000000..daf4c74d --- /dev/null +++ b/.changeset/auto-sync-robots.md @@ -0,0 +1,5 @@ +--- +'@hono/ua-blocker': patch +--- + +chore(ua-blocker): sync `robots.json` with upstream diff --git a/packages/ua-blocker/src/data/robots.json b/packages/ua-blocker/src/data/robots.json index 2453a4eb..17bc870d 100644 --- a/packages/ua-blocker/src/data/robots.json +++ b/packages/ua-blocker/src/data/robots.json @@ -251,6 +251,13 @@ "frequency": "No information.", "description": "Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search." }, + "GoogleAgent-Mariner": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Agents", + "frequency": "Unclear at this time.", + "description": "GoogleAgent-Mariner is an AI agent created by Google that can use a web browser. It can intelligently navigate and interact with websites to complete multi-step tasks on behalf of a human user. More info can be found at https://darkvisitors.com/agents/agents/googleagent-mariner" + }, "GoogleOther": { "description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\"", "frequency": "No information.", diff --git a/packages/ua-blocker/src/generated.ts b/packages/ua-blocker/src/generated.ts index fb5f14c2..6a80922a 100644 --- a/packages/ua-blocker/src/generated.ts +++ b/packages/ua-blocker/src/generated.ts @@ -35,6 +35,7 @@ User-agent: FriendlyCrawler User-agent: Gemini-Deep-Research User-agent: Google-CloudVertexBot User-agent: Google-Extended +User-agent: GoogleAgent-Mariner User-agent: GoogleOther User-agent: GoogleOther-Image User-agent: GoogleOther-Video @@ -87,7 +88,7 @@ User-agent: YandexAdditionalBot User-agent: YouBot Disallow: / `; -export const ALL_BOTS = ["AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "Amazonbot", "Andibot", "anthropic-ai", "Applebot", "Applebot-Extended", "Awario", "bedrockbot", "Brightbot 1.0", "Bytespider", "CCBot", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "FacebookBot", "facebookexternalhit", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Gemini-Deep-Research", "Google-CloudVertexBot", "Google-Extended", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MistralAI-User/1.0", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot-OCOB", "SemrushBot-SWA", "Sidetrade indexer bot", "SummalyBot", "Thinkbot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "WARDBot", "Webzio-Extended", "wpbot", "YandexAdditional", "YandexAdditionalBot", "YouBot"]; -export const NON_RESPECTING_BOTS = ["Andibot", "anthropic-ai", "Applebot", "Awario", "Brightbot 1.0", "Bytespider", "Claude-Web", "cohere-ai", "cohere-training-data-crawler", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "facebookexternalhit", "Factset_spyderbot", "Gemini-Deep-Research", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "SummalyBot", "Thinkbot", "TikTokSpider", "Timpibot", "WARDBot", "Webzio-Extended", "wpbot"]; -export const ALL_BOTS_REGEX = /(AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBOT|ANDIBOT|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|AWARIO|BEDROCKBOT|BRIGHTBOT 1.0|BYTESPIDER|CCBOT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GEMINI-DEEP-RESEARCH|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT-OCOB|SEMRUSHBOT-SWA|SIDETRADE INDEXER BOT|SUMMALYBOT|THINKBOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WARDBOT|WEBZIO-EXTENDED|WPBOT|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/; -export const NON_RESPECTING_BOTS_REGEX = /(ANDIBOT|ANTHROPIC-AI|APPLEBOT|AWARIO|BRIGHTBOT 1.0|BYTESPIDER|CLAUDE-WEB|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|GEMINI-DEEP-RESEARCH|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|SUMMALYBOT|THINKBOT|TIKTOKSPIDER|TIMPIBOT|WARDBOT|WEBZIO-EXTENDED|WPBOT)/; +export const ALL_BOTS = ["AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "Amazonbot", "Andibot", "anthropic-ai", "Applebot", "Applebot-Extended", "Awario", "bedrockbot", "Brightbot 1.0", "Bytespider", "CCBot", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "FacebookBot", "facebookexternalhit", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Gemini-Deep-Research", "Google-CloudVertexBot", "Google-Extended", "GoogleAgent-Mariner", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MistralAI-User/1.0", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot-OCOB", "SemrushBot-SWA", "Sidetrade indexer bot", "SummalyBot", "Thinkbot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "WARDBot", "Webzio-Extended", "wpbot", "YandexAdditional", "YandexAdditionalBot", "YouBot"]; +export const NON_RESPECTING_BOTS = ["Andibot", "anthropic-ai", "Applebot", "Awario", "Brightbot 1.0", "Bytespider", "Claude-Web", "cohere-ai", "cohere-training-data-crawler", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "facebookexternalhit", "Factset_spyderbot", "Gemini-Deep-Research", "GoogleAgent-Mariner", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "SummalyBot", "Thinkbot", "TikTokSpider", "Timpibot", "WARDBot", "Webzio-Extended", "wpbot"]; +export const ALL_BOTS_REGEX = /(AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBOT|ANDIBOT|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|AWARIO|BEDROCKBOT|BRIGHTBOT 1.0|BYTESPIDER|CCBOT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GEMINI-DEEP-RESEARCH|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLEAGENT-MARINER|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT-OCOB|SEMRUSHBOT-SWA|SIDETRADE INDEXER BOT|SUMMALYBOT|THINKBOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WARDBOT|WEBZIO-EXTENDED|WPBOT|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/; +export const NON_RESPECTING_BOTS_REGEX = /(ANDIBOT|ANTHROPIC-AI|APPLEBOT|AWARIO|BRIGHTBOT 1.0|BYTESPIDER|CLAUDE-WEB|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|GEMINI-DEEP-RESEARCH|GOOGLEAGENT-MARINER|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|SUMMALYBOT|THINKBOT|TIKTOKSPIDER|TIMPIBOT|WARDBOT|WEBZIO-EXTENDED|WPBOT)/;