From 02a703cffadc906d2eb83ada9aa7c4bf1950fd28 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 4 Jul 2025 18:13:57 +0900 Subject: [PATCH] chore(ua-blocker): update robots.json from upstream (#1280) Co-authored-by: yusukebe <10682+yusukebe@users.noreply.github.com> --- .changeset/auto-sync-robots.md | 5 +++ packages/ua-blocker/src/data/robots.json | 48 +++++++++--------------- packages/ua-blocker/src/generated.ts | 14 +++---- 3 files changed, 28 insertions(+), 39 deletions(-) create mode 100644 .changeset/auto-sync-robots.md diff --git a/.changeset/auto-sync-robots.md b/.changeset/auto-sync-robots.md new file mode 100644 index 00000000..daf4c74d --- /dev/null +++ b/.changeset/auto-sync-robots.md @@ -0,0 +1,5 @@ +--- +'@hono/ua-blocker': patch +--- + +chore(ua-blocker): sync `robots.json` with upstream diff --git a/packages/ua-blocker/src/data/robots.json b/packages/ua-blocker/src/data/robots.json index b0334dd3..fc0925a6 100644 --- a/packages/ua-blocker/src/data/robots.json +++ b/packages/ua-blocker/src/data/robots.json @@ -230,6 +230,13 @@ "operator": "Unknown", "respect": "[Yes](https://imho.alex-kunz.com/2024/01/25/an-update-on-friendly-crawler)" }, + "Gemini-Deep-Research": { + "operator": "Unclear at this time.", + "respect": "Unclear at this time.", + "function": "AI Assistants", + "frequency": "Unclear at this time.", + "description": "Gemini-Deep-Research is the agent responsible for collecting and scanning resources used in Google Gemini's Deep Research feature, which acts as a personal research assistant. More info can be found at https://darkvisitors.com/agents/agents/gemini-deep-research" + }, "Google-CloudVertexBot": { "operator": "Google", "respect": "[Yes](https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers)", @@ -363,6 +370,13 @@ "frequency": "Unclear at this time.", "description": "Operator and data use is unclear at this time." }, + "netEstate Imprint Crawler": { + "operator": "netEstate", + "respect": "Unclear at this time.", + "function": "AI Data Scrapers", + "frequency": "Unclear at this time.", + "description": "netEstate Imprint Crawler is an AI data scraper operated by netEstate. If you think this is incorrect or can provide additional detail about its purpose, please contact us. More info can be found at https://darkvisitors.com/agents/agents/netestate-imprint-crawler" + }, "NovaAct": { "operator": "Unclear at this time.", "respect": "Unclear at this time.", @@ -489,47 +503,19 @@ "operator": "[Zyte](https://www.zyte.com)", "respect": "Unclear at this time." }, - "SemrushBot": { - "operator": "[Semrush](https://www.semrush.com/)", - "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Crawls your site for ContentShake AI tool.", - "frequency": "Roughly once every 10 seconds.", - "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." - }, - "SemrushBot-BA": { - "operator": "[Semrush](https://www.semrush.com/)", - "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Crawls your site for ContentShake AI tool.", - "frequency": "Roughly once every 10 seconds.", - "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." - }, - "SemrushBot-CT": { - "operator": "[Semrush](https://www.semrush.com/)", - "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Crawls your site for ContentShake AI tool.", - "frequency": "Roughly once every 10 seconds.", - "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." - }, "SemrushBot-OCOB": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", "function": "Crawls your site for ContentShake AI tool.", "frequency": "Roughly once every 10 seconds.", - "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." - }, - "SemrushBot-SI": { - "operator": "[Semrush](https://www.semrush.com/)", - "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Crawls your site for ContentShake AI tool.", - "frequency": "Roughly once every 10 seconds.", - "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + "description": "Data collected is used for the ContentShake AI tool reports." }, "SemrushBot-SWA": { "operator": "[Semrush](https://www.semrush.com/)", "respect": "[Yes](https://www.semrush.com/bot/)", - "function": "Checks URLs on your site for SWA tool.", + "function": "Checks URLs on your site for SEO Writing Assistant.", "frequency": "Roughly once every 10 seconds.", - "description": "You enter one text (on-demand) and we will make suggestions on it (the tool uses AI but we are not actively crawling the web, you need to manually enter one text/URL)." + "description": "Data collected is used for the SEO Writing Assistant tool to check if URL is accessible." }, "Sidetrade indexer bot": { "description": "AI product training.", diff --git a/packages/ua-blocker/src/generated.ts b/packages/ua-blocker/src/generated.ts index 370191fe..d4d0144d 100644 --- a/packages/ua-blocker/src/generated.ts +++ b/packages/ua-blocker/src/generated.ts @@ -32,6 +32,7 @@ User-agent: facebookexternalhit User-agent: Factset_spyderbot User-agent: FirecrawlAgent User-agent: FriendlyCrawler +User-agent: Gemini-Deep-Research User-agent: Google-CloudVertexBot User-agent: Google-Extended User-agent: GoogleOther @@ -51,6 +52,7 @@ User-agent: Meta-ExternalFetcher User-agent: MistralAI-User User-agent: MistralAI-User/1.0 User-agent: MyCentralAIScraperBot +User-agent: netEstate Imprint Crawler User-agent: NovaAct User-agent: OAI-SearchBot User-agent: omgili @@ -69,11 +71,7 @@ User-agent: QuillBot User-agent: quillbot.com User-agent: SBIntuitionsBot User-agent: Scrapy -User-agent: SemrushBot -User-agent: SemrushBot-BA -User-agent: SemrushBot-CT User-agent: SemrushBot-OCOB -User-agent: SemrushBot-SI User-agent: SemrushBot-SWA User-agent: Sidetrade indexer bot User-agent: TikTokSpider @@ -87,7 +85,7 @@ User-agent: YandexAdditionalBot User-agent: YouBot Disallow: / `; -export const ALL_BOTS = ["AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "Amazonbot", "Andibot", "anthropic-ai", "Applebot", "Applebot-Extended", "Awario", "bedrockbot", "Brightbot 1.0", "Bytespider", "CCBot", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "FacebookBot", "facebookexternalhit", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Google-CloudVertexBot", "Google-Extended", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MistralAI-User/1.0", "MyCentralAIScraperBot", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot", "SemrushBot-BA", "SemrushBot-CT", "SemrushBot-OCOB", "SemrushBot-SI", "SemrushBot-SWA", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "WARDBot", "Webzio-Extended", "wpbot", "YandexAdditional", "YandexAdditionalBot", "YouBot"]; -export const NON_RESPECTING_BOTS = ["Andibot", "anthropic-ai", "Applebot", "Awario", "Brightbot 1.0", "Bytespider", "Claude-Web", "cohere-ai", "cohere-training-data-crawler", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "facebookexternalhit", "Factset_spyderbot", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MyCentralAIScraperBot", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "WARDBot", "Webzio-Extended", "wpbot"]; -export const ALL_BOTS_REGEX = /(AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBOT|ANDIBOT|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|AWARIO|BEDROCKBOT|BRIGHTBOT 1.0|BYTESPIDER|CCBOT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT|SEMRUSHBOT-BA|SEMRUSHBOT-CT|SEMRUSHBOT-OCOB|SEMRUSHBOT-SI|SEMRUSHBOT-SWA|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WARDBOT|WEBZIO-EXTENDED|WPBOT|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/; -export const NON_RESPECTING_BOTS_REGEX = /(ANDIBOT|ANTHROPIC-AI|APPLEBOT|AWARIO|BRIGHTBOT 1.0|BYTESPIDER|CLAUDE-WEB|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MYCENTRALAISCRAPERBOT|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|WARDBOT|WEBZIO-EXTENDED|WPBOT)/; +export const ALL_BOTS = ["AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "Amazonbot", "Andibot", "anthropic-ai", "Applebot", "Applebot-Extended", "Awario", "bedrockbot", "Brightbot 1.0", "Bytespider", "CCBot", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "FacebookBot", "facebookexternalhit", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Gemini-Deep-Research", "Google-CloudVertexBot", "Google-Extended", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MistralAI-User/1.0", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot-OCOB", "SemrushBot-SWA", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "WARDBot", "Webzio-Extended", "wpbot", "YandexAdditional", "YandexAdditionalBot", "YouBot"]; +export const NON_RESPECTING_BOTS = ["Andibot", "anthropic-ai", "Applebot", "Awario", "Brightbot 1.0", "Bytespider", "Claude-Web", "cohere-ai", "cohere-training-data-crawler", "Datenbank Crawler", "Devin", "Diffbot", "DuckAssistBot", "Echobot Bot", "EchoboxBot", "facebookexternalhit", "Factset_spyderbot", "Gemini-Deep-Research", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User", "MyCentralAIScraperBot", "netEstate Imprint Crawler", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "WARDBot", "Webzio-Extended", "wpbot"]; +export const ALL_BOTS_REGEX = /(AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBOT|ANDIBOT|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|AWARIO|BEDROCKBOT|BRIGHTBOT 1.0|BYTESPIDER|CCBOT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GEMINI-DEEP-RESEARCH|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT-OCOB|SEMRUSHBOT-SWA|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WARDBOT|WEBZIO-EXTENDED|WPBOT|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/; +export const NON_RESPECTING_BOTS_REGEX = /(ANDIBOT|ANTHROPIC-AI|APPLEBOT|AWARIO|BRIGHTBOT 1.0|BYTESPIDER|CLAUDE-WEB|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DATENBANK CRAWLER|DEVIN|DIFFBOT|DUCKASSISTBOT|ECHOBOT BOT|ECHOBOXBOT|FACEBOOKEXTERNALHIT|FACTSET_SPYDERBOT|GEMINI-DEEP-RESEARCH|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER|MYCENTRALAISCRAPERBOT|NETESTATE IMPRINT CRAWLER|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|WARDBOT|WEBZIO-EXTENDED|WPBOT)/;