chore(ua-blocker): update robots.json from upstream (#1229)
* chore(ua-blocker): update robots.json from upstream * add changeset * format --------- Co-authored-by: yusukebe <10682+yusukebe@users.noreply.github.com> Co-authored-by: Yusuke Wada <yusuke@kamawada.com>pull/1232/head
parent
a2409d2314
commit
40f916f944
|
@ -0,0 +1,5 @@
|
|||
---
|
||||
'@hono/ua-blocker': patch
|
||||
---
|
||||
|
||||
chore(ua-blocker): update robots.json from upstream
|
|
@ -314,6 +314,13 @@
|
|||
"description": "MistralAI-User is for user actions in LeChat. When users ask LeChat a question, it may visit a web page to help answer and include a link to the source in its response.",
|
||||
"respect": "Yes"
|
||||
},
|
||||
"MyCentralAIScraperBot": {
|
||||
"operator": "Unclear at this time.",
|
||||
"respect": "Unclear at this time.",
|
||||
"function": "AI data scraper",
|
||||
"frequency": "Unclear at this time.",
|
||||
"description": "Operator and data use is unclear at this time."
|
||||
},
|
||||
"NovaAct": {
|
||||
"operator": "Unclear at this time.",
|
||||
"respect": "Unclear at this time.",
|
||||
|
@ -398,6 +405,13 @@
|
|||
"operator": "[phind](https://www.phind.com/)",
|
||||
"respect": "Unclear at this time."
|
||||
},
|
||||
"Poseidon Research Crawler": {
|
||||
"operator": "[Poseidon Research](https://www.poseidonresearch.com)",
|
||||
"description": "Lab focused on scaling the interpretability research necessary to make better AI systems possible.",
|
||||
"frequency": "No explicit frequency provided.",
|
||||
"function": "AI research crawler",
|
||||
"respect": "Unclear at this time."
|
||||
},
|
||||
"QualifiedBot": {
|
||||
"description": "Operated by Qualified as part of their suite of AI product offerings.",
|
||||
"frequency": "No explicit frequency provided.",
|
||||
|
|
|
@ -44,6 +44,7 @@ User-agent: Meta-ExternalAgent
|
|||
User-agent: meta-externalfetcher
|
||||
User-agent: Meta-ExternalFetcher
|
||||
User-agent: MistralAI-User/1.0
|
||||
User-agent: MyCentralAIScraperBot
|
||||
User-agent: NovaAct
|
||||
User-agent: OAI-SearchBot
|
||||
User-agent: omgili
|
||||
|
@ -56,6 +57,7 @@ User-agent: Perplexity-User
|
|||
User-agent: PerplexityBot
|
||||
User-agent: PetalBot
|
||||
User-agent: PhindBot
|
||||
User-agent: Poseidon Research Crawler
|
||||
User-agent: QualifiedBot
|
||||
User-agent: QuillBot
|
||||
User-agent: quillbot.com
|
||||
|
@ -78,7 +80,7 @@ User-agent: YandexAdditionalBot
|
|||
User-agent: YouBot
|
||||
Disallow: /
|
||||
`;
|
||||
export const ALL_BOTS = ["AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "Amazonbot", "Andibot", "anthropic-ai", "Applebot", "Applebot-Extended", "bedrockbot", "Brightbot 1.0", "Bytespider", "CCBot", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Diffbot", "DuckAssistBot", "EchoboxBot", "FacebookBot", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Google-CloudVertexBot", "Google-Extended", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User/1.0", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot", "SemrushBot-BA", "SemrushBot-CT", "SemrushBot-OCOB", "SemrushBot-SI", "SemrushBot-SWA", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "Webzio-Extended", "wpbot", "YandexAdditional", "YandexAdditionalBot", "YouBot"];
|
||||
export const NON_RESPECTING_BOTS = ["Andibot", "anthropic-ai", "Applebot", "Brightbot 1.0", "Bytespider", "Claude-Web", "cohere-ai", "cohere-training-data-crawler", "Diffbot", "DuckAssistBot", "EchoboxBot", "Factset_spyderbot", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "Webzio-Extended", "wpbot"];
|
||||
export const ALL_BOTS_REGEX = /(AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBOT|ANDIBOT|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|BEDROCKBOT|BRIGHTBOT 1.0|BYTESPIDER|CCBOT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DIFFBOT|DUCKASSISTBOT|ECHOBOXBOT|FACEBOOKBOT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER\/1.0|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT|SEMRUSHBOT-BA|SEMRUSHBOT-CT|SEMRUSHBOT-OCOB|SEMRUSHBOT-SI|SEMRUSHBOT-SWA|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WEBZIO-EXTENDED|WPBOT|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/;
|
||||
export const NON_RESPECTING_BOTS_REGEX = /(ANDIBOT|ANTHROPIC-AI|APPLEBOT|BRIGHTBOT 1.0|BYTESPIDER|CLAUDE-WEB|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DIFFBOT|DUCKASSISTBOT|ECHOBOXBOT|FACTSET_SPYDERBOT|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|WEBZIO-EXTENDED|WPBOT)/;
|
||||
export const ALL_BOTS = ["AI2Bot", "Ai2Bot-Dolma", "aiHitBot", "Amazonbot", "Andibot", "anthropic-ai", "Applebot", "Applebot-Extended", "bedrockbot", "Brightbot 1.0", "Bytespider", "CCBot", "ChatGPT-User", "Claude-SearchBot", "Claude-User", "Claude-Web", "ClaudeBot", "cohere-ai", "cohere-training-data-crawler", "Cotoyogi", "Crawlspace", "Diffbot", "DuckAssistBot", "EchoboxBot", "FacebookBot", "Factset_spyderbot", "FirecrawlAgent", "FriendlyCrawler", "Google-CloudVertexBot", "Google-Extended", "GoogleOther", "GoogleOther-Image", "GoogleOther-Video", "GPTBot", "iaskspider/2.0", "ICC-Crawler", "ImagesiftBot", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "meta-externalagent", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MistralAI-User/1.0", "MyCentralAIScraperBot", "NovaAct", "OAI-SearchBot", "omgili", "omgilibot", "Operator", "PanguBot", "Panscient", "panscient.com", "Perplexity-User", "PerplexityBot", "PetalBot", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "SBIntuitionsBot", "Scrapy", "SemrushBot", "SemrushBot-BA", "SemrushBot-CT", "SemrushBot-OCOB", "SemrushBot-SI", "SemrushBot-SWA", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "VelenPublicWebCrawler", "Webzio-Extended", "wpbot", "YandexAdditional", "YandexAdditionalBot", "YouBot"];
|
||||
export const NON_RESPECTING_BOTS = ["Andibot", "anthropic-ai", "Applebot", "Brightbot 1.0", "Bytespider", "Claude-Web", "cohere-ai", "cohere-training-data-crawler", "Diffbot", "DuckAssistBot", "EchoboxBot", "Factset_spyderbot", "iaskspider/2.0", "img2dataset", "ISSCyberRiskCrawler", "Kangaroo Bot", "Meta-ExternalAgent", "meta-externalfetcher", "Meta-ExternalFetcher", "MyCentralAIScraperBot", "NovaAct", "Operator", "PanguBot", "Perplexity-User", "PhindBot", "Poseidon Research Crawler", "QualifiedBot", "QuillBot", "quillbot.com", "Scrapy", "Sidetrade indexer bot", "TikTokSpider", "Timpibot", "Webzio-Extended", "wpbot"];
|
||||
export const ALL_BOTS_REGEX = /(AI2BOT|AI2BOT-DOLMA|AIHITBOT|AMAZONBOT|ANDIBOT|ANTHROPIC-AI|APPLEBOT|APPLEBOT-EXTENDED|BEDROCKBOT|BRIGHTBOT 1.0|BYTESPIDER|CCBOT|CHATGPT-USER|CLAUDE-SEARCHBOT|CLAUDE-USER|CLAUDE-WEB|CLAUDEBOT|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|COTOYOGI|CRAWLSPACE|DIFFBOT|DUCKASSISTBOT|ECHOBOXBOT|FACEBOOKBOT|FACTSET_SPYDERBOT|FIRECRAWLAGENT|FRIENDLYCRAWLER|GOOGLE-CLOUDVERTEXBOT|GOOGLE-EXTENDED|GOOGLEOTHER|GOOGLEOTHER-IMAGE|GOOGLEOTHER-VIDEO|GPTBOT|IASKSPIDER\/2.0|ICC-CRAWLER|IMAGESIFTBOT|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MISTRALAI-USER\/1.0|MYCENTRALAISCRAPERBOT|NOVAACT|OAI-SEARCHBOT|OMGILI|OMGILIBOT|OPERATOR|PANGUBOT|PANSCIENT|PANSCIENT.COM|PERPLEXITY-USER|PERPLEXITYBOT|PETALBOT|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SBINTUITIONSBOT|SCRAPY|SEMRUSHBOT|SEMRUSHBOT-BA|SEMRUSHBOT-CT|SEMRUSHBOT-OCOB|SEMRUSHBOT-SI|SEMRUSHBOT-SWA|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|VELENPUBLICWEBCRAWLER|WEBZIO-EXTENDED|WPBOT|YANDEXADDITIONAL|YANDEXADDITIONALBOT|YOUBOT)/;
|
||||
export const NON_RESPECTING_BOTS_REGEX = /(ANDIBOT|ANTHROPIC-AI|APPLEBOT|BRIGHTBOT 1.0|BYTESPIDER|CLAUDE-WEB|COHERE-AI|COHERE-TRAINING-DATA-CRAWLER|DIFFBOT|DUCKASSISTBOT|ECHOBOXBOT|FACTSET_SPYDERBOT|IASKSPIDER\/2.0|IMG2DATASET|ISSCYBERRISKCRAWLER|KANGAROO BOT|META-EXTERNALAGENT|META-EXTERNALFETCHER|META-EXTERNALFETCHER|MYCENTRALAISCRAPERBOT|NOVAACT|OPERATOR|PANGUBOT|PERPLEXITY-USER|PHINDBOT|POSEIDON RESEARCH CRAWLER|QUALIFIEDBOT|QUILLBOT|QUILLBOT.COM|SCRAPY|SIDETRADE INDEXER BOT|TIKTOKSPIDER|TIMPIBOT|WEBZIO-EXTENDED|WPBOT)/;
|
||||
|
|
Loading…
Reference in New Issue