############# Exclusions section for ALL robots #################

# Both global rules live in a single "User-agent: *" group so that parsers
# which honor only the first matching group still apply every rule.
User-agent: *
Disallow: /typo3/
Disallow: /flozecontent.xml

############# Exclusions section for specific robots #################

#### Exclude TAGENT - it requests robots.txt before every GET
# and GETs files too quickly. Here is a sample from the access log:
# sv.tkensaku.com - - [22/Jan/2002:11:38:05 -0500] "GET /robots.txt HTTP/1.0" 200 210 "TAGENT/V0.5"
# sv.tkensaku.com - - [22/Jan/2002:11:38:06 -0500] "GET /reviews/ HTTP/1.0" 200 14750 "TAGENT/V0.5"
# sv.tkensaku.com - - [22/Jan/2002:11:38:08 -0500] "GET /robots.txt HTTP/1.0" 200 210 "TAGENT/V0.5"
# sv.tkensaku.com - - [22/Jan/2002:11:38:09 -0500] "GET /previews/ HTTP/1.0" 200 9163 "TAGENT/V0.5"
# sv.tkensaku.com - - [22/Jan/2002:11:38:10 -0500] "GET /robots.txt HTTP/1.0" 200 210 "TAGENT/V0.5"
# sv.tkensaku.com - - [22/Jan/2002:11:38:12 -0500] "GET /articles/ HTTP/1.0" 200 9489 "TAGENT/V0.5"
#
User-agent: TAGENT
Disallow: /

#### Exclude Teleport Pro
#
User-agent: Teleport Pro
Disallow: /

#### Exclude AlkalineBOT
#
# On 10-Mar-2002 from remote host syr-24-95-161-196.twcny.rr.com
#
User-agent: AlkalineBOT
Disallow: /

#### Exclude Whizbang (see http://www.whizbang.com/crawler)
#
User-agent: Whizbang
Disallow: /

#### Exclude UniverseBot
#
# No delay between requests. It strips off trailing slash, thus
# triggering redirects. It does both HEAD and GET. Sample:
#
# 07:18:04 "HEAD /companies/ensemble HTTP/1.0" 301 0 "UniverseBot/1.0"
# 07:18:06 "HEAD /companies/ensemble/ HTTP/1.0" 200 0 "UniverseBot/1.0"
# 07:18:07 "GET /companies/ensemble HTTP/1.0" 301 247 "UniverseBot/1.0"
# 07:18:09 "GET /companies/ensemble/ HTTP/1.0" 200 9961 "UniverseBot/1.0"
#
User-agent: UniverseBot
Disallow: /

#### Exclude http://www.almaden.ibm.com/cs/crawler
#
# We'd like to limit the sites crawling us to the main indexers.
#
User-agent: http://www.almaden.ibm.com/cs/crawler
Disallow: /

#### Exclude "SlySearch/1.0 http://www.plagiarism.org/crawler/robotinfo.html"
#
# This site indexes articles for plagiarism checks.
#
User-agent: SlySearch
Disallow: /

#### Exclude NG/1.0
#
# On 18-Oct-2002 from remote host ng1.exabot.com
#
# 13:11:35 "GET /news/more/1005254413/d/redir/cb_order/UNRET2003.IR HTTP/1.0" 404 244 "NG/1.0"
# 13:11:37 "GET /news/more/1005254413/gi/tattletale/news/ HTTP/1.0" 404 234 "NG/1.0"
# 13:11:38 "GET /news/more/1005254413/ews/ HTTP/1.0" 404 219 "NG/1.0"
#
User-agent: NG/1.0
Disallow: /

#### Exclude spider from singingfish.com - no media to index.
#
User-agent: asterias
Disallow: /

#### Exclude spider from xo.net - no reason to index our files
#
User-agent: Gaisbot
Disallow: /

#### Exclude UbiCrawler
#
# On 27-Sep-2003 from remote host ubi1.iit.cnr.it
# http://ubi.imc.pi.cnr.it/projects/ubicrawler/
#
User-agent: UbiCrawler
Disallow: /

#### Exclude Wget
#
# It checks this only for recursive operations, not for individual files
#
User-agent: Wget
Disallow: /

#### Exclude TranSGeniKBot
#
User-agent: TranSGeniKBot
Disallow: /

#### Exclude Ocelli/1.1 (http://www.globalspec.com)
#
User-agent: Ocelli
Disallow: /

#### Exclude Exabot (http://www.exava.com/)
#
# Doesn't honor global exclusions.
#
User-agent: Exabot
Disallow: /

#### Exclude Pompos (http://www.dir.com/)
#
# Obscure search site - 1/4 of the URLs have %00 appended.
#
# This crawler requires *no* optional space after "User-agent:", so the
# missing space on the next line is deliberate - do not "fix" it.
User-agent:Pompos
Disallow: /

#### Exclude larbin (http://freshmeat.net/projects/larbin/)
#
# Open source spider that can be used by anyone. :-/
#
User-agent: larbin
Disallow: /

#### Exclude Nutch (http://www.nutch.org/docs/en/bot.html)
#
# Open source spider that can be used by anyone. :-/
#
User-agent: Nutch
Disallow: /

#### Exclude Jetbot (http://www.jeteye.com/jetbot.html)
#
# Doesn't honor global exclusions (it fetches /dl pages).
#
User-agent: Jetbot
Disallow: /