diff --git a/README.md b/README.md
index a2facd1..beacf51 100644
--- a/README.md
+++ b/README.md
@@ -67,9 +67,19 @@ This will return `undefined` if the URL isn't valid for this robots.txt.
 
 **boolean or undefined**
 
+> [!CAUTION]
+> This is not part of the robots.txt specification and should only be used with
+> the website owner's permission.
+> This method is only intended for special purposes where a user agent shouldn't
+> fall back to matching against global (*) rules.
+>
+> An example of this behaviour is [Google AdsBot](https://developers.google.com/search/docs/crawling-indexing/google-special-case-crawlers),
+> which must be explicitly excluded. This is done with the website owner's permission.
+
 Returns trues if explicitly disallowed for the specified user agent (User Agent wildcards are discarded).
 
 This will return undefined if the URL is not valid for this robots.txt file.
+
 ### getMatchingLineNumber(url, [ua])
 
 **number or undefined**
diff --git a/Robots.js b/Robots.js
index 9fb7cf5..60cf321 100644
--- a/Robots.js
+++ b/Robots.js
@@ -376,9 +376,9 @@ Robots.prototype._getRule = function (url, ua, explicit) {
 	var rules = this._rules[userAgent];
 
 	if (!explicit) {
-		rules = rules || this._rules['*']
+		rules = rules || this._rules['*'];
 	}
-	rules = rules || []
+	rules = rules || [];
 
 	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
 	var rule = findRule(path, rules);
@@ -438,21 +438,23 @@ Robots.prototype.isDisallowed = function (url, ua) {
 };
 
 /**
- * Returns trues if explicitly disallowed 
+ * Returns true if explicitly disallowed
  * for the specified user agent (User Agent wildcards are discarded).
- * 
+ *
  * This will return undefined if the URL is not valid for this robots.txt file.
+ *
  * @param {string} url
  * @param {string} ua
  * @return {boolean?}
  */
-Robots.prototype.isExplicitlyDisallowed = function(url, ua) {
+Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
 	var rule = this._getRule(url, ua, true);
 	if (typeof rule === 'undefined') {
-		return true;
+		return;
 	}
+
 	return !(!rule || rule.allow);
-}
+};
 
 /**
  * Gets the crawl delay if there is one.
diff --git a/test/Robots.js b/test/Robots.js
index f1575ae..18137ae 100644
--- a/test/Robots.js
+++ b/test/Robots.js
@@ -873,7 +873,7 @@ describe('Robots', function () {
 		var robots = robotsParser(url, contents);
 
 		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(false)
-	})
+	});
 
 	it('should be disallowed when user agent equal robots rule in explicit mode', function () {
 		var contents = [
@@ -886,5 +886,18 @@
 		var robots = robotsParser(url, contents);
 
 		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(true)
-	})
+	});
+
+	it('should return undefined when given an invalid URL in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser('http://example.com', contents);
+
+		expect(robots.isExplicitlyDisallowed(url, userAgent)).to.equal(undefined);
+	});
 });
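
For reference, here is a minimal sketch of the behaviour this change documents and tests, using only the API visible in the diff (`robotsParser(url, contents)`, `isDisallowed`, `isExplicitlyDisallowed`); the `require('robots-parser')` entry point is an assumption about how the package is consumed:

```js
// Minimal usage sketch; assumes the package is required as 'robots-parser'.
var robotsParser = require('robots-parser');

// A robots.txt that only defines global (*) rules, never naming SomeBot.
var robots = robotsParser('http://example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /',
].join('\n'));

// isDisallowed() falls back to the global (*) rules, so SomeBot is blocked.
console.log(robots.isDisallowed('http://example.com/hello', 'SomeBot')); // true

// isExplicitlyDisallowed() discards wildcard rules; SomeBot is never named,
// so it is not *explicitly* disallowed.
console.log(robots.isExplicitlyDisallowed('http://example.com/hello', 'SomeBot')); // false

// A URL that isn't valid for this robots.txt file now returns undefined
// (previously true), matching the new test above.
console.log(robots.isExplicitlyDisallowed('https://www.other.com/hello', 'SomeBot')); // undefined
```

This mirrors the AdsBot scenario in the README note: a crawler that must be excluded by name never inherits the global (*) rules, and callers can distinguish "not explicitly disallowed" (`false`) from "URL not covered by this robots.txt" (`undefined`).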