From d5f8b28d98fe29aaf6b24fccb0331f21b1b9ceca Mon Sep 17 00:00:00 2001
From: SimonC-Audigent
Date: Tue, 17 Sep 2024 10:06:20 +0100
Subject: [PATCH] add explicit disallow feature

---
 README.md      |  6 ++++--
 Robots.js      | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 index.d.ts     |  3 ++-
 test/Robots.js | 38 ++++++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index fc723b8..47c3f32 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ The parser currently supports:
 
 - User-agent:
 - Allow:
-- Disallow:
+- Disallow: (with explicit mode support)
 - Sitemap:
 - Crawl-delay:
 - Host:
@@ -41,6 +41,7 @@ var robots = robotsParser('http://www.example.com/robots.txt', [
 robots.isAllowed('http://www.example.com/test.html', 'Sams-Bot/1.0'); // true
 robots.isAllowed('http://www.example.com/dir/test.html', 'Sams-Bot/1.0'); // true
 robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0'); // true
+robots.isDisallowed('http://www.example.com/dir/test2.html', 'Sams-Bot/1.0', true); // false
 robots.getCrawlDelay('Sams-Bot/1.0'); // 1
 robots.getSitemaps(); // ['http://example.com/sitemap.xml']
 robots.getPreferredHost(); // example.com
@@ -54,11 +55,12 @@ Returns true if crawling the specified URL is allowed for the specified user-age
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
-### isDisallowed(url, [ua])
+### isDisallowed(url, [ua], [explicit])
 
 **boolean or undefined**
 
 Returns true if crawling the specified URL is not allowed for the specified user-agent.
+In explicit mode, wildcard (`*`) user agent rules are ignored: a URL is only considered disallowed by rules that explicitly name the given user agent.
 
 This will return `undefined` if the URL isn't valid for this robots.txt.
 
diff --git a/Robots.js b/Robots.js
index f0a8e9e..db26a93 100644
--- a/Robots.js
+++ b/Robots.js
@@ -361,7 +361,7 @@ Robots.prototype.setPreferredHost = function (url) {
 	this._preferredHost = url;
 };
 
-Robots.prototype._getRule = function (url, ua) {
+Robots.prototype._getRule = function (url, ua, explicit) {
 	var parsedUrl = parseUrl(url) || {};
 	var userAgent = formatUserAgent(ua || '*');
 
@@ -374,7 +374,12 @@
 		return;
 	}
 
-	var rules = this._rules[userAgent] || this._rules['*'] || [];
+	var rules = this._rules[userAgent];
+	if (!explicit) {
+		rules = rules || this._rules['*'];
+	}
+	rules = rules || [];
+
 	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
 	var rule = findRule(path, rules);
 
@@ -422,16 +427,60 @@ Robots.prototype.getMatchingLineNumber = function (url, ua) {
 };
 
 /**
- * Returns the opposite of isAllowed()
- *
+ * In standard mode, returns the opposite of isAllowed().
+ * In explicit mode:
+ * - returns true if the user agent is explicitly disallowed (wildcard rules are ignored),
+ * - returns false otherwise,
+ * - throws an error if no user agent is specified.
  * @param {string} url
  * @param {string} ua
+ * @param {boolean} [explicit]
  * @return {boolean}
  */
-Robots.prototype.isDisallowed = function (url, ua) {
-	return !this.isAllowed(url, ua);
+Robots.prototype.isDisallowed = function (url, ua, explicit) {
+	if (explicit === true && ua === undefined) {
+		throw new Error('User Agent must be specified in explicit mode');
+	}
+
+	var rule = this._getRule(url, ua, explicit);
+	if (typeof rule === 'undefined') {
+		return true;
+	}
+	return !(!rule || rule.allow);
 };
 
+/**
+ * Returns true if crawling the specified URL is explicitly disallowed
+ * for the specified user agent: wildcard (*) user agent rules are ignored.
+ *
+ * @param {string} url
+ * @param {string} ua
+ * @return {boolean|undefined}
+ */
+Robots.prototype.isExplicitlyDisallowed = function (url, ua) {
+	var parsedUrl = parseUrl(url) || {};
+	var userAgent = formatUserAgent(ua);
+
+	// The base URL must match otherwise this robots.txt is not valid for it.
+	if (
+		parsedUrl.protocol !== this._url.protocol ||
+		parsedUrl.hostname !== this._url.hostname ||
+		parsedUrl.port !== this._url.port
+	) {
+		return;
+	}
+
+	var rules = this._rules[userAgent] || [];
+	var path = urlEncodeToUpper(parsedUrl.pathname + parsedUrl.search);
+	var rule = findRule(path, rules);
+
+	if (typeof rule === 'undefined') {
+		return;
+	}
+
+	return !(!rule || rule.allow);
+};
+
 /**
  * Gets the crawl delay if there is one.
  *
diff --git a/index.d.ts b/index.d.ts
index 5446898..852ddec 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -2,7 +2,8 @@ declare module 'robots-parser';
 
 interface Robot {
 	isAllowed(url: string, ua?: string): boolean | undefined;
-	isDisallowed(url: string, ua?: string): boolean | undefined;
+	isDisallowed(url: string, ua?: string, explicit?: boolean): boolean | undefined;
+	isExplicitlyDisallowed(url: string, ua: string): boolean | undefined;
 	getMatchingLineNumber(url: string, ua?: string): number;
 	getCrawlDelay(ua?: string): number | undefined;
 	getSitemaps(): string[];
diff --git a/test/Robots.js b/test/Robots.js
index 666d9b8..6f979ba 100644
--- a/test/Robots.js
+++ b/test/Robots.js
@@ -861,4 +861,42 @@ describe('Robots', function () {
 
 		testRobots('https://www.example.com/robots.txt', contents, allowed, disallowed);
 	});
+
+	it('should not be disallowed when a wildcard is used in explicit mode', function () {
+		var contents = [
+			'User-agent: *',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isDisallowed(url, userAgent, true)).to.equal(false);
+	});
+
+	it('should be disallowed when the user agent matches a rule in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var userAgent = 'SomeBot';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isDisallowed(url, userAgent, true)).to.equal(true);
+	});
+
+	it('should throw an error when the user agent is not set in explicit mode', function () {
+		var contents = [
+			'User-agent: SomeBot',
+			'Disallow: /',
+		].join('\n');
+
+		var url = 'https://www.example.com/hello';
+		var robots = robotsParser(url, contents);
+
+		expect(robots.isDisallowed.bind(robots, url, undefined, true)).to.throw('User Agent must be specified in explicit mode');
+	});
 });
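
As a quick illustration of the behaviour this patch introduces, here is a short usage sketch against the patched API (not part of the patch itself; the robots.txt rules and the SomeBot name are illustrative, mirroring the new tests):

// Usage sketch for the explicit-mode behaviour introduced by this patch.
var robotsParser = require('robots-parser');

var robots = robotsParser('https://www.example.com/robots.txt', [
	'User-agent: *',
	'Disallow: /'
].join('\n'));

// Standard mode: the wildcard group applies to every user agent.
console.log(robots.isDisallowed('https://www.example.com/hello', 'SomeBot')); // true

// Explicit mode: wildcard rules are ignored and no group names SomeBot,
// so the URL is not explicitly disallowed.
console.log(robots.isDisallowed('https://www.example.com/hello', 'SomeBot', true)); // false

// Explicit mode requires a user agent.
try {
	robots.isDisallowed('https://www.example.com/hello', undefined, true);
} catch (e) {
	console.log(e.message); // User Agent must be specified in explicit mode
}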